From ba45bd34d6d0036e6f346466469853b073243b21 Mon Sep 17 00:00:00 2001
From: Boris Nagaev
Date: Wed, 6 Jul 2016 19:46:25 +0300
Subject: [PATCH 001/166] fix iter_wrapper for iterator=pointer

---
 src/util/ue2_containers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/util/ue2_containers.h b/src/util/ue2_containers.h
index e3b01363..217d08ea 100644
--- a/src/util/ue2_containers.h
+++ b/src/util/ue2_containers.h
@@ -82,7 +82,7 @@ private:
     void increment() { ++it; }
     void decrement() { --it; }
     void advance(size_t n) { it += n; }
-    typename WrappedIter::difference_type
+    typename std::iterator_traits<WrappedIter>::difference_type
     distance_to(const iter_wrapper &other) const {
         return other.it - it;
     }

From 6d87533ef0ded3c4c80e7bc61cc78c29dcd943ce Mon Sep 17 00:00:00 2001
From: Boris Nagaev
Date: Wed, 6 Jul 2016 19:46:41 +0300
Subject: [PATCH 002/166] fix add_to_engine_blob for iterator=pointer

---
 src/rose/rose_build_bytecode.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 758dd442..d37e95dc 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -460,7 +460,7 @@ u32 add_to_engine_blob(build_context &bc, const T &a, const size_t len) {
 template<typename Iter>
 static
 u32 add_to_engine_blob(build_context &bc, Iter b, const Iter &e) {
-    using value_type = typename Iter::value_type;
+    using value_type = typename std::iterator_traits<Iter>::value_type;
     static_assert(is_pod<value_type>::value, "should be pod");
 
     if (b == e) {

From 9b7eca5400f5f844f485d3cda03b6c40471309bf Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Thu, 26 May 2016 10:11:19 +1000
Subject: [PATCH 003/166] rose: dump leftfix/suffix queue indices

---
 src/rose/rose_build_bytecode.cpp |  4 +-
 src/rose/rose_build_dump.cpp     | 69 ++++++++++++++++++--------------
 src/rose/rose_build_impl.h       |  6 +++
 3 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index d37e95dc..dac2e79c 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -1102,7 +1102,7 @@ void setLeftNfaProperties(NFA &n, const left_id &left) {
 }
 
 static
-bool buildLeftfixes(const RoseBuildImpl &tbi, build_context &bc,
+bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc,
                     QueueIndexFactory &qif, set<u32> *no_retrigger_queues,
                     bool do_prefix) {
     const RoseGraph &g = tbi.g;
@@ -1174,6 +1174,7 @@ bool buildLeftfixes(const RoseBuildImpl &tbi, build_context &bc,
         setLeftNfaProperties(*nfa, leftfix);
 
         qi = qif.get_queue();
+        tbi.leftfix_queue_map.emplace(leftfix, qi);
         nfa->queueIndex = qi;
 
         if (!is_prefix && !leftfix.haig() && leftfix.graph() &&
@@ -1458,6 +1459,7 @@ void assignSuffixQueues(RoseBuildImpl &build, build_context &bc) {
         u32 queue = build.qif.get_queue();
         DEBUG_PRINTF("assigning %p to queue %u\n", s.graph(), queue);
         bc.suffixes.emplace(s, queue);
+        build.suffix_queue_map.emplace(s, queue);
     }
 }
 
diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp
index 079dd556..46d1676d 100644
--- a/src/rose/rose_build_dump.cpp
+++ b/src/rose/rose_build_dump.cpp
@@ -30,12 +30,13 @@
 
 #include "rose_build_dump.h"
 
-#include "hwlm/hwlm_build.h"
 #include "rose_build_impl.h"
 #include "rose_build_matchers.h"
 #include "rose/rose_dump.h"
 #include "rose_internal.h"
 #include "ue2common.h"
+#include "hwlm/hwlm_build.h"
+#include "nfa/castlecompile.h"
 #include "nfa/nfa_internal.h"
 #include "nfagraph/ng_dump.h"
 #include "som/slot_manager_dump.h"
@@ -64,22 +65,40 @@ static
 string to_string(nfa_kind k) {
     switch (k) {
     case NFA_PREFIX:
-        return "p";
+        return "PREFIX";
     case NFA_INFIX:
-        return "i";
+        return "INFIX";
     case NFA_SUFFIX:
-        return "s";
+        return "SUFFIX";
     case NFA_OUTFIX:
-        return "o";
+        return "OUTFIX";
     case NFA_REV_PREFIX:
-        return "r";
+        return "REV_PREFIX";
     case NFA_OUTFIX_RAW:
-        return "O";
+        return "OUTFIX_RAW";
     }
     assert(0);
     return "?";
 }
 
+/** \brief Return the kind of a left_id or a suffix_id. */
+template<class Graph>
+string render_kind(const Graph &g) {
+    if (g.graph()) {
+        return to_string(g.graph()->kind);
+    }
+    if (g.dfa()) {
+        return to_string(g.dfa()->kind);
+    }
+    if (g.haig()) {
+        return to_string(g.haig()->kind);
+    }
+    if (g.castle()) {
+        return to_string(g.castle()->kind);
+    }
+    return "UNKNOWN";
+}
+
 namespace {
 
 class RoseGraphWriter {
@@ -130,22 +149,12 @@ public:
         }
 
         if (g[v].suffix) {
-            os << "\\nSUFFIX (TOP " << g[v].suffix.top;
-            // Can't dump the queue number, but we can identify the suffix.
-            if (g[v].suffix.graph) {
-                os << ", graph=" << g[v].suffix.graph.get() << " "
-                   << to_string(g[v].suffix.graph->kind);
+            suffix_id suff(g[v].suffix);
+            os << "\\n" << render_kind(suff) << " (top " << g[v].suffix.top;
+            auto it = build.suffix_queue_map.find(suff);
+            if (it != end(build.suffix_queue_map)) {
+                os << ", queue " << it->second;
             }
-            if (g[v].suffix.castle) {
-                os << ", castle=" << g[v].suffix.castle.get();
-            }
-            if (g[v].suffix.rdfa) {
-                os << ", dfa=" << g[v].suffix.rdfa.get();
-            }
-            if (g[v].suffix.haig) {
-                os << ", haig=" << g[v].suffix.haig.get();
-            }
             os << ")";
         }
 
@@ -154,15 +163,15 @@
         }
 
         if (g[v].left) {
-            const char *roseKind =
-                build.isRootSuccessor(v) ? "PREFIX" : "INFIX";
-            os << "\\nROSE " << roseKind;
-            os << " (";
-            os << "report " << g[v].left.leftfix_report << ")";
-
-            if (g[v].left.graph) {
-                os << " " << to_string(g[v].left.graph->kind);
+            left_id left(g[v].left);
+            os << "\\n" << render_kind(left) << " (queue ";
+            auto it = build.leftfix_queue_map.find(left);
+            if (it != end(build.leftfix_queue_map)) {
+                os << it->second;
+            } else {
+                os << "??";
             }
+            os << ", report " << g[v].left.leftfix_report << ")";
         }
 
         os << "\"";
diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h
index 4122e0bd..d5f75a22 100644
--- a/src/rose/rose_build_impl.h
+++ b/src/rose/rose_build_impl.h
@@ -541,6 +541,12 @@ public:
 
     u32 ematcher_region_size; /**< number of bytes the eod table runs over */
 
+    /** \brief Mapping from leftfix to queue ID (used in dump code). */
+    unordered_map<left_id, u32> leftfix_queue_map;
+
+    /** \brief Mapping from suffix to queue ID (used in dump code). */
+    unordered_map<suffix_id, u32> suffix_queue_map;
+
     /** \brief Mapping from anchored literal ID to the original literal suffix
      * present when the literal was added to the literal matcher. Used for
      * overlap calculation in history assignment.
      */
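A note on the pair of fixes in patches 001 and 002 above: a raw pointer is a perfectly good random-access iterator, but it has no nested member typedefs, so expressions such as typename Iter::value_type fail to compile when Iter is a pointer. std::iterator_traits is the portable way to ask for those types, since it has a partial specialization for T *. A minimal standalone sketch of the distinction (illustrative only, not part of the patch series; value_type_of is an invented name):

#include <iterator>
#include <type_traits>
#include <vector>

// typename It::value_type only compiles for class-type iterators;
// std::iterator_traits<It>::value_type also handles raw pointers, which is
// what iter_wrapper and add_to_engine_blob may be instantiated with.
template <typename It>
using value_type_of = typename std::iterator_traits<It>::value_type;

static_assert(std::is_same<value_type_of<int *>, int>::value,
              "raw pointer: handled by the T * partial specialization");
static_assert(std::is_same<value_type_of<std::vector<int>::iterator>,
                           int>::value,
              "class-type iterator: member typedefs are used");
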
From 6e49544558d4eb6912eebdb9afed6fd54caf98a7 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Thu, 26 May 2016 14:21:40 +1000
Subject: [PATCH 004/166] exhaust: move functions to report.h

---
 src/report.h       | 36 ++++++++++++++++++++++++++++++++++++
 src/rose/catchup.c | 33 +++++++++++++++++++++++++++++++++
 src/rose/runtime.h | 34 ----------------------------------
 src/util/exhaust.h | 38 --------------------------------------
 4 files changed, 69 insertions(+), 72 deletions(-)

diff --git a/src/report.h b/src/report.h
index d037d11b..4a5f401e 100644
--- a/src/report.h
+++ b/src/report.h
@@ -115,6 +115,42 @@ enum DedupeResult dedupeCatchup(const struct RoseEngine *rose,
     return DEDUPE_CONTINUE;
 }
 
+/** \brief Test whether the given key (\a ekey) is set in the exhaustion vector
+ * \a evec. */
+static really_inline
+int isExhausted(const struct RoseEngine *rose, const char *evec, u32 ekey) {
+    DEBUG_PRINTF("checking exhaustion %p %u\n", evec, ekey);
+    assert(ekey != INVALID_EKEY);
+    assert(ekey < rose->ekeyCount);
+    return mmbit_isset((const u8 *)evec, rose->ekeyCount, ekey);
+}
+
+/** \brief Returns 1 if all exhaustion keys in the bitvector are on. */
+static really_inline
+int isAllExhausted(const struct RoseEngine *rose, const char *evec) {
+    if (!rose->canExhaust) {
+        return 0; /* pattern set is inexhaustible */
+    }
+
+    return mmbit_all((const u8 *)evec, rose->ekeyCount);
+}
+
+/** \brief Mark key \a ekey on in the exhaustion vector. */
+static really_inline
+void markAsMatched(const struct RoseEngine *rose, char *evec, u32 ekey) {
+    DEBUG_PRINTF("marking as exhausted key %u\n", ekey);
+    assert(ekey != INVALID_EKEY);
+    assert(ekey < rose->ekeyCount);
+    mmbit_set((u8 *)evec, rose->ekeyCount, ekey);
+}
+
+/** \brief Clear all keys in the exhaustion vector. */
+static really_inline
+void clearEvec(const struct RoseEngine *rose, char *evec) {
+    DEBUG_PRINTF("clearing evec %p %u\n", evec, rose->ekeyCount);
+    mmbit_clear((u8 *)evec, rose->ekeyCount);
+}
+
 /**
  * \brief Deliver the given report to the user callback.
  *
diff --git a/src/rose/catchup.c b/src/rose/catchup.c
index dba9629e..2460f086 100644
--- a/src/rose/catchup.c
+++ b/src/rose/catchup.c
@@ -59,6 +59,39 @@ int roseNfaRunProgram(const struct RoseEngine *rose, struct hs_scratch *scratch,
     return can_stop_matching(scratch) ?
MO_HALT_MATCHING : MO_CONTINUE_MATCHING; } +static rose_inline +char roseSuffixInfoIsExhausted(const struct RoseEngine *rose, + const struct NfaInfo *info, + const char *exhausted) { + if (!info->ekeyListOffset) { + return 0; + } + + DEBUG_PRINTF("check exhaustion -> start at %u\n", info->ekeyListOffset); + + /* INVALID_EKEY terminated list */ + const u32 *ekeys = getByOffset(rose, info->ekeyListOffset); + while (*ekeys != INVALID_EKEY) { + DEBUG_PRINTF("check %u\n", *ekeys); + if (!isExhausted(rose, exhausted, *ekeys)) { + DEBUG_PRINTF("not exhausted -> alive\n"); + return 0; + } + ++ekeys; + } + + DEBUG_PRINTF("all ekeys exhausted -> dead\n"); + return 1; +} + +static really_inline +char roseSuffixIsExhausted(const struct RoseEngine *rose, u32 qi, + const char *exhausted) { + DEBUG_PRINTF("check queue %u\n", qi); + const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); + return roseSuffixInfoIsExhausted(rose, info, exhausted); +} + static really_inline void deactivateQueue(const struct RoseEngine *t, u8 *aa, u32 qi, struct hs_scratch *scratch) { diff --git a/src/rose/runtime.h b/src/rose/runtime.h index d4309bfb..f7f6641d 100644 --- a/src/rose/runtime.h +++ b/src/rose/runtime.h @@ -35,7 +35,6 @@ #include "rose_internal.h" #include "scratch.h" -#include "util/exhaust.h" // for isExhausted #include "util/partial_store.h" /* @@ -108,39 +107,6 @@ const u8 *getLeftfixLagTableConst(const struct RoseEngine *t, return (const u8 *)(state + t->stateOffsets.leftfixLagTable); } -static rose_inline -char roseSuffixInfoIsExhausted(const struct RoseEngine *t, - const struct NfaInfo *info, - const char *exhausted) { - if (!info->ekeyListOffset) { - return 0; - } - - DEBUG_PRINTF("check exhaustion -> start at %u\n", info->ekeyListOffset); - - /* INVALID_EKEY terminated list */ - const u32 *ekeys = (const u32 *)((const char *)t + info->ekeyListOffset); - while (*ekeys != INVALID_EKEY) { - DEBUG_PRINTF("check %u\n", *ekeys); - if (!isExhausted(t, exhausted, *ekeys)) { - DEBUG_PRINTF("not exhausted -> alive\n"); - return 0; - } - ++ekeys; - } - - DEBUG_PRINTF("all ekeys exhausted -> dead\n"); - return 1; -} - -static really_inline -char roseSuffixIsExhausted(const struct RoseEngine *t, u32 qi, - const char *exhausted) { - DEBUG_PRINTF("check queue %u\n", qi); - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - return roseSuffixInfoIsExhausted(t, info, exhausted); -} - static really_inline u32 has_chained_nfas(const struct RoseEngine *t) { return t->outfixBeginQueue; diff --git a/src/util/exhaust.h b/src/util/exhaust.h index b55c52d7..d6f2ac06 100644 --- a/src/util/exhaust.h +++ b/src/util/exhaust.h @@ -33,47 +33,9 @@ #ifndef EXHAUST_H #define EXHAUST_H -#include "rose/rose_internal.h" -#include "util/multibit.h" #include "ue2common.h" /** Index meaning a given exhaustion key is invalid. */ #define INVALID_EKEY (~(u32)0) -/** \brief Test whether the given key (\a ekey) is set in the exhaustion vector - * \a evec. */ -static really_inline -int isExhausted(const struct RoseEngine *t, const char *evec, u32 ekey) { - DEBUG_PRINTF("checking exhaustion %p %u\n", evec, ekey); - assert(ekey != INVALID_EKEY); - assert(ekey < t->ekeyCount); - return mmbit_isset((const u8 *)evec, t->ekeyCount, ekey); -} - -/** \brief Returns 1 if all exhaustion keys in the bitvector are on. 
*/ -static really_inline -int isAllExhausted(const struct RoseEngine *t, const char *evec) { - if (!t->canExhaust) { - return 0; /* pattern set is inexhaustible */ - } - - return mmbit_all((const u8 *)evec, t->ekeyCount); -} - -/** \brief Mark key \a ekey on in the exhaustion vector. */ -static really_inline -void markAsMatched(const struct RoseEngine *t, char *evec, u32 ekey) { - DEBUG_PRINTF("marking as exhausted key %u\n", ekey); - assert(ekey != INVALID_EKEY); - assert(ekey < t->ekeyCount); - mmbit_set((u8 *)evec, t->ekeyCount, ekey); -} - -/** \brief Clear all keys in the exhaustion vector. */ -static really_inline -void clearEvec(const struct RoseEngine *t, char *evec) { - DEBUG_PRINTF("clearing evec %p %u\n", evec, t->ekeyCount); - mmbit_clear((u8 *)evec, t->ekeyCount); -} - #endif From afd378b09ecbce7f7526271852ae94cbf7a297c8 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Tue, 17 May 2016 15:05:42 +1000 Subject: [PATCH 005/166] UE-2963: be more aggressive in using buildSufPQ_final() --- src/rose/catchup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rose/catchup.c b/src/rose/catchup.c index 2460f086..c61079a8 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -787,7 +787,7 @@ hwlmcb_rv_t buildSufPQ(const struct RoseEngine *t, char *state, s64a safe_loc, = scratch->catchup_pq.qm_size ? pq_top_loc(&scratch->catchup_pq) : safe_loc; second_place_loc = MIN(second_place_loc, safe_loc); - if (n_qi == MMB_INVALID && report_ok_loc < second_place_loc) { + if (n_qi == MMB_INVALID && report_ok_loc <= second_place_loc) { if (buildSufPQ_final(t, report_ok_loc, second_place_loc, final_loc, scratch, aa, a_qi) == HWLM_TERMINATE_MATCHING) { From b097cb1b533a0b40f05d4bcd26f8420c88ae33f0 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Fri, 6 May 2016 13:20:00 +1000 Subject: [PATCH 006/166] use the correct way to refer to void * --- src/nfagraph/ng_calc_components.cpp | 10 +++++----- src/nfagraph/ng_rose.cpp | 4 ++-- src/rose/rose_build_add.cpp | 8 ++++---- src/rose/rose_build_matchers.cpp | 2 +- src/rose/rose_build_merge.cpp | 2 +- src/rose/rose_build_role_aliasing.cpp | 3 +-- 6 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index 5ca5ce3a..9365cfb3 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -219,8 +219,8 @@ vector findShellEdges(const NGHolder &g, static void removeVertices(const flat_set &verts, NFAUndirectedGraph &ug, - ue2::unordered_map &old2new, - ue2::unordered_map &new2old) { + ue2::unordered_map &old2new, + ue2::unordered_map &new2old) { for (auto v : verts) { assert(contains(old2new, v)); auto uv = old2new.at(v); @@ -280,7 +280,7 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, createUnGraph(g.g, true, true, ug, old2new, newIdx2old); // Construct reverse mapping. - ue2::unordered_map new2old; + ue2::unordered_map new2old; for (const auto &m : old2new) { new2old.emplace(m.second, m.first); } @@ -308,7 +308,7 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, // Collect vertex lists per component. 
for (const auto &m : split_components) { - NFAVertex uv = m.first; + NFAUndirectedVertex uv = m.first; u32 c = m.second; assert(contains(new2old, uv)); NFAVertex v = new2old.at(uv); diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp index 3015af4c..997191d2 100644 --- a/src/nfagraph/ng_rose.cpp +++ b/src/nfagraph/ng_rose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -2245,7 +2245,7 @@ bool improveLHS(RoseInGraph &ig, const vector &edges, const vector &local = by_src[v]; vector graphs; - map > by_graph; + map > by_graph; for (const auto &e : local) { NGHolder *gp = ig[e].graph.get(); if (!contains(by_graph, gp)) { diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 23c122a7..ae155361 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -315,7 +315,7 @@ void createVertices(RoseBuildImpl *tbi, w = created[key]; } - NFAVertex p = pv.first; + RoseVertex p = pv.first; RoseEdge e; bool added; @@ -943,7 +943,7 @@ void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { const vector &images = vertex_map[u]; // We should have no dupes. - assert(set(images.begin(), images.end()).size() + assert(set(images.begin(), images.end()).size() == images.size()); for (auto v_image : images) { @@ -1133,7 +1133,7 @@ u32 maxAvailableDelay(const ue2_literal &pred_key, const ue2_literal &lit_key) { } static -u32 findMaxSafeDelay(const RoseInGraph &ig, RoseInVertex u, RoseVertex v) { +u32 findMaxSafeDelay(const RoseInGraph &ig, RoseInVertex u, RoseInVertex v) { // First, check the overlap constraints on (u,v). size_t max_delay; if (ig[v].type == RIV_LITERAL) { diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 83c49556..12aadd72 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -359,7 +359,7 @@ bool isDirectHighlander(const RoseBuildImpl &build, const u32 id, // Called by isNoRunsLiteral below. 
static -bool isNoRunsVertex(const RoseBuildImpl &build, NFAVertex u) { +bool isNoRunsVertex(const RoseBuildImpl &build, RoseVertex u) { const RoseGraph &g = build.g; if (!g[u].isBoring()) { DEBUG_PRINTF("u=%zu is not boring\n", g[u].idx); diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp index a10bc86e..759e0dbe 100644 --- a/src/rose/rose_build_merge.cpp +++ b/src/rose/rose_build_merge.cpp @@ -338,7 +338,7 @@ void findUncalcLeavesCandidates(RoseBuildImpl &tbi, deque &ordered) { const RoseGraph &g = tbi.g; - vector suffix_vertices; // vertices with suffix graphs + vector suffix_vertices; // vertices with suffix graphs ue2::unordered_map fcount; // ref count per graph for (auto v : vertices_range(g)) { diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 1f873403..8e883ab9 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -1185,8 +1185,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, ReportID new_report = tbi.getNewNfaReport(); shared_ptr new_graph = cloneHolder(*b_h); duplicateReport(*new_graph, b_left.leftfix_report, new_report); - pruneReportIfUnused(tbi, new_graph, set(), - b_left.leftfix_report); + pruneReportIfUnused(tbi, new_graph, {}, b_left.leftfix_report); rrm[a_left_id].erase(a); rrm[b_left_id].erase(b); From cb7067f59d085c3737e4c3406c36b01f2866a16f Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 30 May 2016 14:00:31 +1000 Subject: [PATCH 007/166] Prevent trying to build smallwrite engine for large cases --- src/grey.cpp | 6 +++++- src/grey.h | 4 +++- src/hs.cpp | 4 ++-- src/nfagraph/ng.cpp | 5 +++-- src/nfagraph/ng.h | 5 +++-- src/smallwrite/smallwrite_build.cpp | 20 +++++++++++++++----- src/smallwrite/smallwrite_build.h | 5 +++-- 7 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/grey.cpp b/src/grey.cpp index 69dab627..e2022e74 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -126,6 +126,8 @@ Grey::Grey(void) : // are given to rose &co smallWriteLargestBufferBad(35), limitSmallWriteOutfixSize(1048576), // 1 MB + smallWriteMaxPatterns(10000), + smallWriteMaxLiterals(10000), dumpFlags(0), limitPatternCount(8000000), // 8M patterns limitPatternLength(16000), // 16K bytes @@ -273,6 +275,8 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(smallWriteLargestBuffer); G_UPDATE(smallWriteLargestBufferBad); G_UPDATE(limitSmallWriteOutfixSize); + G_UPDATE(smallWriteMaxPatterns); + G_UPDATE(smallWriteMaxLiterals); G_UPDATE(limitPatternCount); G_UPDATE(limitPatternLength); G_UPDATE(limitGraphVertices); diff --git a/src/grey.h b/src/grey.h index a2261052..8ac9e6b1 100644 --- a/src/grey.h +++ b/src/grey.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -143,6 +143,8 @@ struct Grey { u32 smallWriteLargestBuffer; // largest buffer that can be small write u32 smallWriteLargestBufferBad;// largest buffer that can be small write u32 limitSmallWriteOutfixSize; //!< max total size of outfix DFAs + u32 smallWriteMaxPatterns; // only try small writes if fewer 
patterns + u32 smallWriteMaxLiterals; // only try small writes if fewer literals enum DumpFlags { DUMP_NONE = 0, diff --git a/src/hs.cpp b/src/hs.cpp index 3680e79e..07f6d2c1 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -219,7 +219,7 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, : get_current_target(); CompileContext cc(isStreaming, isVectored, target_info, g); - NG ng(cc, somPrecision); + NG ng(cc, elements, somPrecision); try { for (unsigned int i = 0; i < elements; i++) { diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index b4b34d74..5d4f1b97 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -75,14 +75,15 @@ using namespace std; namespace ue2 { -NG::NG(const CompileContext &in_cc, unsigned in_somPrecision) +NG::NG(const CompileContext &in_cc, size_t num_patterns, + unsigned in_somPrecision) : maxSomRevHistoryAvailable(in_cc.grey.somMaxRevNfaLength), minWidth(depth::infinity()), rm(in_cc.grey), ssm(in_somPrecision), cc(in_cc), rose(makeRoseBuilder(rm, ssm, cc, boundary)), - smwr(makeSmallWriteBuilder(rm, cc)) { + smwr(makeSmallWriteBuilder(num_patterns, rm, cc)) { } NG::~NG() { diff --git a/src/nfagraph/ng.h b/src/nfagraph/ng.h index 52353da9..95936fcc 100644 --- a/src/nfagraph/ng.h +++ b/src/nfagraph/ng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -87,7 +87,8 @@ class SmallWriteBuild; class NG : boost::noncopyable { public: - NG(const CompileContext &in_cc, unsigned in_somPrecision); + NG(const CompileContext &in_cc, size_t num_patterns, + unsigned in_somPrecision); ~NG(); /** \brief Consumes a pattern, returns false or throws a CompileError diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 792a3d5b..7fb54440 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -65,7 +65,8 @@ namespace { // unnamed // Concrete impl class class SmallWriteBuildImpl : public SmallWriteBuild { public: - SmallWriteBuildImpl(const ReportManager &rm, const CompileContext &cc); + SmallWriteBuildImpl(size_t num_patterns, const ReportManager &rm, + const CompileContext &cc); // Construct a runtime implementation. 
aligned_unique_ptr build(u32 roseQuality) override; @@ -87,11 +88,14 @@ public: SmallWriteBuild::~SmallWriteBuild() { } -SmallWriteBuildImpl::SmallWriteBuildImpl(const ReportManager &rm_in, +SmallWriteBuildImpl::SmallWriteBuildImpl(size_t num_patterns, + const ReportManager &rm_in, const CompileContext &cc_in) : rm(rm_in), cc(cc_in), /* small write is block mode only */ - poisoned(!cc.grey.allowSmallWrite || cc.streaming) { + poisoned(!cc.grey.allowSmallWrite + || cc.streaming + || num_patterns > cc.grey.smallWriteMaxPatterns) { } void SmallWriteBuildImpl::add(const NGWrapper &w) { @@ -163,6 +167,10 @@ void SmallWriteBuildImpl::add(const ue2_literal &literal, ReportID r) { } cand_literals.push_back(make_pair(literal, r)); + + if (cand_literals.size() > cc.grey.smallWriteMaxLiterals) { + poisoned = true; + } } static @@ -181,6 +189,7 @@ void lit_to_graph(NGHolder *h, const ue2_literal &literal, ReportID r) { bool SmallWriteBuildImpl::determiniseLiterals() { DEBUG_PRINTF("handling literals\n"); assert(!poisoned); + assert(cand_literals.size() <= cc.grey.smallWriteMaxLiterals); if (cand_literals.empty()) { return true; /* nothing to do */ @@ -352,9 +361,10 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, } // SmallWriteBuild factory -unique_ptr makeSmallWriteBuilder(const ReportManager &rm, +unique_ptr makeSmallWriteBuilder(size_t num_patterns, + const ReportManager &rm, const CompileContext &cc) { - return ue2::make_unique(rm, cc); + return ue2::make_unique(num_patterns, rm, cc); } aligned_unique_ptr diff --git a/src/smallwrite/smallwrite_build.h b/src/smallwrite/smallwrite_build.h index 9c3de9d3..59a8528a 100644 --- a/src/smallwrite/smallwrite_build.h +++ b/src/smallwrite/smallwrite_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,7 +64,8 @@ public: }; // Construct a usable SmallWrite builder. -std::unique_ptr makeSmallWriteBuilder(const ReportManager &rm, +std::unique_ptr makeSmallWriteBuilder(size_t num_patterns, + const ReportManager &rm, const CompileContext &cc); size_t smwrSize(const SmallWriteEngine *t); From a8aa2d022f196cc1804787899fbe1fd121c008a7 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Tue, 31 May 2016 14:46:09 +1000 Subject: [PATCH 008/166] Provide access to scratch in queues This largely reverts commit f8ecf33236b0695d9539e8e34cf0bb0467614cdf. --- src/nfa/nfa_api_queue.h | 1 + src/scratch.c | 5 +++++ unit/internal/lbr.cpp | 1 + unit/internal/limex_nfa.cpp | 2 ++ 4 files changed, 9 insertions(+) diff --git a/src/nfa/nfa_api_queue.h b/src/nfa/nfa_api_queue.h index 1373425d..59c18fca 100644 --- a/src/nfa/nfa_api_queue.h +++ b/src/nfa/nfa_api_queue.h @@ -91,6 +91,7 @@ struct mq { * history buffer; (logically) immediately before the * main buffer */ size_t hlength; /**< length of the history buffer */ + struct hs_scratch *scratch; /**< global scratch space */ char report_current; /**< * report_current matches at starting offset through * callback. If true, the queue must be located at a diff --git a/src/scratch.c b/src/scratch.c index d8742e7d..dae2c672 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -227,6 +227,11 @@ hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { // Don't get too big for your boots assert((size_t)(current - (char *)s) <= alloc_size); + // Init q->scratch ptr for every queue. 
+    for (struct mq *qi = s->queues; qi != s->queues + queueCount; ++qi) {
+        qi->scratch = s;
+    }
+
     return HS_SUCCESS;
 }
 
diff --git a/unit/internal/lbr.cpp b/unit/internal/lbr.cpp
index 2bb359df..f335e184 100644
--- a/unit/internal/lbr.cpp
+++ b/unit/internal/lbr.cpp
@@ -122,6 +122,7 @@ protected:
         q.length = 0; // filled in by test
         q.history = nullptr;
         q.hlength = 0;
+        q.scratch = nullptr; // not needed by LBR
         q.report_current = 0;
         q.cb = onMatch;
         q.som_cb = nullptr; // only used by Haig
diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp
index 91ab09db..9d3c00b5 100644
--- a/unit/internal/limex_nfa.cpp
+++ b/unit/internal/limex_nfa.cpp
@@ -102,6 +102,7 @@ protected:
         q.length = SCAN_DATA.size();
         q.history = nullptr;
         q.hlength = 0;
+        q.scratch = nullptr; /* limex does not use scratch */
        q.report_current = 0;
         q.cb = onMatch;
         q.som_cb = nullptr; // only used by Haig
@@ -396,6 +397,7 @@ protected:
         q.length = ZOMBIE_SCAN_DATA.length();
         q.history = nullptr;
         q.hlength = 0;
+        q.scratch = nullptr; /* limex does not use scratch */
         q.report_current = 0;
         q.cb = onMatch;
         q.som_cb = nullptr; // only used by Haig
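Patch 008 above restores a back-pointer from each queue to the scratch space that owns it, wired up once when scratch is allocated, so engine code that is handed only a queue can reach global scratch without extra parameter plumbing. A rough sketch of the same pattern (hypothetical names, not the Hyperscan API):

#include <vector>

struct Scratch;

struct Queue {
    Scratch *scratch = nullptr; // back-pointer to the owning scratch space
};

struct Scratch {
    std::vector<Queue> queues;

    explicit Scratch(size_t queue_count) : queues(queue_count) {
        // Wire every queue back to its owner once, at allocation time;
        // a real implementation must re-wire if the owner is moved.
        for (auto &q : queues) {
            q.scratch = this;
        }
    }
};
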
From beec5e59dfd24d75dad1d2982cdfff0814fd4145 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Fri, 27 May 2016 16:51:41 +1000
Subject: [PATCH 009/166] rose: linear scan for lookaround during build

This allows us to reuse more lookaround entries in the bytecode.
---
 src/rose/rose_build_bytecode.cpp | 40 ++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index dac2e79c..578c4b4a 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -2434,6 +2434,33 @@ bool onlyAtEod(const RoseBuildImpl &tbi, RoseVertex v) {
     return true;
 }
 
+static
+u32 addLookaround(build_context &bc, const vector<LookEntry> &look) {
+    // Check the cache.
+    auto it = bc.lookaround_cache.find(look);
+    if (it != bc.lookaround_cache.end()) {
+        DEBUG_PRINTF("reusing look at idx %zu\n", it->second);
+        return verify_u32(it->second);
+    }
+
+    // Linear scan for sequence.
+    auto seq_it = search(begin(bc.lookaround), end(bc.lookaround), begin(look),
+                         end(look));
+    if (seq_it != end(bc.lookaround)) {
+        size_t idx = distance(begin(bc.lookaround), seq_it);
+        DEBUG_PRINTF("linear scan found look at idx %zu\n", idx);
+        bc.lookaround_cache.emplace(look, idx);
+        return verify_u32(idx);
+    }
+
+    // New sequence.
+    size_t idx = bc.lookaround.size();
+    bc.lookaround_cache.emplace(look, idx);
+    insert(&bc.lookaround, bc.lookaround.end(), look);
+    DEBUG_PRINTF("adding look at idx %zu\n", idx);
+    return verify_u32(idx);
+}
+
 static
 void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v,
                         vector<RoseInstruction> &program) {
@@ -2460,18 +2487,7 @@
     }
 
     DEBUG_PRINTF("role has lookaround\n");
-    u32 look_idx;
-    auto it = bc.lookaround_cache.find(look);
-    if (it != bc.lookaround_cache.end()) {
-        DEBUG_PRINTF("reusing look at idx %zu\n", it->second);
-        look_idx = verify_u32(it->second);
-    } else {
-        size_t idx = bc.lookaround.size();
-        bc.lookaround_cache.emplace(look, idx);
-        insert(&bc.lookaround, bc.lookaround.end(), look);
-        DEBUG_PRINTF("adding look at idx %zu\n", idx);
-        look_idx = verify_u32(idx);
-    }
+    u32 look_idx = addLookaround(bc, look);
     u32 look_count = verify_u32(look.size());
 
     auto ri = RoseInstruction(ROSE_INSTR_CHECK_LOOKAROUND,
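The addLookaround helper in patch 009 trades a little compile time for smaller bytecode: before appending a sequence to the global lookaround table, it consults an exact-match cache and then runs std::search over the table already emitted, so a sequence that occurs as a contiguous subrange of earlier entries is shared rather than duplicated. A simplified sketch of the same reuse scheme, with plain int entries and std::map standing in for LookEntry and the build_context cache:

#include <algorithm>
#include <map>
#include <vector>

static size_t add_sequence(std::vector<int> &table,
                           std::map<std::vector<int>, size_t> &cache,
                           const std::vector<int> &seq) {
    auto cached = cache.find(seq);
    if (cached != cache.end()) {
        return cached->second; // exact sequence seen before
    }
    // Linear scan: the sequence may already exist as a subrange.
    auto pos = std::search(table.begin(), table.end(), seq.begin(), seq.end());
    if (pos != table.end()) {
        size_t idx = pos - table.begin();
        cache.emplace(seq, idx);
        return idx;
    }
    // New sequence: append it and remember where it went.
    size_t idx = table.size();
    table.insert(table.end(), seq.begin(), seq.end());
    cache.emplace(seq, idx);
    return idx;
}
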
From 6a6b0e5da6f20f6d3842295cba5f13bcbc62fa53 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Fri, 27 May 2016 15:05:24 +1000
Subject: [PATCH 010/166] rose: Do HWLM advisory masks as a complete pass

---
 src/rose/rose_build_compile.cpp  |  3 ++
 src/rose/rose_build_matchers.cpp | 83 +++++++++++++++++++++++++++-----
 src/rose/rose_build_matchers.h   |  2 +
 3 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp
index 12500599..7f350109 100644
--- a/src/rose/rose_build_compile.cpp
+++ b/src/rose/rose_build_compile.cpp
@@ -34,6 +34,7 @@
 #include "rose_build_castle.h"
 #include "rose_build_convert.h"
 #include "rose_build_dump.h"
+#include "rose_build_matchers.h"
 #include "rose_build_merge.h"
 #include "rose_build_role_aliasing.h"
 #include "rose_build_util.h"
@@ -2181,6 +2182,8 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildRose(u32 minWidth) {
 
     assert(!danglingVertexRef(*this));
 
+    findMoreLiteralMasks(*this);
+
     assignGroupsToLiterals();
     assignGroupsToRoles();
     findGroupSquashers(*this);
diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp
index 12aadd72..f4597de7 100644
--- a/src/rose/rose_build_matchers.cpp
+++ b/src/rose/rose_build_matchers.cpp
@@ -333,6 +333,73 @@ bool findHamsterMask(const RoseBuildImpl &build, const rose_literal_id &id,
     return true;
 }
 
+void findMoreLiteralMasks(RoseBuildImpl &build) {
+    if (!build.cc.grey.roseHamsterMasks) {
+        return;
+    }
+
+    vector<u32> candidates;
+    for (const auto &e : build.literals.right) {
+        const u32 id = e.first;
+        const auto &lit = e.second;
+
+        // This pass takes place before final IDs are assigned to literals.
+        assert(!build.hasFinalId(id));
+
+        if (lit.delay || build.isDelayed(id)) {
+            continue;
+        }
+
+        // Literal masks are only allowed for literals that will end up in an
+        // HWLM table.
+        switch (lit.table) {
+        case ROSE_FLOATING:
+        case ROSE_EOD_ANCHORED:
+        case ROSE_ANCHORED_SMALL_BLOCK:
+            break;
+        default:
+            continue;
+        }
+
+        if (!lit.msk.empty()) {
+            continue;
+        }
+
+        const auto &lit_info = build.literal_info.at(id);
+        if (lit_info.requires_benefits) {
+            continue;
+        }
+        candidates.push_back(id);
+    }
+
+    for (const u32 &id : candidates) {
+        const auto &lit = build.literals.right.at(id);
+        auto &lit_info = build.literal_info.at(id);
+
+        vector<u8> msk, cmp;
+        if (!findHamsterMask(build, lit, lit_info, msk, cmp)) {
+            continue;
+        }
+        assert(!msk.empty());
+        DEBUG_PRINTF("found advisory mask for lit_id=%u\n", id);
+        u32 new_id = build.getLiteralId(lit.s, msk, cmp, lit.delay, lit.table);
+        assert(new_id != id);
+        DEBUG_PRINTF("replacing with new lit_id=%u\n", new_id);
+
+        // Note that our new literal may already exist and have vertices, etc.
+        // We assume that this transform is happening prior to group
+        // assignment.
+        assert(lit_info.group_mask == 0);
+        auto &new_info = build.literal_info.at(new_id);
+        new_info.vertices.insert(begin(lit_info.vertices),
+                                 end(lit_info.vertices));
+        for (auto v : lit_info.vertices) {
+            build.g[v].literals.erase(id);
+            build.g[v].literals.insert(new_id);
+        }
+        lit_info.vertices.clear();
+    }
+}
+
 static
 bool isDirectHighlander(const RoseBuildImpl &build, const u32 id,
                         const rose_literal_info &info) {
@@ -472,17 +539,8 @@ vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build,
 
         DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str());
 
-        vector<u8> msk = e.second.msk; // copy
-        vector<u8> cmp = e.second.cmp; // copy
-
-        if (msk.empty()) {
-            // Try and pick up an advisory mask.
-            if (!findHamsterMask(build, e.second, info, msk, cmp)) {
-                msk.clear(); cmp.clear();
-            } else {
-                DEBUG_PRINTF("picked up late mask %zu\n", msk.size());
-            }
-        }
 
+        const vector<u8> &msk = e.second.msk;
+        const vector<u8> &cmp = e.second.cmp;
 
         bool noruns = isNoRunsLiteral(build, id, info);
 
@@ -514,8 +572,7 @@
             continue;
         }
 
-        lits.emplace_back(lit.get_string(), lit.any_nocase(), noruns,
-                          final_id, groups, msk, cmp);
+        lits.emplace_back(s, nocase, noruns, final_id, groups, msk, cmp);
     }
 }
 
diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h
index 9781f514..1dd53cd8 100644
--- a/src/rose/rose_build_matchers.h
+++ b/src/rose/rose_build_matchers.h
@@ -58,6 +58,8 @@
 aligned_unique_ptr<HWLM> buildEodAnchoredMatcher(const RoseBuildImpl &build,
                                                  size_t *esize);
 
+void findMoreLiteralMasks(RoseBuildImpl &build);
+
 } // namespace ue2
 
 #endif // ROSE_BUILD_MATCHERS_H

From c2496fbf769cbba62bebc252c1fb4b3d81480eca Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Tue, 31 May 2016 15:24:59 +1000
Subject: [PATCH 011/166] rose: elide SET_GROUPS when possible

---
 CMakeLists.txt                   |   2 +
 src/rose/rose_build_bytecode.cpp |  57 ++++++++++++---
 src/rose/rose_build_groups.cpp   | 116 +++++++++++++++++++++++++++++++
 src/rose/rose_build_groups.h     |  50 +++++++++++++
 4 files changed, 216 insertions(+), 9 deletions(-)
 create mode 100644 src/rose/rose_build_groups.cpp
 create mode 100644 src/rose/rose_build_groups.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c824b6a6..536be260 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -825,6 +825,8 @@ SET (hs_SRCS
     src/rose/rose_build_compile.cpp
     src/rose/rose_build_convert.cpp
     src/rose/rose_build_convert.h
+    src/rose/rose_build_groups.cpp
+    src/rose/rose_build_groups.h
     src/rose/rose_build_impl.h
     src/rose/rose_build_infix.cpp
src/rose/rose_build_infix.h diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 578c4b4a..3d1b5c6b 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -33,6 +33,7 @@ #include "hs_compile.h" // for HS_MODE_* #include "rose_build_add_internal.h" #include "rose_build_anchored.h" +#include "rose_build_groups.h" #include "rose_build_infix.h" #include "rose_build_lookaround.h" #include "rose_build_matchers.h" @@ -408,6 +409,13 @@ struct build_context : boost::noncopyable { /** \brief Resources in use (tracked as programs are added). */ RoseResources resources; + /** \brief Mapping from every vertex to the groups that must be on for that + * vertex to be reached. */ + ue2::unordered_map vertex_group_map; + + /** \brief Global bitmap of groups that can be squashed. */ + rose_group squashable_groups = 0; + /** \brief Base offset of engine_blob in the Rose engine bytecode. */ static constexpr u32 engine_blob_base = ROUNDUP_CL(sizeof(RoseEngine)); }; @@ -2911,11 +2919,38 @@ void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, } static -void makeRoleGroups(const rose_group &groups, +void makeRoleGroups(RoseBuildImpl &build, build_context &bc, RoseVertex v, vector &program) { + const auto &g = build.g; + rose_group groups = g[v].groups; if (!groups) { return; } + + // The set of "already on" groups as we process this vertex is the + // intersection of the groups set by our predecessors. + assert(in_degree(v, g) > 0); + rose_group already_on = ~rose_group{0}; + for (const auto &u : inv_adjacent_vertices_range(v, g)) { + already_on &= bc.vertex_group_map.at(u); + } + + DEBUG_PRINTF("already_on=0x%llx\n", already_on); + DEBUG_PRINTF("squashable=0x%llx\n", bc.squashable_groups); + DEBUG_PRINTF("groups=0x%llx\n", groups); + + already_on &= ~bc.squashable_groups; + DEBUG_PRINTF("squashed already_on=0x%llx\n", already_on); + + // We don't *have* to mask off the groups that we know are already on, but + // this will make bugs more apparent. + groups &= ~already_on; + + if (!groups) { + DEBUG_PRINTF("no new groups to set, skipping\n"); + return; + } + auto ri = RoseInstruction(ROSE_INSTR_SET_GROUPS); ri.u.setGroups.groups = groups; program.push_back(ri); @@ -3086,11 +3121,12 @@ vector makeProgram(RoseBuildImpl &build, build_context &bc, // Next, we can add program instructions that have effects. makeRoleReports(build, bc, v, program); + makeRoleInfixTriggers(build, bc, v, program); // Note: SET_GROUPS instruction must be after infix triggers, as an infix // going dead may switch off groups. 
- makeRoleGroups(g[v].groups, program); + makeRoleGroups(build, bc, v, program); makeRoleSuffix(build, bc, v, program); makeRoleSetState(bc, v, program); @@ -3457,8 +3493,7 @@ void makePushDelayedInstructions(const RoseBuildImpl &build, u32 final_id, } static -void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id, - vector &program) { +rose_group getFinalIdGroupsUnion(const RoseBuildImpl &build, u32 final_id) { assert(contains(build.final_id_to_literal, final_id)); const auto &lit_infos = getLiteralInfoByFinalId(build, final_id); @@ -3466,7 +3501,13 @@ void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id, for (const auto &li : lit_infos) { groups |= li->group_mask; } + return groups; +} +static +void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id, + vector &program) { + rose_group groups = getFinalIdGroupsUnion(build, final_id); if (!groups) { return; } @@ -3515,11 +3556,7 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id, return; } - rose_group groups = 0; - for (const auto &li : lit_infos) { - groups |= li->group_mask; - } - + rose_group groups = getFinalIdGroupsUnion(build, final_id); if (!groups) { return; } @@ -3999,6 +4036,8 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { bc.resources.has_anchored = true; } bc.needs_mpv_catchup = needsMpvCatchup(*this); + bc.vertex_group_map = getVertexGroupMap(*this); + bc.squashable_groups = getSquashableGroups(*this); auto boundary_out = makeBoundaryPrograms(*this, bc, boundary, dboundary); diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp new file mode 100644 index 00000000..f99ac171 --- /dev/null +++ b/src/rose/rose_build_groups.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose build: code for analysing literal groups. 
+ */ + +#include "rose_build_groups.h" + +#include + +#include +#include + +using namespace std; + +namespace ue2 { + +/** + * \brief Returns a mapping from each graph vertex v to the intersection of the + * groups switched on by all of the paths leading up to (and including) v from + * the start vertexes. + */ +unordered_map +getVertexGroupMap(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + vector v_order; + v_order.reserve(num_vertices(g)); + + boost::topological_sort(g, back_inserter(v_order), + vertex_index_map(get(&RoseVertexProps::idx, g))); + + unordered_map vertex_group_map; + vertex_group_map.reserve(num_vertices(g)); + + const rose_group initial_groups = build.getInitialGroups(); + + for (const auto &v : boost::adaptors::reverse(v_order)) { + DEBUG_PRINTF("vertex %zu\n", g[v].idx); + + if (build.isAnyStart(v)) { + DEBUG_PRINTF("start vertex, groups=0x%llx\n", initial_groups); + vertex_group_map.emplace(v, initial_groups); + continue; + } + + // To get to this vertex, we must have come through a predecessor, and + // everyone who isn't a start vertex has one. + assert(in_degree(v, g) > 0); + rose_group pred_groups = ~rose_group{0}; + for (auto u : inv_adjacent_vertices_range(v, g)) { + DEBUG_PRINTF("pred %zu\n", g[u].idx); + assert(contains(vertex_group_map, u)); + pred_groups &= vertex_group_map.at(u); + } + + DEBUG_PRINTF("pred_groups=0x%llx\n", pred_groups); + DEBUG_PRINTF("g[v].groups=0x%llx\n", g[v].groups); + + rose_group v_groups = pred_groups | g[v].groups; + DEBUG_PRINTF("v_groups=0x%llx\n", v_groups); + + vertex_group_map.emplace(v, v_groups); + } + + return vertex_group_map; +} + +/** + * \brief Find the set of groups that can be squashed anywhere in the graph, + * either by a literal or by a leftfix. + */ +rose_group getSquashableGroups(const RoseBuildImpl &build) { + rose_group squashable_groups = 0; + for (const auto &info : build.literal_info) { + if (info.squash_group) { + DEBUG_PRINTF("lit squash mask 0x%llx\n", info.group_mask); + squashable_groups |= info.group_mask; + } + } + for (const auto &m : build.rose_squash_masks) { + DEBUG_PRINTF("left squash mask 0x%llx\n", ~m.second); + squashable_groups |= ~m.second; + } + + DEBUG_PRINTF("squashable groups=0x%llx\n", squashable_groups); + return squashable_groups; +} + +} // namespace ue2 diff --git a/src/rose/rose_build_groups.h b/src/rose/rose_build_groups.h new file mode 100644 index 00000000..608eda4a --- /dev/null +++ b/src/rose/rose_build_groups.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose build: code for analysing literal groups. + */ + +#ifndef ROSE_BUILD_GROUPS_H +#define ROSE_BUILD_GROUPS_H + +#include "rose_build_impl.h" +#include "util/ue2_containers.h" + +namespace ue2 { + +unordered_map +getVertexGroupMap(const RoseBuildImpl &build); + +rose_group getSquashableGroups(const RoseBuildImpl &build); + +} // namespace ue2 + +#endif // ROSE_BUILD_GROUPS_H + From 2b24000b1a14985efddd171fa9749e8f10018b62 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 13:10:42 +1000 Subject: [PATCH 012/166] rose_build_groups: move assignGroupsToLiterals --- src/rose/rose_build_compile.cpp | 294 -------------------------------- src/rose/rose_build_groups.cpp | 293 +++++++++++++++++++++++++++++++ 2 files changed, 293 insertions(+), 294 deletions(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 7f350109..a6d18f67 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -69,7 +69,6 @@ #include #include #include -#include #include #include #include @@ -78,41 +77,16 @@ #include using namespace std; -using boost::adaptors::map_keys; using boost::adaptors::map_values; namespace ue2 { -#define ROSE_LONG_LITERAL_LEN 8 - #define ANCHORED_REHOME_MIN_FLOATING 800 #define ANCHORED_REHOME_MIN_FLOATING_SHORT 50 #define ANCHORED_REHOME_ALLOW_SHORT 20 #define ANCHORED_REHOME_DEEP 25 #define ANCHORED_REHOME_SHORT_LEN 3 -static -bool superStrong(const rose_literal_id &lit) { - if (lit.s.length() < ROSE_LONG_LITERAL_LEN) { - return false; - } - - const u32 EXPECTED_FDR_BUCKET_LENGTH = 8; - - assert(lit.s.length() >= EXPECTED_FDR_BUCKET_LENGTH); - size_t len = lit.s.length(); - const string &s = lit.s.get_string(); - - for (size_t i = 1; i < EXPECTED_FDR_BUCKET_LENGTH; i++) { - if (s[len - 1 - i] != s[len - 1]) { - return true; /* we have at least some variation in the tail */ - } - } - DEBUG_PRINTF("lit '%s' is not superstrong due to tail\n", - escapeString(s).c_str()); - return false; -} - rose_group RoseBuildImpl::getGroups(RoseVertex v) const { rose_group groups = 0; @@ -863,274 +837,6 @@ bool RoseBuildImpl::hasFinalId(u32 id) const { return literal_info.at(id).final_id != MO_INVALID_IDX; } -static -bool eligibleForAlwaysOnGroup(const RoseBuildImpl &tbi, u32 id) { - /* returns true if it or any of its delay versions have root role */ - for (auto v : tbi.literal_info[id].vertices) { - if (tbi.isRootSuccessor(v)) { - NGHolder *h = tbi.g[v].left.graph.get(); - if (!h || proper_out_degree(h->startDs, *h)) { - return true; - } - } - } - - for (u32 delayed_id : tbi.literal_info[id].delayed_ids) { - for (auto v : tbi.literal_info[delayed_id].vertices) { - if (tbi.isRootSuccessor(v)) { - NGHolder *h = tbi.g[v].left.graph.get(); - if (!h || proper_out_degree(h->startDs, *h)) { - return true; - } - } - } - } - - return false; -} - -static -bool requires_group_assignment(const rose_literal_id &lit, - const rose_literal_info &info) { - if (lit.delay) { /* we will 
check the shadow's master */ - return false; - } - - if (lit.table == ROSE_ANCHORED || lit.table == ROSE_EVENT) { - return false; - } - - // If we already have a group applied, skip. - if (info.group_mask) { - return false; - } - - if (info.vertices.empty() && info.delayed_ids.empty()) { - DEBUG_PRINTF("literal is good for nothing\n"); - return false; - } - - return true; -} - -static -rose_group calcLocalGroup(const RoseVertex v, const RoseGraph &g, - const deque &literal_info, - const bool small_literal_count) { - rose_group local_group = 0; - - for (auto u : inv_adjacent_vertices_range(v, g)) { - /* In small cases, ensure that siblings have the same rose parentage to - * allow rose squashing. In larger cases, don't do this as groups are - * probably too scarce. */ - for (auto w : adjacent_vertices_range(u, g)) { - if (!small_literal_count || g[v].left == g[w].left) { - for (u32 lit_id : g[w].literals) { - local_group |= literal_info[lit_id].group_mask; - } - } else { - DEBUG_PRINTF("not sibling different mother %zu %zu\n", - g[v].idx, g[w].idx); - } - } - } - - return local_group; -} - -/* group constants */ -#define MAX_LIGHT_LITERAL_CASE 200 /* allow rose to affect group decisions below - * this */ - -static -flat_set getAssociatedVertices(const RoseBuildImpl &build, u32 id) { - flat_set out; - const auto &info = build.literal_info[id]; - insert(&out, info.vertices); - for (const auto &delayed : info.delayed_ids) { - insert(&out, build.literal_info[delayed].vertices); - } - return out; -} - -static -u32 next_available_group(u32 counter, u32 min_start_group) { - counter++; - if (counter == ROSE_GROUPS_MAX) { - DEBUG_PRINTF("resetting groups\n"); - counter = min_start_group; - } - - return counter; -} - -// Assigns groups to literals in the general case, when we have more literals -// than available groups. -void RoseBuildImpl::assignGroupsToLiterals() { - bool small_literal_count = literal_info.size() <= MAX_LIGHT_LITERAL_CASE; - - map groupCount; /* group index to number of members */ - - u32 counter = 0; - u32 group_always_on = 0; - - // First pass: handle always on literals. - for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; - rose_literal_info &info = literal_info[id]; - - if (!requires_group_assignment(lit, info)) { - continue; - } - - // If this literal has a root role, we always have to search for it - // anyway, so it goes in the always-on group. - /* We could end up squashing it if it is followed by a .* */ - if (eligibleForAlwaysOnGroup(*this, id)) { - info.group_mask = 1ULL << group_always_on; - groupCount[group_always_on]++; - continue; - } - } - - u32 group_long_lit; - if (groupCount[group_always_on]) { - DEBUG_PRINTF("%u always on literals\n", groupCount[group_always_on]); - group_long_lit = group_always_on; - counter++; - } else { - group_long_lit = counter; - counter++; - } - - u32 min_start_group = counter; - priority_queue, u32> > pq; - - // Second pass: the other literals. 
- for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; - rose_literal_info &info = literal_info[id]; - - if (!requires_group_assignment(lit, info)) { - continue; - } - - assert(!eligibleForAlwaysOnGroup(*this, id)); - pq.push(make_pair(make_pair(-(s32)literal_info[id].vertices.size(), - -(s32)lit.s.length()), id)); - } - - vector long_lits; - while (!pq.empty()) { - u32 id = pq.top().second; - pq.pop(); - UNUSED const rose_literal_id &lit = literals.right.at(id); - DEBUG_PRINTF("assigning groups to lit %u (v %zu l %zu)\n", id, - literal_info[id].vertices.size(), lit.s.length()); - - u8 group_id = 0; - rose_group group = ~0ULL; - for (auto v : getAssociatedVertices(*this, id)) { - rose_group local_group = calcLocalGroup(v, g, literal_info, - small_literal_count); - group &= local_group; - if (!group) { - break; - } - } - - if (group == ~0ULL) { - goto boring; - } - - group &= ~((1ULL << min_start_group) - 1); /* ensure the purity of the - * always_on groups */ - if (!group) { - goto boring; - } - - group_id = ctz64(group); - - /* TODO: fairness */ - DEBUG_PRINTF("picking sibling group %hhd\n", group_id); - literal_info[id].group_mask = 1ULL << group_id; - groupCount[group_id]++; - - continue; - - boring: - /* long literals will either be stuck in a mega group or spread around - * depending on availability */ - if (superStrong(lit)) { - long_lits.push_back(id); - continue; - } - - // Other literals are assigned to our remaining groups round-robin. - group_id = counter; - - DEBUG_PRINTF("picking boring group %hhd\n", group_id); - literal_info[id].group_mask = 1ULL << group_id; - groupCount[group_id]++; - counter = next_available_group(counter, min_start_group); - } - - /* spread long literals out amongst unused groups if any, otherwise stick - * them in the always on the group */ - - if (groupCount[counter]) { - DEBUG_PRINTF("sticking long literals in the image of the always on\n"); - for (u32 lit_id : long_lits) { - literal_info[lit_id].group_mask = 1ULL << group_long_lit; - groupCount[group_long_lit]++; - } - } else { - u32 min_long_counter = counter; - DEBUG_PRINTF("base long lit group = %u\n", min_long_counter); - for (u32 lit_id : long_lits) { - u8 group_id = counter; - literal_info[lit_id].group_mask = 1ULL << group_id; - groupCount[group_id]++; - counter = next_available_group(counter, min_long_counter); - } - } - - /* assign delayed literals to the same group as their parent */ - for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; - - if (!lit.delay) { - continue; - } - - u32 parent = literal_info[id].undelayed_id; - DEBUG_PRINTF("%u is shadow picking up groups from %u\n", id, parent); - assert(literal_info[parent].undelayed_id == parent); - assert(literal_info[parent].group_mask); - literal_info[id].group_mask = literal_info[parent].group_mask; - /* don't increment the group count - these don't really exist */ - } - - DEBUG_PRINTF("populate group to literal mapping\n"); - for (const u32 id : literals.right | map_keys) { - rose_group groups = literal_info[id].group_mask; - while (groups) { - u32 group_id = findAndClearLSB_64(&groups); - group_to_literal[group_id].insert(id); - } - } - - /* find how many groups we allocated */ - for (u32 i = 0; i < ROSE_GROUPS_MAX; i++) { - if (groupCount[i]) { - group_end = MAX(group_end, i + 1); - } - } -} - bool RoseBuildImpl::hasDelayedLiteral(RoseVertex v) const { for (u32 lit_id : g[v].literals) { if (literals.right.at(lit_id).delay) { diff --git 
a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index f99ac171..127731be 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -33,15 +33,308 @@ #include "rose_build_groups.h" +#include #include #include +#include #include using namespace std; +using boost::adaptors::map_keys; namespace ue2 { +#define ROSE_LONG_LITERAL_LEN 8 + +static +bool superStrong(const rose_literal_id &lit) { + if (lit.s.length() < ROSE_LONG_LITERAL_LEN) { + return false; + } + + const u32 EXPECTED_FDR_BUCKET_LENGTH = 8; + + assert(lit.s.length() >= EXPECTED_FDR_BUCKET_LENGTH); + size_t len = lit.s.length(); + const string &s = lit.s.get_string(); + + for (size_t i = 1; i < EXPECTED_FDR_BUCKET_LENGTH; i++) { + if (s[len - 1 - i] != s[len - 1]) { + return true; /* we have at least some variation in the tail */ + } + } + DEBUG_PRINTF("lit '%s' is not superstrong due to tail\n", + escapeString(s).c_str()); + return false; +} + +static +bool eligibleForAlwaysOnGroup(const RoseBuildImpl &build, u32 id) { + /* returns true if it or any of its delay versions have root role */ + for (auto v : build.literal_info[id].vertices) { + if (build.isRootSuccessor(v)) { + NGHolder *h = build.g[v].left.graph.get(); + if (!h || proper_out_degree(h->startDs, *h)) { + return true; + } + } + } + + for (u32 delayed_id : build.literal_info[id].delayed_ids) { + for (auto v : build.literal_info[delayed_id].vertices) { + if (build.isRootSuccessor(v)) { + NGHolder *h = build.g[v].left.graph.get(); + if (!h || proper_out_degree(h->startDs, *h)) { + return true; + } + } + } + } + + return false; +} + +static +bool requires_group_assignment(const rose_literal_id &lit, + const rose_literal_info &info) { + if (lit.delay) { /* we will check the shadow's master */ + return false; + } + + if (lit.table == ROSE_ANCHORED || lit.table == ROSE_EVENT) { + return false; + } + + // If we already have a group applied, skip. + if (info.group_mask) { + return false; + } + + if (info.vertices.empty() && info.delayed_ids.empty()) { + DEBUG_PRINTF("literal is good for nothing\n"); + return false; + } + + return true; +} + +static +rose_group calcLocalGroup(const RoseVertex v, const RoseGraph &g, + const deque &literal_info, + const bool small_literal_count) { + rose_group local_group = 0; + + for (auto u : inv_adjacent_vertices_range(v, g)) { + /* In small cases, ensure that siblings have the same rose parentage to + * allow rose squashing. In larger cases, don't do this as groups are + * probably too scarce. 
         */
+        for (auto w : adjacent_vertices_range(u, g)) {
+            if (!small_literal_count || g[v].left == g[w].left) {
+                for (u32 lit_id : g[w].literals) {
+                    local_group |= literal_info[lit_id].group_mask;
+                }
+            } else {
+                DEBUG_PRINTF("not sibling different mother %zu %zu\n",
+                             g[v].idx, g[w].idx);
+            }
+        }
+    }
+
+    return local_group;
+}
+
+/* group constants */
+#define MAX_LIGHT_LITERAL_CASE 200 /* allow rose to affect group decisions below
+                                    * this */
+
+static
+flat_set<RoseVertex> getAssociatedVertices(const RoseBuildImpl &build, u32 id) {
+    flat_set<RoseVertex> out;
+    const auto &info = build.literal_info[id];
+    insert(&out, info.vertices);
+    for (const auto &delayed : info.delayed_ids) {
+        insert(&out, build.literal_info[delayed].vertices);
+    }
+    return out;
+}
+
+static
+u32 next_available_group(u32 counter, u32 min_start_group) {
+    counter++;
+    if (counter == ROSE_GROUPS_MAX) {
+        DEBUG_PRINTF("resetting groups\n");
+        counter = min_start_group;
+    }
+
+    return counter;
+}
+
+// Assigns groups to literals in the general case, when we have more literals
+// than available groups.
+void RoseBuildImpl::assignGroupsToLiterals() {
+    bool small_literal_count = literal_info.size() <= MAX_LIGHT_LITERAL_CASE;
+
+    map<u8, u32> groupCount; /* group index to number of members */
+
+    u32 counter = 0;
+    u32 group_always_on = 0;
+
+    // First pass: handle always on literals.
+    for (const auto &e : literals.right) {
+        u32 id = e.first;
+        const rose_literal_id &lit = e.second;
+        rose_literal_info &info = literal_info[id];
+
+        if (!requires_group_assignment(lit, info)) {
+            continue;
+        }
+
+        // If this literal has a root role, we always have to search for it
+        // anyway, so it goes in the always-on group.
+        /* We could end up squashing it if it is followed by a .* */
+        if (eligibleForAlwaysOnGroup(*this, id)) {
+            info.group_mask = 1ULL << group_always_on;
+            groupCount[group_always_on]++;
+            continue;
+        }
+    }
+
+    u32 group_long_lit;
+    if (groupCount[group_always_on]) {
+        DEBUG_PRINTF("%u always on literals\n", groupCount[group_always_on]);
+        group_long_lit = group_always_on;
+        counter++;
+    } else {
+        group_long_lit = counter;
+        counter++;
+    }
+
+    u32 min_start_group = counter;
+    priority_queue<pair<pair<s32, s32>, u32> > pq;
+
+    // Second pass: the other literals.
+    for (const auto &e : literals.right) {
+        u32 id = e.first;
+        const rose_literal_id &lit = e.second;
+        rose_literal_info &info = literal_info[id];
+
+        if (!requires_group_assignment(lit, info)) {
+            continue;
+        }
+
+        assert(!eligibleForAlwaysOnGroup(*this, id));
+        pq.push(make_pair(make_pair(-(s32)literal_info[id].vertices.size(),
+                                    -(s32)lit.s.length()), id));
+    }
+    vector<u32> long_lits;
+    while (!pq.empty()) {
+        u32 id = pq.top().second;
+        pq.pop();
+        UNUSED const rose_literal_id &lit = literals.right.at(id);
+        DEBUG_PRINTF("assigning groups to lit %u (v %zu l %zu)\n", id,
+                     literal_info[id].vertices.size(), lit.s.length());
+
+        u8 group_id = 0;
+        rose_group group = ~0ULL;
+        for (auto v : getAssociatedVertices(*this, id)) {
+            rose_group local_group = calcLocalGroup(v, g, literal_info,
+                                                    small_literal_count);
+            group &= local_group;
+            if (!group) {
+                break;
+            }
+        }
+
+        if (group == ~0ULL) {
+            goto boring;
+        }
+
+        group &= ~((1ULL << min_start_group) - 1); /* ensure the purity of the
+                                                    * always_on groups */
+        if (!group) {
+            goto boring;
+        }
+
+        group_id = ctz64(group);
+
+        /* TODO: fairness */
+        DEBUG_PRINTF("picking sibling group %hhd\n", group_id);
+        literal_info[id].group_mask = 1ULL << group_id;
+        groupCount[group_id]++;
+
+        continue;
+
+    boring:
+        /* long literals will either be stuck in a mega group or spread around
+         * depending on availability */
+        if (superStrong(lit)) {
+            long_lits.push_back(id);
+            continue;
+        }
+
+        // Other literals are assigned to our remaining groups round-robin.
+        group_id = counter;
+
+        DEBUG_PRINTF("picking boring group %hhd\n", group_id);
+        literal_info[id].group_mask = 1ULL << group_id;
+        groupCount[group_id]++;
+        counter = next_available_group(counter, min_start_group);
+    }
+
+    /* spread long literals out amongst unused groups if any, otherwise stick
+     * them in the always-on group */
+
+    if (groupCount[counter]) {
+        DEBUG_PRINTF("sticking long literals in the image of the always on\n");
+        for (u32 lit_id : long_lits) {
+            literal_info[lit_id].group_mask = 1ULL << group_long_lit;
+            groupCount[group_long_lit]++;
+        }
+    } else {
+        u32 min_long_counter = counter;
+        DEBUG_PRINTF("base long lit group = %u\n", min_long_counter);
+        for (u32 lit_id : long_lits) {
+            u8 group_id = counter;
+            literal_info[lit_id].group_mask = 1ULL << group_id;
+            groupCount[group_id]++;
+            counter = next_available_group(counter, min_long_counter);
+        }
+    }
+    /* assign delayed literals to the same group as their parent */
+    for (const auto &e : literals.right) {
+        u32 id = e.first;
+        const rose_literal_id &lit = e.second;
+
+        if (!lit.delay) {
+            continue;
+        }
+
+        u32 parent = literal_info[id].undelayed_id;
+        DEBUG_PRINTF("%u is shadow picking up groups from %u\n", id, parent);
+        assert(literal_info[parent].undelayed_id == parent);
+        assert(literal_info[parent].group_mask);
+        literal_info[id].group_mask = literal_info[parent].group_mask;
+        /* don't increment the group count - these don't really exist */
+    }
+
+    DEBUG_PRINTF("populate group to literal mapping\n");
+    for (const u32 id : literals.right | map_keys) {
+        rose_group groups = literal_info[id].group_mask;
+        while (groups) {
+            u32 group_id = findAndClearLSB_64(&groups);
+            group_to_literal[group_id].insert(id);
+        }
+    }
+
+    /* find how many groups we allocated */
+    for (u32 i = 0; i < ROSE_GROUPS_MAX; i++) {
+        if (groupCount[i]) {
+            group_end = MAX(group_end, i + 1);
+        }
+    }
+}
+
 /**
  * \brief Returns a mapping from each graph vertex v to the intersection of the
  * groups switched on by all of the paths leading up to (and including) v
from From 42419a26d2920d656ce4fd01f8d7dfc3ad8a5635 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 13:13:48 +1000 Subject: [PATCH 013/166] rose_build_groups: move assignGroupsToRoles --- src/rose/rose_build_compile.cpp | 24 ------------------------ src/rose/rose_build_groups.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index a6d18f67..05bde212 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -1050,30 +1050,6 @@ void findGroupSquashers(RoseBuildImpl &tbi) { } } -/** - * The groups that a role sets are determined by the union of its successor - * literals. Requires the literals already have had groups assigned. - */ -void RoseBuildImpl::assignGroupsToRoles() { - /* Note: if there is a succ literal in the sidematcher, its successors - * literals must be added instead */ - for (auto v : vertices_range(g)) { - if (isAnyStart(v)) { - continue; - } - - const rose_group succ_groups = getSuccGroups(v); - g[v].groups |= succ_groups; - - if (ghost.find(v) != ghost.end()) { - /* delayed roles need to supply their groups to the ghost role */ - g[ghost[v]].groups |= succ_groups; - } - - DEBUG_PRINTF("vertex %zu: groups=%llx\n", g[v].idx, g[v].groups); - } -} - void RoseBuildImpl::findTransientLeftfixes(void) { for (auto v : vertices_range(g)) { if (!g[v].left) { diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index 127731be..1a8d556e 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -335,6 +335,30 @@ void RoseBuildImpl::assignGroupsToLiterals() { } } +/** + * The groups that a role sets are determined by the union of its successor + * literals. Requires the literals already have had groups assigned. 
+ */ +void RoseBuildImpl::assignGroupsToRoles() { + /* Note: if there is a succ literal in the sidematcher, its successors + * literals must be added instead */ + for (auto v : vertices_range(g)) { + if (isAnyStart(v)) { + continue; + } + + const rose_group succ_groups = getSuccGroups(v); + g[v].groups |= succ_groups; + + if (ghost.find(v) != ghost.end()) { + /* delayed roles need to supply their groups to the ghost role */ + g[ghost[v]].groups |= succ_groups; + } + + DEBUG_PRINTF("vertex %zu: groups=%llx\n", g[v].idx, g[v].groups); + } +} + /** * \brief Returns a mapping from each graph vertex v to the intersection of the * groups switched on by all of the paths leading up to (and including) v from From 70ef229b2b36e2cb32d1029412c9a7dd369c2b69 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 13:18:23 +1000 Subject: [PATCH 014/166] rose_build_groups: move getGroups, getSuccGroups --- src/rose/rose_build_compile.cpp | 24 ------------------------ src/rose/rose_build_groups.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 05bde212..af8f7e96 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -87,30 +87,6 @@ namespace ue2 { #define ANCHORED_REHOME_DEEP 25 #define ANCHORED_REHOME_SHORT_LEN 3 -rose_group RoseBuildImpl::getGroups(RoseVertex v) const { - rose_group groups = 0; - - for (u32 id : g[v].literals) { - u32 lit_id = literal_info.at(id).undelayed_id; - - rose_group mygroups = literal_info[lit_id].group_mask; - groups |= mygroups; - } - - return groups; -} - -/** \brief Get the groups of the successor literals of a given vertex. */ -rose_group RoseBuildImpl::getSuccGroups(RoseVertex start) const { - rose_group initialGroups = 0; - - for (auto v : adjacent_vertices_range(start, g)) { - initialGroups |= getGroups(v); - } - - return initialGroups; -} - #ifdef DEBUG static UNUSED void printLitInfo(const rose_literal_info &li, u32 id) { diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index 1a8d556e..4fc125d2 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -335,6 +335,30 @@ void RoseBuildImpl::assignGroupsToLiterals() { } } +rose_group RoseBuildImpl::getGroups(RoseVertex v) const { + rose_group groups = 0; + + for (u32 id : g[v].literals) { + u32 lit_id = literal_info.at(id).undelayed_id; + + rose_group mygroups = literal_info[lit_id].group_mask; + groups |= mygroups; + } + + return groups; +} + +/** \brief Get the groups of the successor literals of a given vertex. */ +rose_group RoseBuildImpl::getSuccGroups(RoseVertex start) const { + rose_group initialGroups = 0; + + for (auto v : adjacent_vertices_range(start, g)) { + initialGroups |= getGroups(v); + } + + return initialGroups; +} + /** * The groups that a role sets are determined by the union of its successor * literals. Requires the literals already have had groups assigned. 
From de201997b7b71c0aed83dc614e3ec4dc86d61602 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 13:24:47 +1000 Subject: [PATCH 015/166] rose_build_groups: move findGroupSquashers --- src/rose/rose_build_compile.cpp | 184 +------------------------------ src/rose/rose_build_groups.cpp | 186 ++++++++++++++++++++++++++++++++ src/rose/rose_build_groups.h | 2 + 3 files changed, 189 insertions(+), 183 deletions(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index af8f7e96..dca7310c 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -34,6 +34,7 @@ #include "rose_build_castle.h" #include "rose_build_convert.h" #include "rose_build_dump.h" +#include "rose_build_groups.h" #include "rose_build_matchers.h" #include "rose_build_merge.h" #include "rose_build_role_aliasing.h" @@ -843,189 +844,6 @@ bool RoseBuildImpl::hasAnchoredTablePred(RoseVertex v) const { return false; } -/* returns true if every vertex associated with a groups also belongs to - lit_info */ -static -bool coversGroup(const RoseBuildImpl &tbi, const rose_literal_info &lit_info) { - if (lit_info.vertices.empty()) { - DEBUG_PRINTF("no vertices - does not cover\n"); - return false; - } - - if (!lit_info.group_mask) { - DEBUG_PRINTF("no group - does not cover\n"); - return false; /* no group (not a floating lit?) */ - } - - assert(popcount64(lit_info.group_mask) == 1); - - /* for each lit in group, ensure that vertices are a subset of lit_info's */ - rose_group groups = lit_info.group_mask; - while (groups) { - u32 group_id = findAndClearLSB_64(&groups); - for (u32 id : tbi.group_to_literal.at(group_id)) { - DEBUG_PRINTF(" checking against friend %u\n", id); - if (!is_subset_of(tbi.literal_info[id].vertices, - lit_info.vertices)) { - DEBUG_PRINTF("fail\n"); - return false; - } - } - } - - DEBUG_PRINTF("ok\n"); - return true; -} - -static -bool isGroupSquasher(const RoseBuildImpl &tbi, const u32 id /* literal id */, - rose_group forbidden_squash_group) { - const RoseGraph &g = tbi.g; - - const rose_literal_info &lit_info = tbi.literal_info.at(id); - - DEBUG_PRINTF("checking if %u '%s' is a group squasher %016llx\n", id, - dumpString(tbi.literals.right.at(id).s).c_str(), - lit_info.group_mask); - - if (tbi.literals.right.at(id).table == ROSE_EVENT) { - DEBUG_PRINTF("event literal, has no groups to squash\n"); - return false; - } - - if (!coversGroup(tbi, lit_info)) { - DEBUG_PRINTF("does not cover group\n"); - return false; - } - - if (lit_info.group_mask & forbidden_squash_group) { - /* probably a delayed lit */ - DEBUG_PRINTF("skipping as involves a forbidden group\n"); - return false; - } - - // Single-vertex, less constrained case than the multiple-vertex one below. - if (lit_info.vertices.size() == 1) { - const RoseVertex &v = *lit_info.vertices.begin(); - - if (tbi.hasDelayPred(v)) { /* due to rebuild issues */ - return false; - } - - /* there are two ways to be a group squasher: - * 1) only care about the first accepted match - * 2) can only match once after a pred match - * - * (2) requires analysis of the infix before v and is not implemented, - * TODO - */ - - /* Case 1 */ - - // Can't squash cases with accepts - if (!g[v].reports.empty()) { - return false; - } - - /* Can't squash cases with a suffix without analysis of the suffix. 
- * TODO: look at suffixes */ - if (g[v].suffix) { - return false; - } - - // Out-edges must have inf max bound, + no other shenanigans */ - for (const auto &e : out_edges_range(v, g)) { - if (g[e].maxBound != ROSE_BOUND_INF) { - return false; - } - - if (g[target(e, g)].left) { - return false; /* is an infix rose trigger, TODO: analysis */ - } - } - - DEBUG_PRINTF("%u is a path 1 group squasher\n", id); - return true; - - /* note: we could also squash the groups of its preds (if nobody else is - * using them. TODO. */ - } - - // Multiple-vertex case - for (auto v : lit_info.vertices) { - assert(!tbi.isAnyStart(v)); - - // Can't squash cases with accepts - if (!g[v].reports.empty()) { - return false; - } - - // Suffixes and leftfixes are out too as first literal may not match - // for everyone. - if (!g[v].isBoring()) { - return false; - } - - /* TODO: checks are solid but we should explain */ - if (tbi.hasDelayPred(v) || tbi.hasAnchoredTablePred(v)) { - return false; - } - - // Out-edges must have inf max bound and not directly lead to another - // vertex with this group, e.g. 'foobar.*foobar'. - for (const auto &e : out_edges_range(v, g)) { - if (g[e].maxBound != ROSE_BOUND_INF) { - return false; - } - RoseVertex t = target(e, g); - - if (g[t].left) { - return false; /* is an infix rose trigger */ - } - - for (u32 lit_id : g[t].literals) { - if (tbi.literal_info[lit_id].group_mask & lit_info.group_mask) { - return false; - } - } - } - - // In-edges must all be dot-stars with no overlap at all, as overlap - // also causes history to be used. - /* Different tables are already forbidden by previous checks */ - for (const auto &e : in_edges_range(v, g)) { - if (!(g[e].minBound == 0 && g[e].maxBound == ROSE_BOUND_INF)) { - return false; - } - - // Check overlap, if source was a literal. - RoseVertex u = source(e, g); - if (tbi.maxLiteralOverlap(u, v)) { - return false; - } - } - } - - DEBUG_PRINTF("literal %u is a multi-vertex group squasher\n", id); - return true; -} - -static -void findGroupSquashers(RoseBuildImpl &tbi) { - rose_group forbidden_squash_group = 0; - for (const auto &e : tbi.literals.right) { - if (e.second.delay) { - forbidden_squash_group |= tbi.literal_info[e.first].group_mask; - } - } - - for (u32 id = 0; id < tbi.literal_info.size(); id++) { - if (isGroupSquasher(tbi, id, forbidden_squash_group)) { - tbi.literal_info[id].squash_group = true; - } - } -} - void RoseBuildImpl::findTransientLeftfixes(void) { for (auto v : vertices_range(g)) { if (!g[v].left) { diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index 4fc125d2..5467e1ab 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -454,4 +454,190 @@ rose_group getSquashableGroups(const RoseBuildImpl &build) { return squashable_groups; } +/** + * \brief True if every vertex associated with a group also belongs to + * lit_info. + */ +static +bool coversGroup(const RoseBuildImpl &build, + const rose_literal_info &lit_info) { + if (lit_info.vertices.empty()) { + DEBUG_PRINTF("no vertices - does not cover\n"); + return false; + } + + if (!lit_info.group_mask) { + DEBUG_PRINTF("no group - does not cover\n"); + return false; /* no group (not a floating lit?) 
*/ + } + + assert(popcount64(lit_info.group_mask) == 1); + + /* for each lit in group, ensure that vertices are a subset of lit_info's */ + rose_group groups = lit_info.group_mask; + while (groups) { + u32 group_id = findAndClearLSB_64(&groups); + for (u32 id : build.group_to_literal.at(group_id)) { + DEBUG_PRINTF(" checking against friend %u\n", id); + if (!is_subset_of(build.literal_info[id].vertices, + lit_info.vertices)) { + DEBUG_PRINTF("fail\n"); + return false; + } + } + } + + DEBUG_PRINTF("ok\n"); + return true; +} + +static +bool isGroupSquasher(const RoseBuildImpl &build, const u32 id /* literal id */, + rose_group forbidden_squash_group) { + const RoseGraph &g = build.g; + + const rose_literal_info &lit_info = build.literal_info.at(id); + + DEBUG_PRINTF("checking if %u '%s' is a group squasher %016llx\n", id, + dumpString(build.literals.right.at(id).s).c_str(), + lit_info.group_mask); + + if (build.literals.right.at(id).table == ROSE_EVENT) { + DEBUG_PRINTF("event literal, has no groups to squash\n"); + return false; + } + + if (!coversGroup(build, lit_info)) { + DEBUG_PRINTF("does not cover group\n"); + return false; + } + + if (lit_info.group_mask & forbidden_squash_group) { + /* probably a delayed lit */ + DEBUG_PRINTF("skipping as involves a forbidden group\n"); + return false; + } + + // Single-vertex, less constrained case than the multiple-vertex one below. + if (lit_info.vertices.size() == 1) { + const RoseVertex &v = *lit_info.vertices.begin(); + + if (build.hasDelayPred(v)) { /* due to rebuild issues */ + return false; + } + + /* there are two ways to be a group squasher: + * 1) only care about the first accepted match + * 2) can only match once after a pred match + * + * (2) requires analysis of the infix before v and is not implemented, + * TODO + */ + + /* Case 1 */ + + // Can't squash cases with accepts + if (!g[v].reports.empty()) { + return false; + } + + /* Can't squash cases with a suffix without analysis of the suffix. + * TODO: look at suffixes */ + if (g[v].suffix) { + return false; + } + + // Out-edges must have inf max bound, + no other shenanigans */ + for (const auto &e : out_edges_range(v, g)) { + if (g[e].maxBound != ROSE_BOUND_INF) { + return false; + } + + if (g[target(e, g)].left) { + return false; /* is an infix rose trigger, TODO: analysis */ + } + } + + DEBUG_PRINTF("%u is a path 1 group squasher\n", id); + return true; + + /* note: we could also squash the groups of its preds (if nobody else is + * using them. TODO. */ + } + + // Multiple-vertex case + for (auto v : lit_info.vertices) { + assert(!build.isAnyStart(v)); + + // Can't squash cases with accepts + if (!g[v].reports.empty()) { + return false; + } + + // Suffixes and leftfixes are out too as first literal may not match + // for everyone. + if (!g[v].isBoring()) { + return false; + } + + /* TODO: checks are solid but we should explain */ + if (build.hasDelayPred(v) || build.hasAnchoredTablePred(v)) { + return false; + } + + // Out-edges must have inf max bound and not directly lead to another + // vertex with this group, e.g. 'foobar.*foobar'. 
+ for (const auto &e : out_edges_range(v, g)) { + if (g[e].maxBound != ROSE_BOUND_INF) { + return false; + } + RoseVertex t = target(e, g); + + if (g[t].left) { + return false; /* is an infix rose trigger */ + } + + for (u32 lit_id : g[t].literals) { + if (build.literal_info[lit_id].group_mask & + lit_info.group_mask) { + return false; + } + } + } + + // In-edges must all be dot-stars with no overlap at all, as overlap + // also causes history to be used. + /* Different tables are already forbidden by previous checks */ + for (const auto &e : in_edges_range(v, g)) { + if (!(g[e].minBound == 0 && g[e].maxBound == ROSE_BOUND_INF)) { + return false; + } + + // Check overlap, if source was a literal. + RoseVertex u = source(e, g); + if (build.maxLiteralOverlap(u, v)) { + return false; + } + } + } + + DEBUG_PRINTF("literal %u is a multi-vertex group squasher\n", id); + return true; +} + +void findGroupSquashers(RoseBuildImpl &build) { + rose_group forbidden_squash_group = 0; + for (const auto &e : build.literals.right) { + if (e.second.delay) { + forbidden_squash_group |= build.literal_info[e.first].group_mask; + } + } + + for (u32 id = 0; id < build.literal_info.size(); id++) { + if (isGroupSquasher(build, id, forbidden_squash_group)) { + build.literal_info[id].squash_group = true; + } + } +} + } // namespace ue2 diff --git a/src/rose/rose_build_groups.h b/src/rose/rose_build_groups.h index 608eda4a..f24a11c3 100644 --- a/src/rose/rose_build_groups.h +++ b/src/rose/rose_build_groups.h @@ -44,6 +44,8 @@ getVertexGroupMap(const RoseBuildImpl &build); rose_group getSquashableGroups(const RoseBuildImpl &build); +void findGroupSquashers(RoseBuildImpl &build); + } // namespace ue2 #endif // ROSE_BUILD_GROUPS_H From 89dbbe6c53b3aa3791711dc33f65e120199cacdb Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 13:52:29 +1000 Subject: [PATCH 016/166] rose: make assignGroupsToRoles a free function --- src/rose/rose_build_compile.cpp | 2 +- src/rose/rose_build_groups.cpp | 13 ++++++++----- src/rose/rose_build_groups.h | 2 ++ src/rose/rose_build_impl.h | 4 ---- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index dca7310c..b86058de 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -1661,7 +1661,7 @@ aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { findMoreLiteralMasks(*this); assignGroupsToLiterals(); - assignGroupsToRoles(); + assignGroupsToRoles(*this); findGroupSquashers(*this); /* final prep work */ diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index 5467e1ab..fd96fbd6 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -363,20 +363,23 @@ rose_group RoseBuildImpl::getSuccGroups(RoseVertex start) const { * The groups that a role sets are determined by the union of its successor * literals. Requires the literals already have had groups assigned. 
*/ -void RoseBuildImpl::assignGroupsToRoles() { +void assignGroupsToRoles(RoseBuildImpl &build) { + auto &g = build.g; + /* Note: if there is a succ literal in the sidematcher, its successors * literals must be added instead */ for (auto v : vertices_range(g)) { - if (isAnyStart(v)) { + if (build.isAnyStart(v)) { continue; } - const rose_group succ_groups = getSuccGroups(v); + const rose_group succ_groups = build.getSuccGroups(v); g[v].groups |= succ_groups; - if (ghost.find(v) != ghost.end()) { + auto ghost_it = build.ghost.find(v); + if (ghost_it != end(build.ghost)) { /* delayed roles need to supply their groups to the ghost role */ - g[ghost[v]].groups |= succ_groups; + g[ghost_it->second].groups |= succ_groups; } DEBUG_PRINTF("vertex %zu: groups=%llx\n", g[v].idx, g[v].groups); diff --git a/src/rose/rose_build_groups.h b/src/rose/rose_build_groups.h index f24a11c3..6719fbea 100644 --- a/src/rose/rose_build_groups.h +++ b/src/rose/rose_build_groups.h @@ -44,6 +44,8 @@ getVertexGroupMap(const RoseBuildImpl &build); rose_group getSquashableGroups(const RoseBuildImpl &build); +void assignGroupsToRoles(RoseBuildImpl &build); + void findGroupSquashers(RoseBuildImpl &build); } // namespace ue2 diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index d5f75a22..b1d7ac36 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -439,10 +439,6 @@ public: // Find the maximum bound on the edges to this vertex's successors. u32 calcSuccMaxBound(RoseVertex u) const; - // Assign roles to groups, writing the groups bitset into each role in the - // graph. - void assignGroupsToRoles(); - /* Returns the ID of the given literal in the literal map, adding it if * necessary. */ u32 getLiteralId(const ue2_literal &s, u32 delay, rose_literal_table table); From 7690881f85e29fef3031737fc948f2c2ea9eb028 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 13:57:03 +1000 Subject: [PATCH 017/166] rose: make assignGroupsToLiterals a free function --- src/rose/rose_build_compile.cpp | 2 +- src/rose/rose_build_groups.cpp | 19 ++++++++++--------- src/rose/rose_build_groups.h | 2 ++ src/rose/rose_build_impl.h | 2 -- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index b86058de..a6868ff8 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -1660,7 +1660,7 @@ aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { findMoreLiteralMasks(*this); - assignGroupsToLiterals(); + assignGroupsToLiterals(*this); assignGroupsToRoles(*this); findGroupSquashers(*this); diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index fd96fbd6..e1a130ef 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -170,9 +170,10 @@ u32 next_available_group(u32 counter, u32 min_start_group) { return counter; } -// Assigns groups to literals in the general case, when we have more literals -// than available groups. -void RoseBuildImpl::assignGroupsToLiterals() { +void assignGroupsToLiterals(RoseBuildImpl &build) { + auto &literals = build.literals; + auto &literal_info = build.literal_info; + bool small_literal_count = literal_info.size() <= MAX_LIGHT_LITERAL_CASE; map groupCount; /* group index to number of members */ @@ -193,7 +194,7 @@ void RoseBuildImpl::assignGroupsToLiterals() { // If this literal has a root role, we always have to search for it // anyway, so it goes in the always-on group. 
/* We could end up squashing it if it is followed by a .* */ - if (eligibleForAlwaysOnGroup(*this, id)) { + if (eligibleForAlwaysOnGroup(build, id)) { info.group_mask = 1ULL << group_always_on; groupCount[group_always_on]++; continue; @@ -223,7 +224,7 @@ void RoseBuildImpl::assignGroupsToLiterals() { continue; } - assert(!eligibleForAlwaysOnGroup(*this, id)); + assert(!eligibleForAlwaysOnGroup(build, id)); pq.push(make_pair(make_pair(-(s32)literal_info[id].vertices.size(), -(s32)lit.s.length()), id)); } @@ -237,8 +238,8 @@ void RoseBuildImpl::assignGroupsToLiterals() { u8 group_id = 0; rose_group group = ~0ULL; - for (auto v : getAssociatedVertices(*this, id)) { - rose_group local_group = calcLocalGroup(v, g, literal_info, + for (auto v : getAssociatedVertices(build, id)) { + rose_group local_group = calcLocalGroup(v, build.g, literal_info, small_literal_count); group &= local_group; if (!group) { @@ -323,14 +324,14 @@ void RoseBuildImpl::assignGroupsToLiterals() { rose_group groups = literal_info[id].group_mask; while (groups) { u32 group_id = findAndClearLSB_64(&groups); - group_to_literal[group_id].insert(id); + build.group_to_literal[group_id].insert(id); } } /* find how many groups we allocated */ for (u32 i = 0; i < ROSE_GROUPS_MAX; i++) { if (groupCount[i]) { - group_end = MAX(group_end, i + 1); + build.group_end = max(build.group_end, i + 1); } } } diff --git a/src/rose/rose_build_groups.h b/src/rose/rose_build_groups.h index 6719fbea..3ab5eb78 100644 --- a/src/rose/rose_build_groups.h +++ b/src/rose/rose_build_groups.h @@ -44,6 +44,8 @@ getVertexGroupMap(const RoseBuildImpl &build); rose_group getSquashableGroups(const RoseBuildImpl &build); +void assignGroupsToLiterals(RoseBuildImpl &build); + void assignGroupsToRoles(RoseBuildImpl &build); void findGroupSquashers(RoseBuildImpl &build); diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index b1d7ac36..5f1871e4 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -470,8 +470,6 @@ public: bool hasLiteralInTable(RoseVertex v, enum rose_literal_table t) const; bool hasAnchoredTablePred(RoseVertex v) const; - void assignGroupsToLiterals(void); - // Is the given vertex a successor of either root or anchored_root? 
bool isRootSuccessor(const RoseVertex &v) const; /* Is the given vertex a successor of something other than root or From 1df4da16adafc0a27a7347bb1c3d6c59d3aaa469 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 3 May 2016 16:16:20 +1000 Subject: [PATCH 018/166] rose: parameterise CHECK_LIT_EARLY --- src/rose/program_runtime.h | 6 ++--- src/rose/rose_build_bytecode.cpp | 39 +++++++++++++++++++++++++++----- src/rose/rose_dump.cpp | 4 +++- src/rose/rose_program.h | 1 + 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 78397070..a913ae27 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -880,9 +880,9 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(CHECK_LIT_EARLY) { - if (end < t->floatingMinLiteralMatchOffset) { - DEBUG_PRINTF("halt: too soon, min offset=%u\n", - t->floatingMinLiteralMatchOffset); + if (end < ri->min_offset) { + DEBUG_PRINTF("halt: before min_offset=%u\n", + ri->min_offset); return HWLM_CONTINUE_MATCHING; } } diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 3d1b5c6b..b148155b 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -3568,6 +3568,19 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id, program.push_back(move(ri)); } +static +u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 min_offset = UINT32_MAX; + for (const auto &v : lit_vertices) { + min_offset = min(min_offset, build.g[v].min_offset); + } + + return min_offset; +} + static void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, build_context &bc, u32 final_id, @@ -3591,22 +3604,36 @@ void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, build_context &bc, return; } - size_t min_offset = SIZE_MAX; + size_t min_len = SIZE_MAX; + u32 min_offset = UINT32_MAX; for (u32 lit_id : lit_ids) { const auto &lit = build.literals.right.at(lit_id); - min_offset = min(min_offset, lit.elength()); + size_t lit_min_len = lit.elength(); + u32 lit_min_offset = findMinOffset(build, lit_id); + DEBUG_PRINTF("lit_id=%u has min_len=%zu, min_offset=%u\n", lit_id, + lit_min_len, lit_min_offset); + min_len = min(min_len, lit_min_len); + min_offset = min(min_offset, lit_min_offset); } - DEBUG_PRINTF("%zu lits, min_offset=%zu\n", lit_ids.size(), min_offset); + DEBUG_PRINTF("final_id=%u has min_len=%zu, min_offset=%u, " + "global min is %u\n", final_id, min_len, min_offset, + bc.floatingMinLiteralMatchOffset); // If we can't match before the min offset, we don't need the check. 
- if (min_offset >= bc.floatingMinLiteralMatchOffset) { + if (min_len >= bc.floatingMinLiteralMatchOffset) { DEBUG_PRINTF("no need for check, min is %u\n", - bc.floatingMinLiteralMatchOffset); + bc.floatingMinLiteralMatchOffset); return; } - program.push_back(RoseInstruction(ROSE_INSTR_CHECK_LIT_EARLY)); + assert(min_offset >= bc.floatingMinLiteralMatchOffset); + assert(min_offset < UINT32_MAX); + + DEBUG_PRINTF("adding lit early check, min_offset=%u\n", min_offset); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_LIT_EARLY); + ri.u.checkLitEarly.min_offset = min_offset; + program.push_back(move(ri)); } static diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index f6badd1b..ad776780 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -253,7 +253,9 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION - PROGRAM_CASE(CHECK_LIT_EARLY) {} + PROGRAM_CASE(CHECK_LIT_EARLY) { + os << " min_offset " << ri->min_offset << endl; + } PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(CHECK_GROUPS) { diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 01572dbd..5c57bf54 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -120,6 +120,7 @@ struct ROSE_STRUCT_CHECK_LIT_MASK { /** Note: check failure will halt program. */ struct ROSE_STRUCT_CHECK_LIT_EARLY { u8 code; //!< From enum RoseInstructionCode. + u32 min_offset; //!< Minimum offset for this literal. }; /** Note: check failure will halt program. */ From 0548a6d9953666c2dc2ddd99e7204b2bd9427b65 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Fri, 3 Jun 2016 15:33:35 +1000 Subject: [PATCH 019/166] use edge index to avoid assoc property maps --- src/nfagraph/ng_literal_analysis.cpp | 81 +++++++++++++--------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index f9ef6061..9cb0091e 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,10 +64,6 @@ namespace { /* Small literal graph type used for the suffix tree used in * compressAndScore. */ -typedef boost::adjacency_list_traits LitGraphTraits; -typedef LitGraphTraits::vertex_descriptor LitVertex; -typedef LitGraphTraits::edge_descriptor LitEdge; struct LitGraphVertexProps { LitGraphVertexProps() {} @@ -79,11 +75,15 @@ struct LitGraphEdgeProps { LitGraphEdgeProps() {} explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {} u64a score = NO_LITERAL_AT_EDGE_SCORE; + size_t index; /* only initialised when the reverse edges are added. */ }; +/* keep edgeList = listS as you cannot remove edges if edgeList = vecS */ typedef boost::adjacency_list LitGraph; +typedef LitGraph::vertex_descriptor LitVertex; +typedef LitGraph::edge_descriptor LitEdge; typedef pair VertexPair; typedef std::queue LitVertexQ; @@ -474,43 +474,36 @@ const char *describeColor(boost::default_color_type c) { /** * The BGL's boykov_kolmogorov_max_flow requires that all edges have their - * reverse edge in the graph. This function adds them, returning the new edges - * and constructing a map of (edge, rev edge). + * reverse edge in the graph. This function adds them, returning a vector + * mapping edge index to reverse edge. 
Note: LitGraph should be a DAG so there + * should be no existing reverse_edges. */ static -vector addReverseEdges(LitGraph &lg, - ue2::unordered_map &reverse_edge_map) { - vector reverseMe; - - reverse_edge_map.clear(); - reverse_edge_map.reserve(num_edges(lg) * 2); +vector add_reverse_edges_and_index(LitGraph &lg) { + vector fwd_edges; + size_t next_index = 0; for (const auto &e : edges_range(lg)) { - LitVertex u = source(e, lg), v = target(e, lg); - assert(u != v); - - bool exists; - LitEdge rev; - tie(rev, exists) = edge(v, u, lg); - if (exists) { - reverse_edge_map[e] = rev; - } else { - reverseMe.push_back(e); - } + lg[e].index = next_index++; + fwd_edges.push_back(e); } - vector reverseEdges; - reverseEdges.reserve(reverseMe.size()); + vector rev_map(2 * num_edges(lg)); - for (const auto &e : reverseMe) { - LitVertex u = source(e, lg), v = target(e, lg); - LitEdge rev = add_edge(v, u, lg[e], lg).first; - reverseEdges.push_back(rev); - reverse_edge_map[e] = rev; - reverse_edge_map[rev] = e; + for (const auto &e : fwd_edges) { + LitVertex u = source(e, lg); + LitVertex v = target(e, lg); + + assert(!edge(v, u, lg).second); + + LitEdge rev = add_edge(v, u, lg).first; + lg[rev].score = 0; + lg[rev].index = next_index++; + rev_map[lg[e].index] = rev; + rev_map[lg[rev].index] = e; } - return reverseEdges; + return rev_map; } static @@ -522,33 +515,33 @@ void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink, assert(!in_degree(root, lg)); assert(!out_degree(sink, lg)); + size_t num_real_edges = num_edges(lg); // Add reverse edges for the convenience of the BGL's max flow algorithm. - ue2::unordered_map reverse_edge_map; - vector tempEdges = addReverseEdges(lg, reverse_edge_map); + vector rev_edges = add_reverse_edges_and_index(lg); const auto v_index_map = get(vertex_index, lg); + const auto e_index_map = get(&LitGraphEdgeProps::index, lg); const size_t num_verts = num_vertices(lg); vector colors(num_verts); vector distances(num_verts); vector predecessors(num_verts); - ue2::unordered_map residuals; - residuals.reserve(num_edges(lg)); + vector residuals(num_edges(lg)); UNUSED u64a flow = boykov_kolmogorov_max_flow(lg, get(&LitGraphEdgeProps::score, lg), - make_assoc_property_map(residuals), - make_assoc_property_map(reverse_edge_map), + make_iterator_property_map(residuals.begin(), e_index_map), + make_iterator_property_map(rev_edges.begin(), e_index_map), make_iterator_property_map(predecessors.begin(), v_index_map), make_iterator_property_map(colors.begin(), v_index_map), make_iterator_property_map(distances.begin(), v_index_map), - get(vertex_index, lg), root, sink); + v_index_map, root, sink); DEBUG_PRINTF("done, flow = %llu\n", flow); - // Remove temporary reverse edges. 
- for (const auto &e : tempEdges) { - remove_edge(e, lg); - } + /* remove reverse edges */ + remove_edge_if([&](const LitEdge &e) { + return lg[e].index >= num_real_edges; + }, lg); vector white_cut, black_cut; u64a white_flow = 0, black_flow = 0; From 9dddb4efc30cb884fa9a4e017e4dd754da56ebc7 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 16:25:36 +1000 Subject: [PATCH 020/166] ng_equivalence: reduce compile time on large cases --- src/nfagraph/ng_equivalence.cpp | 191 ++++++++++++++++---------------- 1 file changed, 95 insertions(+), 96 deletions(-) diff --git a/src/nfagraph/ng_equivalence.cpp b/src/nfagraph/ng_equivalence.cpp index b8e5a8d6..d0ab7c4a 100644 --- a/src/nfagraph/ng_equivalence.cpp +++ b/src/nfagraph/ng_equivalence.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,7 +38,7 @@ #include "ng_util.h" #include "util/compile_context.h" #include "util/graph_range.h" -#include "util/order_check.h" +#include "util/ue2_containers.h" #include #include @@ -53,9 +53,8 @@ using boost::ptr_vector; namespace ue2 { enum EquivalenceType { - LEFT_EQUIVALENCE = 0, + LEFT_EQUIVALENCE, RIGHT_EQUIVALENCE, - MAX_EQUIVALENCE }; namespace { @@ -91,7 +90,6 @@ public: } typedef ue2::unordered_set VertexInfoSet; -typedef ue2::unordered_map ClassMap; // compare two vertex info pointers on their vertex index bool VertexInfoPtrCmp::operator()(const VertexInfo *a, @@ -118,27 +116,34 @@ public: DepthMinMax d1; DepthMinMax d2; }; - ClassInfo(const NGHolder &g, VertexInfo &vi, ClassDepth &d_in, + ClassInfo(const NGHolder &g, const VertexInfo &vi, const ClassDepth &d_in, EquivalenceType eq) - : vertex_flags(vi.vertex_flags), edge_top(vi.edge_top), cr(vi.cr), - depth(d_in) { + : /* reports only matter for right-equiv */ + rs(eq == RIGHT_EQUIVALENCE ? g[vi.v].reports : flat_set()), + vertex_flags(vi.vertex_flags), edge_top(vi.edge_top), cr(vi.cr), + adjacent_cr(eq == LEFT_EQUIVALENCE ? vi.pred_cr : vi.succ_cr), + /* treat non-special vertices the same */ + node_type(min(g[vi.v].index, u32{N_SPECIALS})), depth(d_in) {} - // hackety-hack! - node_type = g[vi.v].index; - if (node_type > N_SPECIALS) { - // we treat all regular vertices the same - node_type = N_SPECIALS; - } - - // get all the adjacent vertices' CharReach - adjacent_cr = eq == LEFT_EQUIVALENCE ? 
vi.pred_cr : vi.succ_cr;
-
-        if (eq == RIGHT_EQUIVALENCE) {
-            rs = g[vi.v].reports;
-        }
     }
 
-    bool operator<(const ClassInfo &b) const;
+    bool operator==(const ClassInfo &b) const {
+        return node_type == b.node_type && depth.d1 == b.depth.d1 &&
+               depth.d2 == b.depth.d2 && cr == b.cr &&
+               adjacent_cr == b.adjacent_cr && edge_top == b.edge_top &&
+               vertex_flags == b.vertex_flags && rs == b.rs;
+    }
+
+    friend size_t hash_value(const ClassInfo &c) {
+        size_t val = 0;
+        boost::hash_combine(val, boost::hash_range(begin(c.rs), end(c.rs)));
+        boost::hash_combine(val, c.vertex_flags);
+        boost::hash_combine(val, c.edge_top);
+        boost::hash_combine(val, c.cr);
+        boost::hash_combine(val, c.adjacent_cr);
+        boost::hash_combine(val, c.node_type);
+        boost::hash_combine(val, c.depth.d1);
+        boost::hash_combine(val, c.depth.d2);
+        return val;
+    }
 
 private:
     flat_set<ReportID> rs; /* for right equiv only */
@@ -200,26 +205,12 @@ public:
         return q.capacity();
     }
 private:
-    set<unsigned> ids; //!< stores id's, for uniqueness
+    unordered_set<unsigned> ids; //!< stores id's, for uniqueness
     vector<unsigned> q; //!< vector of id's that we use as FILO.
 };
 }
 
-bool ClassInfo::operator<(const ClassInfo &b) const {
-    const ClassInfo &a = *this;
-
-    ORDER_CHECK(node_type);
-    ORDER_CHECK(depth.d1);
-    ORDER_CHECK(depth.d2);
-    ORDER_CHECK(cr);
-    ORDER_CHECK(adjacent_cr);
-    ORDER_CHECK(edge_top);
-    ORDER_CHECK(vertex_flags);
-    ORDER_CHECK(rs);
-    return false;
-}
-
 static
 bool outIsIrreducible(NFAVertex &v, const NGHolder &g) {
     unsigned nonSpecialVertices = 0;
@@ -286,9 +277,14 @@ bool hasEdgeAsserts(NFAVertex v, const NGHolder &g) {
 
 // populate VertexInfo table
 static
-void getVertexInfos(const NGHolder &g, ptr_vector<VertexInfo> &infos) {
+ptr_vector<VertexInfo> getVertexInfos(const NGHolder &g) {
+    const size_t num_verts = num_vertices(g);
+
+    ptr_vector<VertexInfo> infos;
+    infos.reserve(num_verts * 2);
+
     vector<VertexInfo *> vertex_map; // indexed by vertex_index property
-    vertex_map.resize(num_vertices(g));
+    vertex_map.resize(num_verts);
 
     for (auto v : vertices_range(g)) {
         VertexInfo *vi = new VertexInfo(v, g);
@@ -323,14 +319,24 @@ void getVertexInfos(const NGHolder &g, ptr_vector<VertexInfo> &infos) {
         }
         assert(!hasEdgeAsserts(cur_vi.v, g));
     }
+
+    return infos;
 }
 
 // store equivalence class in VertexInfo for each vertex
 static
-void partitionGraph(ptr_vector<VertexInfo> &infos, ClassMap &classes,
-                    WorkQueue &work_queue, const NGHolder &g,
-                    EquivalenceType eq) {
-    map<ClassInfo, unsigned> classinfomap;
+vector<VertexInfoSet> partitionGraph(ptr_vector<VertexInfo> &infos,
+                                     WorkQueue &work_queue, const NGHolder &g,
                                     EquivalenceType eq) {
+    const size_t num_verts = infos.size();
+
+    vector<VertexInfoSet> classes;
+    unordered_map<ClassInfo, unsigned> classinfomap;
+
+    // assume we will have lots of classes, so we don't waste time resizing
+    // these structures.
+    classes.reserve(num_verts);
+    classinfomap.reserve(num_verts);
 
     // get distances from start (or accept) for all vertices
    // only one of them is used at a time, never both
@@ -356,28 +362,25 @@ void partitionGraph(ptr_vector<VertexInfo> &infos, ClassMap &classes,
         auto ii = classinfomap.find(ci);
         if (ii == classinfomap.end()) {
-            unsigned new_class = classinfomap.size();
-            vi.equivalence_class = new_class;
-
-            classinfomap[ci] = new_class;
-
-            // insert this vertex into the class map
-            VertexInfoSet &vertices = classes[new_class];
-            vertices.insert(&vi);
+            // vertex is in a new equivalence class by itself.
+            unsigned eq_class = classes.size();
+            vi.equivalence_class = eq_class;
+            classes.push_back({&vi});
+            classinfomap.emplace(move(ci), eq_class);
         } else {
+            // vertex is added to an existing class.
unsigned eq_class = ii->second; vi.equivalence_class = eq_class; - - // insert this vertex into the class map - VertexInfoSet &vertices = classes[eq_class]; - vertices.insert(&vi); + classes.at(eq_class).insert(&vi); // we now know that this particular class has more than one // vertex, so we add it to the work queue work_queue.push(eq_class); } } - DEBUG_PRINTF("partitioned, %zu equivalence classes\n", classinfomap.size()); + + DEBUG_PRINTF("partitioned, %zu equivalence classes\n", classes.size()); + return classes; } // generalized equivalence processing (left and right) @@ -388,7 +391,7 @@ void partitionGraph(ptr_vector &infos, ClassMap &classes, // equivalence, predecessors for right equivalence) classes get revalidated in // case of a split. static -void equivalence(ClassMap &classmap, WorkQueue &work_queue, +void equivalence(vector &classes, WorkQueue &work_queue, EquivalenceType eq_type) { // now, go through the work queue until it's empty map, VertexInfoSet> tentative_classmap; @@ -397,12 +400,11 @@ void equivalence(ClassMap &classmap, WorkQueue &work_queue, WorkQueue reval_queue(work_queue.capacity()); while (!work_queue.empty()) { - // dequeue our class from the work queue unsigned cur_class = work_queue.pop(); // get all vertices in current equivalence class - VertexInfoSet &cur_class_vertices = classmap[cur_class]; + VertexInfoSet &cur_class_vertices = classes.at(cur_class); if (cur_class_vertices.size() < 2) { continue; @@ -445,16 +447,20 @@ void equivalence(ClassMap &classmap, WorkQueue &work_queue, // start from the second class for (++tmi; tmi != tentative_classmap.end(); ++tmi) { - unsigned new_class = classmap.size(); const VertexInfoSet &vertices_to_split = tmi->second; - VertexInfoSet &new_class_vertices = classmap[new_class]; + unsigned new_class = classes.size(); + VertexInfoSet new_class_vertices; for (VertexInfo *vi : vertices_to_split) { vi->equivalence_class = new_class; - cur_class_vertices.erase(vi); + // note: we cannot use the cur_class_vertices ref, as it is + // invalidated by modifications to the classes vector. + classes[cur_class].erase(vi); new_class_vertices.insert(vi); } - if (tmi->first.find(cur_class) != tmi->first.end()) { + classes.push_back(move(new_class_vertices)); + + if (contains(tmi->first, cur_class)) { reval_queue.push(new_class); } } @@ -619,16 +625,15 @@ void mergeClass(ptr_vector &infos, NGHolder &g, unsigned eq_class, // vertex (or, in rare cases for left equiv, a pair if we cannot satisfy the // report behaviour with a single vertex). 
static -bool mergeEquivalentClasses(ClassMap &classmap, ptr_vector &infos, - NGHolder &g) { +bool mergeEquivalentClasses(vector &classes, + ptr_vector &infos, NGHolder &g) { bool merged = false; set toRemove; // go through all classes and merge classes with more than one vertex - for (auto &cm : classmap) { + for (unsigned eq_class = 0; eq_class < classes.size(); eq_class++) { // get all vertices in current equivalence class - unsigned eq_class = cm.first; - VertexInfoSet &cur_class_vertices = cm.second; + VertexInfoSet &cur_class_vertices = classes[eq_class]; // we don't care for single-vertex classes if (cur_class_vertices.size() > 1) { @@ -644,6 +649,26 @@ bool mergeEquivalentClasses(ClassMap &classmap, ptr_vector &infos, return merged; } +static +bool reduceGraphEquivalences(NGHolder &g, EquivalenceType eq_type) { + // create a list of equivalence classes to check + WorkQueue work_queue(num_vertices(g)); + + // get information on every vertex in the graph + // new vertices are allocated here, and stored in infos + ptr_vector infos = getVertexInfos(g); + + // partition the graph + auto classes = partitionGraph(infos, work_queue, g, eq_type); + + // do equivalence processing + equivalence(classes, work_queue, eq_type); + + // replace equivalent classes with single vertices + // new vertices are (possibly) allocated here, and stored in infos + return mergeEquivalentClasses(classes, infos, g); +} + bool reduceGraphEquivalences(NGHolder &g, const CompileContext &cc) { if (!cc.grey.equivalenceEnable) { DEBUG_PRINTF("equivalence processing disabled in grey box\n"); @@ -661,34 +686,8 @@ bool reduceGraphEquivalences(NGHolder &g, const CompileContext &cc) { // take note if we have merged any vertices bool merge = false; - - for (int eqi = 0; eqi < MAX_EQUIVALENCE; ++eqi) { - // map of all information pertaining a vertex - ptr_vector infos; - ClassMap classes; - - // create a list of equivalence classes to check - WorkQueue work_queue(num_vertices(g)); - EquivalenceType eq_type = (EquivalenceType) eqi; - - // resize the vector, make room for twice the vertices we have - infos.reserve(num_vertices(g) * 2); - - // get information on every vertex in the graph - // new vertices are allocated here, and stored in infos - getVertexInfos(g, infos); - - // partition the graph - partitionGraph(infos, classes, work_queue, g, eq_type); - - // do equivalence processing - equivalence(classes, work_queue, eq_type); - - // replace equivalent classes with single vertices - // new vertices are (possibly) allocated here, and stored in infos - merge |= mergeEquivalentClasses(classes, infos, g); - } - + merge |= reduceGraphEquivalences(g, LEFT_EQUIVALENCE); + merge |= reduceGraphEquivalences(g, RIGHT_EQUIVALENCE); return merge; } From 8e4c68e9df957d2f2a9af9376610f88f43b9a62a Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 19 May 2016 17:22:37 +1000 Subject: [PATCH 021/166] rose: eagerly report EOD literal matches Where possible, eagerly report a match when a literal that matches at EOD occurs, rather than setting a state bit and waiting for EOD processing. 
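
A role is eligible when its accept is anchored exactly at EOD and there is
nothing left to run in between. A condensed sketch of that eligibility test
(an illustration only; the full logic, including the streaming-mode
restriction to literals in the EOD-anchored table, is in
canEagerlyReportAtEod() in the diff below):

    /* Hypothetical shorthand for the block-mode part of the check. */
    static
    bool eagerEodCandidate(const RoseGraph &g, const RoseEdge &e) {
        RoseVertex v = target(e, g);
        return g[v].eod_accept          /* role reports at EOD */
            && !g[v].left               /* no graph between us and EOD */
            && g[e].minBound == 0 && g[e].maxBound == 0; /* exactly at EOD */
    }
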
--- src/rose/rose_build_bytecode.cpp | 120 ++++++++++++++++++++++++++++--- 1 file changed, 110 insertions(+), 10 deletions(-) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index b148155b..ea602017 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -350,6 +350,7 @@ struct RoseResources { bool has_lit_delay = false; bool has_lit_mask = false; bool has_anchored = false; + bool has_eod = false; }; struct build_context : boost::noncopyable { @@ -575,6 +576,11 @@ bool isPureFloating(const RoseResources &resources) { return false; } + if (resources.has_eod) { + DEBUG_PRINTF("has eod work to do\n"); + return false; + } + if (resources.has_states) { DEBUG_PRINTF("has states\n"); return false; @@ -630,6 +636,7 @@ u8 pickRuntimeImpl(const RoseBuildImpl &build, const build_context &bc, DEBUG_PRINTF("has_lit_delay=%d\n", bc.resources.has_lit_delay); DEBUG_PRINTF("has_lit_mask=%d\n", bc.resources.has_lit_mask); DEBUG_PRINTF("has_anchored=%d\n", bc.resources.has_anchored); + DEBUG_PRINTF("has_eod=%d\n", bc.resources.has_eod); if (isPureFloating(bc.resources)) { return ROSE_RUNTIME_PURE_LITERAL; @@ -1775,9 +1782,13 @@ u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { vector lb_roles; for (auto v : vertices_range(g)) { - if (hasLastByteHistoryOutEdge(g, v)) { - assert(contains(bc.roleStateIndices, v)); - lb_roles.push_back(bc.roleStateIndices.at(v)); + if (!hasLastByteHistoryOutEdge(g, v)) { + continue; + } + // Eager EOD reporters won't have state indices. + auto it = bc.roleStateIndices.find(v); + if (it != end(bc.roleStateIndices)) { + lb_roles.push_back(it->second); } } @@ -2273,6 +2284,18 @@ void recordResources(RoseResources &resources, break; } } + + const auto &g = build.g; + for (const auto &v : vertices_range(g)) { + if (g[v].eod_accept) { + resources.has_eod = true; + break; + } + if (g[v].suffix && has_eod_accepts(g[v].suffix)) { + resources.has_eod = true; + break; + } + } } static @@ -2338,7 +2361,37 @@ void buildActiveLeftIter(const vector &leftTable, } static -bool hasEodAnchors(const RoseBuildImpl &tbi, const build_context &bc, +bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) { + const auto &g = build.g; + const auto v = target(e, g); + + if (!build.g[v].eod_accept) { + return false; + } + + // If there's a graph between us and EOD, we shouldn't be eager. + if (build.g[v].left) { + return false; + } + + // Must be exactly at EOD. + if (g[e].minBound != 0 || g[e].maxBound != 0) { + return false; + } + + // In streaming mode, we can only eagerly report EOD for literals in the + // EOD-anchored table, as that's the only time we actually know where EOD + // is. In block mode, we always have this information. 
+ const auto u = source(e, g); + if (build.cc.streaming && !build.isInETable(u)) { + return false; + } + + return true; +} + +static +bool hasEodAnchors(const RoseBuildImpl &build, const build_context &bc, u32 outfixEndQueue) { for (u32 i = 0; i < outfixEndQueue; i++) { if (nfaAcceptsEod(get_nfa_from_blob(bc, i))) { @@ -2347,16 +2400,18 @@ bool hasEodAnchors(const RoseBuildImpl &tbi, const build_context &bc, } } - if (tbi.eod_event_literal_id != MO_INVALID_IDX) { + if (build.eod_event_literal_id != MO_INVALID_IDX) { DEBUG_PRINTF("eod is an event to be celebrated\n"); return true; } - for (auto v : vertices_range(tbi.g)) { - if (tbi.g[v].eod_accept) { + + const RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (g[v].eod_accept) { DEBUG_PRINTF("literally report eod\n"); return true; } - if (tbi.g[v].suffix && has_eod_accepts(tbi.g[v].suffix)) { + if (g[v].suffix && has_eod_accepts(g[v].suffix)) { DEBUG_PRINTF("eod suffix\n"); return true; } @@ -3085,6 +3140,30 @@ void makeRoleCheckNotHandled(build_context &bc, RoseVertex v, program.push_back(move(ri)); } +static +void makeRoleEagerEodReports(RoseBuildImpl &build, build_context &bc, + RoseVertex v, vector &program) { + vector eod_program; + + for (const auto &e : out_edges_range(v, build.g)) { + if (canEagerlyReportAtEod(build, e)) { + makeRoleReports(build, bc, target(e, build.g), eod_program); + } + } + + if (eod_program.empty()) { + return; + } + + if (!onlyAtEod(build, v)) { + // The rest of our program wasn't EOD anchored, so we need to guard + // these reports with a check. + program.emplace_back(ROSE_INSTR_CHECK_ONLY_EOD, JumpTarget::NEXT_BLOCK); + } + + program.insert(end(program), begin(eod_program), end(eod_program)); +} + static vector makeProgram(RoseBuildImpl &build, build_context &bc, const RoseEdge &e) { @@ -3129,8 +3208,13 @@ vector makeProgram(RoseBuildImpl &build, build_context &bc, makeRoleGroups(build, bc, v, program); makeRoleSuffix(build, bc, v, program); + makeRoleSetState(bc, v, program); + // Note: EOD eager reports may generate a CHECK_ONLY_EOD instruction (if + // the program doesn't have one already). + makeRoleEagerEodReports(build, bc, v, program); + return program; } @@ -3189,10 +3273,21 @@ void assignStateIndices(const RoseBuildImpl &build, build_context &bc) { if (build.isVirtualVertex(v)) { continue; } - // Leaf nodes don't need state indices, as they don't have successors. - if (isLeafNode(v, g)) { + + // We only need a state index if we have successors that are not + // eagerly-reported EOD vertices. + bool needs_state_index = false; + for (const auto &e : out_edges_range(v, g)) { + if (!canEagerlyReportAtEod(build, e)) { + needs_state_index = true; + break; + } + } + + if (!needs_state_index) { continue; } + /* TODO: also don't need a state index if all edges are nfa based */ bc.roleStateIndices.emplace(v, state++); } @@ -3895,6 +3990,11 @@ pair buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { for (const auto &e : in_edges_range(v, g)) { RoseVertex u = source(e, g); + if (canEagerlyReportAtEod(build, e)) { + DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx); + continue; + } + assert(contains(bc.roleStateIndices, u)); u32 predStateIdx = bc.roleStateIndices.at(u); From ea62ba107f1b0abdd521f7069950c2aa8311a006 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 7 Jun 2016 12:00:30 +1000 Subject: [PATCH 022/166] unit: make multiaccel tests deterministic Use a PRNG, not rand(). 
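
A default-constructed std::mt19937 always starts from the same fixed default
seed (5489), so every run of the fixture draws an identical sequence of
values, whereas rand() depends on global seed state shared across the whole
process. A minimal sketch of the pattern adopted below (names illustrative):

    #include <random>

    std::mt19937 prng; // fixed default seed: deterministic sequence
    std::uniform_int_distribution<size_t> dist(0, 9);
    size_t choice = dist(prng); // reproducible across test runs
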
--- unit/internal/multiaccel_matcher.cpp | 71 ++++++++++++++-------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/unit/internal/multiaccel_matcher.cpp b/unit/internal/multiaccel_matcher.cpp index 45a24f46..bdf56ff9 100644 --- a/unit/internal/multiaccel_matcher.cpp +++ b/unit/internal/multiaccel_matcher.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,47 +43,16 @@ extern "C" { #include "util/alloc.h" #include "util/charreach.h" +#include +#include +#include #include #include -#include -#include -#include using namespace ue2; using namespace std; using namespace testing; -/* - * Static functions needed for this test's wellbeing - */ - -// char generator -static inline -char getChar(const CharReach &cr, bool match) { - char result; - do { - result = rand() % CharReach::npos; - } while (cr.test(result) != match); - return result; -} - -// appends a string with matches/unmatches according to input match pattern -static -void getMatch(u8 *result, u32 start, const string &pattern, - const CharReach &cr) { - for (const auto &c : pattern) { - result[start++] = getChar(cr, c == '1'); - } -} - -// appends non-matching noise of certain lengths -static -void getNoise(u8 *result, u32 start, u32 len, const CharReach &cr) { - for (unsigned i = 0; i < len; i++) { - result[start + i] = getChar(cr, false); - } -} - // test parameters structure struct MultiaccelTestParam { string match_pattern; @@ -126,6 +95,34 @@ protected: test_all_offsets = p.test_all_offsets; } + char getChar(const CharReach &cr) { + assert(cr.count() > 0); + auto dist = uniform_int_distribution(0, cr.count() - 1); + size_t result = cr.find_nth(dist(prng)); + assert(result != CharReach::npos); + return (char)result; + } + + // char generator + char getChar(const CharReach &cr, bool match) { + return getChar(match ? cr : ~cr); + } + + // appends a string with matches/unmatches according to input match pattern + void getMatch(u8 *result, u32 start, const string &pattern, + const CharReach &cr) { + for (const auto &c : pattern) { + result[start++] = getChar(cr, c == '1'); + } + } + + // appends non-matching noise of certain lengths + void getNoise(u8 *result, u32 start, u32 len, const CharReach &cr) { + for (unsigned i = 0; i < len; i++) { + result[start + i] = getChar(cr, false); + } + } + // deferred buffer generation, as we don't know CharReach before we run the test void GenerateBuffer(const CharReach &cr) { const MultiaccelTestParam &p = GetParam(); @@ -167,6 +164,10 @@ protected: aligned_free(buffer); } + // We want our tests to be deterministic, so we use a PRNG in the test + // fixture. + mt19937 prng; + u32 match_idx; u8 *buffer; bool test_all_offsets; From 964fc22b1ac2e6618675ae6e93cf3821406e9989 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 1 Mar 2016 15:26:33 +1100 Subject: [PATCH 023/166] fdr_compile_internal: remove unused decl getFDRConfirm() is static. 
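
Since the definition in fdr_confirm_compile.cpp has internal linkage, the
declaration in this header can never refer to it: it declares an external
symbol that nothing defines. A generic illustration (not the project's code):

    // header.h
    int f(int x);          // declares an external f; no definition exists

    // impl.cpp
    static int f(int x) {  // internal linkage: a different, file-local f
        return x;
    }
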
--- src/fdr/fdr_compile_internal.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h index d98bb518..4b89d329 100644 --- a/src/fdr/fdr_compile_internal.h +++ b/src/fdr/fdr_compile_internal.h @@ -56,9 +56,6 @@ class EngineDescription; class FDREngineDescription; struct hwlmStreamingControl; -size_t getFDRConfirm(const std::vector &lits, FDRConfirm **fdrc_p, - bool make_small); - std::pair setupFullMultiConfs( const std::vector &lits, const EngineDescription &eng, std::map > &bucketToLits, From 69653aaec53ee0255141d083533e0a3906a13319 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 1 Mar 2016 15:45:17 +1100 Subject: [PATCH 024/166] fdr_confirm_compile: use smart ptrs --- src/fdr/fdr_confirm_compile.cpp | 50 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 08946a5f..6838d56a 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -47,7 +47,8 @@ namespace ue2 { typedef u8 ConfSplitType; typedef pair BucketSplitPair; -typedef map > BC2CONF; +typedef map, size_t>> + BC2CONF; // return the number of bytes beyond a length threshold in all strings in lits static @@ -149,9 +150,9 @@ void fillLitInfo(const vector &lits, vector &tmpLitInfo, //#define FDR_CONFIRM_DUMP 1 -static -size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, - bool applyOneCharOpt, bool make_small, bool make_confirm) { +static pair, size_t> +getFDRConfirm(const vector &lits, bool applyOneCharOpt, + bool make_small, bool make_confirm) { vector tmpLitInfo(lits.size()); CONF_TYPE andmsk; fillLitInfo(lits, tmpLitInfo, andmsk); @@ -281,7 +282,7 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, sizeof(LitInfo) * lits.size() + totalLitSize; size = ROUNDUP_N(size, alignof(FDRConfirm)); - FDRConfirm *fdrc = (FDRConfirm *)aligned_zmalloc(size); + auto fdrc = aligned_zmalloc_unique(size); assert(fdrc); // otherwise would have thrown std::bad_alloc fdrc->andmsk = andmsk; @@ -295,7 +296,7 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, fdrc->groups = gm; // After the FDRConfirm, we have the lit index array. - u8 *fdrc_base = (u8 *)fdrc; + u8 *fdrc_base = (u8 *)fdrc.get(); u8 *ptr = fdrc_base + sizeof(*fdrc); ptr = ROUNDUP_PTR(ptr, alignof(u32)); u32 *bitsToLitIndex = (u32 *)ptr; @@ -311,7 +312,7 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, i = res2lits.begin(), e = res2lits.end(); i != e; ++i) { const u32 hash = i->first; const vector &vlidx = i->second; - bitsToLitIndex[hash] = verify_u32(ptr - (u8 *)fdrc); + bitsToLitIndex[hash] = verify_u32(ptr - fdrc_base); for (vector::const_iterator i2 = vlidx.begin(), e2 = vlidx.end(); i2 != e2; ++i2) { LiteralIndex litIdx = *i2; @@ -348,14 +349,13 @@ size_t getFDRConfirm(const vector &lits, FDRConfirm **fdrc_p, assert((size_t)(ptr - fdrc_base) <= size); } - *fdrc_p = fdrc; - // Return actual used size, not worst-case size. Must be rounded up to // FDRConfirm alignment so that the caller can lay out a sequence of these. 
size_t actual_size = ROUNDUP_N((size_t)(ptr - fdrc_base), alignof(FDRConfirm)); assert(actual_size <= size); - return actual_size; + + return {move(fdrc), actual_size}; } static @@ -424,16 +424,16 @@ u32 setupMultiConfirms(const vector &lits, } for (u32 c = 0; c < eng.getConfirmTopLevelSplit(); c++) { - if (!vl[c].empty()) { - DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size()); - FDRConfirm *fdrc; - size_t size = getFDRConfirm(vl[c], &fdrc, - eng.typicallyHoldsOneCharLits(), - make_small, makeConfirm); - BucketSplitPair p = make_pair(b, c); - bc2Conf[p] = make_pair(fdrc, size); - totalConfirmSize += size; + if (vl[c].empty()) { + continue; } + DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size()); + auto key = make_pair(b, c); + auto fc = getFDRConfirm(vl[c], eng.typicallyHoldsOneCharLits(), + make_small, makeConfirm); + totalConfirmSize += fc.second; + assert(bc2Conf.find(key) == end(bc2Conf)); + bc2Conf.emplace(key, move(fc)); } } } @@ -459,16 +459,14 @@ pair setupFullMultiConfs(const vector &lits, u32 *confBase = (u32 *)buf; u8 *ptr = buf + totalConfSwitchSize; - for (BC2CONF::const_iterator i = bc2Conf.begin(), e = bc2Conf.end(); i != e; - ++i) { - const pair &p = i->second; + for (const auto &m : bc2Conf) { + const BucketIndex &b = m.first.first; + const u8 &c = m.first.second; + const pair, size_t> &p = m.second; // confirm offset is relative to the base of this structure, now u32 confirm_offset = verify_u32(ptr - (u8 *)buf); - memcpy(ptr, p.first, p.second); + memcpy(ptr, p.first.get(), p.second); ptr += p.second; - aligned_free(p.first); - BucketIndex b = i->first.first; - u8 c = i->first.second; u32 idx = c * nBuckets + b; confBase[idx] = confirm_offset; } From 139a472af0a4b08e96afe3624b1138be41d68334 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 1 Mar 2016 15:47:41 +1100 Subject: [PATCH 025/166] fdr_confirm_compile: typedef -> using --- src/fdr/fdr_confirm_compile.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 6838d56a..f4ce7366 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -45,10 +45,10 @@ using namespace std; namespace ue2 { -typedef u8 ConfSplitType; -typedef pair BucketSplitPair; -typedef map, size_t>> - BC2CONF; +using ConfSplitType = u8; +using BucketSplitPair = pair; +using BC2CONF = map, size_t>>; // return the number of bytes beyond a length threshold in all strings in lits static From 81880d5a8de505cb19bf2ea39f95657ba1026e8b Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 1 Mar 2016 15:56:18 +1100 Subject: [PATCH 026/166] fdr_confirm_compile: clean up for loops, iterators --- src/fdr/fdr_confirm_compile.cpp | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index f4ce7366..f65b36b9 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -308,14 +308,12 @@ getFDRConfirm(const vector &lits, bool applyOneCharOpt, // Walk the map by hash value assigning indexes and laying out the // elements (and their associated string confirm material) in memory. 
- for (std::map >::const_iterator - i = res2lits.begin(), e = res2lits.end(); i != e; ++i) { - const u32 hash = i->first; - const vector &vlidx = i->second; + for (const auto &m : res2lits) { + const u32 hash = m.first; + const vector &vlidx = m.second; bitsToLitIndex[hash] = verify_u32(ptr - fdrc_base); - for (vector::const_iterator i2 = vlidx.begin(), - e2 = vlidx.end(); i2 != e2; ++i2) { - LiteralIndex litIdx = *i2; + for (auto i = vlidx.begin(), e = vlidx.end(); i != e; ++i) { + LiteralIndex litIdx = *i; // Write LitInfo header. u8 *oldPtr = ptr; @@ -334,7 +332,7 @@ getFDRConfirm(const vector &lits, bool applyOneCharOpt, } ptr = ROUNDUP_PTR(ptr, alignof(LitInfo)); - if (i2 + 1 == e2) { + if (next(i) == e) { finalLI.next = 0x0; } else { // our next field represents an adjustment on top of @@ -377,12 +375,9 @@ u32 setupMultiConfirms(const vector &lits, u32 totalConfirmSize = 0; for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) { if (!bucketToLits[b].empty()) { - vector > vl(eng.getConfirmTopLevelSplit()); - for (vector::const_iterator - i = bucketToLits[b].begin(), - e = bucketToLits[b].end(); - i != e; ++i) { - hwlmLiteral lit = lits[*i]; // copy + vector> vl(eng.getConfirmTopLevelSplit()); + for (const LiteralIndex &lit_idx : bucketToLits[b]) { + hwlmLiteral lit = lits[lit_idx]; // copy // c is last char of this literal u8 c = *(lit.s.rbegin()); From 9953a026f8d9d5e6511103dd68873b108a7cedd4 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 1 Mar 2016 16:29:50 +1100 Subject: [PATCH 027/166] setupFDRFloodControl: return smart ptr --- src/fdr/fdr_compile.cpp | 5 ++--- src/fdr/fdr_compile_internal.h | 3 ++- src/fdr/flood_compile.cpp | 14 ++++++++------ src/fdr/teddy_compile.cpp | 5 ++--- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 0c4ef35d..bd074313 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -144,7 +144,7 @@ void FDRCompiler::createInitialState(FDR *fdr) { aligned_unique_ptr FDRCompiler::setupFDR(pair link) { size_t tabSize = eng.getTabSizeBytes(); - pair floodControlTmp = setupFDRFloodControl(lits, eng); + auto floodControlTmp = setupFDRFloodControl(lits, eng); pair confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small); @@ -180,9 +180,8 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { aligned_free(confirmTmp.first); fdr->floodOffset = verify_u32(ptr - fdr_base); - memcpy(ptr, floodControlTmp.first, floodControlTmp.second); + memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second); ptr += floodControlTmp.second; - aligned_free(floodControlTmp.first); /* we are allowing domains 9 to 15 only */ assert(eng.bits > 8 && eng.bits < 16); diff --git a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h index 4b89d329..e7376620 100644 --- a/src/fdr/fdr_compile_internal.h +++ b/src/fdr/fdr_compile_internal.h @@ -31,6 +31,7 @@ #include "ue2common.h" #include "hwlm/hwlm_literal.h" +#include "util/alloc.h" #include #include @@ -65,7 +66,7 @@ std::pair setupFullMultiConfs( // we always read a full-scale flood "behind" us in terms of what's in our // state; if we don't have a flood that's long enough we won't be in the // right state yet to allow blindly advancing -std::pair +std::pair, size_t> setupFDRFloodControl(const std::vector &lits, const EngineDescription &eng); diff --git a/src/fdr/flood_compile.cpp b/src/fdr/flood_compile.cpp index 2c131788..f7a3e083 100644 --- a/src/fdr/flood_compile.cpp +++ b/src/fdr/flood_compile.cpp @@ -90,8 +90,9 @@ 
void addFlood(vector &tmpFlood, u8 c, const hwlmLiteral &lit, } } -pair setupFDRFloodControl(const vector &lits, - const EngineDescription &eng) { +pair, size_t> +setupFDRFloodControl(const vector &lits, + const EngineDescription &eng) { vector tmpFlood(N_CHARS); u32 default_suffix = eng.getDefaultFloodSuffixLength(); @@ -195,11 +196,12 @@ pair setupFDRFloodControl(const vector &lits, size_t floodHeaderSize = sizeof(u32) * N_CHARS; size_t floodStructSize = sizeof(FDRFlood) * nDistinctFloods; size_t totalSize = ROUNDUP_16(floodHeaderSize + floodStructSize); - u8 *buf = (u8 *)aligned_zmalloc(totalSize); + + auto buf = aligned_zmalloc_unique(totalSize); assert(buf); // otherwise would have thrown std::bad_alloc - u32 *floodHeader = (u32 *)buf; - FDRFlood *layoutFlood = (FDRFlood * )(buf + floodHeaderSize); + u32 *floodHeader = (u32 *)buf.get(); + FDRFlood *layoutFlood = (FDRFlood *)(buf.get() + floodHeaderSize); u32 currentFloodIndex = 0; for (const auto &m : flood2chars) { @@ -215,7 +217,7 @@ pair setupFDRFloodControl(const vector &lits, DEBUG_PRINTF("made a flood structure with %zu + %zu = %zu\n", floodHeaderSize, floodStructSize, totalSize); - return make_pair((u8 *)buf, totalSize); + return make_pair(move(buf), totalSize); } } // namespace ue2 diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index c1e46d85..80543bb4 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -314,7 +314,7 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; - pair floodControlTmp = setupFDRFloodControl(lits, eng); + auto floodControlTmp = setupFDRFloodControl(lits, eng); pair confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small); @@ -339,9 +339,8 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { aligned_free(confirmTmp.first); teddy->floodOffset = verify_u32(ptr - teddy_base); - memcpy(ptr, floodControlTmp.first, floodControlTmp.second); + memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second); ptr += floodControlTmp.second; - aligned_free(floodControlTmp.first); if (link.first) { teddy->link = verify_u32(ptr - teddy_base); From aebbd4f1691a64d545b6177d7ca548f94a1c49f6 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 1 Mar 2016 16:35:09 +1100 Subject: [PATCH 028/166] setupFullMultiConfs: return smart ptr --- src/fdr/fdr_compile.cpp | 7 ++----- src/fdr/fdr_compile_internal.h | 4 ++-- src/fdr/fdr_confirm_compile.cpp | 19 ++++++++++--------- src/fdr/teddy_compile.cpp | 6 ++---- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index bd074313..885c2c87 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -145,9 +145,7 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { size_t tabSize = eng.getTabSizeBytes(); auto floodControlTmp = setupFDRFloodControl(lits, eng); - - pair confirmTmp = - setupFullMultiConfs(lits, eng, bucketToLits, make_small); + auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small); assert(ISALIGNED_16(tabSize)); assert(ISALIGNED_16(confirmTmp.second)); @@ -175,9 +173,8 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { copy(tab.begin(), tab.end(), ptr); ptr += tabSize; - memcpy(ptr, confirmTmp.first, confirmTmp.second); + memcpy(ptr, confirmTmp.first.get(), confirmTmp.second); ptr += confirmTmp.second; - aligned_free(confirmTmp.first); fdr->floodOffset = verify_u32(ptr - fdr_base); memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second); diff --git 
a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h index e7376620..ac6d1257 100644 --- a/src/fdr/fdr_compile_internal.h +++ b/src/fdr/fdr_compile_internal.h @@ -57,9 +57,9 @@ class EngineDescription; class FDREngineDescription; struct hwlmStreamingControl; -std::pair setupFullMultiConfs( +std::pair, size_t> setupFullMultiConfs( const std::vector &lits, const EngineDescription &eng, - std::map > &bucketToLits, + std::map> &bucketToLits, bool make_small); // all suffixes include an implicit max_bucket_width suffix to ensure that diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index f65b36b9..4a129bc4 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -435,10 +435,11 @@ u32 setupMultiConfirms(const vector &lits, return totalConfirmSize; } -pair setupFullMultiConfs(const vector &lits, - const EngineDescription &eng, - map > &bucketToLits, - bool make_small) { +pair, size_t> +setupFullMultiConfs(const vector &lits, + const EngineDescription &eng, + map> &bucketToLits, + bool make_small) { BC2CONF bc2Conf; u32 totalConfirmSize = setupMultiConfirms(lits, eng, bc2Conf, bucketToLits, make_small); @@ -448,24 +449,24 @@ pair setupFullMultiConfs(const vector &lits, u32 totalConfSwitchSize = primarySwitch * nBuckets * sizeof(u32); u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize); - u8 *buf = (u8 *)aligned_zmalloc(totalSize); + auto buf = aligned_zmalloc_unique(totalSize); assert(buf); // otherwise would have thrown std::bad_alloc - u32 *confBase = (u32 *)buf; - u8 *ptr = buf + totalConfSwitchSize; + u32 *confBase = (u32 *)buf.get(); + u8 *ptr = buf.get() + totalConfSwitchSize; for (const auto &m : bc2Conf) { const BucketIndex &b = m.first.first; const u8 &c = m.first.second; const pair, size_t> &p = m.second; // confirm offset is relative to the base of this structure, now - u32 confirm_offset = verify_u32(ptr - (u8 *)buf); + u32 confirm_offset = verify_u32(ptr - buf.get()); memcpy(ptr, p.first.get(), p.second); ptr += p.second; u32 idx = c * nBuckets + b; confBase[idx] = confirm_offset; } - return make_pair(buf, totalSize); + return make_pair(move(buf), totalSize); } } // namespace ue2 diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 80543bb4..21f5c901 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -315,8 +315,7 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; auto floodControlTmp = setupFDRFloodControl(lits, eng); - pair confirmTmp - = setupFullMultiConfs(lits, eng, bucketToLits, make_small); + auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small); size_t size = ROUNDUP_N(sizeof(Teddy) + maskLen + @@ -334,9 +333,8 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { teddy->maxStringLen = verify_u32(maxLen(lits)); u8 *ptr = teddy_base + sizeof(Teddy) + maskLen; - memcpy(ptr, confirmTmp.first, confirmTmp.second); + memcpy(ptr, confirmTmp.first.get(), confirmTmp.second); ptr += confirmTmp.second; - aligned_free(confirmTmp.first); teddy->floodOffset = verify_u32(ptr - teddy_base); memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second); From 57cd2331f53acc6478d812b5e5b8914cd1f691db Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 1 Mar 2016 16:46:25 +1100 Subject: [PATCH 029/166] fdr/teddy: switch over remaining smart ptrs --- src/fdr/fdr_compile.cpp | 20 +++++++++---------- src/fdr/fdr_compile_internal.h | 4 ++-- src/fdr/fdr_streaming_compile.cpp | 32 
+++++++++++++++---------------- src/fdr/teddy_compile.cpp | 16 ++++++++-------- src/fdr/teddy_compile.h | 4 ++-- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 885c2c87..f8d1bd0e 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -81,7 +81,7 @@ private: void dumpMasks(const u8 *defaultMask); #endif void setupTab(); - aligned_unique_ptr setupFDR(pair link); + aligned_unique_ptr setupFDR(pair, size_t> &link); void createInitialState(FDR *fdr); public: @@ -90,7 +90,7 @@ public: : eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(lits_in), make_small(make_small_in) {} - aligned_unique_ptr build(pair link); + aligned_unique_ptr build(pair, size_t> &link); }; u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) { @@ -141,7 +141,8 @@ void FDRCompiler::createInitialState(FDR *fdr) { } } -aligned_unique_ptr FDRCompiler::setupFDR(pair link) { +aligned_unique_ptr +FDRCompiler::setupFDR(pair, size_t> &link) { size_t tabSize = eng.getTabSizeBytes(); auto floodControlTmp = setupFDRFloodControl(lits, eng); @@ -189,8 +190,7 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { if (link.first) { fdr->link = verify_u32(ptr - fdr_base); - memcpy(ptr, link.first, link.second); - aligned_free(link.first); + memcpy(ptr, link.first.get(), link.second); } else { fdr->link = 0; } @@ -498,7 +498,8 @@ void FDRCompiler::setupTab() { #endif } -aligned_unique_ptr FDRCompiler::build(pair link) { +aligned_unique_ptr +FDRCompiler::build(pair, size_t> &link) { assignStringsToBuckets(); setupTab(); return setupFDR(link); @@ -511,16 +512,15 @@ aligned_unique_ptr fdrBuildTableInternal(const vector &lits, bool make_small, const target_t &target, const Grey &grey, u32 hint, hwlmStreamingControl *stream_control) { - pair link(nullptr, 0); + pair, size_t> link(nullptr, 0); if (stream_control) { - link = fdrBuildTableStreaming(lits, stream_control); + link = fdrBuildTableStreaming(lits, *stream_control); } DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? 
"avx2" : "no-avx2"); if (grey.fdrAllowTeddy) { - aligned_unique_ptr fdr - = teddyBuildTableHinted(lits, make_small, hint, target, link); + auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, link); if (fdr) { DEBUG_PRINTF("build with teddy succeeded\n"); return fdr; diff --git a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h index ac6d1257..9b0c323f 100644 --- a/src/fdr/fdr_compile_internal.h +++ b/src/fdr/fdr_compile_internal.h @@ -70,9 +70,9 @@ std::pair, size_t> setupFDRFloodControl(const std::vector &lits, const EngineDescription &eng); -std::pair +std::pair, size_t> fdrBuildTableStreaming(const std::vector &lits, - hwlmStreamingControl *stream_control); + hwlmStreamingControl &stream_control); static constexpr u32 HINT_INVALID = 0xffffffff; diff --git a/src/fdr/fdr_streaming_compile.cpp b/src/fdr/fdr_streaming_compile.cpp index 34536eec..3422d74c 100644 --- a/src/fdr/fdr_streaming_compile.cpp +++ b/src/fdr/fdr_streaming_compile.cpp @@ -306,24 +306,24 @@ size_t maxMaskLen(const vector &lits) { return rv; } -pair +pair, size_t> fdrBuildTableStreaming(const vector &lits, - hwlmStreamingControl *stream_control) { + hwlmStreamingControl &stream_control) { // refuse to compile if we are forced to have smaller than minimum // history required for long-literal support, full stop // otherwise, choose the maximum of the preferred history quantity // (currently a fairly extravagant 32) or the already used history - // quantity - subject to the limitation of stream_control->history_max + // quantity - subject to the limitation of stream_control.history_max const size_t MIN_HISTORY_REQUIRED = 32; - if (MIN_HISTORY_REQUIRED > stream_control->history_max) { + if (MIN_HISTORY_REQUIRED > stream_control.history_max) { throw std::logic_error("Cannot set history to minimum history required"); } size_t max_len = - MIN(stream_control->history_max, - MAX(MIN_HISTORY_REQUIRED, stream_control->history_min)); + MIN(stream_control.history_max, + MAX(MIN_HISTORY_REQUIRED, stream_control.history_min)); assert(max_len >= MIN_HISTORY_REQUIRED); size_t max_mask_len = maxMaskLen(lits); @@ -334,9 +334,9 @@ fdrBuildTableStreaming(const vector &lits, // we want enough history to manage the longest literal and the longest // mask. 
- stream_control->literal_history_required = + stream_control.literal_history_required = max(maxLen(lits), max_mask_len) - 1; - stream_control->literal_stream_state_required = 0; + stream_control.literal_stream_state_required = 0; return make_pair(nullptr, size_t{0}); } @@ -381,11 +381,11 @@ fdrBuildTableStreaming(const vector &lits, streamBits[CASELESS] = lg2(roundUpToPowerOfTwo(positions[CASELESS] + 2)); u32 tot_state_bytes = (streamBits[CASEFUL] + streamBits[CASELESS] + 7) / 8; - u8 * secondaryTable = (u8 *)aligned_zmalloc(tabSize); + auto secondaryTable = aligned_zmalloc_unique(tabSize); assert(secondaryTable); // otherwise would have thrown std::bad_alloc // then fill it in - u8 * ptr = secondaryTable; + u8 * ptr = secondaryTable.get(); FDRSTableHeader * header = (FDRSTableHeader *)ptr; // fill in header header->pseudoEngineID = (u32)0xffffffff; @@ -411,7 +411,7 @@ fdrBuildTableStreaming(const vector &lits, e = long_lits.end(); i != e; ++i) { u32 entry = verify_u32(i - long_lits.begin()); - u32 offset = verify_u32(ptr - secondaryTable); + u32 offset = verify_u32(ptr - secondaryTable.get()); // point the table entry to the string location litTabPtr[entry].offset = offset; @@ -425,10 +425,10 @@ fdrBuildTableStreaming(const vector &lits, } // fill in final lit table entry with current ptr (serves as end value) - litTabPtr[long_lits.size()].offset = verify_u32(ptr - secondaryTable); + litTabPtr[long_lits.size()].offset = verify_u32(ptr - secondaryTable.get()); // fill hash tables - ptr = secondaryTable + htOffset[CASEFUL]; + ptr = secondaryTable.get() + htOffset[CASEFUL]; for (u32 m = CASEFUL; m < MAX_MODES; ++m) { fillHashes(long_lits, max_len, (FDRSHashEntry *)ptr, hashEntries[m], (MODES)m, litToOffsetVal); @@ -436,9 +436,9 @@ fdrBuildTableStreaming(const vector &lits, } // tell the world what we did - stream_control->literal_history_required = max_len; - stream_control->literal_stream_state_required = tot_state_bytes; - return make_pair(secondaryTable, tabSize); + stream_control.literal_history_required = max_len; + stream_control.literal_stream_state_required = tot_state_bytes; + return make_pair(move(secondaryTable), tabSize); } } // namespace ue2 diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 21f5c901..21bdb409 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -74,7 +74,7 @@ public: const TeddyEngineDescription &eng_in, bool make_small_in) : eng(eng_in), lits(lits_in), make_small(make_small_in) {} - aligned_unique_ptr build(pair link); + aligned_unique_ptr build(pair, size_t> &link); bool pack(map > &bucketToLits); }; @@ -281,7 +281,8 @@ bool TeddyCompiler::pack(map TeddyCompiler::build(pair link) { +aligned_unique_ptr +TeddyCompiler::build(pair, size_t> &link) { if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) { DEBUG_PRINTF("too many literals: %zu\n", lits.size()); return nullptr; @@ -342,8 +343,7 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { if (link.first) { teddy->link = verify_u32(ptr - teddy_base); - memcpy(ptr, link.first, link.second); - aligned_free(link.first); + memcpy(ptr, link.first.get(), link.second); } else { teddy->link = 0; } @@ -436,10 +436,10 @@ aligned_unique_ptr TeddyCompiler::build(pair link) { } // namespace -aligned_unique_ptr teddyBuildTableHinted(const vector &lits, - bool make_small, u32 hint, - const target_t &target, - pair link) { +aligned_unique_ptr +teddyBuildTableHinted(const vector &lits, bool make_small, + u32 hint, const target_t &target, + pair, size_t> &link) { unique_ptr 
des; if (hint == HINT_INVALID) { des = chooseTeddyEngine(target, lits); diff --git a/src/fdr/teddy_compile.h b/src/fdr/teddy_compile.h index fba6a3d1..276c1347 100644 --- a/src/fdr/teddy_compile.h +++ b/src/fdr/teddy_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,7 +49,7 @@ struct hwlmLiteral; ue2::aligned_unique_ptr teddyBuildTableHinted(const std::vector &lits, bool make_small, u32 hint, const target_t &target, - std::pair link); + std::pair, size_t> &link); } // namespace ue2 From 466fc940e55e2ec6fc1dece82366f9d8b3a4834f Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Mar 2016 10:15:48 +1100 Subject: [PATCH 030/166] fdr_streaming_compile: modernise loops, etc --- src/fdr/fdr_streaming_compile.cpp | 75 ++++++++++++------------------- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/src/fdr/fdr_streaming_compile.cpp b/src/fdr/fdr_streaming_compile.cpp index 3422d74c..f6db0c15 100644 --- a/src/fdr/fdr_streaming_compile.cpp +++ b/src/fdr/fdr_streaming_compile.cpp @@ -94,14 +94,13 @@ static bool setupLongLits(const vector &lits, vector &long_lits, size_t max_len) { long_lits.reserve(lits.size()); - for (vector::const_iterator it = lits.begin(); - it != lits.end(); ++it) { - if (it->s.length() > max_len) { - hwlmLiteral tmp = *it; // copy - tmp.s.erase(tmp.s.size() - 1, 1); // erase last char + for (const auto &lit : lits) { + if (lit.s.length() > max_len) { + hwlmLiteral tmp = lit; // copy + tmp.s.pop_back(); tmp.id = 0; // recalc later tmp.groups = 0; // filled in later by hash bucket(s) - long_lits.push_back(tmp); + long_lits.push_back(move(tmp)); } } @@ -112,15 +111,12 @@ bool setupLongLits(const vector &lits, // sort long_literals by caseful/caseless and in lexicographical order, // remove duplicates stable_sort(long_lits.begin(), long_lits.end(), LongLitOrder()); - vector::iterator new_end = - unique(long_lits.begin(), long_lits.end(), hwlmLitEqual); + auto new_end = unique(long_lits.begin(), long_lits.end(), hwlmLitEqual); long_lits.erase(new_end, long_lits.end()); // fill in ids; not currently used - for (vector::iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { - i->id = i - long_lits.begin(); + for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { + i->id = distance(long_lits.begin(), i); } return true; } @@ -143,23 +139,19 @@ void analyzeLits(const vector &long_lits, size_t max_len, hashedPositions[m] = 0; } - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { + for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { if (i->nocase) { - boundaries[CASEFUL] = verify_u32(i - long_lits.begin()); + boundaries[CASEFUL] = verify_u32(distance(long_lits.begin(), i)); break; } } - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { - MODES m = i->nocase ? CASELESS : CASEFUL; - for (u32 j = 1; j < i->s.size() - max_len + 1; j++) { + for (const auto &lit : long_lits) { + MODES m = lit.nocase ? 
CASELESS : CASEFUL; + for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { hashedPositions[m]++; } - positions[m] += i->s.size(); + positions[m] += lit.s.size(); } for (u32 m = CASEFUL; m < MAX_MODES; m++) { @@ -209,18 +201,15 @@ void fillHashes(const vector &long_lits, size_t max_len, map > > bucketToLitOffPairs; map bucketToBitfield; - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { - const hwlmLiteral &l = *i; - if ((m == CASELESS) != i->nocase) { + for (const auto &lit : long_lits) { + if ((m == CASELESS) != lit.nocase) { continue; } - for (u32 j = 1; j < i->s.size() - max_len + 1; j++) { - u32 h = hashLit(l, j, max_len, m); + for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { + u32 h = hashLit(lit, j, max_len, m); u32 h_ent = h & ((1U << nbits) - 1); u32 h_low = (h >> nbits) & 63; - bucketToLitOffPairs[h_ent].push_back(make_pair(i->id, j)); + bucketToLitOffPairs[h_ent].emplace_back(lit.id, j); bucketToBitfield[h_ent] |= (1ULL << h_low); } } @@ -231,11 +220,9 @@ void fillHashes(const vector &long_lits, size_t max_len, // sweep out bitfield entries and save the results swapped accordingly // also, anything with bitfield entries is put in filledBuckets - for (map::const_iterator i = bucketToBitfield.begin(), - e = bucketToBitfield.end(); - i != e; ++i) { - u32 bucket = i->first; - u64a contents = i->second; + for (const auto &m : bucketToBitfield) { + const u32 &bucket = m.first; + const u64a &contents = m.second; tab[bucket].bitfield = contents; filledBuckets.set(bucket); } @@ -243,12 +230,9 @@ void fillHashes(const vector &long_lits, size_t max_len, // store out all our chains based on free values in our hash table. // find nearest free locations that are empty (there will always be more // entries than strings, at present) - for (map > >::iterator - i = bucketToLitOffPairs.begin(), - e = bucketToLitOffPairs.end(); - i != e; ++i) { - u32 bucket = i->first; - deque > &d = i->second; + for (auto &m : bucketToLitOffPairs) { + u32 bucket = m.first; + deque> &d = m.second; // sort d by distance of the residual string (len minus our depth into // the string). We need to put the 'furthest back' string first... @@ -299,9 +283,8 @@ void fillHashes(const vector &long_lits, size_t max_len, static size_t maxMaskLen(const vector &lits) { size_t rv = 0; - vector::const_iterator it, ite; - for (it = lits.begin(), ite = lits.end(); it != ite; ++it) { - rv = max(rv, it->msk.size()); + for (const auto &lit : lits) { + rv = max(rv, lit.msk.size()); } return rv; } @@ -407,9 +390,7 @@ fdrBuildTableStreaming(const vector &lits, ptr += litTabSize; map litToOffsetVal; - for (vector::const_iterator i = long_lits.begin(), - e = long_lits.end(); - i != e; ++i) { + for (auto i = long_lits.begin(), e = long_lits.end(); i != e; ++i) { u32 entry = verify_u32(i - long_lits.begin()); u32 offset = verify_u32(ptr - secondaryTable.get()); From 88e6485e7575ffa0c010ad171854f04c47ef030a Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Mar 2016 10:23:31 +1100 Subject: [PATCH 031/166] fdr_compile: tidy up --- src/fdr/fdr_compile.cpp | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index f8d1bd0e..8552c2d6 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -124,10 +124,8 @@ void FDRCompiler::createInitialState(FDR *fdr) { // Find the minimum length for the literals in this bucket. 
const vector &bucket_lits = bucketToLits[b]; u32 min_len = ~0U; - for (vector::const_iterator it = bucket_lits.begin(), - ite = bucket_lits.end(); - it != ite; ++it) { - min_len = min(min_len, verify_u32(lits[*it].s.length())); + for (const LiteralIndex &lit_idx : bucket_lits) { + min_len = min(min_len, verify_u32(lits[lit_idx].s.length())); } DEBUG_PRINTF("bucket %u has min_len=%u\n", b, min_len); @@ -213,13 +211,11 @@ struct LitOrder { if (len1 != len2) { return len1 < len2; } else { - string::const_reverse_iterator it1, it2; - tie(it1, it2) = - std::mismatch(i1s.rbegin(), i1s.rend(), i2s.rbegin()); - if (it1 == i1s.rend()) { + auto p = std::mismatch(i1s.rbegin(), i1s.rend(), i2s.rbegin()); + if (p.first == i1s.rend()) { return false; } - return *it1 < *it2; + return *p.first < *p.second; } } @@ -262,9 +258,8 @@ void FDRCompiler::assignStringsToBuckets() { stable_sort(vli.begin(), vli.end(), LitOrder(lits)); #ifdef DEBUG_ASSIGNMENT - for (map::iterator i = lenCounts.begin(), e = lenCounts.end(); - i != e; ++i) { - printf("l<%d>:%d ", i->first, i->second); + for (const auto &m : lenCounts) { + printf("l<%u>:%u ", m.first, m.second); } printf("\n"); #endif @@ -401,8 +396,7 @@ bool getMultiEntriesAtPosition(const FDREngineDescription &eng, distance = 4; } - for (vector::const_iterator i = vl.begin(), e = vl.end(); - i != e; ++i) { + for (auto i = vl.begin(), e = vl.end(); i != e; ++i) { if (e - i > 5) { __builtin_prefetch(&lits[*(i + 5)]); } @@ -456,31 +450,25 @@ void FDRCompiler::setupTab() { memcpy(tabIndexToMask(i), &defaultMask[0], mask_size); } - typedef std::map > M2SET; - for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) { const vector &vl = bucketToLits[b]; SuffixPositionInString pLimit = eng.getBucketWidth(b); for (SuffixPositionInString pos = 0; pos < pLimit; pos++) { u32 bit = eng.getSchemeBit(b, pos); - M2SET m2; + map> m2; bool done = getMultiEntriesAtPosition(eng, vl, lits, pos, m2); if (done) { clearbit(&defaultMask[0], bit); continue; } - for (M2SET::const_iterator i = m2.begin(), e = m2.end(); i != e; - ++i) { - u32 dc = i->first; - const ue2::unordered_set &mskSet = i->second; + for (const auto &elem : m2) { + u32 dc = elem.first; + const ue2::unordered_set &mskSet = elem.second; u32 v = ~dc; do { u32 b2 = v & dc; - for (ue2::unordered_set::const_iterator - i2 = mskSet.begin(), - e2 = mskSet.end(); - i2 != e2; ++i2) { - u32 val = (*i2 & ~dc) | b2; + for (const u32 &mskVal : mskSet) { + u32 val = (mskVal & ~dc) | b2; clearbit(tabIndexToMask(val), bit); } v = (v + (dc & -dc)) | ~dc; From 37e7c964246a12cdf398306a7d4cefca29fb7816 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 2 Mar 2016 10:32:40 +1100 Subject: [PATCH 032/166] teddy_compile: modernise --- src/fdr/teddy_compile.cpp | 45 +++++++++++++++------------------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 21bdb409..287a3373 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -106,8 +106,8 @@ public: } printf("\nnlits: %zu\nLit ids: ", litCount()); printf("Prob: %llu\n", probability()); - for (set::iterator i = litIds.begin(), e = litIds.end(); i != e; ++i) { - printf("%u ", *i); + for (const auto &id : litIds) { + printf("%u ", id); } printf("\n"); printf("Flood prone : %s\n", isRunProne()?"yes":"no"); @@ -193,20 +193,18 @@ bool TeddyCompiler::pack(map::const_iterator i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) { - printf("\n"); i1->dump(); + for (const TeddySet &ts : sts) { + printf("\n"); 
ts.dump(); } printf("\n===============================================\n"); #endif - set::iterator m1 = sts.end(), m2 = sts.end(); + auto m1 = sts.end(), m2 = sts.end(); u64a best = 0xffffffffffffffffULL; - for (set::iterator i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) { - set::iterator i2 = i1; - ++i2; + for (auto i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) { const TeddySet &s1 = *i1; - for (set::iterator e2 = sts.end(); i2 != e2; ++i2) { + for (auto i2 = next(i1), e2 = sts.end(); i2 != e2; ++i2) { const TeddySet &s2 = *i2; // be more conservative if we don't absolutely need to @@ -263,19 +261,16 @@ bool TeddyCompiler::pack(map eng.getNumBuckets()) { return false; } - for (set::const_iterator i = sts.begin(), e = sts.end(); i != e; - ++i) { - for (set::const_iterator i2 = i->getLits().begin(), - e2 = i->getLits().end(); - i2 != e2; ++i2) { - bucketToLits[cnt].push_back(*i2); - } + u32 cnt = 0; + for (const TeddySet &ts : sts) { + const auto &lits = ts.getLits(); + bucketToLits[cnt].insert(end(bucketToLits[cnt]), begin(lits), + end(lits)); cnt++; } return true; @@ -350,19 +345,13 @@ TeddyCompiler::build(pair, size_t> &link) { u8 *baseMsk = teddy_base + sizeof(Teddy); - for (map >::const_iterator - i = bucketToLits.begin(), - e = bucketToLits.end(); - i != e; ++i) { - const u32 bucket_id = i->first; - const vector &ids = i->second; + for (const auto &b2l : bucketToLits) { + const u32 &bucket_id = b2l.first; + const vector &ids = b2l.second; const u8 bmsk = 1U << (bucket_id % 8); - for (vector::const_iterator i2 = ids.begin(), - e2 = ids.end(); - i2 != e2; ++i2) { - LiteralIndex lit_id = *i2; - const hwlmLiteral & l = lits[lit_id]; + for (const LiteralIndex &lit_id : ids) { + const hwlmLiteral &l = lits[lit_id]; DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id); const u32 sz = verify_u32(l.s.size()); From d626cb68e0fc247770fb6a0659e831845fa0a8f2 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 1 Apr 2016 17:34:27 +1100 Subject: [PATCH 033/166] fdr_confirm_compile: clean up debugging code --- src/fdr/fdr_confirm_compile.cpp | 92 ++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 4a129bc4..b0c3644f 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -221,55 +221,61 @@ getFDRConfirm(const vector &lits, bool applyOneCharOpt, #ifdef FDR_CONFIRM_DUMP // print out the literals reversed - makes it easier to line up analyses // that are end-offset based - for (map >::iterator i = res2lits.begin(), - e = res2lits.end(); i != e; ++i) { - u32 hash = i->first; - vector & vlidx = i->second; - if (vlidx.size() > 1) { - printf("%x -> %zu literals\n", hash, vlidx.size()); - u32 min_len = lits[vlidx.front()].s.size(); - vector > vsl; // contains the set of chars at each location - // reversed from the end - vsl.resize(1024); - u32 total_string_size = 0; - for (vector::iterator i2 = vlidx.begin(), - e2 = vlidx.end(); i2 != e2; ++i2) { - LiteralIndex litIdx = *i2; - total_string_size += lits[litIdx].s.size(); - for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) { - vsl[lits[litIdx].s.size()-j].insert(lits[litIdx].s.c_str()[j - 1]); - } - min_len = MIN(min_len, lits[litIdx].s.size()); + for (const auto &m : res2lits) { + const u32 &hash = m.first; + const vector &vlidx = m.second; + if (vlidx.size() <= 1) { + continue; + } + printf("%x -> %zu literals\n", hash, vlidx.size()); + size_t min_len = lits[vlidx.front()].s.size(); + + 
vector> vsl; // contains the set of chars at each location + // reversed from the end + + for (const auto &litIdx : vlidx) { + const auto &lit = lits[litIdx]; + if (lit.s.size() > vsl.size()) { + vsl.resize(lit.s.size()); } - printf("common "); - for (u32 j = 0; j < min_len; j++) { - if (vsl[j].size() == 1) { - printf("%02x", (u32)*vsl[j].begin()); - } else { + for (size_t j = lit.s.size(); j != 0; j--) { + vsl[lit.s.size() - j].insert(lit.s[j - 1]); + } + min_len = min(min_len, lit.s.size()); + } + printf("common "); + for (size_t j = 0; j < min_len; j++) { + if (vsl[j].size() == 1) { + printf("%02x", *vsl[j].begin()); + } else { + printf("__"); + } + } + printf("\n"); + for (const auto &litIdx : vlidx) { + const auto &lit = lits[litIdx]; + printf("%8x %c", lit.id, lit.nocase ? '!' : ' '); + for (size_t j = lit.s.size(); j != 0; j--) { + size_t dist_from_end = lit.s.size() - j; + if (dist_from_end < min_len && vsl[dist_from_end].size() == 1) { printf("__"); + } else { + printf("%02x", lit.s[j - 1]); } } printf("\n"); - for (vector::iterator i2 = vlidx.begin(), - e2 = vlidx.end(); i2 != e2; ++i2) { - LiteralIndex litIdx = *i2; - printf("%8x %c", lits[litIdx].id, lits[litIdx].nocase ? '!' : ' '); - for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) { - u32 dist_from_end = lits[litIdx].s.size() - j; - if (dist_from_end < min_len && vsl[dist_from_end].size() == 1) { - printf("__"); - } else { - printf("%02x", (u32)lits[litIdx].s.c_str()[j-1]); - } - } - printf("\n"); - } - u32 total_compares = 0; - for (u32 j = 0; j < 1024; j++) { // naughty - total_compares += vsl[j].size(); - } - printf("Total compare load: %d Total string size: %d\n\n", total_compares, total_string_size); } + size_t total_compares = 0; + for (const auto &v : vsl) { + total_compares += v.size(); + } + size_t total_string_size = 0; + for (const auto &litIdx : vlidx) { + const auto &lit = lits[litIdx]; + total_string_size += lit.s.size(); + } + printf("Total compare load: %zu Total string size: %zu\n\n", + total_compares, total_string_size); } #endif From bae7a072ca7725846fff22b84a0e92d58f596a87 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 5 Apr 2016 16:34:06 +1000 Subject: [PATCH 034/166] TeddySet: doesn't need a reference to lits --- src/fdr/teddy_compile.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 287a3373..364e877e 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -79,7 +79,6 @@ public: }; class TeddySet { - const vector &lits; u32 len; // nibbleSets is a series of bitfields over 16 predicates // that represent the whether shufti nibble set @@ -89,8 +88,7 @@ class TeddySet { vector nibbleSets; set litIds; public: - TeddySet(const vector &lits_in, u32 len_in) - : lits(lits_in), len(len_in), nibbleSets(len_in * 2, 0) {} + explicit TeddySet(u32 len_in) : len(len_in), nibbleSets(len_in * 2, 0) {} const set & getLits() const { return litIds; } size_t litCount() const { return litIds.size(); } @@ -118,15 +116,15 @@ public: return nibbleSets == ts.nibbleSets; } - void addLiteral(u32 lit_id) { - const string &s = lits[lit_id].s; + void addLiteral(u32 lit_id, const hwlmLiteral &lit) { + const string &s = lit.s; for (u32 i = 0; i < len; i++) { if (i < s.size()) { u8 c = s[s.size() - i - 1]; u8 c_hi = (c >> 4) & 0xf; u8 c_lo = c & 0xf; nibbleSets[i*2] = 1 << c_lo; - if (lits[lit_id].nocase && ourisalpha(c)) { + if (lit.nocase && ourisalpha(c)) { nibbleSets[i*2+1] = (1 << (c_hi&0xd)) | (1 << 
(c_hi|0x2)); } else { nibbleSets[i*2+1] = 1 << c_hi; @@ -185,8 +183,8 @@ bool TeddyCompiler::pack(map sts; for (u32 i = 0; i < lits.size(); i++) { - TeddySet ts(lits, eng.numMasks); - ts.addLiteral(i); + TeddySet ts(eng.numMasks); + ts.addLiteral(i, lits[i]); sts.insert(ts); } @@ -214,7 +212,7 @@ bool TeddyCompiler::pack(map Date: Thu, 28 Apr 2016 16:34:48 +1000 Subject: [PATCH 035/166] teddy compile: rename loop var for readability --- src/fdr/teddy_compile.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 364e877e..729c9c1f 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -264,12 +264,12 @@ bool TeddyCompiler::pack(map Date: Fri, 29 Apr 2016 09:40:44 +1000 Subject: [PATCH 036/166] fdr: use braced init syntax for pairs --- src/fdr/fdr_compile.cpp | 8 ++++---- src/fdr/fdr_confirm_compile.cpp | 2 +- src/fdr/fdr_streaming_compile.cpp | 4 ++-- src/fdr/flood_compile.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 8552c2d6..89a0ff72 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -315,12 +315,12 @@ void FDRCompiler::assignStringsToBuckets() { for (u32 k = j; k < nChunks; ++k) { cnt += count[k]; } - t[j][0] = make_pair(getScoreUtil(length[j], cnt), 0); + t[j][0] = {getScoreUtil(length[j], cnt), 0}; } for (u32 i = 1; i < nb; i++) { for (u32 j = 0; j < nChunks - 1; j++) { // don't process last, empty row - SCORE_INDEX_PAIR best = make_pair(MAX_SCORE, 0); + SCORE_INDEX_PAIR best = {MAX_SCORE, 0}; u32 cnt = count[j]; for (u32 k = j + 1; k < nChunks - 1; k++, cnt += count[k]) { SCORE score = getScoreUtil(length[j], cnt); @@ -329,12 +329,12 @@ void FDRCompiler::assignStringsToBuckets() { } score += t[k][i-1].first; if (score < best.first) { - best = make_pair(score, k); + best = {score, k}; } } t[j][i] = best; } - t[nChunks - 1][i] = make_pair(0,0); // fill in empty final row for next iteration + t[nChunks - 1][i] = {0,0}; // fill in empty final row for next iteration } #ifdef DEBUG_ASSIGNMENT diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index b0c3644f..23437fe2 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -472,7 +472,7 @@ setupFullMultiConfs(const vector &lits, u32 idx = c * nBuckets + b; confBase[idx] = confirm_offset; } - return make_pair(move(buf), totalSize); + return {move(buf), totalSize}; } } // namespace ue2 diff --git a/src/fdr/fdr_streaming_compile.cpp b/src/fdr/fdr_streaming_compile.cpp index f6db0c15..f3001743 100644 --- a/src/fdr/fdr_streaming_compile.cpp +++ b/src/fdr/fdr_streaming_compile.cpp @@ -320,7 +320,7 @@ fdrBuildTableStreaming(const vector &lits, stream_control.literal_history_required = max(maxLen(lits), max_mask_len) - 1; stream_control.literal_stream_state_required = 0; - return make_pair(nullptr, size_t{0}); + return {nullptr, size_t{0}}; } // Ensure that we have enough room for the longest mask. 
@@ -419,7 +419,7 @@ fdrBuildTableStreaming(const vector<hwlmLiteral> &lits,
     // tell the world what we did
     stream_control.literal_history_required = max_len;
     stream_control.literal_stream_state_required = tot_state_bytes;
-    return make_pair(move(secondaryTable), tabSize);
+    return {move(secondaryTable), tabSize};
 }
 
 } // namespace ue2
diff --git a/src/fdr/flood_compile.cpp b/src/fdr/flood_compile.cpp
index f7a3e083..2ee5a1c5 100644
--- a/src/fdr/flood_compile.cpp
+++ b/src/fdr/flood_compile.cpp
@@ -217,7 +217,7 @@ setupFDRFloodControl(const vector<hwlmLiteral> &lits,
     DEBUG_PRINTF("made a flood structure with %zu + %zu = %zu\n",
                  floodHeaderSize, floodStructSize, totalSize);
 
-    return make_pair(move(buf), totalSize);
+    return {move(buf), totalSize};
 }
 
 } // namespace ue2
From 08d44fbed57ca875858a8f597296194dcc0c411f Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Wed, 1 Jun 2016 14:47:37 +1000
Subject: [PATCH 037/166] fdr: remove unused typedef 'ConfirmIndex'

---
 src/fdr/fdr_compile_internal.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h
index 9b0c323f..48e2ed6f 100644
--- a/src/fdr/fdr_compile_internal.h
+++ b/src/fdr/fdr_compile_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -45,7 +45,6 @@ namespace ue2 {
 // a pile of decorative typedefs
 // good for documentation purposes more than anything else
 typedef u32 LiteralIndex;
-typedef u32 ConfirmIndex;
 typedef u32 SuffixPositionInString; // zero is last byte, counting back
                                     // into the string
 typedef u32 BucketIndex;
From c7212a7478b0a1d4f61b82ba2bdb79b8cd146519 Mon Sep 17 00:00:00 2001
From: Matthew Barr
Date: Thu, 9 Jun 2016 01:57:08 +1000
Subject: [PATCH 038/166] Only omit frame pointers on 32bit release builds

Frame pointers are very useful for debugging and testing, and only
really make a difference to performance on IA32.
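Spelling out the new guard as a truth table (a sketch; ARCH_IA32,
RELEASE_BUILD and the flag variables are the names this tree's
CMakeLists.txt already uses):

    # ARCH_IA32  RELEASE_BUILD   keep frame pointers?
    #    ON           ON         no  (the one case where they cost us)
    #    ON           OFF        yes
    #    OFF          ON         yes
    #    OFF          OFF        yes
    if (NOT(ARCH_IA32 AND RELEASE_BUILD))
        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
    endif()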
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 536be260..92caf4ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,7 +207,7 @@ else() set(EXTRA_CXX_FLAGS "-O0 ${EXTRA_CXX_FLAGS}") endif(OPTIMISE) - if(NOT RELEASE_BUILD) + if (NOT(ARCH_IA32 AND RELEASE_BUILD)) set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") endif() From 7e3d56579b8a237e6ed1a58a695c83b99ad40a9f Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 9 Jun 2016 14:19:01 +1000 Subject: [PATCH 039/166] eod: stop passing both scratch and state around --- src/rose/eod.c | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/rose/eod.c b/src/rose/eod.c index 7e8d4b3d..32702bed 100644 --- a/src/rose/eod.c +++ b/src/rose/eod.c @@ -33,10 +33,11 @@ #include "util/fatbit.h" static really_inline -void initContext(const struct RoseEngine *t, char *state, u64a offset, +void initContext(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch) { struct RoseContext *tctxt = &scratch->tctxt; - tctxt->groups = loadGroups(t, state); /* TODO: diff groups for eod */ + /* TODO: diff groups for eod */ + tctxt->groups = loadGroups(t, scratch->core_info.state); tctxt->lit_offset_adjust = scratch->core_info.buf_offset - scratch->core_info.hlen + 1; // index after last byte @@ -128,9 +129,8 @@ int roseEodRunIterator(const struct RoseEngine *t, u64a offset, * \return MO_HALT_MATCHING if the user instructs us to stop. */ static rose_inline -int roseCheckNfaEod(const struct RoseEngine *t, char *state, - struct hs_scratch *scratch, u64a offset, - const char is_streaming) { +int roseCheckNfaEod(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a offset, const char is_streaming) { if (!t->eodNfaIterOffset) { DEBUG_PRINTF("no engines that report at EOD\n"); return MO_CONTINUE_MATCHING; @@ -144,7 +144,7 @@ int roseCheckNfaEod(const struct RoseEngine *t, char *state, key = eod_len ? eod_data[eod_len - 1] : 0; } - const u8 *aa = getActiveLeafArray(t, state); + const u8 *aa = getActiveLeafArray(t, scratch->core_info.state); const u32 aaCount = t->activeArrayCount; const struct mmbit_sparse_iter *it = getByOffset(t, t->eodNfaIterOffset); @@ -163,7 +163,7 @@ int roseCheckNfaEod(const struct RoseEngine *t, char *state, assert(nfaAcceptsEod(nfa)); char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = (const char *)state + info->stateOffset; + const char *sstate = scratch->core_info.state + info->stateOffset; if (is_streaming) { // Decompress stream state. 
@@ -189,9 +189,9 @@ void cleanupAfterEodMatcher(const struct RoseEngine *t, u64a offset, } static rose_inline -void roseCheckEodSuffixes(const struct RoseEngine *t, char *state, u64a offset, +void roseCheckEodSuffixes(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch) { - const u8 *aa = getActiveLeafArray(t, state); + const u8 *aa = getActiveLeafArray(t, scratch->core_info.state); const u32 aaCount = t->activeArrayCount; UNUSED u32 qCount = t->queueCount; @@ -208,7 +208,7 @@ void roseCheckEodSuffixes(const struct RoseEngine *t, char *state, u64a offset, triggered */ char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = (const char *)state + info->stateOffset; + const char *sstate = scratch->core_info.state + info->stateOffset; struct mq *q = scratch->queues + qi; @@ -257,7 +257,7 @@ int roseRunEodProgram(const struct RoseEngine *t, u64a offset, } static really_inline -void roseEodExec_i(const struct RoseEngine *t, char *state, u64a offset, +void roseEodExec_i(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch, const char is_streaming) { assert(t); assert(scratch->core_info.buf || scratch->core_info.hbuf); @@ -269,8 +269,7 @@ void roseEodExec_i(const struct RoseEngine *t, char *state, u64a offset, return; } - if (roseCheckNfaEod(t, state, scratch, offset, is_streaming) == - MO_HALT_MATCHING) { + if (roseCheckNfaEod(t, scratch, offset, is_streaming) == MO_HALT_MATCHING) { return; } @@ -288,6 +287,7 @@ void roseEodExec_i(const struct RoseEngine *t, char *state, u64a offset, if (t->ematcherOffset) { assert(t->ematcherRegionSize); // Unset the reports we just fired so we don't fire them again below. + char *state = scratch->core_info.state; mmbit_clear(getRoleState(state), t->rolesWithStateCount); mmbit_clear(getActiveLeafArray(t, state), t->activeArrayCount); @@ -303,7 +303,7 @@ void roseEodExec_i(const struct RoseEngine *t, char *state, u64a offset, return; } - roseCheckEodSuffixes(t, state, offset, scratch); + roseCheckEodSuffixes(t, offset, scratch); } } @@ -326,12 +326,8 @@ void roseEodExec(const struct RoseEngine *t, u64a offset, return; } - char *state = scratch->core_info.state; - assert(state); - - initContext(t, state, offset, scratch); - - roseEodExec_i(t, state, offset, scratch, 1); + initContext(t, offset, scratch); + roseEodExec_i(t, offset, scratch, 1); } static rose_inline @@ -349,10 +345,7 @@ void roseBlockEodExec(const struct RoseEngine *t, u64a offset, assert(!can_stop_matching(scratch)); - char *state = scratch->core_info.state; - // Ensure that history is correct before we look for EOD matches prepForEod(t, scratch, scratch->core_info.len); - - roseEodExec_i(t, state, offset, scratch, 0); + roseEodExec_i(t, offset, scratch, 0); } From 7a6a47672345f7ccb62c160760fe119fcc209e98 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 9 Jun 2016 14:41:15 +1000 Subject: [PATCH 040/166] eod: move engine checks into ENGINES_EOD instr --- src/rose/eod.c | 63 ------------------- src/rose/program_runtime.h | 59 ++++++++++++++++++ src/rose/rose_build_bytecode.cpp | 102 ++++++++++++++++++++----------- src/rose/rose_dump.cpp | 6 +- src/rose/rose_internal.h | 3 - src/rose/rose_program.h | 9 +++ 6 files changed, 141 insertions(+), 101 deletions(-) diff --git a/src/rose/eod.c b/src/rose/eod.c index 32702bed..4961a728 100644 --- a/src/rose/eod.c +++ b/src/rose/eod.c @@ -122,65 +122,6 @@ int roseEodRunIterator(const struct RoseEngine *t, u64a offset, return MO_CONTINUE_MATCHING; } -/** - * \brief Check for (and deliver) reports 
from active output-exposed (suffix - * or outfix) NFAs. - * - * \return MO_HALT_MATCHING if the user instructs us to stop. - */ -static rose_inline -int roseCheckNfaEod(const struct RoseEngine *t, struct hs_scratch *scratch, - u64a offset, const char is_streaming) { - if (!t->eodNfaIterOffset) { - DEBUG_PRINTF("no engines that report at EOD\n"); - return MO_CONTINUE_MATCHING; - } - - /* data, len is used for state decompress, should be full available data */ - u8 key = 0; - if (is_streaming) { - const u8 *eod_data = scratch->core_info.hbuf; - size_t eod_len = scratch->core_info.hlen; - key = eod_len ? eod_data[eod_len - 1] : 0; - } - - const u8 *aa = getActiveLeafArray(t, scratch->core_info.state); - const u32 aaCount = t->activeArrayCount; - - const struct mmbit_sparse_iter *it = getByOffset(t, t->eodNfaIterOffset); - assert(ISALIGNED(it)); - - u32 idx = 0; - struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - - for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); - qi != MMB_INVALID; - qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - const struct NFA *nfa = getNfaByInfo(t, info); - - DEBUG_PRINTF("checking nfa %u\n", qi); - assert(nfaAcceptsEod(nfa)); - - char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = scratch->core_info.state + info->stateOffset; - - if (is_streaming) { - // Decompress stream state. - nfaExpandState(nfa, fstate, sstate, offset, key); - } - - if (nfaCheckFinalState(nfa, fstate, sstate, offset, roseReportAdaptor, - roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return MO_HALT_MATCHING; - } - } - - return MO_CONTINUE_MATCHING; -} - static rose_inline void cleanupAfterEodMatcher(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch) { @@ -269,10 +210,6 @@ void roseEodExec_i(const struct RoseEngine *t, u64a offset, return; } - if (roseCheckNfaEod(t, scratch, offset, is_streaming) == MO_HALT_MATCHING) { - return; - } - if (!t->eodIterProgramOffset && !t->ematcherOffset) { DEBUG_PRINTF("no eod accepts\n"); return; diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index a913ae27..a656c715 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -800,6 +800,57 @@ char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { return end >= min_bound && end <= max_bound; } +static rose_inline +hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u32 iter_offset) { + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + /* data, len is used for state decompress, should be full available data */ + u8 key = 0; + if (is_streaming) { + const u8 *eod_data = scratch->core_info.hbuf; + size_t eod_len = scratch->core_info.hlen; + key = eod_len ? 
eod_data[eod_len - 1] : 0; + } + + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); + assert(ISALIGNED(it)); + + u32 idx = 0; + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); + qi != MMB_INVALID; + qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { + const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, info); + + DEBUG_PRINTF("checking nfa %u\n", qi); + assert(nfaAcceptsEod(nfa)); + + char *fstate = scratch->fullState + info->fullStateOffset; + const char *sstate = scratch->core_info.state + info->stateOffset; + + if (is_streaming) { + // Decompress stream state. + nfaExpandState(nfa, fstate, sstate, offset, key); + } + + if (nfaCheckFinalState(nfa, fstate, sstate, offset, roseReportAdaptor, + roseReportSomAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + static void updateSeqPoint(struct RoseContext *tctxt, u64a offset, const char from_mpv) { @@ -1301,6 +1352,14 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(ENGINES_EOD) { + if (roseEnginesEod(t, scratch, end, ri->iter_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(END) { DEBUG_PRINTF("finished\n"); return HWLM_CONTINUE_MATCHING; diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index ea602017..904f8df9 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -223,6 +223,7 @@ public: case ROSE_INSTR_CHECK_STATE: return &u.checkState; case ROSE_INSTR_SPARSE_ITER_BEGIN: return &u.sparseIterBegin; case ROSE_INSTR_SPARSE_ITER_NEXT: return &u.sparseIterNext; + case ROSE_INSTR_ENGINES_EOD: return &u.enginesEod; case ROSE_INSTR_END: return &u.end; } assert(0); @@ -269,6 +270,7 @@ public: case ROSE_INSTR_CHECK_STATE: return sizeof(u.checkState); case ROSE_INSTR_SPARSE_ITER_BEGIN: return sizeof(u.sparseIterBegin); case ROSE_INSTR_SPARSE_ITER_NEXT: return sizeof(u.sparseIterNext); + case ROSE_INSTR_ENGINES_EOD: return sizeof(u.enginesEod); case ROSE_INSTR_END: return sizeof(u.end); } assert(0); @@ -314,6 +316,7 @@ public: ROSE_STRUCT_CHECK_STATE checkState; ROSE_STRUCT_SPARSE_ITER_BEGIN sparseIterBegin; ROSE_STRUCT_SPARSE_ITER_NEXT sparseIterNext; + ROSE_STRUCT_ENGINES_EOD enginesEod; ROSE_STRUCT_END end; } u; @@ -3532,7 +3535,7 @@ u32 addPredBlocks(build_context &bc, * Returns the pair (program offset, sparse iter offset). */ static -pair makeSparseIterProgram(build_context &bc, +vector makeSparseIterProgram(build_context &bc, map>> &predProgramLists, const vector &root_program, const vector &pre_program) { @@ -3548,7 +3551,7 @@ pair makeSparseIterProgram(build_context &bc, // Add blocks to deal with non-root edges (triggered by sparse iterator or // mmbit_isset checks). This operation will flatten the program up to this // point. - u32 iter_offset = addPredBlocks(bc, predProgramLists, program, false); + addPredBlocks(bc, predProgramLists, program, false); // If we have a root program, replace the END instruction with it. Note // that the root program has already been flattened. 
@@ -3559,8 +3562,7 @@ pair<u32, u32> makeSparseIterProgram(build_context &bc,
         program.insert(end(program), begin(root_program), end(root_program));
     }
 
-    applyFinalSpecialisation(program);
-    return {writeProgram(bc, program), iter_offset};
+    return program;
 }
 
 static
@@ -3778,8 +3780,9 @@ vector<RoseInstruction> buildLitInitialProgram(RoseBuildImpl &build,
 }
 
 static
-u32 buildLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id,
-                        const vector<RoseEdge> &lit_edges) {
+vector<RoseInstruction> buildLiteralProgram(RoseBuildImpl &build,
+                                            build_context &bc, u32 final_id,
+                                            const vector<RoseEdge> &lit_edges) {
     const auto &g = build.g;
 
     DEBUG_PRINTF("final id %u, %zu lit edges\n", final_id, lit_edges.size());
@@ -3831,7 +3834,19 @@ u32 buildLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id,
 
     // Put it all together.
     return makeSparseIterProgram(bc, predProgramLists, root_program,
-                                 pre_program).first;
+                                 pre_program);
+}
+
+static
+u32 writeLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id,
+                        const vector<RoseEdge> &lit_edges) {
+    auto program = buildLiteralProgram(build, bc, final_id, lit_edges);
+    if (program.empty()) {
+        return 0;
+    }
+    // Note: already flattened.
+    applyFinalSpecialisation(program);
+    return writeProgram(bc, program);
 }
 
 static
@@ -3904,7 +3919,7 @@ pair<u32, u32> buildLiteralPrograms(RoseBuildImpl &build, build_context &bc) {
         const auto &lit_edges = lit_edge_map[finalId];
 
         litPrograms[finalId] =
-            buildLiteralProgram(build, bc, finalId, lit_edges);
+            writeLiteralProgram(build, bc, finalId, lit_edges);
         delayRebuildPrograms[finalId] =
             buildDelayRebuildProgram(build, bc, finalId);
     }
@@ -4020,33 +4035,53 @@ pair<u32, u32> buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) {
 }
 
 static
-u32 writeEodProgram(RoseBuildImpl &build, build_context &bc) {
-    if (build.eod_event_literal_id == MO_INVALID_IDX) {
+u32 writeEodProgram(RoseBuildImpl &build, build_context &bc,
+                    u32 eodNfaIterOffset) {
+    vector<RoseInstruction> program;
+
+    if (build.eod_event_literal_id != MO_INVALID_IDX) {
+        const RoseGraph &g = build.g;
+        const auto &lit_info =
+            build.literal_info.at(build.eod_event_literal_id);
+        assert(lit_info.delayed_ids.empty());
+        assert(!lit_info.squash_group);
+        assert(!lit_info.requires_benefits);
+
+        // Collect all edges leading into EOD event literal vertices.
+        vector<RoseEdge> edge_list;
+        for (const auto &v : lit_info.vertices) {
+            for (const auto &e : in_edges_range(v, g)) {
+                edge_list.push_back(e);
+            }
+        }
+
+        // Sort edge list for determinism, prettiness.
+        sort(begin(edge_list), end(edge_list),
+             [&g](const RoseEdge &a, const RoseEdge &b) {
+                 return tie(g[source(a, g)].idx, g[target(a, g)].idx) <
+                        tie(g[source(b, g)].idx, g[target(b, g)].idx);
+             });
+
+        program = buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list);
+    }
+
+    if (eodNfaIterOffset) {
+        auto ri = RoseInstruction(ROSE_INSTR_ENGINES_EOD);
+        ri.u.enginesEod.iter_offset = eodNfaIterOffset;
+        if (!program.empty()) {
+            assert(program.back().code() == ROSE_INSTR_END);
+            program.pop_back();
+        }
+        program.push_back(move(ri));
+        program = flattenProgram({program});
+    }
+
+    if (program.empty()) {
         return 0;
     }
 
-    const RoseGraph &g = build.g;
-    const auto &lit_info = build.literal_info.at(build.eod_event_literal_id);
-    assert(lit_info.delayed_ids.empty());
-    assert(!lit_info.squash_group);
-    assert(!lit_info.requires_benefits);
-
-    // Collect all edges leading into EOD event literal vertices.
- vector edge_list; - for (const auto &v : lit_info.vertices) { - for (const auto &e : in_edges_range(v, g)) { - edge_list.push_back(e); - } - } - - // Sort edge list for determinism, prettiness. - sort(begin(edge_list), end(edge_list), - [&g](const RoseEdge &a, const RoseEdge &b) { - return tie(g[source(a, g)].idx, g[target(a, g)].idx) < - tie(g[source(b, g)].idx, g[target(b, g)].idx); - }); - - return buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list); + applyFinalSpecialisation(program); + return writeProgram(bc, program); } static @@ -4210,7 +4245,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { tie(litProgramOffset, litDelayRebuildProgramOffset) = buildLiteralPrograms(*this, bc); - u32 eodProgramOffset = writeEodProgram(*this, bc); + u32 eodProgramOffset = writeEodProgram(*this, bc, eodNfaIterOffset); u32 eodIterProgramOffset; u32 eodIterOffset; tie(eodIterProgramOffset, eodIterOffset) = buildEodAnchorProgram(*this, bc); @@ -4412,7 +4447,6 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->eodProgramOffset = eodProgramOffset; engine->eodIterProgramOffset = eodIterProgramOffset; engine->eodIterOffset = eodIterOffset; - engine->eodNfaIterOffset = eodNfaIterOffset; engine->lastByteHistoryIterOffset = lastByteOffset; diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index ad776780..59f7f751 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -476,6 +476,11 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(ENGINES_EOD) { + os << " iter_offset " << ri->iter_offset << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(END) { return; } PROGRAM_NEXT_INSTRUCTION @@ -1022,7 +1027,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, eodProgramOffset); DUMP_U32(t, eodIterProgramOffset); DUMP_U32(t, eodIterOffset); - DUMP_U32(t, eodNfaIterOffset); DUMP_U32(t, lastByteHistoryIterOffset); DUMP_U32(t, minWidth); DUMP_U32(t, minWidthExcludingBoundaries); diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index bbe0b1b6..2e921542 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -380,9 +380,6 @@ struct RoseEngine { u32 eodIterProgramOffset; // or 0 if no eod iterator program u32 eodIterOffset; // offset to EOD sparse iter or 0 if none - /** \brief Offset to sparse iter over outfix/suffix NFAs that accept EOD. */ - u32 eodNfaIterOffset; - u32 lastByteHistoryIterOffset; // if non-zero /** \brief Minimum number of bytes required to match. */ diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 5c57bf54..b8961117 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -96,6 +96,10 @@ enum RoseInstructionCode { ROSE_INSTR_CHECK_STATE, //!< Test a single bit in the state multibit. ROSE_INSTR_SPARSE_ITER_BEGIN, //!< Begin running a sparse iter over states. ROSE_INSTR_SPARSE_ITER_NEXT, //!< Continue running sparse iter over states. + + /** \brief Check outfixes and suffixes for EOD and fire reports if so. */ + ROSE_INSTR_ENGINES_EOD, + ROSE_INSTR_END //!< End of program. }; @@ -352,6 +356,11 @@ struct ROSE_STRUCT_SPARSE_ITER_NEXT { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_ENGINES_EOD { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. +}; + struct ROSE_STRUCT_END { u8 code; //!< From enum RoseInstructionCode. 
 };

From 02595cda1f48302777d91077bbdac341bf14137c Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Fri, 10 Jun 2016 10:09:15 +1000
Subject: [PATCH 041/166] eod: consolidate eod anchor programs

---
 src/rose/eod.c                   | 10 ------
 src/rose/rose_build_bytecode.cpp | 58 ++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/src/rose/eod.c b/src/rose/eod.c
index 4961a728..30b8db0e 100644
--- a/src/rose/eod.c
+++ b/src/rose/eod.c
@@ -210,16 +210,6 @@ void roseEodExec_i(const struct RoseEngine *t, u64a offset,
         return;
     }
 
-    if (!t->eodIterProgramOffset && !t->ematcherOffset) {
-        DEBUG_PRINTF("no eod accepts\n");
-        return;
-    }
-
-    // Handle pending EOD reports.
-    if (roseEodRunIterator(t, offset, scratch) == MO_HALT_MATCHING) {
-        return;
-    }
-
     // Run the EOD anchored matcher if there is one.
     if (t->ematcherOffset) {
         assert(t->ematcherRegionSize);
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 904f8df9..1c8e8cd0 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -4005,6 +4005,11 @@ pair<u32, u32> buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) {
         for (const auto &e : in_edges_range(v, g)) {
             RoseVertex u = source(e, g);
 
+            if (!build.isInETable(u)) {
+                DEBUG_PRINTF("pred %zu is not in etable\n", g[u].idx);
+                continue;
+            }
+
             if (canEagerlyReportAtEod(build, e)) {
                 DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx);
                 continue;
@@ -4034,6 +4039,57 @@ pair<u32, u32> buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) {
     return {writeProgram(bc, program), iter_offset};
 }
 
+static
+void addGeneralEodAnchorProgram(RoseBuildImpl &build, build_context &bc,
+                                vector<RoseInstruction> &program) {
+    const RoseGraph &g = build.g;
+
+    // pred state id -> list of programs
+    map<u32, vector<vector<RoseInstruction>>> predProgramLists;
+
+    for (auto v : vertices_range(g)) {
+        if (!g[v].eod_accept) {
+            continue;
+        }
+
+        DEBUG_PRINTF("vertex %zu (with %zu preds) fires on EOD\n", g[v].idx,
+                     in_degree(v, g));
+
+        for (const auto &e : in_edges_range(v, g)) {
+            RoseVertex u = source(e, g);
+
+            if (build.isInETable(u)) {
+                DEBUG_PRINTF("pred %zu is in etable\n", g[u].idx);
+                continue;
+            }
+
+            if (canEagerlyReportAtEod(build, e)) {
+                DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx);
+                continue;
+            }
+
+            assert(contains(bc.roleStateIndices, u));
+            u32 predStateIdx = bc.roleStateIndices.at(u);
+
+            auto program = makeEodAnchorProgram(build, bc, e);
+            predProgramLists[predStateIdx].push_back(program);
+        }
+    }
+
+    if (predProgramLists.empty()) {
+        DEBUG_PRINTF("no eod anchored roles\n");
+        return;
+    }
+
+    if (!program.empty()) {
+        assert(program.back().code() == ROSE_INSTR_END);
+        program.pop_back();
+    }
+    // TODO: don't force sparse iter, be more careful with generating
+    // CHECK_NOT_HANDLED.
+    addPredBlocks(bc, predProgramLists, program, true);
+}
+
 static
 u32 writeEodProgram(RoseBuildImpl &build, build_context &bc,
                     u32 eodNfaIterOffset) {
@@ -4076,6 +4132,8 @@ u32 writeEodProgram(RoseBuildImpl &build, build_context &bc,
         program = flattenProgram({program});
     }
 
+    addGeneralEodAnchorProgram(build, bc, program);
+
     if (program.empty()) {
         return 0;
     }

From 7a7dff5b705e94a511e30fbca71e49f02c4fedb0 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Fri, 10 Jun 2016 11:39:22 +1000
Subject: [PATCH 042/166] eod: don't force sparse iter for general prog

---
 src/rose/rose_build_bytecode.cpp | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 1c8e8cd0..b37b259a 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -3958,7 +3958,8 @@ u32 buildReportPrograms(RoseBuildImpl &build, build_context &bc) {
 static
 vector<RoseInstruction> makeEodAnchorProgram(RoseBuildImpl &build,
                                              build_context &bc,
-                                             const RoseEdge &e) {
+                                             const RoseEdge &e,
+                                             const bool multiple_preds) {
     const RoseGraph &g = build.g;
     const RoseVertex v = target(e, g);
 
@@ -3968,7 +3969,7 @@ vector<RoseInstruction> makeEodAnchorProgram(RoseBuildImpl &build,
         makeRoleCheckBounds(build, v, e, program);
     }
 
-    if (hasGreaterInDegree(1, v, g)) {
+    if (multiple_preds) {
         // Only necessary when there is more than one pred.
         makeRoleCheckNotHandled(bc, v, program);
     }
@@ -4002,23 +4003,27 @@ pair<u32, u32> buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) {
         DEBUG_PRINTF("vertex %zu (with %zu preds) fires on EOD\n", g[v].idx,
                      in_degree(v, g));
 
+        vector<RoseEdge> edge_list;
         for (const auto &e : in_edges_range(v, g)) {
             RoseVertex u = source(e, g);
-
             if (!build.isInETable(u)) {
                 DEBUG_PRINTF("pred %zu is not in etable\n", g[u].idx);
                 continue;
             }
-
             if (canEagerlyReportAtEod(build, e)) {
                 DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx);
                 continue;
             }
+            edge_list.push_back(e);
+        }
 
+        const bool multiple_preds = edge_list.size() > 1;
+        for (const auto &e : edge_list) {
+            RoseVertex u = source(e, g);
             assert(contains(bc.roleStateIndices, u));
             u32 predStateIdx = bc.roleStateIndices.at(u);
 
-            auto program = makeEodAnchorProgram(build, bc, e);
+            auto program = makeEodAnchorProgram(build, bc, e, multiple_preds);
             predProgramLists[predStateIdx].push_back(program);
         }
     }
@@ -4055,23 +4060,27 @@ void addGeneralEodAnchorProgram(RoseBuildImpl &build, build_context &bc,
         DEBUG_PRINTF("vertex %zu (with %zu preds) fires on EOD\n", g[v].idx,
                      in_degree(v, g));
 
+        vector<RoseEdge> edge_list;
         for (const auto &e : in_edges_range(v, g)) {
             RoseVertex u = source(e, g);
-
             if (build.isInETable(u)) {
                 DEBUG_PRINTF("pred %zu is in etable\n", g[u].idx);
                 continue;
             }
-
             if (canEagerlyReportAtEod(build, e)) {
                 DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx);
                 continue;
             }
+            edge_list.push_back(e);
+        }
 
+        const bool multiple_preds = edge_list.size() > 1;
+        for (const auto &e : edge_list) {
+            RoseVertex u = source(e, g);
             assert(contains(bc.roleStateIndices, u));
             u32 predStateIdx = bc.roleStateIndices.at(u);
 
-            auto program = makeEodAnchorProgram(build, bc, e);
+            auto program = makeEodAnchorProgram(build, bc, e, multiple_preds);
             predProgramLists[predStateIdx].push_back(program);
         }
     }
@@ -4085,9 +4094,7 @@ void addGeneralEodAnchorProgram(RoseBuildImpl &build, build_context &bc,
         assert(program.back().code() == ROSE_INSTR_END);
         program.pop_back();
     }
-    // TODO: don't force sparse iter, be more careful with generating
-    // CHECK_NOT_HANDLED.
- addPredBlocks(bc, predProgramLists, program, true); + addPredBlocks(bc, predProgramLists, program, false); } static From 9669e0fe941318a729c363a570121a8d110ce27a Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 10 Jun 2016 11:49:08 +1000 Subject: [PATCH 043/166] eod: remove forced sparse iter optimization --- src/rose/rose.h | 39 +------------------------------- src/rose/rose_build_bytecode.cpp | 16 ++++--------- src/rose/rose_dump.cpp | 1 - src/rose/rose_internal.h | 1 - 4 files changed, 6 insertions(+), 51 deletions(-) diff --git a/src/rose/rose.h b/src/rose/rose.h index e90d2f21..d79c2f0c 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -43,39 +43,6 @@ void roseBlockEodExec(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch); void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch); -static really_inline -int roseBlockHasEodWork(const struct RoseEngine *t, - struct hs_scratch *scratch) { - if (t->ematcherOffset) { - DEBUG_PRINTF("eod matcher to run\n"); - return 1; - } - - if (t->eodProgramOffset) { - DEBUG_PRINTF("has eod program\n"); - return 1; - } - - void *state = scratch->core_info.state; - if (mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { - DEBUG_PRINTF("active outfix/suffix engines\n"); - return 1; - } - - if (t->eodIterOffset) { - u32 idx; - const struct mmbit_sparse_iter *it = getByOffset(t, t->eodIterOffset); - struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - if (mmbit_sparse_iter_begin(getRoleState(state), t->rolesWithStateCount, - &idx, it, si_state) != MMB_INVALID) { - DEBUG_PRINTF("eod iter has states on\n"); - return 1; - } - } - - return 0; -} - /* assumes core_info in scratch has been init to point to data */ static really_inline void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { @@ -102,6 +69,7 @@ void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { roseBlockExec_i(t, scratch); if (!t->requiresEodCheck) { + DEBUG_PRINTF("no eod check required\n"); return; } @@ -110,11 +78,6 @@ void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { return; } - if (!roseBlockHasEodWork(t, scratch)) { - DEBUG_PRINTF("no eod work\n"); - return; - } - roseBlockEodExec(t, length, scratch); } diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index b37b259a..5d235fca 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -3989,7 +3989,7 @@ vector makeEodAnchorProgram(RoseBuildImpl &build, * Returns the pair (program offset, sparse iter offset). */ static -pair buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { +u32 writeEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { const RoseGraph &g = build.g; // pred state id -> list of programs @@ -4030,18 +4030,15 @@ pair buildEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { if (predProgramLists.empty()) { DEBUG_PRINTF("no eod anchored roles\n"); - return {0, 0}; + return 0; } vector program; - - // Note: we force the use of a sparse iterator for the EOD program so we - // can easily guard EOD execution at runtime. 
- u32 iter_offset = addPredBlocks(bc, predProgramLists, program, true); + addPredBlocks(bc, predProgramLists, program, false); assert(program.size() > 1); applyFinalSpecialisation(program); - return {writeProgram(bc, program), iter_offset}; + return writeProgram(bc, program); } static @@ -4311,9 +4308,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { buildLiteralPrograms(*this, bc); u32 eodProgramOffset = writeEodProgram(*this, bc, eodNfaIterOffset); - u32 eodIterProgramOffset; - u32 eodIterOffset; - tie(eodIterProgramOffset, eodIterOffset) = buildEodAnchorProgram(*this, bc); + u32 eodIterProgramOffset = writeEodAnchorProgram(*this, bc); vector activeLeftIter; buildActiveLeftIter(leftInfoTable, activeLeftIter); @@ -4511,7 +4506,6 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->eodProgramOffset = eodProgramOffset; engine->eodIterProgramOffset = eodIterProgramOffset; - engine->eodIterOffset = eodIterOffset; engine->lastByteHistoryIterOffset = lastByteOffset; diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index 59f7f751..8de33a44 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -1026,7 +1026,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, lookaroundReachOffset); DUMP_U32(t, eodProgramOffset); DUMP_U32(t, eodIterProgramOffset); - DUMP_U32(t, eodIterOffset); DUMP_U32(t, lastByteHistoryIterOffset); DUMP_U32(t, minWidth); DUMP_U32(t, minWidthExcludingBoundaries); diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 2e921542..faab45f7 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -378,7 +378,6 @@ struct RoseEngine { u32 eodProgramOffset; //!< Unconditional EOD program, otherwise 0. u32 eodIterProgramOffset; // or 0 if no eod iterator program - u32 eodIterOffset; // offset to EOD sparse iter or 0 if none u32 lastByteHistoryIterOffset; // if non-zero From 2761e0105d16341eaa80c7f978abaf07df923756 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 10 Jun 2016 14:51:15 +1000 Subject: [PATCH 044/166] eod: more suffix iteration into program --- src/rose/eod.c | 43 --------------------------- src/rose/program_runtime.h | 50 ++++++++++++++++++++++++++++++++ src/rose/rose_build_bytecode.cpp | 35 +++++++++++++++++++--- src/rose/rose_dump.cpp | 3 ++ src/rose/rose_program.h | 8 +++++ 5 files changed, 92 insertions(+), 47 deletions(-) diff --git a/src/rose/eod.c b/src/rose/eod.c index 30b8db0e..48b330d4 100644 --- a/src/rose/eod.c +++ b/src/rose/eod.c @@ -129,47 +129,6 @@ void cleanupAfterEodMatcher(const struct RoseEngine *t, u64a offset, roseFlushLastByteHistory(t, scratch, offset); } -static rose_inline -void roseCheckEodSuffixes(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - const u8 *aa = getActiveLeafArray(t, scratch->core_info.state); - const u32 aaCount = t->activeArrayCount; - UNUSED u32 qCount = t->queueCount; - - for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; - qi = mmbit_iterate(aa, aaCount, qi)) { - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - const struct NFA *nfa = getNfaByInfo(t, info); - - assert(nfaAcceptsEod(nfa)); - - DEBUG_PRINTF("checking nfa %u\n", qi); - - assert(fatbit_isset(scratch->aqa, qCount, qi)); /* we have just been - triggered */ - - char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = scratch->core_info.state + info->stateOffset; - - struct mq *q = scratch->queues + qi; - - pushQueueNoMerge(q, MQE_END, scratch->core_info.len); - 
- q->context = NULL; - /* rose exec is used as we don't want to / can't raise matches in the - * history buffer. */ - char rv = nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); - if (rv) { /* nfa is still alive */ - if (nfaCheckFinalState(nfa, fstate, sstate, offset, - roseReportAdaptor, roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return; - } - } - } -} - static rose_inline int roseRunEodProgram(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch) { @@ -229,8 +188,6 @@ void roseEodExec_i(const struct RoseEngine *t, u64a offset, if (roseEodRunIterator(t, offset, scratch) == MO_HALT_MATCHING) { return; } - - roseCheckEodSuffixes(t, offset, scratch); } } diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index a656c715..5387f59f 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -851,6 +851,48 @@ hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, return HWLM_CONTINUE_MATCHING; } +static rose_inline +hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, info); + + assert(nfaAcceptsEod(nfa)); + + DEBUG_PRINTF("checking nfa %u\n", qi); + + /* We have just been triggered. */ + assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); + + char *fstate = scratch->fullState + info->fullStateOffset; + const char *sstate = scratch->core_info.state + info->stateOffset; + + struct mq *q = scratch->queues + qi; + + pushQueueNoMerge(q, MQE_END, scratch->core_info.len); + + q->context = NULL; + /* rose exec is used as we don't want to / can't raise matches in the + * history buffer. 
*/ + if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { + DEBUG_PRINTF("nfa is dead\n"); + continue; + } + if (nfaCheckFinalState(nfa, fstate, sstate, offset, roseReportAdaptor, + roseReportSomAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + static void updateSeqPoint(struct RoseContext *tctxt, u64a offset, const char from_mpv) { @@ -1360,6 +1402,14 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(SUFFIXES_EOD) { + if (roseSuffixesEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(END) { DEBUG_PRINTF("finished\n"); return HWLM_CONTINUE_MATCHING; diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 5d235fca..846e3e1b 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -224,6 +224,7 @@ public: case ROSE_INSTR_SPARSE_ITER_BEGIN: return &u.sparseIterBegin; case ROSE_INSTR_SPARSE_ITER_NEXT: return &u.sparseIterNext; case ROSE_INSTR_ENGINES_EOD: return &u.enginesEod; + case ROSE_INSTR_SUFFIXES_EOD: return &u.suffixesEod; case ROSE_INSTR_END: return &u.end; } assert(0); @@ -271,6 +272,7 @@ public: case ROSE_INSTR_SPARSE_ITER_BEGIN: return sizeof(u.sparseIterBegin); case ROSE_INSTR_SPARSE_ITER_NEXT: return sizeof(u.sparseIterNext); case ROSE_INSTR_ENGINES_EOD: return sizeof(u.enginesEod); + case ROSE_INSTR_SUFFIXES_EOD: return sizeof(u.suffixesEod); case ROSE_INSTR_END: return sizeof(u.end); } assert(0); @@ -317,6 +319,7 @@ public: ROSE_STRUCT_SPARSE_ITER_BEGIN sparseIterBegin; ROSE_STRUCT_SPARSE_ITER_NEXT sparseIterNext; ROSE_STRUCT_ENGINES_EOD enginesEod; + ROSE_STRUCT_SUFFIXES_EOD suffixesEod; ROSE_STRUCT_END end; } u; @@ -3985,6 +3988,19 @@ vector makeEodAnchorProgram(RoseBuildImpl &build, return program; } +static +bool hasEodAnchoredSuffix(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (g[v].suffix && build.isInETable(v)) { + DEBUG_PRINTF("vertex %zu is in eod table and has a suffix\n", + g[v].idx); + return true; + } + } + return false; +} + /** * Returns the pair (program offset, sparse iter offset). 
 */
@@ -4028,13 +4044,24 @@ u32 writeEodAnchorProgram(RoseBuildImpl &build, build_context &bc) {
         }
     }
 
-    if (predProgramLists.empty()) {
-        DEBUG_PRINTF("no eod anchored roles\n");
+    vector<RoseInstruction> program;
+    if (!predProgramLists.empty()) {
+        addPredBlocks(bc, predProgramLists, program, false);
+    }
+
+    if (hasEodAnchoredSuffix(build)) {
+        if (!program.empty()) {
+            assert(program.back().code() == ROSE_INSTR_END);
+            program.pop_back();
+        }
+        program.emplace_back(ROSE_INSTR_SUFFIXES_EOD);
+    }
+
+    if (program.empty()) {
         return 0;
     }
 
-    vector<RoseInstruction> program;
-    addPredBlocks(bc, predProgramLists, program, false);
+    program = flattenProgram({program});
 
     assert(program.size() > 1);
     applyFinalSpecialisation(program);
     return writeProgram(bc, program);
 }

From b8f771e824202dffbd06de6320e006c44a1047e8 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Fri, 10 Jun 2016 16:10:03 +1000
Subject: [PATCH 045/166] rose_build_bytecode: tidy up addPredBlocks

---
 src/rose/rose_build_bytecode.cpp | 33 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 846e3e1b..6abd77db 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -3404,7 +3404,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc,
 }
 
 static
-u32 addPredBlocksSingle(
+void addPredBlocksSingle(
     map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
     vector<RoseInstruction> &program) {
 
@@ -3426,7 +3426,6 @@ u32 addPredBlocksSingle(
 
     auto prog = flattenProgram(prog_blocks);
     program.insert(end(program), begin(prog), end(prog));
-    return 0; // No iterator.
 }
 
 static
@@ -3439,7 +3438,7 @@ u32 programLength(const vector<RoseInstruction> &program) {
 }
 
 static
-u32 addPredBlocksMulti(build_context &bc,
+void addPredBlocksMulti(build_context &bc,
                   map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
                   vector<RoseInstruction> &program) {
     assert(!predProgramLists.empty());
@@ -3514,24 +3513,24 @@ u32 addPredBlocksMulti(build_context &bc,
     }
 
     program.insert(end(program), begin(sparse_program), end(sparse_program));
-
-    return iter_offset;
 }
 
 static
-u32 addPredBlocks(build_context &bc,
-                  map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
-                  vector<RoseInstruction> &program,
-                  bool force_sparse_iter) {
+void addPredBlocks(build_context &bc,
+                   map<u32, vector<vector<RoseInstruction>>> &predProgramLists,
+                   vector<RoseInstruction> &program) {
     const size_t num_preds = predProgramLists.size();
     if (num_preds == 0) {
         program = flattenProgram({program});
-        return 0; // No iterator.
-    } else if (!force_sparse_iter && num_preds == 1) {
-        return addPredBlocksSingle(predProgramLists, program);
-    } else {
-        return addPredBlocksMulti(bc, predProgramLists, program);
+        return;
     }
+
+    if (num_preds == 1) {
+        addPredBlocksSingle(predProgramLists, program);
+        return;
+    }
+
+    addPredBlocksMulti(bc, predProgramLists, program);
 }
 
 /**
@@ -3554,7 +3553,7 @@ vector<RoseInstruction> makeSparseIterProgram(build_context &bc,
     // Add blocks to deal with non-root edges (triggered by sparse iterator or
    // mmbit_isset checks). This operation will flatten the program up to this
    // point.
-    addPredBlocks(bc, predProgramLists, program, false);
+    addPredBlocks(bc, predProgramLists, program);
 
     // If we have a root program, replace the END instruction with it. Note
     // that the root program has already been flattened.
@@ -4046,7 +4045,7 @@ u32 writeEodAnchorProgram(RoseBuildImpl &build, build_context &bc) {
 
     vector<RoseInstruction> program;
     if (!predProgramLists.empty()) {
-        addPredBlocks(bc, predProgramLists, program, false);
+        addPredBlocks(bc, predProgramLists, program);
     }
 
     if (hasEodAnchoredSuffix(build)) {
@@ -4118,7 +4117,7 @@ void addGeneralEodAnchorProgram(RoseBuildImpl &build, build_context &bc,
         assert(program.back().code() == ROSE_INSTR_END);
         program.pop_back();
     }
-    addPredBlocks(bc, predProgramLists, program, false);
+    addPredBlocks(bc, predProgramLists, program);
 }
 
 static

From 39461cc8066849dfa6b9f9f879ddbc10f91df7ae Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Tue, 14 Jun 2016 10:01:28 +1000
Subject: [PATCH 046/166] eod: move hwlm execution into MATCHER_EOD instr

---
 src/rose/eod.c                   | 101 +-----------------------------
 src/rose/program_runtime.h       | 102 +++++++++++++++++++++++++++++++
 src/rose/rose_build_bytecode.cpp |  35 +++++++---
 src/rose/rose_dump.cpp           |   3 +
 src/rose/rose_program.h          |   7 +++
 5 files changed, 142 insertions(+), 106 deletions(-)

diff --git a/src/rose/eod.c b/src/rose/eod.c
index 48b330d4..771c77fe 100644
--- a/src/rose/eod.c
+++ b/src/rose/eod.c
@@ -55,80 +55,6 @@ void initContext(const struct RoseEngine *t, u64a offset,
     fatbit_clear(scratch->aqa);
 }
 
-static rose_inline
-hwlmcb_rv_t roseEodRunMatcher(const struct RoseEngine *t, u64a offset,
-                              struct hs_scratch *scratch,
-                              const char is_streaming) {
-    assert(t->ematcherOffset);
-
-    size_t eod_len;
-    const u8 *eod_data;
-    if (!is_streaming) { /* Block */
-        eod_data = scratch->core_info.buf;
-        eod_len = scratch->core_info.len;
-    } else { /* Streaming */
-        eod_len = scratch->core_info.hlen;
-        eod_data = scratch->core_info.hbuf;
-    }
-
-    assert(eod_data);
-    assert(eod_len);
-
-    // If we don't have enough bytes to produce a match from an EOD table scan,
-    // there's no point scanning.
-    if (eod_len < t->eodmatcherMinWidth) {
-        DEBUG_PRINTF("len=%zu < eodmatcherMinWidth=%u\n", eod_len,
-                     t->eodmatcherMinWidth);
-        return HWLM_CONTINUE_MATCHING;
-    }
-
-    // Ensure that we only need scan the last N bytes, where N is the length of
-    // the eod-anchored matcher region.
- size_t adj = eod_len - MIN(eod_len, t->ematcherRegionSize); - - DEBUG_PRINTF("eod offset=%llu, eod length=%zu\n", offset, eod_len); - - struct RoseContext *tctxt = &scratch->tctxt; - const struct HWLM *etable = getELiteralMatcher(t); - - hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, - tctxt->groups); - - // We may need to fire delayed matches - return cleanUpDelayed(t, scratch, 0, offset); -} - -static rose_inline -int roseEodRunIterator(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - if (!t->eodIterProgramOffset) { - return MO_CONTINUE_MATCHING; - } - - DEBUG_PRINTF("running eod program at offset %u\n", t->eodIterProgramOffset); - - const u64a som = 0; - const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; - if (roseRunProgram(t, scratch, t->eodIterProgramOffset, som, offset, - match_len, in_anchored, in_catchup, - from_mpv, skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { - return MO_HALT_MATCHING; - } - - return MO_CONTINUE_MATCHING; -} - -static rose_inline -void cleanupAfterEodMatcher(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - // Flush history to make sure it's consistent. - roseFlushLastByteHistory(t, scratch, offset); -} - static rose_inline int roseRunEodProgram(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch) { @@ -158,37 +84,14 @@ int roseRunEodProgram(const struct RoseEngine *t, u64a offset, static really_inline void roseEodExec_i(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch, const char is_streaming) { + struct hs_scratch *scratch, UNUSED const char is_streaming) { assert(t); assert(scratch->core_info.buf || scratch->core_info.hbuf); assert(!scratch->core_info.buf || !scratch->core_info.hbuf); assert(!can_stop_matching(scratch)); // Run the unconditional EOD program. - if (roseRunEodProgram(t, offset, scratch) == MO_HALT_MATCHING) { - return; - } - - // Run the EOD anchored matcher if there is one. - if (t->ematcherOffset) { - assert(t->ematcherRegionSize); - // Unset the reports we just fired so we don't fire them again below. - char *state = scratch->core_info.state; - mmbit_clear(getRoleState(state), t->rolesWithStateCount); - mmbit_clear(getActiveLeafArray(t, state), t->activeArrayCount); - - if (roseEodRunMatcher(t, offset, scratch, is_streaming) == - HWLM_TERMINATE_MATCHING) { - return; - } - - cleanupAfterEodMatcher(t, offset, scratch); - - // Fire any new EOD reports. 
- if (roseEodRunIterator(t, offset, scratch) == MO_HALT_MATCHING) { - return; - } - } + roseRunEodProgram(t, offset, scratch); } void roseEodExec(const struct RoseEngine *t, u64a offset, diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 5387f59f..5d255cf1 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -46,6 +46,13 @@ #include "util/fatbit.h" #include "util/multibit.h" +static rose_inline +hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, size_t match_len, + char in_anchored, char in_catchup, char from_mpv, + char skip_mpv_catchup); + static rose_inline int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, const u8 *and_mask, const u8 *exp_mask) { @@ -893,6 +900,93 @@ hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, return HWLM_CONTINUE_MATCHING; } +static rose_inline +int roseEodRunIterator(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + if (!t->eodIterProgramOffset) { + return MO_CONTINUE_MATCHING; + } + + DEBUG_PRINTF("running eod program at offset %u\n", t->eodIterProgramOffset); + + const u64a som = 0; + const size_t match_len = 0; + const char in_anchored = 0; + const char in_catchup = 0; + const char from_mpv = 0; + const char skip_mpv_catchup = 1; + if (roseRunProgram(t, scratch, t->eodIterProgramOffset, som, offset, + match_len, in_anchored, in_catchup, + from_mpv, skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + + return MO_CONTINUE_MATCHING; +} + +static +hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + assert(rose->ematcherOffset); + assert(rose->ematcherRegionSize); + + // Clear role state and active engines, since we have already handled all + // outstanding work there. + DEBUG_PRINTF("clear role state and active leaf array\n"); + char *state = scratch->core_info.state; + mmbit_clear(getRoleState(state), rose->rolesWithStateCount); + mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); + + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + size_t eod_len; + const u8 *eod_data; + if (!is_streaming) { /* Block */ + eod_data = scratch->core_info.buf; + eod_len = scratch->core_info.len; + } else { /* Streaming */ + eod_len = scratch->core_info.hlen; + eod_data = scratch->core_info.hbuf; + } + + assert(eod_data); + assert(eod_len); + + DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, + offset); + + // If we don't have enough bytes to produce a match from an EOD table scan, + // there's no point scanning. + if (eod_len < rose->eodmatcherMinWidth) { + DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth); + return HWLM_CONTINUE_MATCHING; + } + + // Ensure that we only need scan the last N bytes, where N is the length of + // the eod-anchored matcher region. + size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); + + const struct HWLM *etable = getELiteralMatcher(rose); + hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, + scratch->tctxt.groups); + + // We may need to fire delayed matches. + if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + + roseFlushLastByteHistory(rose, scratch, offset); + + // Fire any new EOD reports. 
+ if (roseEodRunIterator(rose, offset, scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + + return HWLM_CONTINUE_MATCHING; +} + static void updateSeqPoint(struct RoseContext *tctxt, u64a offset, const char from_mpv) { @@ -1410,6 +1504,14 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(MATCHER_EOD) { + if (roseMatcherEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(END) { DEBUG_PRINTF("finished\n"); return HWLM_CONTINUE_MATCHING; diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 6abd77db..c472337d 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -225,6 +225,7 @@ public: case ROSE_INSTR_SPARSE_ITER_NEXT: return &u.sparseIterNext; case ROSE_INSTR_ENGINES_EOD: return &u.enginesEod; case ROSE_INSTR_SUFFIXES_EOD: return &u.suffixesEod; + case ROSE_INSTR_MATCHER_EOD: return &u.matcherEod; case ROSE_INSTR_END: return &u.end; } assert(0); @@ -273,6 +274,7 @@ public: case ROSE_INSTR_SPARSE_ITER_NEXT: return sizeof(u.sparseIterNext); case ROSE_INSTR_ENGINES_EOD: return sizeof(u.enginesEod); case ROSE_INSTR_SUFFIXES_EOD: return sizeof(u.suffixesEod); + case ROSE_INSTR_MATCHER_EOD: return sizeof(u.matcherEod); case ROSE_INSTR_END: return sizeof(u.end); } assert(0); @@ -320,6 +322,7 @@ public: ROSE_STRUCT_SPARSE_ITER_NEXT sparseIterNext; ROSE_STRUCT_ENGINES_EOD enginesEod; ROSE_STRUCT_SUFFIXES_EOD suffixesEod; + ROSE_STRUCT_MATCHER_EOD matcherEod; ROSE_STRUCT_END end; } u; @@ -4000,6 +4003,18 @@ bool hasEodAnchoredSuffix(const RoseBuildImpl &build) { return false; } +static +bool hasEodMatcher(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (build.isInETable(v)) { + DEBUG_PRINTF("vertex %zu is in eod table\n", g[v].idx); + return true; + } + } + return false; +} + /** * Returns the pair (program offset, sparse iter offset). */ @@ -4108,16 +4123,22 @@ void addGeneralEodAnchorProgram(RoseBuildImpl &build, build_context &bc, } } - if (predProgramLists.empty()) { - DEBUG_PRINTF("no eod anchored roles\n"); - return; + if (!predProgramLists.empty()) { + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + addPredBlocks(bc, predProgramLists, program); } - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); + if (hasEodMatcher(build)) { + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + program.emplace_back(ROSE_INSTR_MATCHER_EOD); + program.emplace_back(ROSE_INSTR_END); } - addPredBlocks(bc, predProgramLists, program); } static diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index be43e559..5bcff4fc 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -484,6 +484,9 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { PROGRAM_CASE(SUFFIXES_EOD) {} PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(MATCHER_EOD) {} + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(END) { return; } PROGRAM_NEXT_INSTRUCTION diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 4a5521ef..cc3d07b0 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -104,6 +104,9 @@ enum RoseInstructionCode { * so. */ ROSE_INSTR_SUFFIXES_EOD, + /** \brief Run the EOD-anchored HWLM literal matcher. 
*/ + ROSE_INSTR_MATCHER_EOD, + ROSE_INSTR_END //!< End of program. }; @@ -369,6 +372,10 @@ struct ROSE_STRUCT_SUFFIXES_EOD { u8 code; //!< From enum RoseInstructionCode. }; +struct ROSE_STRUCT_MATCHER_EOD { + u8 code; //!< From enum RoseInstructionCode. +}; + struct ROSE_STRUCT_END { u8 code; //!< From enum RoseInstructionCode. }; From 78e4332a8b783330887a15dc1416595f47564670 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 10:39:02 +1000 Subject: [PATCH 047/166] move eod iter program into general eod program --- src/rose/program_runtime.h | 38 -------------------------------- src/rose/rose_build_bytecode.cpp | 36 ++++++++++++++---------------- src/rose/rose_dump.cpp | 11 +-------- src/rose/rose_internal.h | 3 +-- 4 files changed, 18 insertions(+), 70 deletions(-) diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 5d255cf1..e23a395c 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -46,13 +46,6 @@ #include "util/fatbit.h" #include "util/multibit.h" -static rose_inline -hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, size_t match_len, - char in_anchored, char in_catchup, char from_mpv, - char skip_mpv_catchup); - static rose_inline int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, const u8 *and_mask, const u8 *exp_mask) { @@ -901,30 +894,6 @@ hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, } static rose_inline -int roseEodRunIterator(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - if (!t->eodIterProgramOffset) { - return MO_CONTINUE_MATCHING; - } - - DEBUG_PRINTF("running eod program at offset %u\n", t->eodIterProgramOffset); - - const u64a som = 0; - const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; - if (roseRunProgram(t, scratch, t->eodIterProgramOffset, som, offset, - match_len, in_anchored, in_catchup, - from_mpv, skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { - return MO_HALT_MATCHING; - } - - return MO_CONTINUE_MATCHING; -} - -static hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, struct hs_scratch *scratch, u64a offset) { assert(rose->ematcherOffset); @@ -977,13 +946,6 @@ hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, } roseFlushLastByteHistory(rose, scratch, offset); - - // Fire any new EOD reports. - if (roseEodRunIterator(rose, offset, scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return HWLM_TERMINATE_MATCHING; - } - return HWLM_CONTINUE_MATCHING; } diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index c472337d..32150b79 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -4015,11 +4015,9 @@ bool hasEodMatcher(const RoseBuildImpl &build) { return false; } -/** - * Returns the pair (program offset, sparse iter offset). 
- */ static -u32 writeEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { +void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, + vector &program) { const RoseGraph &g = build.g; // pred state id -> list of programs @@ -4058,10 +4056,7 @@ u32 writeEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { } } - vector program; - if (!predProgramLists.empty()) { - addPredBlocks(bc, predProgramLists, program); - } + addPredBlocks(bc, predProgramLists, program); if (hasEodAnchoredSuffix(build)) { if (!program.empty()) { @@ -4069,17 +4064,8 @@ u32 writeEodAnchorProgram(RoseBuildImpl &build, build_context &bc) { program.pop_back(); } program.emplace_back(ROSE_INSTR_SUFFIXES_EOD); + program.emplace_back(ROSE_INSTR_END); } - - if (program.empty()) { - return 0; - } - - program = flattenProgram({program}); - - assert(program.size() > 1); - applyFinalSpecialisation(program); - return writeProgram(bc, program); } static @@ -4139,6 +4125,11 @@ void addGeneralEodAnchorProgram(RoseBuildImpl &build, build_context &bc, program.emplace_back(ROSE_INSTR_MATCHER_EOD); program.emplace_back(ROSE_INSTR_END); } + + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } } static @@ -4184,11 +4175,18 @@ u32 writeEodProgram(RoseBuildImpl &build, build_context &bc, } addGeneralEodAnchorProgram(build, bc, program); + addEodAnchorProgram(build, bc, program); + + if (program.size() == 1) { + assert(program.back().code() == ROSE_INSTR_END); + return 0; + } if (program.empty()) { return 0; } + applyFinalSpecialisation(program); return writeProgram(bc, program); } @@ -4355,7 +4353,6 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { buildLiteralPrograms(*this, bc); u32 eodProgramOffset = writeEodProgram(*this, bc, eodNfaIterOffset); - u32 eodIterProgramOffset = writeEodAnchorProgram(*this, bc); vector activeLeftIter; buildActiveLeftIter(leftInfoTable, activeLeftIter); @@ -4552,7 +4549,6 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->nfaInfoOffset = nfaInfoOffset; engine->eodProgramOffset = eodProgramOffset; - engine->eodIterProgramOffset = eodIterProgramOffset; engine->lastByteHistoryIterOffset = lastByteOffset; diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index 5bcff4fc..40979e8c 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -542,7 +542,7 @@ void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) { ofstream os(filename); const char *base = (const char *)t; - os << "Unconditional EOD Program:" << endl; + os << "EOD Program:" << endl; if (t->eodProgramOffset) { dumpProgram(os, t, base + t->eodProgramOffset); @@ -551,14 +551,6 @@ void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) { os << "" << endl; } - os << "Sparse Iter EOD Program:" << endl; - - if (t->eodIterProgramOffset) { - dumpProgram(os, t, base + t->eodIterProgramOffset); - } else { - os << "" << endl; - } - os.close(); } @@ -1031,7 +1023,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, lookaroundTableOffset); DUMP_U32(t, lookaroundReachOffset); DUMP_U32(t, eodProgramOffset); - DUMP_U32(t, eodIterProgramOffset); DUMP_U32(t, lastByteHistoryIterOffset); DUMP_U32(t, minWidth); DUMP_U32(t, minWidthExcludingBoundaries); diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index faab45f7..366636b6 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -376,8 +376,7 @@ struct RoseEngine { u32 lookaroundReachOffset; /**< base of 
lookaround reach bitvectors (32 * bytes each) */ - u32 eodProgramOffset; //!< Unconditional EOD program, otherwise 0. - u32 eodIterProgramOffset; // or 0 if no eod iterator program + u32 eodProgramOffset; //!< EOD program, otherwise 0. u32 lastByteHistoryIterOffset; // if non-zero From ae157034e904bf7e7e2c300a770433391126d3d7 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 10:56:30 +1000 Subject: [PATCH 048/166] eod: tidy runtime --- src/rose/eod.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/src/rose/eod.c b/src/rose/eod.c index 771c77fe..4dee0150 100644 --- a/src/rose/eod.c +++ b/src/rose/eod.c @@ -55,11 +55,17 @@ void initContext(const struct RoseEngine *t, u64a offset, fatbit_clear(scratch->aqa); } -static rose_inline -int roseRunEodProgram(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { +static really_inline +void roseEodExec_i(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch, UNUSED const char is_streaming) { + assert(t); + assert(scratch->core_info.buf || scratch->core_info.hbuf); + assert(!scratch->core_info.buf || !scratch->core_info.hbuf); + assert(!can_stop_matching(scratch)); + if (!t->eodProgramOffset) { - return MO_CONTINUE_MATCHING; + DEBUG_PRINTF("no eod program\n"); + return; } DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); @@ -73,25 +79,11 @@ int roseRunEodProgram(const struct RoseEngine *t, u64a offset, const char in_catchup = 0; const char from_mpv = 0; const char skip_mpv_catchup = 1; - if (roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, - in_anchored, in_catchup, from_mpv, - skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { - return MO_HALT_MATCHING; - } - return MO_CONTINUE_MATCHING; -} - -static really_inline -void roseEodExec_i(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch, UNUSED const char is_streaming) { - assert(t); - assert(scratch->core_info.buf || scratch->core_info.hbuf); - assert(!scratch->core_info.buf || !scratch->core_info.hbuf); - assert(!can_stop_matching(scratch)); - - // Run the unconditional EOD program. - roseRunEodProgram(t, offset, scratch); + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. 
+ roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, + in_anchored, in_catchup, from_mpv, skip_mpv_catchup); } void roseEodExec(const struct RoseEngine *t, u64a offset, From 426bfc9cfb1e1df5d11989d7741c4030f549f0db Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 11:47:02 +1000 Subject: [PATCH 049/166] rose_build_bytecode: clean up --- src/rose/rose_build_bytecode.cpp | 176 ++++++++++++++----------------- 1 file changed, 78 insertions(+), 98 deletions(-) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 32150b79..60df01b7 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -4017,7 +4017,7 @@ bool hasEodMatcher(const RoseBuildImpl &build) { static void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, - vector &program) { + bool in_etable, vector &program) { const RoseGraph &g = build.g; // pred state id -> list of programs @@ -4034,8 +4034,9 @@ void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, vector edge_list; for (const auto &e : in_edges_range(v, g)) { RoseVertex u = source(e, g); - if (!build.isInETable(u)) { - DEBUG_PRINTF("pred %zu is not in etable\n", g[u].idx); + if (build.isInETable(u) != in_etable) { + DEBUG_PRINTF("pred %zu %s in etable\n", g[u].idx, + in_etable ? "is not" : "is"); continue; } if (canEagerlyReportAtEod(build, e)) { @@ -4056,80 +4057,93 @@ void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, } } - addPredBlocks(bc, predProgramLists, program); - - if (hasEodAnchoredSuffix(build)) { - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - program.emplace_back(ROSE_INSTR_SUFFIXES_EOD); - program.emplace_back(ROSE_INSTR_END); + if (predProgramLists.empty()) { + return; } + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + addPredBlocks(bc, predProgramLists, program); } static -void addGeneralEodAnchorProgram(RoseBuildImpl &build, build_context &bc, - vector &program) { +void addEodEventProgram(RoseBuildImpl &build, build_context &bc, + vector &program) { + if (build.eod_event_literal_id == MO_INVALID_IDX) { + return; + } + const RoseGraph &g = build.g; + const auto &lit_info = build.literal_info.at(build.eod_event_literal_id); + assert(lit_info.delayed_ids.empty()); + assert(!lit_info.squash_group); + assert(!lit_info.requires_benefits); - // pred state id -> list of programs - map>> predProgramLists; - - for (auto v : vertices_range(g)) { - if (!g[v].eod_accept) { - continue; - } - - DEBUG_PRINTF("vertex %zu (with %zu preds) fires on EOD\n", g[v].idx, - in_degree(v, g)); - - vector edge_list; + // Collect all edges leading into EOD event literal vertices. 
+ vector edge_list; + for (const auto &v : lit_info.vertices) { for (const auto &e : in_edges_range(v, g)) { - RoseVertex u = source(e, g); - if (build.isInETable(u)) { - DEBUG_PRINTF("pred %zu is in etable\n", g[u].idx); - continue; - } - if (canEagerlyReportAtEod(build, e)) { - DEBUG_PRINTF("already done report for vertex %zu\n", g[u].idx); - continue; - } edge_list.push_back(e); } - - const bool multiple_preds = edge_list.size() > 1; - for (const auto &e : edge_list) { - RoseVertex u = source(e, g); - assert(contains(bc.roleStateIndices, u)); - u32 predStateIdx = bc.roleStateIndices.at(u); - - auto program = makeEodAnchorProgram(build, bc, e, multiple_preds); - predProgramLists[predStateIdx].push_back(program); - } } - if (!predProgramLists.empty()) { - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - addPredBlocks(bc, predProgramLists, program); + // Sort edge list for determinism, prettiness. + sort(begin(edge_list), end(edge_list), + [&g](const RoseEdge &a, const RoseEdge &b) { + return tie(g[source(a, g)].idx, g[target(a, g)].idx) < + tie(g[source(b, g)].idx, g[target(b, g)].idx); + }); + + auto prog = buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list); + program.insert(end(program), begin(prog), end(prog)); +} + +static +void addEnginesEodProgram(u32 eodNfaIterOffset, + vector &program) { + if (!eodNfaIterOffset) { + return; } - if (hasEodMatcher(build)) { - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - program.emplace_back(ROSE_INSTR_MATCHER_EOD); - program.emplace_back(ROSE_INSTR_END); + auto ri = RoseInstruction(ROSE_INSTR_ENGINES_EOD); + ri.u.enginesEod.iter_offset = eodNfaIterOffset; + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + program.push_back(move(ri)); + program.emplace_back(ROSE_INSTR_END); +} + +static +void addSuffixesEodProgram(const RoseBuildImpl &build, + vector &program) { + if (!hasEodAnchoredSuffix(build)) { + return; } if (!program.empty()) { assert(program.back().code() == ROSE_INSTR_END); program.pop_back(); } + program.emplace_back(ROSE_INSTR_SUFFIXES_EOD); + program.emplace_back(ROSE_INSTR_END); +} + +static +void addMatcherEodProgram(const RoseBuildImpl &build, + vector &program) { + if (!hasEodMatcher(build)) { + return; + } + + if (!program.empty()) { + assert(program.back().code() == ROSE_INSTR_END); + program.pop_back(); + } + program.emplace_back(ROSE_INSTR_MATCHER_EOD); + program.emplace_back(ROSE_INSTR_END); } static @@ -4137,45 +4151,12 @@ u32 writeEodProgram(RoseBuildImpl &build, build_context &bc, u32 eodNfaIterOffset) { vector program; - if (build.eod_event_literal_id != MO_INVALID_IDX) { - const RoseGraph &g = build.g; - const auto &lit_info = - build.literal_info.at(build.eod_event_literal_id); - assert(lit_info.delayed_ids.empty()); - assert(!lit_info.squash_group); - assert(!lit_info.requires_benefits); - - // Collect all edges leading into EOD event literal vertices. - vector edge_list; - for (const auto &v : lit_info.vertices) { - for (const auto &e : in_edges_range(v, g)) { - edge_list.push_back(e); - } - } - - // Sort edge list for determinism, prettiness. 
- sort(begin(edge_list), end(edge_list), - [&g](const RoseEdge &a, const RoseEdge &b) { - return tie(g[source(a, g)].idx, g[target(a, g)].idx) < - tie(g[source(b, g)].idx, g[target(b, g)].idx); - }); - - program = buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list); - } - - if (eodNfaIterOffset) { - auto ri = RoseInstruction(ROSE_INSTR_ENGINES_EOD); - ri.u.enginesEod.iter_offset = eodNfaIterOffset; - if (!program.empty()) { - assert(program.back().code() == ROSE_INSTR_END); - program.pop_back(); - } - program.push_back(move(ri)); - program = flattenProgram({program}); - } - - addGeneralEodAnchorProgram(build, bc, program); - addEodAnchorProgram(build, bc, program); + addEodEventProgram(build, bc, program); + addEnginesEodProgram(eodNfaIterOffset, program); + addEodAnchorProgram(build, bc, false, program); + addMatcherEodProgram(build, program); + addEodAnchorProgram(build, bc, true, program); + addSuffixesEodProgram(build, program); if (program.size() == 1) { assert(program.back().code() == ROSE_INSTR_END); @@ -4186,7 +4167,6 @@ u32 writeEodProgram(RoseBuildImpl &build, build_context &bc, return 0; } - applyFinalSpecialisation(program); return writeProgram(bc, program); } From 3e0232f0d6cf0df0db05f8cdf98519274e1f0a87 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 13:39:41 +1000 Subject: [PATCH 050/166] eod: retire getELiteralMatcher --- src/rose/program_runtime.h | 2 +- src/rose/rose_internal.h | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index e23a395c..30ecb4f7 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -935,7 +935,7 @@ hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, // the eod-anchored matcher region. 
size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); - const struct HWLM *etable = getELiteralMatcher(rose); + const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset); hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, scratch->tctxt.groups); diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 366636b6..af5b2a95 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -463,17 +463,6 @@ const struct HWLM *getFLiteralMatcher(const struct RoseEngine *t) { return (const struct HWLM *)lt; } -static really_inline -const void *getELiteralMatcher(const struct RoseEngine *t) { - if (!t->ematcherOffset) { - return NULL; - } - - const char *et = (const char *)t + t->ematcherOffset; - assert(ISALIGNED_N(et, 8)); - return et; -} - static really_inline const void *getSBLiteralMatcher(const struct RoseEngine *t) { if (!t->sbmatcherOffset) { From d9bd6d5deef42a4840698a44442b2d03c9b30a7e Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 14:22:42 +1000 Subject: [PATCH 051/166] roseSuffixesEod: trust the queue structure --- src/rose/program_runtime.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 30ecb4f7..4537c55b 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -859,32 +859,25 @@ hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; qi = mmbit_iterate(aa, aaCount, qi)) { - const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); - const struct NFA *nfa = getNfaByInfo(rose, info); - - assert(nfaAcceptsEod(nfa)); - DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); /* We have just been triggered. */ assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); - char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = scratch->core_info.state + info->stateOffset; - - struct mq *q = scratch->queues + qi; - pushQueueNoMerge(q, MQE_END, scratch->core_info.len); - q->context = NULL; + /* rose exec is used as we don't want to / can't raise matches in the * history buffer. 
*/ if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { DEBUG_PRINTF("nfa is dead\n"); continue; } - if (nfaCheckFinalState(nfa, fstate, sstate, offset, roseReportAdaptor, - roseReportSomAdaptor, + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, roseReportSomAdaptor, scratch) == MO_HALT_MATCHING) { DEBUG_PRINTF("user instructed us to stop\n"); return HWLM_TERMINATE_MATCHING; From 159c09b70eacf96540ce2f4057e33a37a68730af Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 14:25:52 +1000 Subject: [PATCH 052/166] roseEnginesEod: trust the queue structure --- src/rose/program_runtime.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 4537c55b..860f7599 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -826,22 +826,18 @@ hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); qi != MMB_INVALID; qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { - const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); - const struct NFA *nfa = getNfaByInfo(rose, info); - DEBUG_PRINTF("checking nfa %u\n", qi); - assert(nfaAcceptsEod(nfa)); - - char *fstate = scratch->fullState + info->fullStateOffset; - const char *sstate = scratch->core_info.state + info->stateOffset; + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); if (is_streaming) { // Decompress stream state. - nfaExpandState(nfa, fstate, sstate, offset, key); + nfaExpandState(q->nfa, q->state, q->streamState, offset, key); } - if (nfaCheckFinalState(nfa, fstate, sstate, offset, roseReportAdaptor, - roseReportSomAdaptor, + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, roseReportSomAdaptor, scratch) == MO_HALT_MATCHING) { DEBUG_PRINTF("user instructed us to stop\n"); return HWLM_TERMINATE_MATCHING; From a9eba12cce8077ac73668b5e3e9fbdd47f99cbf1 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 16:58:13 +1000 Subject: [PATCH 053/166] rose: inline block-mode eod check --- src/rose/block.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++-- src/rose/eod.c | 20 -------------------- src/rose/rose.h | 19 ++----------------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/src/rose/block.c b/src/rose/block.c index 5fc5c8a1..0df5144c 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -29,13 +29,14 @@ #include "catchup.h" #include "init.h" #include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "rose_common.h" #include "nfa/nfa_api.h" #include "nfa/nfa_internal.h" #include "nfa/nfa_rev_api.h" #include "nfa/mcclellan.h" #include "util/fatbit.h" -#include "rose.h" -#include "rose_common.h" static rose_inline void runAnchoredTableBlock(const struct RoseEngine *t, const void *atable, @@ -157,6 +158,38 @@ void init_for_block(const struct RoseEngine *t, struct hs_scratch *scratch, init_outfixes_for_block(t, scratch, state, is_small_block); } +static rose_inline +void roseBlockEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(t->requiresEodCheck); + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || offset <= t->maxBiAnchoredWidth); + + assert(!can_stop_matching(scratch)); + assert(t->eodProgramOffset); + + // Ensure that history is correct before we look for EOD matches. 
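+    // (This inlines what prepForEod in eod.c used to do; that helper is
+    // removed below.)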
+ roseFlushLastByteHistory(t, scratch, offset); + scratch->tctxt.lastEndOffset = offset; + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. + assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const size_t match_len = 0; + const char in_anchored = 0; + const char in_catchup = 0; + const char from_mpv = 0; + const char skip_mpv_catchup = 1; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, + in_anchored, in_catchup, from_mpv, skip_mpv_catchup); +} + void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch) { assert(t); assert(scratch); @@ -255,4 +288,16 @@ exit:; assert(!can_stop_matching(scratch)); roseCatchUpTo(t, scratch, length); + + if (!t->requiresEodCheck || !t->eodProgramOffset) { + DEBUG_PRINTF("no eod check required\n"); + return; + } + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing, already halted\n"); + return; + } + + roseBlockEodExec(t, length, scratch); } diff --git a/src/rose/eod.c b/src/rose/eod.c index 4dee0150..249e7a9c 100644 --- a/src/rose/eod.c +++ b/src/rose/eod.c @@ -108,23 +108,3 @@ void roseEodExec(const struct RoseEngine *t, u64a offset, initContext(t, offset, scratch); roseEodExec_i(t, offset, scratch, 1); } - -static rose_inline -void prepForEod(const struct RoseEngine *t, struct hs_scratch *scratch, - size_t length) { - roseFlushLastByteHistory(t, scratch, length); - scratch->tctxt.lastEndOffset = length; -} - -void roseBlockEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - assert(t->requiresEodCheck); - assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF - || offset <= t->maxBiAnchoredWidth); - - assert(!can_stop_matching(scratch)); - - // Ensure that history is correct before we look for EOD matches - prepForEod(t, scratch, scratch->core_info.len); - roseEodExec_i(t, offset, scratch, 0); -} diff --git a/src/rose/rose.h b/src/rose/rose.h index d79c2f0c..5b7940a2 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -39,8 +39,6 @@ // Initialise state space for engine use. void roseInitState(const struct RoseEngine *t, char *state); -void roseBlockEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch); void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch); /* assumes core_info in scratch has been init to point to data */ @@ -57,28 +55,15 @@ void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { // If this block is shorter than our minimum width, then no pattern in this // RoseEngine could match. /* minWidth checks should have already been performed by the caller */ - const size_t length = scratch->core_info.len; - assert(length >= t->minWidth); + assert(scratch->core_info.len >= t->minWidth); // Similarly, we may have a maximum width (for engines constructed entirely // of bi-anchored patterns). 
/* This check is now handled by the interpreter */ assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF - || length <= t->maxBiAnchoredWidth); + || scratch->core_info.len <= t->maxBiAnchoredWidth); roseBlockExec_i(t, scratch); - - if (!t->requiresEodCheck) { - DEBUG_PRINTF("no eod check required\n"); - return; - } - - if (can_stop_matching(scratch)) { - DEBUG_PRINTF("bailing, already halted\n"); - return; - } - - roseBlockEodExec(t, length, scratch); } /* assumes core_info in scratch has been init to point to data */ From 513ac11dbcd28ec6cd76aec9a3166dbb2d788a3e Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 17:13:35 +1000 Subject: [PATCH 054/166] block: move matcher invocations into functions --- src/rose/block.c | 140 ++++++++++++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 57 deletions(-) diff --git a/src/rose/block.c b/src/rose/block.c index 0df5144c..0e70f549 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -190,6 +190,82 @@ void roseBlockEodExec(const struct RoseEngine *t, u64a offset, in_anchored, in_catchup, from_mpv, skip_mpv_catchup); } +/** + * \brief Run the anchored matcher, if any. Returns non-zero if matching should + * halt. + */ +static rose_inline +int roseBlockAnchored(const struct RoseEngine *t, struct hs_scratch *scratch) { + const void *atable = getALiteralMatcher(t); + if (!atable) { + DEBUG_PRINTF("no anchored table\n"); + return 0; + } + + const size_t length = scratch->core_info.len; + + if (t->amatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->amatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->amatcherMinWidth) { + return 0; + } + + runAnchoredTableBlock(t, atable, scratch); + + return can_stop_matching(scratch); +} + +/** + * \brief Run the floating matcher, if any. Returns non-zero if matching should + * halt. 
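+ *
+ * Bails out early if the floating matcher's min width, max bi-anchored
+ * width or floating distance bounds rule out a match for this block.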
+ */ +static rose_inline +int roseBlockFloating(const struct RoseEngine *t, struct hs_scratch *scratch) { + const struct HWLM *ftable = getFLiteralMatcher(t); + if (!ftable) { + return 0; + } + + const size_t length = scratch->core_info.len; + char *state = scratch->core_info.state; + struct RoseContext *tctxt = &scratch->tctxt; + + DEBUG_PRINTF("ftable fd=%u fmd %u\n", t->floatingDistance, + t->floatingMinDistance); + if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { + DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); + return 0; + } + + if (t->fmatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->fmatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->fmatcherMinWidth) { + return 0; + } + + const u8 *buffer = scratch->core_info.buf; + size_t flen = length; + if (t->floatingDistance != ROSE_BOUND_INF) { + flen = MIN(t->floatingDistance, length); + } + if (flen <= t->floatingMinDistance) { + return 0; + } + + DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); + DEBUG_PRINTF("-- %016llx\n", tctxt->groups); + hwlmExec(ftable, buffer, flen, t->floatingMinDistance, roseCallback, + scratch, tctxt->groups); + + return can_stop_matching(scratch); +} + void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch) { assert(t); assert(scratch); @@ -222,65 +298,15 @@ void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch) { DEBUG_PRINTF("-- %016llx\n", tctxt->groups); hwlmExec(sbtable, scratch->core_info.buf, sblen, 0, roseCallback, scratch, tctxt->groups); - goto exit; + } else { + if (roseBlockAnchored(t, scratch)) { + return; + } + if (roseBlockFloating(t, scratch)) { + return; + } } - const void *atable = getALiteralMatcher(t); - - if (atable) { - if (t->amatcherMaxBiAnchoredWidth != ROSE_BOUND_INF - && length > t->amatcherMaxBiAnchoredWidth) { - goto skip_atable; - } - - if (length < t->amatcherMinWidth) { - goto skip_atable; - } - - - runAnchoredTableBlock(t, atable, scratch); - - if (can_stop_matching(scratch)) { - goto exit; - } - - skip_atable:; - } - - const struct HWLM *ftable = getFLiteralMatcher(t); - if (ftable) { - DEBUG_PRINTF("ftable fd=%u fmd %u\n", t->floatingDistance, - t->floatingMinDistance); - if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { - DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); - goto exit; - } - - if (t->fmatcherMaxBiAnchoredWidth != ROSE_BOUND_INF - && length > t->fmatcherMaxBiAnchoredWidth) { - goto exit; - } - - if (length < t->fmatcherMinWidth) { - goto exit; - } - - const u8 *buffer = scratch->core_info.buf; - size_t flen = length; - if (t->floatingDistance != ROSE_BOUND_INF) { - flen = MIN(t->floatingDistance, length); - } - if (flen <= t->floatingMinDistance) { - goto exit; - } - - DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); - DEBUG_PRINTF("-- %016llx\n", tctxt->groups); - hwlmExec(ftable, buffer, flen, t->floatingMinDistance, - roseCallback, scratch, tctxt->groups); - } - -exit:; if (cleanUpDelayed(t, scratch, length, 0) == HWLM_TERMINATE_MATCHING) { return; } From 66e0b77aa4671d495bfcce946c9fdea7cf2d5770 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 15 Jun 2016 09:35:48 +1000 Subject: [PATCH 055/166] block: unify roseBlockExec and roseBlockExec_i --- src/rose/block.c | 17 ++++++++++++++++- src/rose/rose.h | 34 ++++------------------------------ 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/src/rose/block.c b/src/rose/block.c index 0e70f549..c0b5e0e4 100644 --- a/src/rose/block.c +++ 
b/src/rose/block.c @@ -266,13 +266,28 @@ int roseBlockFloating(const struct RoseEngine *t, struct hs_scratch *scratch) { return can_stop_matching(scratch); } -void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch) { +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { assert(t); assert(scratch); assert(scratch->core_info.buf); assert(mmbit_sparse_iter_state_size(t->rolesWithStateCount) < MAX_SPARSE_ITER_STATES); + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + // If this block is shorter than our minimum width, then no pattern in this + // RoseEngine could match. + /* minWidth checks should have already been performed by the caller */ + assert(scratch->core_info.len >= t->minWidth); + + // Similarly, we may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). + /* This check is now handled by the interpreter */ + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || scratch->core_info.len <= t->maxBiAnchoredWidth); + const size_t length = scratch->core_info.len; // We have optimizations for small block scans: we run a single coalesced diff --git a/src/rose/rose.h b/src/rose/rose.h index 5b7940a2..e3537774 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -29,42 +29,16 @@ #ifndef ROSE_H #define ROSE_H -#include "rose_types.h" -#include "rose_internal.h" -#include "runtime.h" -#include "scratch.h" #include "ue2common.h" -#include "util/multibit.h" + +struct RoseEngine; +struct hs_scratch; // Initialise state space for engine use. void roseInitState(const struct RoseEngine *t, char *state); -void roseBlockExec_i(const struct RoseEngine *t, struct hs_scratch *scratch); - /* assumes core_info in scratch has been init to point to data */ -static really_inline -void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { - assert(t); - assert(scratch); - assert(scratch->core_info.buf); - - // We should not have been called if we've already been told to terminate - // matching. - assert(!told_to_stop_matching(scratch)); - - // If this block is shorter than our minimum width, then no pattern in this - // RoseEngine could match. - /* minWidth checks should have already been performed by the caller */ - assert(scratch->core_info.len >= t->minWidth); - - // Similarly, we may have a maximum width (for engines constructed entirely - // of bi-anchored patterns). 
- /* This check is now handled by the interpreter */ - assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF - || scratch->core_info.len <= t->maxBiAnchoredWidth); - - roseBlockExec_i(t, scratch); -} +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch); /* assumes core_info in scratch has been init to point to data */ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch); From d5c1280b9f3ff46a238845d2faf3ee519c0b24e2 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 15 Jun 2016 09:47:24 +1000 Subject: [PATCH 056/166] eod: tidy up, rename to roseStreamEodExec --- src/rose/eod.c | 50 ++++++++++++++++++++----------------------------- src/rose/rose.h | 4 ++-- src/runtime.c | 2 +- 3 files changed, 23 insertions(+), 33 deletions(-) diff --git a/src/rose/eod.c b/src/rose/eod.c index 249e7a9c..dd471c97 100644 --- a/src/rose/eod.c +++ b/src/rose/eod.c @@ -55,19 +55,32 @@ void initContext(const struct RoseEngine *t, u64a offset, fatbit_clear(scratch->aqa); } -static really_inline -void roseEodExec_i(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch, UNUSED const char is_streaming) { - assert(t); - assert(scratch->core_info.buf || scratch->core_info.hbuf); - assert(!scratch->core_info.buf || !scratch->core_info.hbuf); - assert(!can_stop_matching(scratch)); +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(scratch); + assert(t->requiresEodCheck); + DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, + scratch->core_info.len, scratch->core_info.hbuf, + scratch->core_info.hlen); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + if (t->maxBiAnchoredWidth != ROSE_BOUND_INF + && offset > t->maxBiAnchoredWidth) { + DEBUG_PRINTF("bailing, we are beyond max width\n"); + /* also some of the history/state may be stale */ + return; + } if (!t->eodProgramOffset) { DEBUG_PRINTF("no eod program\n"); return; } + initContext(t, offset, scratch); + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); // There should be no pending delayed literals. @@ -85,26 +98,3 @@ void roseEodExec_i(const struct RoseEngine *t, u64a offset, roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, in_anchored, in_catchup, from_mpv, skip_mpv_catchup); } - -void roseEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - assert(scratch); - assert(t->requiresEodCheck); - DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, - scratch->core_info.len, scratch->core_info.hbuf, - scratch->core_info.hlen); - - // We should not have been called if we've already been told to terminate - // matching. 
- assert(!told_to_stop_matching(scratch)); - - if (t->maxBiAnchoredWidth != ROSE_BOUND_INF - && offset > t->maxBiAnchoredWidth) { - DEBUG_PRINTF("bailing, we are beyond max width\n"); - /* also some of the history/state may be stale */ - return; - } - - initContext(t, offset, scratch); - roseEodExec_i(t, offset, scratch, 1); -} diff --git a/src/rose/rose.h b/src/rose/rose.h index e3537774..ca8bf353 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -43,8 +43,8 @@ void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch); /* assumes core_info in scratch has been init to point to data */ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch); -void roseEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch); +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch); hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, void *context); diff --git a/src/runtime.c b/src/runtime.c index 95f21d84..7da41d29 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -532,7 +532,7 @@ void rawEodExec(hs_stream_t *id, hs_scratch_t *scratch) { return; } - roseEodExec(rose, id->offset, scratch); + roseStreamEodExec(rose, id->offset, scratch); } static never_inline From cf8e8c90ffade5d22e948b597e0e2fcf0f3866fa Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 15 Jun 2016 10:33:51 +1000 Subject: [PATCH 057/166] eod: move stream eod code to stream.c --- CMakeLists.txt | 1 - src/rose/eod.c | 100 ---------------------------------------------- src/rose/stream.c | 70 +++++++++++++++++++++++++++++++- 3 files changed, 69 insertions(+), 102 deletions(-) delete mode 100644 src/rose/eod.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 92caf4ce..d49de277 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -495,7 +495,6 @@ set (hs_exec_SRCS src/rose/block.c src/rose/catchup.h src/rose/catchup.c - src/rose/eod.c src/rose/infix.h src/rose/init.h src/rose/init.c diff --git a/src/rose/eod.c b/src/rose/eod.c deleted file mode 100644 index dd471c97..00000000 --- a/src/rose/eod.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "catchup.h" -#include "match.h" -#include "program_runtime.h" -#include "rose.h" -#include "util/fatbit.h" - -static really_inline -void initContext(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - struct RoseContext *tctxt = &scratch->tctxt; - /* TODO: diff groups for eod */ - tctxt->groups = loadGroups(t, scratch->core_info.state); - tctxt->lit_offset_adjust = scratch->core_info.buf_offset - - scratch->core_info.hlen - + 1; // index after last byte - tctxt->delayLastEndOffset = offset; - tctxt->lastEndOffset = offset; - tctxt->filledDelayedSlots = 0; - tctxt->lastMatchOffset = 0; - tctxt->minMatchOffset = offset; - tctxt->minNonMpvMatchOffset = offset; - tctxt->next_mpv_offset = offset; - - scratch->catchup_pq.qm_size = 0; - scratch->al_log_sum = 0; /* clear the anchored logs */ - - fatbit_clear(scratch->aqa); -} - -void roseStreamEodExec(const struct RoseEngine *t, u64a offset, - struct hs_scratch *scratch) { - assert(scratch); - assert(t->requiresEodCheck); - DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, - scratch->core_info.len, scratch->core_info.hbuf, - scratch->core_info.hlen); - - // We should not have been called if we've already been told to terminate - // matching. - assert(!told_to_stop_matching(scratch)); - - if (t->maxBiAnchoredWidth != ROSE_BOUND_INF - && offset > t->maxBiAnchoredWidth) { - DEBUG_PRINTF("bailing, we are beyond max width\n"); - /* also some of the history/state may be stale */ - return; - } - - if (!t->eodProgramOffset) { - DEBUG_PRINTF("no eod program\n"); - return; - } - - initContext(t, offset, scratch); - - DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); - - // There should be no pending delayed literals. - assert(!scratch->tctxt.filledDelayedSlots); - - const u64a som = 0; - const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; - - // Note: we ignore the result, as this is the last thing to ever happen on - // a scan. 
- roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); -} diff --git a/src/rose/stream.c b/src/rose/stream.c index b08fe04d..0e382f03 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -31,13 +31,14 @@ #include "infix.h" #include "match.h" #include "miracle.h" +#include "program_runtime.h" +#include "rose.h" #include "hwlm/hwlm.h" #include "nfa/mcclellan.h" #include "nfa/nfa_api.h" #include "nfa/nfa_api_queue.h" #include "nfa/nfa_internal.h" #include "util/fatbit.h" -#include "rose.h" static rose_inline void runAnchoredTableStream(const struct RoseEngine *t, const void *atable, @@ -558,3 +559,70 @@ exit: scratch->core_info.status); return; } + +static rose_inline +void roseStreamInitEod(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + struct RoseContext *tctxt = &scratch->tctxt; + /* TODO: diff groups for eod */ + tctxt->groups = loadGroups(t, scratch->core_info.state); + tctxt->lit_offset_adjust = scratch->core_info.buf_offset + - scratch->core_info.hlen + + 1; // index after last byte + tctxt->delayLastEndOffset = offset; + tctxt->lastEndOffset = offset; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; + tctxt->next_mpv_offset = offset; + + scratch->catchup_pq.qm_size = 0; + scratch->al_log_sum = 0; /* clear the anchored logs */ + + fatbit_clear(scratch->aqa); +} + +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(scratch); + assert(t->requiresEodCheck); + DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, + scratch->core_info.len, scratch->core_info.hbuf, + scratch->core_info.hlen); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + if (t->maxBiAnchoredWidth != ROSE_BOUND_INF + && offset > t->maxBiAnchoredWidth) { + DEBUG_PRINTF("bailing, we are beyond max width\n"); + /* also some of the history/state may be stale */ + return; + } + + if (!t->eodProgramOffset) { + DEBUG_PRINTF("no eod program\n"); + return; + } + + roseStreamInitEod(t, offset, scratch); + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. + assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const size_t match_len = 0; + const char in_anchored = 0; + const char in_catchup = 0; + const char from_mpv = 0; + const char skip_mpv_catchup = 1; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. 
+ roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, + in_anchored, in_catchup, from_mpv, skip_mpv_catchup); +} From d3c56b532b3b52dbf958f03cb56c50d6c524d6d9 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 14 Jun 2016 15:22:07 +1000 Subject: [PATCH 058/166] rose build: dedupe hasLastByteHistorySucc func --- src/rose/rose_build_bytecode.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 60df01b7..8b397c52 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -1776,22 +1776,12 @@ u32 addIteratorToTable(build_context &bc, return offset; } -static -bool hasLastByteHistoryOutEdge(const RoseGraph &g, RoseVertex v) { - for (const auto &e : out_edges_range(v, g)) { - if (g[e].history == ROSE_ROLE_HISTORY_LAST_BYTE) { - return true; - } - } - return false; -} - static u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { vector lb_roles; for (auto v : vertices_range(g)) { - if (!hasLastByteHistoryOutEdge(g, v)) { + if (!hasLastByteHistorySucc(g, v)) { continue; } // Eager EOD reporters won't have state indices. From cdaf705a875f254709f730e37060727a3b1d93f6 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 16 Jun 2016 16:08:48 +1000 Subject: [PATCH 059/166] rose: pick up more prefix->lookaround conversions --- src/rose/rose_build_bytecode.cpp | 6 ++- src/rose/rose_build_lookaround.cpp | 62 ++++++++++++++++++++++++------ 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 8b397c52..6d485b0c 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -1160,7 +1160,11 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, u32 lag = g[v].left.lag; bool is_transient = contains(tbi.transient, leftfix); - if (is_transient && tbi.cc.grey.roseLookaroundMasks) { + // Transient leftfixes can sometimes be implemented solely with + // lookarounds, in which case we don't need to build an engine. + // TODO: Handle SOM-tracking cases as well. + if (cc.grey.roseLookaroundMasks && is_transient && + !g[v].left.tracksSom()) { vector lookaround; if (makeLeftfixLookaround(tbi, v, lookaround)) { DEBUG_PRINTF("implementing as lookaround!\n"); diff --git a/src/rose/rose_build_lookaround.cpp b/src/rose/rose_build_lookaround.cpp index 54c01e08..a06bacef 100644 --- a/src/rose/rose_build_lookaround.cpp +++ b/src/rose/rose_build_lookaround.cpp @@ -538,6 +538,36 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v, } } +static +bool hasSingleFloatingStart(const NGHolder &g) { + NFAVertex initial = NGHolder::null_vertex(); + for (auto v : adjacent_vertices_range(g.startDs, g)) { + if (v == g.startDs) { + continue; + } + if (initial != NGHolder::null_vertex()) { + DEBUG_PRINTF("more than one start\n"); + return false; + } + initial = v; + } + + if (initial == NGHolder::null_vertex()) { + DEBUG_PRINTF("no floating starts\n"); + return false; + } + + // Anchored start must have no successors other than startDs and initial. 
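+    // (Handling an anchored prefix here would require representing the
+    // bounds from the anchor as well, which lookaround cannot express.)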
+ for (auto v : adjacent_vertices_range(g.start, g)) { + if (v != initial && v != g.startDs) { + DEBUG_PRINTF("anchored start\n"); + return false; + } + } + + return true; +} + static bool getTransientPrefixReach(const NGHolder &g, u32 lag, map &look) { @@ -546,15 +576,9 @@ bool getTransientPrefixReach(const NGHolder &g, u32 lag, return false; } - // Currently we don't handle anchored prefixes, as we would need to be able - // to represent the bounds from the anchor as well. - if (out_degree(g.start, g) != 1) { - DEBUG_PRINTF("anchored\n"); - return false; - } - - if (out_degree(g.startDs, g) != 2) { - DEBUG_PRINTF("more than one start\n"); + // Must be a floating chain wired to startDs. + if (!hasSingleFloatingStart(g)) { + DEBUG_PRINTF("not a single floating start\n"); return false; } @@ -569,12 +593,28 @@ bool getTransientPrefixReach(const NGHolder &g, u32 lag, look[0 - i] = g[v].char_reach; - if (in_degree(v, g) != 1) { + NFAVertex next = NGHolder::null_vertex(); + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == g.start) { + continue; // Benign, checked by hasSingleFloatingStart + } + if (next == NGHolder::null_vertex()) { + next = u; + continue; + } DEBUG_PRINTF("branch\n"); return false; } - v = *(inv_adjacent_vertices(v, g).first); + if (next == NGHolder::null_vertex() || next == v) { + DEBUG_PRINTF("no predecessor or only self-loop\n"); + // This graph is malformed -- all vertices in a graph that makes it + // to this analysis should have predecessors. + assert(0); + return false; + } + + v = next; i++; } From 9d2403e8bb65fb65f7c21678d1663b114fe3b333 Mon Sep 17 00:00:00 2001 From: Kirill Rybalchenko Date: Thu, 21 Apr 2016 16:52:43 +0100 Subject: [PATCH 060/166] limex: implement variable shift NFA engines Replaces the old LimEx NFA engines, which were specialised for model size and number of shifts, with a new set of engines that can handle a variable number of shifts. 
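
As a rough sketch of the new scheme (illustrative only: limex_succ32 is
not a function in this patch, and the real runtime unrolls the
equivalent loop via the NFA_EXEC_GET_LIM_SUCC macro, using the uniform
SIMD ops for the wider models):

    #include "limex_internal.h" /* struct LimExNFA32, as extended below */

    /* Limited-model successors under variable shifts: each mask selects
     * the states whose successor lies exactly shiftAmount[i] bits
     * higher. States whose transitions no mask can cover are handled
     * via the exception path instead. */
    static inline u32 limex_succ32(const struct LimExNFA32 *limex, u32 s) {
        u32 succ = 0;
        for (u32 i = 0; i < limex->shiftCount; i++) {
            succ |= (s & limex->shift[i]) << limex->shiftAmount[i];
        }
        return succ;
    }

The number of masks is chosen at compile time by findBestNumOfVarShifts,
which scores each candidate count against the exceptions it would create.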
--- CMakeLists.txt | 4 +- src/grey.cpp | 2 - src/grey.h | 1 - src/nfa/limex.h | 40 +-- src/nfa/limex_compile.cpp | 250 ++++++++++-------- src/nfa/limex_dump.cpp | 81 +++--- src/nfa/limex_internal.h | 12 +- src/nfa/limex_native.c | 33 +-- src/nfa/limex_runtime.h | 29 +- src/nfa/limex_runtime_impl.h | 13 +- src/nfa/limex_simd128.c | 37 +-- src/nfa/limex_simd256.c | 37 +-- src/nfa/limex_simd384.c | 37 +-- src/nfa/{limex_simd512b.c => limex_simd512.c} | 12 +- src/nfa/limex_simd512a.c | 74 ------ src/nfa/limex_simd512c.c | 69 ----- src/nfa/nfa_api_dispatch.c | 40 +-- src/nfa/nfa_build_util.cpp | 58 +--- src/nfa/nfa_dump_dispatch.cpp | 42 +-- src/nfa/nfa_internal.h | 80 +----- unit/internal/limex_nfa.cpp | 6 +- 21 files changed, 264 insertions(+), 693 deletions(-) rename src/nfa/{limex_simd512b.c => limex_simd512.c} (88%) delete mode 100644 src/nfa/limex_simd512a.c delete mode 100644 src/nfa/limex_simd512c.c diff --git a/CMakeLists.txt b/CMakeLists.txt index d49de277..2652cea3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -438,9 +438,7 @@ set (hs_exec_SRCS src/nfa/limex_simd128.c src/nfa/limex_simd256.c src/nfa/limex_simd384.c - src/nfa/limex_simd512a.c - src/nfa/limex_simd512b.c - src/nfa/limex_simd512c.c + src/nfa/limex_simd512.c src/nfa/limex.h src/nfa/limex_common_impl.h src/nfa/limex_context.h diff --git a/src/grey.cpp b/src/grey.cpp index e2022e74..1f2fd904 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -81,7 +81,6 @@ Grey::Grey(void) : allowZombies(true), floodAsPuffette(false), nfaForceSize(0), - nfaForceShifts(0), maxHistoryAvailable(DEFAULT_MAX_HISTORY), minHistoryAvailable(0), /* debugging only */ maxAnchoredRegion(63), /* for rose's atable to run over */ @@ -234,7 +233,6 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowZombies); G_UPDATE(floodAsPuffette); G_UPDATE(nfaForceSize); - G_UPDATE(nfaForceShifts); G_UPDATE(highlanderSquash); G_UPDATE(maxHistoryAvailable); G_UPDATE(minHistoryAvailable); diff --git a/src/grey.h b/src/grey.h index 8ac9e6b1..634fa3a7 100644 --- a/src/grey.h +++ b/src/grey.h @@ -88,7 +88,6 @@ struct Grey { bool floodAsPuffette; u32 nfaForceSize; - u32 nfaForceShifts; u32 maxHistoryAvailable; u32 minHistoryAvailable; diff --git a/src/nfa/limex.h b/src/nfa/limex.h index 2c429a67..57ee46df 100644 --- a/src/nfa/limex.h +++ b/src/nfa/limex.h @@ -74,41 +74,11 @@ extern "C" struct mq *q, s64a loc); \ GENERATE_NFA_DUMP_DECL(gf_name) -GENERATE_NFA_DECL(nfaExecLimEx32_1) -GENERATE_NFA_DECL(nfaExecLimEx32_2) -GENERATE_NFA_DECL(nfaExecLimEx32_3) -GENERATE_NFA_DECL(nfaExecLimEx32_4) -GENERATE_NFA_DECL(nfaExecLimEx32_5) -GENERATE_NFA_DECL(nfaExecLimEx32_6) -GENERATE_NFA_DECL(nfaExecLimEx32_7) -GENERATE_NFA_DECL(nfaExecLimEx128_1) -GENERATE_NFA_DECL(nfaExecLimEx128_2) -GENERATE_NFA_DECL(nfaExecLimEx128_3) -GENERATE_NFA_DECL(nfaExecLimEx128_4) -GENERATE_NFA_DECL(nfaExecLimEx128_5) -GENERATE_NFA_DECL(nfaExecLimEx128_6) -GENERATE_NFA_DECL(nfaExecLimEx128_7) -GENERATE_NFA_DECL(nfaExecLimEx256_1) -GENERATE_NFA_DECL(nfaExecLimEx256_2) -GENERATE_NFA_DECL(nfaExecLimEx256_3) -GENERATE_NFA_DECL(nfaExecLimEx256_4) -GENERATE_NFA_DECL(nfaExecLimEx256_5) -GENERATE_NFA_DECL(nfaExecLimEx256_6) -GENERATE_NFA_DECL(nfaExecLimEx256_7) -GENERATE_NFA_DECL(nfaExecLimEx384_1) -GENERATE_NFA_DECL(nfaExecLimEx384_2) -GENERATE_NFA_DECL(nfaExecLimEx384_3) -GENERATE_NFA_DECL(nfaExecLimEx384_4) -GENERATE_NFA_DECL(nfaExecLimEx384_5) -GENERATE_NFA_DECL(nfaExecLimEx384_6) -GENERATE_NFA_DECL(nfaExecLimEx384_7) -GENERATE_NFA_DECL(nfaExecLimEx512_1) -GENERATE_NFA_DECL(nfaExecLimEx512_2) 
-GENERATE_NFA_DECL(nfaExecLimEx512_3) -GENERATE_NFA_DECL(nfaExecLimEx512_4) -GENERATE_NFA_DECL(nfaExecLimEx512_5) -GENERATE_NFA_DECL(nfaExecLimEx512_6) -GENERATE_NFA_DECL(nfaExecLimEx512_7) +GENERATE_NFA_DECL(nfaExecLimEx32) +GENERATE_NFA_DECL(nfaExecLimEx128) +GENERATE_NFA_DECL(nfaExecLimEx256) +GENERATE_NFA_DECL(nfaExecLimEx384) +GENERATE_NFA_DECL(nfaExecLimEx512) #undef GENERATE_NFA_DECL #undef GENERATE_NFA_DUMP_DECL diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index 5d51feb9..563d6c9c 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -169,10 +169,10 @@ struct build_info { // Constants for scoring mechanism -#define LAST_LIMEX_NFA LIMEX_NFA_512_7 +#define LAST_LIMEX_NFA LIMEX_NFA_512 const int LIMEX_INITIAL_SCORE = 2000; -const int SHIFT_COST = 20; // limex: cost per shift mask +const int SHIFT_COST = 10; // limex: cost per shift mask const int EXCEPTION_COST = 4; // limex: per exception template struct NFATraits { }; @@ -261,6 +261,17 @@ void maskSetBits(Mask &m, const NFAStateSet &bits) { } } +template +bool isMaskZero(Mask &m) { + u8 *m8 = (u8 *)&m; + for (u32 i = 0; i < sizeof(m); i++) { + if (m8[i]) { + return false; + } + } + return true; +} + // Sets an entire byte in a mask to the given value template void maskSetByte(Mask &m, const unsigned int idx, const char val) { @@ -1315,6 +1326,95 @@ u32 depth_to_u32(const depth &d) { return d_val; } +static +bool isExceptionalTransition(const NGHolder &h, const NFAEdge &e, + const build_info &args, u32 maxShift) { + NFAVertex from = source(e, h); + NFAVertex to = target(e, h); + u32 f = args.state_ids.at(from); + u32 t = args.state_ids.at(to); + if (!isLimitedTransition(f, t, maxShift)) { + return true; + } + + // All transitions out of a tug trigger are exceptional. 
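+    // (Tugs belong to bounded repeats; their successors need the
+    // exception path, not the shift masks.)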
+ if (contains(args.tugs, from)) { + return true; + } + return false; +} + +static +u32 findMaxVarShift(const build_info &args, u32 nShifts) { + const NGHolder &h = args.h; + u32 shiftMask = 0; + for (const auto &e : edges_range(h)) { + u32 from = args.state_ids.at(source(e, h)); + u32 to = args.state_ids.at(target(e, h)); + if (from == NO_STATE || to == NO_STATE) { + continue; + } + if (!isExceptionalTransition(h, e, args, MAX_SHIFT_AMOUNT)) { + shiftMask |= (1UL << (to - from)); + } + } + + u32 maxVarShift = 0; + for (u32 shiftCnt = 0; shiftMask != 0 && shiftCnt < nShifts; shiftCnt++) { + maxVarShift = findAndClearLSB_32(&shiftMask); + } + + return maxVarShift; +} + +static +int getLimexScore(const build_info &args, u32 nShifts) { + const NGHolder &h = args.h; + u32 maxVarShift = nShifts; + int score = LIMEX_INITIAL_SCORE; + + score -= SHIFT_COST * nShifts; + maxVarShift = findMaxVarShift(args, nShifts); + + NFAStateSet exceptionalStates(args.num_states); + for (const auto &e : edges_range(h)) { + u32 from = args.state_ids.at(source(e, h)); + u32 to = args.state_ids.at(target(e, h)); + if (from == NO_STATE || to == NO_STATE) { + continue; + } + if (isExceptionalTransition(h, e, args, maxVarShift)) { + exceptionalStates.set(from); + } + } + score -= EXCEPTION_COST * exceptionalStates.count(); + if (score < 0) { + score = 0; + } + return score; +} + +// This function finds the best shift scheme with highest score +// Returns number of shifts and score calculated for appropriate scheme +// Returns zero if no appropriate scheme was found +static +u32 findBestNumOfVarShifts(const build_info &args, + int *bestScoreRet = nullptr) { + u32 bestNumOfVarShifts = 0; + int bestScore = 0; + for (u32 shiftCount = 1; shiftCount <= MAX_SHIFT_COUNT; shiftCount++) { + int score = getLimexScore(args, shiftCount); + if (score > bestScore) { + bestScore = score; + bestNumOfVarShifts = shiftCount; + } + } + if (bestScoreRet != nullptr) { + *bestScoreRet = bestScore; + } + return bestNumOfVarShifts; +} + template struct Factory { // typedefs for readability, for types derived from traits @@ -1322,25 +1422,6 @@ struct Factory { typedef typename NFATraits::implNFA_t implNFA_t; typedef typename NFATraits::tableRow_t tableRow_t; - static - bool isExceptionalTransition(const NGHolder &h, const NFAEdge &e, - const ue2::unordered_map &state_ids, - const ue2::unordered_set &tugs) { - NFAVertex from = source(e, h); - NFAVertex to = target(e, h); - u32 f = state_ids.at(from); - u32 t = state_ids.at(to); - if (!isLimitedTransition(f, t, NFATraits::maxShift)) { - return true; - } - - // All transitions out of a tug trigger are exceptional. - if (contains(tugs, from)) { - return true; - } - return false; - } - static void allocState(NFA *nfa, u32 repeatscratchStateSize, u32 repeatStreamState) { @@ -1504,6 +1585,9 @@ struct Factory { static void writeShiftMasks(const build_info &args, implNFA_t *limex) { const NGHolder &h = args.h; + u32 maxShift = findMaxVarShift(args, limex->shiftCount); + u32 shiftMask = 0; + int shiftMaskIdx = 0; for (const auto &e : edges_range(h)) { u32 from = args.state_ids.at(source(e, h)); @@ -1515,15 +1599,32 @@ struct Factory { // We check for exceptional transitions here, as we don't want tug // trigger transitions emitted as limited transitions (even if they // could be in this model). 
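+        // Each distinct shift distance (to - from) is given its own
+        // mask/shiftAmount slot on first use; the source state's bit is
+        // then set in the matching mask.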
- if (!isExceptionalTransition(h, e, args.state_ids, args.tugs)) { - maskSetBit(limex->shift[to - from], from); + if (!isExceptionalTransition(h, e, args, maxShift)) { + u32 shift = to - from; + if ((shiftMask & (1UL << shift)) == 0UL) { + shiftMask |= (1UL << shift); + limex->shiftAmount[shiftMaskIdx++] = (u8)shift; + } + assert(limex->shiftCount <= MAX_SHIFT_COUNT); + for (u32 i = 0; i < limex->shiftCount; i++) { + if (limex->shiftAmount[i] == (u8)shift) { + maskSetBit(limex->shift[i], from); + break; + } + } + } + } + if (maxShift && limex->shiftCount > 1) { + for (u32 i = 0; i < limex->shiftCount; i++) { + assert(!isMaskZero(limex->shift[i])); } } } static void findExceptionalTransitions(const build_info &args, - ue2::unordered_set &exceptional) { + ue2::unordered_set &exceptional, + u32 maxShift) { const NGHolder &h = args.h; for (const auto &e : edges_range(h)) { @@ -1533,7 +1634,7 @@ struct Factory { continue; } - if (isExceptionalTransition(h, e, args.state_ids, args.tugs)) { + if (isExceptionalTransition(h, e, args, maxShift)) { exceptional.insert(e); } } @@ -1778,7 +1879,10 @@ struct Factory { } ue2::unordered_set exceptional; - findExceptionalTransitions(args, exceptional); + u32 shiftCount = findBestNumOfVarShifts(args); + assert(shiftCount); + u32 maxShift = findMaxVarShift(args, shiftCount); + findExceptionalTransitions(args, exceptional, maxShift); map > exceptionMap; vector exceptionReports; @@ -1874,6 +1978,7 @@ struct Factory { writeAccepts(acceptMask, acceptEodMask, accepts, acceptsEod, squash, limex, acceptsOffset, acceptsEodOffset, squashOffset); + limex->shiftCount = shiftCount; writeShiftMasks(args, limex); // Determine the state required for our state vector. @@ -1907,8 +2012,6 @@ struct Factory { } static int score(const build_info &args) { - const NGHolder &h = args.h; - // LimEx NFAs are available in sizes from 32 to 512-bit. size_t num_states = args.num_states; @@ -1928,45 +2031,17 @@ struct Factory { sz = args.cc.grey.nfaForceSize; } - if (args.cc.grey.nfaForceShifts && - NFATraits::maxShift != args.cc.grey.nfaForceShifts) { - return -1; - } - if (sz != NFATraits::maxStates) { return -1; // fail, size not appropriate } // We are of the right size, calculate a score based on the number // of exceptions and the number of shifts used by this LimEx. 
- int score = LIMEX_INITIAL_SCORE; - if (NFATraits::maxShift != 0) { - score -= SHIFT_COST / 2; // first shift mask is cheap - score -= SHIFT_COST * (NFATraits::maxShift - 1); + int score; + u32 shiftCount = findBestNumOfVarShifts(args, &score); + if (shiftCount == 0) { + return -1; } - - NFAStateSet exceptionalStates(num_states); // outbound exc trans - - for (const auto &e : edges_range(h)) { - u32 from = args.state_ids.at(source(e, h)); - u32 to = args.state_ids.at(target(e, h)); - if (from == NO_STATE || to == NO_STATE) { - continue; - } - - if (isExceptionalTransition(h, e, args.state_ids, args.tugs)) { - exceptionalStates.set(from); - } - } - DEBUG_PRINTF("%zu exceptional states\n", exceptionalStates.count()); - score -= EXCEPTION_COST * exceptionalStates.count(); - - /* ensure that we always report a valid score if have the right number - * of states */ - if (score < 0) { - score = 0; - } - return score; } }; @@ -1985,50 +2060,19 @@ struct scoreNfa { } }; -#define MAKE_LIMEX_TRAITS(mlt_size, mlt_shift) \ - template<> struct NFATraits { \ - typedef LimExNFA##mlt_size implNFA_t; \ - typedef u_##mlt_size tableRow_t; \ - typedef NFAException##mlt_size exception_t; \ - static const size_t maxStates = mlt_size; \ - static const u32 maxShift = mlt_shift; \ - }; \ +#define MAKE_LIMEX_TRAITS(mlt_size) \ + template<> struct NFATraits { \ + typedef LimExNFA##mlt_size implNFA_t; \ + typedef u_##mlt_size tableRow_t; \ + typedef NFAException##mlt_size exception_t; \ + static const size_t maxStates = mlt_size; \ + }; -MAKE_LIMEX_TRAITS(32, 1) -MAKE_LIMEX_TRAITS(32, 2) -MAKE_LIMEX_TRAITS(32, 3) -MAKE_LIMEX_TRAITS(32, 4) -MAKE_LIMEX_TRAITS(32, 5) -MAKE_LIMEX_TRAITS(32, 6) -MAKE_LIMEX_TRAITS(32, 7) -MAKE_LIMEX_TRAITS(128, 1) -MAKE_LIMEX_TRAITS(128, 2) -MAKE_LIMEX_TRAITS(128, 3) -MAKE_LIMEX_TRAITS(128, 4) -MAKE_LIMEX_TRAITS(128, 5) -MAKE_LIMEX_TRAITS(128, 6) -MAKE_LIMEX_TRAITS(128, 7) -MAKE_LIMEX_TRAITS(256, 1) -MAKE_LIMEX_TRAITS(256, 2) -MAKE_LIMEX_TRAITS(256, 3) -MAKE_LIMEX_TRAITS(256, 4) -MAKE_LIMEX_TRAITS(256, 5) -MAKE_LIMEX_TRAITS(256, 6) -MAKE_LIMEX_TRAITS(256, 7) -MAKE_LIMEX_TRAITS(384, 1) -MAKE_LIMEX_TRAITS(384, 2) -MAKE_LIMEX_TRAITS(384, 3) -MAKE_LIMEX_TRAITS(384, 4) -MAKE_LIMEX_TRAITS(384, 5) -MAKE_LIMEX_TRAITS(384, 6) -MAKE_LIMEX_TRAITS(384, 7) -MAKE_LIMEX_TRAITS(512, 1) -MAKE_LIMEX_TRAITS(512, 2) -MAKE_LIMEX_TRAITS(512, 3) -MAKE_LIMEX_TRAITS(512, 4) -MAKE_LIMEX_TRAITS(512, 5) -MAKE_LIMEX_TRAITS(512, 6) -MAKE_LIMEX_TRAITS(512, 7) +MAKE_LIMEX_TRAITS(32) +MAKE_LIMEX_TRAITS(128) +MAKE_LIMEX_TRAITS(256) +MAKE_LIMEX_TRAITS(384) +MAKE_LIMEX_TRAITS(512) } // namespace diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index 084f35dd..8e1ee219 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -244,6 +244,16 @@ void dumpLimexExceptions(const limex_type *limex, FILE *f) { } } +template +static +void dumpLimexShifts(const limex_type *limex, FILE *f) { + u32 size = limex_traits::size; + fprintf(f, "Shift Masks:\n"); + for(u32 i = 0; i < limex->shiftCount; i++) { + fprintf(f, "\t Shift %u(%hhu)\t\tMask: %s\n", i, limex->shiftAmount[i], + dumpMask((const u8 *)&limex->shift[i], size).c_str()); + } +} template static void dumpLimexText(const limex_type *limex, FILE *f) { @@ -270,6 +280,9 @@ void dumpLimexText(const limex_type 
*limex, FILE *f) { topMask += size / 8; } + // Dump shift masks + dumpLimexShifts(limex, f); + dumpSquash(limex, f); dumpLimexReachMap(limex->reachMap, f); @@ -420,78 +433,44 @@ void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) { template static void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) { - for (u32 j = 0; j < MAX_MAX_SHIFT; j++) { + for (u32 j = 0; j < limex->shiftCount; j++) { + const u32 shift_amount = limex->shiftAmount[j]; if (testbit((const u8 *)&limex->shift[j], limex_traits::size, state)) { - fprintf(f, "%u -> %u;\n", state, state + j); + fprintf(f, "%u -> %u;\n", state, state + shift_amount); } } } -#define DUMP_TEXT_FN(ddf_u, ddf_n, ddf_s) \ - void nfaExecLimEx##ddf_n##_##ddf_s##_dumpText(const NFA *nfa, FILE *f) { \ +#define DUMP_TEXT_FN(ddf_n) \ + void nfaExecLimEx##ddf_n##_dumpText(const NFA *nfa, FILE *f) { \ dumpLimexText((const LimExNFA##ddf_n *)getImplNfa(nfa), f); \ } -#define DUMP_DOT_FN(ddf_u, ddf_n, ddf_s) \ - void nfaExecLimEx##ddf_n##_##ddf_s##_dumpDot(const NFA *nfa, FILE *f) { \ +#define DUMP_DOT_FN(ddf_n) \ + void nfaExecLimEx##ddf_n##_dumpDot(const NFA *nfa, FILE *f) { \ const LimExNFA##ddf_n *limex = \ (const LimExNFA##ddf_n *)getImplNfa(nfa); \ \ dumpDotPreamble(f); \ - u32 state_count = nfa->nPositions; \ + u32 state_count = nfa->nPositions; \ dumpVertexDotInfo(limex, state_count, f, \ limex_labeller(limex)); \ for (u32 i = 0; i < state_count; i++) { \ dumpLimDotInfo(limex, i, f); \ dumpExDotInfo(limex, i, f); \ } \ - \ dumpDotTrailer(f); \ } -#define LIMEX_DUMP_FNS(ntype, size, shifts) \ - DUMP_TEXT_FN(ntype, size, shifts) \ - DUMP_DOT_FN(ntype, size, shifts) +#define LIMEX_DUMP_FNS(size) \ + DUMP_TEXT_FN(size) \ + DUMP_DOT_FN(size) -LIMEX_DUMP_FNS(u32, 32, 1) -LIMEX_DUMP_FNS(u32, 32, 2) -LIMEX_DUMP_FNS(u32, 32, 3) -LIMEX_DUMP_FNS(u32, 32, 4) -LIMEX_DUMP_FNS(u32, 32, 5) -LIMEX_DUMP_FNS(u32, 32, 6) -LIMEX_DUMP_FNS(u32, 32, 7) - -LIMEX_DUMP_FNS(m128, 128, 1) -LIMEX_DUMP_FNS(m128, 128, 2) -LIMEX_DUMP_FNS(m128, 128, 3) -LIMEX_DUMP_FNS(m128, 128, 4) -LIMEX_DUMP_FNS(m128, 128, 5) -LIMEX_DUMP_FNS(m128, 128, 6) -LIMEX_DUMP_FNS(m128, 128, 7) - -LIMEX_DUMP_FNS(m256, 256, 1) -LIMEX_DUMP_FNS(m256, 256, 2) -LIMEX_DUMP_FNS(m256, 256, 3) -LIMEX_DUMP_FNS(m256, 256, 4) -LIMEX_DUMP_FNS(m256, 256, 5) -LIMEX_DUMP_FNS(m256, 256, 6) -LIMEX_DUMP_FNS(m256, 256, 7) - -LIMEX_DUMP_FNS(m384, 384, 1) -LIMEX_DUMP_FNS(m384, 384, 2) -LIMEX_DUMP_FNS(m384, 384, 3) -LIMEX_DUMP_FNS(m384, 384, 4) -LIMEX_DUMP_FNS(m384, 384, 5) -LIMEX_DUMP_FNS(m384, 384, 6) -LIMEX_DUMP_FNS(m384, 384, 7) - -LIMEX_DUMP_FNS(m512, 512, 1) -LIMEX_DUMP_FNS(m512, 512, 2) -LIMEX_DUMP_FNS(m512, 512, 3) -LIMEX_DUMP_FNS(m512, 512, 4) -LIMEX_DUMP_FNS(m512, 512, 5) -LIMEX_DUMP_FNS(m512, 512, 6) -LIMEX_DUMP_FNS(m512, 512, 7) +LIMEX_DUMP_FNS(32) +LIMEX_DUMP_FNS(128) +LIMEX_DUMP_FNS(256) +LIMEX_DUMP_FNS(384) +LIMEX_DUMP_FNS(512) } // namespace ue2 diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index adae6ab7..1483a911 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -68,6 +68,9 @@ The value of NFA.stateSize gives the total state size in bytes (the sum of all the above). + Number of shifts should be always greater or equal to 1 + Number of shifts 0 means that no appropriate NFA engine was found. 
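+
+  Each shift mask limex->shift[i] covers the states whose (limited model)
+  successor lies exactly shiftAmount[i] bits higher; the successor set is
+  the union over all shiftCount masks.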
+ */ #ifndef LIMEX_INTERNAL_H @@ -77,7 +80,8 @@ #include "repeat_internal.h" // Constants -#define MAX_MAX_SHIFT 8 /**< largest maxshift used by a LimEx NFA */ +#define MAX_SHIFT_COUNT 8 /**< largest number of shifts used by a LimEx NFA */ +#define MAX_SHIFT_AMOUNT 16 /**< largest shift amount used by a LimEx NFA */ #define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */ #define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */ @@ -168,8 +172,10 @@ struct LimExNFA##size { /* MUST align with LimExNFABase */ \ u_##size compressMask; /**< switch off before compress */ \ u_##size exceptionMask; \ u_##size repeatCyclicMask; \ - u_##size shift[MAX_MAX_SHIFT]; \ u_##size zombieMask; /**< zombie if in any of the set states */ \ + u_##size shift[MAX_SHIFT_COUNT]; \ + u32 shiftCount; /**< number of shift masks used */ \ + u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \ }; CREATE_NFA_LIMEX(32) diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index 471e4bf0..e156cb81 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -132,35 +132,4 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, #define SIZE 32 #define STATE_T u32 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 32 -#define STATE_T u32 -#define SHIFT 7 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_runtime.h b/src/nfa/limex_runtime.h index 4e111aa6..778d376d 100644 --- a/src/nfa/limex_runtime.h +++ b/src/nfa/limex_runtime.h @@ -73,34 +73,35 @@ struct proto_cache { }; // Shift macros for Limited NFAs. Defined in terms of uniform ops. +// LimExNFAxxx ptr in 'limex' and the current state in 's' #define NFA_EXEC_LIM_SHIFT(nels_type, nels_i) \ (JOIN(shift_, nels_type)( \ JOIN(and_, nels_type)(s, \ JOIN(load_, nels_type)(&limex->shift[nels_i])), \ - nels_i)) + limex->shiftAmount[nels_i])) -// Calculate the (limited model) successors for a given max shift. Assumes -// LimExNFAxxx ptr in 'l', current state in 's' and successors in 'succ'. +// Calculate the (limited model) successors for a number of variable shifts. +// Assumes current state in 's' and successors in 'succ'. 
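+// The switch below deliberately falls through: masks shiftCount-1 down to 1
+// are OR'd into succ, and mask 0 is applied unconditionally.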
-#define NFA_EXEC_GET_LIM_SUCC(gls_type, gls_shift) \ +#define NFA_EXEC_GET_LIM_SUCC(gls_type) \ do { \ - succ = \ - JOIN(and_, gls_type)(s, JOIN(load_, gls_type)(&limex->shift[0])); \ - switch (gls_shift) { \ - case 7: \ + succ = NFA_EXEC_LIM_SHIFT(gls_type, 0); \ + switch (limex->shiftCount) { \ + case 8: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 7)); \ - case 6: \ + case 7: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 6)); \ - case 5: \ + case 6: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 5)); \ - case 4: \ + case 5: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 4)); \ - case 3: \ + case 4: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 3)); \ - case 2: \ + case 3: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 2)); \ - case 1: \ + case 2: \ succ = JOIN(or_, gls_type)(succ, NFA_EXEC_LIM_SHIFT(gls_type, 1)); \ + case 1: \ case 0: \ ; \ } \ diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 676ed370..9924ef8c 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -37,11 +37,11 @@ * Version 2.0: now with X-Macros, so you get line numbers in your debugger. */ -#if !defined(SIZE) || !defined(STATE_T) || !defined(SHIFT) -# error Must define SIZE and STATE_T and SHIFT in includer. +#if !defined(SIZE) || !defined(STATE_T) +# error Must define SIZE and STATE_T in includer. #endif -#define LIMEX_API_ROOT JOIN(JOIN(JOIN(nfaExecLimEx, SIZE), _), SHIFT) +#define LIMEX_API_ROOT JOIN(nfaExecLimEx, SIZE) #define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) @@ -201,7 +201,7 @@ without_accel: u8 c = input[i]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T, SHIFT); + NFA_EXEC_GET_LIM_SUCC(STATE_T); if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, @@ -252,7 +252,7 @@ with_accel: u8 c = input[i]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T, SHIFT); + NFA_EXEC_GET_LIM_SUCC(STATE_T); if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, @@ -318,7 +318,7 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, u8 c = input[i-1]; STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(STATE_T, SHIFT); + NFA_EXEC_GET_LIM_SUCC(STATE_T); if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, @@ -935,5 +935,4 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( // Parameters. 
#undef SIZE #undef STATE_T -#undef SHIFT #undef LIMEX_API_ROOT diff --git a/src/nfa/limex_simd128.c b/src/nfa/limex_simd128.c index 781c7972..f0fb1dd4 100644 --- a/src/nfa/limex_simd128.c +++ b/src/nfa/limex_simd128.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -61,37 +61,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 128 -#define STATE_T m128 -#define SHIFT 7 +#define SIZE 128 +#define STATE_T m128 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd256.c b/src/nfa/limex_simd256.c index b4df1459..57648b69 100644 --- a/src/nfa/limex_simd256.c +++ b/src/nfa/limex_simd256.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,37 +58,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 256 -#define STATE_T m256 -#define SHIFT 7 +#define SIZE 256 +#define STATE_T m256 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd384.c b/src/nfa/limex_simd384.c index 4b4b44bb..84061f61 100644 --- a/src/nfa/limex_simd384.c +++ b/src/nfa/limex_simd384.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,37 +58,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 3 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 5 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 384 -#define STATE_T m384 -#define SHIFT 7 +#define SIZE 384 +#define 
STATE_T m384 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd512b.c b/src/nfa/limex_simd512.c similarity index 88% rename from src/nfa/limex_simd512b.c rename to src/nfa/limex_simd512.c index a3b705df..a6646d83 100644 --- a/src/nfa/limex_simd512b.c +++ b/src/nfa/limex_simd512.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,12 +58,6 @@ #define INLINE_ATTR really_inline #include "limex_common_impl.h" -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 4 -#include "limex_runtime_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 5 +#define SIZE 512 +#define STATE_T m512 #include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd512a.c b/src/nfa/limex_simd512a.c deleted file mode 100644 index 1c4a0fb9..00000000 --- a/src/nfa/limex_simd512a.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief LimEx NFA: 512-bit SIMD runtime implementations. 
- */ - -//#define DEBUG_INPUT -//#define DEBUG_EXCEPTIONS - -#include "limex.h" - -#include "accel.h" -#include "limex_internal.h" -#include "nfa_internal.h" -#include "ue2common.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" - -// Common code -#include "limex_runtime.h" - -#define SIZE 512 -#define STATE_T m512 -#include "limex_exceptional.h" - -#define SIZE 512 -#define STATE_T m512 -#include "limex_state_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define INLINE_ATTR really_inline -#include "limex_common_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 2 -#include "limex_runtime_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 1 -#include "limex_runtime_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 3 -#include "limex_runtime_impl.h" diff --git a/src/nfa/limex_simd512c.c b/src/nfa/limex_simd512c.c deleted file mode 100644 index 0918fca5..00000000 --- a/src/nfa/limex_simd512c.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief LimEx NFA: 512-bit SIMD runtime implementations. 
- */ - -//#define DEBUG_INPUT -//#define DEBUG_EXCEPTIONS - -#include "limex.h" - -#include "accel.h" -#include "limex_internal.h" -#include "nfa_internal.h" -#include "ue2common.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" - -// Common code -#include "limex_runtime.h" - -#define SIZE 512 -#define STATE_T m512 -#include "limex_exceptional.h" - -#define SIZE 512 -#define STATE_T m512 -#include "limex_state_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define INLINE_ATTR really_inline -#include "limex_common_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 6 -#include "limex_runtime_impl.h" - -#define SIZE 512 -#define STATE_T m512 -#define SHIFT 7 -#include "limex_runtime_impl.h" diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index fb27e4eb..95b1898e 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -52,41 +52,11 @@ #define DISPATCH_BY_NFA_TYPE(dbnt_func) \ switch (nfa->type) { \ - DISPATCH_CASE(LIMEX, LimEx, 32_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_7, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 32, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 128, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 256, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 384, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 512, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 8, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 16, dbnt_func); \ DISPATCH_CASE(GOUGH, Gough, 8, dbnt_func); \ diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 2ac0505e..96d0dabe 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the 
following conditions are met: @@ -140,8 +140,8 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER}; #define DO_IF_DUMP_SUPPORT(a) #endif -#define MAKE_LIMEX_TRAITS(mlt_size, mlt_shift) \ - template<> struct NFATraits { \ +#define MAKE_LIMEX_TRAITS(mlt_size) \ + template<> struct NFATraits { \ static UNUSED const char *name; \ static const NFACategory category = NFA_LIMEX; \ typedef LimExNFA##mlt_size implNFA_t; \ @@ -151,52 +151,22 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER}; MAX(alignof(tableRow_t), alignof(RepeatControl)); \ static const bool fast = mlt_size <= 64; \ }; \ - const has_accel_fn NFATraits::has_accel \ + const has_accel_fn NFATraits::has_accel \ = has_accel_limex; \ DO_IF_DUMP_SUPPORT( \ - const char *NFATraits::name \ - = "LimEx (0-"#mlt_shift") "#mlt_size; \ - template<> struct getDescription { \ - static string call(const void *ptr) { \ - return getDescriptionLimEx((const NFA *)ptr); \ + const char *NFATraits::name \ + = "LimEx "#mlt_size; \ + template<> struct getDescription { \ + static string call(const void *ptr) { \ + return getDescriptionLimEx((const NFA *)ptr); \ } \ };) -MAKE_LIMEX_TRAITS(32, 1) -MAKE_LIMEX_TRAITS(32, 2) -MAKE_LIMEX_TRAITS(32, 3) -MAKE_LIMEX_TRAITS(32, 4) -MAKE_LIMEX_TRAITS(32, 5) -MAKE_LIMEX_TRAITS(32, 6) -MAKE_LIMEX_TRAITS(32, 7) -MAKE_LIMEX_TRAITS(128, 1) -MAKE_LIMEX_TRAITS(128, 2) -MAKE_LIMEX_TRAITS(128, 3) -MAKE_LIMEX_TRAITS(128, 4) -MAKE_LIMEX_TRAITS(128, 5) -MAKE_LIMEX_TRAITS(128, 6) -MAKE_LIMEX_TRAITS(128, 7) -MAKE_LIMEX_TRAITS(256, 1) -MAKE_LIMEX_TRAITS(256, 2) -MAKE_LIMEX_TRAITS(256, 3) -MAKE_LIMEX_TRAITS(256, 4) -MAKE_LIMEX_TRAITS(256, 5) -MAKE_LIMEX_TRAITS(256, 6) -MAKE_LIMEX_TRAITS(256, 7) -MAKE_LIMEX_TRAITS(384, 1) -MAKE_LIMEX_TRAITS(384, 2) -MAKE_LIMEX_TRAITS(384, 3) -MAKE_LIMEX_TRAITS(384, 4) -MAKE_LIMEX_TRAITS(384, 5) -MAKE_LIMEX_TRAITS(384, 6) -MAKE_LIMEX_TRAITS(384, 7) -MAKE_LIMEX_TRAITS(512, 1) -MAKE_LIMEX_TRAITS(512, 2) -MAKE_LIMEX_TRAITS(512, 3) -MAKE_LIMEX_TRAITS(512, 4) -MAKE_LIMEX_TRAITS(512, 5) -MAKE_LIMEX_TRAITS(512, 6) -MAKE_LIMEX_TRAITS(512, 7) +MAKE_LIMEX_TRAITS(32) +MAKE_LIMEX_TRAITS(128) +MAKE_LIMEX_TRAITS(256) +MAKE_LIMEX_TRAITS(384) +MAKE_LIMEX_TRAITS(512) template<> struct NFATraits { UNUSED static const char *name; diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 4a59dc1e..577c2fd0 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -57,41 +57,11 @@ namespace ue2 { #define DISPATCH_BY_NFA_TYPE(dbnt_func) \ DEBUG_PRINTF("dispatch for NFA type %u\n", nfa->type); \ switch (nfa->type) { \ - DISPATCH_CASE(LIMEX, LimEx, 32_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 32_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 128_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 
256_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 256_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 384_7, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_1, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_2, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_3, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_4, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_5, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_6, dbnt_func); \ - DISPATCH_CASE(LIMEX, LimEx, 512_7, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 32, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 128, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 256, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 384, dbnt_func); \ + DISPATCH_CASE(LIMEX, LimEx, 512, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 8, dbnt_func); \ DISPATCH_CASE(MCCLELLAN, McClellan, 16, dbnt_func); \ DISPATCH_CASE(GOUGH, Gough, 8, dbnt_func); \ diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index 089e9683..d0a4ca0b 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -51,41 +51,11 @@ extern "C" // Common data structures for NFAs enum NFAEngineType { - LIMEX_NFA_32_1, - LIMEX_NFA_32_2, - LIMEX_NFA_32_3, - LIMEX_NFA_32_4, - LIMEX_NFA_32_5, - LIMEX_NFA_32_6, - LIMEX_NFA_32_7, - LIMEX_NFA_128_1, - LIMEX_NFA_128_2, - LIMEX_NFA_128_3, - LIMEX_NFA_128_4, - LIMEX_NFA_128_5, - LIMEX_NFA_128_6, - LIMEX_NFA_128_7, - LIMEX_NFA_256_1, - LIMEX_NFA_256_2, - LIMEX_NFA_256_3, - LIMEX_NFA_256_4, - LIMEX_NFA_256_5, - LIMEX_NFA_256_6, - LIMEX_NFA_256_7, - LIMEX_NFA_384_1, - LIMEX_NFA_384_2, - LIMEX_NFA_384_3, - LIMEX_NFA_384_4, - LIMEX_NFA_384_5, - LIMEX_NFA_384_6, - LIMEX_NFA_384_7, - LIMEX_NFA_512_1, - LIMEX_NFA_512_2, - LIMEX_NFA_512_3, - LIMEX_NFA_512_4, - LIMEX_NFA_512_5, - LIMEX_NFA_512_6, - LIMEX_NFA_512_7, + LIMEX_NFA_32, + LIMEX_NFA_128, + LIMEX_NFA_256, + LIMEX_NFA_384, + LIMEX_NFA_512, MCCLELLAN_NFA_8, /**< magic pseudo nfa */ MCCLELLAN_NFA_16, /**< magic pseudo nfa */ GOUGH_NFA_8, /**< magic pseudo nfa */ @@ -184,41 +154,11 @@ static really_inline int isDfaType(u8 t) { /** \brief True if the given type (from NFA::type) is an NFA. 
*/ static really_inline int isNfaType(u8 t) { switch (t) { - case LIMEX_NFA_32_1: - case LIMEX_NFA_32_2: - case LIMEX_NFA_32_3: - case LIMEX_NFA_32_4: - case LIMEX_NFA_32_5: - case LIMEX_NFA_32_6: - case LIMEX_NFA_32_7: - case LIMEX_NFA_128_1: - case LIMEX_NFA_128_2: - case LIMEX_NFA_128_3: - case LIMEX_NFA_128_4: - case LIMEX_NFA_128_5: - case LIMEX_NFA_128_6: - case LIMEX_NFA_128_7: - case LIMEX_NFA_256_1: - case LIMEX_NFA_256_2: - case LIMEX_NFA_256_3: - case LIMEX_NFA_256_4: - case LIMEX_NFA_256_5: - case LIMEX_NFA_256_6: - case LIMEX_NFA_256_7: - case LIMEX_NFA_384_1: - case LIMEX_NFA_384_2: - case LIMEX_NFA_384_3: - case LIMEX_NFA_384_4: - case LIMEX_NFA_384_5: - case LIMEX_NFA_384_6: - case LIMEX_NFA_384_7: - case LIMEX_NFA_512_1: - case LIMEX_NFA_512_2: - case LIMEX_NFA_512_3: - case LIMEX_NFA_512_4: - case LIMEX_NFA_512_5: - case LIMEX_NFA_512_6: - case LIMEX_NFA_512_7: + case LIMEX_NFA_32: + case LIMEX_NFA_128: + case LIMEX_NFA_256: + case LIMEX_NFA_384: + case LIMEX_NFA_512: return 1; default: break; diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index 9d3c00b5..eb6ce08b 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -130,7 +130,7 @@ protected: INSTANTIATE_TEST_CASE_P( LimEx, LimExModelTest, - Range((int)LIMEX_NFA_32_1, (int)LIMEX_NFA_512_7)); + Range((int)LIMEX_NFA_32, (int)LIMEX_NFA_512)); TEST_P(LimExModelTest, StateSize) { ASSERT_TRUE(nfa != nullptr); @@ -337,7 +337,7 @@ protected: }; INSTANTIATE_TEST_CASE_P(LimExReverse, LimExReverseTest, - Range((int)LIMEX_NFA_32_1, (int)LIMEX_NFA_512_7)); + Range((int)LIMEX_NFA_32, (int)LIMEX_NFA_512)); TEST_P(LimExReverseTest, BlockExecReverse) { ASSERT_TRUE(nfa != nullptr); @@ -424,7 +424,7 @@ protected: }; INSTANTIATE_TEST_CASE_P(LimExZombie, LimExZombieTest, - Range((int)LIMEX_NFA_32_1, (int)LIMEX_NFA_512_7)); + Range((int)LIMEX_NFA_32, (int)LIMEX_NFA_512)); TEST_P(LimExZombieTest, GetZombieStatus) { ASSERT_TRUE(nfa != nullptr); From 86483972576dae799ea975c3ff327da6ac6cf3fa Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 21 Jun 2016 12:48:54 +1000 Subject: [PATCH 061/166] limex: invert scoring to count up from zero --- src/nfa/limex_compile.cpp | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index 563d6c9c..17e08cb5 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -167,11 +167,9 @@ struct build_info { limex_accel_info accel; }; -// Constants for scoring mechanism - #define LAST_LIMEX_NFA LIMEX_NFA_512 -const int LIMEX_INITIAL_SCORE = 2000; +// Constants for scoring mechanism const int SHIFT_COST = 10; // limex: cost per shift mask const int EXCEPTION_COST = 4; // limex: per exception @@ -1371,9 +1369,9 @@ static int getLimexScore(const build_info &args, u32 nShifts) { const NGHolder &h = args.h; u32 maxVarShift = nShifts; - int score = LIMEX_INITIAL_SCORE; + int score = 0; - score -= SHIFT_COST * nShifts; + score += SHIFT_COST * nShifts; maxVarShift = findMaxVarShift(args, nShifts); NFAStateSet exceptionalStates(args.num_states); @@ -1387,10 +1385,7 @@ int getLimexScore(const build_info &args, u32 nShifts) { exceptionalStates.set(from); } } - score -= EXCEPTION_COST * exceptionalStates.count(); - if (score < 0) { - score = 0; - } + score += EXCEPTION_COST * exceptionalStates.count(); return score; } @@ -1401,10 +1396,10 @@ static u32 findBestNumOfVarShifts(const build_info &args, int *bestScoreRet = nullptr) { u32 bestNumOfVarShifts = 0; - int bestScore = 
0; + int bestScore = INT_MAX; for (u32 shiftCount = 1; shiftCount <= MAX_SHIFT_COUNT; shiftCount++) { int score = getLimexScore(args, shiftCount); - if (score > bestScore) { + if (score < bestScore) { bestScore = score; bestNumOfVarShifts = shiftCount; } @@ -2188,8 +2183,8 @@ aligned_unique_ptr generate(NGHolder &h, NFAEngineType ntype = (NFAEngineType)i; int score = DISPATCH_BY_LIMEX_TYPE(ntype, scoreNfa, arg); + DEBUG_PRINTF("%s scores %d\n", nfa_type_name(ntype), score); if (score >= 0) { - DEBUG_PRINTF("%s scores %d\n", nfa_type_name(ntype), score); scores.push_back(make_pair(score, ntype)); } } @@ -2200,7 +2195,7 @@ aligned_unique_ptr generate(NGHolder &h, return nullptr; } - sort(scores.begin(), scores.end(), greater()); + sort(scores.begin(), scores.end(), less()); aligned_unique_ptr nfa; for (auto i = scores.begin(); !nfa && i != scores.end(); ++i) { From e915ca21c5e22cf9eb3cd0581f5802730a8b649b Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 21 Jun 2016 12:53:13 +1000 Subject: [PATCH 062/166] limex: tidy up scoring code --- src/nfa/limex_compile.cpp | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index 17e08cb5..b8857922 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -2172,20 +2172,18 @@ aligned_unique_ptr generate(NGHolder &h, // Acceleration analysis. fillAccelInfo(arg); - typedef pair EngineScore; - vector scores; + vector> scores; if (hint != INVALID_NFA) { // The caller has told us what to (attempt to) build. - scores.push_back(make_pair(0, (NFAEngineType)hint)); + scores.emplace_back(0, (NFAEngineType)hint); } else { for (size_t i = 0; i <= LAST_LIMEX_NFA; i++) { NFAEngineType ntype = (NFAEngineType)i; - int score = DISPATCH_BY_LIMEX_TYPE(ntype, scoreNfa, arg); - DEBUG_PRINTF("%s scores %d\n", nfa_type_name(ntype), score); if (score >= 0) { - scores.push_back(make_pair(score, ntype)); + DEBUG_PRINTF("%s scores %d\n", nfa_type_name(ntype), score); + scores.emplace_back(score, ntype); } } } @@ -2195,22 +2193,22 @@ aligned_unique_ptr generate(NGHolder &h, return nullptr; } - sort(scores.begin(), scores.end(), less()); + // Sort acceptable models in priority order, lowest score first. 
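The inversion turns the score into a pure cost accumulated from zero: lower is now better, the old clamp at zero is gone, and candidates can be ordered with a plain ascending sort, as the hunk below does. A worked sketch of the new scheme, using the constants from the patch:

    // Additive cost model after the inversion. Example: a model with
    // 3 shift masks and 5 exceptional states costs 3*10 + 5*4 = 50.
    // Under the old scheme the same model scored 2000 - 50 = 1950,
    // and anything that ran past the initial score was clamped to
    // zero, so very expensive models became indistinguishable.
    static const int SHIFT_COST = 10;     // limex: cost per shift mask
    static const int EXCEPTION_COST = 4;  // limex: per exceptional state

    static int limexCost(unsigned nShifts, unsigned nExceptional) {
        return SHIFT_COST * (int)nShifts
             + EXCEPTION_COST * (int)nExceptional;
    }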
+ sort(scores.begin(), scores.end()); - aligned_unique_ptr nfa; - for (auto i = scores.begin(); !nfa && i != scores.end(); ++i) { - assert(i->first >= 0); - nfa = DISPATCH_BY_LIMEX_TYPE(i->second, generateNfa, arg); + for (const auto &elem : scores) { + assert(elem.first >= 0); + NFAEngineType limex_model = elem.second; + auto nfa = DISPATCH_BY_LIMEX_TYPE(limex_model, generateNfa, arg); + if (nfa) { + DEBUG_PRINTF("successful build with NFA engine: %s\n", + nfa_type_name(limex_model)); + return nfa; + } } - if (!nfa) { - DEBUG_PRINTF("NFA build failed.\n"); - return nullptr; - } - - DEBUG_PRINTF("successful build with NFA engine: %s\n", - nfa_type_name((NFAEngineType)nfa->type)); - return nfa; + DEBUG_PRINTF("NFA build failed.\n"); + return nullptr; } u32 countAccelStates(NGHolder &h, From 0749f7c06dc3c0758fbfea3839abc0ea9a8f93b3 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 20 Jun 2016 10:17:38 +1000 Subject: [PATCH 063/166] rose: allow ghosts to be aliased --- src/rose/rose_build_role_aliasing.cpp | 82 ++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 8e883ab9..cbd56b90 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -348,8 +348,45 @@ bool isAliasingCandidate(RoseVertex v, const RoseBuildImpl &tbi) { } assert(*props.literals.begin() != MO_INVALID_IDX); + return true; +} - // Any vertex involved in a "ghost" relationship has already been disallowed +static +bool sameGhostProperties(const RoseBuildImpl &build, RoseVertex a, + RoseVertex b) { + // If these are ghost mapping keys, then they must map to the same vertex. + if (contains(build.ghost, a) || contains(build.ghost, b)) { + DEBUG_PRINTF("checking ghost key compat\n"); + if (!contains(build.ghost, a) || !contains(build.ghost, b)) { + DEBUG_PRINTF("missing ghost mapping\n"); + return false; + } + if (build.ghost.at(a) != build.ghost.at(b)) { + DEBUG_PRINTF("diff ghost mapping\n"); + return false; + } + DEBUG_PRINTF("ghost mappings ok\n"); + return true; + } + + // If they are ghost vertices, then they must have the same literals. + // FIXME: get rid of linear scan + vector ghost_a, ghost_b; + for (const auto &e : build.ghost) { + if (e.second == a) { + ghost_a.push_back(e.first); + } + if (e.second == b) { + ghost_b.push_back(e.first); + } + } + if (!ghost_a.empty() || !ghost_a.empty()) { + DEBUG_PRINTF("ghost map targets\n"); + if (build.g[a].literals != build.g[b].literals) { + DEBUG_PRINTF("diff literals\n"); + return false; + } + } return true; } @@ -380,6 +417,10 @@ bool sameRoleProperties(const RoseBuildImpl &build, RoseVertex a, RoseVertex b) return false; } + if (!sameGhostProperties(build, a, b)) { + return false; + } + /* "roses are mergeable" check are handled elsewhere */ return true; @@ -536,6 +577,28 @@ void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi) { insert(&g[b].literals, a_literals); } +static +void updateGhostMap(RoseBuildImpl &build, RoseVertex a, RoseVertex b) { + // Ghost keys. + if (contains(build.ghost, a)) { + auto it = build.ghost.find(a); + assert(it->second == build.ghost[b]); + build.ghost.erase(it); + } + + // Ghost values. FIXME: this will be slow at scale. + vector ghost_refs; + for (const auto &e : build.ghost) { + if (e.second == a) { + ghost_refs.push_back(e.first); + } + } + for (const auto &v : ghost_refs) { + build.ghost.erase(v); + build.ghost.emplace(v, b); + } +} + // Merge role 'a' into 'b'. 
static void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, @@ -566,6 +629,9 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, } mergeEdges(a, b, g); + + updateGhostMap(tbi, a, b); + removeVertexFromMaps(a, tbi, rrm); } @@ -600,21 +666,7 @@ void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, static never_inline void findCandidates(const RoseBuildImpl &tbi, CandidateSet *candidates) { - ue2::unordered_set disallowed; - - // We currently deny candidature to any vertex involved in a "ghost" - // relationship. - for (const auto &m : tbi.ghost) { - disallowed.insert(m.first); - disallowed.insert(m.second); - } - for (auto v : vertices_range(tbi.g)) { - // Ignore ghost relationships. - if (contains(disallowed, v)) { - continue; - } - if (isAliasingCandidate(v, tbi)) { DEBUG_PRINTF("candidate %zu\n", tbi.g[v].idx); DEBUG_PRINTF("lits: %u\n", *tbi.g[v].literals.begin()); From 1e5fcd5e8082c53cfffdcd463ba192fb38cd9d18 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 20 Jun 2016 10:37:22 +1000 Subject: [PATCH 064/166] tighten ghost vertex checks --- src/rose/rose_build_role_aliasing.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index cbd56b90..e3b8be2f 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -380,15 +380,19 @@ bool sameGhostProperties(const RoseBuildImpl &build, RoseVertex a, ghost_b.push_back(e.first); } } - if (!ghost_a.empty() || !ghost_a.empty()) { - DEBUG_PRINTF("ghost map targets\n"); - if (build.g[a].literals != build.g[b].literals) { - DEBUG_PRINTF("diff literals\n"); - return false; - } + + if (ghost_a.empty() && ghost_b.empty()) { + return true; } - return true; + if (ghost_a.empty() || ghost_b.empty()) { + DEBUG_PRINTF("only one is a ghost vertex\n"); + return false; + } + + // Both are ghost vertices: it is only safe to merge them if their literals + // are the same. + return build.g[a].literals == build.g[b].literals; } static From 679042779a2aa22401cd5a9c93551f82d10f0874 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 20 Jun 2016 14:56:14 +1000 Subject: [PATCH 065/166] role aliasing: use a reverse ghost map --- src/rose/rose_build_role_aliasing.cpp | 316 +++++++++++++------------- 1 file changed, 156 insertions(+), 160 deletions(-) diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index e3b8be2f..d611f5d3 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -156,22 +156,31 @@ private: ue2::unordered_set hash_cont; /* member checks */ }; -/** - * \brief Mapping from a particular rose engine to a set of associated - * vertices. - */ -typedef ue2::unordered_map > revRoseMap; +struct RoseAliasingInfo { + RoseAliasingInfo(const RoseBuildImpl &build) { + const auto &g = build.g; -} // namespace + // Populate reverse leftfix map. + for (auto v : vertices_range(g)) { + if (g[v].left) { + rev_leftfix[g[v].left].insert(v); + } + } -static -void populateRevRoseMap(const RoseGraph &g, revRoseMap *out) { - for (auto v : vertices_range(g)) { - if (g[v].left) { - (*out)[g[v].left].insert(v); + // Populate reverse ghost vertex map. + for (const auto &m : build.ghost) { + rev_ghost[m.second].insert(m.first); } } -} + + /** \brief Mapping from leftfix to vertices. */ + ue2::unordered_map> rev_leftfix; + + /** \brief Mapping from undelayed ghost to delayed vertices. 
*/ + ue2::unordered_map> rev_ghost; +}; + +} // namespace // Check successor set: must lead to the same vertices via edges with the // same properties. @@ -352,7 +361,8 @@ bool isAliasingCandidate(RoseVertex v, const RoseBuildImpl &tbi) { } static -bool sameGhostProperties(const RoseBuildImpl &build, RoseVertex a, +bool sameGhostProperties(const RoseBuildImpl &build, + const RoseAliasingInfo &rai, RoseVertex a, RoseVertex b) { // If these are ghost mapping keys, then they must map to the same vertex. if (contains(build.ghost, a) || contains(build.ghost, b)) { @@ -370,33 +380,20 @@ bool sameGhostProperties(const RoseBuildImpl &build, RoseVertex a, } // If they are ghost vertices, then they must have the same literals. - // FIXME: get rid of linear scan - vector ghost_a, ghost_b; - for (const auto &e : build.ghost) { - if (e.second == a) { - ghost_a.push_back(e.first); - } - if (e.second == b) { - ghost_b.push_back(e.first); + if (contains(rai.rev_ghost, a) || contains(rai.rev_ghost, b)) { + if (!contains(rai.rev_ghost, a) || !contains(rai.rev_ghost, b)) { + DEBUG_PRINTF("missing ghost reverse mapping\n"); + return false; } + return build.g[a].literals == build.g[b].literals; } - if (ghost_a.empty() && ghost_b.empty()) { - return true; - } - - if (ghost_a.empty() || ghost_b.empty()) { - DEBUG_PRINTF("only one is a ghost vertex\n"); - return false; - } - - // Both are ghost vertices: it is only safe to merge them if their literals - // are the same. - return build.g[a].literals == build.g[b].literals; + return true; } static -bool sameRoleProperties(const RoseBuildImpl &build, RoseVertex a, RoseVertex b) { +bool sameRoleProperties(const RoseBuildImpl &build, const RoseAliasingInfo &rai, + RoseVertex a, RoseVertex b) { const RoseGraph &g = build.g; const RoseVertexProps &aprops = g[a], &bprops = g[b]; @@ -421,7 +418,7 @@ bool sameRoleProperties(const RoseBuildImpl &build, RoseVertex a, RoseVertex b) return false; } - if (!sameGhostProperties(build, a, b)) { + if (!sameGhostProperties(build, rai, a, b)) { return false; } @@ -494,11 +491,12 @@ size_t hashRightRoleProperties(RoseVertex v, const RoseGraph &g) { } static -void removeVertexFromMaps(RoseVertex v, RoseBuildImpl &build, revRoseMap &rrm) { +void removeVertexFromMaps(RoseVertex v, RoseBuildImpl &build, + RoseAliasingInfo &rai) { if (build.g[v].left) { const left_id left(build.g[v].left); - assert(contains(rrm[left], v)); - rrm[left].erase(v); + assert(contains(rai.rev_leftfix[left], v)); + rai.rev_leftfix[left].erase(v); } } @@ -582,31 +580,28 @@ void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi) { } static -void updateGhostMap(RoseBuildImpl &build, RoseVertex a, RoseVertex b) { - // Ghost keys. +void updateGhostMap(RoseBuildImpl &build, RoseAliasingInfo &rai, RoseVertex a, + RoseVertex b) { if (contains(build.ghost, a)) { - auto it = build.ghost.find(a); - assert(it->second == build.ghost[b]); - build.ghost.erase(it); + auto ghost = build.ghost.at(a); + assert(contains(build.ghost, b) && ghost == build.ghost.at(b)); + build.ghost.erase(a); + rai.rev_ghost[ghost].erase(a); } - // Ghost values. FIXME: this will be slow at scale. 
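The rev_ghost map introduced above is what eliminates the linear scan flagged by the FIXME being removed below: with the reverse direction maintained eagerly, retargeting ghost relationships during a merge becomes a handful of map operations. A minimal self-contained sketch of the invariant, mirroring the rewritten updateGhostMap() that follows (plain std:: containers and an int Vertex stand in for RoseVertex and the ue2 containers):

    #include <map>
    #include <set>

    using Vertex = int;

    // Merge vertex a into b while keeping the forward ghost map
    // (delayed vertex -> undelayed ghost) and its reverse in sync.
    static void mergeGhost(std::map<Vertex, Vertex> &ghost,
                           std::map<Vertex, std::set<Vertex>> &rev_ghost,
                           Vertex a, Vertex b) {
        // a as a key: drop it. The aliasing preconditions guarantee b
        // is also a key with the same target.
        auto it = ghost.find(a);
        if (it != ghost.end()) {
            rev_ghost[it->second].erase(a);
            ghost.erase(it);
        }
        // a as a target: repoint every delayed vertex that mapped to a
        // onto b, updating both directions.
        auto rit = rev_ghost.find(a);
        if (rit != rev_ghost.end()) {
            for (Vertex v : rit->second) {
                ghost[v] = b;
                rev_ghost[b].insert(v);
            }
            rev_ghost.erase(rit);
        }
    }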
- vector ghost_refs; - for (const auto &e : build.ghost) { - if (e.second == a) { - ghost_refs.push_back(e.first); + if (contains(rai.rev_ghost, a)) { + for (const auto &v : rai.rev_ghost[a]) { + build.ghost[v] = b; + rai.rev_ghost[b].insert(v); } - } - for (const auto &v : ghost_refs) { - build.ghost.erase(v); - build.ghost.emplace(v, b); + rai.rev_ghost.erase(a); } } // Merge role 'a' into 'b'. static void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { RoseGraph &g = tbi.g; DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); @@ -633,10 +628,8 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, } mergeEdges(a, b, g); - - updateGhostMap(tbi, a, b); - - removeVertexFromMaps(a, tbi, rrm); + updateGhostMap(tbi, rai, a, b); + removeVertexFromMaps(a, tbi, rai); } /** @@ -645,7 +638,7 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, */ static void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { RoseGraph &g = tbi.g; DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); @@ -665,7 +658,7 @@ void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, g[b].max_offset = max(g[a].max_offset, g[b].max_offset); mergeLiteralSets(a, b, tbi); - removeVertexFromMaps(a, tbi, rrm); + removeVertexFromMaps(a, tbi, rai); } static never_inline @@ -885,7 +878,7 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g, static bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { RoseGraph &g = tbi.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; @@ -931,9 +924,9 @@ bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, } } - assert(contains(rrm[b_left], b)); - rrm[b_left].erase(b); - rrm[a_left].insert(b); + assert(contains(rai.rev_leftfix[b_left], b)); + rai.rev_leftfix[b_left].erase(b); + rai.rev_leftfix[a_left].insert(b); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; @@ -942,14 +935,14 @@ bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, updateEdgeTops(g, a, a_top_map); updateEdgeTops(g, b, b_top_map); - pruneUnusedTops(castle, g, rrm[a_left]); + pruneUnusedTops(castle, g, rai.rev_leftfix[a_left]); return true; } static bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, RoseVertex b, bool trivialCasesOnly, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { RoseGraph &g = tbi.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; @@ -968,28 +961,28 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, if (&a_castle == &b_castle) { DEBUG_PRINTF("castles are the same\n"); - return mergeSameCastle(tbi, a, b, rrm); + return mergeSameCastle(tbi, a, b, rai); } if (is_equal(a_castle, a_left.leftfix_report, b_castle, b_left.leftfix_report)) { DEBUG_PRINTF("castles are equiv with respect to reports\n"); - if (rrm[a_left_id].size() == 1) { + if (rai.rev_leftfix[a_left_id].size() == 1) { /* nobody else is using a_castle */ - rrm[b_left_id].erase(b); - rrm[a_left_id].insert(b); - pruneUnusedTops(b_castle, g, rrm[b_left_id]); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); + pruneUnusedTops(b_castle, g, rai.rev_leftfix[b_left_id]); b_left.castle = a_left.castle; b_left.leftfix_report = a_left.leftfix_report; DEBUG_PRINTF("OK -> only user of a_castle\n"); return true; } - if 
(rrm[b_left_id].size() == 1) { + if (rai.rev_leftfix[b_left_id].size() == 1) { /* nobody else is using b_castle */ - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); - pruneUnusedTops(a_castle, g, rrm[a_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); + pruneUnusedTops(a_castle, g, rai.rev_leftfix[a_left_id]); a_left.castle = b_left.castle; a_left.leftfix_report = b_left.leftfix_report; DEBUG_PRINTF("OK -> only user of b_castle\n"); @@ -998,32 +991,32 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, if (preds_same) { /* preds are the same anyway in diamond/left merges just need to - * check that all the literals in rrm[b_h] can handle a_h */ - for (auto v : rrm[b_left_id]) { + * check that all the literals in rev_leftfix[b_h] can handle a_h */ + for (auto v : rai.rev_leftfix[b_left_id]) { if (!mergeableRoseVertices(tbi, a, v)) { goto literal_mismatch_1; } } - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); - pruneUnusedTops(a_castle, g, rrm[a_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); + pruneUnusedTops(a_castle, g, rai.rev_leftfix[a_left_id]); a_left.castle = b_left.castle; a_left.leftfix_report = b_left.leftfix_report; DEBUG_PRINTF("OK -> same preds ???\n"); return true; literal_mismatch_1: /* preds are the same anyway in diamond/left merges just need to - * check that all the literals in rrm[a_h] can handle b_h */ - for (auto v : rrm[a_left_id]) { + * check that all the literals in rev_leftfix[a_h] can handle b_h */ + for (auto v : rai.rev_leftfix[a_left_id]) { if (!mergeableRoseVertices(tbi, v, b)) { goto literal_mismatch_2; } } - rrm[b_left_id].erase(b); - rrm[a_left_id].insert(b); - pruneUnusedTops(b_castle, g, rrm[b_left_id]); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); + pruneUnusedTops(b_castle, g, rai.rev_leftfix[b_left_id]); b_left.castle = a_left.castle; b_left.leftfix_report = a_left.leftfix_report; DEBUG_PRINTF("OK -> same preds ???\n"); @@ -1039,10 +1032,10 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, pruneCastle(*new_castle, a_left.leftfix_report); setReports(*new_castle, new_report); - rrm[a_left_id].erase(a); - rrm[b_left_id].erase(b); - pruneUnusedTops(*a_left.castle, g, rrm[a_left_id]); - pruneUnusedTops(*b_left.castle, g, rrm[b_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].erase(b); + pruneUnusedTops(*a_left.castle, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_left.castle, g, rai.rev_leftfix[b_left_id]); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; @@ -1050,9 +1043,9 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, b_left.castle = new_castle; assert(a_left == b_left); - rrm[a_left].insert(a); - rrm[a_left].insert(b); - pruneUnusedTops(*new_castle, g, rrm[a_left]); + rai.rev_leftfix[a_left].insert(a); + rai.rev_leftfix[a_left].insert(b); + pruneUnusedTops(*new_castle, g, rai.rev_leftfix[a_left]); return true; } @@ -1068,7 +1061,7 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, return false; } - set &b_verts = rrm[b_left_id]; + set &b_verts = rai.rev_leftfix[b_left_id]; set aa; aa.insert(a); @@ -1126,10 +1119,10 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, DEBUG_PRINTF("merged into castle containing %zu repeats\n", m_castle->repeats.size()); - rrm[a_left_id].erase(a); - rrm[b_left_id].erase(b); - 
pruneUnusedTops(*a_left.castle, g, rrm[a_left_id]); - pruneUnusedTops(*b_left.castle, g, rrm[b_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].erase(b); + pruneUnusedTops(*a_left.castle, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_left.castle, g, rai.rev_leftfix[b_left_id]); a_left.castle = m_castle; a_left.leftfix_report = new_report; @@ -1137,16 +1130,16 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, b_left.leftfix_report = new_report; assert(a_left == b_left); - rrm[a_left].insert(a); - rrm[a_left].insert(b); - pruneUnusedTops(*m_castle, g, rrm[a_left]); + rai.rev_leftfix[a_left].insert(a); + rai.rev_leftfix[a_left].insert(b); + pruneUnusedTops(*m_castle, g, rai.rev_leftfix[a_left]); return true; } static bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, RoseVertex b, bool trivialCasesOnly, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { RoseGraph &g = tbi.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; @@ -1169,67 +1162,67 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, duplicateReport(*b_h, b_left.leftfix_report, new_report); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; - pruneReportIfUnused(tbi, b_h, rrm[b_left_id], a_oldreport); - pruneReportIfUnused(tbi, b_h, rrm[b_left_id], b_oldreport); - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneReportIfUnused(tbi, b_h, rai.rev_leftfix[b_left_id], a_oldreport); + pruneReportIfUnused(tbi, b_h, rai.rev_leftfix[b_left_id], b_oldreport); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); assert(a_left == b_left); return true; } /* if it is the same graph, it is also fairly easy */ if (is_equal(*a_h, a_left.leftfix_report, *b_h, b_left.leftfix_report)) { - if (rrm[a_left_id].size() == 1) { + if (rai.rev_leftfix[a_left_id].size() == 1) { /* nobody else is using a_h */ - rrm[b_left_id].erase(b); - rrm[a_left_id].insert(b); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); b_left.graph = a_h; b_left.leftfix_report = a_left.leftfix_report; - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); DEBUG_PRINTF("OK -> only user of a_h\n"); return true; } - if (rrm[b_left_id].size() == 1) { + if (rai.rev_leftfix[b_left_id].size() == 1) { /* nobody else is using b_h */ - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); a_left.graph = b_h; a_left.leftfix_report = b_left.leftfix_report; - pruneUnusedTops(*a_h, g, rrm[a_left_id]); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); DEBUG_PRINTF("OK -> only user of b_h\n"); return true; } if (preds_same) { /* preds are the same anyway in diamond/left merges just need to - * check that all the literals in rrm[b_h] can handle a_h */ - for (auto v : rrm[b_left_id]) { + * check that all the literals in rev_leftfix[b_h] can handle a_h */ + for (auto v : rai.rev_leftfix[b_left_id]) { if (!mergeableRoseVertices(tbi, a, v)) { goto literal_mismatch_1; } } - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); a_left.graph = b_h; a_left.leftfix_report = b_left.leftfix_report; - pruneUnusedTops(*a_h, g, rrm[a_left_id]); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); DEBUG_PRINTF("OK -> same preds ???\n"); return true; literal_mismatch_1: /* preds are the same anyway in diamond/left merges 
just need to - * check that all the literals in rrm[a_h] can handle b_h */ - for (auto v : rrm[a_left_id]) { + * check that all the literals in rev_leftfix[a_h] can handle b_h */ + for (auto v : rai.rev_leftfix[a_left_id]) { if (!mergeableRoseVertices(tbi, v, b)) { goto literal_mismatch_2; } } - rrm[b_left_id].erase(b); - rrm[a_left_id].insert(b); + rai.rev_leftfix[b_left_id].erase(b); + rai.rev_leftfix[a_left_id].insert(b); b_left.graph = a_h; b_left.leftfix_report = a_left.leftfix_report; - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); DEBUG_PRINTF("OK -> same preds ???\n"); return true; literal_mismatch_2:; @@ -1243,19 +1236,19 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, duplicateReport(*new_graph, b_left.leftfix_report, new_report); pruneReportIfUnused(tbi, new_graph, {}, b_left.leftfix_report); - rrm[a_left_id].erase(a); - rrm[b_left_id].erase(b); - pruneUnusedTops(*a_h, g, rrm[a_left_id]); - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].erase(b); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; a_left.graph = new_graph; b_left.graph = new_graph; - rrm[a_left].insert(a); - rrm[a_left].insert(b); - pruneUnusedTops(*new_graph, g, rrm[a_left]); + rai.rev_leftfix[a_left].insert(a); + rai.rev_leftfix[a_left].insert(b); + pruneUnusedTops(*new_graph, g, rai.rev_leftfix[a_left]); return true; } @@ -1274,7 +1267,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, DEBUG_PRINTF("attempting merge of roses on vertices %zu and %zu\n", g[a].idx, g[b].idx); - set &b_verts = rrm[b_left]; + set &b_verts = rai.rev_leftfix[b_left]; set aa; aa.insert(a); @@ -1296,7 +1289,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, ReportID new_report = tbi.getNewNfaReport(); duplicateReport(*b_h, b_left.leftfix_report, new_report); b_left.leftfix_report = new_report; - pruneReportIfUnused(tbi, b_h, rrm[b_left_id], b_oldreport); + pruneReportIfUnused(tbi, b_h, rai.rev_leftfix[b_left_id], b_oldreport); NGHolder victim; cloneHolder(victim, *a_h); @@ -1337,16 +1330,16 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, a_left.graph = b_h; a_left.leftfix_report = new_report; - assert(contains(rrm[a_left_id], a)); - assert(contains(rrm[b_left_id], b)); - rrm[a_left_id].erase(a); - rrm[b_left_id].insert(a); + assert(contains(rai.rev_leftfix[a_left_id], a)); + assert(contains(rai.rev_leftfix[b_left_id], b)); + rai.rev_leftfix[a_left_id].erase(a); + rai.rev_leftfix[b_left_id].insert(a); - pruneUnusedTops(*a_h, g, rrm[a_left_id]); - pruneUnusedTops(*b_h, g, rrm[b_left_id]); + pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]); + pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); // Prune A's report from its old prefix if it was only used by A. - pruneReportIfUnused(tbi, a_h, rrm[a_left_id], a_oldreport); + pruneReportIfUnused(tbi, a_h, rai.rev_leftfix[a_left_id], a_oldreport); reduceImplementableGraph(*b_h, SOM_NONE, nullptr, tbi.cc); @@ -1361,7 +1354,8 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, // is not possible. 
static bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, - RoseVertex b, bool trivialCasesOnly, revRoseMap &rrm) { + RoseVertex b, bool trivialCasesOnly, + RoseAliasingInfo &rai) { DEBUG_PRINTF("attempting rose merge, vertices a=%zu, b=%zu\n", tbi.g[a].idx, tbi.g[b].idx); assert(a != b); @@ -1406,12 +1400,12 @@ bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, if (a_left_id.graph() && b_left_id.graph()) { return attemptRoseGraphMerge(tbi, preds_same, a, b, trivialCasesOnly, - rrm); + rai); } if (a_left_id.castle() && b_left_id.castle()) { return attemptRoseCastleMerge(tbi, preds_same, a, b, trivialCasesOnly, - rrm); + rai); } return false; @@ -1557,7 +1551,7 @@ vector> splitDiamondMergeBuckets(CandidateSet &candidates, static never_inline void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, vector *dead, bool mergeRoses, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { DEBUG_PRINTF("begin\n"); RoseGraph &g = tbi.g; @@ -1580,7 +1574,7 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, RoseVertex b = *jt; assert(contains(candidates, b)); - if (!sameRoleProperties(tbi, a, b)) { + if (!sameRoleProperties(tbi, rai, a, b)) { DEBUG_PRINTF("diff role prop\n"); continue; } @@ -1602,12 +1596,12 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, continue; } - if (!attemptRoseMerge(tbi, true, a, b, !mergeRoses, rrm)) { + if (!attemptRoseMerge(tbi, true, a, b, !mergeRoses, rai)) { DEBUG_PRINTF("rose fail\n"); continue; } - mergeVerticesDiamond(a, b, tbi, rrm); + mergeVerticesDiamond(a, b, tbi, rai); dead->push_back(a); candidates.erase(a); break; // next a @@ -1623,6 +1617,7 @@ vector::iterator findLeftMergeSibling( vector::iterator it, const vector::iterator &end, const RoseVertex a, const RoseBuildImpl &build, + const RoseAliasingInfo &rai, const CandidateSet &candidates) { const RoseGraph &g = build.g; @@ -1636,7 +1631,7 @@ vector::iterator findLeftMergeSibling( continue; } - if (!sameRoleProperties(build, a, b)) { + if (!sameRoleProperties(build, rai, a, b)) { continue; } @@ -1667,7 +1662,7 @@ vector::iterator findLeftMergeSibling( static never_inline void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, - vector *dead, revRoseMap &rrm) { + vector *dead, RoseAliasingInfo &rai) { DEBUG_PRINTF("begin (%zu)\n", candidates.size()); RoseGraph &g = tbi.g; vector siblings; @@ -1701,19 +1696,19 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, sort(siblings.begin(), siblings.end(), VertexIndexComp(g)); auto jt = findLeftMergeSibling(siblings.begin(), siblings.end(), a, tbi, - candidates); + rai, candidates); if (jt == siblings.end()) { continue; } RoseVertex b = *jt; - if (!attemptRoseMerge(tbi, true, a, b, 0, rrm)) { + if (!attemptRoseMerge(tbi, true, a, b, 0, rai)) { DEBUG_PRINTF("rose fail\n"); continue; } - mergeVertices(a, b, tbi, rrm); + mergeVertices(a, b, tbi, rai); dead->push_back(a); candidates.erase(ait); } @@ -1748,6 +1743,7 @@ vector::const_iterator findRightMergeSibling( vector::const_iterator it, const vector::const_iterator &end, const RoseVertex a, const RoseBuildImpl &build, + const RoseAliasingInfo &rai, const CandidateSet &candidates) { const RoseGraph &g = build.g; @@ -1761,7 +1757,7 @@ vector::const_iterator findRightMergeSibling( continue; } - if (!sameRoleProperties(build, a, b)) { + if (!sameRoleProperties(build, rai, a, b)) { continue; } @@ -1888,7 +1884,7 @@ const vector &getCandidateRightSiblings( static never_inline void rightMergePass(CandidateSet 
&candidates, RoseBuildImpl &tbi, vector *dead, bool mergeRoses, - revRoseMap &rrm) { + RoseAliasingInfo &rai) { DEBUG_PRINTF("begin\n"); map > sibling_cache; @@ -1911,11 +1907,12 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, auto jt = siblings.begin(); while (jt != siblings.end()) { - jt = findRightMergeSibling(jt, siblings.end(), a, tbi, candidates); + jt = findRightMergeSibling(jt, siblings.end(), a, tbi, rai, + candidates); if (jt == siblings.end()) { break; } - if (attemptRoseMerge(tbi, false, a, *jt, !mergeRoses, rrm)) { + if (attemptRoseMerge(tbi, false, a, *jt, !mergeRoses, rai)) { break; } ++jt; @@ -1926,7 +1923,7 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, } RoseVertex b = *jt; - mergeVertices(a, b, tbi, rrm); + mergeVertices(a, b, tbi, rai); dead->push_back(a); candidates.erase(ait); } @@ -2002,10 +1999,9 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { return; } - revRoseMap rrm; - DEBUG_PRINTF("doing role aliasing mr=%d\n", (int)mergeRoses); - populateRevRoseMap(g, &rrm); + + RoseAliasingInfo rai(build); mergeRoses &= cc.grey.mergeRose & cc.grey.roseMergeRosesDuringAliasing; @@ -2018,8 +2014,8 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { size_t old_dead_size = 0; do { old_dead_size = dead.size(); - leftMergePass(candidates, build, &dead, rrm); - rightMergePass(candidates, build, &dead, mergeRoses, rrm); + leftMergePass(candidates, build, &dead, rai); + rightMergePass(candidates, build, &dead, mergeRoses, rai); } while (old_dead_size != dead.size()); /* Diamond merge passes cannot create extra merges as they require the same @@ -2027,7 +2023,7 @@ void aliasRoles(RoseBuildImpl &build, bool mergeRoses) { * to a merge to different pred/succ before a diamond merge, it will still * be afterwards. */ filterDiamondCandidates(g, candidates); - diamondMergePass(candidates, build, &dead, mergeRoses, rrm); + diamondMergePass(candidates, build, &dead, mergeRoses, rai); DEBUG_PRINTF("killed %zu vertices\n", dead.size()); build.removeVertices(dead); From 623980556113b45da1e18eccc1ec71bd826402d9 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 20 Jun 2016 16:38:03 +1000 Subject: [PATCH 066/166] rose: don't build empty sparse iter subprograms --- src/rose/rose_build_bytecode.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 6d485b0c..cab0d60a 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -3409,6 +3409,7 @@ void addPredBlocksSingle( for (const auto &m : predProgramLists) { const u32 &pred_state = m.first; + assert(!m.second.empty()); auto subprog = flattenProgram(m.second); // Check our pred state. 
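The assertion added above is backed by the producer-side guards in the hunks that follow: an entry is created in predProgramLists only when a non-empty program is pushed into it, so every list the pred-block builders later flatten is guaranteed to contain real work. A toy model of the pattern, where Program is an assumed stand-in for the instruction-list type these hunks leave unnamed:

    #include <map>
    #include <vector>

    using Program = std::vector<int>; // stand-in for the real type

    // Producer side: skip empty programs, so map entries only exist
    // for predecessor states with at least one non-empty subprogram.
    static void addProgram(std::map<int, std::vector<Program>> &lists,
                           int pred_state, const Program &program) {
        if (program.empty()) {
            return; // never materialise an empty subprogram
        }
        lists[pred_state].push_back(program);
    }

Consumers iterating the map can then assert(!m.second.empty()) before flattening, exactly as addPredBlocksSingle and addPredBlocksMulti now do.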
@@ -3465,6 +3466,7 @@ void addPredBlocksMulti(build_context &bc, DEBUG_PRINTF("subprogram %zu has offset %u\n", jump_table.size(), curr_offset); jump_table.push_back(curr_offset); + assert(!e.second.empty()); auto subprog = flattenProgram(e.second); if (e.first != keys.back()) { @@ -3800,6 +3802,9 @@ vector buildLiteralProgram(RoseBuildImpl &build, assert(contains(bc.roleStateIndices, u)); u32 pred_state = bc.roleStateIndices.at(u); auto program = makeProgram(build, bc, e); + if (program.empty()) { + continue; + } predProgramLists[pred_state].push_back(program); } @@ -4047,6 +4052,9 @@ void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, u32 predStateIdx = bc.roleStateIndices.at(u); auto program = makeEodAnchorProgram(build, bc, e, multiple_preds); + if (program.empty()) { + continue; + } predProgramLists[predStateIdx].push_back(program); } } From 1d18852dc9e06029f407d46bad5bd925ece30a00 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 23 Jun 2016 11:07:39 +1000 Subject: [PATCH 067/166] role aliasing: unify map update code --- src/rose/rose_build_role_aliasing.cpp | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index d611f5d3..a0416ac5 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -490,16 +490,6 @@ size_t hashRightRoleProperties(RoseVertex v, const RoseGraph &g) { return val; } -static -void removeVertexFromMaps(RoseVertex v, RoseBuildImpl &build, - RoseAliasingInfo &rai) { - if (build.g[v].left) { - const left_id left(build.g[v].left); - assert(contains(rai.rev_leftfix[left], v)); - rai.rev_leftfix[left].erase(v); - } -} - static void mergeEdgeAdd(RoseVertex u, RoseVertex v, const RoseEdge &from_edge, const RoseEdge *to_edge, RoseGraph &g) { @@ -580,8 +570,13 @@ void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi) { } static -void updateGhostMap(RoseBuildImpl &build, RoseAliasingInfo &rai, RoseVertex a, - RoseVertex b) { +void updateAliasingInfo(RoseBuildImpl &build, RoseAliasingInfo &rai, + RoseVertex a, RoseVertex b) { + if (build.g[a].left) { + const left_id left(build.g[a].left); + assert(contains(rai.rev_leftfix[left], a)); + rai.rev_leftfix[left].erase(a); + } if (contains(build.ghost, a)) { auto ghost = build.ghost.at(a); assert(contains(build.ghost, b) && ghost == build.ghost.at(b)); @@ -628,8 +623,7 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, } mergeEdges(a, b, g); - updateGhostMap(tbi, rai, a, b); - removeVertexFromMaps(a, tbi, rai); + updateAliasingInfo(tbi, rai, a, b); } /** @@ -658,7 +652,7 @@ void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, g[b].max_offset = max(g[a].max_offset, g[b].max_offset); mergeLiteralSets(a, b, tbi); - removeVertexFromMaps(a, tbi, rai); + updateAliasingInfo(tbi, rai, a, b); } static never_inline From 1c2b0a271dc723210428ea6113d6fc0c1b6c19e9 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 23 Jun 2016 11:14:34 +1000 Subject: [PATCH 068/166] role aliasing: tidy up naming --- src/rose/rose_build_role_aliasing.cpp | 223 +++++++++++++------------- 1 file changed, 115 insertions(+), 108 deletions(-) diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index a0416ac5..292e199a 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -271,7 +271,8 @@ bool samePredecessors(RoseVertex a, RoseVertex b, const RoseGraph 
&g) { } static -bool hasCommonSuccWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) { +bool hasCommonSuccWithBadBounds(RoseVertex a, RoseVertex b, + const RoseGraph &g) { for (const auto &e_a : out_edges_range(a, g)) { bool exists; RoseEdge e; @@ -292,7 +293,8 @@ bool hasCommonSuccWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) } static -bool hasCommonPredWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) { +bool hasCommonPredWithBadBounds(RoseVertex a, RoseVertex b, + const RoseGraph &g) { for (const auto &e_a : in_edges_range(a, g)) { bool exists; RoseEdge e; @@ -318,23 +320,24 @@ bool hasCommonPredWithBadBounds(RoseVertex a, RoseVertex b, const RoseGraph &g) } static -bool canMergeLiterals(RoseVertex a, RoseVertex b, const RoseBuildImpl &tbi) { - const auto &lits_a = tbi.g[a].literals; - const auto &lits_b = tbi.g[b].literals; +bool canMergeLiterals(RoseVertex a, RoseVertex b, const RoseBuildImpl &build) { + const auto &lits_a = build.g[a].literals; + const auto &lits_b = build.g[b].literals; assert(!lits_a.empty() && !lits_b.empty()); // If both vertices have only pseudo-dotstar in-edges, we can merge // literals of different lengths and can avoid the check below. - if (tbi.hasOnlyPseudoStarInEdges(a) && tbi.hasOnlyPseudoStarInEdges(b)) { + if (build.hasOnlyPseudoStarInEdges(a) && + build.hasOnlyPseudoStarInEdges(b)) { DEBUG_PRINTF("both have pseudo-dotstar in-edges\n"); return true; } // Otherwise, all the literals involved must have the same length. for (u32 a_id : lits_a) { - const rose_literal_id &la = tbi.literals.right.at(a_id); + const rose_literal_id &la = build.literals.right.at(a_id); for (u32 b_id : lits_b) { - const rose_literal_id &lb = tbi.literals.right.at(b_id); + const rose_literal_id &lb = build.literals.right.at(b_id); if (la.elength() != lb.elength()) { DEBUG_PRINTF("bad merge %zu!=%zu '%s', '%s'\n", la.elength(), @@ -348,8 +351,8 @@ bool canMergeLiterals(RoseVertex a, RoseVertex b, const RoseBuildImpl &tbi) { } static -bool isAliasingCandidate(RoseVertex v, const RoseBuildImpl &tbi) { - const RoseVertexProps &props = tbi.g[v]; +bool isAliasingCandidate(RoseVertex v, const RoseBuildImpl &build) { + const RoseVertexProps &props = build.g[v]; // Must have literals. if (props.literals.empty()) { @@ -427,8 +430,8 @@ bool sameRoleProperties(const RoseBuildImpl &build, const RoseAliasingInfo &rai, return true; } -/* Checks compatibility of role properties if we require that two roles are right - * equiv. */ +/* Checks compatibility of role properties if we require that two roles are + * right equiv. */ static bool sameRightRoleProperties(const RoseBuildImpl &build, RoseVertex a, RoseVertex b) { @@ -518,7 +521,7 @@ void mergeEdges(RoseVertex a, RoseVertex b, RoseGraph &g) { // Cache b's in-edges so we can look them up by source quickly. for (const auto &e : in_edges_range(b, g)) { RoseVertex u = source(e, g); - b_edges.insert(make_pair(u, e)); + b_edges.emplace(u, e); } // Add a's in-edges to b, merging them in where b already has the new edge. @@ -537,7 +540,7 @@ void mergeEdges(RoseVertex a, RoseVertex b, RoseGraph &g) { b_edges.clear(); for (const auto &e : out_edges_range(b, g)) { RoseVertex v = target(e, g); - b_edges.insert(make_pair(v, e)); + b_edges.emplace(v, e); } // Add a's out-edges to b, merging them in where b already has the new edge. 
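Alongside the tbi -> build renames, these hunks switch insert(make_pair(...)) to emplace(), which forwards its arguments and constructs the map's value_type in place rather than building a temporary pair first; behaviour is unchanged. A minimal illustration of the two styles:

    #include <cassert>
    #include <map>
    #include <utility>

    int main() {
        std::map<int, int> m;
        m.insert(std::make_pair(1, 2)); // style before this patch
        m.emplace(3, 4);                // style after: no temporary pair
        assert(m.size() == 2);
        return 0;
    }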
@@ -557,11 +560,11 @@ void mergeEdges(RoseVertex a, RoseVertex b, RoseGraph &g) { } static -void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi) { - RoseGraph &g = tbi.g; +void mergeLiteralSets(RoseVertex a, RoseVertex b, RoseBuildImpl &build) { + RoseGraph &g = build.g; const auto &a_literals = g[a].literals; for (u32 lit_id : a_literals) { - auto &lit_vertices = tbi.literal_info[lit_id].vertices; + auto &lit_vertices = build.literal_info[lit_id].vertices; lit_vertices.erase(a); lit_vertices.insert(b); } @@ -595,9 +598,9 @@ void updateAliasingInfo(RoseBuildImpl &build, RoseAliasingInfo &rai, // Merge role 'a' into 'b'. static -void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, +void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &build, RoseAliasingInfo &rai) { - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); // Merge role properties. @@ -614,7 +617,7 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, g[b].min_offset = min(g[a].min_offset, g[b].min_offset); g[b].max_offset = max(g[a].max_offset, g[b].max_offset); - mergeLiteralSets(a, b, tbi); + mergeLiteralSets(a, b, build); if (!g[b].suffix) { g[b].suffix = g[a].suffix; @@ -623,7 +626,7 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, } mergeEdges(a, b, g); - updateAliasingInfo(tbi, rai, a, b); + updateAliasingInfo(build, rai, a, b); } /** @@ -631,9 +634,9 @@ void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, * that the in- and out-edge sets, reports and suffixes are identical. */ static -void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, +void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &build, RoseAliasingInfo &rai) { - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); // Merge role properties. 
For a diamond merge, most properties are already @@ -651,23 +654,23 @@ void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &tbi, g[b].min_offset = min(g[a].min_offset, g[b].min_offset); g[b].max_offset = max(g[a].max_offset, g[b].max_offset); - mergeLiteralSets(a, b, tbi); - updateAliasingInfo(tbi, rai, a, b); + mergeLiteralSets(a, b, build); + updateAliasingInfo(build, rai, a, b); } static never_inline -void findCandidates(const RoseBuildImpl &tbi, CandidateSet *candidates) { - for (auto v : vertices_range(tbi.g)) { - if (isAliasingCandidate(v, tbi)) { - DEBUG_PRINTF("candidate %zu\n", tbi.g[v].idx); - DEBUG_PRINTF("lits: %u\n", *tbi.g[v].literals.begin()); +void findCandidates(const RoseBuildImpl &build, CandidateSet *candidates) { + for (auto v : vertices_range(build.g)) { + if (isAliasingCandidate(v, build)) { + DEBUG_PRINTF("candidate %zu\n", build.g[v].idx); + DEBUG_PRINTF("lits: %u\n", *build.g[v].literals.begin()); candidates->insert(v); } } - assert(candidates->size() <= num_vertices(tbi.g)); + assert(candidates->size() <= num_vertices(build.g)); DEBUG_PRINTF("found %zu/%zu candidates\n", candidates->size(), - num_vertices(tbi.g)); + num_vertices(build.g)); } static @@ -682,7 +685,7 @@ RoseVertex pickSucc(const RoseVertex v, const RoseGraph &g) { static RoseVertex pickPred(const RoseVertex v, const RoseGraph &g, - const RoseBuildImpl &tbi) { + const RoseBuildImpl &build) { RoseGraph::in_edge_iterator ei, ee; tie(ei, ee) = in_edges(v, g); if (ei == ee) { @@ -693,7 +696,7 @@ RoseVertex pickPred(const RoseVertex v, const RoseGraph &g, // Avoid roots if we have other options, since it doesn't matter to the // merge pass which predecessor we pick. RoseVertex u = source(*ei, g); - while (tbi.isAnyStart(u) && ++ei != ee) { + while (build.isAnyStart(u) && ++ei != ee) { u = source(*ei, g); } return u; @@ -743,12 +746,13 @@ bool hasCommonPredWithDiffRoses(RoseVertex a, RoseVertex b, } static -void pruneReportIfUnused(const RoseBuildImpl &tbi, shared_ptr h, +void pruneReportIfUnused(const RoseBuildImpl &build, shared_ptr h, const set &verts, ReportID report) { DEBUG_PRINTF("trying to prune %u from %p (v %zu)\n", report, h.get(), verts.size()); for (RoseVertex v : verts) { - if (tbi.g[v].left.graph == h && tbi.g[v].left.leftfix_report == report) { + if (build.g[v].left.graph == h && + build.g[v].left.leftfix_report == report) { DEBUG_PRINTF("report %u still in use\n", report); return; } @@ -760,12 +764,12 @@ void pruneReportIfUnused(const RoseBuildImpl &tbi, shared_ptr h, // unimplementable. DEBUG_PRINTF("report %u has been merged away, pruning\n", report); - assert(h->kind == tbi.isRootSuccessor(*verts.begin()) ? NFA_PREFIX - : NFA_INFIX); + assert(h->kind == build.isRootSuccessor(*verts.begin()) ? 
NFA_PREFIX + : NFA_INFIX); unique_ptr h_new = cloneHolder(*h); pruneReport(*h_new, report); - if (isImplementableNFA(*h_new, nullptr, tbi.cc)) { + if (isImplementableNFA(*h_new, nullptr, build.cc)) { clear_graph(*h); cloneHolder(*h, *h_new); } else { @@ -871,9 +875,9 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g, } static -bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, +bool mergeSameCastle(RoseBuildImpl &build, RoseVertex a, RoseVertex b, RoseAliasingInfo &rai) { - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; CastleProto &castle = *a_left.castle; @@ -896,7 +900,7 @@ bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, return false; } - const ReportID new_report = tbi.getNewNfaReport(); + const ReportID new_report = build.getNewNfaReport(); map a_top_map, b_top_map; for (const auto &c : castle.repeats) { @@ -934,10 +938,10 @@ bool mergeSameCastle(RoseBuildImpl &tbi, RoseVertex a, RoseVertex b, } static -bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, +bool attemptRoseCastleMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, RoseVertex b, bool trivialCasesOnly, RoseAliasingInfo &rai) { - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; left_id a_left_id(a_left); @@ -955,7 +959,7 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, if (&a_castle == &b_castle) { DEBUG_PRINTF("castles are the same\n"); - return mergeSameCastle(tbi, a, b, rai); + return mergeSameCastle(build, a, b, rai); } if (is_equal(a_castle, a_left.leftfix_report, b_castle, @@ -987,7 +991,7 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* preds are the same anyway in diamond/left merges just need to * check that all the literals in rev_leftfix[b_h] can handle a_h */ for (auto v : rai.rev_leftfix[b_left_id]) { - if (!mergeableRoseVertices(tbi, a, v)) { + if (!mergeableRoseVertices(build, a, v)) { goto literal_mismatch_1; } } @@ -1003,7 +1007,7 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* preds are the same anyway in diamond/left merges just need to * check that all the literals in rev_leftfix[a_h] can handle b_h */ for (auto v : rai.rev_leftfix[a_left_id]) { - if (!mergeableRoseVertices(tbi, v, b)) { + if (!mergeableRoseVertices(build, v, b)) { goto literal_mismatch_2; } } @@ -1021,7 +1025,7 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* we need to create a new graph as there may be other people * using b_left and it would be bad if a's preds started triggering it */ - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); shared_ptr new_castle = make_shared(a_castle); pruneCastle(*new_castle, a_left.leftfix_report); setReports(*new_castle, new_report); @@ -1051,7 +1055,7 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, // Only infixes. Prefixes require special care when doing non-trivial // merges. 
- if (!tbi.isNonRootSuccessor(a) || !tbi.isNonRootSuccessor(b)) { + if (!build.isNonRootSuccessor(a) || !build.isNonRootSuccessor(b)) { return false; } @@ -1059,19 +1063,19 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, set aa; aa.insert(a); - if (!mergeableRoseVertices(tbi, aa, b_verts)) { + if (!mergeableRoseVertices(build, aa, b_verts)) { DEBUG_PRINTF("vertices not mergeable\n"); return false; } - if (!tbi.cc.grey.roseMultiTopRoses || !tbi.cc.grey.allowCastle) { + if (!build.cc.grey.roseMultiTopRoses || !build.cc.grey.allowCastle) { return false; } DEBUG_PRINTF("merging into new castle\n"); // Clone new castle with a's repeats in it, set to a new report. - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); shared_ptr m_castle = make_shared(a_castle); pruneCastle(*m_castle, a_left.leftfix_report); setReports(*m_castle, new_report); @@ -1131,10 +1135,10 @@ bool attemptRoseCastleMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, } static -bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, +bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, RoseVertex b, bool trivialCasesOnly, RoseAliasingInfo &rai) { - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; left_id a_left_id(a_left); @@ -1151,13 +1155,15 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, DEBUG_PRINTF("OK -> same actual holder\n"); ReportID a_oldreport = a_left.leftfix_report; ReportID b_oldreport = b_left.leftfix_report; - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); duplicateReport(*a_h, a_left.leftfix_report, new_report); duplicateReport(*b_h, b_left.leftfix_report, new_report); a_left.leftfix_report = new_report; b_left.leftfix_report = new_report; - pruneReportIfUnused(tbi, b_h, rai.rev_leftfix[b_left_id], a_oldreport); - pruneReportIfUnused(tbi, b_h, rai.rev_leftfix[b_left_id], b_oldreport); + pruneReportIfUnused(build, b_h, rai.rev_leftfix[b_left_id], + a_oldreport); + pruneReportIfUnused(build, b_h, rai.rev_leftfix[b_left_id], + b_oldreport); pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); assert(a_left == b_left); return true; @@ -1191,7 +1197,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* preds are the same anyway in diamond/left merges just need to * check that all the literals in rev_leftfix[b_h] can handle a_h */ for (auto v : rai.rev_leftfix[b_left_id]) { - if (!mergeableRoseVertices(tbi, a, v)) { + if (!mergeableRoseVertices(build, a, v)) { goto literal_mismatch_1; } } @@ -1207,7 +1213,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* preds are the same anyway in diamond/left merges just need to * check that all the literals in rev_leftfix[a_h] can handle b_h */ for (auto v : rai.rev_leftfix[a_left_id]) { - if (!mergeableRoseVertices(tbi, v, b)) { + if (!mergeableRoseVertices(build, v, b)) { goto literal_mismatch_2; } } @@ -1225,10 +1231,10 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* we need to create a new graph as there may be other people * using b_left and it would be bad if a's preds started triggering it */ - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); shared_ptr new_graph = cloneHolder(*b_h); duplicateReport(*new_graph, b_left.leftfix_report, new_report); - 
pruneReportIfUnused(tbi, new_graph, {}, b_left.leftfix_report); + pruneReportIfUnused(build, new_graph, {}, b_left.leftfix_report); rai.rev_leftfix[a_left_id].erase(a); rai.rev_leftfix[b_left_id].erase(b); @@ -1254,7 +1260,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, // Only infixes. Prefixes require special care when doing non-trivial // merges. - if (!tbi.isNonRootSuccessor(a) || !tbi.isNonRootSuccessor(b)) { + if (!build.isNonRootSuccessor(a) || !build.isNonRootSuccessor(b)) { return false; } @@ -1265,12 +1271,12 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, set aa; aa.insert(a); - if (!mergeableRoseVertices(tbi, aa, b_verts)) { + if (!mergeableRoseVertices(build, aa, b_verts)) { DEBUG_PRINTF("vertices not mergeable\n"); return false; } - if (!tbi.cc.grey.roseMultiTopRoses) { + if (!build.cc.grey.roseMultiTopRoses) { return false; } @@ -1280,10 +1286,10 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, /* We need to allocate a new report id because */ ReportID a_oldreport = a_left.leftfix_report; ReportID b_oldreport = b_left.leftfix_report; - ReportID new_report = tbi.getNewNfaReport(); + ReportID new_report = build.getNewNfaReport(); duplicateReport(*b_h, b_left.leftfix_report, new_report); b_left.leftfix_report = new_report; - pruneReportIfUnused(tbi, b_h, rai.rev_leftfix[b_left_id], b_oldreport); + pruneReportIfUnused(build, b_h, rai.rev_leftfix[b_left_id], b_oldreport); NGHolder victim; cloneHolder(victim, *a_h); @@ -1307,7 +1313,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, assert(victim.kind == b_h->kind); assert(!generates_callbacks(*b_h)); - if (!mergeNfaPair(victim, *b_h, nullptr, tbi.cc)) { + if (!mergeNfaPair(victim, *b_h, nullptr, build.cc)) { DEBUG_PRINTF("merge failed\n"); // Restore in-edge properties. for (const auto &e : in_edges_range(a, g)) { @@ -1333,13 +1339,13 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, pruneUnusedTops(*b_h, g, rai.rev_leftfix[b_left_id]); // Prune A's report from its old prefix if it was only used by A. - pruneReportIfUnused(tbi, a_h, rai.rev_leftfix[a_left_id], a_oldreport); + pruneReportIfUnused(build, a_h, rai.rev_leftfix[a_left_id], a_oldreport); - reduceImplementableGraph(*b_h, SOM_NONE, nullptr, tbi.cc); + reduceImplementableGraph(*b_h, SOM_NONE, nullptr, build.cc); assert(roseHasTops(g, a)); assert(roseHasTops(g, b)); - assert(isImplementableNFA(*b_h, nullptr, tbi.cc)); + assert(isImplementableNFA(*b_h, nullptr, build.cc)); return true; } @@ -1347,14 +1353,14 @@ bool attemptRoseGraphMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, // the two LeftEngInfo structures to be the same. Returns false if the merge // is not possible. static -bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, +bool attemptRoseMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, RoseVertex b, bool trivialCasesOnly, RoseAliasingInfo &rai) { DEBUG_PRINTF("attempting rose merge, vertices a=%zu, b=%zu\n", - tbi.g[a].idx, tbi.g[b].idx); + build.g[a].idx, build.g[b].idx); assert(a != b); - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; LeftEngInfo &a_left = g[a].left; LeftEngInfo &b_left = g[b].left; @@ -1378,8 +1384,8 @@ bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, } // Only non-transients for the moment. 
- if (contains(tbi.transient, a_left_id) || - contains(tbi.transient, b_left_id)) { + if (contains(build.transient, a_left_id) || + contains(build.transient, b_left_id)) { return false; } @@ -1393,12 +1399,12 @@ bool attemptRoseMerge(RoseBuildImpl &tbi, bool preds_same, RoseVertex a, assert(roseHasTops(g, b)); if (a_left_id.graph() && b_left_id.graph()) { - return attemptRoseGraphMerge(tbi, preds_same, a, b, trivialCasesOnly, + return attemptRoseGraphMerge(build, preds_same, a, b, trivialCasesOnly, rai); } if (a_left_id.castle() && b_left_id.castle()) { - return attemptRoseCastleMerge(tbi, preds_same, a, b, trivialCasesOnly, + return attemptRoseCastleMerge(build, preds_same, a, b, trivialCasesOnly, rai); } @@ -1524,8 +1530,8 @@ void splitByNeighbour(const RoseGraph &g, vector> &buckets, } static -vector> splitDiamondMergeBuckets(CandidateSet &candidates, - const RoseBuildImpl &build) { +vector> +splitDiamondMergeBuckets(CandidateSet &candidates, const RoseBuildImpl &build) { const RoseGraph &g = build.g; vector> buckets(1); @@ -1542,19 +1548,20 @@ vector> splitDiamondMergeBuckets(CandidateSet &candidates, return buckets; } + static never_inline -void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, +void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &build, vector *dead, bool mergeRoses, RoseAliasingInfo &rai) { DEBUG_PRINTF("begin\n"); - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; if (candidates.empty()) { return; } /* Vertices may only be diamond merged with others in the same bucket */ - auto cand_buckets = splitDiamondMergeBuckets(candidates, tbi); + auto cand_buckets = splitDiamondMergeBuckets(candidates, build); for (const vector &siblings : cand_buckets) { for (auto it = siblings.begin(); it != siblings.end();) { @@ -1568,7 +1575,7 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, RoseVertex b = *jt; assert(contains(candidates, b)); - if (!sameRoleProperties(tbi, rai, a, b)) { + if (!sameRoleProperties(build, rai, a, b)) { DEBUG_PRINTF("diff role prop\n"); continue; } @@ -1579,23 +1586,23 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, * so we still have to checks successors and predecessors. 
*/ if (!sameSuccessors(a, b, g) - || !sameRightRoleProperties(tbi, a, b) + || !sameRightRoleProperties(build, a, b) || !samePredecessors(a, b, g)) { DEBUG_PRINTF("not diamond\n"); continue; } - if (!canMergeLiterals(a, b, tbi)) { + if (!canMergeLiterals(a, b, build)) { DEBUG_PRINTF("incompatible lits\n"); continue; } - if (!attemptRoseMerge(tbi, true, a, b, !mergeRoses, rai)) { + if (!attemptRoseMerge(build, true, a, b, !mergeRoses, rai)) { DEBUG_PRINTF("rose fail\n"); continue; } - mergeVerticesDiamond(a, b, tbi, rai); + mergeVerticesDiamond(a, b, build, rai); dead->push_back(a); candidates.erase(a); break; // next a @@ -1655,10 +1662,10 @@ vector::iterator findLeftMergeSibling( } static never_inline -void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, +void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build, vector *dead, RoseAliasingInfo &rai) { DEBUG_PRINTF("begin (%zu)\n", candidates.size()); - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; vector siblings; CandidateSet::iterator it = candidates.begin(); @@ -1673,11 +1680,11 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, assert(!g[a].literals.empty()); u32 lit_id = *g[a].literals.begin(); - const auto &verts = tbi.literal_info.at(lit_id).vertices; - RoseVertex pred = pickPred(a, g, tbi); + const auto &verts = build.literal_info.at(lit_id).vertices; + RoseVertex pred = pickPred(a, g, build); siblings.clear(); - if (pred == RoseGraph::null_vertex() || tbi.isAnyStart(pred) || + if (pred == RoseGraph::null_vertex() || build.isAnyStart(pred) || hasGreaterOutDegree(verts.size(), pred, g)) { // Select sibling from amongst the vertices that share a literal. siblings.insert(siblings.end(), verts.begin(), verts.end()); @@ -1689,20 +1696,20 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, sort(siblings.begin(), siblings.end(), VertexIndexComp(g)); - auto jt = findLeftMergeSibling(siblings.begin(), siblings.end(), a, tbi, - rai, candidates); + auto jt = findLeftMergeSibling(siblings.begin(), siblings.end(), a, + build, rai, candidates); if (jt == siblings.end()) { continue; } RoseVertex b = *jt; - if (!attemptRoseMerge(tbi, true, a, b, 0, rai)) { + if (!attemptRoseMerge(build, true, a, b, 0, rai)) { DEBUG_PRINTF("rose fail\n"); continue; } - mergeVertices(a, b, tbi, rai); + mergeVertices(a, b, build, rai); dead->push_back(a); candidates.erase(ait); } @@ -1809,10 +1816,10 @@ void split(map &keys, size_t *next_key, Iter it, } static never_inline -void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &tbi, - map > &sibling_cache, +void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &build, + map> &sibling_cache, map &keys_ext) { - RoseGraph &g = tbi.g; + RoseGraph &g = build.g; size_t next_key = 1; map keys; @@ -1828,7 +1835,7 @@ void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &tbi, assert(!g[a].literals.empty()); u32 lit_id = *g[a].literals.begin(); RoseVertex succ = pickSucc(a, g); - const auto &verts = tbi.literal_info.at(lit_id).vertices; + const auto &verts = build.literal_info.at(lit_id).vertices; if (succ != RoseGraph::null_vertex() && !hasGreaterInDegree(verts.size(), succ, g)) { if (!done_succ.insert(succ).second) { @@ -1863,28 +1870,28 @@ void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &tbi, } for (auto &siblings : sibling_cache | map_values) { - sort(siblings.begin(), siblings.end(), VertexIndexComp(tbi.g)); + sort(siblings.begin(), siblings.end(), VertexIndexComp(build.g)); } } static 
const vector &getCandidateRightSiblings( - const map > &sibling_cache, + const map> &sibling_cache, map &keys, RoseVertex a) { size_t key = keys.at(a); return sibling_cache.at(key); } static never_inline -void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, +void rightMergePass(CandidateSet &candidates, RoseBuildImpl &build, vector *dead, bool mergeRoses, RoseAliasingInfo &rai) { DEBUG_PRINTF("begin\n"); - map > sibling_cache; + map> sibling_cache; map keys; - buildCandidateRightSiblings(candidates, tbi, sibling_cache, keys); + buildCandidateRightSiblings(candidates, build, sibling_cache, keys); CandidateSet::iterator it = candidates.begin(); while (it != candidates.end()) { @@ -1901,12 +1908,12 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, auto jt = siblings.begin(); while (jt != siblings.end()) { - jt = findRightMergeSibling(jt, siblings.end(), a, tbi, rai, + jt = findRightMergeSibling(jt, siblings.end(), a, build, rai, candidates); if (jt == siblings.end()) { break; } - if (attemptRoseMerge(tbi, false, a, *jt, !mergeRoses, rai)) { + if (attemptRoseMerge(build, false, a, *jt, !mergeRoses, rai)) { break; } ++jt; @@ -1917,7 +1924,7 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &tbi, } RoseVertex b = *jt; - mergeVertices(a, b, tbi, rai); + mergeVertices(a, b, build, rai); dead->push_back(a); candidates.erase(ait); } From 575e8c06dcf04a823c1a0dcb025db70d242202d3 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Thu, 23 Jun 2016 13:14:39 +1000 Subject: [PATCH 069/166] only show floating groups to the floating table --- src/rose/block.c | 4 ++-- src/rose/match.c | 14 +++++++++++++- src/rose/match.h | 1 + src/rose/rose_build_bytecode.cpp | 4 +++- src/rose/rose_build_matchers.cpp | 6 ++++++ src/rose/rose_build_matchers.h | 1 + src/rose/rose_dump.cpp | 2 ++ src/rose/rose_internal.h | 1 + src/rose/stream.c | 9 +++++---- 9 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/rose/block.c b/src/rose/block.c index c0b5e0e4..55323c2e 100644 --- a/src/rose/block.c +++ b/src/rose/block.c @@ -260,8 +260,8 @@ int roseBlockFloating(const struct RoseEngine *t, struct hs_scratch *scratch) { DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); DEBUG_PRINTF("-- %016llx\n", tctxt->groups); - hwlmExec(ftable, buffer, flen, t->floatingMinDistance, roseCallback, - scratch, tctxt->groups); + hwlmExec(ftable, buffer, flen, t->floatingMinDistance, roseFloatingCallback, + scratch, tctxt->groups & t->floating_group_mask); return can_stop_matching(scratch); } diff --git a/src/rose/match.c b/src/rose/match.c index 4e9e72a6..e89c8d3a 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -516,7 +516,8 @@ anchored_leftovers:; return rv; } -hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { +static really_inline +hwlmcb_rv_t roseCallback_i(size_t start, size_t end, u32 id, void *ctxt) { struct hs_scratch *scratch = ctxt; struct RoseContext *tctx = &scratch->tctxt; const struct RoseEngine *t = scratch->core_info.rose; @@ -564,6 +565,17 @@ hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { return HWLM_TERMINATE_MATCHING; } +hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctxt) { + return roseCallback_i(start, end, id, ctxt); +} + +hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctxt) { + struct hs_scratch *scratch = ctxt; + const struct RoseEngine *t = scratch->core_info.rose; + + return roseCallback_i(start, end, id, ctxt) & t->floating_group_mask; +} + /** * \brief 
Match callback adaptor used for matches from pure-literal cases. * diff --git a/src/rose/match.h b/src/rose/match.h index cee32fc2..5b587aec 100644 --- a/src/rose/match.h +++ b/src/rose/match.h @@ -51,6 +51,7 @@ int roseNfaSomAdaptor(u64a from_offset, u64a offset, ReportID id, void *context) /* Callbacks, defined in match.c */ hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctx); +hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctx); hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, void *ctx); int roseAnchoredCallback(u64a end, u32 id, void *ctx); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index cab0d60a..3f36a05e 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -4370,9 +4370,10 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { } // Build floating HWLM matcher. + rose_group fgroups = 0; size_t fsize = 0; size_t floatingStreamStateRequired = 0; - auto ftable = buildFloatingMatcher(*this, &fsize, &historyRequired, + auto ftable = buildFloatingMatcher(*this, &fgroups, &fsize, &historyRequired, &floatingStreamStateRequired); u32 fmatcherOffset = 0; if (ftable) { @@ -4584,6 +4585,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { fillMatcherDistances(*this, engine.get()); engine->initialGroups = getInitialGroups(); + engine->floating_group_mask = fgroups; engine->totalNumLiterals = verify_u32(literal_info.size()); engine->asize = verify_u32(asize); engine->ematcherRegionSize = ematcher_region_size; diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index f4597de7..b66556fc 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -580,10 +580,12 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, } aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, + rose_group *fgroups, size_t *fsize, size_t *historyRequired, size_t *streamStateRequired) { *fsize = 0; + *fgroups = 0; auto fl = fillHamsterLiteralList(build, ROSE_FLOATING); if (fl.empty()) { @@ -591,6 +593,10 @@ aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, return nullptr; } + for (const hwlmLiteral &hlit : fl) { + *fgroups |= hlit.groups; + } + hwlmStreamingControl ctl; hwlmStreamingControl *ctlp; if (build.cc.streaming) { diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h index 1dd53cd8..7d5c9283 100644 --- a/src/rose/rose_build_matchers.h +++ b/src/rose/rose_build_matchers.h @@ -48,6 +48,7 @@ std::vector fillHamsterLiteralList(const RoseBuildImpl &build, rose_literal_table table); aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, + rose_group *fgroups, size_t *fsize, size_t *historyRequired, size_t *streamStateRequired); diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index 40979e8c..9f55dbf2 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -930,6 +930,7 @@ void roseDumpText(const RoseEngine *t, FILE *f) { fprintf(f, "\n"); fprintf(f, "initial groups : 0x%016llx\n", t->initialGroups); + fprintf(f, "floating groups : 0x%016llx\n", t->floating_group_mask); fprintf(f, "handled key count : %u\n", t->handledKeyCount); fprintf(f, "\n"); @@ -1035,6 +1036,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, floatingMinLiteralMatchOffset); DUMP_U32(t, nfaInfoOffset); DUMP_U64(t, initialGroups); + DUMP_U64(t, floating_group_mask); DUMP_U32(t, size); DUMP_U32(t, delay_count); 
DUMP_U32(t, delay_base_id);
diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h
index af5b2a95..9dd17350 100644
--- a/src/rose/rose_internal.h
+++ b/src/rose/rose_internal.h
@@ -401,6 +401,7 @@ struct RoseEngine {
                          * table */
     u32 nfaInfoOffset; /* offset to the nfa info offset array */
     rose_group initialGroups;
+    rose_group floating_group_mask; /* groups that are used by the ftable */
     u32 size; // (bytes)
     u32 delay_count; /* number of delayed literal ids. */
     u32 delay_base_id; /* literal id of the first delayed literal.
diff --git a/src/rose/stream.c b/src/rose/stream.c
index 0e382f03..ffe965dd 100644
--- a/src/rose/stream.c
+++ b/src/rose/stream.c
@@ -461,8 +461,8 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
     tctxt->minMatchOffset = offset;
     tctxt->minNonMpvMatchOffset = offset;
     tctxt->next_mpv_offset = 0;
-    DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu\n",
-                 scratch->core_info.hlen, scratch->core_info.len);
+    DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu groups=%016llx\n",
+                 scratch->core_info.hlen, scratch->core_info.len, tctxt->groups);

     fatbit_clear(scratch->aqa);
     scratch->al_log_sum = 0;
@@ -540,8 +540,9 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
         }

         DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length);
-        hwlmExecStreaming(ftable, scratch, flen, start, roseCallback, scratch,
-                          tctxt->groups, stream_state);
+        hwlmExecStreaming(ftable, scratch, flen, start, roseFloatingCallback,
+                          scratch, tctxt->groups & t->floating_group_mask,
+                          stream_state);
     }

 flush_delay_and_exit:

From 8699e35c0985e7a92b7a4fdaa9c59ad9f9b6443e Mon Sep 17 00:00:00 2001
From: Alex Coyte
Date: Thu, 23 Jun 2016 14:01:55 +1000
Subject: [PATCH 070/166] prevent merging the e and f tables if the ftable is squashable

---
 src/rose/rose_build_compile.cpp | 44 +++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp
index a6868ff8..d59d4d4f 100644
--- a/src/rose/rose_build_compile.cpp
+++ b/src/rose/rose_build_compile.cpp
@@ -537,6 +537,45 @@ bool RoseBuildImpl::isDirectReport(u32 id) const {
     return true;
 }

+
+/* If we have prefixes that can squash all the floating roots, we can have a
+ * somewhat-conditional floating table. As we can't yet look at squash_masks,
+ * we have to make a guess as to whether we are in this case, but the win for
+ * not running a floating table over a large portion of the stream is
+ * significantly larger than avoiding running an eod table over the last N
+ * bytes.
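+ *
+ * (Concretely: the check below requires every floating root successor to be
+ * guarded by a left prefix that is able to die: an NFA prefix with no
+ * self-loop on startDs, or a DFA prefix whose start_floating state is
+ * DEAD_STATE. Once such a prefix dies, its squash mask can switch off the
+ * literal's groups, leaving the floating table with no live work.)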
+ */
+static
+bool checkFloatingKillableByPrefixes(const RoseBuildImpl &tbi) {
+    for (auto v : vertices_range(tbi.g)) {
+        if (!tbi.isRootSuccessor(v)) {
+            continue;
+        }
+
+        if (!tbi.isFloating(v)) {
+            continue;
+        }
+
+        if (!tbi.g[v].left) {
+            DEBUG_PRINTF("unguarded floating root\n");
+            return false;
+        }
+
+        if (tbi.g[v].left.graph) {
+            const NGHolder &h = *tbi.g[v].left.graph;
+            if (proper_out_degree(h.startDs, h)) {
+                DEBUG_PRINTF("floating nfa prefix, won't die\n");
+                return false;
+            }
+        } else if (tbi.g[v].left.dfa) {
+            if (tbi.g[v].left.dfa->start_floating != DEAD_STATE) {
+                DEBUG_PRINTF("floating dfa prefix, won't die\n");
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
 static
 bool checkEodStealFloating(const RoseBuildImpl &tbi,
                            const vector<u32> &eodLiteralsForFloating,
@@ -558,6 +597,11 @@ bool checkEodStealFloating(const RoseBuildImpl &tbi,
         return false;
     }

+    if (checkFloatingKillableByPrefixes(tbi)) {
+        DEBUG_PRINTF("skipping as prefixes may make ftable conditional\n");
+        return false;
+    }
+
     DEBUG_PRINTF("%zu are eod literals, %u floating; floating len=%zu\n",
                  eodLiteralsForFloating.size(), numFloatingLiterals,
                  shortestFloatingLen);

From f9ded5936173fb97fb0d5e5943d87fb91a114b51 Mon Sep 17 00:00:00 2001
From: Matthew Barr
Date: Wed, 22 Jun 2016 15:03:22 +1000
Subject: [PATCH 071/166] Disable strict aliasing

Strict aliasing allows the compiler to make some optimisations, but they
aren't without risk. The benefits do not appear to be worth the risk.
---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2652cea3..1b6e0e94 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,8 +173,8 @@ else()
 endif()

 # set compiler flags - more are tested and added later
-set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual")
-set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor")
+set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
+set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing")
 if (NOT RELEASE_BUILD)
     # -Werror is most useful during development, don't potentially break
     # release builds

From f166bc5658ea57ee81a135f2a85ceca09390caf0 Mon Sep 17 00:00:00 2001
From: Alex Coyte
Date: Fri, 24 Jun 2016 09:28:42 +1000
Subject: [PATCH 072/166] allow some prefixes that may squash the literal match to run eagerly

---
 src/nfa/castle.c                  |  40 +++
 src/nfa/castle.h                  |   3 +-
 src/nfa/gough.c                   |   8 +
 src/nfa/gough.h                   |   4 +-
 src/nfa/lbr.h                     |   7 +-
 src/nfa/lbr_common_impl.h         |  11 +-
 src/nfa/limex.h                   |   1 +
 src/nfa/limex_common_impl.h       |  24 +-
 src/nfa/limex_compile.cpp         |   3 +-
 src/nfa/limex_runtime_impl.h      |  37 ++-
 src/nfa/mcclellan.c               |  42 ++-
 src/nfa/mcclellan.h               |   4 +-
 src/nfa/mcclellancompile_util.cpp |  32 ++
 src/nfa/mcclellancompile_util.h   |   2 +
 src/nfa/mpv.h                     |   4 +-
 src/nfa/nfa_api.h                 |  14 +-
 src/nfa/nfa_api_dispatch.c        |   6 +-
 src/nfa/nfa_kind.h                |  25 +-
 src/nfagraph/ng_execute.cpp       |  47 ++-
 src/nfagraph/ng_execute.h         |   5 +-
 src/nfagraph/ng_holder.h          |  17 +-
 src/nfagraph/ng_limex.cpp         |  10 +-
 src/nfagraph/ng_mcclellan.cpp     |   8 +-
 src/nfagraph/ng_split.cpp         |  14 +-
 src/rose/block.c                  |  82 +++++
 src/rose/program_runtime.h        |   8 +-
 src/rose/rose_build_add.cpp       |   1 +
 src/rose/rose_build_bytecode.cpp  | 479 +++++++++++++++++++++++-------
 src/rose/rose_build_dump.cpp      |   2 +
 src/rose/rose_build_impl.h        |   2 +-
 src/rose/rose_dump.cpp            |   4 +
 src/rose/rose_internal.h          | 
4 + src/rose/runtime.h | 5 + src/rose/stream.c | 88 ++++++ 34 files changed, 895 insertions(+), 148 deletions(-) diff --git a/src/nfa/castle.c b/src/nfa/castle.c index 13a44a97..bfdcf6b5 100644 --- a/src/nfa/castle.c +++ b/src/nfa/castle.c @@ -979,6 +979,46 @@ char nfaExecCastle0_inAccept(const struct NFA *n, ReportID report, return castleInAccept(c, q, report, q_cur_offset(q)); } +char nfaExecCastle0_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA_0); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + const u64a offset = q_cur_offset(q); + DEBUG_PRINTF("offset=%llu\n", offset); + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + const struct SubCastle *sub = getSubCastle(c, activeIdx); + if (subCastleInAccept(c, q, sub->report, offset, activeIdx)) { + return 1; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + const struct SubCastle *sub = getSubCastle(c, i); + if (subCastleInAccept(c, q, sub->report, offset, i)) { + return 1; + } + } + } + + return 0; +} + + char nfaExecCastle0_queueInitState(UNUSED const struct NFA *n, struct mq *q) { assert(n && q); assert(n->type == CASTLE_NFA_0); diff --git a/src/nfa/castle.h b/src/nfa/castle.h index 8fc3514b..84d79097 100644 --- a/src/nfa/castle.h +++ b/src/nfa/castle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,6 +44,7 @@ char nfaExecCastle0_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecCastle0_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecCastle0_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecCastle0_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecCastle0_queueInitState(const struct NFA *n, struct mq *q); char nfaExecCastle0_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/gough.c b/src/nfa/gough.c index c52bca06..3b7a115d 100644 --- a/src/nfa/gough.c +++ b/src/nfa/gough.c @@ -1048,6 +1048,14 @@ char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, return nfaExecMcClellan16_inAccept(n, report, q); } +char nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan8_inAnyAccept(n, q); +} + +char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan16_inAnyAccept(n, q); +} + static char goughCheckEOD(const struct NFA *nfa, u16 s, const struct gough_som_info *som, diff --git a/src/nfa/gough.h b/src/nfa/gough.h index 41d4cb5a..1a7dbd74 100644 --- a/src/nfa/gough.h +++ b/src/nfa/gough.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted 
provided that the following conditions are met: @@ -46,6 +46,7 @@ char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecGough8_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecGough8_queueInitState(const struct NFA *n, struct mq *q); char nfaExecGough8_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -68,6 +69,7 @@ char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecGough16_queueInitState(const struct NFA *n, struct mq *q); char nfaExecGough16_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/lbr.h b/src/nfa/lbr.h index b770477d..a9e42046 100644 --- a/src/nfa/lbr.h +++ b/src/nfa/lbr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,6 +46,7 @@ char nfaExecLbrDot_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecLbrDot_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrDot_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrDot_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrDot_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrDot_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrDot_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -66,6 +67,7 @@ char nfaExecLbrVerm_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrVerm_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrVerm_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrVerm_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrVerm_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrVerm_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -86,6 +88,7 @@ char nfaExecLbrNVerm_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrNVerm_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrNVerm_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrNVerm_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrNVerm_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrNVerm_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -106,6 +109,7 @@ char nfaExecLbrShuf_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrShuf_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrShuf_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrShuf_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrShuf_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrShuf_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -126,6 +130,7 @@ char 
nfaExecLbrTruf_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecLbrTruf_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecLbrTruf_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrTruf_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecLbrTruf_queueInitState(const struct NFA *n, struct mq *q); char nfaExecLbrTruf_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/lbr_common_impl.h b/src/nfa/lbr_common_impl.h index 917a8e91..4fb8f62a 100644 --- a/src/nfa/lbr_common_impl.h +++ b/src/nfa/lbr_common_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -94,6 +94,15 @@ char JOIN(ENGINE_EXEC_NAME, _inAccept)(const struct NFA *nfa, return lbrInAccept(l, lstate, q->streamState, offset, report); } +char JOIN(ENGINE_EXEC_NAME, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + return JOIN(ENGINE_EXEC_NAME, _inAccept)(nfa, l->report, q); +} + char JOIN(ENGINE_EXEC_NAME, _queueInitState)(const struct NFA *nfa, struct mq *q) { assert(nfa && q); diff --git a/src/nfa/limex.h b/src/nfa/limex.h index 57ee46df..3d4d258b 100644 --- a/src/nfa/limex.h +++ b/src/nfa/limex.h @@ -60,6 +60,7 @@ extern "C" char gf_name##_reportCurrent(const struct NFA *n, struct mq *q); \ char gf_name##_inAccept(const struct NFA *n, ReportID report, \ struct mq *q); \ + char gf_name##_inAnyAccept(const struct NFA *n, struct mq *q); \ char gf_name##_queueInitState(const struct NFA *n, struct mq *q); \ char gf_name##_initCompressedState(const struct NFA *n, u64a offset, \ void *state, u8 key); \ diff --git a/src/nfa/limex_common_impl.h b/src/nfa/limex_common_impl.h index 6e4b7718..68e0c0ad 100644 --- a/src/nfa/limex_common_impl.h +++ b/src/nfa/limex_common_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,7 @@ #define TESTEOD_FN JOIN(moNfaTestEod, SIZE) #define TESTEOD_REV_FN JOIN(moNfaRevTestEod, SIZE) #define LIMEX_INACCEPT_FN JOIN(limexInAccept, SIZE) +#define LIMEX_INANYACCEPT_FN JOIN(limexInAnyAccept, SIZE) #define EXPIRE_ESTATE_FN JOIN(limexExpireExtendedState, SIZE) #define REPORTCURRENT_FN JOIN(moNfaReportCurrent, SIZE) #define INITIAL_FN JOIN(moNfaInitial, SIZE) @@ -374,11 +375,32 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, return 0; } +static really_inline +char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, + union RepeatControl *repeat_ctrl, char *repeat_state, + u64a offset) { + assert(limex); + + const STATE_T acceptMask = LOAD_STATE(&limex->accept); + STATE_T accstate = AND_STATE(state, acceptMask); + + // Are we in an accept state? 
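+    // (An accept bit here may belong to a bounded-repeat tug state whose
+    // repeat is not actually in a match at this offset; SQUASH_UNTUG_BR_FN
+    // below strips such states from accstate before we answer.)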
+ if (ISZERO_STATE(accstate)) { + DEBUG_PRINTF("no accept states are on\n"); + return 0; + } + + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accstate); + + return ISNONZERO_STATE(accstate); +} + #undef TESTEOD_FN #undef TESTEOD_REV_FN #undef REPORTCURRENT_FN #undef EXPIRE_ESTATE_FN #undef LIMEX_INACCEPT_FN +#undef LIMEX_INANYACCEPT_FN #undef INITIAL_FN #undef TOP_FN #undef TOPN_FN diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index b8857922..79e6db1c 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -1008,7 +1008,8 @@ void findMaskedCompressionStates(const build_info &args, // Suffixes and outfixes can mask out leaf states, which should all be // accepts. Right now we can only do this when there is nothing in initDs, // as we switch that on unconditionally in the expand call. - if (generates_callbacks(h) && !hasInitDsStates(h, args.state_ids)) { + if (!inspects_states_for_accepts(h) + && !hasInitDsStates(h, args.state_ids)) { NFAStateSet nonleaf(args.num_states); for (const auto &e : edges_range(h)) { u32 from = args.state_ids.at(source(e, h)); diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 9924ef8c..19a5ebd3 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -650,7 +650,27 @@ char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { ep = MIN(ep, end_abs); assert(ep >= sp); - assert(sp >= offset); // We no longer do history buffer scans here. + if (sp < offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(offset - sp <= q->hlength); + u64a local_ep = MIN(offset, ep); + u64a final_look = 0; + /* we are starting inside the history buffer */ + if (STREAMFIRST_FN(limex, q->history + q->hlength + sp - offset, + local_ep - sp, &ctx, sp, + &final_look) == MO_HALT_MATCHING) { + DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu " + "offset:%llu\n", final_look, sp, end_abs, offset); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp + final_look - offset; + STORE_STATE(q->state, LOAD_STATE(&ctx.s)); + return MO_MATCHES_PENDING; + } + + sp = local_ep; + } if (sp >= ep) { goto scan_done; @@ -868,6 +888,21 @@ char JOIN(LIMEX_API_ROOT, _inAccept)(const struct NFA *nfa, offset, report); } +char JOIN(LIMEX_API_ROOT, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(q->state && q->streamState); + + const IMPL_NFA_T *limex = getImplNfa(nfa); + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + STATE_T state = LOAD_STATE(q->state); + u64a offset = q->offset + q_last_loc(q) + 1; + + return JOIN(limexInAnyAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, + offset); +} + enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( const struct NFA *nfa, struct mq *q, diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c index 314e88e7..ac26c6a1 100644 --- a/src/nfa/mcclellan.c +++ b/src/nfa/mcclellan.c @@ -850,7 +850,7 @@ char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { } char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); NfaCallback cb = q->cb; void *ctxt = q->context; u16 s = *(u16 *)q->state; @@ -905,7 +905,7 @@ char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, struct mq *q) { assert(n && q); - 
const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); u8 s = *(u8 *)q->state; DEBUG_PRINTF("checking accepts for %hhu\n", s); if (s < m->accept_limit_8) { @@ -915,25 +915,45 @@ char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, return mcclellanHasAccept(m, get_aux(m, s), report); } +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + assert(s < m->accept_limit_8 || get_aux(m, s)->accept); + + return s >= m->accept_limit_8; +} char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, struct mq *q) { assert(n && q); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); u16 s = *(u16 *)q->state; DEBUG_PRINTF("checking accepts for %hu\n", s); return mcclellanHasAccept(m, get_aux(m, s), report); } +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux(m, s)->accept; +} + char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_8); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; return nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, @@ -947,7 +967,7 @@ char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end) { NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_16); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; return nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, @@ -961,7 +981,7 @@ char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report) { NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_8); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; char rv = nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, @@ -980,7 +1000,7 @@ char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report) { NfaCallback cb = q->cb; void *context = q->context; assert(n->type == MCCLELLAN_NFA_16); - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + const struct mcclellan *m = getImplNfa(n); const u8 *hend = q->history + q->hlength; char rv = nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, @@ -996,7 +1016,7 @@ char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report) { char nfaExecMcClellan8_initCompressedState(const struct NFA *nfa, u64a offset, void *state, UNUSED u8 key) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u8 s = offset ? 
m->start_floating : m->start_anchored; if (s) { *(u8 *)state = s; @@ -1007,7 +1027,7 @@ char nfaExecMcClellan8_initCompressedState(const struct NFA *nfa, u64a offset, char nfaExecMcClellan16_initCompressedState(const struct NFA *nfa, u64a offset, void *state, UNUSED u8 key) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u16 s = offset ? m->start_floating : m->start_anchored; if (s) { unaligned_store_u16(state, s); @@ -1019,7 +1039,7 @@ char nfaExecMcClellan16_initCompressedState(const struct NFA *nfa, u64a offset, void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, const u8 *buf, char top, size_t start_off, size_t len, NfaCallback cb, void *ctxt) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u8 s = top ? m->start_anchored : *(u8 *)state; @@ -1037,7 +1057,7 @@ void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, const u8 *buf, char top, size_t start_off, size_t len, NfaCallback cb, void *ctxt) { - const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mcclellan *m = getImplNfa(nfa); u16 s = top ? m->start_anchored : unaligned_load_u16(state); diff --git a/src/nfa/mcclellan.h b/src/nfa/mcclellan.h index 6b4ec2d5..677265f5 100644 --- a/src/nfa/mcclellan.h +++ b/src/nfa/mcclellan.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,6 +47,7 @@ char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecMcClellan8_queueInitState(const struct NFA *n, struct mq *q); char nfaExecMcClellan8_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -70,6 +71,7 @@ char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q); char nfaExecMcClellan16_queueInitState(const struct NFA *n, struct mq *q); char nfaExecMcClellan16_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); diff --git a/src/nfa/mcclellancompile_util.cpp b/src/nfa/mcclellancompile_util.cpp index 234574d8..2f1ffa02 100644 --- a/src/nfa/mcclellancompile_util.cpp +++ b/src/nfa/mcclellancompile_util.cpp @@ -395,4 +395,36 @@ dstate_id_t get_sds_or_proxy(const raw_dfa &raw) { } } +static +bool can_die_early(const raw_dfa &raw, dstate_id_t s, + map &visited, u32 age_limit) { + if (contains(visited, s) && visited[s] >= age_limit) { + /* we have already visited (or are in the process of visiting) here with + * a looser limit. 
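+         * A looser (larger) limit explores a superset of the paths that a
+         * tighter one can reach, so if that visit found no dead path, this
+         * one cannot either; recording the limit before recursing also cuts
+         * cycles, since a state still being explored is skipped the same way.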
*/ + return false; + } + visited[s] = age_limit; + + if (s == DEAD_STATE) { + return true; + } + + if (age_limit == 0) { + return false; + } + + for (const auto &next : raw.states[s].next) { + if (can_die_early(raw, next, visited, age_limit - 1)) { + return true; + } + } + + return false; +} + +bool can_die_early(const raw_dfa &raw, u32 age_limit) { + map visited; + return can_die_early(raw, raw.start_anchored, visited, age_limit); +} + } // namespace ue2 diff --git a/src/nfa/mcclellancompile_util.h b/src/nfa/mcclellancompile_util.h index 7b6c033a..3d3ee2e7 100644 --- a/src/nfa/mcclellancompile_util.h +++ b/src/nfa/mcclellancompile_util.h @@ -57,6 +57,8 @@ size_t hash_dfa(const raw_dfa &rdfa); dstate_id_t get_sds_or_proxy(const raw_dfa &raw); +bool can_die_early(const raw_dfa &raw, u32 age_limit); + } // namespace ue2 #endif diff --git a/src/nfa/mpv.h b/src/nfa/mpv.h index dc5dad6f..a3f90719 100644 --- a/src/nfa/mpv.h +++ b/src/nfa/mpv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,6 @@ struct NFA; char nfaExecMpv0_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecMpv0_reportCurrent(const struct NFA *n, struct mq *q); -char nfaExecMpv0_inAccept(const struct NFA *n, ReportID report, struct mq *q); char nfaExecMpv0_queueInitState(const struct NFA *n, struct mq *q); char nfaExecMpv0_initCompressedState(const struct NFA *n, u64a offset, void *state, u8 key); @@ -47,6 +46,7 @@ char nfaExecMpv0_expandState(const struct NFA *nfa, void *dest, const void *src, #define nfaExecMpv0_testEOD NFA_API_NO_IMPL #define nfaExecMpv0_inAccept NFA_API_NO_IMPL +#define nfaExecMpv0_inAnyAccept NFA_API_NO_IMPL #define nfaExecMpv0_QR NFA_API_NO_IMPL #define nfaExecMpv0_Q2 NFA_API_NO_IMPL /* for non-chained suffixes. */ #define nfaExecMpv0_B_Reverse NFA_API_NO_IMPL diff --git a/src/nfa/nfa_api.h b/src/nfa/nfa_api.h index 84f5c4a0..dad3894a 100644 --- a/src/nfa/nfa_api.h +++ b/src/nfa/nfa_api.h @@ -175,10 +175,16 @@ char nfaReportCurrentMatches(const struct NFA *nfa, struct mq *q); */ char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q); +/** + * Returns non-zero if the NFA is in any accept state regardless of report + * ID. + */ +char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q); + /** * Process the queued commands on the given NFA up to end or the first match. * - * Note: This version is meant for rose prefix NFAs: + * Note: This version is meant for rose prefix/infix NFAs: * - never uses a callback * - loading of state at a point in history is not special cased * @@ -187,9 +193,9 @@ char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q); * end with some variant of end. The location field of the events must * be monotonically increasing. If not all the data was processed during * the call, the queue is updated to reflect the remaining work. - * @param report we are interested in, if set at the end of the scan returns - * @ref MO_MATCHES_PENDING. If no report is desired, MO_INVALID_IDX should - * be passed in. + * @param report we are interested in. If the given report will be raised at + * the end location, the function returns @ref MO_MATCHES_PENDING. If no + * match information is desired, MO_INVALID_IDX should be passed in. 
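+ * (Engines run through this entry point never deliver callbacks: a match at
+ * the end location is signalled by the @ref MO_MATCHES_PENDING return value,
+ * and the engine can additionally be queried with nfaInAcceptState() or
+ * nfaInAnyAcceptState().)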
 * @return @ref MO_ALIVE if the nfa is still active with no matches pending,
 * and @ref MO_MATCHES_PENDING if there are matches pending, 0 if not
 * alive
diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c
index 95b1898e..9591cad5 100644
--- a/src/nfa/nfa_api_dispatch.c
+++ b/src/nfa/nfa_api_dispatch.c
@@ -228,7 +228,6 @@ char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end) {
     assert(q);
     assert(end >= 0);
-    assert(q->context);
     assert(q->state);
     assert(q->cur < q->end);
     assert(q->end <= MAX_MQE_LEN);
@@ -285,6 +284,11 @@ char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q) {
     return 0;
 }
 
+char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q) {
+    DISPATCH_BY_NFA_TYPE(_inAnyAccept(nfa, q));
+    return 0;
+}
+
 char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID r) {
     DEBUG_PRINTF("nfa=%p\n", nfa);
 #ifdef DEBUG
diff --git a/src/nfa/nfa_kind.h b/src/nfa/nfa_kind.h
index 46d0bc4c..adc7045f 100644
--- a/src/nfa/nfa_kind.h
+++ b/src/nfa/nfa_kind.h
@@ -47,6 +47,7 @@ enum nfa_kind {
     NFA_OUTFIX, //!< "outfix" nfa not triggered by external events
     NFA_OUTFIX_RAW, //!< "outfix", but with unmanaged reports
     NFA_REV_PREFIX, //! reverse running prefixes (for som)
+    NFA_EAGER_PREFIX, //!< rose prefix that is also run up to matches
 };
 
 /** \brief True if this kind of engine is triggered by a top event. */
@@ -63,8 +64,10 @@ bool is_triggered(enum nfa_kind k) {
 }
 
 /**
- * \brief True if this kind of engine generates callback events when it
- * enters accept states.
+ * \brief True if this kind of engine actively checks for accept states,
+ * either to halt matching or to raise a callback. Only engines generated
+ * with this property should call nfaQueueExec() or
+ * nfaQueueExecToMatch().
  */
 inline
 bool generates_callbacks(enum nfa_kind k) {
@@ -73,6 +76,24 @@ bool generates_callbacks(enum nfa_kind k) {
     case NFA_OUTFIX:
     case NFA_OUTFIX_RAW:
     case NFA_REV_PREFIX:
+    case NFA_EAGER_PREFIX:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/**
+ * \brief True if this kind of engine has its state inspected to see if it is
+ * in an accept state. Engines generated with this property will commonly
+ * call nfaQueueExecRose(), nfaInAcceptState(), and nfaInAnyAcceptState().
+ */
+inline
+bool inspects_states_for_accepts(enum nfa_kind k) {
+    switch (k) {
+    case NFA_PREFIX:
+    case NFA_INFIX:
+    case NFA_EAGER_PREFIX:
         return true;
     default:
         return false;
diff --git a/src/nfagraph/ng_execute.cpp b/src/nfagraph/ng_execute.cpp
index 92bef737..46307cd5 100644
--- a/src/nfagraph/ng_execute.cpp
+++ b/src/nfagraph/ng_execute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -324,4 +324,49 @@ flat_set<NFAVertex> execute_graph(const NGHolder &running_g,
                        initial_states);
 }
 
+static
+bool can_die_early(const NGHolder &g, const vector<StateInfo> &info,
+                   const dynamic_bitset<> &s,
+                   map<dynamic_bitset<>, u32> &visited, u32 age_limit) {
+    if (contains(visited, s) && visited[s] >= age_limit) {
+        /* we have already visited (or are in the process of visiting) here
+         * with a looser limit.
*/ + return false; + } + visited[s] = age_limit; + + if (s.none()) { + DEBUG_PRINTF("dead\n"); + return true; + } + + if (age_limit == 0) { + return false; + } + + dynamic_bitset<> all_succ(s.size()); + step(g, info, s, &all_succ); + all_succ.reset(NODE_START_DOTSTAR); + + for (u32 i = 0; i < N_CHARS; i++) { + dynamic_bitset<> next = all_succ; + filter_by_reach(info, &next, CharReach(i)); + if (can_die_early(g, info, next, visited, age_limit - 1)) { + return true; + } + } + + return false; +} + +bool can_die_early(const NGHolder &g, u32 age_limit) { + if (proper_out_degree(g.startDs, g)) { + return false; + } + const vector &info = makeInfoTable(g); + map, u32> visited; + return can_die_early(g, info, makeStateBitset(g, {g.start}), visited, + age_limit); +} + } // namespace ue2 diff --git a/src/nfagraph/ng_execute.h b/src/nfagraph/ng_execute.h index e2c7c72d..bdcfecfd 100644 --- a/src/nfagraph/ng_execute.h +++ b/src/nfagraph/ng_execute.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,6 +64,9 @@ flat_set execute_graph(const NGHolder &g, const NGHolder &input_dag, const flat_set &input_start_states, const flat_set &initial); +/* returns true if it is possible for the nfa to die within age_limit bytes */ +bool can_die_early(const NGHolder &g, u32 age_limit); + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_holder.h b/src/nfagraph/ng_holder.h index 3243f665..07f21d0f 100644 --- a/src/nfagraph/ng_holder.h +++ b/src/nfagraph/ng_holder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -315,15 +315,26 @@ void remove_edges(const Container &c, NGHolder &h, bool renumber = true) { remove_edges(c.begin(), c.end(), h, renumber); } -static UNUSED +inline bool is_triggered(const NGHolder &g) { return is_triggered(g.kind); } -static UNUSED +inline bool generates_callbacks(const NGHolder &g) { return generates_callbacks(g.kind); } + +inline +bool has_managed_reports(const NGHolder &g) { + return has_managed_reports(g.kind); +} + +inline +bool inspects_states_for_accepts(const NGHolder &g) { + return inspects_states_for_accepts(g.kind); +} + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index 713fe370..af7779ba 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -373,7 +373,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, const map>> &triggers, bool compress_state, bool do_accel, bool impl_test_only, u32 hint, const CompileContext &cc) { - if (!generates_callbacks(h_in)) { + if (!has_managed_reports(h_in)) { rm = nullptr; } else { assert(rm); @@ -413,7 +413,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, set zombies = findZombies(*h, br_cyclic, state_ids, cc); - if (generates_callbacks(*h)) { + if (has_managed_reports(*h)) { assert(rm); remapReportsToPrograms(*h, *rm); } @@ -508,7 +508,7 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm, return true; } - if (!generates_callbacks(g)) { + if (!has_managed_reports(g)) { rm = nullptr; } else { assert(rm); @@ -547,7 +547,7 @@ void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm 
removeRedundancy(g, som); - if (rm && generates_callbacks(g)) { + if (rm && has_managed_reports(g)) { pruneHighlanderDominated(g, *rm); } @@ -560,7 +560,7 @@ void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm u32 countAccelStates(const NGHolder &g, const ReportManager *rm, const CompileContext &cc) { - if (!generates_callbacks(g)) { + if (!has_managed_reports(g)) { rm = nullptr; } else { assert(rm); diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index b1c6ff96..024cf2c1 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -531,9 +531,9 @@ unique_ptr buildMcClellan(const NGHolder &graph, DEBUG_PRINTF("attempting to build ?%d? mcclellan\n", (int)graph.kind); assert(allMatchStatesHaveReports(graph)); - bool prunable = grey.highlanderPruneDFA && generates_callbacks(graph); - assert(rm || !generates_callbacks(graph)); - if (!generates_callbacks(graph)) { + bool prunable = grey.highlanderPruneDFA && has_managed_reports(graph); + assert(rm || !has_managed_reports(graph)); + if (!has_managed_reports(graph)) { rm = nullptr; } diff --git a/src/nfagraph/ng_split.cpp b/src/nfagraph/ng_split.cpp index 42157e1e..75150136 100644 --- a/src/nfagraph/ng_split.cpp +++ b/src/nfagraph/ng_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -112,6 +112,12 @@ void splitLHS(const NGHolder &base, const vector &pivots, case NFA_SUFFIX: lhs->kind = NFA_INFIX; break; + case NFA_EAGER_PREFIX: + /* Current code should not be assigning eager until well after all the + * splitting is done. */ + assert(0); + lhs->kind = NFA_EAGER_PREFIX; + break; case NFA_REV_PREFIX: case NFA_OUTFIX_RAW: assert(0); @@ -154,6 +160,12 @@ void splitRHS(const NGHolder &base, const vector &pivots, case NFA_OUTFIX: rhs->kind = NFA_SUFFIX; break; + case NFA_EAGER_PREFIX: + /* Current code should not be assigning eager until well after all the + * splitting is done. 
*/
+        assert(0);
+        rhs->kind = NFA_INFIX;
+        break;
     case NFA_REV_PREFIX:
     case NFA_OUTFIX_RAW:
         assert(0);
diff --git a/src/rose/block.c b/src/rose/block.c
index 55323c2e..a40d229b 100644
--- a/src/rose/block.c
+++ b/src/rose/block.c
@@ -266,6 +266,86 @@ int roseBlockFloating(const struct RoseEngine *t, struct hs_scratch *scratch) {
     return can_stop_matching(scratch);
 }
 
+static rose_inline
+void runEagerPrefixesBlock(const struct RoseEngine *t,
+                           struct hs_scratch *scratch) {
+    if (!t->eagerIterOffset) {
+        return;
+    }
+
+    char *state = scratch->core_info.state;
+    u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into
+                                             * left_table */
+    const u32 arCount = t->activeLeftCount;
+    const u32 qCount = t->queueCount;
+    const struct LeftNfaInfo *left_table = getLeftTable(t);
+    const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset);
+
+    struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES];
+
+    u32 idx = 0;
+    u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state);
+    for (; ri != MMB_INVALID;
+         ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) {
+        const struct LeftNfaInfo *left = left_table + ri;
+        u32 qi = ri + t->leftfixBeginQueue;
+        DEBUG_PRINTF("leftfix %u/%u, maxLag=%u\n", ri, arCount, left->maxLag);
+
+        assert(!fatbit_isset(scratch->aqa, qCount, qi));
+        assert(left->eager);
+        assert(!left->infix);
+
+        struct mq *q = scratch->queues + qi;
+        const struct NFA *nfa = getNfaByQueue(t, qi);
+
+        if (scratch->core_info.len < nfa->minWidth) {
+            /* we know that there is not enough data for this to ever match, so
+             * we can immediately squash. */
+            mmbit_unset(ara, arCount, ri);
+            scratch->tctxt.groups &= left->squash_mask;
+        }
+
+        s64a loc = MIN(scratch->core_info.len, EAGER_STOP_OFFSET);
+
+        fatbit_set(scratch->aqa, qCount, qi);
+        initRoseQueue(t, qi, left, scratch);
+
+        pushQueueAt(q, 0, MQE_START, 0);
+        pushQueueAt(q, 1, MQE_TOP, 0);
+        pushQueueAt(q, 2, MQE_END, loc);
+        nfaQueueInitState(nfa, q);
+
+        char alive = nfaQueueExecToMatch(q->nfa, q, loc);
+
+        if (!alive) {
+            DEBUG_PRINTF("queue %u dead, squashing\n", qi);
+            mmbit_unset(ara, arCount, ri);
+            fatbit_unset(scratch->aqa, qCount, qi);
+            scratch->tctxt.groups &= left->squash_mask;
+        } else if (q->cur == q->end) {
+            assert(alive != MO_MATCHES_PENDING);
+            if (loc == (s64a)scratch->core_info.len) {
+                /* We know that the prefix does not match in the block so we
+                 * can squash the groups anyway even though it did not die */
+                /* TODO: if we knew the minimum lag the leftfix is checked at we
+                 * could make this check tighter */
+                DEBUG_PRINTF("queue %u has no match in block, squashing\n", qi);
+                mmbit_unset(ara, arCount, ri);
+                fatbit_unset(scratch->aqa, qCount, qi);
+                scratch->tctxt.groups &= left->squash_mask;
+            } else {
+                DEBUG_PRINTF("queue %u finished, nfa lives\n", qi);
+                q->cur = q->end = 0;
+                pushQueueAt(q, 0, MQE_START, loc);
+            }
+        } else {
+            assert(alive == MO_MATCHES_PENDING);
+            DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi);
+            q->end--; /* remove end item */
+        }
+    }
+}
+
 void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
     assert(t);
     assert(scratch);
@@ -314,6 +394,8 @@ void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
         hwlmExec(sbtable, scratch->core_info.buf, sblen, 0, roseCallback,
                  scratch, tctxt->groups);
     } else {
+        runEagerPrefixesBlock(t, scratch);
+
         if (roseBlockAnchored(t, scratch)) {
             return;
         }
diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h
index 860f7599..3794ac3f 100644
---
a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -424,7 +424,7 @@ char roseTestLeftfix(const struct RoseEngine *t, struct hs_scratch *scratch, } s64a loc = (s64a)end - ci->buf_offset - leftfixLag; - assert(loc >= q_cur_loc(q)); + assert(loc >= q_cur_loc(q) || left->eager); assert(leftfixReport != MO_INVALID_IDX); if (!is_infix && left->transient) { @@ -471,7 +471,13 @@ char roseTestLeftfix(const struct RoseEngine *t, struct hs_scratch *scratch, DEBUG_PRINTF("checking for report %u\n", leftfixReport); DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); return rv == MO_MATCHES_PENDING; + } else if (q_cur_loc(q) > loc) { + /* an eager leftfix may have already progressed past loc if there is no + * match at loc. */ + assert(left->eager); + return 0; } else { + assert(q_cur_loc(q) == loc); DEBUG_PRINTF("checking for report %u\n", leftfixReport); char rv = nfaInAcceptState(q->nfa, leftfixReport, q); DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index ae155361..fe2c259e 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -1038,6 +1038,7 @@ bool canImplementGraph(RoseBuildImpl *tbi, const RoseInGraph &in, NGHolder &h, return false; } break; + case NFA_EAGER_PREFIX: case NFA_REV_PREFIX: case NFA_OUTFIX_RAW: DEBUG_PRINTF("kind %u\n", (u32)h.kind); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 3f36a05e..3f56b101 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -50,6 +50,7 @@ #include "nfa/nfa_build_util.h" #include "nfa/nfa_internal.h" #include "nfa/shufticompile.h" +#include "nfagraph/ng_execute.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_lbr.h" #include "nfagraph/ng_limex.h" @@ -1046,8 +1047,9 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, // streaming mode. const bool compress_state = !is_transient; - assert(!left.graph() - || left.graph()->kind == (is_prefix ? NFA_PREFIX : NFA_INFIX)); + assert(is_prefix || !left.graph() || left.graph()->kind == NFA_INFIX); + assert(!is_prefix || !left.graph() || left.graph()->kind == NFA_PREFIX + || left.graph()->kind == NFA_EAGER_PREFIX); // Holder should be implementable as an NFA at the very least. if (!left.dfa() && left.graph()) { @@ -1089,7 +1091,9 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, if (!n && left.graph()) { map>> triggers; - findTriggerSequences(tbi, infixTriggers.at(left), &triggers); + if (left.graph()->kind == NFA_INFIX) { + findTriggerSequences(tbi, infixTriggers.at(left), &triggers); + } n = constructNFA(*left.graph(), nullptr, fixed_depth_tops, triggers, compress_state, cc); } @@ -1125,17 +1129,309 @@ void setLeftNfaProperties(NFA &n, const left_id &left) { // graph. 
} +static +void appendTailToHolder(NGHolder &h, const flat_set &reports, + const vector &starts, + const vector &tail) { + assert(!tail.empty()); + NFAVertex curr = add_vertex(h); + for (NFAVertex v : starts) { + assert(!edge(v, h.acceptEod, h).second); + assert(h[v].reports == reports); + h[v].reports.clear(); + remove_edge(v, h.accept, h); + add_edge(v, curr, h); + } + auto it = tail.begin(); + h[curr].char_reach = *it; + ++it; + while (it != tail.end()) { + NFAVertex old = curr; + curr = add_vertex(h); + add_edge(old, curr, h); + assert(!it->none()); + h[curr].char_reach = *it; + ++it; + } + + h[curr].reports = reports; + add_edge(curr, h.accept, h); +} + +static +void appendTailToHolder(NGHolder &h, const vector &tail) { + assert(in_degree(h.acceptEod, h) == 1); + assert(!tail.empty()); + + map, vector > reporters; + for (auto v : inv_adjacent_vertices_range(h.accept, h)) { + reporters[h[v].reports].push_back(v); + } + + for (const auto &e : reporters) { + appendTailToHolder(h, e.first, e.second, tail); + } + + h.renumberEdges(); +} + +static +u32 decreaseLag(const RoseBuildImpl &build, NGHolder &h, + const vector &succs) { + const RoseGraph &rg = build.g; + static const size_t MAX_RESTORE_LEN = 5; + + vector restored(MAX_RESTORE_LEN); + for (RoseVertex v : succs) { + u32 lag = rg[v].left.lag; + for (u32 lit_id : rg[v].literals) { + u32 delay = build.literals.right.at(lit_id).delay; + const ue2_literal &literal = build.literals.right.at(lit_id).s; + assert(lag <= literal.length() + delay); + size_t base = literal.length() + delay - lag; + if (base >= literal.length()) { + return 0; + } + size_t len = literal.length() - base; + len = MIN(len, restored.size()); + restored.resize(len); + auto lit_it = literal.begin() + base; + for (u32 i = 0; i < len; i++) { + assert(lit_it != literal.end()); + restored[i] |= *lit_it; + ++lit_it; + } + } + } + + assert(!restored.empty()); + + appendTailToHolder(h, restored); + + return restored.size(); +} + +#define EAGER_DIE_BEFORE_LIMIT 10 + +struct eager_info { + shared_ptr new_graph; + u32 lag_adjust = 0; +}; + +static +bool checkSuitableForEager(bool is_prefix, const left_id &left, + const RoseBuildImpl &build, + const vector &succs, + rose_group squash_mask, rose_group initial_groups, + eager_info &ei, const CompileContext &cc) { + DEBUG_PRINTF("checking prefix --> %016llx...\n", squash_mask); + + const RoseGraph &rg = build.g; + + if (!is_prefix) { + DEBUG_PRINTF("not prefix\n"); + return false; /* only prefixes (for now...) */ + } + + if ((initial_groups & squash_mask) == initial_groups) { + DEBUG_PRINTF("no squash -- useless\n"); + return false; + } + + for (RoseVertex s : succs) { + if (build.isInETable(s) + || contains(rg[s].literals, build.eod_event_literal_id)) { + return false; /* Ignore EOD related prefixes */ + } + } + + if (left.dfa()) { + const raw_dfa &dfa = *left.dfa(); + if (dfa.start_floating != DEAD_STATE) { + return false; /* not purely anchored */ + } + if (!dfa.states[dfa.start_anchored].reports.empty()) { + return false; /* vacuous (todo: handle?) */ + } + + if (!can_die_early(dfa, EAGER_DIE_BEFORE_LIMIT)) { + return false; + } + ei.new_graph = rg[succs[0]].left.graph; + } else if (left.graph()) { + const NGHolder &g = *left.graph(); + if (proper_out_degree(g.startDs, g)) { + return false; /* not purely anchored */ + } + if (is_match_vertex(g.start, g)) { + return false; /* vacuous (todo: handle?) 
*/ + } + + ei.new_graph = cloneHolder(*left.graph()); + auto gg = ei.new_graph; + gg->kind = NFA_EAGER_PREFIX; + + ei.lag_adjust = decreaseLag(build, *gg, succs); + + if (!can_die_early(*gg, EAGER_DIE_BEFORE_LIMIT)) { + DEBUG_PRINTF("not eager as stuck alive\n"); + return false; + } + + /* We need to ensure that adding in the literals does not cause us to no + * longer be able to build an nfa. */ + bool ok = isImplementableNFA(*gg, nullptr, cc); + if (!ok) { + return false; + } + } else { + DEBUG_PRINTF("unable to determine if good for eager running\n"); + return false; + } + + DEBUG_PRINTF("eager prefix\n"); + return true; +} + +static +left_id updateLeftfixWithEager(RoseGraph &g, const eager_info &ei, + const vector &succs) { + u32 lag_adjust = ei.lag_adjust; + auto gg = ei.new_graph; + for (RoseVertex v : succs) { + g[v].left.graph = gg; + assert(g[v].left.lag >= lag_adjust); + g[v].left.lag -= lag_adjust; + DEBUG_PRINTF("added %u literal chars back, new lag %u\n", lag_adjust, + g[v].left.lag); + } + left_id leftfix = g[succs[0]].left; + + if (leftfix.graph()) { + assert(leftfix.graph()->kind == NFA_PREFIX + || leftfix.graph()->kind == NFA_EAGER_PREFIX); + leftfix.graph()->kind = NFA_EAGER_PREFIX; + } + if (leftfix.dfa()) { + assert(leftfix.dfa()->kind == NFA_PREFIX); + leftfix.dfa()->kind = NFA_EAGER_PREFIX; + } + + return leftfix; +} + +static +bool buildLeftfix(RoseBuildImpl &build, build_context &bc, bool prefix, u32 qi, + const map > &infixTriggers, + set *no_retrigger_queues, set *eager_queues, + const map &eager, + const vector &succs, left_id leftfix) { + RoseGraph &g = build.g; + const CompileContext &cc = build.cc; + const ReportManager &rm = build.rm; + + bool is_transient = contains(build.transient, leftfix); + rose_group squash_mask = build.rose_squash_masks.at(leftfix); + + DEBUG_PRINTF("making %sleftfix\n", is_transient ? "transient " : ""); + + if (contains(eager, leftfix)) { + eager_queues->insert(qi); + leftfix = updateLeftfixWithEager(g, eager.at(leftfix), succs); + } + + aligned_unique_ptr nfa; + // Need to build NFA, which is either predestined to be a Haig (in SOM mode) + // or could be all manner of things. + if (leftfix.haig()) { + nfa = goughCompile(*leftfix.haig(), build.ssm.somPrecision(), cc, rm); + } else { + nfa = makeLeftNfa(build, leftfix, prefix, is_transient, infixTriggers, + cc); + } + + if (!nfa) { + assert(!"failed to build leftfix"); + return false; + } + + setLeftNfaProperties(*nfa, leftfix); + + build.leftfix_queue_map.emplace(leftfix, qi); + nfa->queueIndex = qi; + + if (!prefix && !leftfix.haig() && leftfix.graph() + && nfaStuckOn(*leftfix.graph())) { + DEBUG_PRINTF("%u sticks on\n", qi); + no_retrigger_queues->insert(qi); + } + + DEBUG_PRINTF("built leftfix, qi=%u\n", qi); + add_nfa_to_blob(bc, *nfa); + + // Leftfixes can have stop alphabets. + vector stop(N_CHARS, 0); + /* haigs track som information - need more care */ + som_type som = leftfix.haig() ? SOM_LEFT : SOM_NONE; + if (leftfix.graph()) { + stop = findLeftOffsetStopAlphabet(*leftfix.graph(), som); + } else if (leftfix.castle()) { + stop = findLeftOffsetStopAlphabet(*leftfix.castle(), som); + } + + // Infix NFAs can have bounds on their queue lengths. 
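
decreaseLag() above computes, per successor literal, the tail characters that the old lag skipped over, ORs their reach into a tail of at most MAX_RESTORE_LEN positions, and appendTailToHolder() then splices that tail in front of each accept, so the eager prefix reports at the literal's end and the successor's lag can shrink by the restored length. A sketch of just the index arithmetic, using a plain string as a hypothetical stand-in for ue2_literal:

#include <algorithm>
#include <cassert>
#include <string>

// For a leftfix checked with `lag` bytes of slack against a literal with
// `delay` delayed bytes, return the literal suffix the old lag skipped
// over; appending it to the prefix lets the lag shrink by that length.
static std::string restoredTail(const std::string &lit, size_t delay,
                                size_t lag, size_t max_restore = 5) {
    assert(lag <= lit.size() + delay);
    size_t base = lit.size() + delay - lag;
    if (base >= lit.size()) {
        return {}; // the delay alone covers the lag; nothing to restore
    }
    return lit.substr(base, std::min(lit.size() - base, max_restore));
}

// e.g. restoredTail("foobar", 0, 3) == "bar": the eager prefix grows a
// "bar" tail and the successor's lag drops from 3 to 0.
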
+ u32 max_queuelen = UINT32_MAX; + if (!prefix) { + set lits; + for (RoseVertex v : succs) { + for (auto u : inv_adjacent_vertices_range(v, g)) { + for (u32 lit_id : g[u].literals) { + lits.insert(build.literals.right.at(lit_id).s); + } + } + } + DEBUG_PRINTF("%zu literals\n", lits.size()); + max_queuelen = findMaxInfixMatches(leftfix, lits); + if (max_queuelen < UINT32_MAX) { + max_queuelen++; + } + } + + u32 max_width; + if (is_transient) { + depth d = findMaxWidth(leftfix); + assert(d.is_finite()); + max_width = d; + } else { + max_width = 0; + } + + u8 cm_count = 0; + CharReach cm_cr; + if (cc.grey.allowCountingMiracles) { + findCountingMiracleInfo(leftfix, stop, &cm_count, &cm_cr); + } + + for (RoseVertex v : succs) { + bc.leftfix_info.emplace(v, left_build_info(qi, g[v].left.lag, max_width, + squash_mask, stop, + max_queuelen, cm_count, + cm_cr)); + } + + return true; +} + static bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, set *no_retrigger_queues, - bool do_prefix) { - const RoseGraph &g = tbi.g; + set *eager_queues, bool do_prefix) { + RoseGraph &g = tbi.g; const CompileContext &cc = tbi.cc; - const ReportManager &rm = tbi.rm; - - ue2::unordered_map seen; // already built queue indices map > infixTriggers; + vector order; + unordered_map > succs; findInfixTriggers(tbi, &infixTriggers); for (auto v : vertices_range(g)) { @@ -1143,6 +1439,7 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, continue; } + assert(tbi.isNonRootSuccessor(v) != tbi.isRootSuccessor(v)); bool is_prefix = tbi.isRootSuccessor(v); if (do_prefix != is_prefix) { @@ -1156,8 +1453,6 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, // our in-edges. assert(roseHasTops(g, v)); - u32 qi; // queue index, set below. - u32 lag = g[v].left.lag; bool is_transient = contains(tbi.transient, leftfix); // Transient leftfixes can sometimes be implemented solely with @@ -1173,95 +1468,42 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, } } - if (contains(seen, leftfix)) { - // NFA already built. - qi = seen[leftfix]; - assert(contains(bc.engineOffsets, qi)); - DEBUG_PRINTF("sharing leftfix, qi=%u\n", qi); - } else { - DEBUG_PRINTF("making %sleftfix\n", is_transient ? "transient " : ""); - - aligned_unique_ptr nfa; - - // Need to build NFA, which is either predestined to be a Haig (in - // SOM mode) or could be all manner of things. 
- if (leftfix.haig()) { - nfa = goughCompile(*leftfix.haig(), tbi.ssm.somPrecision(), cc, - rm); - } else { - assert(tbi.isNonRootSuccessor(v) != tbi.isRootSuccessor(v)); - nfa = makeLeftNfa(tbi, leftfix, is_prefix, is_transient, - infixTriggers, cc); - } - - if (!nfa) { - assert(!"failed to build leftfix"); - return false; - } - - setLeftNfaProperties(*nfa, leftfix); - - qi = qif.get_queue(); - tbi.leftfix_queue_map.emplace(leftfix, qi); - nfa->queueIndex = qi; - - if (!is_prefix && !leftfix.haig() && leftfix.graph() && - nfaStuckOn(*leftfix.graph())) { - DEBUG_PRINTF("%u sticks on\n", qi); - no_retrigger_queues->insert(qi); - } - - DEBUG_PRINTF("built leftfix, qi=%u\n", qi); - add_nfa_to_blob(bc, *nfa); - seen.emplace(leftfix, qi); + if (!contains(succs, leftfix)) { + order.push_back(leftfix); } + succs[leftfix].push_back(v); + } + + rose_group initial_groups = tbi.getInitialGroups(); + rose_group combined_eager_squashed_mask = ~0ULL; + + map eager; + + for (const left_id &leftfix : order) { + const auto &left_succs = succs[leftfix]; + rose_group squash_mask = tbi.rose_squash_masks.at(leftfix); + eager_info ei; - // Leftfixes can have stop alphabets. - vector stop(N_CHARS, 0); - /* haigs track som information - need more care */ - som_type som = leftfix.haig() ? SOM_LEFT : SOM_NONE; - if (leftfix.graph()) { - stop = findLeftOffsetStopAlphabet(*leftfix.graph(), som); - } else if (leftfix.castle()) { - stop = findLeftOffsetStopAlphabet(*leftfix.castle(), som); + if (checkSuitableForEager(do_prefix, leftfix, tbi, left_succs, + squash_mask, initial_groups, ei, cc)) { + eager[leftfix] = ei; + combined_eager_squashed_mask &= squash_mask; + DEBUG_PRINTF("combo %016llx...\n", combined_eager_squashed_mask); } + } - // Infix NFAs can have bounds on their queue lengths. 
- u32 max_queuelen = UINT32_MAX; - if (!is_prefix) { - set lits; - for (auto u : inv_adjacent_vertices_range(v, tbi.g)) { - for (u32 lit_id : tbi.g[u].literals) { - lits.insert(tbi.literals.right.at(lit_id).s); - } - } - DEBUG_PRINTF("%zu literals\n", lits.size()); - max_queuelen = findMaxInfixMatches(leftfix, lits); - if (max_queuelen < UINT32_MAX) { - max_queuelen++; - } - } + if (do_prefix && combined_eager_squashed_mask & initial_groups) { + DEBUG_PRINTF("eager groups won't squash everyone - be lazy\n"); + eager_queues->clear(); + eager.clear(); + } - u32 max_width; - if (is_transient) { - depth d = findMaxWidth(leftfix); - assert(d.is_finite()); - max_width = d; - } else { - max_width = 0; - } - - u8 cm_count = 0; - CharReach cm_cr; - if (cc.grey.allowCountingMiracles) { - findCountingMiracleInfo(leftfix, stop, &cm_count, &cm_cr); - } - - bc.leftfix_info.emplace( - v, left_build_info(qi, lag, max_width, squash_mask, stop, - max_queuelen, cm_count, cm_cr)); + for (const left_id &leftfix : order) { + buildLeftfix(tbi, bc, do_prefix, qif.get_queue(), infixTriggers, + no_retrigger_queues, eager_queues, eager, succs[leftfix], + leftfix); } return true; @@ -1613,9 +1855,11 @@ void buildCountingMiracles(RoseBuildImpl &build, build_context &bc) { } } +/* Note: buildNfas may reduce the lag for vertices that have prefixes */ static bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, - set *no_retrigger_queues, u32 *leftfixBeginQueue) { + set *no_retrigger_queues, set *eager_queues, + u32 *leftfixBeginQueue) { assignSuffixQueues(tbi, bc); if (!buildSuffixes(tbi, bc, no_retrigger_queues)) { @@ -1624,11 +1868,13 @@ bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, *leftfixBeginQueue = qif.allocated_count(); - if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, true)) { + if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues, + true)) { return false; } - if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, false)) { + if (!buildLeftfixes(tbi, bc, qif, no_retrigger_queues, eager_queues, + false)) { return false; } @@ -1672,10 +1918,10 @@ static void findTransientQueues(const map &leftfix_info, set *out) { DEBUG_PRINTF("curating transient queues\n"); - for (const auto &rbi : leftfix_info | map_values) { - if (rbi.transient) { - DEBUG_PRINTF("q %u is transient\n", rbi.queue); - out->insert(rbi.queue); + for (const auto &build : leftfix_info | map_values) { + if (build.transient) { + DEBUG_PRINTF("q %u is transient\n", build.queue); + out->insert(build.queue); } } } @@ -3301,9 +3547,9 @@ void assignStateIndices(const RoseBuildImpl &build, build_context &bc) { } static -bool hasUsefulStops(const left_build_info &rbi) { +bool hasUsefulStops(const left_build_info &build) { for (u32 i = 0; i < N_CHARS; i++) { - if (rbi.stopAlphabet[i]) { + if (build.stopAlphabet[i]) { return true; } } @@ -3312,6 +3558,7 @@ bool hasUsefulStops(const left_build_info &rbi) { static void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, + const set &eager_queues, u32 leftfixBeginQueue, u32 leftfixCount, vector &leftTable, u32 *laggedRoseCount, size_t *history) { @@ -3371,6 +3618,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, DEBUG_PRINTF("mw = %u\n", lbi.transient); left.transient = verify_u8(lbi.transient); left.infix = tbi.isNonRootSuccessor(v); + left.eager = contains(eager_queues, lbi.queue); // A rose has a lagIndex if it's non-transient and we are // streaming. 
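
The rewritten buildLeftfixes() above first groups vertices by left_id (order/succs), then screens each group with checkSuitableForEager() before any queue is allocated: an individual prefix is only a candidate if its squash mask can actually clear some initially-on literal groups, and eagerness is abandoned wholesale when the candidates' combined squash masks would still leave an initial group running (the "won't squash everyone - be lazy" case). A sketch of the two mask tests, with hypothetical helper names:

#include <cstdint>
#include <vector>

using rose_group = uint64_t;

// An individual prefix is only useful to run eagerly if its squash mask
// can clear at least one of the groups that start switched on.
static bool squashesSomething(rose_group squash_mask,
                              rose_group initial_groups) {
    return (initial_groups & squash_mask) != initial_groups;
}

// Keep the scheme only if the eager prefixes, should they all die, can
// between them squash every initially-on group; this mirrors the
// combined_eager_squashed_mask test in buildLeftfixes().
static bool keepEagerScheme(const std::vector<rose_group> &eager_masks,
                            rose_group initial_groups) {
    rose_group combined = ~rose_group{0};
    for (rose_group m : eager_masks) {
        combined &= m;
    }
    return (combined & initial_groups) == 0;
}
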
@@ -4271,6 +4519,25 @@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { } } +static +u32 buildEagerQueueIter(const set &eager, u32 leftfixBeginQueue, + u32 queue_count, + build_context &bc) { + if (eager.empty()) { + return 0; + } + + vector vec; + for (u32 q : eager) { + assert(q >= leftfixBeginQueue); + vec.push_back(q - leftfixBeginQueue); + } + + vector iter; + mmbBuildSparseIterator(iter, vec, queue_count - leftfixBeginQueue); + return addIteratorToTable(bc, iter); +} + aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { DerivedBoundaryReports dboundary(boundary); @@ -4305,7 +4572,10 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { u32 outfixEndQueue = qif.allocated_count(); u32 leftfixBeginQueue = outfixEndQueue; - if (!buildNfas(*this, bc, qif, &no_retrigger_queues, + set eager_queues; + + /* Note: buildNfas may reduce the lag for vertices that have prefixes */ + if (!buildNfas(*this, bc, qif, &no_retrigger_queues, &eager_queues, &leftfixBeginQueue)) { return nullptr; } @@ -4325,7 +4595,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { u32 laggedRoseCount = 0; vector leftInfoTable; - buildLeftInfoTable(*this, bc, leftfixBeginQueue, + buildLeftInfoTable(*this, bc, eager_queues, leftfixBeginQueue, queue_count - leftfixBeginQueue, leftInfoTable, &laggedRoseCount, &historyRequired); @@ -4340,6 +4610,8 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { buildActiveLeftIter(leftInfoTable, activeLeftIter); u32 lastByteOffset = buildLastByteIter(g, bc); + u32 eagerIterOffset = buildEagerQueueIter(eager_queues, leftfixBeginQueue, + queue_count, bc); // Enforce role table resource limit. if (num_vertices(g) > cc.grey.limitRoseRoleCount) { @@ -4513,6 +4785,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->activeArrayCount = activeArrayCount; engine->activeLeftCount = activeLeftCount; engine->queueCount = queue_count; + engine->eagerIterOffset = eagerIterOffset; engine->handledKeyCount = bc.handledKeys.size(); engine->group_weak_end = group_weak_end; diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 46d1676d..2c3f326e 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -76,6 +76,8 @@ string to_string(nfa_kind k) { return "REV_PREFIX"; case NFA_OUTFIX_RAW: return "OUTFIX_RAW"; + case NFA_EAGER_PREFIX: + return "EAGER_PREFIX"; } assert(0); return "?"; diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 5f1871e4..71940e07 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -150,7 +150,7 @@ struct left_id { : g(in.graph.get()), c(in.castle.get()), d(in.dfa.get()), h(in.haig.get()), dfa_min_width(in.dfa_min_width), dfa_max_width(in.dfa_max_width) { - assert(!g || !generates_callbacks(*g)); + assert(!g || !has_managed_reports(*g)); } bool operator==(const left_id &b) const { bool rv = g == b.g && c == b.c && h == b.h && d == b.d; diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index 9f55dbf2..1d63c71a 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -605,6 +605,9 @@ void dumpNfaNotes(ofstream &fout, const RoseEngine *t, const NFA *n) { } const LeftNfaInfo *left = getLeftInfoByQueue(t, qindex); + if (left->eager) { + fout << "eager "; + } if (left->transient) { fout << "transient " << (u32)left->transient << " "; } @@ -1018,6 +1021,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { DUMP_U32(t, activeArrayCount); DUMP_U32(t, 
activeLeftCount); DUMP_U32(t, queueCount); + DUMP_U32(t, eagerIterOffset); DUMP_U32(t, handledKeyCount); DUMP_U32(t, leftOffset); DUMP_U32(t, roseCount); diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 9dd17350..5b6a9dc6 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -144,6 +144,7 @@ struct LeftNfaInfo { u32 stopTable; // stop table index, or ROSE_OFFSET_INVALID u8 transient; /**< 0 if not transient, else max width of transient prefix */ char infix; /* TODO: make flags */ + char eager; /**< nfa should be run eagerly to first match or death */ char eod_check; /**< nfa is used by the event eod literal */ u32 countingMiracleOffset; /** if not 0, offset to RoseCountingMiracle. */ rose_group squash_mask; /* & mask applied when rose nfa dies */ @@ -366,6 +367,9 @@ struct RoseEngine { u32 activeLeftCount; //number of nfas tracked in the active rose array u32 queueCount; /**< number of nfa queues */ + u32 eagerIterOffset; /**< offset to sparse iter for eager prefixes or 0 if + * none */ + /** \brief Number of keys used by CHECK_SET_HANDLED instructions in role * programs. Used to size the handled_roles fatbit in scratch. */ u32 handledKeyCount; diff --git a/src/rose/runtime.h b/src/rose/runtime.h index f7f6641d..60c7d34b 100644 --- a/src/rose/runtime.h +++ b/src/rose/runtime.h @@ -55,6 +55,11 @@ #define rose_inline really_inline +/* Maximum offset that we will eagerly run prefixes to. Beyond this point, eager + * prefixes are always run in exactly the same way as normal prefixes. */ +#define EAGER_STOP_OFFSET 64 + + static really_inline const void *getByOffset(const struct RoseEngine *t, u32 offset) { assert(offset < t->size); diff --git a/src/rose/stream.c b/src/rose/stream.c index ffe965dd..181bfe65 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -423,6 +423,92 @@ void do_rebuild(const struct RoseEngine *t, const struct HWLM *ftable, assert(!can_stop_matching(scratch)); } +static rose_inline +void runEagerPrefixesStream(const struct RoseEngine *t, + struct hs_scratch *scratch) { + if (!t->eagerIterOffset + || scratch->core_info.buf_offset >= EAGER_STOP_OFFSET) { + return; + } + + char *state = scratch->core_info.state; + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const u32 qCount = t->queueCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u of %u, maxLag=%u\n", ri, arCount, left->maxLag); + + assert(!fatbit_isset(scratch->aqa, qCount, qi)); + assert(left->eager); + assert(!left->infix); + + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + s64a loc = MIN(scratch->core_info.len, + EAGER_STOP_OFFSET - scratch->core_info.buf_offset); + + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + + if (scratch->core_info.buf_offset) { + s64a sp = left->transient ? 
-(s64a)scratch->core_info.hlen : -(s64a)loadRoseDelay(t, state, left);
+            pushQueueAt(q, 0, MQE_START, sp);
+            if (scratch->core_info.buf_offset + sp > 0) {
+                loadStreamState(nfa, q, sp);
+                /* if the leftfix is currently in a match state, we cannot
+                 * advance it. */
+                if (nfaInAnyAcceptState(nfa, q)) {
+                    continue;
+                }
+                pushQueueAt(q, 1, MQE_END, loc);
+            } else {
+                pushQueueAt(q, 1, MQE_TOP, sp);
+                pushQueueAt(q, 2, MQE_END, loc);
+                nfaQueueInitState(q->nfa, q);
+            }
+        } else {
+            pushQueueAt(q, 0, MQE_START, 0);
+            pushQueueAt(q, 1, MQE_TOP, 0);
+            pushQueueAt(q, 2, MQE_END, loc);
+            nfaQueueInitState(nfa, q);
+        }
+
+        char alive = nfaQueueExecToMatch(q->nfa, q, loc);
+
+        if (!alive) {
+            DEBUG_PRINTF("queue %u dead, squashing\n", qi);
+            mmbit_unset(ara, arCount, ri);
+            fatbit_unset(scratch->aqa, qCount, qi);
+            scratch->tctxt.groups &= left->squash_mask;
+        } else if (q->cur == q->end) {
+            assert(alive != MO_MATCHES_PENDING);
+            /* unlike in block mode we cannot squash groups if there is no match
+             * in this block as we need the groups on for later stream writes */
+            /* TODO: investigate possibility of a method to suppress groups for
+             * a single stream block. */
+            DEBUG_PRINTF("queue %u finished, nfa lives\n", qi);
+            q->cur = q->end = 0;
+            pushQueueAt(q, 0, MQE_START, loc);
+        } else {
+            assert(alive == MO_MATCHES_PENDING);
+            DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi);
+            q->end--; /* remove end item */
+        }
+    }
+}
+
 void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
     DEBUG_PRINTF("OH HAI\n");
     assert(t);
@@ -472,6 +558,8 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
         streamInitSufPQ(t, state, scratch);
     }
 
+    runEagerPrefixesStream(t, scratch);
+
     u32 alen = t->anchoredDistance > offset ?
                MIN(length + offset, t->anchoredDistance) - offset : 0;

From 76d96809f8369c0868fb02466b32b27749e35fcf Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Wed, 22 Jun 2016 16:23:36 +1000
Subject: [PATCH 073/166] rose: move roseRunProgram into its own unit

The roseRunProgram function had gotten very large for the number of
sites it was being inlined into, with negative effects on performance
in large cases. This change moves it into its own translation unit.
---
 CMakeLists.txt | 1 +
 src/rose/block.c | 7 +-
 src/rose/catchup.c | 11 +-
 src/rose/match.c | 56 +-
 src/rose/match.h | 86 ++-
 src/rose/program_runtime.c | 1421 ++++++++++++++++++++++++++++++++++
 src/rose/program_runtime.h | 1463 +-----------------------------------
 src/rose/stream.c | 7 +-
 8 files changed, 1549 insertions(+), 1503 deletions(-)
 create mode 100644 src/rose/program_runtime.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b6e0e94..ba3b29fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -500,6 +500,7 @@ set (hs_exec_SRCS
     src/rose/match.h
     src/rose/match.c
     src/rose/miracle.h
+    src/rose/program_runtime.c
     src/rose/program_runtime.h
     src/rose/runtime.h
     src/rose/rose.h
diff --git a/src/rose/block.c b/src/rose/block.c
index a40d229b..fc72c6e9 100644
--- a/src/rose/block.c
+++ b/src/rose/block.c
@@ -179,15 +179,12 @@ void roseBlockEodExec(const struct RoseEngine *t, u64a offset,
     const u64a som = 0;
     const size_t match_len = 0;
-    const char in_anchored = 0;
-    const char in_catchup = 0;
-    const char from_mpv = 0;
-    const char skip_mpv_catchup = 1;
+    const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP;
 
     // Note: we ignore the result, as this is the last thing to ever happen on
     // a scan.
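
Patch 073 also folds roseRunProgram()'s four separate char parameters (in_anchored, in_catchup, from_mpv, skip_mpv_catchup) into a single u8 flags argument, as the call sites below show. A self-contained sketch of that convention (the flag values here are placeholders; the real ROSE_PROG_FLAG_* constants are defined elsewhere in this patch):

#include <cstdint>

// Placeholder values for illustration only.
static const uint8_t ROSE_PROG_FLAG_IN_CATCHUP = 1U << 1;
static const uint8_t ROSE_PROG_FLAG_FROM_MPV = 1U << 2;

// Call sites compose the byte once...
static uint8_t composeFlags(bool from_mpv) {
    uint8_t flags = ROSE_PROG_FLAG_IN_CATCHUP;
    if (from_mpv) {
        flags |= ROSE_PROG_FLAG_FROM_MPV;
    }
    return flags;
}

// ...and the now out-of-line interpreter recovers each boolean with a
// mask test instead of taking four separate arguments:
static bool fromMpv(uint8_t flags) {
    return (flags & ROSE_PROG_FLAG_FROM_MPV) != 0;
}
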
roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + flags); } /** diff --git a/src/rose/catchup.c b/src/rose/catchup.c index c61079a8..f61cf390 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -39,6 +39,7 @@ #include "nfa/mpv.h" #include "som/som_runtime.h" #include "util/fatbit.h" +#include "report.h" typedef struct queue_match PQ_T; #define PQ_COMP(pqc_items, a, b) ((pqc_items)[a].loc < (pqc_items)[b].loc) @@ -51,10 +52,12 @@ int roseNfaRunProgram(const struct RoseEngine *rose, struct hs_scratch *scratch, u64a som, u64a offset, ReportID id, const char from_mpv) { const u32 program = id; const size_t match_len = 0; // Unused in this path. - const char in_anchored = 0; - const char in_catchup = 1; - roseRunProgram(rose, scratch, program, som, offset, match_len, in_anchored, - in_catchup, from_mpv, 0); + u8 flags = ROSE_PROG_FLAG_IN_CATCHUP; + if (from_mpv) { + flags |= ROSE_PROG_FLAG_FROM_MPV; + } + + roseRunProgram(rose, scratch, program, som, offset, match_len, flags); return can_stop_matching(scratch) ? MO_HALT_MATCHING : MO_CONTINUE_MATCHING; } diff --git a/src/rose/match.c b/src/rose/match.c index e89c8d3a..0311d496 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -27,14 +27,9 @@ */ #include "catchup.h" -#include "counting_miracle.h" -#include "infix.h" #include "match.h" -#include "miracle.h" #include "program_runtime.h" -#include "rose_program.h" #include "rose.h" -#include "som/som_runtime.h" #include "util/bitutils.h" #include "util/fatbit.h" @@ -98,13 +93,9 @@ hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, if (program) { const u64a som = 0; const size_t match_len = end - start + 1; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - UNUSED hwlmcb_rv_t rv = - roseRunProgram(t, scratch, program, som, real_end, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + const u8 flags = 0; + UNUSED hwlmcb_rv_t rv = roseRunProgram(t, scratch, program, som, + real_end, match_len, flags); assert(rv != HWLM_TERMINATE_MATCHING); } @@ -253,13 +244,9 @@ int roseAnchoredCallback(u64a end, u32 id, void *ctx) { const u32 *programs = getByOffset(t, t->litProgramOffset); assert(id < t->literalCount); const u64a som = 0; - const char in_anchored = 1; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; + const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; if (roseRunProgram(t, scratch, programs[id], som, real_end, match_len, - in_anchored, in_catchup, from_mpv, - skip_mpv_catchup) == HWLM_TERMINATE_MATCHING) { + flags) == HWLM_TERMINATE_MATCHING) { assert(can_stop_matching(scratch)); DEBUG_PRINTF("caller requested termination\n"); return MO_HALT_MATCHING; @@ -284,12 +271,8 @@ hwlmcb_rv_t roseProcessMatch(const struct RoseEngine *t, const u32 *programs = getByOffset(t, t->litProgramOffset); assert(id < t->literalCount); const u64a som = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - return roseRunProgram(t, scratch, programs[id], som, end, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + const u8 flags = 0; + return roseRunProgram(t, scratch, programs[id], som, end, match_len, flags); } static rose_inline @@ -594,12 +577,9 @@ hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, const struct RoseEngine *rose = ci->rose; const u32 *programs = 
getByOffset(rose, rose->litProgramOffset); assert(id < rose->literalCount); - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; + const u8 flags = 0; return roseRunProgram(rose, scratch, programs[id], som, real_end, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + flags); } /** @@ -635,13 +615,9 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, const u64a som = 0; const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 0; - hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, som, stream_offset, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, program, som, stream_offset, + match_len, flags); if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } @@ -659,13 +635,9 @@ int roseReportAdaptor_i(u64a som, u64a offset, ReportID id, void *context) { // Our match ID is the program offset. const u32 program = id; const size_t match_len = 0; // Unused in this path. - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, som, offset, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + roseRunProgram(rose, scratch, program, som, offset, match_len, flags); if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } diff --git a/src/rose/match.h b/src/rose/match.h index 5b587aec..48866d1f 100644 --- a/src/rose/match.h +++ b/src/rose/match.h @@ -29,17 +29,20 @@ #ifndef ROSE_MATCH_H #define ROSE_MATCH_H -#include "hwlm/hwlm.h" +#include "catchup.h" #include "runtime.h" #include "scratch.h" +#include "report.h" #include "rose_common.h" #include "rose_internal.h" #include "ue2common.h" +#include "hwlm/hwlm.h" #include "nfa/nfa_api.h" #include "nfa/nfa_api_queue.h" #include "nfa/nfa_api_util.h" #include "som/som_runtime.h" #include "util/bitutils.h" +#include "util/exhaust.h" #include "util/fatbit.h" #include "util/multibit.h" @@ -295,4 +298,85 @@ int roseHasInFlightMatches(const struct RoseEngine *t, char *state, return 0; } +static rose_inline +hwlmcb_rv_t roseHaltIfExhausted(const struct RoseEngine *t, + struct hs_scratch *scratch) { + struct core_info *ci = &scratch->core_info; + if (isAllExhausted(t, ci->exhaustionVector)) { + ci->status |= STATUS_EXHAUSTED; + scratch->tctxt.groups = 0; + DEBUG_PRINTF("all exhausted, termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t ensureQueueFlushed_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc, + char is_mpv, char in_catchup) { + struct RoseContext *tctxt = &scratch->tctxt; + u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + struct fatbit *activeQueues = scratch->aqa; + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + struct mq *q = &scratch->queues[qi]; + DEBUG_PRINTF("qcl %lld, loc: %lld, min (non mpv) match offset: %llu\n", + q_cur_loc(q), loc, tctxt->minNonMpvMatchOffset); + if (q_cur_loc(q) == loc) { + /* too many tops enqueued at the one spot; need to flatten this queue. + * We can use the full catchups as it will short circuit as we are + * already at this location. 
It also saves waking everybody up */ + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExec(q->nfa, q, loc); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (!in_catchup) { + if (is_mpv) { + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (loc + scratch->core_info.buf_offset + <= tctxt->minNonMpvMatchOffset) { + DEBUG_PRINTF("flushing chained\n"); + if (roseCatchUpMPV(t, loc, scratch) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + goto done_queue_empty; + } + } + + if (roseCatchUpTo(t, scratch, loc + scratch->core_info.buf_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } else { + /* we must be a chained nfa */ + assert(is_mpv); + DEBUG_PRINTF("flushing chained\n"); + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (roseCatchUpMPV(t, loc, scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } +done_queue_empty: + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(q->nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(activeQueues, qCount, qi); + } + + assert(!isQueueFull(q)); + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +hwlmcb_rv_t ensureQueueFlushed(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc) { + return ensureQueueFlushed_i(t, scratch, qi, loc, 0, 0); +} + #endif diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c new file mode 100644 index 00000000..73a9e974 --- /dev/null +++ b/src/rose/program_runtime.c @@ -0,0 +1,1421 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: program interpreter. 
+ */ + +#include "program_runtime.h" + +#include "catchup.h" +#include "counting_miracle.h" +#include "infix.h" +#include "match.h" +#include "miracle.h" +#include "report.h" +#include "rose.h" +#include "rose_internal.h" +#include "rose_program.h" +#include "rose_types.h" +#include "runtime.h" +#include "scratch.h" +#include "ue2common.h" +#include "util/compare.h" +#include "util/fatbit.h" +#include "util/multibit.h" + +static rose_inline +int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, + const u8 *and_mask, const u8 *exp_mask) { + const u8 *data; + + // If the check works over part of the history and part of the buffer, we + // create a temporary copy of the data in here so it's contiguous. + u8 temp[MAX_MASK2_WIDTH]; + + s64a buffer_offset = (s64a)end - ci->buf_offset; + DEBUG_PRINTF("rel offset %lld\n", buffer_offset); + if (buffer_offset >= mask_rewind) { + data = ci->buf + buffer_offset - mask_rewind; + DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, + ci->buf, mask_rewind); + } else if (buffer_offset <= 0) { + data = ci->hbuf + ci->hlen + buffer_offset - mask_rewind; + DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, + ci->buf, mask_rewind); + } else { + u32 shortfall = mask_rewind - buffer_offset; + DEBUG_PRINTF("shortfall of %u, rewind %u hlen %zu\n", shortfall, + mask_rewind, ci->hlen); + data = temp; + memcpy(temp, ci->hbuf + ci->hlen - shortfall, shortfall); + memcpy(temp + shortfall, ci->buf, mask_rewind - shortfall); + } + +#ifdef DEBUG + DEBUG_PRINTF("DATA: "); + for (u32 i = 0; i < mask_rewind; i++) { + printf("%c", ourisprint(data[i]) ? data[i] : '?'); + } + printf(" (len=%u)\n", mask_rewind); +#endif + + u32 len = mask_rewind; + while (len >= sizeof(u64a)) { + u64a a = unaligned_load_u64a(data); + a &= *(const u64a *)and_mask; + if (a != *(const u64a *)exp_mask) { + DEBUG_PRINTF("argh %016llx %016llx\n", a, *(const u64a *)exp_mask); + return 0; + } + data += sizeof(u64a); + and_mask += sizeof(u64a); + exp_mask += sizeof(u64a); + len -= sizeof(u64a); + } + + while (len) { + u8 a = *data; + a &= *and_mask; + if (a != *exp_mask) { + DEBUG_PRINTF("argh d%02hhx =%02hhx am%02hhx em%02hhx\n", a, + *data, *and_mask, *exp_mask); + return 0; + } + data++; + and_mask++; + exp_mask++; + len--; + } + + return 1; +} + +static rose_inline +void rosePushDelayedMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 delay, + u32 delay_index, u64a offset) { + assert(delay); + + const u32 src_slot_index = delay; + u32 slot_index = (src_slot_index + offset) & DELAY_MASK; + + struct RoseContext *tctxt = &scratch->tctxt; + if (offset + src_slot_index <= tctxt->delayLastEndOffset) { + DEBUG_PRINTF("skip too late\n"); + return; + } + + const u32 delay_count = t->delay_count; + struct fatbit **delaySlots = getDelaySlots(scratch); + struct fatbit *slot = delaySlots[slot_index]; + + DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index); + if (!(tctxt->filledDelayedSlots & (1U << slot_index))) { + tctxt->filledDelayedSlots |= 1U << slot_index; + fatbit_clear(slot); + } + + fatbit_set(slot, delay_count, delay_index); +} + +static rose_inline +char roseLeftfixCheckMiracles(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + struct core_info *ci, struct mq *q, u64a end, + const char is_infix) { + if (!is_infix && left->transient) { + // Miracles won't help us with transient leftfix engines; they only + // scan for a limited time anyway. 
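
roseCheckBenefits() above validates a literal match by rewinding mask_rewind bytes (stitching history and buffer together in a temporary when the span straddles the stream boundary) and comparing them, u64a words at a time, against an AND mask and an expected image. A byte-at-a-time sketch of the same predicate, with hypothetical names:

#include <cstddef>
#include <cstdint>

// True iff (data[i] & and_mask[i]) == exp_mask[i] for every byte; e.g. a
// caseless 'a' uses and_mask 0xdf (strip the case bit) with exp_mask 'A'.
static bool masksMatch(const uint8_t *data, const uint8_t *and_mask,
                       const uint8_t *exp_mask, size_t len) {
    for (size_t i = 0; i < len; i++) {
        if ((data[i] & and_mask[i]) != exp_mask[i]) {
            return false;
        }
    }
    return true;
}
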
+ return 1; + } + + if (!left->stopTable) { + return 1; + } + + DEBUG_PRINTF("looking for miracle on queue %u\n", q->nfa->queueIndex); + + const s64a begin_loc = q_cur_loc(q); + const s64a end_loc = end - ci->buf_offset; + + s64a miracle_loc; + if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { + goto found_miracle; + } + + if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, + &miracle_loc)) { + goto found_miracle; + } + + return 1; + +found_miracle: + DEBUG_PRINTF("miracle at %lld\n", miracle_loc); + assert(miracle_loc >= begin_loc); + + // If we're a prefix, then a miracle effectively results in us needing to + // re-init our state and start fresh. + if (!is_infix) { + if (miracle_loc != begin_loc) { + DEBUG_PRINTF("re-init prefix state\n"); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, miracle_loc); + pushQueueAt(q, 1, MQE_TOP, miracle_loc); + nfaQueueInitState(q->nfa, q); + } + return 1; + } + + // Otherwise, we're an infix. Remove tops before the miracle from the queue + // and re-init at that location. + + q_skip_forward_to(q, miracle_loc); + + if (q_last_type(q) == MQE_START) { + DEBUG_PRINTF("miracle caused infix to die\n"); + return 0; + } + + DEBUG_PRINTF("re-init infix state\n"); + assert(q->items[q->cur].type == MQE_START); + q->items[q->cur].location = miracle_loc; + nfaQueueInitState(q->nfa, q); + + return 1; +} + +static rose_inline +hwlmcb_rv_t roseTriggerSuffix(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, u32 top, + u64a som, u64a end) { + DEBUG_PRINTF("suffix qi=%u, top event=%u\n", qi, top); + + struct core_info *ci = &scratch->core_info; + u8 *aa = getActiveLeafArray(t, ci->state); + const u32 aaCount = t->activeArrayCount; + const u32 qCount = t->queueCount; + struct mq *q = &scratch->queues[qi]; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + const struct NFA *nfa = getNfaByInfo(t, info); + + s64a loc = (s64a)end - ci->buf_offset; + assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); + + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(scratch->aqa, qCount, qi); + } else if (info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + /* nfa only needs one top; we can go home now */ + return HWLM_CONTINUE_MATCHING; + } else if (!fatbit_set(scratch->aqa, qCount, qi)) { + initQueue(q, qi, t, scratch); + loadStreamState(nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } else if (isQueueFull(q)) { + DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); + if (info->eod) { + /* can catch up suffix independently no pq */ + q->context = NULL; + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (ensureQueueFlushed(t, scratch, qi, loc) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + assert(top == MQE_TOP || (top >= MQE_TOP_FIRST && top < MQE_INVALID)); + pushQueueSom(q, top, loc, som); + + if (q_cur_loc(q) == (s64a)ci->len && !info->eod) { + /* we may not run the nfa; need to ensure state is fine */ + DEBUG_PRINTF("empty run\n"); + pushQueueNoMerge(q, MQE_END, loc); + char alive = nfaQueueExec(nfa, q, loc); + if (alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +char roseTestLeftfix(const struct RoseEngine *t, 
struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end, + const char is_infix) { + struct core_info *ci = &scratch->core_info; + + u32 ri = queueToLeftIndex(t, qi); + const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s %s %u/%u with lag %u (maxLag=%u)\n", + (left->transient ? "transient" : "active"), + (is_infix ? "infix" : "prefix"), + ri, qi, leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + assert(left->infix == is_infix); + assert(!is_infix || !left->transient); // Only prefixes can be transient. + + struct mq *q = scratch->queues + qi; + char *state = scratch->core_info.state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + u32 qCount = t->queueCount; + u32 arCount = t->activeLeftCount; + + if (!mmbit_isset(activeLeftArray, arCount, ri)) { + DEBUG_PRINTF("engine is dead nothing to see here\n"); + return 0; + } + + if (unlikely(end < leftfixLag)) { + assert(0); /* lag is the literal length */ + return 0; + } + + if (nfaSupportsZombie(getNfaByQueue(t, qi)) && ci->buf_offset + && !fatbit_isset(scratch->aqa, qCount, qi) + && isZombie(t, state, left)) { + DEBUG_PRINTF("zombie\n"); + return 1; + } + + if (!fatbit_set(scratch->aqa, qCount, qi)) { + DEBUG_PRINTF("initing q %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (ci->buf_offset) { // there have been writes before us! + s32 sp; + if (!is_infix && left->transient) { + sp = -(s32)ci->hlen; + } else { + sp = -(s32)loadRoseDelay(t, state, left); + } + + /* transient nfas are always started fresh -> state not maintained + * at stream boundary */ + + pushQueueAt(q, 0, MQE_START, sp); + if (is_infix || (ci->buf_offset + sp > 0 && !left->transient)) { + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + nfaQueueInitState(q->nfa, q); + } + } else { // first write ever + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + nfaQueueInitState(q->nfa, q); + } + } + + s64a loc = (s64a)end - ci->buf_offset - leftfixLag; + assert(loc >= q_cur_loc(q) || left->eager); + assert(leftfixReport != MO_INVALID_IDX); + + if (!is_infix && left->transient) { + s64a start_loc = loc - left->transient; + if (q_cur_loc(q) < start_loc) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, start_loc); + pushQueueAt(q, 1, MQE_TOP, start_loc); + nfaQueueInitState(q->nfa, q); + } + } + + if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) { + if (is_infix) { + if (infixTooOld(q, loc)) { + DEBUG_PRINTF("infix %u died of old age\n", ri); + goto nfa_dead; + } + + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + } + + if (!roseLeftfixCheckMiracles(t, left, ci, q, end, is_infix)) { + DEBUG_PRINTF("leftfix %u died due to miracle\n", ri); + goto nfa_dead; + } + +#ifdef DEBUG + debugQueue(q); +#endif + + pushQueueNoMerge(q, MQE_END, loc); + + char rv = nfaQueueExecRose(q->nfa, q, leftfixReport); + if (!rv) { /* nfa is dead */ + DEBUG_PRINTF("leftfix %u died while trying to catch up\n", ri); + goto nfa_dead; + } + + // Queue must have next start loc before we call nfaInAcceptState. + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv == MO_MATCHES_PENDING; + } else if (q_cur_loc(q) > loc) { + /* an eager leftfix may have already progressed past loc if there is no + * match at loc. 
*/ + assert(left->eager); + return 0; + } else { + assert(q_cur_loc(q) == loc); + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + char rv = nfaInAcceptState(q->nfa, leftfixReport, q); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv; + } + +nfa_dead: + mmbit_unset(activeLeftArray, arCount, ri); + scratch->tctxt.groups &= left->squash_mask; + return 0; +} + +static rose_inline +char roseTestPrefix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 0); +} + +static rose_inline +char roseTestInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 1); +} + +static rose_inline +void roseTriggerInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a start, u64a end, u32 qi, u32 topEvent, u8 cancel) { + struct core_info *ci = &scratch->core_info; + s64a loc = (s64a)end - ci->buf_offset; + + u32 ri = queueToLeftIndex(t, qi); + assert(topEvent < MQE_INVALID); + + const struct LeftNfaInfo *left = getLeftInfoByQueue(t, qi); + assert(!left->transient); + + DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, topEvent); + + struct mq *q = scratch->queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + char *state = ci->state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + const u32 arCount = t->activeLeftCount; + char alive = mmbit_set(activeLeftArray, arCount, ri); + + if (alive && info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + return; + } + + struct fatbit *aqa = scratch->aqa; + const u32 qCount = t->queueCount; + + if (alive && nfaSupportsZombie(getNfaByInfo(t, info)) && ci->buf_offset && + !fatbit_isset(aqa, qCount, qi) && isZombie(t, state, left)) { + DEBUG_PRINTF("yawn - zombie\n"); + return; + } + + if (cancel) { + DEBUG_PRINTF("dominating top: (re)init\n"); + fatbit_set(aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (!fatbit_set(aqa, qCount, qi)) { + DEBUG_PRINTF("initing %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (alive) { + s32 sp = -(s32)loadRoseDelay(t, state, left); + pushQueueAt(q, 0, MQE_START, sp); + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } + } else if (!alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (isQueueFull(q)) { + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + + if (isQueueFull(q)) { + /* still full - reduceInfixQueue did nothing */ + DEBUG_PRINTF("queue %u full (%u items) -> catching up nfa\n", qi, + q->end - q->cur); + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } + } + + pushQueueSom(q, topEvent, loc, start); +} + +static rose_inline +hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a end, ReportID onmatch, s32 offset_adjust, + u32 ekey) { + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + 
DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +/* catches up engines enough to ensure any earlier mpv triggers are enqueued + * and then adds the trigger to the mpv queue. Must not be called during catch + * up */ +static rose_inline +hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 event, u64a top_squash_distance, + u64a end, const char in_catchup) { + if (!in_catchup && + roseCatchUpMpvFeeders(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + return roseHandleChainMatch(t, scratch, event, top_squash_distance, end, + in_catchup); +} + +static rose_inline +void roseHandleSom(UNUSED const struct RoseEngine *t, + struct hs_scratch *scratch, const struct som_operation *sr, + u64a end) { + DEBUG_PRINTF("end=%llu, minMatchOffset=%llu\n", end, + scratch->tctxt.minMatchOffset); + + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + updateLastMatchOffset(&scratch->tctxt, end); + handleSomInternal(scratch, sr, end); +} + +static rose_inline +hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a start, u64a end, + ReportID onmatch, s32 offset_adjust, u32 ekey) { + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + DEBUG_PRINTF("firing som callback onmatch=%u, start=%llu, end=%llu\n", + onmatch, start, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverSomReport(start, end, onmatch, offset_adjust, + scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +void roseHandleSomSom(UNUSED const struct RoseEngine *t, + struct hs_scratch *scratch, + const struct som_operation *sr, u64a start, u64a end) { + DEBUG_PRINTF("start=%llu, end=%llu, minMatchOffset=%llu\n", start, end, + scratch->tctxt.minMatchOffset); + + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + updateLastMatchOffset(&scratch->tctxt, end); + setSomFromSomAware(scratch, sr, start, end); +} + +static really_inline +int reachHasBit(const u8 *reach, u8 c) { + return !!(reach[c / 8U] & (u8)1U << (c % 8U)); +} + +/** + * \brief Scan around a literal, checking that that "lookaround" reach masks + * are satisfied. + */ +static rose_inline +int roseCheckLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, u32 lookaroundIndex, + u32 lookaroundCount, u64a end) { + assert(lookaroundIndex != MO_INVALID_IDX); + assert(lookaroundCount > 0); + + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const u8 *base = (const u8 *)t; + const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset); + const s8 *look = look_base + lookaroundIndex; + const s8 *look_end = look + lookaroundCount; + assert(look < look_end); + + const u8 *reach_base = base + t->lookaroundReachOffset; + const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN; + + // The following code assumes that the lookaround structures are ordered by + // increasing offset. 
+ + const s64a base_offset = end - ci->buf_offset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + DEBUG_PRINTF("first look has offset %d\n", *look); + + // If our first check tells us we need to look at an offset before the + // start of the stream, this role cannot match. + if (unlikely(*look < 0 && (u64a)(0 - *look) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + // Skip over offsets that are before the history buffer. + do { + s64a offset = base_offset + *look; + if (offset >= -(s64a)ci->hlen) { + goto in_history; + } + DEBUG_PRINTF("look=%d before history\n", *look); + look++; + reach += REACH_BITVECTOR_LEN; + } while (look < look_end); + + // History buffer. + DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_history: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= 0) { + DEBUG_PRINTF("in buffer\n"); + goto in_buffer; + } + + assert(offset >= -(s64a)ci->hlen && offset < 0); + u8 c = ci->hbuf[ci->hlen + offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + // Current buffer. + DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_buffer: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + break; + } + + assert(offset >= 0 && offset < (s64a)ci->len); + u8 c = ci->buf[offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +static +int roseNfaEarliestSom(u64a from_offset, UNUSED u64a offset, UNUSED ReportID id, + void *context) { + u64a *som = context; + *som = MIN(*som, from_offset); + return MO_CONTINUE_MATCHING; +} + +static rose_inline +u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, + const u32 qi, UNUSED const u32 leftfixLag) { + u32 ri = queueToLeftIndex(t, qi); + + UNUSED const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s prefix %u/%u with lag %u (maxLag=%u)\n", + left->transient ? "transient" : "active", ri, qi, + leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + + struct mq *q = scratch->queues + qi; + + u64a start = ~0ULL; + + /* switch the callback + context for a fun one */ + q->som_cb = roseNfaEarliestSom; + q->context = &start; + + nfaReportCurrentMatches(q->nfa, q); + + /* restore the old callback + context */ + q->som_cb = roseNfaSomAdaptor; + q->context = NULL; + DEBUG_PRINTF("earliest som is %llu\n", start); + return start; +} + +static rose_inline +char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { + DEBUG_PRINTF("check offset=%llu against bounds [%llu,%llu]\n", end, + min_bound, max_bound); + assert(min_bound <= max_bound); + return end >= min_bound && end <= max_bound; +} + +static rose_inline +hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u32 iter_offset) { + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + /* data, len is used for state decompress, should be full available data */ + u8 key = 0; + if (is_streaming) { + const u8 *eod_data = scratch->core_info.hbuf; + size_t eod_len = scratch->core_info.hlen; + key = eod_len ? 
eod_data[eod_len - 1] : 0; + } + + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); + assert(ISALIGNED(it)); + + u32 idx = 0; + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); + qi != MMB_INVALID; + qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + if (is_streaming) { + // Decompress stream state. + nfaExpandState(q->nfa, q->state, q->streamState, offset, key); + } + + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, roseReportSomAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + /* We have just been triggered. */ + assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); + + pushQueueNoMerge(q, MQE_END, scratch->core_info.len); + q->context = NULL; + + /* rose exec is used as we don't want to / can't raise matches in the + * history buffer. */ + if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { + DEBUG_PRINTF("nfa is dead\n"); + continue; + } + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, roseReportSomAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + assert(rose->ematcherOffset); + assert(rose->ematcherRegionSize); + + // Clear role state and active engines, since we have already handled all + // outstanding work there. + DEBUG_PRINTF("clear role state and active leaf array\n"); + char *state = scratch->core_info.state; + mmbit_clear(getRoleState(state), rose->rolesWithStateCount); + mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); + + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + size_t eod_len; + const u8 *eod_data; + if (!is_streaming) { /* Block */ + eod_data = scratch->core_info.buf; + eod_len = scratch->core_info.len; + } else { /* Streaming */ + eod_len = scratch->core_info.hlen; + eod_data = scratch->core_info.hbuf; + } + + assert(eod_data); + assert(eod_len); + + DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, + offset); + + // If we don't have enough bytes to produce a match from an EOD table scan, + // there's no point scanning. 
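+    // (eodmatcherMinWidth is the minimum number of bytes the EOD table
+    // requires before it can produce any match.)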
+    if (eod_len < rose->eodmatcherMinWidth) {
+        DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth);
+        return HWLM_CONTINUE_MATCHING;
+    }
+
+    // Ensure that we only need to scan the last N bytes, where N is the
+    // length of the eod-anchored matcher region.
+    size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize);
+
+    const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset);
+    hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch,
+             scratch->tctxt.groups);
+
+    // We may need to fire delayed matches.
+    if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) {
+        DEBUG_PRINTF("user instructed us to stop\n");
+        return HWLM_TERMINATE_MATCHING;
+    }
+
+    roseFlushLastByteHistory(rose, scratch, offset);
+    return HWLM_CONTINUE_MATCHING;
+}
+
+static
+void updateSeqPoint(struct RoseContext *tctxt, u64a offset,
+                    const char from_mpv) {
+    if (from_mpv) {
+        updateMinMatchOffsetFromMpv(tctxt, offset);
+    } else {
+        updateMinMatchOffset(tctxt, offset);
+    }
+}
+
+#define PROGRAM_CASE(name) \
+    case ROSE_INSTR_##name: { \
+        DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \
+                     programOffset + (u32)(pc - pc_base)); \
+        const struct ROSE_STRUCT_##name *ri = \
+            (const struct ROSE_STRUCT_##name *)pc;
+
+#define PROGRAM_NEXT_INSTRUCTION \
+    pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \
+    break; \
+    }
+
+static rose_inline
+hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
+                             struct hs_scratch *scratch, u32 programOffset,
+                             u64a som, u64a end, size_t match_len,
+                             char in_anchored, char in_catchup, char from_mpv,
+                             char skip_mpv_catchup) {
+    DEBUG_PRINTF("program=%u, offsets [%llu,%llu]\n", programOffset, som, end);
+
+    assert(programOffset >= sizeof(struct RoseEngine));
+    assert(programOffset < t->size);
+
+    const char *pc_base = getByOffset(t, programOffset);
+    const char *pc = pc_base;
+
+    // Local sparse iterator state for programs that use the SPARSE_ITER_BEGIN
+    // and SPARSE_ITER_NEXT instructions.
+    struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES];
+
+    // If this program has an effect, work_done will be set to one (which may
+    // allow the program to squash groups).
+ int work_done = 0; + + struct RoseContext *tctxt = &scratch->tctxt; + + assert(*(const u8 *)pc != ROSE_INSTR_END); + + for (;;) { + assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); + assert(pc >= pc_base); + assert((size_t)(pc - pc_base) < t->size); + const u8 code = *(const u8 *)pc; + assert(code <= ROSE_INSTR_END); + + switch ((enum RoseInstructionCode)code) { + PROGRAM_CASE(ANCHORED_DELAY) { + if (in_anchored && end > t->floatingMinLiteralMatchOffset) { + DEBUG_PRINTF("delay until playback\n"); + tctxt->groups |= ri->groups; + work_done = 1; + assert(ri->done_jump); // must progress + pc += ri->done_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LIT_MASK) { + assert(match_len); + struct core_info *ci = &scratch->core_info; + if (!roseCheckBenefits(ci, end, match_len, ri->and_mask.a8, + ri->cmp_mask.a8)) { + DEBUG_PRINTF("halt: failed mask check\n"); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LIT_EARLY) { + if (end < ri->min_offset) { + DEBUG_PRINTF("halt: before min_offset=%u\n", + ri->min_offset); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_ONLY_EOD) { + struct core_info *ci = &scratch->core_info; + if (end != ci->buf_offset + ci->len) { + DEBUG_PRINTF("should only match at end of data\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BOUNDS) { + if (!roseCheckBounds(end, ri->min_bound, ri->max_bound)) { + DEBUG_PRINTF("failed bounds check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_NOT_HANDLED) { + struct fatbit *handled = scratch->handled_roles; + if (fatbit_set(handled, t->handledKeyCount, ri->key)) { + DEBUG_PRINTF("key %u already set\n", ri->key); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LOOKAROUND) { + if (!roseCheckLookaround(t, scratch, ri->index, ri->count, + end)) { + DEBUG_PRINTF("failed lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_INFIX) { + if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed infix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_PREFIX) { + if (!roseTestPrefix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed prefix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP) { + if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + 
scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ADJUST) { + assert(ri->distance <= end); + som = end - ri->distance; + DEBUG_PRINTF("som is (end - %u) = %llu\n", ri->distance, som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_LEFTFIX) { + som = roseGetHaigSom(t, scratch, ri->queue, ri->lag); + DEBUG_PRINTF("som from leftfix is %llu\n", som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_FROM_REPORT) { + som = handleSomExternal(scratch, &ri->som, end); + DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, + som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ZERO) { + DEBUG_PRINTF("setting SOM to zero\n"); + som = 0; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_INFIX) { + roseTriggerInfix(t, scratch, som, end, ri->queue, ri->event, + ri->cancel); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + const char is_external_report = 0; + const char do_som = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. 
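+                // A chain report pushes a top onto the MPV (chained) engine's
+                // queue rather than raising a user-visible match.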
+ if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_INT) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSom(t, scratch, &ri->som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_AWARE) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSomSom(t, scratch, &ri->som, som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_AND_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + case DEDUPE_CONTINUE: + break; + } + + const u32 ekey = INVALID_EKEY; + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FINAL_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + /* One-shot specialisation: this instruction always terminates + * execution of the program. 
*/ + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_EXHAUSTED) { + DEBUG_PRINTF("check ekey %u\n", ri->ekey); + assert(ri->ekey != INVALID_EKEY); + assert(ri->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ri->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ri->ekey); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MIN_LENGTH) { + DEBUG_PRINTF("check min length %llu (adj %d)\n", ri->min_length, + ri->end_adj); + assert(ri->min_length > 0); + assert(ri->end_adj == 0 || ri->end_adj == -1); + assert(som == HS_OFFSET_PAST_HORIZON || som <= end); + if (som != HS_OFFSET_PAST_HORIZON && + ((end + ri->end_adj) - som < ri->min_length)) { + DEBUG_PRINTF("failed check, match len %llu\n", + (u64a)((end + ri->end_adj) - som)); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_STATE) { + DEBUG_PRINTF("set state index %u\n", ri->index); + mmbit_set(getRoleState(scratch->core_info.state), + t->rolesWithStateCount, ri->index); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_GROUPS) { + tctxt->groups |= ri->groups; + DEBUG_PRINTF("set groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SQUASH_GROUPS) { + assert(popcount64(ri->groups) == 63); // Squash only one group. + if (work_done) { + tctxt->groups &= ri->groups; + DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_STATE) { + DEBUG_PRINTF("check state %u\n", ri->index); + const u8 *roles = getRoleState(scratch->core_info.state); + if (!mmbit_isset(roles, t->rolesWithStateCount, ri->index)) { + DEBUG_PRINTF("state not on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_BEGIN) { + DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, + &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + + fatbit_clear(scratch->handled_roles); + + const u32 *jumps = getByOffset(t, ri->jump_table); + DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + continue; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_NEXT) { + DEBUG_PRINTF("iter_offset=%u, state=%u\n", ri->iter_offset, + ri->state); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_next(roles, t->rolesWithStateCount, + ri->state, &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no more states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + + const u32 *jumps = getByOffset(t, ri->jump_table); + DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + continue; + } + 
PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ENGINES_EOD) { + if (roseEnginesEod(t, scratch, end, ri->iter_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SUFFIXES_EOD) { + if (roseSuffixesEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MATCHER_EOD) { + if (roseMatcherEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(END) { + DEBUG_PRINTF("finished\n"); + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + } + } + + assert(0); // unreachable + return HWLM_CONTINUE_MATCHING; +} + +#undef PROGRAM_CASE +#undef PROGRAM_NEXT_INSTRUCTION + +hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, size_t match_len, + u8 prog_flags) { + const char in_anchored = prog_flags & ROSE_PROG_FLAG_IN_ANCHORED; + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; + const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + return roseRunProgram_i(t, scratch, programOffset, som, end, match_len, + in_anchored, in_catchup, from_mpv, + skip_mpv_catchup); +} diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 3794ac3f..c12c9155 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -26,1462 +26,33 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file + * \brief Rose runtime: program interpreter. + */ + #ifndef PROGRAM_RUNTIME_H #define PROGRAM_RUNTIME_H -#include "catchup.h" -#include "counting_miracle.h" -#include "infix.h" -#include "match.h" -#include "miracle.h" -#include "report.h" -#include "rose.h" -#include "rose_internal.h" -#include "rose_program.h" -#include "rose_types.h" -#include "runtime.h" -#include "scratch.h" #include "ue2common.h" -#include "util/compare.h" -#include "util/fatbit.h" -#include "util/multibit.h" +#include "src/hwlm/hwlm.h" // for hwlmcb_rv_t -static rose_inline -int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, - const u8 *and_mask, const u8 *exp_mask) { - const u8 *data; +struct RoseEngine; +struct hs_scratch; - // If the check works over part of the history and part of the buffer, we - // create a temporary copy of the data in here so it's contiguous. - u8 temp[MAX_MASK2_WIDTH]; - - s64a buffer_offset = (s64a)end - ci->buf_offset; - DEBUG_PRINTF("rel offset %lld\n", buffer_offset); - if (buffer_offset >= mask_rewind) { - data = ci->buf + buffer_offset - mask_rewind; - DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, - ci->buf, mask_rewind); - } else if (buffer_offset <= 0) { - data = ci->hbuf + ci->hlen + buffer_offset - mask_rewind; - DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, - ci->buf, mask_rewind); - } else { - u32 shortfall = mask_rewind - buffer_offset; - DEBUG_PRINTF("shortfall of %u, rewind %u hlen %zu\n", shortfall, - mask_rewind, ci->hlen); - data = temp; - memcpy(temp, ci->hbuf + ci->hlen - shortfall, shortfall); - memcpy(temp + shortfall, ci->buf, mask_rewind - shortfall); - } - -#ifdef DEBUG - DEBUG_PRINTF("DATA: "); - for (u32 i = 0; i < mask_rewind; i++) { - printf("%c", ourisprint(data[i]) ? 
data[i] : '?'); - } - printf(" (len=%u)\n", mask_rewind); -#endif - - u32 len = mask_rewind; - while (len >= sizeof(u64a)) { - u64a a = unaligned_load_u64a(data); - a &= *(const u64a *)and_mask; - if (a != *(const u64a *)exp_mask) { - DEBUG_PRINTF("argh %016llx %016llx\n", a, *(const u64a *)exp_mask); - return 0; - } - data += sizeof(u64a); - and_mask += sizeof(u64a); - exp_mask += sizeof(u64a); - len -= sizeof(u64a); - } - - while (len) { - u8 a = *data; - a &= *and_mask; - if (a != *exp_mask) { - DEBUG_PRINTF("argh d%02hhx =%02hhx am%02hhx em%02hhx\n", a, - *data, *and_mask, *exp_mask); - return 0; - } - data++; - and_mask++; - exp_mask++; - len--; - } - - return 1; -} - -static rose_inline -void rosePushDelayedMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 delay, - u32 delay_index, u64a offset) { - assert(delay); - - const u32 src_slot_index = delay; - u32 slot_index = (src_slot_index + offset) & DELAY_MASK; - - struct RoseContext *tctxt = &scratch->tctxt; - if (offset + src_slot_index <= tctxt->delayLastEndOffset) { - DEBUG_PRINTF("skip too late\n"); - return; - } - - const u32 delay_count = t->delay_count; - struct fatbit **delaySlots = getDelaySlots(scratch); - struct fatbit *slot = delaySlots[slot_index]; - - DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index); - if (!(tctxt->filledDelayedSlots & (1U << slot_index))) { - tctxt->filledDelayedSlots |= 1U << slot_index; - fatbit_clear(slot); - } - - fatbit_set(slot, delay_count, delay_index); -} - -static rose_inline -char roseLeftfixCheckMiracles(const struct RoseEngine *t, - const struct LeftNfaInfo *left, - struct core_info *ci, struct mq *q, u64a end, - const char is_infix) { - if (!is_infix && left->transient) { - // Miracles won't help us with transient leftfix engines; they only - // scan for a limited time anyway. - return 1; - } - - if (!left->stopTable) { - return 1; - } - - DEBUG_PRINTF("looking for miracle on queue %u\n", q->nfa->queueIndex); - - const s64a begin_loc = q_cur_loc(q); - const s64a end_loc = end - ci->buf_offset; - - s64a miracle_loc; - if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { - goto found_miracle; - } - - if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, - &miracle_loc)) { - goto found_miracle; - } - - return 1; - -found_miracle: - DEBUG_PRINTF("miracle at %lld\n", miracle_loc); - assert(miracle_loc >= begin_loc); - - // If we're a prefix, then a miracle effectively results in us needing to - // re-init our state and start fresh. - if (!is_infix) { - if (miracle_loc != begin_loc) { - DEBUG_PRINTF("re-init prefix state\n"); - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, miracle_loc); - pushQueueAt(q, 1, MQE_TOP, miracle_loc); - nfaQueueInitState(q->nfa, q); - } - return 1; - } - - // Otherwise, we're an infix. Remove tops before the miracle from the queue - // and re-init at that location. 
- - q_skip_forward_to(q, miracle_loc); - - if (q_last_type(q) == MQE_START) { - DEBUG_PRINTF("miracle caused infix to die\n"); - return 0; - } - - DEBUG_PRINTF("re-init infix state\n"); - assert(q->items[q->cur].type == MQE_START); - q->items[q->cur].location = miracle_loc; - nfaQueueInitState(q->nfa, q); - - return 1; -} - -static rose_inline -hwlmcb_rv_t roseHaltIfExhausted(const struct RoseEngine *t, - struct hs_scratch *scratch) { - struct core_info *ci = &scratch->core_info; - if (isAllExhausted(t, ci->exhaustionVector)) { - ci->status |= STATUS_EXHAUSTED; - scratch->tctxt.groups = 0; - DEBUG_PRINTF("all exhausted, termination requested\n"); - return HWLM_TERMINATE_MATCHING; - } - - return HWLM_CONTINUE_MATCHING; -} - -static really_inline -hwlmcb_rv_t ensureQueueFlushed_i(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 qi, s64a loc, - char is_mpv, char in_catchup) { - struct RoseContext *tctxt = &scratch->tctxt; - u8 *aa = getActiveLeafArray(t, scratch->core_info.state); - struct fatbit *activeQueues = scratch->aqa; - u32 aaCount = t->activeArrayCount; - u32 qCount = t->queueCount; - - struct mq *q = &scratch->queues[qi]; - DEBUG_PRINTF("qcl %lld, loc: %lld, min (non mpv) match offset: %llu\n", - q_cur_loc(q), loc, tctxt->minNonMpvMatchOffset); - if (q_cur_loc(q) == loc) { - /* too many tops enqueued at the one spot; need to flatten this queue. - * We can use the full catchups as it will short circuit as we are - * already at this location. It also saves waking everybody up */ - pushQueueNoMerge(q, MQE_END, loc); - nfaQueueExec(q->nfa, q, loc); - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } else if (!in_catchup) { - if (is_mpv) { - tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ - if (loc + scratch->core_info.buf_offset - <= tctxt->minNonMpvMatchOffset) { - DEBUG_PRINTF("flushing chained\n"); - if (roseCatchUpMPV(t, loc, scratch) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - goto done_queue_empty; - } - } - - if (roseCatchUpTo(t, scratch, loc + scratch->core_info.buf_offset) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } else { - /* we must be a chained nfa */ - assert(is_mpv); - DEBUG_PRINTF("flushing chained\n"); - tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ - if (roseCatchUpMPV(t, loc, scratch) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } -done_queue_empty: - if (!mmbit_set(aa, aaCount, qi)) { - initQueue(q, qi, t, scratch); - nfaQueueInitState(q->nfa, q); - pushQueueAt(q, 0, MQE_START, loc); - fatbit_set(activeQueues, qCount, qi); - } - - assert(!isQueueFull(q)); - - return roseHaltIfExhausted(t, scratch); -} - -static rose_inline -hwlmcb_rv_t ensureQueueFlushed(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 qi, s64a loc) { - return ensureQueueFlushed_i(t, scratch, qi, loc, 0, 0); -} - -static rose_inline -hwlmcb_rv_t roseTriggerSuffix(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 qi, u32 top, - u64a som, u64a end) { - DEBUG_PRINTF("suffix qi=%u, top event=%u\n", qi, top); - - struct core_info *ci = &scratch->core_info; - u8 *aa = getActiveLeafArray(t, ci->state); - const u32 aaCount = t->activeArrayCount; - const u32 qCount = t->queueCount; - struct mq *q = &scratch->queues[qi]; - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - const struct NFA *nfa = getNfaByInfo(t, info); - - s64a loc = (s64a)end - ci->buf_offset; - assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); - - if (!mmbit_set(aa, 
aaCount, qi)) { - initQueue(q, qi, t, scratch); - nfaQueueInitState(nfa, q); - pushQueueAt(q, 0, MQE_START, loc); - fatbit_set(scratch->aqa, qCount, qi); - } else if (info->no_retrigger) { - DEBUG_PRINTF("yawn\n"); - /* nfa only needs one top; we can go home now */ - return HWLM_CONTINUE_MATCHING; - } else if (!fatbit_set(scratch->aqa, qCount, qi)) { - initQueue(q, qi, t, scratch); - loadStreamState(nfa, q, 0); - pushQueueAt(q, 0, MQE_START, 0); - } else if (isQueueFull(q)) { - DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); - if (info->eod) { - /* can catch up suffix independently no pq */ - q->context = NULL; - pushQueueNoMerge(q, MQE_END, loc); - nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } else if (ensureQueueFlushed(t, scratch, qi, loc) - == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - - assert(top == MQE_TOP || (top >= MQE_TOP_FIRST && top < MQE_INVALID)); - pushQueueSom(q, top, loc, som); - - if (q_cur_loc(q) == (s64a)ci->len && !info->eod) { - /* we may not run the nfa; need to ensure state is fine */ - DEBUG_PRINTF("empty run\n"); - pushQueueNoMerge(q, MQE_END, loc); - char alive = nfaQueueExec(nfa, q, loc); - if (alive) { - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } else { - mmbit_unset(aa, aaCount, qi); - fatbit_unset(scratch->aqa, qCount, qi); - } - } - - return HWLM_CONTINUE_MATCHING; -} - -static really_inline -char roseTestLeftfix(const struct RoseEngine *t, struct hs_scratch *scratch, - u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end, - const char is_infix) { - struct core_info *ci = &scratch->core_info; - - u32 ri = queueToLeftIndex(t, qi); - const struct LeftNfaInfo *left = getLeftTable(t) + ri; - - DEBUG_PRINTF("testing %s %s %u/%u with lag %u (maxLag=%u)\n", - (left->transient ? "transient" : "active"), - (is_infix ? "infix" : "prefix"), - ri, qi, leftfixLag, left->maxLag); - - assert(leftfixLag <= left->maxLag); - assert(left->infix == is_infix); - assert(!is_infix || !left->transient); // Only prefixes can be transient. - - struct mq *q = scratch->queues + qi; - char *state = scratch->core_info.state; - u8 *activeLeftArray = getActiveLeftArray(t, state); - u32 qCount = t->queueCount; - u32 arCount = t->activeLeftCount; - - if (!mmbit_isset(activeLeftArray, arCount, ri)) { - DEBUG_PRINTF("engine is dead nothing to see here\n"); - return 0; - } - - if (unlikely(end < leftfixLag)) { - assert(0); /* lag is the literal length */ - return 0; - } - - if (nfaSupportsZombie(getNfaByQueue(t, qi)) && ci->buf_offset - && !fatbit_isset(scratch->aqa, qCount, qi) - && isZombie(t, state, left)) { - DEBUG_PRINTF("zombie\n"); - return 1; - } - - if (!fatbit_set(scratch->aqa, qCount, qi)) { - DEBUG_PRINTF("initing q %u\n", qi); - initRoseQueue(t, qi, left, scratch); - if (ci->buf_offset) { // there have been writes before us! 
- s32 sp; - if (!is_infix && left->transient) { - sp = -(s32)ci->hlen; - } else { - sp = -(s32)loadRoseDelay(t, state, left); - } - - /* transient nfas are always started fresh -> state not maintained - * at stream boundary */ - - pushQueueAt(q, 0, MQE_START, sp); - if (is_infix || (ci->buf_offset + sp > 0 && !left->transient)) { - loadStreamState(q->nfa, q, sp); - } else { - pushQueueAt(q, 1, MQE_TOP, sp); - nfaQueueInitState(q->nfa, q); - } - } else { // first write ever - pushQueueAt(q, 0, MQE_START, 0); - pushQueueAt(q, 1, MQE_TOP, 0); - nfaQueueInitState(q->nfa, q); - } - } - - s64a loc = (s64a)end - ci->buf_offset - leftfixLag; - assert(loc >= q_cur_loc(q) || left->eager); - assert(leftfixReport != MO_INVALID_IDX); - - if (!is_infix && left->transient) { - s64a start_loc = loc - left->transient; - if (q_cur_loc(q) < start_loc) { - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, start_loc); - pushQueueAt(q, 1, MQE_TOP, start_loc); - nfaQueueInitState(q->nfa, q); - } - } - - if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) { - if (is_infix) { - if (infixTooOld(q, loc)) { - DEBUG_PRINTF("infix %u died of old age\n", ri); - goto nfa_dead; - } - - reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); - } - - if (!roseLeftfixCheckMiracles(t, left, ci, q, end, is_infix)) { - DEBUG_PRINTF("leftfix %u died due to miracle\n", ri); - goto nfa_dead; - } - -#ifdef DEBUG - debugQueue(q); -#endif - - pushQueueNoMerge(q, MQE_END, loc); - - char rv = nfaQueueExecRose(q->nfa, q, leftfixReport); - if (!rv) { /* nfa is dead */ - DEBUG_PRINTF("leftfix %u died while trying to catch up\n", ri); - goto nfa_dead; - } - - // Queue must have next start loc before we call nfaInAcceptState. - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - - DEBUG_PRINTF("checking for report %u\n", leftfixReport); - DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); - return rv == MO_MATCHES_PENDING; - } else if (q_cur_loc(q) > loc) { - /* an eager leftfix may have already progressed past loc if there is no - * match at loc. 
*/ - assert(left->eager); - return 0; - } else { - assert(q_cur_loc(q) == loc); - DEBUG_PRINTF("checking for report %u\n", leftfixReport); - char rv = nfaInAcceptState(q->nfa, leftfixReport, q); - DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); - return rv; - } - -nfa_dead: - mmbit_unset(activeLeftArray, arCount, ri); - scratch->tctxt.groups &= left->squash_mask; - return 0; -} - -static rose_inline -char roseTestPrefix(const struct RoseEngine *t, struct hs_scratch *scratch, - u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { - return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 0); -} - -static rose_inline -char roseTestInfix(const struct RoseEngine *t, struct hs_scratch *scratch, - u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { - return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 1); -} - -static rose_inline -void roseTriggerInfix(const struct RoseEngine *t, struct hs_scratch *scratch, - u64a start, u64a end, u32 qi, u32 topEvent, u8 cancel) { - struct core_info *ci = &scratch->core_info; - s64a loc = (s64a)end - ci->buf_offset; - - u32 ri = queueToLeftIndex(t, qi); - assert(topEvent < MQE_INVALID); - - const struct LeftNfaInfo *left = getLeftInfoByQueue(t, qi); - assert(!left->transient); - - DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, topEvent); - - struct mq *q = scratch->queues + qi; - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - - char *state = ci->state; - u8 *activeLeftArray = getActiveLeftArray(t, state); - const u32 arCount = t->activeLeftCount; - char alive = mmbit_set(activeLeftArray, arCount, ri); - - if (alive && info->no_retrigger) { - DEBUG_PRINTF("yawn\n"); - return; - } - - struct fatbit *aqa = scratch->aqa; - const u32 qCount = t->queueCount; - - if (alive && nfaSupportsZombie(getNfaByInfo(t, info)) && ci->buf_offset && - !fatbit_isset(aqa, qCount, qi) && isZombie(t, state, left)) { - DEBUG_PRINTF("yawn - zombie\n"); - return; - } - - if (cancel) { - DEBUG_PRINTF("dominating top: (re)init\n"); - fatbit_set(aqa, qCount, qi); - initRoseQueue(t, qi, left, scratch); - pushQueueAt(q, 0, MQE_START, loc); - nfaQueueInitState(q->nfa, q); - } else if (!fatbit_set(aqa, qCount, qi)) { - DEBUG_PRINTF("initing %u\n", qi); - initRoseQueue(t, qi, left, scratch); - if (alive) { - s32 sp = -(s32)loadRoseDelay(t, state, left); - pushQueueAt(q, 0, MQE_START, sp); - loadStreamState(q->nfa, q, sp); - } else { - pushQueueAt(q, 0, MQE_START, loc); - nfaQueueInitState(q->nfa, q); - } - } else if (!alive) { - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - nfaQueueInitState(q->nfa, q); - } else if (isQueueFull(q)) { - reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); - - if (isQueueFull(q)) { - /* still full - reduceInfixQueue did nothing */ - DEBUG_PRINTF("queue %u full (%u items) -> catching up nfa\n", qi, - q->end - q->cur); - pushQueueNoMerge(q, MQE_END, loc); - nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); - - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } - } - - pushQueueSom(q, topEvent, loc, start); -} - -static rose_inline -hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, - u64a end, ReportID onmatch, s32 offset_adjust, - u32 ekey) { - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); - updateLastMatchOffset(&scratch->tctxt, end); - - int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); - if (cb_rv == MO_HALT_MATCHING) { - 
DEBUG_PRINTF("termination requested\n"); - return HWLM_TERMINATE_MATCHING; - } - - if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return HWLM_CONTINUE_MATCHING; - } - - return roseHaltIfExhausted(t, scratch); -} - -/* catches up engines enough to ensure any earlier mpv triggers are enqueued - * and then adds the trigger to the mpv queue. Must not be called during catch - * up */ -static rose_inline -hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, - u32 event, u64a top_squash_distance, - u64a end, const char in_catchup) { - if (!in_catchup && - roseCatchUpMpvFeeders(t, scratch, end) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - return roseHandleChainMatch(t, scratch, event, top_squash_distance, end, - in_catchup); -} - -static rose_inline -void roseHandleSom(UNUSED const struct RoseEngine *t, - struct hs_scratch *scratch, const struct som_operation *sr, - u64a end) { - DEBUG_PRINTF("end=%llu, minMatchOffset=%llu\n", end, - scratch->tctxt.minMatchOffset); - - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - updateLastMatchOffset(&scratch->tctxt, end); - handleSomInternal(scratch, sr, end); -} - -static rose_inline -hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, - struct hs_scratch *scratch, u64a start, u64a end, - ReportID onmatch, s32 offset_adjust, u32 ekey) { - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - DEBUG_PRINTF("firing som callback onmatch=%u, start=%llu, end=%llu\n", - onmatch, start, end); - updateLastMatchOffset(&scratch->tctxt, end); - - int cb_rv = roseDeliverSomReport(start, end, onmatch, offset_adjust, - scratch, ekey); - if (cb_rv == MO_HALT_MATCHING) { - DEBUG_PRINTF("termination requested\n"); - return HWLM_TERMINATE_MATCHING; - } - - if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return HWLM_CONTINUE_MATCHING; - } - - return roseHaltIfExhausted(t, scratch); -} - -static rose_inline -void roseHandleSomSom(UNUSED const struct RoseEngine *t, - struct hs_scratch *scratch, - const struct som_operation *sr, u64a start, u64a end) { - DEBUG_PRINTF("start=%llu, end=%llu, minMatchOffset=%llu\n", start, end, - scratch->tctxt.minMatchOffset); - - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - updateLastMatchOffset(&scratch->tctxt, end); - setSomFromSomAware(scratch, sr, start, end); -} - -static really_inline -int reachHasBit(const u8 *reach, u8 c) { - return !!(reach[c / 8U] & (u8)1U << (c % 8U)); -} - -/** - * \brief Scan around a literal, checking that that "lookaround" reach masks - * are satisfied. +/* + * Program context flags, which control the behaviour of some instructions at + * based on runtime contexts (whether the program is triggered by the anchored + * matcher, engine catchup, etc). 
*/ -static rose_inline -int roseCheckLookaround(const struct RoseEngine *t, - const struct hs_scratch *scratch, u32 lookaroundIndex, - u32 lookaroundCount, u64a end) { - assert(lookaroundIndex != MO_INVALID_IDX); - assert(lookaroundCount > 0); - const struct core_info *ci = &scratch->core_info; - DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, - ci->buf_offset, ci->buf_offset + ci->len); +#define ROSE_PROG_FLAG_IN_ANCHORED 1 +#define ROSE_PROG_FLAG_IN_CATCHUP 2 +#define ROSE_PROG_FLAG_FROM_MPV 4 +#define ROSE_PROG_FLAG_SKIP_MPV_CATCHUP 8 - const u8 *base = (const u8 *)t; - const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset); - const s8 *look = look_base + lookaroundIndex; - const s8 *look_end = look + lookaroundCount; - assert(look < look_end); - - const u8 *reach_base = base + t->lookaroundReachOffset; - const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN; - - // The following code assumes that the lookaround structures are ordered by - // increasing offset. - - const s64a base_offset = end - ci->buf_offset; - DEBUG_PRINTF("base_offset=%lld\n", base_offset); - DEBUG_PRINTF("first look has offset %d\n", *look); - - // If our first check tells us we need to look at an offset before the - // start of the stream, this role cannot match. - if (unlikely(*look < 0 && (u64a)(0 - *look) > end)) { - DEBUG_PRINTF("too early, fail\n"); - return 0; - } - - // Skip over offsets that are before the history buffer. - do { - s64a offset = base_offset + *look; - if (offset >= -(s64a)ci->hlen) { - goto in_history; - } - DEBUG_PRINTF("look=%d before history\n", *look); - look++; - reach += REACH_BITVECTOR_LEN; - } while (look < look_end); - - // History buffer. - DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); - for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { - in_history: - ; - s64a offset = base_offset + *look; - DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); - - if (offset >= 0) { - DEBUG_PRINTF("in buffer\n"); - goto in_buffer; - } - - assert(offset >= -(s64a)ci->hlen && offset < 0); - u8 c = ci->hbuf[ci->hlen + offset]; - if (!reachHasBit(reach, c)) { - DEBUG_PRINTF("char 0x%02x failed reach check\n", c); - return 0; - } - } - // Current buffer. - DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); - for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { - in_buffer: - ; - s64a offset = base_offset + *look; - DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); - - if (offset >= (s64a)ci->len) { - DEBUG_PRINTF("in the future\n"); - break; - } - - assert(offset >= 0 && offset < (s64a)ci->len); - u8 c = ci->buf[offset]; - if (!reachHasBit(reach, c)) { - DEBUG_PRINTF("char 0x%02x failed reach check\n", c); - return 0; - } - } - - DEBUG_PRINTF("OK :)\n"); - return 1; -} - -static -int roseNfaEarliestSom(u64a from_offset, UNUSED u64a offset, UNUSED ReportID id, - void *context) { - u64a *som = context; - *som = MIN(*som, from_offset); - return MO_CONTINUE_MATCHING; -} - -static rose_inline -u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, - const u32 qi, UNUSED const u32 leftfixLag) { - u32 ri = queueToLeftIndex(t, qi); - - UNUSED const struct LeftNfaInfo *left = getLeftTable(t) + ri; - - DEBUG_PRINTF("testing %s prefix %u/%u with lag %u (maxLag=%u)\n", - left->transient ? 
"transient" : "active", ri, qi, - leftfixLag, left->maxLag); - - assert(leftfixLag <= left->maxLag); - - struct mq *q = scratch->queues + qi; - - u64a start = ~0ULL; - - /* switch the callback + context for a fun one */ - q->som_cb = roseNfaEarliestSom; - q->context = &start; - - nfaReportCurrentMatches(q->nfa, q); - - /* restore the old callback + context */ - q->som_cb = roseNfaSomAdaptor; - q->context = NULL; - DEBUG_PRINTF("earliest som is %llu\n", start); - return start; -} - -static rose_inline -char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { - DEBUG_PRINTF("check offset=%llu against bounds [%llu,%llu]\n", end, - min_bound, max_bound); - assert(min_bound <= max_bound); - return end >= min_bound && end <= max_bound; -} - -static rose_inline -hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, - struct hs_scratch *scratch, u64a offset, - u32 iter_offset) { - const char is_streaming = rose->mode != HS_MODE_BLOCK; - - /* data, len is used for state decompress, should be full available data */ - u8 key = 0; - if (is_streaming) { - const u8 *eod_data = scratch->core_info.hbuf; - size_t eod_len = scratch->core_info.hlen; - key = eod_len ? eod_data[eod_len - 1] : 0; - } - - const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); - const u32 aaCount = rose->activeArrayCount; - - const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); - assert(ISALIGNED(it)); - - u32 idx = 0; - struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - - for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); - qi != MMB_INVALID; - qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { - DEBUG_PRINTF("checking nfa %u\n", qi); - struct mq *q = scratch->queues + qi; - assert(q->nfa == getNfaByQueue(rose, qi)); - assert(nfaAcceptsEod(q->nfa)); - - if (is_streaming) { - // Decompress stream state. - nfaExpandState(q->nfa, q->state, q->streamState, offset, key); - } - - if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, - roseReportAdaptor, roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return HWLM_TERMINATE_MATCHING; - } - } - - return HWLM_CONTINUE_MATCHING; -} - -static rose_inline -hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, - struct hs_scratch *scratch, u64a offset) { - const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); - const u32 aaCount = rose->activeArrayCount; - - for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; - qi = mmbit_iterate(aa, aaCount, qi)) { - DEBUG_PRINTF("checking nfa %u\n", qi); - struct mq *q = scratch->queues + qi; - assert(q->nfa == getNfaByQueue(rose, qi)); - assert(nfaAcceptsEod(q->nfa)); - - /* We have just been triggered. */ - assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); - - pushQueueNoMerge(q, MQE_END, scratch->core_info.len); - q->context = NULL; - - /* rose exec is used as we don't want to / can't raise matches in the - * history buffer. 
*/ - if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { - DEBUG_PRINTF("nfa is dead\n"); - continue; - } - if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, - roseReportAdaptor, roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return HWLM_TERMINATE_MATCHING; - } - } - return HWLM_CONTINUE_MATCHING; -} - -static rose_inline -hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, - struct hs_scratch *scratch, u64a offset) { - assert(rose->ematcherOffset); - assert(rose->ematcherRegionSize); - - // Clear role state and active engines, since we have already handled all - // outstanding work there. - DEBUG_PRINTF("clear role state and active leaf array\n"); - char *state = scratch->core_info.state; - mmbit_clear(getRoleState(state), rose->rolesWithStateCount); - mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); - - const char is_streaming = rose->mode != HS_MODE_BLOCK; - - size_t eod_len; - const u8 *eod_data; - if (!is_streaming) { /* Block */ - eod_data = scratch->core_info.buf; - eod_len = scratch->core_info.len; - } else { /* Streaming */ - eod_len = scratch->core_info.hlen; - eod_data = scratch->core_info.hbuf; - } - - assert(eod_data); - assert(eod_len); - - DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, - offset); - - // If we don't have enough bytes to produce a match from an EOD table scan, - // there's no point scanning. - if (eod_len < rose->eodmatcherMinWidth) { - DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth); - return HWLM_CONTINUE_MATCHING; - } - - // Ensure that we only need scan the last N bytes, where N is the length of - // the eod-anchored matcher region. - size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); - - const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset); - hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, - scratch->tctxt.groups); - - // We may need to fire delayed matches. - if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return HWLM_TERMINATE_MATCHING; - } - - roseFlushLastByteHistory(rose, scratch, offset); - return HWLM_CONTINUE_MATCHING; -} - -static -void updateSeqPoint(struct RoseContext *tctxt, u64a offset, - const char from_mpv) { - if (from_mpv) { - updateMinMatchOffsetFromMpv(tctxt, offset); - } else { - updateMinMatchOffset(tctxt, offset); - } -} - -#define PROGRAM_CASE(name) \ - case ROSE_INSTR_##name: { \ - DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ - programOffset + (u32)(pc - pc_base)); \ - const struct ROSE_STRUCT_##name *ri = \ - (const struct ROSE_STRUCT_##name *)pc; - -#define PROGRAM_NEXT_INSTRUCTION \ - pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ - break; \ - } - -static rose_inline hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, size_t match_len, - char in_anchored, char in_catchup, char from_mpv, - char skip_mpv_catchup) { - DEBUG_PRINTF("program=%u, offsets [%llu,%llu]\n", programOffset, som, end); - - assert(programOffset >= sizeof(struct RoseEngine)); - assert(programOffset < t->size); - - const char *pc_base = getByOffset(t, programOffset); - const char *pc = pc_base; - - // Local sparse iterator state for programs that use the SPARSE_ITER_BEGIN - // and SPARSE_ITER_NEXT instructions. 
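    /* The si_state array declared below is shared between a single
     * SPARSE_ITER_BEGIN instruction and any SPARSE_ITER_NEXT instructions
     * that resume it within one program run. A minimal sketch of the
     * begin/next pattern, using the multibit calls seen elsewhere in this
     * file (illustrative only):
     *
     *     u32 idx = 0;
     *     struct mmbit_sparse_state s[MAX_SPARSE_ITER_STATES];
     *     for (u32 i = mmbit_sparse_iter_begin(bits, total, &idx, it, s);
     *          i != MMB_INVALID;
     *          i = mmbit_sparse_iter_next(bits, total, i, &idx, it, s)) {
     *         // i is the index of the next set bit; idx is its rank in the
     *         // iterator, used to select a jump table entry.
     *     }
     */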
- struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - - // If this program has an effect, work_done will be set to one (which may - // allow the program to squash groups). - int work_done = 0; - - struct RoseContext *tctxt = &scratch->tctxt; - - assert(*(const u8 *)pc != ROSE_INSTR_END); - - for (;;) { - assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); - assert(pc >= pc_base); - assert((size_t)(pc - pc_base) < t->size); - const u8 code = *(const u8 *)pc; - assert(code <= ROSE_INSTR_END); - - switch ((enum RoseInstructionCode)code) { - PROGRAM_CASE(ANCHORED_DELAY) { - if (in_anchored && end > t->floatingMinLiteralMatchOffset) { - DEBUG_PRINTF("delay until playback\n"); - tctxt->groups |= ri->groups; - work_done = 1; - assert(ri->done_jump); // must progress - pc += ri->done_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LIT_MASK) { - assert(match_len); - struct core_info *ci = &scratch->core_info; - if (!roseCheckBenefits(ci, end, match_len, ri->and_mask.a8, - ri->cmp_mask.a8)) { - DEBUG_PRINTF("halt: failed mask check\n"); - return HWLM_CONTINUE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LIT_EARLY) { - if (end < ri->min_offset) { - DEBUG_PRINTF("halt: before min_offset=%u\n", - ri->min_offset); - return HWLM_CONTINUE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_GROUPS) { - DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", - tctxt->groups, ri->groups); - if (!(ri->groups & tctxt->groups)) { - DEBUG_PRINTF("halt: no groups are set\n"); - return HWLM_CONTINUE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_ONLY_EOD) { - struct core_info *ci = &scratch->core_info; - if (end != ci->buf_offset + ci->len) { - DEBUG_PRINTF("should only match at end of data\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_BOUNDS) { - if (!roseCheckBounds(end, ri->min_bound, ri->max_bound)) { - DEBUG_PRINTF("failed bounds check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_NOT_HANDLED) { - struct fatbit *handled = scratch->handled_roles; - if (fatbit_set(handled, t->handledKeyCount, ri->key)) { - DEBUG_PRINTF("key %u already set\n", ri->key); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LOOKAROUND) { - if (!roseCheckLookaround(t, scratch, ri->index, ri->count, - end)) { - DEBUG_PRINTF("failed lookaround check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_INFIX) { - if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, - end)) { - DEBUG_PRINTF("failed infix check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_PREFIX) { - if (!roseTestPrefix(t, scratch, ri->queue, ri->lag, ri->report, - end)) { - DEBUG_PRINTF("failed prefix check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(PUSH_DELAYED) { - rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CATCH_UP) { - if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - 
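    /* Each PROGRAM_CASE/PROGRAM_NEXT_INSTRUCTION pair above expands to a
     * switch case keyed on the instruction's code byte, followed by an
     * advance of pc over the instruction's rounded-up size. Roughly, as an
     * illustrative expansion of the CATCH_UP case:
     *
     *     case ROSE_INSTR_CATCH_UP: {
     *         const struct ROSE_STRUCT_CATCH_UP *ri =
     *             (const struct ROSE_STRUCT_CATCH_UP *)pc;
     *         // ... instruction body ...
     *         pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN);
     *         break;
     *     }
     *
     * Branching instructions bypass this advance by adding fail_jump or
     * done_jump to pc and continuing the interpreter loop directly. */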
PROGRAM_CASE(CATCH_UP_MPV) { - if (from_mpv || skip_mpv_catchup) { - DEBUG_PRINTF("skipping mpv catchup\n"); - } else if (roseCatchUpMPV(t, - end - scratch->core_info.buf_offset, - scratch) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_ADJUST) { - assert(ri->distance <= end); - som = end - ri->distance; - DEBUG_PRINTF("som is (end - %u) = %llu\n", ri->distance, som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_LEFTFIX) { - som = roseGetHaigSom(t, scratch, ri->queue, ri->lag); - DEBUG_PRINTF("som from leftfix is %llu\n", som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_FROM_REPORT) { - som = handleSomExternal(scratch, &ri->som, end); - DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, - som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_ZERO) { - DEBUG_PRINTF("setting SOM to zero\n"); - som = 0; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(TRIGGER_INFIX) { - roseTriggerInfix(t, scratch, som, end, ri->queue, ri->event, - ri->cancel); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(TRIGGER_SUFFIX) { - if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, - end) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE) { - updateSeqPoint(tctxt, end, from_mpv); - const char do_som = t->hasSom; // TODO: constant propagate - const char is_external_report = 1; - enum DedupeResult rv = - dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, - ri->dkey, ri->offset_adjust, - is_external_report, ri->quash_som, do_som); - switch (rv) { - case DEDUPE_HALT: - return HWLM_TERMINATE_MATCHING; - case DEDUPE_SKIP: - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - case DEDUPE_CONTINUE: - break; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE_SOM) { - updateSeqPoint(tctxt, end, from_mpv); - const char is_external_report = 0; - const char do_som = 1; - enum DedupeResult rv = - dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, - ri->dkey, ri->offset_adjust, - is_external_report, ri->quash_som, do_som); - switch (rv) { - case DEDUPE_HALT: - return HWLM_TERMINATE_MATCHING; - case DEDUPE_SKIP: - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - case DEDUPE_CONTINUE: - break; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_CHAIN) { - // Note: sequence points updated inside this function. 
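    /* A "sequence point" here is the minimum match offset maintained in
     * tctxt and advanced by updateSeqPoint (defined above); the report
     * instructions bump it before delivering a match so that catch-up and
     * deduplication agree on how far the scan has committed. Chain matches
     * defer this to the callee, as noted. */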
- if (roseCatchUpAndHandleChainMatch( - t, scratch, ri->event, ri->top_squash_distance, end, - in_catchup) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_INT) { - updateSeqPoint(tctxt, end, from_mpv); - roseHandleSom(t, scratch, &ri->som, end); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_AWARE) { - updateSeqPoint(tctxt, end, from_mpv); - roseHandleSomSom(t, scratch, &ri->som, som, end); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_EXHAUST) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - ri->ekey) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReportSom(t, scratch, som, end, ri->onmatch, - ri->offset_adjust, - INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_EXHAUST) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReportSom(t, scratch, som, end, ri->onmatch, - ri->offset_adjust, - ri->ekey) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE_AND_REPORT) { - updateSeqPoint(tctxt, end, from_mpv); - const char do_som = t->hasSom; // TODO: constant propagate - const char is_external_report = 1; - enum DedupeResult rv = - dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, - ri->dkey, ri->offset_adjust, - is_external_report, ri->quash_som, do_som); - switch (rv) { - case DEDUPE_HALT: - return HWLM_TERMINATE_MATCHING; - case DEDUPE_SKIP: - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - case DEDUPE_CONTINUE: - break; - } - - const u32 ekey = INVALID_EKEY; - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - ekey) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(FINAL_REPORT) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - /* One-shot specialisation: this instruction always terminates - * execution of the program. 
*/ - return HWLM_CONTINUE_MATCHING; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_EXHAUSTED) { - DEBUG_PRINTF("check ekey %u\n", ri->ekey); - assert(ri->ekey != INVALID_EKEY); - assert(ri->ekey < t->ekeyCount); - const char *evec = scratch->core_info.exhaustionVector; - if (isExhausted(t, evec, ri->ekey)) { - DEBUG_PRINTF("ekey %u already set, match is exhausted\n", - ri->ekey); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_MIN_LENGTH) { - DEBUG_PRINTF("check min length %llu (adj %d)\n", ri->min_length, - ri->end_adj); - assert(ri->min_length > 0); - assert(ri->end_adj == 0 || ri->end_adj == -1); - assert(som == HS_OFFSET_PAST_HORIZON || som <= end); - if (som != HS_OFFSET_PAST_HORIZON && - ((end + ri->end_adj) - som < ri->min_length)) { - DEBUG_PRINTF("failed check, match len %llu\n", - (u64a)((end + ri->end_adj) - som)); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SET_STATE) { - DEBUG_PRINTF("set state index %u\n", ri->index); - mmbit_set(getRoleState(scratch->core_info.state), - t->rolesWithStateCount, ri->index); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SET_GROUPS) { - tctxt->groups |= ri->groups; - DEBUG_PRINTF("set groups 0x%llx -> 0x%llx\n", ri->groups, - tctxt->groups); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SQUASH_GROUPS) { - assert(popcount64(ri->groups) == 63); // Squash only one group. - if (work_done) { - tctxt->groups &= ri->groups; - DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, - tctxt->groups); - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_STATE) { - DEBUG_PRINTF("check state %u\n", ri->index); - const u8 *roles = getRoleState(scratch->core_info.state); - if (!mmbit_isset(roles, t->rolesWithStateCount, ri->index)) { - DEBUG_PRINTF("state not on\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SPARSE_ITER_BEGIN) { - DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); - const struct mmbit_sparse_iter *it = - getByOffset(t, ri->iter_offset); - assert(ISALIGNED(it)); - - const u8 *roles = getRoleState(scratch->core_info.state); - - u32 idx = 0; - u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, - &idx, it, si_state); - if (i == MMB_INVALID) { - DEBUG_PRINTF("no states in sparse iter are on\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - - fatbit_clear(scratch->handled_roles); - - const u32 *jumps = getByOffset(t, ri->jump_table); - DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, - jumps[idx]); - pc = pc_base + jumps[idx]; - continue; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SPARSE_ITER_NEXT) { - DEBUG_PRINTF("iter_offset=%u, state=%u\n", ri->iter_offset, - ri->state); - const struct mmbit_sparse_iter *it = - getByOffset(t, ri->iter_offset); - assert(ISALIGNED(it)); - - const u8 *roles = getRoleState(scratch->core_info.state); - - u32 idx = 0; - u32 i = mmbit_sparse_iter_next(roles, t->rolesWithStateCount, - ri->state, &idx, it, si_state); - if (i == MMB_INVALID) { - DEBUG_PRINTF("no more states in sparse iter are on\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - - const u32 *jumps = getByOffset(t, ri->jump_table); - DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, - jumps[idx]); - pc = pc_base + jumps[idx]; - continue; - } - 
PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(ENGINES_EOD) { - if (roseEnginesEod(t, scratch, end, ri->iter_offset) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SUFFIXES_EOD) { - if (roseSuffixesEod(t, scratch, end) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(MATCHER_EOD) { - if (roseMatcherEod(t, scratch, end) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(END) { - DEBUG_PRINTF("finished\n"); - return HWLM_CONTINUE_MATCHING; - } - PROGRAM_NEXT_INSTRUCTION - } - } - - assert(0); // unreachable - return HWLM_CONTINUE_MATCHING; -} - -#undef PROGRAM_CASE -#undef PROGRAM_NEXT_INSTRUCTION + u64a som, u64a end, size_t match_len, u8 prog_flags); #endif // PROGRAM_RUNTIME_H diff --git a/src/rose/stream.c b/src/rose/stream.c index 181bfe65..c0b69f4c 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -705,13 +705,10 @@ void roseStreamEodExec(const struct RoseEngine *t, u64a offset, const u64a som = 0; const size_t match_len = 0; - const char in_anchored = 0; - const char in_catchup = 0; - const char from_mpv = 0; - const char skip_mpv_catchup = 1; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; // Note: we ignore the result, as this is the last thing to ever happen on // a scan. roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, match_len, - in_anchored, in_catchup, from_mpv, skip_mpv_catchup); + flags); } From 013dbd3b3c48ef72e10175a5aef157357f08120c Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 27 Jun 2016 12:48:46 +1000 Subject: [PATCH 074/166] rose: re-inline literal handling program exec --- src/rose/match.c | 29 +- src/rose/program_runtime.c | 1374 +---------------------------------- src/rose/program_runtime.h | 1378 +++++++++++++++++++++++++++++++++++- 3 files changed, 1401 insertions(+), 1380 deletions(-) diff --git a/src/rose/match.c b/src/rose/match.c index 0311d496..bea2b5d2 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -261,8 +261,31 @@ int roseAnchoredCallback(u64a end, u32 id, void *ctx) { return MO_CONTINUE_MATCHING; } -// Rose match-processing workhorse -/* assumes not in_anchored */ +/** + * \brief Run the program for the given literal ID, with the interpreter + * inlined into this call. + * + * Assumes not in_anchored. + */ +static really_inline +hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end, + size_t match_len, u32 id) { + DEBUG_PRINTF("id=%u\n", id); + const u32 *programs = getByOffset(t, t->litProgramOffset); + assert(id < t->literalCount); + const u64a som = 0; + const u8 flags = 0; + return roseRunProgram_i(t, scratch, programs[id], som, end, match_len, + flags); +} + +/** + * \brief Run the program for the given literal ID, with the interpreter + * out of line. + * + * Assumes not in_anchored. 
+ */ static really_inline hwlmcb_rv_t roseProcessMatch(const struct RoseEngine *t, struct hs_scratch *scratch, u64a end, @@ -535,7 +558,7 @@ hwlmcb_rv_t roseCallback_i(size_t start, size_t end, u32 id, void *ctxt) { } size_t match_len = end - start + 1; - rv = roseProcessMatch(t, scratch, real_end, match_len, id); + rv = roseProcessMatchInline(t, scratch, real_end, match_len, id); DEBUG_PRINTF("DONE groups=0x%016llx\n", tctx->groups); diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 73a9e974..7669103f 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -33,657 +33,6 @@ #include "program_runtime.h" -#include "catchup.h" -#include "counting_miracle.h" -#include "infix.h" -#include "match.h" -#include "miracle.h" -#include "report.h" -#include "rose.h" -#include "rose_internal.h" -#include "rose_program.h" -#include "rose_types.h" -#include "runtime.h" -#include "scratch.h" -#include "ue2common.h" -#include "util/compare.h" -#include "util/fatbit.h" -#include "util/multibit.h" - -static rose_inline -int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, - const u8 *and_mask, const u8 *exp_mask) { - const u8 *data; - - // If the check works over part of the history and part of the buffer, we - // create a temporary copy of the data in here so it's contiguous. - u8 temp[MAX_MASK2_WIDTH]; - - s64a buffer_offset = (s64a)end - ci->buf_offset; - DEBUG_PRINTF("rel offset %lld\n", buffer_offset); - if (buffer_offset >= mask_rewind) { - data = ci->buf + buffer_offset - mask_rewind; - DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, - ci->buf, mask_rewind); - } else if (buffer_offset <= 0) { - data = ci->hbuf + ci->hlen + buffer_offset - mask_rewind; - DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, - ci->buf, mask_rewind); - } else { - u32 shortfall = mask_rewind - buffer_offset; - DEBUG_PRINTF("shortfall of %u, rewind %u hlen %zu\n", shortfall, - mask_rewind, ci->hlen); - data = temp; - memcpy(temp, ci->hbuf + ci->hlen - shortfall, shortfall); - memcpy(temp + shortfall, ci->buf, mask_rewind - shortfall); - } - -#ifdef DEBUG - DEBUG_PRINTF("DATA: "); - for (u32 i = 0; i < mask_rewind; i++) { - printf("%c", ourisprint(data[i]) ? 
data[i] : '?'); - } - printf(" (len=%u)\n", mask_rewind); -#endif - - u32 len = mask_rewind; - while (len >= sizeof(u64a)) { - u64a a = unaligned_load_u64a(data); - a &= *(const u64a *)and_mask; - if (a != *(const u64a *)exp_mask) { - DEBUG_PRINTF("argh %016llx %016llx\n", a, *(const u64a *)exp_mask); - return 0; - } - data += sizeof(u64a); - and_mask += sizeof(u64a); - exp_mask += sizeof(u64a); - len -= sizeof(u64a); - } - - while (len) { - u8 a = *data; - a &= *and_mask; - if (a != *exp_mask) { - DEBUG_PRINTF("argh d%02hhx =%02hhx am%02hhx em%02hhx\n", a, - *data, *and_mask, *exp_mask); - return 0; - } - data++; - and_mask++; - exp_mask++; - len--; - } - - return 1; -} - -static rose_inline -void rosePushDelayedMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 delay, - u32 delay_index, u64a offset) { - assert(delay); - - const u32 src_slot_index = delay; - u32 slot_index = (src_slot_index + offset) & DELAY_MASK; - - struct RoseContext *tctxt = &scratch->tctxt; - if (offset + src_slot_index <= tctxt->delayLastEndOffset) { - DEBUG_PRINTF("skip too late\n"); - return; - } - - const u32 delay_count = t->delay_count; - struct fatbit **delaySlots = getDelaySlots(scratch); - struct fatbit *slot = delaySlots[slot_index]; - - DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index); - if (!(tctxt->filledDelayedSlots & (1U << slot_index))) { - tctxt->filledDelayedSlots |= 1U << slot_index; - fatbit_clear(slot); - } - - fatbit_set(slot, delay_count, delay_index); -} - -static rose_inline -char roseLeftfixCheckMiracles(const struct RoseEngine *t, - const struct LeftNfaInfo *left, - struct core_info *ci, struct mq *q, u64a end, - const char is_infix) { - if (!is_infix && left->transient) { - // Miracles won't help us with transient leftfix engines; they only - // scan for a limited time anyway. - return 1; - } - - if (!left->stopTable) { - return 1; - } - - DEBUG_PRINTF("looking for miracle on queue %u\n", q->nfa->queueIndex); - - const s64a begin_loc = q_cur_loc(q); - const s64a end_loc = end - ci->buf_offset; - - s64a miracle_loc; - if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { - goto found_miracle; - } - - if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, - &miracle_loc)) { - goto found_miracle; - } - - return 1; - -found_miracle: - DEBUG_PRINTF("miracle at %lld\n", miracle_loc); - assert(miracle_loc >= begin_loc); - - // If we're a prefix, then a miracle effectively results in us needing to - // re-init our state and start fresh. - if (!is_infix) { - if (miracle_loc != begin_loc) { - DEBUG_PRINTF("re-init prefix state\n"); - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, miracle_loc); - pushQueueAt(q, 1, MQE_TOP, miracle_loc); - nfaQueueInitState(q->nfa, q); - } - return 1; - } - - // Otherwise, we're an infix. Remove tops before the miracle from the queue - // and re-init at that location. 
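    /* Loosely, a "miracle" is a scanned byte that is known to have killed
     * the engine (the search above is gated on the leftfix's stop table),
     * so no state from before that location can survive. For an infix this
     * means queued tops up to miracle_loc can be discarded, as below. */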
- - q_skip_forward_to(q, miracle_loc); - - if (q_last_type(q) == MQE_START) { - DEBUG_PRINTF("miracle caused infix to die\n"); - return 0; - } - - DEBUG_PRINTF("re-init infix state\n"); - assert(q->items[q->cur].type == MQE_START); - q->items[q->cur].location = miracle_loc; - nfaQueueInitState(q->nfa, q); - - return 1; -} - -static rose_inline -hwlmcb_rv_t roseTriggerSuffix(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 qi, u32 top, - u64a som, u64a end) { - DEBUG_PRINTF("suffix qi=%u, top event=%u\n", qi, top); - - struct core_info *ci = &scratch->core_info; - u8 *aa = getActiveLeafArray(t, ci->state); - const u32 aaCount = t->activeArrayCount; - const u32 qCount = t->queueCount; - struct mq *q = &scratch->queues[qi]; - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - const struct NFA *nfa = getNfaByInfo(t, info); - - s64a loc = (s64a)end - ci->buf_offset; - assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); - - if (!mmbit_set(aa, aaCount, qi)) { - initQueue(q, qi, t, scratch); - nfaQueueInitState(nfa, q); - pushQueueAt(q, 0, MQE_START, loc); - fatbit_set(scratch->aqa, qCount, qi); - } else if (info->no_retrigger) { - DEBUG_PRINTF("yawn\n"); - /* nfa only needs one top; we can go home now */ - return HWLM_CONTINUE_MATCHING; - } else if (!fatbit_set(scratch->aqa, qCount, qi)) { - initQueue(q, qi, t, scratch); - loadStreamState(nfa, q, 0); - pushQueueAt(q, 0, MQE_START, 0); - } else if (isQueueFull(q)) { - DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); - if (info->eod) { - /* can catch up suffix independently no pq */ - q->context = NULL; - pushQueueNoMerge(q, MQE_END, loc); - nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } else if (ensureQueueFlushed(t, scratch, qi, loc) - == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - - assert(top == MQE_TOP || (top >= MQE_TOP_FIRST && top < MQE_INVALID)); - pushQueueSom(q, top, loc, som); - - if (q_cur_loc(q) == (s64a)ci->len && !info->eod) { - /* we may not run the nfa; need to ensure state is fine */ - DEBUG_PRINTF("empty run\n"); - pushQueueNoMerge(q, MQE_END, loc); - char alive = nfaQueueExec(nfa, q, loc); - if (alive) { - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } else { - mmbit_unset(aa, aaCount, qi); - fatbit_unset(scratch->aqa, qCount, qi); - } - } - - return HWLM_CONTINUE_MATCHING; -} - -static really_inline -char roseTestLeftfix(const struct RoseEngine *t, struct hs_scratch *scratch, - u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end, - const char is_infix) { - struct core_info *ci = &scratch->core_info; - - u32 ri = queueToLeftIndex(t, qi); - const struct LeftNfaInfo *left = getLeftTable(t) + ri; - - DEBUG_PRINTF("testing %s %s %u/%u with lag %u (maxLag=%u)\n", - (left->transient ? "transient" : "active"), - (is_infix ? "infix" : "prefix"), - ri, qi, leftfixLag, left->maxLag); - - assert(leftfixLag <= left->maxLag); - assert(left->infix == is_infix); - assert(!is_infix || !left->transient); // Only prefixes can be transient. 
- - struct mq *q = scratch->queues + qi; - char *state = scratch->core_info.state; - u8 *activeLeftArray = getActiveLeftArray(t, state); - u32 qCount = t->queueCount; - u32 arCount = t->activeLeftCount; - - if (!mmbit_isset(activeLeftArray, arCount, ri)) { - DEBUG_PRINTF("engine is dead nothing to see here\n"); - return 0; - } - - if (unlikely(end < leftfixLag)) { - assert(0); /* lag is the literal length */ - return 0; - } - - if (nfaSupportsZombie(getNfaByQueue(t, qi)) && ci->buf_offset - && !fatbit_isset(scratch->aqa, qCount, qi) - && isZombie(t, state, left)) { - DEBUG_PRINTF("zombie\n"); - return 1; - } - - if (!fatbit_set(scratch->aqa, qCount, qi)) { - DEBUG_PRINTF("initing q %u\n", qi); - initRoseQueue(t, qi, left, scratch); - if (ci->buf_offset) { // there have been writes before us! - s32 sp; - if (!is_infix && left->transient) { - sp = -(s32)ci->hlen; - } else { - sp = -(s32)loadRoseDelay(t, state, left); - } - - /* transient nfas are always started fresh -> state not maintained - * at stream boundary */ - - pushQueueAt(q, 0, MQE_START, sp); - if (is_infix || (ci->buf_offset + sp > 0 && !left->transient)) { - loadStreamState(q->nfa, q, sp); - } else { - pushQueueAt(q, 1, MQE_TOP, sp); - nfaQueueInitState(q->nfa, q); - } - } else { // first write ever - pushQueueAt(q, 0, MQE_START, 0); - pushQueueAt(q, 1, MQE_TOP, 0); - nfaQueueInitState(q->nfa, q); - } - } - - s64a loc = (s64a)end - ci->buf_offset - leftfixLag; - assert(loc >= q_cur_loc(q) || left->eager); - assert(leftfixReport != MO_INVALID_IDX); - - if (!is_infix && left->transient) { - s64a start_loc = loc - left->transient; - if (q_cur_loc(q) < start_loc) { - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, start_loc); - pushQueueAt(q, 1, MQE_TOP, start_loc); - nfaQueueInitState(q->nfa, q); - } - } - - if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) { - if (is_infix) { - if (infixTooOld(q, loc)) { - DEBUG_PRINTF("infix %u died of old age\n", ri); - goto nfa_dead; - } - - reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); - } - - if (!roseLeftfixCheckMiracles(t, left, ci, q, end, is_infix)) { - DEBUG_PRINTF("leftfix %u died due to miracle\n", ri); - goto nfa_dead; - } - -#ifdef DEBUG - debugQueue(q); -#endif - - pushQueueNoMerge(q, MQE_END, loc); - - char rv = nfaQueueExecRose(q->nfa, q, leftfixReport); - if (!rv) { /* nfa is dead */ - DEBUG_PRINTF("leftfix %u died while trying to catch up\n", ri); - goto nfa_dead; - } - - // Queue must have next start loc before we call nfaInAcceptState. - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - - DEBUG_PRINTF("checking for report %u\n", leftfixReport); - DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); - return rv == MO_MATCHES_PENDING; - } else if (q_cur_loc(q) > loc) { - /* an eager leftfix may have already progressed past loc if there is no - * match at loc. 
*/ - assert(left->eager); - return 0; - } else { - assert(q_cur_loc(q) == loc); - DEBUG_PRINTF("checking for report %u\n", leftfixReport); - char rv = nfaInAcceptState(q->nfa, leftfixReport, q); - DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); - return rv; - } - -nfa_dead: - mmbit_unset(activeLeftArray, arCount, ri); - scratch->tctxt.groups &= left->squash_mask; - return 0; -} - -static rose_inline -char roseTestPrefix(const struct RoseEngine *t, struct hs_scratch *scratch, - u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { - return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 0); -} - -static rose_inline -char roseTestInfix(const struct RoseEngine *t, struct hs_scratch *scratch, - u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { - return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 1); -} - -static rose_inline -void roseTriggerInfix(const struct RoseEngine *t, struct hs_scratch *scratch, - u64a start, u64a end, u32 qi, u32 topEvent, u8 cancel) { - struct core_info *ci = &scratch->core_info; - s64a loc = (s64a)end - ci->buf_offset; - - u32 ri = queueToLeftIndex(t, qi); - assert(topEvent < MQE_INVALID); - - const struct LeftNfaInfo *left = getLeftInfoByQueue(t, qi); - assert(!left->transient); - - DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, topEvent); - - struct mq *q = scratch->queues + qi; - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - - char *state = ci->state; - u8 *activeLeftArray = getActiveLeftArray(t, state); - const u32 arCount = t->activeLeftCount; - char alive = mmbit_set(activeLeftArray, arCount, ri); - - if (alive && info->no_retrigger) { - DEBUG_PRINTF("yawn\n"); - return; - } - - struct fatbit *aqa = scratch->aqa; - const u32 qCount = t->queueCount; - - if (alive && nfaSupportsZombie(getNfaByInfo(t, info)) && ci->buf_offset && - !fatbit_isset(aqa, qCount, qi) && isZombie(t, state, left)) { - DEBUG_PRINTF("yawn - zombie\n"); - return; - } - - if (cancel) { - DEBUG_PRINTF("dominating top: (re)init\n"); - fatbit_set(aqa, qCount, qi); - initRoseQueue(t, qi, left, scratch); - pushQueueAt(q, 0, MQE_START, loc); - nfaQueueInitState(q->nfa, q); - } else if (!fatbit_set(aqa, qCount, qi)) { - DEBUG_PRINTF("initing %u\n", qi); - initRoseQueue(t, qi, left, scratch); - if (alive) { - s32 sp = -(s32)loadRoseDelay(t, state, left); - pushQueueAt(q, 0, MQE_START, sp); - loadStreamState(q->nfa, q, sp); - } else { - pushQueueAt(q, 0, MQE_START, loc); - nfaQueueInitState(q->nfa, q); - } - } else if (!alive) { - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - nfaQueueInitState(q->nfa, q); - } else if (isQueueFull(q)) { - reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); - - if (isQueueFull(q)) { - /* still full - reduceInfixQueue did nothing */ - DEBUG_PRINTF("queue %u full (%u items) -> catching up nfa\n", qi, - q->end - q->cur); - pushQueueNoMerge(q, MQE_END, loc); - nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); - - q->cur = q->end = 0; - pushQueueAt(q, 0, MQE_START, loc); - } - } - - pushQueueSom(q, topEvent, loc, start); -} - -static rose_inline -hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, - u64a end, ReportID onmatch, s32 offset_adjust, - u32 ekey) { - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); - updateLastMatchOffset(&scratch->tctxt, end); - - int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); - if (cb_rv == MO_HALT_MATCHING) { - 
DEBUG_PRINTF("termination requested\n"); - return HWLM_TERMINATE_MATCHING; - } - - if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return HWLM_CONTINUE_MATCHING; - } - - return roseHaltIfExhausted(t, scratch); -} - -/* catches up engines enough to ensure any earlier mpv triggers are enqueued - * and then adds the trigger to the mpv queue. Must not be called during catch - * up */ -static rose_inline -hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, - u32 event, u64a top_squash_distance, - u64a end, const char in_catchup) { - if (!in_catchup && - roseCatchUpMpvFeeders(t, scratch, end) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - return roseHandleChainMatch(t, scratch, event, top_squash_distance, end, - in_catchup); -} - -static rose_inline -void roseHandleSom(UNUSED const struct RoseEngine *t, - struct hs_scratch *scratch, const struct som_operation *sr, - u64a end) { - DEBUG_PRINTF("end=%llu, minMatchOffset=%llu\n", end, - scratch->tctxt.minMatchOffset); - - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - updateLastMatchOffset(&scratch->tctxt, end); - handleSomInternal(scratch, sr, end); -} - -static rose_inline -hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, - struct hs_scratch *scratch, u64a start, u64a end, - ReportID onmatch, s32 offset_adjust, u32 ekey) { - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - DEBUG_PRINTF("firing som callback onmatch=%u, start=%llu, end=%llu\n", - onmatch, start, end); - updateLastMatchOffset(&scratch->tctxt, end); - - int cb_rv = roseDeliverSomReport(start, end, onmatch, offset_adjust, - scratch, ekey); - if (cb_rv == MO_HALT_MATCHING) { - DEBUG_PRINTF("termination requested\n"); - return HWLM_TERMINATE_MATCHING; - } - - if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return HWLM_CONTINUE_MATCHING; - } - - return roseHaltIfExhausted(t, scratch); -} - -static rose_inline -void roseHandleSomSom(UNUSED const struct RoseEngine *t, - struct hs_scratch *scratch, - const struct som_operation *sr, u64a start, u64a end) { - DEBUG_PRINTF("start=%llu, end=%llu, minMatchOffset=%llu\n", start, end, - scratch->tctxt.minMatchOffset); - - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); - updateLastMatchOffset(&scratch->tctxt, end); - setSomFromSomAware(scratch, sr, start, end); -} - -static really_inline -int reachHasBit(const u8 *reach, u8 c) { - return !!(reach[c / 8U] & (u8)1U << (c % 8U)); -} - -/** - * \brief Scan around a literal, checking that that "lookaround" reach masks - * are satisfied. - */ -static rose_inline -int roseCheckLookaround(const struct RoseEngine *t, - const struct hs_scratch *scratch, u32 lookaroundIndex, - u32 lookaroundCount, u64a end) { - assert(lookaroundIndex != MO_INVALID_IDX); - assert(lookaroundCount > 0); - - const struct core_info *ci = &scratch->core_info; - DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, - ci->buf_offset, ci->buf_offset + ci->len); - - const u8 *base = (const u8 *)t; - const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset); - const s8 *look = look_base + lookaroundIndex; - const s8 *look_end = look + lookaroundCount; - assert(look < look_end); - - const u8 *reach_base = base + t->lookaroundReachOffset; - const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN; - - // The following code assumes that the lookaround structures are ordered by - // increasing offset. 
- - const s64a base_offset = end - ci->buf_offset; - DEBUG_PRINTF("base_offset=%lld\n", base_offset); - DEBUG_PRINTF("first look has offset %d\n", *look); - - // If our first check tells us we need to look at an offset before the - // start of the stream, this role cannot match. - if (unlikely(*look < 0 && (u64a)(0 - *look) > end)) { - DEBUG_PRINTF("too early, fail\n"); - return 0; - } - - // Skip over offsets that are before the history buffer. - do { - s64a offset = base_offset + *look; - if (offset >= -(s64a)ci->hlen) { - goto in_history; - } - DEBUG_PRINTF("look=%d before history\n", *look); - look++; - reach += REACH_BITVECTOR_LEN; - } while (look < look_end); - - // History buffer. - DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); - for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { - in_history: - ; - s64a offset = base_offset + *look; - DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); - - if (offset >= 0) { - DEBUG_PRINTF("in buffer\n"); - goto in_buffer; - } - - assert(offset >= -(s64a)ci->hlen && offset < 0); - u8 c = ci->hbuf[ci->hlen + offset]; - if (!reachHasBit(reach, c)) { - DEBUG_PRINTF("char 0x%02x failed reach check\n", c); - return 0; - } - } - // Current buffer. - DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); - for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { - in_buffer: - ; - s64a offset = base_offset + *look; - DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); - - if (offset >= (s64a)ci->len) { - DEBUG_PRINTF("in the future\n"); - break; - } - - assert(offset >= 0 && offset < (s64a)ci->len); - u8 c = ci->buf[offset]; - if (!reachHasBit(reach, c)) { - DEBUG_PRINTF("char 0x%02x failed reach check\n", c); - return 0; - } - } - - DEBUG_PRINTF("OK :)\n"); - return 1; -} - -static int roseNfaEarliestSom(u64a from_offset, UNUSED u64a offset, UNUSED ReportID id, void *context) { u64a *som = context; @@ -691,731 +40,10 @@ int roseNfaEarliestSom(u64a from_offset, UNUSED u64a offset, UNUSED ReportID id, return MO_CONTINUE_MATCHING; } -static rose_inline -u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, - const u32 qi, UNUSED const u32 leftfixLag) { - u32 ri = queueToLeftIndex(t, qi); - - UNUSED const struct LeftNfaInfo *left = getLeftTable(t) + ri; - - DEBUG_PRINTF("testing %s prefix %u/%u with lag %u (maxLag=%u)\n", - left->transient ? 
"transient" : "active", ri, qi, - leftfixLag, left->maxLag); - - assert(leftfixLag <= left->maxLag); - - struct mq *q = scratch->queues + qi; - - u64a start = ~0ULL; - - /* switch the callback + context for a fun one */ - q->som_cb = roseNfaEarliestSom; - q->context = &start; - - nfaReportCurrentMatches(q->nfa, q); - - /* restore the old callback + context */ - q->som_cb = roseNfaSomAdaptor; - q->context = NULL; - DEBUG_PRINTF("earliest som is %llu\n", start); - return start; -} - -static rose_inline -char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { - DEBUG_PRINTF("check offset=%llu against bounds [%llu,%llu]\n", end, - min_bound, max_bound); - assert(min_bound <= max_bound); - return end >= min_bound && end <= max_bound; -} - -static rose_inline -hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, - struct hs_scratch *scratch, u64a offset, - u32 iter_offset) { - const char is_streaming = rose->mode != HS_MODE_BLOCK; - - /* data, len is used for state decompress, should be full available data */ - u8 key = 0; - if (is_streaming) { - const u8 *eod_data = scratch->core_info.hbuf; - size_t eod_len = scratch->core_info.hlen; - key = eod_len ? eod_data[eod_len - 1] : 0; - } - - const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); - const u32 aaCount = rose->activeArrayCount; - - const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); - assert(ISALIGNED(it)); - - u32 idx = 0; - struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - - for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); - qi != MMB_INVALID; - qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { - DEBUG_PRINTF("checking nfa %u\n", qi); - struct mq *q = scratch->queues + qi; - assert(q->nfa == getNfaByQueue(rose, qi)); - assert(nfaAcceptsEod(q->nfa)); - - if (is_streaming) { - // Decompress stream state. - nfaExpandState(q->nfa, q->state, q->streamState, offset, key); - } - - if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, - roseReportAdaptor, roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return HWLM_TERMINATE_MATCHING; - } - } - - return HWLM_CONTINUE_MATCHING; -} - -static rose_inline -hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, - struct hs_scratch *scratch, u64a offset) { - const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); - const u32 aaCount = rose->activeArrayCount; - - for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; - qi = mmbit_iterate(aa, aaCount, qi)) { - DEBUG_PRINTF("checking nfa %u\n", qi); - struct mq *q = scratch->queues + qi; - assert(q->nfa == getNfaByQueue(rose, qi)); - assert(nfaAcceptsEod(q->nfa)); - - /* We have just been triggered. */ - assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); - - pushQueueNoMerge(q, MQE_END, scratch->core_info.len); - q->context = NULL; - - /* rose exec is used as we don't want to / can't raise matches in the - * history buffer. 
*/ - if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { - DEBUG_PRINTF("nfa is dead\n"); - continue; - } - if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, - roseReportAdaptor, roseReportSomAdaptor, - scratch) == MO_HALT_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return HWLM_TERMINATE_MATCHING; - } - } - return HWLM_CONTINUE_MATCHING; -} - -static rose_inline -hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, - struct hs_scratch *scratch, u64a offset) { - assert(rose->ematcherOffset); - assert(rose->ematcherRegionSize); - - // Clear role state and active engines, since we have already handled all - // outstanding work there. - DEBUG_PRINTF("clear role state and active leaf array\n"); - char *state = scratch->core_info.state; - mmbit_clear(getRoleState(state), rose->rolesWithStateCount); - mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); - - const char is_streaming = rose->mode != HS_MODE_BLOCK; - - size_t eod_len; - const u8 *eod_data; - if (!is_streaming) { /* Block */ - eod_data = scratch->core_info.buf; - eod_len = scratch->core_info.len; - } else { /* Streaming */ - eod_len = scratch->core_info.hlen; - eod_data = scratch->core_info.hbuf; - } - - assert(eod_data); - assert(eod_len); - - DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, - offset); - - // If we don't have enough bytes to produce a match from an EOD table scan, - // there's no point scanning. - if (eod_len < rose->eodmatcherMinWidth) { - DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth); - return HWLM_CONTINUE_MATCHING; - } - - // Ensure that we only need scan the last N bytes, where N is the length of - // the eod-anchored matcher region. - size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); - - const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset); - hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, - scratch->tctxt.groups); - - // We may need to fire delayed matches. - if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) { - DEBUG_PRINTF("user instructed us to stop\n"); - return HWLM_TERMINATE_MATCHING; - } - - roseFlushLastByteHistory(rose, scratch, offset); - return HWLM_CONTINUE_MATCHING; -} - -static -void updateSeqPoint(struct RoseContext *tctxt, u64a offset, - const char from_mpv) { - if (from_mpv) { - updateMinMatchOffsetFromMpv(tctxt, offset); - } else { - updateMinMatchOffset(tctxt, offset); - } -} - -#define PROGRAM_CASE(name) \ - case ROSE_INSTR_##name: { \ - DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ - programOffset + (u32)(pc - pc_base)); \ - const struct ROSE_STRUCT_##name *ri = \ - (const struct ROSE_STRUCT_##name *)pc; - -#define PROGRAM_NEXT_INSTRUCTION \ - pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ - break; \ - } - -static rose_inline -hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 programOffset, - u64a som, u64a end, size_t match_len, - char in_anchored, char in_catchup, char from_mpv, - char skip_mpv_catchup) { - DEBUG_PRINTF("program=%u, offsets [%llu,%llu]\n", programOffset, som, end); - - assert(programOffset >= sizeof(struct RoseEngine)); - assert(programOffset < t->size); - - const char *pc_base = getByOffset(t, programOffset); - const char *pc = pc_base; - - // Local sparse iterator state for programs that use the SPARSE_ITER_BEGIN - // and SPARSE_ITER_NEXT instructions. 
- struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; - - // If this program has an effect, work_done will be set to one (which may - // allow the program to squash groups). - int work_done = 0; - - struct RoseContext *tctxt = &scratch->tctxt; - - assert(*(const u8 *)pc != ROSE_INSTR_END); - - for (;;) { - assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); - assert(pc >= pc_base); - assert((size_t)(pc - pc_base) < t->size); - const u8 code = *(const u8 *)pc; - assert(code <= ROSE_INSTR_END); - - switch ((enum RoseInstructionCode)code) { - PROGRAM_CASE(ANCHORED_DELAY) { - if (in_anchored && end > t->floatingMinLiteralMatchOffset) { - DEBUG_PRINTF("delay until playback\n"); - tctxt->groups |= ri->groups; - work_done = 1; - assert(ri->done_jump); // must progress - pc += ri->done_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LIT_MASK) { - assert(match_len); - struct core_info *ci = &scratch->core_info; - if (!roseCheckBenefits(ci, end, match_len, ri->and_mask.a8, - ri->cmp_mask.a8)) { - DEBUG_PRINTF("halt: failed mask check\n"); - return HWLM_CONTINUE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LIT_EARLY) { - if (end < ri->min_offset) { - DEBUG_PRINTF("halt: before min_offset=%u\n", - ri->min_offset); - return HWLM_CONTINUE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_GROUPS) { - DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", - tctxt->groups, ri->groups); - if (!(ri->groups & tctxt->groups)) { - DEBUG_PRINTF("halt: no groups are set\n"); - return HWLM_CONTINUE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_ONLY_EOD) { - struct core_info *ci = &scratch->core_info; - if (end != ci->buf_offset + ci->len) { - DEBUG_PRINTF("should only match at end of data\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_BOUNDS) { - if (!roseCheckBounds(end, ri->min_bound, ri->max_bound)) { - DEBUG_PRINTF("failed bounds check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_NOT_HANDLED) { - struct fatbit *handled = scratch->handled_roles; - if (fatbit_set(handled, t->handledKeyCount, ri->key)) { - DEBUG_PRINTF("key %u already set\n", ri->key); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LOOKAROUND) { - if (!roseCheckLookaround(t, scratch, ri->index, ri->count, - end)) { - DEBUG_PRINTF("failed lookaround check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_INFIX) { - if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, - end)) { - DEBUG_PRINTF("failed infix check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_PREFIX) { - if (!roseTestPrefix(t, scratch, ri->queue, ri->lag, ri->report, - end)) { - DEBUG_PRINTF("failed prefix check\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(PUSH_DELAYED) { - rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CATCH_UP) { - if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - 
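    /* CATCH_UP_MPV (next case) is narrower than CATCH_UP: it only brings
     * the chained MPV engine up to date, and it is skipped entirely when
     * this program was invoked from the MPV itself or when the caller asked
     * to defer MPV catchup (see roseStreamEodExec's use of
     * ROSE_PROG_FLAG_SKIP_MPV_CATCHUP). */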
PROGRAM_CASE(CATCH_UP_MPV) { - if (from_mpv || skip_mpv_catchup) { - DEBUG_PRINTF("skipping mpv catchup\n"); - } else if (roseCatchUpMPV(t, - end - scratch->core_info.buf_offset, - scratch) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_ADJUST) { - assert(ri->distance <= end); - som = end - ri->distance; - DEBUG_PRINTF("som is (end - %u) = %llu\n", ri->distance, som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_LEFTFIX) { - som = roseGetHaigSom(t, scratch, ri->queue, ri->lag); - DEBUG_PRINTF("som from leftfix is %llu\n", som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_FROM_REPORT) { - som = handleSomExternal(scratch, &ri->som, end); - DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, - som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_ZERO) { - DEBUG_PRINTF("setting SOM to zero\n"); - som = 0; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(TRIGGER_INFIX) { - roseTriggerInfix(t, scratch, som, end, ri->queue, ri->event, - ri->cancel); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(TRIGGER_SUFFIX) { - if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, - end) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE) { - updateSeqPoint(tctxt, end, from_mpv); - const char do_som = t->hasSom; // TODO: constant propagate - const char is_external_report = 1; - enum DedupeResult rv = - dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, - ri->dkey, ri->offset_adjust, - is_external_report, ri->quash_som, do_som); - switch (rv) { - case DEDUPE_HALT: - return HWLM_TERMINATE_MATCHING; - case DEDUPE_SKIP: - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - case DEDUPE_CONTINUE: - break; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE_SOM) { - updateSeqPoint(tctxt, end, from_mpv); - const char is_external_report = 0; - const char do_som = 1; - enum DedupeResult rv = - dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, - ri->dkey, ri->offset_adjust, - is_external_report, ri->quash_som, do_som); - switch (rv) { - case DEDUPE_HALT: - return HWLM_TERMINATE_MATCHING; - case DEDUPE_SKIP: - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - case DEDUPE_CONTINUE: - break; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_CHAIN) { - // Note: sequence points updated inside this function. 
- if (roseCatchUpAndHandleChainMatch( - t, scratch, ri->event, ri->top_squash_distance, end, - in_catchup) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_INT) { - updateSeqPoint(tctxt, end, from_mpv); - roseHandleSom(t, scratch, &ri->som, end); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_AWARE) { - updateSeqPoint(tctxt, end, from_mpv); - roseHandleSomSom(t, scratch, &ri->som, som, end); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_EXHAUST) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - ri->ekey) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReportSom(t, scratch, som, end, ri->onmatch, - ri->offset_adjust, - INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_EXHAUST) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReportSom(t, scratch, som, end, ri->onmatch, - ri->offset_adjust, - ri->ekey) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE_AND_REPORT) { - updateSeqPoint(tctxt, end, from_mpv); - const char do_som = t->hasSom; // TODO: constant propagate - const char is_external_report = 1; - enum DedupeResult rv = - dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, - ri->dkey, ri->offset_adjust, - is_external_report, ri->quash_som, do_som); - switch (rv) { - case DEDUPE_HALT: - return HWLM_TERMINATE_MATCHING; - case DEDUPE_SKIP: - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - case DEDUPE_CONTINUE: - break; - } - - const u32 ekey = INVALID_EKEY; - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - ekey) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(FINAL_REPORT) { - updateSeqPoint(tctxt, end, from_mpv); - if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, - INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - /* One-shot specialisation: this instruction always terminates - * execution of the program. 
*/ - return HWLM_CONTINUE_MATCHING; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_EXHAUSTED) { - DEBUG_PRINTF("check ekey %u\n", ri->ekey); - assert(ri->ekey != INVALID_EKEY); - assert(ri->ekey < t->ekeyCount); - const char *evec = scratch->core_info.exhaustionVector; - if (isExhausted(t, evec, ri->ekey)) { - DEBUG_PRINTF("ekey %u already set, match is exhausted\n", - ri->ekey); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_MIN_LENGTH) { - DEBUG_PRINTF("check min length %llu (adj %d)\n", ri->min_length, - ri->end_adj); - assert(ri->min_length > 0); - assert(ri->end_adj == 0 || ri->end_adj == -1); - assert(som == HS_OFFSET_PAST_HORIZON || som <= end); - if (som != HS_OFFSET_PAST_HORIZON && - ((end + ri->end_adj) - som < ri->min_length)) { - DEBUG_PRINTF("failed check, match len %llu\n", - (u64a)((end + ri->end_adj) - som)); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SET_STATE) { - DEBUG_PRINTF("set state index %u\n", ri->index); - mmbit_set(getRoleState(scratch->core_info.state), - t->rolesWithStateCount, ri->index); - work_done = 1; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SET_GROUPS) { - tctxt->groups |= ri->groups; - DEBUG_PRINTF("set groups 0x%llx -> 0x%llx\n", ri->groups, - tctxt->groups); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SQUASH_GROUPS) { - assert(popcount64(ri->groups) == 63); // Squash only one group. - if (work_done) { - tctxt->groups &= ri->groups; - DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, - tctxt->groups); - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_STATE) { - DEBUG_PRINTF("check state %u\n", ri->index); - const u8 *roles = getRoleState(scratch->core_info.state); - if (!mmbit_isset(roles, t->rolesWithStateCount, ri->index)) { - DEBUG_PRINTF("state not on\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SPARSE_ITER_BEGIN) { - DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); - const struct mmbit_sparse_iter *it = - getByOffset(t, ri->iter_offset); - assert(ISALIGNED(it)); - - const u8 *roles = getRoleState(scratch->core_info.state); - - u32 idx = 0; - u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, - &idx, it, si_state); - if (i == MMB_INVALID) { - DEBUG_PRINTF("no states in sparse iter are on\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - - fatbit_clear(scratch->handled_roles); - - const u32 *jumps = getByOffset(t, ri->jump_table); - DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, - jumps[idx]); - pc = pc_base + jumps[idx]; - continue; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SPARSE_ITER_NEXT) { - DEBUG_PRINTF("iter_offset=%u, state=%u\n", ri->iter_offset, - ri->state); - const struct mmbit_sparse_iter *it = - getByOffset(t, ri->iter_offset); - assert(ISALIGNED(it)); - - const u8 *roles = getRoleState(scratch->core_info.state); - - u32 idx = 0; - u32 i = mmbit_sparse_iter_next(roles, t->rolesWithStateCount, - ri->state, &idx, it, si_state); - if (i == MMB_INVALID) { - DEBUG_PRINTF("no more states in sparse iter are on\n"); - assert(ri->fail_jump); // must progress - pc += ri->fail_jump; - continue; - } - - const u32 *jumps = getByOffset(t, ri->jump_table); - DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, - jumps[idx]); - pc = pc_base + jumps[idx]; - continue; - } - 
PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(ENGINES_EOD) { - if (roseEnginesEod(t, scratch, end, ri->iter_offset) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SUFFIXES_EOD) { - if (roseSuffixesEod(t, scratch, end) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(MATCHER_EOD) { - if (roseMatcherEod(t, scratch, end) == - HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATE_MATCHING; - } - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(END) { - DEBUG_PRINTF("finished\n"); - return HWLM_CONTINUE_MATCHING; - } - PROGRAM_NEXT_INSTRUCTION - } - } - - assert(0); // unreachable - return HWLM_CONTINUE_MATCHING; -} - -#undef PROGRAM_CASE -#undef PROGRAM_NEXT_INSTRUCTION - hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, u64a som, u64a end, size_t match_len, u8 prog_flags) { - const char in_anchored = prog_flags & ROSE_PROG_FLAG_IN_ANCHORED; - const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; - const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; - const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; return roseRunProgram_i(t, scratch, programOffset, som, end, match_len, - in_anchored, in_catchup, from_mpv, - skip_mpv_catchup); + prog_flags); } diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index c12c9155..7f8c32e5 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -34,11 +34,23 @@ #ifndef PROGRAM_RUNTIME_H #define PROGRAM_RUNTIME_H +#include "catchup.h" +#include "counting_miracle.h" +#include "infix.h" +#include "match.h" +#include "miracle.h" +#include "report.h" +#include "rose.h" +#include "rose_internal.h" +#include "rose_program.h" +#include "rose_types.h" +#include "runtime.h" +#include "scratch.h" #include "ue2common.h" -#include "src/hwlm/hwlm.h" // for hwlmcb_rv_t - -struct RoseEngine; -struct hs_scratch; +#include "hwlm/hwlm.h" // for hwlmcb_rv_t +#include "util/compare.h" +#include "util/fatbit.h" +#include "util/multibit.h" /* * Program context flags, which control the behaviour of some instructions at @@ -55,4 +67,1362 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, u64a som, u64a end, size_t match_len, u8 prog_flags); +/* Inline implementation follows. */ + +static rose_inline +int roseCheckBenefits(const struct core_info *ci, u64a end, u32 mask_rewind, + const u8 *and_mask, const u8 *exp_mask) { + const u8 *data; + + // If the check works over part of the history and part of the buffer, we + // create a temporary copy of the data in here so it's contiguous. 
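+    // Worked example (hypothetical values): with buf_offset=100, end=104 and
+    // mask_rewind=8, buffer_offset is 4, so the 4-byte shortfall is copied
+    // from the end of the history buffer and the remaining 4 bytes from the
+    // start of the current buffer, giving one contiguous view in temp.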
+ u8 temp[MAX_MASK2_WIDTH]; + + s64a buffer_offset = (s64a)end - ci->buf_offset; + DEBUG_PRINTF("rel offset %lld\n", buffer_offset); + if (buffer_offset >= mask_rewind) { + data = ci->buf + buffer_offset - mask_rewind; + DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, + ci->buf, mask_rewind); + } else if (buffer_offset <= 0) { + data = ci->hbuf + ci->hlen + buffer_offset - mask_rewind; + DEBUG_PRINTF("all in one case data=%p buf=%p rewind=%u\n", data, + ci->buf, mask_rewind); + } else { + u32 shortfall = mask_rewind - buffer_offset; + DEBUG_PRINTF("shortfall of %u, rewind %u hlen %zu\n", shortfall, + mask_rewind, ci->hlen); + data = temp; + memcpy(temp, ci->hbuf + ci->hlen - shortfall, shortfall); + memcpy(temp + shortfall, ci->buf, mask_rewind - shortfall); + } + +#ifdef DEBUG + DEBUG_PRINTF("DATA: "); + for (u32 i = 0; i < mask_rewind; i++) { + printf("%c", ourisprint(data[i]) ? data[i] : '?'); + } + printf(" (len=%u)\n", mask_rewind); +#endif + + u32 len = mask_rewind; + while (len >= sizeof(u64a)) { + u64a a = unaligned_load_u64a(data); + a &= *(const u64a *)and_mask; + if (a != *(const u64a *)exp_mask) { + DEBUG_PRINTF("argh %016llx %016llx\n", a, *(const u64a *)exp_mask); + return 0; + } + data += sizeof(u64a); + and_mask += sizeof(u64a); + exp_mask += sizeof(u64a); + len -= sizeof(u64a); + } + + while (len) { + u8 a = *data; + a &= *and_mask; + if (a != *exp_mask) { + DEBUG_PRINTF("argh d%02hhx =%02hhx am%02hhx em%02hhx\n", a, + *data, *and_mask, *exp_mask); + return 0; + } + data++; + and_mask++; + exp_mask++; + len--; + } + + return 1; +} + +static rose_inline +void rosePushDelayedMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 delay, + u32 delay_index, u64a offset) { + assert(delay); + + const u32 src_slot_index = delay; + u32 slot_index = (src_slot_index + offset) & DELAY_MASK; + + struct RoseContext *tctxt = &scratch->tctxt; + if (offset + src_slot_index <= tctxt->delayLastEndOffset) { + DEBUG_PRINTF("skip too late\n"); + return; + } + + const u32 delay_count = t->delay_count; + struct fatbit **delaySlots = getDelaySlots(scratch); + struct fatbit *slot = delaySlots[slot_index]; + + DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index); + if (!(tctxt->filledDelayedSlots & (1U << slot_index))) { + tctxt->filledDelayedSlots |= 1U << slot_index; + fatbit_clear(slot); + } + + fatbit_set(slot, delay_count, delay_index); +} + +static rose_inline +char roseLeftfixCheckMiracles(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + struct core_info *ci, struct mq *q, u64a end, + const char is_infix) { + if (!is_infix && left->transient) { + // Miracles won't help us with transient leftfix engines; they only + // scan for a limited time anyway. + return 1; + } + + if (!left->stopTable) { + return 1; + } + + DEBUG_PRINTF("looking for miracle on queue %u\n", q->nfa->queueIndex); + + const s64a begin_loc = q_cur_loc(q); + const s64a end_loc = end - ci->buf_offset; + + s64a miracle_loc; + if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { + goto found_miracle; + } + + if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, + &miracle_loc)) { + goto found_miracle; + } + + return 1; + +found_miracle: + DEBUG_PRINTF("miracle at %lld\n", miracle_loc); + assert(miracle_loc >= begin_loc); + + // If we're a prefix, then a miracle effectively results in us needing to + // re-init our state and start fresh. 
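+    // For example (hypothetical), if the prefix's stop table marks '\n' as
+    // fatal to every state and a '\n' sits at miracle_loc, nothing scanned
+    // before that point can survive, so restarting there is equivalent and
+    // cheaper.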
+ if (!is_infix) { + if (miracle_loc != begin_loc) { + DEBUG_PRINTF("re-init prefix state\n"); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, miracle_loc); + pushQueueAt(q, 1, MQE_TOP, miracle_loc); + nfaQueueInitState(q->nfa, q); + } + return 1; + } + + // Otherwise, we're an infix. Remove tops before the miracle from the queue + // and re-init at that location. + + q_skip_forward_to(q, miracle_loc); + + if (q_last_type(q) == MQE_START) { + DEBUG_PRINTF("miracle caused infix to die\n"); + return 0; + } + + DEBUG_PRINTF("re-init infix state\n"); + assert(q->items[q->cur].type == MQE_START); + q->items[q->cur].location = miracle_loc; + nfaQueueInitState(q->nfa, q); + + return 1; +} + +static rose_inline +hwlmcb_rv_t roseTriggerSuffix(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, u32 top, + u64a som, u64a end) { + DEBUG_PRINTF("suffix qi=%u, top event=%u\n", qi, top); + + struct core_info *ci = &scratch->core_info; + u8 *aa = getActiveLeafArray(t, ci->state); + const u32 aaCount = t->activeArrayCount; + const u32 qCount = t->queueCount; + struct mq *q = &scratch->queues[qi]; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + const struct NFA *nfa = getNfaByInfo(t, info); + + s64a loc = (s64a)end - ci->buf_offset; + assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); + + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(scratch->aqa, qCount, qi); + } else if (info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + /* nfa only needs one top; we can go home now */ + return HWLM_CONTINUE_MATCHING; + } else if (!fatbit_set(scratch->aqa, qCount, qi)) { + initQueue(q, qi, t, scratch); + loadStreamState(nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } else if (isQueueFull(q)) { + DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); + if (info->eod) { + /* can catch up suffix independently no pq */ + q->context = NULL; + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (ensureQueueFlushed(t, scratch, qi, loc) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + assert(top == MQE_TOP || (top >= MQE_TOP_FIRST && top < MQE_INVALID)); + pushQueueSom(q, top, loc, som); + + if (q_cur_loc(q) == (s64a)ci->len && !info->eod) { + /* we may not run the nfa; need to ensure state is fine */ + DEBUG_PRINTF("empty run\n"); + pushQueueNoMerge(q, MQE_END, loc); + char alive = nfaQueueExec(nfa, q, loc); + if (alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +char roseTestLeftfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end, + const char is_infix) { + struct core_info *ci = &scratch->core_info; + + u32 ri = queueToLeftIndex(t, qi); + const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s %s %u/%u with lag %u (maxLag=%u)\n", + (left->transient ? "transient" : "active"), + (is_infix ? "infix" : "prefix"), + ri, qi, leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + assert(left->infix == is_infix); + assert(!is_infix || !left->transient); // Only prefixes can be transient. 
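+    // Reminder: transient leftfixes keep no stream state across stream
+    // boundaries; they are rebuilt from the history buffer on demand (see
+    // the transient handling below), which bounds their scan to at most
+    // hlen bytes.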
+ + struct mq *q = scratch->queues + qi; + char *state = scratch->core_info.state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + u32 qCount = t->queueCount; + u32 arCount = t->activeLeftCount; + + if (!mmbit_isset(activeLeftArray, arCount, ri)) { + DEBUG_PRINTF("engine is dead nothing to see here\n"); + return 0; + } + + if (unlikely(end < leftfixLag)) { + assert(0); /* lag is the literal length */ + return 0; + } + + if (nfaSupportsZombie(getNfaByQueue(t, qi)) && ci->buf_offset + && !fatbit_isset(scratch->aqa, qCount, qi) + && isZombie(t, state, left)) { + DEBUG_PRINTF("zombie\n"); + return 1; + } + + if (!fatbit_set(scratch->aqa, qCount, qi)) { + DEBUG_PRINTF("initing q %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (ci->buf_offset) { // there have been writes before us! + s32 sp; + if (!is_infix && left->transient) { + sp = -(s32)ci->hlen; + } else { + sp = -(s32)loadRoseDelay(t, state, left); + } + + /* transient nfas are always started fresh -> state not maintained + * at stream boundary */ + + pushQueueAt(q, 0, MQE_START, sp); + if (is_infix || (ci->buf_offset + sp > 0 && !left->transient)) { + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + nfaQueueInitState(q->nfa, q); + } + } else { // first write ever + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + nfaQueueInitState(q->nfa, q); + } + } + + s64a loc = (s64a)end - ci->buf_offset - leftfixLag; + assert(loc >= q_cur_loc(q) || left->eager); + assert(leftfixReport != MO_INVALID_IDX); + + if (!is_infix && left->transient) { + s64a start_loc = loc - left->transient; + if (q_cur_loc(q) < start_loc) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, start_loc); + pushQueueAt(q, 1, MQE_TOP, start_loc); + nfaQueueInitState(q->nfa, q); + } + } + + if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) { + if (is_infix) { + if (infixTooOld(q, loc)) { + DEBUG_PRINTF("infix %u died of old age\n", ri); + goto nfa_dead; + } + + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + } + + if (!roseLeftfixCheckMiracles(t, left, ci, q, end, is_infix)) { + DEBUG_PRINTF("leftfix %u died due to miracle\n", ri); + goto nfa_dead; + } + +#ifdef DEBUG + debugQueue(q); +#endif + + pushQueueNoMerge(q, MQE_END, loc); + + char rv = nfaQueueExecRose(q->nfa, q, leftfixReport); + if (!rv) { /* nfa is dead */ + DEBUG_PRINTF("leftfix %u died while trying to catch up\n", ri); + goto nfa_dead; + } + + // Queue must have next start loc before we call nfaInAcceptState. + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv == MO_MATCHES_PENDING; + } else if (q_cur_loc(q) > loc) { + /* an eager leftfix may have already progressed past loc if there is no + * match at loc. 
*/ + assert(left->eager); + return 0; + } else { + assert(q_cur_loc(q) == loc); + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + char rv = nfaInAcceptState(q->nfa, leftfixReport, q); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv; + } + +nfa_dead: + mmbit_unset(activeLeftArray, arCount, ri); + scratch->tctxt.groups &= left->squash_mask; + return 0; +} + +static rose_inline +char roseTestPrefix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 0); +} + +static rose_inline +char roseTestInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 1); +} + +static rose_inline +void roseTriggerInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a start, u64a end, u32 qi, u32 topEvent, u8 cancel) { + struct core_info *ci = &scratch->core_info; + s64a loc = (s64a)end - ci->buf_offset; + + u32 ri = queueToLeftIndex(t, qi); + assert(topEvent < MQE_INVALID); + + const struct LeftNfaInfo *left = getLeftInfoByQueue(t, qi); + assert(!left->transient); + + DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, topEvent); + + struct mq *q = scratch->queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + char *state = ci->state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + const u32 arCount = t->activeLeftCount; + char alive = mmbit_set(activeLeftArray, arCount, ri); + + if (alive && info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + return; + } + + struct fatbit *aqa = scratch->aqa; + const u32 qCount = t->queueCount; + + if (alive && nfaSupportsZombie(getNfaByInfo(t, info)) && ci->buf_offset && + !fatbit_isset(aqa, qCount, qi) && isZombie(t, state, left)) { + DEBUG_PRINTF("yawn - zombie\n"); + return; + } + + if (cancel) { + DEBUG_PRINTF("dominating top: (re)init\n"); + fatbit_set(aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (!fatbit_set(aqa, qCount, qi)) { + DEBUG_PRINTF("initing %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (alive) { + s32 sp = -(s32)loadRoseDelay(t, state, left); + pushQueueAt(q, 0, MQE_START, sp); + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } + } else if (!alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (isQueueFull(q)) { + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + + if (isQueueFull(q)) { + /* still full - reduceInfixQueue did nothing */ + DEBUG_PRINTF("queue %u full (%u items) -> catching up nfa\n", qi, + q->end - q->cur); + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } + } + + pushQueueSom(q, topEvent, loc, start); +} + +static rose_inline +hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a end, ReportID onmatch, s32 offset_adjust, + u32 ekey) { + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + 
DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +/* catches up engines enough to ensure any earlier mpv triggers are enqueued + * and then adds the trigger to the mpv queue. Must not be called during catch + * up */ +static rose_inline +hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 event, u64a top_squash_distance, + u64a end, const char in_catchup) { + if (!in_catchup && + roseCatchUpMpvFeeders(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + return roseHandleChainMatch(t, scratch, event, top_squash_distance, end, + in_catchup); +} + +static rose_inline +void roseHandleSom(UNUSED const struct RoseEngine *t, + struct hs_scratch *scratch, const struct som_operation *sr, + u64a end) { + DEBUG_PRINTF("end=%llu, minMatchOffset=%llu\n", end, + scratch->tctxt.minMatchOffset); + + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + updateLastMatchOffset(&scratch->tctxt, end); + handleSomInternal(scratch, sr, end); +} + +static rose_inline +hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a start, u64a end, + ReportID onmatch, s32 offset_adjust, u32 ekey) { + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + DEBUG_PRINTF("firing som callback onmatch=%u, start=%llu, end=%llu\n", + onmatch, start, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverSomReport(start, end, onmatch, offset_adjust, + scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +void roseHandleSomSom(UNUSED const struct RoseEngine *t, + struct hs_scratch *scratch, + const struct som_operation *sr, u64a start, u64a end) { + DEBUG_PRINTF("start=%llu, end=%llu, minMatchOffset=%llu\n", start, end, + scratch->tctxt.minMatchOffset); + + assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); + updateLastMatchOffset(&scratch->tctxt, end); + setSomFromSomAware(scratch, sr, start, end); +} + +static really_inline +int reachHasBit(const u8 *reach, u8 c) { + return !!(reach[c / 8U] & (u8)1U << (c % 8U)); +} + +/** + * \brief Scan around a literal, checking that that "lookaround" reach masks + * are satisfied. + */ +static rose_inline +int roseCheckLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, u32 lookaroundIndex, + u32 lookaroundCount, u64a end) { + assert(lookaroundIndex != MO_INVALID_IDX); + assert(lookaroundCount > 0); + + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const u8 *base = (const u8 *)t; + const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset); + const s8 *look = look_base + lookaroundIndex; + const s8 *look_end = look + lookaroundCount; + assert(look < look_end); + + const u8 *reach_base = base + t->lookaroundReachOffset; + const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN; + + // The following code assumes that the lookaround structures are ordered by + // increasing offset. 
+ + const s64a base_offset = end - ci->buf_offset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + DEBUG_PRINTF("first look has offset %d\n", *look); + + // If our first check tells us we need to look at an offset before the + // start of the stream, this role cannot match. + if (unlikely(*look < 0 && (u64a)(0 - *look) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + // Skip over offsets that are before the history buffer. + do { + s64a offset = base_offset + *look; + if (offset >= -(s64a)ci->hlen) { + goto in_history; + } + DEBUG_PRINTF("look=%d before history\n", *look); + look++; + reach += REACH_BITVECTOR_LEN; + } while (look < look_end); + + // History buffer. + DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_history: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= 0) { + DEBUG_PRINTF("in buffer\n"); + goto in_buffer; + } + + assert(offset >= -(s64a)ci->hlen && offset < 0); + u8 c = ci->hbuf[ci->hlen + offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + // Current buffer. + DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_buffer: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + break; + } + + assert(offset >= 0 && offset < (s64a)ci->len); + u8 c = ci->buf[offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +int roseNfaEarliestSom(u64a from_offset, u64a offset, ReportID id, + void *context); +static rose_inline +u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, + const u32 qi, UNUSED const u32 leftfixLag) { + u32 ri = queueToLeftIndex(t, qi); + + UNUSED const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s prefix %u/%u with lag %u (maxLag=%u)\n", + left->transient ? "transient" : "active", ri, qi, + leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + + struct mq *q = scratch->queues + qi; + + u64a start = ~0ULL; + + /* switch the callback + context for a fun one */ + q->som_cb = roseNfaEarliestSom; + q->context = &start; + + nfaReportCurrentMatches(q->nfa, q); + + /* restore the old callback + context */ + q->som_cb = roseNfaSomAdaptor; + q->context = NULL; + DEBUG_PRINTF("earliest som is %llu\n", start); + return start; +} + +static rose_inline +char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { + DEBUG_PRINTF("check offset=%llu against bounds [%llu,%llu]\n", end, + min_bound, max_bound); + assert(min_bound <= max_bound); + return end >= min_bound && end <= max_bound; +} + +static rose_inline +hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u32 iter_offset) { + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + /* data, len is used for state decompress, should be full available data */ + u8 key = 0; + if (is_streaming) { + const u8 *eod_data = scratch->core_info.hbuf; + size_t eod_len = scratch->core_info.hlen; + key = eod_len ? 
eod_data[eod_len - 1] : 0; + } + + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); + assert(ISALIGNED(it)); + + u32 idx = 0; + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); + qi != MMB_INVALID; + qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + if (is_streaming) { + // Decompress stream state. + nfaExpandState(q->nfa, q->state, q->streamState, offset, key); + } + + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, roseReportSomAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + /* We have just been triggered. */ + assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); + + pushQueueNoMerge(q, MQE_END, scratch->core_info.len); + q->context = NULL; + + /* rose exec is used as we don't want to / can't raise matches in the + * history buffer. */ + if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { + DEBUG_PRINTF("nfa is dead\n"); + continue; + } + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, roseReportSomAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + assert(rose->ematcherOffset); + assert(rose->ematcherRegionSize); + + // Clear role state and active engines, since we have already handled all + // outstanding work there. + DEBUG_PRINTF("clear role state and active leaf array\n"); + char *state = scratch->core_info.state; + mmbit_clear(getRoleState(state), rose->rolesWithStateCount); + mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); + + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + size_t eod_len; + const u8 *eod_data; + if (!is_streaming) { /* Block */ + eod_data = scratch->core_info.buf; + eod_len = scratch->core_info.len; + } else { /* Streaming */ + eod_len = scratch->core_info.hlen; + eod_data = scratch->core_info.hbuf; + } + + assert(eod_data); + assert(eod_len); + + DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, + offset); + + // If we don't have enough bytes to produce a match from an EOD table scan, + // there's no point scanning. 
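+    // For example (hypothetical values): with eod_len=3 and
+    // eodmatcherMinWidth=4, even the shortest EOD-anchored literal cannot
+    // fit in the available data, so the scan is skipped entirely.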
+ if (eod_len < rose->eodmatcherMinWidth) { + DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth); + return HWLM_CONTINUE_MATCHING; + } + + // Ensure that we only need scan the last N bytes, where N is the length of + // the eod-anchored matcher region. + size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); + + const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset); + hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, + scratch->tctxt.groups); + + // We may need to fire delayed matches. + if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + + roseFlushLastByteHistory(rose, scratch, offset); + return HWLM_CONTINUE_MATCHING; +} + +static +void updateSeqPoint(struct RoseContext *tctxt, u64a offset, + const char from_mpv) { + if (from_mpv) { + updateMinMatchOffsetFromMpv(tctxt, offset); + } else { + updateMinMatchOffset(tctxt, offset); + } +} + +#define PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ + programOffset + (u32)(pc - pc_base)); \ + const struct ROSE_STRUCT_##name *ri = \ + (const struct ROSE_STRUCT_##name *)pc; + +#define PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + break; \ + } + +static rose_inline +hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, size_t match_len, + u8 prog_flags) { + DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, + som, end, prog_flags); + + assert(programOffset >= sizeof(struct RoseEngine)); + assert(programOffset < t->size); + + const char in_anchored = prog_flags & ROSE_PROG_FLAG_IN_ANCHORED; + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; + const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + const char *pc_base = getByOffset(t, programOffset); + const char *pc = pc_base; + + // Local sparse iterator state for programs that use the SPARSE_ITER_BEGIN + // and SPARSE_ITER_NEXT instructions. + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + // If this program has an effect, work_done will be set to one (which may + // allow the program to squash groups). 
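+    // For example, a program whose CHECK_* instructions all fail leaves
+    // work_done at zero, so a later SQUASH_GROUPS instruction becomes a
+    // no-op and the literal's groups stay switched on.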
+ int work_done = 0; + + struct RoseContext *tctxt = &scratch->tctxt; + + assert(*(const u8 *)pc != ROSE_INSTR_END); + + for (;;) { + assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); + assert(pc >= pc_base); + assert((size_t)(pc - pc_base) < t->size); + const u8 code = *(const u8 *)pc; + assert(code <= ROSE_INSTR_END); + + switch ((enum RoseInstructionCode)code) { + PROGRAM_CASE(ANCHORED_DELAY) { + if (in_anchored && end > t->floatingMinLiteralMatchOffset) { + DEBUG_PRINTF("delay until playback\n"); + tctxt->groups |= ri->groups; + work_done = 1; + assert(ri->done_jump); // must progress + pc += ri->done_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LIT_MASK) { + assert(match_len); + struct core_info *ci = &scratch->core_info; + if (!roseCheckBenefits(ci, end, match_len, ri->and_mask.a8, + ri->cmp_mask.a8)) { + DEBUG_PRINTF("halt: failed mask check\n"); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LIT_EARLY) { + if (end < ri->min_offset) { + DEBUG_PRINTF("halt: before min_offset=%u\n", + ri->min_offset); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_ONLY_EOD) { + struct core_info *ci = &scratch->core_info; + if (end != ci->buf_offset + ci->len) { + DEBUG_PRINTF("should only match at end of data\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BOUNDS) { + if (!roseCheckBounds(end, ri->min_bound, ri->max_bound)) { + DEBUG_PRINTF("failed bounds check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_NOT_HANDLED) { + struct fatbit *handled = scratch->handled_roles; + if (fatbit_set(handled, t->handledKeyCount, ri->key)) { + DEBUG_PRINTF("key %u already set\n", ri->key); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LOOKAROUND) { + if (!roseCheckLookaround(t, scratch, ri->index, ri->count, + end)) { + DEBUG_PRINTF("failed lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_INFIX) { + if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed infix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_PREFIX) { + if (!roseTestPrefix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed prefix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP) { + if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + 
scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ADJUST) { + assert(ri->distance <= end); + som = end - ri->distance; + DEBUG_PRINTF("som is (end - %u) = %llu\n", ri->distance, som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_LEFTFIX) { + som = roseGetHaigSom(t, scratch, ri->queue, ri->lag); + DEBUG_PRINTF("som from leftfix is %llu\n", som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_FROM_REPORT) { + som = handleSomExternal(scratch, &ri->som, end); + DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, + som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ZERO) { + DEBUG_PRINTF("setting SOM to zero\n"); + som = 0; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_INFIX) { + roseTriggerInfix(t, scratch, som, end, ri->queue, ri->event, + ri->cancel); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + const char is_external_report = 0; + const char do_som = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. 
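+                // (Chain reports are triggers for the MPV engine rather
+                // than user-visible reports, hence the dedicated catch-up
+                // helper.)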
+ if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_INT) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSom(t, scratch, &ri->som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_AWARE) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSomSom(t, scratch, &ri->som, som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_AND_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + case DEDUPE_CONTINUE: + break; + } + + const u32 ekey = INVALID_EKEY; + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FINAL_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + /* One-shot specialisation: this instruction always terminates + * execution of the program. 
*/ + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_EXHAUSTED) { + DEBUG_PRINTF("check ekey %u\n", ri->ekey); + assert(ri->ekey != INVALID_EKEY); + assert(ri->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ri->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ri->ekey); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MIN_LENGTH) { + DEBUG_PRINTF("check min length %llu (adj %d)\n", ri->min_length, + ri->end_adj); + assert(ri->min_length > 0); + assert(ri->end_adj == 0 || ri->end_adj == -1); + assert(som == HS_OFFSET_PAST_HORIZON || som <= end); + if (som != HS_OFFSET_PAST_HORIZON && + ((end + ri->end_adj) - som < ri->min_length)) { + DEBUG_PRINTF("failed check, match len %llu\n", + (u64a)((end + ri->end_adj) - som)); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_STATE) { + DEBUG_PRINTF("set state index %u\n", ri->index); + mmbit_set(getRoleState(scratch->core_info.state), + t->rolesWithStateCount, ri->index); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_GROUPS) { + tctxt->groups |= ri->groups; + DEBUG_PRINTF("set groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SQUASH_GROUPS) { + assert(popcount64(ri->groups) == 63); // Squash only one group. + if (work_done) { + tctxt->groups &= ri->groups; + DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_STATE) { + DEBUG_PRINTF("check state %u\n", ri->index); + const u8 *roles = getRoleState(scratch->core_info.state); + if (!mmbit_isset(roles, t->rolesWithStateCount, ri->index)) { + DEBUG_PRINTF("state not on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_BEGIN) { + DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, + &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + + fatbit_clear(scratch->handled_roles); + + const u32 *jumps = getByOffset(t, ri->jump_table); + DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + continue; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_NEXT) { + DEBUG_PRINTF("iter_offset=%u, state=%u\n", ri->iter_offset, + ri->state); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_next(roles, t->rolesWithStateCount, + ri->state, &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no more states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + + const u32 *jumps = getByOffset(t, ri->jump_table); + DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + continue; + } + 
PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ENGINES_EOD) { + if (roseEnginesEod(t, scratch, end, ri->iter_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SUFFIXES_EOD) { + if (roseSuffixesEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MATCHER_EOD) { + if (roseMatcherEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(END) { + DEBUG_PRINTF("finished\n"); + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + } + } + + assert(0); // unreachable + return HWLM_CONTINUE_MATCHING; +} + +#undef PROGRAM_CASE +#undef PROGRAM_NEXT_INSTRUCTION + #endif // PROGRAM_RUNTIME_H From 9087d59be54a9e9dc0d4c3d95b0ff624c3ae08f1 Mon Sep 17 00:00:00 2001 From: Xiang Wang Date: Tue, 11 Aug 2015 05:23:12 -0400 Subject: [PATCH 075/166] tamarama: add container engine for exclusive nfas Add the new Tamarama engine that acts as a container for infix/suffix engines that can be proven to run exclusively of one another. This reduces stream state for pattern sets with many exclusive engines. --- CMakeLists.txt | 11 + src/grey.cpp | 4 + src/grey.h | 4 + src/nfa/castle_dump.cpp | 5 +- src/nfa/castle_dump.h | 6 +- src/nfa/goughdump.cpp | 8 +- src/nfa/goughdump.h | 10 +- src/nfa/lbr_dump.cpp | 17 +- src/nfa/lbr_dump.h | 18 +- src/nfa/limex.h | 4 +- src/nfa/limex_dump.cpp | 3 +- src/nfa/mcclellandump.cpp | 8 +- src/nfa/mcclellandump.h | 9 +- src/nfa/mpv_dump.cpp | 3 +- src/nfa/mpv_dump.h | 6 +- src/nfa/nfa_api.h | 14 + src/nfa/nfa_api_dispatch.c | 10 + src/nfa/nfa_build_util.cpp | 12 + src/nfa/nfa_dump_api.h | 5 +- src/nfa/nfa_dump_dispatch.cpp | 7 +- src/nfa/nfa_internal.h | 7 + src/nfa/tamarama.c | 440 ++++++++++++++++++++++++++++ src/nfa/tamarama.h | 72 +++++ src/nfa/tamarama_dump.cpp | 92 ++++++ src/nfa/tamarama_dump.h | 49 ++++ src/nfa/tamarama_internal.h | 105 +++++++ src/nfa/tamaramacompile.cpp | 175 +++++++++++ src/nfa/tamaramacompile.h | 94 ++++++ src/rose/rose_build_bytecode.cpp | 443 +++++++++++++++++++++++++++- src/rose/rose_build_exclusive.cpp | 446 +++++++++++++++++++++++++++++ src/rose/rose_build_exclusive.h | 144 ++++++++++ src/rose/rose_build_impl.h | 23 +- src/rose/rose_build_misc.cpp | 10 +- src/rose/rose_dump.cpp | 6 +- src/rose/rose_graph.h | 9 +- src/smallwrite/smallwrite_dump.cpp | 4 +- src/util/clique.cpp | 131 +++++++++ src/util/clique.h | 60 ++++ 38 files changed, 2418 insertions(+), 56 deletions(-) create mode 100644 src/nfa/tamarama.c create mode 100644 src/nfa/tamarama.h create mode 100644 src/nfa/tamarama_dump.cpp create mode 100644 src/nfa/tamarama_dump.h create mode 100644 src/nfa/tamarama_internal.h create mode 100644 src/nfa/tamaramacompile.cpp create mode 100644 src/nfa/tamaramacompile.h create mode 100644 src/rose/rose_build_exclusive.cpp create mode 100644 src/rose/rose_build_exclusive.h create mode 100644 src/util/clique.cpp create mode 100644 src/util/clique.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ba3b29fa..94a54241 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -478,6 +478,9 @@ set (hs_exec_SRCS src/nfa/shufti_common.h src/nfa/shufti.c src/nfa/shufti.h + src/nfa/tamarama.c + src/nfa/tamarama.h + src/nfa/tamarama_internal.h src/nfa/truffle_common.h src/nfa/truffle.c src/nfa/truffle.h @@ -639,6 +642,8 @@ SET (hs_SRCS src/nfa/repeatcompile.h src/nfa/shufticompile.cpp src/nfa/shufticompile.h + 
src/nfa/tamaramacompile.cpp + src/nfa/tamaramacompile.h src/nfa/trufflecompile.cpp src/nfa/trufflecompile.h src/nfagraph/ng.cpp @@ -823,6 +828,8 @@ SET (hs_SRCS src/rose/rose_build_compile.cpp src/rose/rose_build_convert.cpp src/rose/rose_build_convert.h + src/rose/rose_build_exclusive.cpp + src/rose/rose_build_exclusive.h src/rose/rose_build_groups.cpp src/rose/rose_build_groups.h src/rose/rose_build_impl.h @@ -853,6 +860,8 @@ SET (hs_SRCS src/util/charreach.cpp src/util/charreach.h src/util/charreach_util.h + src/util/clique.cpp + src/util/clique.h src/util/compare.h src/util/compile_context.cpp src/util/compile_context.h @@ -916,6 +925,8 @@ set(hs_dump_SRCS src/nfa/nfa_dump_dispatch.cpp src/nfa/nfa_dump_internal.cpp src/nfa/nfa_dump_internal.h + src/nfa/tamarama_dump.cpp + src/nfa/tamarama_dump.h src/parser/dump.cpp src/parser/dump.h src/parser/position_dump.h diff --git a/src/grey.cpp b/src/grey.cpp index 1f2fd904..f4a67677 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -127,6 +127,8 @@ Grey::Grey(void) : limitSmallWriteOutfixSize(1048576), // 1 MB smallWriteMaxPatterns(10000), smallWriteMaxLiterals(10000), + allowTamarama(true), // Tamarama engine + tamaChunkSize(100), dumpFlags(0), limitPatternCount(8000000), // 8M patterns limitPatternLength(16000), // 16K bytes @@ -275,6 +277,8 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(limitSmallWriteOutfixSize); G_UPDATE(smallWriteMaxPatterns); G_UPDATE(smallWriteMaxLiterals); + G_UPDATE(allowTamarama); + G_UPDATE(tamaChunkSize); G_UPDATE(limitPatternCount); G_UPDATE(limitPatternLength); G_UPDATE(limitGraphVertices); diff --git a/src/grey.h b/src/grey.h index 634fa3a7..03e40ed5 100644 --- a/src/grey.h +++ b/src/grey.h @@ -145,6 +145,10 @@ struct Grey { u32 smallWriteMaxPatterns; // only try small writes if fewer patterns u32 smallWriteMaxLiterals; // only try small writes if fewer literals + // Tamarama engine + bool allowTamarama; + u32 tamaChunkSize; //!< max chunk size for exclusivity analysis in Tamarama + enum DumpFlags { DUMP_NONE = 0, DUMP_BASICS = 1 << 0, // Dump basic textual data diff --git a/src/nfa/castle_dump.cpp b/src/nfa/castle_dump.cpp index dd0e369f..fd1521a5 100644 --- a/src/nfa/castle_dump.cpp +++ b/src/nfa/castle_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,7 +48,8 @@ namespace ue2 { -void nfaExecCastle0_dumpDot(const struct NFA *, FILE *) { +void nfaExecCastle0_dumpDot(const struct NFA *, FILE *, + UNUSED const std::string &base) { // No GraphViz output for Castles. 
} diff --git a/src/nfa/castle_dump.h b/src/nfa/castle_dump.h index c0b1f899..94dadec0 100644 --- a/src/nfa/castle_dump.h +++ b/src/nfa/castle_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,12 +32,14 @@ #if defined(DUMP_SUPPORT) #include +#include struct NFA; namespace ue2 { -void nfaExecCastle0_dumpDot(const NFA *nfa, FILE *file); +void nfaExecCastle0_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); void nfaExecCastle0_dumpText(const NFA *nfa, FILE *file); } // namespace ue2 diff --git a/src/nfa/goughdump.cpp b/src/nfa/goughdump.cpp index f4f15eea..4e6e5425 100644 --- a/src/nfa/goughdump.cpp +++ b/src/nfa/goughdump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -259,7 +259,8 @@ void dumpTransitions(const NFA *nfa, FILE *f, fprintf(f, "\n"); } -void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f) { +void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == GOUGH_NFA_8); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -302,7 +303,8 @@ void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } -void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) { +void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == GOUGH_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); diff --git a/src/nfa/goughdump.h b/src/nfa/goughdump.h index 5e15356d..b96938e4 100644 --- a/src/nfa/goughdump.h +++ b/src/nfa/goughdump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,12 +33,16 @@ #include "ue2common.h" +#include + struct NFA; namespace ue2 { -void nfaExecGough8_dumpDot(const NFA *nfa, FILE *file); -void nfaExecGough16_dumpDot(const NFA *nfa, FILE *file); +void nfaExecGough8_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); +void nfaExecGough16_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); void nfaExecGough8_dumpText(const NFA *nfa, FILE *file); void nfaExecGough16_dumpText(const NFA *nfa, FILE *file); diff --git a/src/nfa/lbr_dump.cpp b/src/nfa/lbr_dump.cpp index 3de75333..3412ddf5 100644 --- a/src/nfa/lbr_dump.cpp +++ b/src/nfa/lbr_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,23 +49,28 @@ namespace ue2 { -void nfaExecLbrDot_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrDot_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrNVerm_dumpDot(UNUSED const NFA *nfa, UNUSED 
FILE *f) { +void nfaExecLbrNVerm_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrShuf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrShuf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } -void nfaExecLbrTruf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f) { +void nfaExecLbrTruf_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *f, + UNUSED const std::string &base) { // No impl } diff --git a/src/nfa/lbr_dump.h b/src/nfa/lbr_dump.h index 5f6dd261..06ed51e2 100644 --- a/src/nfa/lbr_dump.h +++ b/src/nfa/lbr_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,16 +32,22 @@ #ifdef DUMP_SUPPORT #include +#include struct NFA; namespace ue2 { -void nfaExecLbrDot_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrVerm_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrNVerm_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrShuf_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecLbrTruf_dumpDot(const struct NFA *nfa, FILE *file); +void nfaExecLbrDot_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrVerm_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrNVerm_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrShuf_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecLbrTruf_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); void nfaExecLbrDot_dumpText(const struct NFA *nfa, FILE *file); void nfaExecLbrVerm_dumpText(const struct NFA *nfa, FILE *file); void nfaExecLbrNVerm_dumpText(const struct NFA *nfa, FILE *file); diff --git a/src/nfa/limex.h b/src/nfa/limex.h index 3d4d258b..9266b5de 100644 --- a/src/nfa/limex.h +++ b/src/nfa/limex.h @@ -30,6 +30,7 @@ #define LIMEX_H #ifdef __cplusplus +#include extern "C" { #endif @@ -40,7 +41,8 @@ extern "C" #define GENERATE_NFA_DUMP_DECL(gf_name) \ } /* extern "C" */ \ namespace ue2 { \ - void gf_name##_dumpDot(const struct NFA *nfa, FILE *file); \ + void gf_name##_dumpDot(const struct NFA *nfa, FILE *file, \ + const std::string &base); \ void gf_name##_dumpText(const struct NFA *nfa, FILE *file); \ } /* namespace ue2 */ \ extern "C" { diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index 8e1ee219..207769a0 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -448,7 +448,8 @@ void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) { } #define DUMP_DOT_FN(ddf_n) \ - void nfaExecLimEx##ddf_n##_dumpDot(const NFA *nfa, FILE *f) { \ + void nfaExecLimEx##ddf_n##_dumpDot(const NFA *nfa, FILE *f, \ + UNUSED const string &base) { \ const LimExNFA##ddf_n *limex = \ (const LimExNFA##ddf_n *)getImplNfa(nfa); \ \ diff --git a/src/nfa/mcclellandump.cpp b/src/nfa/mcclellandump.cpp index 52711bf1..dcbb0915 100644 --- a/src/nfa/mcclellandump.cpp +++ b/src/nfa/mcclellandump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -267,7 +267,8 @@ void dumpDotPreambleDfa(FILE *f) { fprintf(f, "0 [style=invis];\n"); } 
-void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f) { +void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == MCCLELLAN_NFA_16); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); @@ -286,7 +287,8 @@ void nfaExecMcClellan16_dumpDot(const NFA *nfa, FILE *f) { fprintf(f, "}\n"); } -void nfaExecMcClellan8_dumpDot(const NFA *nfa, FILE *f) { +void nfaExecMcClellan8_dumpDot(const NFA *nfa, FILE *f, + UNUSED const string &base) { assert(nfa->type == MCCLELLAN_NFA_8); const mcclellan *m = (const mcclellan *)getImplNfa(nfa); diff --git a/src/nfa/mcclellandump.h b/src/nfa/mcclellandump.h index d74a6b6d..efa61544 100644 --- a/src/nfa/mcclellandump.h +++ b/src/nfa/mcclellandump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "rdfa.h" #include +#include struct mcclellan; struct mstate_aux; @@ -42,8 +43,10 @@ union AccelAux; namespace ue2 { -void nfaExecMcClellan8_dumpDot(const struct NFA *nfa, FILE *file); -void nfaExecMcClellan16_dumpDot(const struct NFA *nfa, FILE *file); +void nfaExecMcClellan8_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); +void nfaExecMcClellan16_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); void nfaExecMcClellan8_dumpText(const struct NFA *nfa, FILE *file); void nfaExecMcClellan16_dumpText(const struct NFA *nfa, FILE *file); diff --git a/src/nfa/mpv_dump.cpp b/src/nfa/mpv_dump.cpp index 504cc677..da21d7cf 100644 --- a/src/nfa/mpv_dump.cpp +++ b/src/nfa/mpv_dump.cpp @@ -48,7 +48,8 @@ namespace ue2 { -void nfaExecMpv0_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *file) { +void nfaExecMpv0_dumpDot(UNUSED const NFA *nfa, UNUSED FILE *file, + UNUSED const std::string &base) { } static really_inline diff --git a/src/nfa/mpv_dump.h b/src/nfa/mpv_dump.h index 5dcd9f8b..23910dce 100644 --- a/src/nfa/mpv_dump.h +++ b/src/nfa/mpv_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,12 +32,14 @@ #if defined(DUMP_SUPPORT) #include +#include struct NFA; namespace ue2 { -void nfaExecMpv0_dumpDot(const struct NFA *nfa, FILE *file); +void nfaExecMpv0_dumpDot(const struct NFA *nfa, FILE *file, + const std::string &base); void nfaExecMpv0_dumpText(const struct NFA *nfa, FILE *file); } // namespace ue2 diff --git a/src/nfa/nfa_api.h b/src/nfa/nfa_api.h index dad3894a..3ef6dfca 100644 --- a/src/nfa/nfa_api.h +++ b/src/nfa/nfa_api.h @@ -120,6 +120,13 @@ char nfaInitCompressedState(const struct NFA *nfa, u64a offset, void *state, */ char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end); +/** + * Main execution function that doesn't perform the checks and optimisations of + * nfaQueueExec() and just dispatches directly to the nfa implementations. It is + * intended to be used by the Tamarama engine. + */ +char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end); + /** Return value indicating that the engine is alive. 
*/ #define MO_ALIVE 1 @@ -155,6 +162,13 @@ char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end); */ char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end); +/** + * Main execution function that doesn't perform the checks and optimisations of + * nfaQueueExecToMatch() and just dispatches directly to the nfa + * implementations. It is intended to be used by the Tamarama engine. + */ +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end); + /** * Report matches at the current queue location. * diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index 9591cad5..b9c9f2ea 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -42,6 +42,7 @@ #include "limex.h" #include "mcclellan.h" #include "mpv.h" +#include "tamarama.h" #define DISPATCH_CASE(dc_ltype, dc_ftype, dc_subtype, dc_func_call) \ case dc_ltype##_NFA_##dc_subtype: \ @@ -68,6 +69,7 @@ DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ + DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ default: \ assert(0); \ } @@ -105,6 +107,14 @@ char nfaQueueExec2_i(const struct NFA *nfa, struct mq *q, s64a end) { return 0; } +char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec_i(nfa, q, end); +} + +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec2_i(nfa, q, end); +} + static really_inline char nfaQueueExecRose_i(const struct NFA *nfa, struct mq *q, ReportID report) { DISPATCH_BY_NFA_TYPE(_QR(nfa, q, report)); diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 96d0dabe..9244dcfb 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -300,6 +300,18 @@ const has_accel_fn NFATraits::has_accel = has_accel_generic; const char *NFATraits::name = "Lim Bounded Repeat (M)"; #endif +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 32; + static const bool fast = true; + static const has_accel_fn has_accel; +}; +const has_accel_fn NFATraits::has_accel = has_accel_generic; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Tamarama"; +#endif + } // namespace #if defined(DUMP_SUPPORT) diff --git a/src/nfa/nfa_dump_api.h b/src/nfa/nfa_dump_api.h index 8675dd5d..1054a204 100644 --- a/src/nfa/nfa_dump_api.h +++ b/src/nfa/nfa_dump_api.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,6 +36,7 @@ #if defined(DUMP_SUPPORT) #include +#include struct NFA; @@ -45,7 +46,7 @@ namespace ue2 { * \brief Dump (in Graphviz 'dot' format) a representation of the NFA into the * file pointed to by dotFile. */ -void nfaDumpDot(const struct NFA *nfa, FILE *dotFile); +void nfaDumpDot(const struct NFA *nfa, FILE *dotFile, const std::string &base); /** \brief Dump a textual representation of the NFA. 
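 *
 * (Unchanged by this series: only the dot dump gains the base-path
 * parameter, which container engines use to name per-subengine files.)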
*/ void nfaDumpText(const struct NFA *fact, FILE *textFile); diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 577c2fd0..cf2aa7f5 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -40,6 +40,7 @@ #include "limex.h" #include "mcclellandump.h" #include "mpv_dump.h" +#include "tamarama_dump.h" #ifndef DUMP_SUPPORT #error "no dump support" @@ -73,12 +74,14 @@ namespace ue2 { DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ + DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ default: \ assert(0); \ } -void nfaDumpDot(const struct NFA *nfa, FILE *dotFile) { - DISPATCH_BY_NFA_TYPE(_dumpDot(nfa, dotFile)); +void nfaDumpDot(const struct NFA *nfa, FILE *dotFile, + const std::string &base) { + DISPATCH_BY_NFA_TYPE(_dumpDot(nfa, dotFile, base)); } void nfaDumpText(const struct NFA *nfa, FILE *txtFile) { diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index d0a4ca0b..a3703cb5 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -67,6 +67,7 @@ enum NFAEngineType { LBR_NFA_Shuf, /**< magic pseudo nfa */ LBR_NFA_Truf, /**< magic pseudo nfa */ CASTLE_NFA_0, /**< magic pseudo nfa */ + TAMARAMA_NFA_0, /**< magic nfa container */ /** \brief bogus NFA - not used */ INVALID_NFA }; @@ -173,6 +174,12 @@ int isLbrType(u8 t) { t == LBR_NFA_Shuf || t == LBR_NFA_Truf; } +/** \brief True if the given type (from NFA::type) is a container engine. */ +static really_inline +int isContainerType(u8 t) { + return t == TAMARAMA_NFA_0; +} + static really_inline int isMultiTopType(u8 t) { return !isDfaType(t) && !isLbrType(t); diff --git a/src/nfa/tamarama.c b/src/nfa/tamarama.c new file mode 100644 index 00000000..e8dd7690 --- /dev/null +++ b/src/nfa/tamarama.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + \brief Tamarama: container engine for exclusive engines, runtime code. 
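+
+ Only one subengine can be live at a time: stream state begins with the
+ index of the active subengine (numSubEngines doubles as the "nothing
+ active yet" sentinel), followed by a state pool sized for the largest
+ subengine. Incoming top events are rebased into the live subengine's
+ local top space before dispatch.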
+*/ +#include "config.h" + +#include "tamarama.h" + +#include "tamarama_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_api_util.h" +#include "nfa_internal.h" +#include "scratch.h" +#include "util/partial_store.h" + +static really_inline +u32 getSubOffset(const struct Tamarama *t, u32 num) { + DEBUG_PRINTF("subengine:%u\n", num); + assert(num < t->numSubEngines); + const u32 *sub = + (const u32 *)((const char *)t + sizeof(struct Tamarama) + + t->numSubEngines * sizeof(u32)); + assert(ISALIGNED(sub)); + return sub[num]; +} + +static +const struct NFA *getSubEngine(const struct Tamarama *t, + const u32 activeIdx) { + const u32 offset = getSubOffset(t, activeIdx); + DEBUG_PRINTF("activeIdx:%u offsets:%u\n", activeIdx, offset); + const char *base = (const char *)t; + return (const struct NFA *)(base + offset); +} + +static +void storeActiveIdx(const struct Tamarama *t, char *state, + const u32 idx) { + assert(idx <= t->numSubEngines); + partial_store_u32(state, idx, t->activeIdxSize); +} + +static +u32 loadActiveIdx(const char *state, + const u32 activeIdxSize) { + return partial_load_u32(state, activeIdxSize); +} + +static really_inline +void copyQueueProperties(const struct mq *q1, struct mq *q2, + const u32 activeIdxSize) { + q2->state = q1->state; + q2->streamState = q1->streamState + activeIdxSize; + q2->offset = q1->offset; + q2->buffer = q1->buffer; + q2->length = q1->length; + q2->history = q1->history; + q2->hlength = q1->hlength; + q2->cb = q1->cb; + q2->som_cb = q1->som_cb; + q2->context = q1->context; + q2->scratch = q1->scratch; + q2->report_current = q1->report_current; +} + +static +void copyQueueItems(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + + u32 lower = baseTop[activeIdx]; + u32 upper = activeIdx == t->numSubEngines - 1 ? + ~0U : baseTop[activeIdx + 1]; + u32 event_base = isMultiTopType(sub->type) ? 
MQE_TOP_FIRST : MQE_TOP; + while (q1->cur < q1->end) { + u32 type = q1->items[q1->cur].type; + s64a loc = q1->items[q1->cur].location; + DEBUG_PRINTF("type:%u lower:%u upper:%u\n", type, lower, upper); + if (type >= lower && type < upper) { + u32 event = event_base; + if (event == MQE_TOP_FIRST) { + event += type - lower; + } + pushQueue(q2, event, loc); + } else { + pushQueueNoMerge(q2, MQE_END, loc); + break; + } + q1->cur++; + } +} + +static +void copyQueue(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + copyQueueProperties(q1, q2, t->activeIdxSize); + + // copy MQE_START item + u32 cur = q1->cur++; + q2->cur = cur; + q2->items[cur] = q1->items[cur]; + q2->end = cur + 1; + + copyQueueItems(t, sub, q1, q2, activeIdx); + // restore cur index of the main queue + q1->cur = cur; +} + +static +u32 findEngineForTop(const u32 *baseTop, const u32 cur, + const u32 numSubEngines) { + u32 i; + for (i = 0; i < numSubEngines; ++i) { + DEBUG_PRINTF("cur:%u base:%u\n", cur, baseTop[i]); + if (cur >= baseTop[i] && + (i == numSubEngines - 1 || cur < baseTop[i + 1])) { + break; + } + } + return i; +} + +static +void initSubQueue(const struct Tamarama *t, struct mq *q1, + struct mq *q2, const u32 lastActiveIdx, + const u32 activeIdx) { + // Push events to the new queue + const struct NFA *sub = getSubEngine(t, activeIdx); + assert(!isContainerType(sub->type)); + q2->nfa = sub; + + // Reinitialize state if the last active subengine is different + // from current one + if (lastActiveIdx == t->numSubEngines || + lastActiveIdx != activeIdx) { + nfaQueueInitState(q2->nfa, q2); + } + + copyQueueItems(t, sub, q1, q2, activeIdx); + if (q1->items[q1->cur].type == MQE_END) { + q1->cur++; + } + DEBUG_PRINTF("update lastIdx:%u\n", activeIdx); + storeActiveIdx(t, q1->streamState, activeIdx); +} + +static +void updateQueues(const struct Tamarama *t, struct mq *q1, struct mq *q2) { + q2->cur = q2->end = 0; + copyQueueProperties(q1, q2, t->activeIdxSize); + + const u32 numSubEngines = t->numSubEngines; + u32 lastActiveIdx = loadActiveIdx(q1->streamState, + t->activeIdxSize); +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q1); +#endif + + // Push MQE_START event to the subqueue + s64a loc = q1->items[q1->cur].location; + pushQueueAt(q2, 0, MQE_START, loc); + char hasStart = 0; + if (q1->items[q1->cur].type == MQE_START) { + hasStart = 1; + q1->cur++; + } + + u32 activeIdx = lastActiveIdx; + // If we have top events in the main queue, update current active id + if (q1->cur < q1->end - 1) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + u32 curTop = q1->items[q1->cur].type; + activeIdx = findEngineForTop(baseTop, curTop, numSubEngines); + } + + assert(activeIdx < numSubEngines); + DEBUG_PRINTF("last id:%u, current id:%u, num of subengines:%u\n", + lastActiveIdx, activeIdx, numSubEngines); + // Handle unfinished last alive subengine + if (lastActiveIdx != activeIdx && + lastActiveIdx != numSubEngines && hasStart) { + loc = q1->items[q1->cur].location; + pushQueueNoMerge(q2, MQE_END, loc); + q2->nfa = getSubEngine(t, lastActiveIdx); + return; + } + + initSubQueue(t, q1, q2, lastActiveIdx, activeIdx); + DEBUG_PRINTF("finish queues\n"); +} + +// After processing subqueue items for subengines, we need to copy back +// remaining items in subqueue if there are any to Tamarama main queue +static +void copyBack(const struct Tamarama *t, struct mq *q, struct mq *q1) { + DEBUG_PRINTF("copy back %u, %u\n", q1->cur, q1->end); + 
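+    // Any events the subengine did not consume are written back over the
+    // tail of the main queue: the first becomes a fresh MQE_START, and top
+    // events are rebased from the subengine's local space (event_base) back
+    // to the tamarama's top space (base) for later re-dispatch.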
q->report_current = q1->report_current; + if (q->cur >= q->end && q1->cur >= q1->end) { + return; + } + + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + const u32 lastIdx = loadActiveIdx(q->streamState, + t->activeIdxSize); + u32 base = 0, event_base = 0; + if (lastIdx != t->numSubEngines) { + base = baseTop[lastIdx]; + const struct NFA *sub = getSubEngine(t, lastIdx); + event_base = isMultiTopType(sub->type) ? MQE_TOP_FIRST : MQE_TOP; + } + + u32 numItems = q1->end > q1->cur + 1 ? q1->end - q1->cur - 1 : 1; + // Also need to copy MQE_END if the main queue is empty + if (q->cur == q->end) { + numItems++; + } + u32 cur = q->cur - numItems; + q->items[cur] = q1->items[q1->cur++]; + q->items[cur].type = MQE_START; + q->cur = cur++; + for (u32 i = 0; i < numItems - 1; ++i) { + u32 type = q1->items[q1->cur].type; + if (type > MQE_END) { + q1->items[q1->cur].type = type - event_base + base; + } + q->items[cur++] = q1->items[q1->cur++]; + } + +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q); +#endif +} + +char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, SomNfaCallback som_cb, + void *context) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return MO_CONTINUE_MATCHING; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + if (nfaAcceptsEod(sub)) { + assert(!isContainerType(sub->type)); + const char *subStreamState = streamState + t->activeIdxSize; + return nfaCheckFinalState(sub, state, subStreamState, + offset, callback, som_cb, context); + } + + return MO_CONTINUE_MATCHING; +} + +char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, + ReportID report) { + DEBUG_PRINTF("exec rose\n"); + struct mq q1; + q1.cur = q1.end = 0; + char rv = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end) { + updateQueues(t, q, &q1); + } + + if (q1.cur < q1.end) { + rv = nfaQueueExecRose(q1.nfa, &q1, report); + } + + DEBUG_PRINTF("exec rose rv:%u\n", rv); + return rv; +} + +char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 1; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaReportCurrentMatches(sub, &q1); +} + +char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAcceptState(sub, report, &q1); +} + +char nfaExecTamarama0_inAnyAccept(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAnyAcceptState(sub, &q1); +} + +char nfaExecTamarama0_queueInitState(const struct NFA *n, struct mq *q) { + DEBUG_PRINTF("init state\n"); + const struct Tamarama *t = getImplNfa(n); + char *ptr = q->streamState; + // 
Use activeIdxSize as a sentinel value and initialize the state to + // an invalid engine as nothing has been triggered yet + storeActiveIdx(t, ptr, t->numSubEngines); + return 0; +} + +char nfaExecTamarama0_queueCompressState(const struct NFA *n, + const struct mq *q, s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueueProperties(q, &q1, t->activeIdxSize); + return nfaQueueCompressState(sub, &q1, loc); +} + +char nfaExecTamarama0_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(src, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + const char *subStreamState = (const char *)src + t->activeIdxSize; + return nfaExpandState(sub, dest, subStreamState, offset, key); +} + +enum nfa_zombie_status nfaExecTamarama0_zombie_status(const struct NFA *n, + struct mq *q, s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return NFA_ZOMBIE_NO; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaGetZombieStatus(sub, &q1, loc); +} + +char nfaExecTamarama0_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("exec\n"); + struct mq q1; + char rv = MO_ALIVE; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end) { + updateQueues(t, q, &q1); + rv = nfaQueueExec_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + if (can_stop_matching(q->scratch)) { + break; + } + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + +char nfaExecTamarama0_Q2(const struct NFA *n, + struct mq *q, s64a end) { + DEBUG_PRINTF("exec to match\n"); + struct mq q1; + char rv = 0; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end && + rv != MO_MATCHES_PENDING) { + updateQueues(t, q, &q1); + rv = nfaQueueExec2_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + diff --git a/src/nfa/tamarama.h b/src/nfa/tamarama.h new file mode 100644 index 00000000..c39639a6 --- /dev/null +++ b/src/nfa/tamarama.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TAMARAMA_H +#define TAMARAMA_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; +struct hs_scratch; + +char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, SomNfaCallback som_cb, + void *context); +char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecTamarama0_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecTamarama0_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecTamarama0_queueCompressState(const struct NFA *n, + const struct mq *q, + s64a loc); +char nfaExecTamarama0_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key); +enum nfa_zombie_status nfaExecTamarama0_zombie_status(const struct NFA *n, + struct mq *q, s64a loc); +char nfaExecTamarama0_Q(const struct NFA *nfa, struct mq *q, s64a end); +char nfaExecTamarama0_Q2(const struct NFA *nfa, struct mq *q, s64a end); + +// only used by outfix and miracles, no implementation for tamarama +#define nfaExecTamarama0_initCompressedState NFA_API_NO_IMPL +#define nfaExecTamarama0_B_Reverse NFA_API_NO_IMPL + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/nfa/tamarama_dump.cpp b/src/nfa/tamarama_dump.cpp new file mode 100644 index 00000000..ed2f1cb1 --- /dev/null +++ b/src/nfa/tamarama_dump.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Tamarama: container engine for exclusive engines, dump code.
+ */
+
+#include "config.h"
+
+#include "tamarama_dump.h"
+
+#include "tamarama_internal.h"
+#include "nfa_dump_api.h"
+#include "nfa_dump_internal.h"
+#include "nfa_internal.h"
+
+#include <cstdio>
+#include <sstream>
+
+#ifndef DUMP_SUPPORT
+#error No dump support!
+#endif
+
+namespace ue2 {
+
+void nfaExecTamarama0_dumpDot(const struct NFA *nfa, UNUSED FILE *f,
+                              const std::string &base) {
+    const Tamarama *t = (const Tamarama *)getImplNfa(nfa);
+    const u32 *subOffset =
+        (const u32 *)((const char *)t + sizeof(struct Tamarama) +
+                      t->numSubEngines * sizeof(u32));
+    const char *offset = (const char *)nfa;
+    for (u32 i = 0; i < t->numSubEngines; i++) {
+        std::stringstream ssdot;
+        ssdot << base << "rose_nfa_" << nfa->queueIndex
+              << "_sub_" << i << ".dot";
+        const NFA *sub = (const struct NFA *)(offset + subOffset[i]);
+        FILE *f1 = fopen(ssdot.str().c_str(), "w");
+        nfaDumpDot(sub, f1, base);
+        fclose(f1);
+    }
+}
+
+void nfaExecTamarama0_dumpText(const struct NFA *nfa, FILE *f) {
+    const Tamarama *t = (const Tamarama *)getImplNfa(nfa);
+
+    fprintf(f, "Tamarama container engine\n");
+    fprintf(f, "\n");
+    fprintf(f, "Number of subengine tenants: %u\n", t->numSubEngines);
+
+    fprintf(f, "\n");
+    dumpTextReverse(nfa, f);
+    fprintf(f, "\n");
+
+    const u32 *subOffset =
+        (const u32 *)((const char *)t + sizeof(struct Tamarama) +
+                      t->numSubEngines * sizeof(u32));
+    const char *offset = (const char *)nfa;
+    for (u32 i = 0; i < t->numSubEngines; i++) {
+        fprintf(f, "Sub %u:\n", i);
+        const NFA *sub = (const struct NFA *)(offset + subOffset[i]);
+        nfaDumpText(sub, f);
+        fprintf(f, "\n");
+    }
+}
+
+} // namespace ue2
diff --git a/src/nfa/tamarama_dump.h b/src/nfa/tamarama_dump.h
new file mode 100644
index 00000000..dc976004
--- /dev/null
+++ b/src/nfa/tamarama_dump.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TAMARAMA_DUMP_H +#define TAMARAMA_DUMP_H + +#if defined(DUMP_SUPPORT) + +#include +#include + +struct NFA; + +namespace ue2 { + +void nfaExecTamarama0_dumpDot(const NFA *nfa, FILE *file, + const std::string &base); +void nfaExecTamarama0_dumpText(const NFA *nfa, FILE *file); + +} // namespace ue2 + +#endif // DUMP_SUPPORT + +#endif diff --git a/src/nfa/tamarama_internal.h b/src/nfa/tamarama_internal.h new file mode 100644 index 00000000..5cdc70d4 --- /dev/null +++ b/src/nfa/tamarama_internal.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + *\brief Tamarama: container engine for exclusive engines, + * data structures. + */ + +/* Tamarama bytecode layout: + * * |-----| + * * | | struct NFA + * * |-----| + * * | | struct Tamarama + * * | | + * * |-----| + * * | | top remapping table: + * * | | stores top base for each subengine. + * * | | old_top = remapped_top - top_base; + * * | | The size of table is equal to the number of subengines. + * * ... + * * | | + * * |-----| + * * | | offsets from the start of struct Tamarama to subengines --\ + * * ... | + * * | | -----------\ | + * * |-----| | | + * * ||--| | subengine 1 (struct NFA + rest of subengine) <--/ | + * * || | | | + * * ||--| | | + * * || | | | + * * || | | | + * * ||--| | | + * * | | | + * * ||--| | subengine 2 (struct NFA + rest of subengine) <-------/ + * * || | | + * * ||--| | + * * || | | + * * || | | + * * ||--| | + * * | | + * * ... 
+ * * | | + * * |-----| total size of tamarama + * * + * * Tamarama stream state: + * * + * * |---| + * * | | active subengine id + * * |---| + * * | | common pool of stream state for each engine + * * | | + * * | | + * * ... + * * | | + * * | | + * * |---| + * * + * * Tamarama scratch space: + * * + * * |---| + * * | | common pool of scratch for each engine + * * | | + * * | | + * * ... + * * | | + * * | | + * * |---| + * */ + +#ifndef NFA_TAMARAMA_INTERNAL_H +#define NFA_TAMARAMA_INTERNAL_H + +#include "ue2common.h" + +struct ALIGN_AVX_DIRECTIVE Tamarama { + u32 numSubEngines; + u8 activeIdxSize; +}; + +#endif // NFA_TAMARAMA_INTERNAL_H diff --git a/src/nfa/tamaramacompile.cpp b/src/nfa/tamaramacompile.cpp new file mode 100644 index 00000000..73d19595 --- /dev/null +++ b/src/nfa/tamaramacompile.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Tamarama: container engine for exclusive engines, + * compiler code. + */ + +#include "config.h" + +#include "tamaramacompile.h" + +#include "tamarama_internal.h" +#include "nfa_internal.h" +#include "nfa_api_queue.h" +#include "repeatcompile.h" +#include "util/container.h" +#include "util/verify_types.h" + +using namespace std; + +namespace ue2 { + +static +void remapTops(const TamaInfo &tamaInfo, + vector &top_base, + map, u32> &out_top_remap) { + u32 i = 0; + u32 cur = 0; + for (const auto &sub : tamaInfo.subengines) { + u32 base = cur; + top_base.push_back(base + MQE_TOP_FIRST); + DEBUG_PRINTF("subengine:%u\n", i); + for (const auto &t : tamaInfo.tops[i++]) { + cur = base + t; + DEBUG_PRINTF("top remapping %u:%u\n", t ,cur); + out_top_remap.emplace(make_pair(sub, t), cur++); + } + } +} + +/** + * update stream state and scratch state sizes and copy in + * subengines in Tamarama. 
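+ *
+ * Each subengine body is copied in after the offset table, rounded up to a
+ * cache-line boundary; the container's stream state size becomes the
+ * active-index bytes plus the largest subengine stream state, and its
+ * scratch state size the largest subengine scratch state.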
+ */
+static
+void copyInSubnfas(const char *base_offset, NFA &nfa,
+                   const TamaInfo &tamaInfo, u32 *offsets,
+                   char *sub_nfa_offset, const u32 activeIdxSize) {
+    u32 maxStreamStateSize = 0;
+    u32 maxScratchStateSize = 0;
+    sub_nfa_offset = ROUNDUP_PTR(sub_nfa_offset, 64);
+    bool infinite_max_width = false;
+    for (auto &sub : tamaInfo.subengines) {
+        u32 streamStateSize = verify_u32(sub->streamStateSize);
+        u32 scratchStateSize = verify_u32(sub->scratchStateSize);
+        maxStreamStateSize = max(maxStreamStateSize, streamStateSize);
+        maxScratchStateSize = max(maxScratchStateSize, scratchStateSize);
+        sub->queueIndex = nfa.queueIndex;
+
+        memcpy(sub_nfa_offset, sub, sub->length);
+        *offsets = verify_u32(sub_nfa_offset - base_offset);
+        DEBUG_PRINTF("type:%u offsets:%u\n", sub->type, *offsets);
+        ++offsets;
+        sub_nfa_offset += ROUNDUP_CL(sub->length);
+
+        // update nfa properties
+        nfa.flags |= sub->flags;
+        if (!sub->maxWidth) {
+            infinite_max_width = true;
+        } else if (!infinite_max_width) {
+            nfa.maxWidth = max(nfa.maxWidth, sub->maxWidth);
+        }
+    }
+
+    if (infinite_max_width) {
+        nfa.maxWidth = 0;
+    }
+    nfa.maxBiAnchoredWidth = 0;
+    nfa.streamStateSize = activeIdxSize + maxStreamStateSize;
+    nfa.scratchStateSize = maxScratchStateSize;
+}
+
+/**
+ * Takes in a collection of exclusive subengines and produces a tamarama; also
+ * returns, via out_top_remap, a mapping indicating how tops in the subengines
+ * relate to the tamarama's tops.
+ */
+aligned_unique_ptr<NFA>
+buildTamarama(const TamaInfo &tamaInfo, const u32 queue,
+              map<pair<const NFA *, u32>, u32> &out_top_remap) {
+    vector<u32> top_base;
+    remapTops(tamaInfo, top_base, out_top_remap);
+
+    size_t subSize = tamaInfo.subengines.size();
+    DEBUG_PRINTF("subSize:%lu\n", subSize);
+    size_t total_size =
+        sizeof(NFA) +               // initial NFA structure
+        sizeof(Tamarama) +          // Tamarama structure
+        sizeof(u32) * subSize +     // base top event value for subengines,
+                                    // used for top remapping at runtime
+        sizeof(u32) * subSize + 64; // offsets to subengines in bytecode and
+                                    // padding for subengines
+
+    for (const auto &sub : tamaInfo.subengines) {
+        total_size += ROUNDUP_CL(sub->length);
+    }
+
+    // use subSize as a sentinel value for no active subengines,
+    // so add one to subSize here
+    u32 activeIdxSize = calcPackedBytes(subSize + 1);
+    aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
+    nfa->type = verify_u8(TAMARAMA_NFA_0);
+    nfa->length = verify_u32(total_size);
+    nfa->queueIndex = queue;
+
+    char *ptr = (char *)nfa.get() + sizeof(NFA);
+    char *base_offset = ptr;
+    Tamarama *t = (Tamarama *)ptr;
+    t->numSubEngines = verify_u32(subSize);
+    t->activeIdxSize = verify_u8(activeIdxSize);
+
+    ptr += sizeof(Tamarama);
+    copy_bytes(ptr, top_base);
+    ptr += byte_length(top_base);
+
+    u32 *offsets = (u32 *)ptr;
+    char *sub_nfa_offset = ptr + sizeof(u32) * subSize;
+    copyInSubnfas(base_offset, *nfa, tamaInfo, offsets, sub_nfa_offset,
+                  activeIdxSize);
+    assert((size_t)(sub_nfa_offset - (char *)nfa.get()) <= total_size);
+    return nfa;
+}
+
+set<ReportID> all_reports(const TamaProto &proto) {
+    return proto.reports;
+}
+
+void TamaInfo::add(NFA *sub, const set<u32> &top) {
+    assert(subengines.size() < max_occupancy);
+    subengines.push_back(sub);
+    tops.push_back(top);
+}
+
+void TamaProto::add(const NFA *n, const u32 id, const u32 top,
+                    const map<pair<const NFA *, u32>, u32> &out_top_remap) {
+    top_remap.emplace(make_pair(id, top), out_top_remap.at(make_pair(n, top)));
+}
+
+} // namespace ue2
+
diff --git a/src/nfa/tamaramacompile.h b/src/nfa/tamaramacompile.h
new file mode 100644
index 00000000..048b966b
--- /dev/null
+++ b/src/nfa/tamaramacompile.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Tamarama: container engine for exclusive engines, compiler code.
+ */
+
+#ifndef NFA_TAMARAMACOMPILE_H
+#define NFA_TAMARAMACOMPILE_H
+
+#include "ue2common.h"
+#include "util/alloc.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+struct NFA;
+
+namespace ue2 {
+
+/**
+ * \brief A TamaProto that contains top remapping and reports info.
+ */
+struct TamaProto {
+    void add(const NFA *n, const u32 id, const u32 top,
+             const std::map<std::pair<const NFA *, u32>, u32> &out_top_remap);
+    /** Top remapping between <vertex id, top value> and
+     ** remapped top value. */
+    std::map<std::pair<u32, u32>, u32> top_remap;
+
+    /** All the reports in subengines */
+    std::set<ReportID> reports;
+};
+
+/**
+ * \brief Construction info for a Tamarama engine:
+ * contains at least two subengines.
+ *
+ * A TamaInfo is converted into a single NFA, with each top triggering a
+ * subengine. A TamaInfo can contain at most TamaInfo::max_occupancy
+ * subengines.
+ */
+struct TamaInfo {
+    static constexpr size_t max_occupancy = 65536; // arbitrary limit
+
+    /** \brief Add a new subengine. */
+    void add(NFA *sub, const std::set<u32> &top);
+
+    /** \brief All the subengines */
+    std::vector<NFA *> subengines;
+
+    /** \brief Tops of subengines */
+    std::vector<std::set<u32>> tops;
+};
+
+std::set<ReportID> all_reports(const TamaProto &proto);
+
+/**
+ * Takes in a collection of exclusive subengines and produces a tamarama; also
+ * returns, via out_top_remap, a mapping indicating how tops in the subengines
+ * relate to the tamarama's tops.
+ */
+ue2::aligned_unique_ptr<NFA>
+buildTamarama(const TamaInfo &tamaInfo, const u32 queue,
+              std::map<std::pair<const NFA *, u32>, u32> &out_top_remap);
+
+} // namespace ue2
+
+#endif // NFA_TAMARAMACOMPILE_H
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 3f56b101..23e025d0 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -33,6 +33,7 @@
 #include "hs_compile.h" // for HS_MODE_*
 #include "rose_build_add_internal.h"
 #include "rose_build_anchored.h"
+#include "rose_build_exclusive.h"
 #include "rose_build_groups.h"
 #include "rose_build_infix.h"
 #include "rose_build_lookaround.h"
@@ -50,6 +51,8 @@
 #include "nfa/nfa_build_util.h"
 #include "nfa/nfa_internal.h"
 #include "nfa/shufticompile.h"
+#include "nfa/tamaramacompile.h"
+#include "nfa/tamarama_internal.h"
 #include "nfagraph/ng_execute.h"
 #include "nfagraph/ng_holder.h"
 #include "nfagraph/ng_lbr.h"
@@ -71,6 +74,7 @@
 #include "util/compile_error.h"
 #include "util/container.h"
 #include "util/graph_range.h"
+#include "util/make_unique.h"
 #include "util/multibit_build.h"
 #include "util/order_check.h"
 #include "util/queue_index_factory.h"
@@ -1422,6 +1426,296 @@ bool buildLeftfix(RoseBuildImpl &build, build_context &bc, bool prefix, u32 qi,
     return true;
 }
 
+static
+unique_ptr<TamaInfo> constructTamaInfo(const RoseGraph &g,
+                     const vector<ExclusiveSubengine> &subengines,
+                     const bool is_suffix) {
+    unique_ptr<TamaInfo> tamaInfo = ue2::make_unique<TamaInfo>();
+    for (const auto &sub : subengines) {
+        const auto &rose_vertices = sub.vertices;
+        NFA *nfa = sub.nfa.get();
+        set<u32> tops;
+        for (const auto &v : rose_vertices) {
+            if (is_suffix) {
+                tops.insert(g[v].suffix.top);
+            } else {
+                for (const auto &e : in_edges_range(v, g)) {
+                    tops.insert(g[e].rose_top);
+                }
+            }
+        }
+        tamaInfo->add(nfa, tops);
+    }
+
+    return tamaInfo;
+}
+
+static
+void updateTops(const RoseGraph &g, const TamaInfo &tamaInfo,
+                TamaProto &tamaProto,
+                const vector<ExclusiveSubengine> &subengines,
+                const map<pair<const NFA *, u32>, u32> &out_top_remap,
+                const bool is_suffix) {
+    u32 i = 0;
+    for (const auto &n : tamaInfo.subengines) {
+        for (const auto &v : subengines[i].vertices) {
+            if (is_suffix) {
+                tamaProto.add(n, g[v].idx, g[v].suffix.top,
+                              out_top_remap);
+            } else {
+                for (const auto &e : in_edges_range(v, g)) {
+                    tamaProto.add(n, g[v].idx, g[e].rose_top,
+                                  out_top_remap);
+                }
+            }
+        }
+        i++;
+    }
+}
+
+static
+shared_ptr<TamaProto> constructContainerEngine(const RoseGraph &g,
+                                               build_context &bc,
+                                               const ExclusiveInfo &info,
+                                               const u32 queue,
+                                               const bool is_suffix) {
+    const auto &subengines = info.subengines;
+    auto tamaInfo =
+        constructTamaInfo(g, subengines, is_suffix);
+
+    map<pair<const NFA *, u32>, u32> out_top_remap;
+    auto n = buildTamarama(*tamaInfo, queue, out_top_remap);
+    add_nfa_to_blob(bc, *n);
+
+    DEBUG_PRINTF("queue id:%u\n", queue);
+    shared_ptr<TamaProto> tamaProto = make_shared<TamaProto>();
+    tamaProto->reports = info.reports;
+    updateTops(g, *tamaInfo, *tamaProto, subengines,
+               out_top_remap, is_suffix);
+    return tamaProto;
+}
+
+static
+void buildInfixContainer(RoseGraph &g, build_context &bc,
+                         const vector<ExclusiveInfo> &exclusive_info) {
+    // Build tamarama engine
+    for (const auto &info : exclusive_info) {
+        const u32 queue = info.queue;
+        const auto &subengines = info.subengines;
+        auto tamaProto =
+            constructContainerEngine(g, bc, info, queue, false);
+
+        for (const auto &sub : subengines) {
+            const auto &verts = sub.vertices;
+            for (const auto &v : verts) {
+                DEBUG_PRINTF("vert id:%lu\n", g[v].idx);
+                g[v].left.tamarama = tamaProto;
+            }
+        }
+    }
+}
+
+static
+void buildSuffixContainer(RoseGraph &g, build_context &bc,
+                          const vector<ExclusiveInfo> &exclusive_info) {
+    //
Build tamarama engine + for (const auto &info : exclusive_info) { + const u32 queue = info.queue; + const auto &subengines = info.subengines; + auto tamaProto = + constructContainerEngine(g, bc, info, queue, true); + for (const auto &sub : subengines) { + const auto &verts = sub.vertices; + for (const auto &v : verts) { + DEBUG_PRINTF("vert id:%lu\n", g[v].idx); + g[v].suffix.tamarama = tamaProto; + } + const auto &v = verts[0]; + suffix_id newSuffix(g[v].suffix); + bc.suffixes.emplace(newSuffix, queue); + } + } +} + +static +void updateExclusiveInfixProperties(const RoseBuildImpl &build, + build_context &bc, + const vector &exclusive_info, + set *no_retrigger_queues) { + const RoseGraph &g = build.g; + for (const auto &info : exclusive_info) { + // Set leftfix optimisations, disabled for tamarama subengines + rose_group squash_mask = ~rose_group{0}; + // Leftfixes can have stop alphabets. + vector stop(N_CHARS, 0); + // Infix NFAs can have bounds on their queue lengths. + u32 max_queuelen = 0; + u32 max_width = 0; + u8 cm_count = 0; + CharReach cm_cr; + + const auto &qi = info.queue; + const auto &subengines = info.subengines; + bool no_retrigger = true; + for (const auto &sub : subengines) { + const auto &verts = sub.vertices; + const auto &v_first = verts[0]; + left_id leftfix(g[v_first].left); + if (leftfix.haig() || !leftfix.graph() || + !nfaStuckOn(*leftfix.graph())) { + no_retrigger = false; + } + + for (const auto &v : verts) { + set lits; + for (auto u : inv_adjacent_vertices_range(v, build.g)) { + for (u32 lit_id : build.g[u].literals) { + lits.insert(build.literals.right.at(lit_id).s); + } + } + DEBUG_PRINTF("%zu literals\n", lits.size()); + + u32 queuelen = findMaxInfixMatches(leftfix, lits); + if (queuelen < UINT32_MAX) { + queuelen++; + } + max_queuelen = max(max_queuelen, queuelen); + } + } + + if (no_retrigger) { + no_retrigger_queues->insert(qi); + } + + for (const auto &sub : subengines) { + const auto &verts = sub.vertices; + for (const auto &v : verts) { + u32 lag = g[v].left.lag; + bc.leftfix_info.emplace( + v, left_build_info(qi, lag, max_width, squash_mask, stop, + max_queuelen, cm_count, cm_cr)); + } + } + } +} + +static +void updateExclusiveSuffixProperties(const RoseBuildImpl &build, + const vector &exclusive_info, + set *no_retrigger_queues) { + const RoseGraph &g = build.g; + for (auto &info : exclusive_info) { + const auto &qi = info.queue; + const auto &subengines = info.subengines; + bool no_retrigger = true; + for (const auto &sub : subengines) { + const auto &v_first = sub.vertices[0]; + suffix_id suffix(g[v_first].suffix); + if (!suffix.graph() || !nfaStuckOn(*suffix.graph())) { + no_retrigger = false; + break; + } + } + + if (no_retrigger) { + no_retrigger_queues->insert(qi); + } + } +} + +static +void buildExclusiveInfixes(RoseBuildImpl &build, build_context &bc, + QueueIndexFactory &qif, + const map> &infixTriggers, + const map> &vertex_map, + const vector> &groups, + set *no_retrigger_queues) { + RoseGraph &g = build.g; + const CompileContext &cc = build.cc; + + vector exclusive_info; + for (const auto &gp : groups) { + ExclusiveInfo info; + for (const auto &id : gp) { + const auto &verts = vertex_map.at(id); + left_id leftfix(g[verts[0]].left); + + bool is_transient = false; + auto n = makeLeftNfa(build, leftfix, false, is_transient, + infixTriggers, cc); + assert(n); + + setLeftNfaProperties(*n, leftfix); + + ExclusiveSubengine engine; + engine.nfa = move(n); + engine.vertices = verts; + info.subengines.push_back(move(engine)); + } + info.queue = 
qif.get_queue(); + exclusive_info.push_back(move(info)); + } + updateExclusiveInfixProperties(build, bc, exclusive_info, + no_retrigger_queues); + buildInfixContainer(g, bc, exclusive_info); +} + +static +void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc, + QueueIndexFactory &qif, + const map> &infixTriggers, + set *no_retrigger_queues) { + const RoseGraph &g = build.g; + + set> roleInfoSet; + map> vertex_map; + + u32 role_id = 0; + map leftfixes; + for (auto v : vertices_range(g)) { + if (!g[v].left || build.isRootSuccessor(v)) { + continue; + } + + left_id leftfix(g[v].left); + + // Sanity check: our NFA should contain each of the tops mentioned on + // our in-edges. + assert(roseHasTops(g, v)); + + if (contains(leftfixes, leftfix)) { + // NFA already built. + u32 id = leftfixes[leftfix]; + if (contains(vertex_map, id)) { + vertex_map[id].push_back(v); + } + DEBUG_PRINTF("sharing leftfix, id=%u\n", id); + continue; + } + + if (leftfix.graph() || leftfix.castle()) { + leftfixes.emplace(leftfix, role_id); + vertex_map[role_id].push_back(v); + + map>> triggers; + findTriggerSequences(build, infixTriggers.at(leftfix), &triggers); + RoleInfo info(leftfix, role_id); + if (setTriggerLiteralsInfix(info, triggers)) { + roleInfoSet.insert(info); + } + role_id++; + } + } + + if (leftfixes.size() > 1) { + DEBUG_PRINTF("leftfix size:%lu\n", leftfixes.size()); + vector> groups; + exclusiveAnalysisInfix(build, vertex_map, roleInfoSet, groups); + buildExclusiveInfixes(build, bc, qif, infixTriggers, vertex_map, + groups, no_retrigger_queues); + } +} + static bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, set *no_retrigger_queues, @@ -1434,8 +1728,13 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, unordered_map > succs; findInfixTriggers(tbi, &infixTriggers); + if (cc.grey.allowTamarama && cc.streaming && !do_prefix) { + findExclusiveInfixes(tbi, bc, qif, infixTriggers, + no_retrigger_queues); + } + for (auto v : vertices_range(g)) { - if (!g[v].left) { + if (!g[v].left || g[v].left.tamarama) { continue; } @@ -1753,11 +2052,111 @@ void setSuffixProperties(NFA &n, const suffix_id &suff, } static -bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, - set *no_retrigger_queues) { - map > suffixTriggers; - findSuffixTriggers(tbi, &suffixTriggers); +void buildExclusiveSuffixes(RoseBuildImpl &build, build_context &bc, + QueueIndexFactory &qif, + map> &suffixTriggers, + const map> &vertex_map, + const vector> &groups, + set *no_retrigger_queues) { + RoseGraph &g = build.g; + vector exclusive_info; + for (const auto &gp : groups) { + ExclusiveInfo info; + for (const auto &id : gp) { + const auto &verts = vertex_map.at(id); + suffix_id s(g[verts[0]].suffix); + + const set &s_triggers = suffixTriggers.at(s); + + map fixed_depth_tops; + findFixedDepthTops(g, s_triggers, &fixed_depth_tops); + + map>> triggers; + findTriggerSequences(build, s_triggers, &triggers); + + auto n = buildSuffix(build.rm, build.ssm, fixed_depth_tops, + triggers, s, build.cc); + assert(n); + + setSuffixProperties(*n, s, build.rm); + + ExclusiveSubengine engine; + engine.nfa = move(n); + engine.vertices = verts; + info.subengines.push_back(move(engine)); + + const auto &reports = all_reports(s); + info.reports.insert(reports.begin(), reports.end()); + } + info.queue = qif.get_queue(); + exclusive_info.push_back(move(info)); + } + updateExclusiveSuffixProperties(build, exclusive_info, + no_retrigger_queues); + buildSuffixContainer(g, bc, exclusive_info); +} + +static 
+void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, + QueueIndexFactory &qif, + map> &suffixTriggers, + set *no_retrigger_queues) { + const RoseGraph &g = tbi.g; + + map suffixes; + set> roleInfoSet; + map> vertex_map; + u32 role_id = 0; + for (auto v : vertices_range(g)) { + if (!g[v].suffix) { + continue; + } + + const suffix_id s(g[v].suffix); + + DEBUG_PRINTF("vertex %zu triggers suffix %p\n", g[v].idx, s.graph()); + + // We may have already built this NFA. + if (contains(suffixes, s)) { + u32 id = suffixes[s]; + if (!tbi.isInETable(v)) { + vertex_map[id].push_back(v); + } + continue; + } + + // Currently disable eod suffixes for exclusive analysis + if (!tbi.isInETable(v) && (s.graph() || s.castle())) { + DEBUG_PRINTF("assigning %p to id %u\n", s.graph(), role_id); + suffixes.emplace(s, role_id); + + vertex_map[role_id].push_back(v); + const set &s_triggers = suffixTriggers.at(s); + map>> triggers; + findTriggerSequences(tbi, s_triggers, &triggers); + + RoleInfo info(s, role_id); + if (setTriggerLiteralsSuffix(info, triggers)) { + roleInfoSet.insert(info); + } + role_id++; + } + } + + if (suffixes.size() > 1) { + DEBUG_PRINTF("suffix size:%lu\n", suffixes.size()); + vector> groups; + exclusiveAnalysisSuffix(tbi, vertex_map, roleInfoSet, groups); + buildExclusiveSuffixes(tbi, bc, qif, suffixTriggers, vertex_map, + groups, no_retrigger_queues); + } +} + +static +bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, + set *no_retrigger_queues, + const map> &suffixTriggers) { // To ensure compile determinism, build suffix engines in order of their // (unique) queue indices, so that we call add_nfa_to_blob in the same // order. @@ -1770,6 +2169,11 @@ bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, for (const auto &e : ordered) { const u32 queue = e.first; const suffix_id &s = e.second; + + if (s.tamarama()) { + continue; + } + const set &s_triggers = suffixTriggers.at(s); map fixed_depth_tops; @@ -1860,11 +2264,20 @@ static bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, set *no_retrigger_queues, set *eager_queues, u32 *leftfixBeginQueue) { + map> suffixTriggers; + findSuffixTriggers(tbi, &suffixTriggers); + + if (tbi.cc.grey.allowTamarama && tbi.cc.streaming) { + findExclusiveSuffixes(tbi, bc, qif, suffixTriggers, + no_retrigger_queues); + } + assignSuffixQueues(tbi, bc); - if (!buildSuffixes(tbi, bc, no_retrigger_queues)) { + if (!buildSuffixes(tbi, bc, no_retrigger_queues, suffixTriggers)) { return false; } + suffixTriggers.clear(); *leftfixBeginQueue = qif.allocated_count(); @@ -3205,7 +3618,15 @@ void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, assert(contains(bc.engineOffsets, qi)); const NFA *nfa = get_nfa_from_blob(bc, qi); u32 suffixEvent; - if (isMultiTopType(nfa->type)) { + if (isContainerType(nfa->type)) { + auto tamaProto = g[v].suffix.tamarama.get(); + assert(tamaProto); + u32 top = (u32)MQE_TOP_FIRST + + tamaProto->top_remap.at(make_pair(g[v].idx, + g[v].suffix.top)); + assert(top < MQE_INVALID); + suffixEvent = top; + } else if (isMultiTopType(nfa->type)) { assert(!g[v].suffix.haig); u32 top = (u32)MQE_TOP_FIRST + g[v].suffix.top; assert(top < MQE_INVALID); @@ -3283,7 +3704,13 @@ void makeRoleInfixTriggers(RoseBuildImpl &build, build_context &bc, // DFAs have no TOP_N support, so they get a classic MQE_TOP event. 
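     // Container engines (tamarama) take a third path below: the
     // (vertex, rose_top) pair is looked up in the proto's top_remap table
     // so the event is steered to the correct subengine.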
u32 top; - if (!isMultiTopType(nfa->type)) { + if (isContainerType(nfa->type)) { + auto tamaProto = g[v].left.tamarama.get(); + assert(tamaProto); + top = MQE_TOP_FIRST + tamaProto->top_remap.at( + make_pair(g[v].idx, g[e].rose_top)); + assert(top < MQE_INVALID); + } else if (!isMultiTopType(nfa->type)) { assert(num_tops(g[v].left) == 1); top = MQE_TOP; } else { diff --git a/src/rose/rose_build_exclusive.cpp b/src/rose/rose_build_exclusive.cpp new file mode 100644 index 00000000..c9e8d215 --- /dev/null +++ b/src/rose/rose_build_exclusive.cpp @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "ue2common.h"
+
+#include "rose_build_exclusive.h"
+#include "rose_build_merge.h"
+#include "nfa/castlecompile.h"
+#include "nfagraph/ng_execute.h"
+#include "nfagraph/ng_holder.h"
+#include "nfagraph/ng_util.h"
+#include "util/clique.h"
+#include "util/compile_context.h"
+#include "util/container.h"
+#include "util/graph.h"
+#include "util/make_unique.h"
+
+using namespace std;
+
+namespace ue2 {
+
+template<typename role_id>
+struct RoleChunk {
+    vector<RoleInfo<role_id>> roles;
+};
+
+static
+CharReach getReachability(const NGHolder &h) {
+    CharReach cr;
+    for (const auto &v : vertices_range(h)) {
+        if (!is_special(v, h)) {
+            cr |= h[v].char_reach;
+        }
+    }
+    return cr;
+}
+
+template<typename role_id>
+static
+vector<RoleChunk<role_id>> divideIntoChunks(const RoseBuildImpl &build,
+                                     set<RoleInfo<role_id>> &roleInfoSet) {
+    u32 chunkSize = build.cc.grey.tamaChunkSize;
+    u32 cnt = 1;
+    vector<RoleChunk<role_id>> chunks;
+    RoleChunk<role_id> roleChunk;
+    for (const auto &roleInfo : roleInfoSet) {
+        if (cnt == chunkSize) {
+            cnt -= chunkSize;
+            chunks.push_back(roleChunk);
+            roleChunk.roles.clear();
+        }
+        roleChunk.roles.push_back(roleInfo);
+        cnt++;
+    }
+
+    if (cnt > 1) {
+        chunks.push_back(roleChunk);
+    }
+
+    return chunks;
+}
+
+/* add prefix literals to engine graph */
+static
+bool addPrefixLiterals(NGHolder &h, ue2::unordered_set<u32> &tailId,
+                       const vector<vector<CharReach>> &triggers) {
+    DEBUG_PRINTF("add literals to graph\n");
+
+    NFAVertex start = h.start;
+    vector<NFAVertex> heads;
+    vector<NFAVertex> tails;
+    for (const auto &lit : triggers) {
+        NFAVertex last = start;
+        if (lit.empty()) {
+            return false;
+        }
+        u32 i = 0;
+        for (const auto &c : lit) {
+            DEBUG_PRINTF("lit:%s \n", c.to_string().c_str());
+            NFAVertex u = add_vertex(h);
+            h[u].char_reach = c;
+            if (!i++) {
+                heads.push_back(u);
+                last = u;
+                continue;
+            }
+            add_edge(last, u, h);
+            last = u;
+        }
+        tails.push_back(last);
+        tailId.insert(h[last].index);
+    }
+
+    for (auto v : adjacent_vertices_range(start, h)) {
+        if (v != h.startDs) {
+            for (auto &t : tails) {
+                add_edge(t, v, h);
+            }
+        }
+    }
+
+    clear_out_edges(start, h);
+    add_edge(h.start, h.start, h);
+    for (auto &t : heads) {
+        add_edge(start, t, h);
+    }
+
+    DEBUG_PRINTF("literals addition done\n");
+    return true;
+}
+
+/* check if one literal is a suffix of another */
+static
+bool isSuffix(const vector<vector<CharReach>> &triggers1,
+              const vector<vector<CharReach>> &triggers2) {
+    // literal suffix test
+    for (const auto &lit1 : triggers1) {
+        for (const auto &lit2 : triggers2) {
+            const size_t len = min(lit1.size(), lit2.size());
+            if (equal(lit1.rbegin(), lit1.rbegin() + len,
+                      lit2.rbegin(), overlaps)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+/* prepare initial infix or suffix graph used for exclusive analysis */
+template<typename role_id>
+static
+u32 prepareRoleGraph(NGHolder &h, const role_id &s1) {
+    u32 num = 0;
+    if (s1.castle()) {
+        num = num_vertices(h);
+        NFAVertex u = add_vertex(h);
+        h[u].char_reach = s1.castle()->reach();
+        add_edge(h.startDs, u, h);
+        // add self loop to repeat characters
+        add_edge(u, u, h);
+    } else if (s1.graph()) {
+        const NGHolder &g = *s1.graph();
+        cloneHolder(h, g);
+        num = num_vertices(h);
+    } else {
+        // only infixes and suffixes with graph properties are possible
+        // candidates, already filtered out other cases before
+        // exclusive analysis
+        assert(0);
+    }
+
+    return num;
+}
+
+/* get a subset of the literal if a reset character is found */
+static
+vector<CharReach> findStartPos(const CharReach &cr1,
+                               const vector<CharReach> &lit) {
+    auto it = lit.rbegin(), ite = lit.rend();
+    u32 pos = lit.size();
+    for (; it != ite; it++) {
+        if (!overlaps(cr1, *it)) {
+            break;
+        }
+        pos--;
+    }
+
+    return vector<CharReach>(lit.begin() + pos, lit.end());
+}
+
+template<typename role_id>
+static
+bool isExclusive(const NGHolder &h,
+                 const u32 num, ue2::unordered_set<u32> &tailId,
+                 map<u32, ue2::unordered_set<u32>> &skipList,
+                 const RoleInfo<role_id> &role1,
+                 const RoleInfo<role_id> &role2) {
+    const u32 id1 = role1.id;
+    const u32 id2 = role2.id;
+
+    if (contains(skipList, id1) && contains(skipList[id1], id2)) {
+        return false;
+    }
+
+    const auto &triggers1 = role1.literals;
+    const auto &triggers2 = role2.literals;
+    if (isSuffix(triggers1, triggers2)) {
+        skipList[id2].insert(id1);
+        return false;
+    }
+
+    DEBUG_PRINTF("role id2:%u\n", id2);
+    const auto &cr1 = role1.cr;
+    if (overlaps(cr1, role2.last_cr)) {
+        CharReach cr = cr1 | role1.prefix_cr;
+        for (const auto &lit : triggers2) {
+            auto lit1 = findStartPos(cr, lit);
+            if (lit1.empty()) {
+                continue;
+            }
+            u32 lower_bound = 0;
+            if (lit1.size() < lit.size()) {
+                lower_bound = ~0U;
+            }
+
+            ue2::flat_set<NFAVertex> states;
+            for (const auto &v : vertices_range(h)) {
+                if (h[v].index >= lower_bound || h[v].index < 2) {
+                    states.insert(v);
+                }
+            }
+
+            auto activeStates = execute_graph(h, lit1, states);
+            // Check if only literal states are on
+            for (const auto &s : activeStates) {
+                u32 stateId = h[s].index;
+                if ((stateId > 1 && stateId <= num) ||
+                    contains(tailId, stateId)) {
+                    skipList[id2].insert(id1);
+                    return false;
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+template<typename role_id>
+static
+ue2::unordered_set<u32> checkExclusivity(const NGHolder &h,
+                 const u32 num, ue2::unordered_set<u32> &tailId,
+                 map<u32, ue2::unordered_set<u32>> &skipList,
+                 const RoleInfo<role_id> &role1,
+                 const RoleChunk<role_id> &roleChunk) {
+    ue2::unordered_set<u32> info;
+    const u32 id1 = role1.id;
+    for (const auto &role2 : roleChunk.roles) {
+        const u32 id2 = role2.id;
+        if (id1 != id2 && isExclusive(h, num, tailId, skipList,
+                                      role1, role2)) {
+            info.insert(id2);
+        }
+    }
+
+    return info;
+}
+
+static
+void findCliques(const map<u32, set<u32>> &exclusiveGroups,
+                 vector<vector<u32>> &exclusive_roles) {
+    if (exclusiveGroups.empty()) {
+        return;
+    }
+    // Construct the exclusivity graph
+    map<u32, CliqueVertex> vertex_map;
+    unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
+
+    // Add vertices representing infixes/suffixes
+    for (const auto &e : exclusiveGroups) {
+        const u32 id = e.first;
+        CliqueVertex v1 = add_vertex(CliqueVertexProps(id), *cg);
+        vertex_map[id] = v1;
+    }
+
+    // Wire exclusive pairs
+    for (const auto &e1 : exclusiveGroups) {
+        const u32 literalId1 = e1.first;
+        CliqueVertex lv = vertex_map[literalId1];
+        const set<u32> &exclusiveSet = e1.second;
+        for (const auto &e2 : exclusiveGroups) {
+            const u32 literalId2 = e2.first;
+            if (literalId1 < literalId2 &&
+                contains(exclusiveSet, literalId2)) {
+                add_edge(lv, vertex_map[literalId2], *cg);
+                DEBUG_PRINTF("Wire %u:%u\n", literalId1, literalId2);
+            }
+        }
+    }
+
+    // Find clique groups
+    const auto &clique = removeClique(*cg);
+    for (const auto &i : clique) {
+        DEBUG_PRINTF("cliq:%lu\n", i.size());
+        if (i.size() > 1) {
+            exclusive_roles.push_back(i);
+        }
+    }
+    DEBUG_PRINTF("Clique graph size:%lu\n", exclusive_roles.size());
+}
+
+static
+map<u32, set<u32>> findExclusiveGroups(const RoseBuildImpl &build,
+            const map<u32, ue2::unordered_set<u32>> &exclusiveInfo,
+            const map<u32, vector<RoseVertex>> &vertex_map,
+            const bool is_infix) {
+    map<u32, set<u32>> exclusiveGroups;
+    for (const auto &e : exclusiveInfo) {
+        u32 i = e.first;
+        const auto &s = e.second;
+        set<u32> group;
+        set<RoseVertex> q1(vertex_map.at(i).begin(),
+                           vertex_map.at(i).end());
+        DEBUG_PRINTF("vertex set:%lu\n", q1.size());
+        for (const auto &val : s) {
+            set<RoseVertex> q2(vertex_map.at(val).begin(),
+                               vertex_map.at(val).end());
+            if (contains(exclusiveInfo.at(val), i) &&
+                (!is_infix || mergeableRoseVertices(build, q1, q2))) {
+                group.insert(val);
+            }
+        }
+        if (!group.empty()) {
+            exclusiveGroups[i] = group;
+        }
+    }
+
+    return exclusiveGroups;
+}
+
+template<typename role_id>
+static
+bool setTriggerLiterals(RoleInfo<role_id> &roleInfo,
+        const map<u32, vector<vector<CharReach>>> &triggers) {
+    u32 minLiteralLen = ~0U;
+    for (const auto &tr : triggers) {
+        for (const auto &lit : tr.second) {
+            if (lit.empty()) {
+                return false;
+            }
+            minLiteralLen = min(minLiteralLen, (u32)lit.size());
+            roleInfo.last_cr |= lit.back();
+            for (const auto &c : lit) {
+                roleInfo.prefix_cr |= c;
+            }
+            roleInfo.literals.push_back(lit);
+        }
+    }
+
+    if (roleInfo.role.graph()) {
+        const NGHolder &g = *roleInfo.role.graph();
+        roleInfo.cr = getReachability(g);
+    } else if (roleInfo.role.castle()) {
+        roleInfo.cr = roleInfo.role.castle()->reach();
+    }
+
+    // test the score of this engine
+    roleInfo.score = 256 - roleInfo.cr.count() + minLiteralLen;
+    if (roleInfo.score < 20) {
+        return false;
+    }
+
+    return true;
+}
+
+bool setTriggerLiteralsInfix(RoleInfo<left_id> &roleInfo,
+        const map<u32, vector<vector<CharReach>>> &triggers) {
+    return setTriggerLiterals(roleInfo, triggers);
+}
+
+bool setTriggerLiteralsSuffix(RoleInfo<suffix_id> &roleInfo,
+        const map<u32, vector<vector<CharReach>>> &triggers) {
+    return setTriggerLiterals(roleInfo, triggers);
+}
+
+template<typename role_id>
+static
+void exclusiveAnalysis(const RoseBuildImpl &build,
+               const map<u32, vector<RoseVertex>> &vertex_map,
+               set<RoleInfo<role_id>> &roleInfoSet,
+               vector<vector<u32>> &exclusive_roles, const bool is_infix) {
+    const auto &chunks = divideIntoChunks(build, roleInfoSet);
+    DEBUG_PRINTF("Exclusivity analysis entry\n");
+    map<u32, ue2::unordered_set<u32>> exclusiveInfo;
+
+    for (const auto &roleChunk : chunks) {
+        map<u32, ue2::unordered_set<u32>> skipList;
+        for (const auto &role1 : roleChunk.roles) {
+            const u32 id1 = role1.id;
+            const role_id &s1 = role1.role;
+            const auto &triggers1 = role1.literals;
+
+            NGHolder h;
+            u32 num = prepareRoleGraph(h, s1);
+            DEBUG_PRINTF("role id1:%u\n", id1);
+            unordered_set<u32> tailId;
+            if (!addPrefixLiterals(h, tailId, triggers1)) {
+                continue;
+            }
+
+            exclusiveInfo[id1] = checkExclusivity(h, num, tailId,
+                                                  skipList, role1, roleChunk);
+        }
+    }
+
+    // Create final candidate exclusive groups
+    const auto exclusiveGroups =
+        findExclusiveGroups(build, exclusiveInfo, vertex_map, is_infix);
+    exclusiveInfo.clear();
+
+    // Find cliques for each exclusive group
+    findCliques(exclusiveGroups, exclusive_roles);
+}
+
+void exclusiveAnalysisInfix(const RoseBuildImpl &build,
+               const map<u32, vector<RoseVertex>> &vertex_map,
+               set<RoleInfo<left_id>> &roleInfoSet,
+               vector<vector<u32>> &exclusive_roles) {
+    exclusiveAnalysis(build, vertex_map, roleInfoSet, exclusive_roles,
+                      true);
+}
+
+void exclusiveAnalysisSuffix(const RoseBuildImpl &build,
+               const map<u32, vector<RoseVertex>> &vertex_map,
+               set<RoleInfo<suffix_id>> &roleInfoSet,
+               vector<vector<u32>> &exclusive_roles) {
+    exclusiveAnalysis(build, vertex_map, roleInfoSet, exclusive_roles,
+                      false);
+}
+
+} // namespace ue2
diff --git a/src/rose/rose_build_exclusive.h b/src/rose/rose_build_exclusive.h
new file mode 100644
index 00000000..a6772f7f
--- /dev/null
+++ b/src/rose/rose_build_exclusive.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief exclusive analysis for infix and suffix engines.
+ * Two engines are considered exclusive if they can never be alive
+ * at the same time. This analysis takes advantage of the structure of
+ * the triggering literals plus the engine graph: if the triggering
+ * literals of two engines can make all the states dead in each
+ * other's graph, then the engines are exclusive.
+ */
+#ifndef ROSE_BUILD_EXCLUSIVE_H
+#define ROSE_BUILD_EXCLUSIVE_H
+
+#include "ue2common.h"
+
+#include "rose_build_impl.h"
+#include "util/alloc.h"
+#include "util/charreach.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+namespace ue2 {
+
+/** \brief subengine info including built engine and
+ * corresponding triggering rose vertices */
+struct ExclusiveSubengine {
+    aligned_unique_ptr<NFA> nfa;
+    std::vector<RoseVertex> vertices;
+};
+
+/** \brief exclusive info to build tamarama */
+struct ExclusiveInfo {
+    // subengine info
+    std::vector<ExclusiveSubengine> subengines;
+    // all the reports in tamarama
+    std::set<ReportID> reports;
+    // assigned queue id
+    u32 queue;
+};
+
+/** \brief role info structure for exclusive analysis */
+template<typename role_id>
+struct RoleInfo {
+    RoleInfo(role_id role_in, u32 id_in) : role(role_in), id(id_in) {}
+    bool operator==(const RoleInfo &b) const {
+        return id == b.id;
+    }
+    bool operator!=(const RoleInfo &b) const { return !(*this == b); }
+    bool operator<(const RoleInfo &b) const {
+        const RoleInfo &a = *this;
+        if (a.score != b.score) {
+            return a.score > b.score;
+        }
+        ORDER_CHECK(id);
+        return false;
+    }
+
+    std::vector<std::vector<CharReach>> literals; // prefix literals
+    CharReach prefix_cr; // reach of prefix literals
+    CharReach last_cr; // reach of the last character of literals
+    CharReach cr; // reach of engine graph
+    const role_id role; // infix or suffix info
+    const u32 id; // infix or suffix id
+    u32 score; // score for exclusive analysis
+};
+
+/**
+ * \brief add triggering literals to infix info.
+ */
+bool setTriggerLiteralsInfix(RoleInfo<left_id> &roleInfo,
+        const std::map<u32, std::vector<std::vector<CharReach>>> &triggers);
+
+/**
+ * \brief add triggering literals to suffix info.
+ */
+bool setTriggerLiteralsSuffix(RoleInfo<suffix_id> &roleInfo,
+        const std::map<u32, std::vector<std::vector<CharReach>>> &triggers);
+
+/**
+ * Exclusive analysis for infix engines.
+ *
+ * @param build rose build info, mainly used to set the exclusive chunk size
+ * @param vertex_map mapping between engine id and the rose vertices
+ *                   related to this engine
+ * @param roleInfoSet structure containing role properties, including infix
+ *                    info, triggering literals and literal reachabilities;
+ *                    used for exclusive analysis.
+ * @param exclusive_roles output mapping between engine id and its exclusive
+ *                        group id
+ */
+void exclusiveAnalysisInfix(const RoseBuildImpl &build,
+        const std::map<u32, std::vector<RoseVertex>> &vertex_map,
+        std::set<RoleInfo<left_id>> &roleInfoSet,
+        std::vector<std::vector<u32>> &exclusive_roles);
+
+/**
+ * Exclusive analysis for suffix engines.
+ *
+ * @param build rose build info, mainly used to set the exclusive chunk size
+ * @param vertex_map mapping between engine id and the rose vertices
+ *                   related to this engine
+ * @param roleInfoSet structure containing role properties, including suffix
+ *                    info, triggering literals and literal reachabilities;
+ *                    used for exclusive analysis.
+ * @param exclusive_roles output mapping between engine id and its exclusive
+ *                        group id
+ */
+void exclusiveAnalysisSuffix(const RoseBuildImpl &build,
+        const std::map<u32, std::vector<RoseVertex>> &vertex_map,
+        std::set<RoleInfo<suffix_id>> &roleInfoSet,
+        std::vector<std::vector<u32>> &exclusive_roles);
+
+} // namespace ue2
+
+#endif // ROSE_BUILD_EXCLUSIVE_H
+
diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h
index 71940e07..ca1b64e2 100644
--- a/src/rose/rose_build_impl.h
+++ b/src/rose/rose_build_impl.h
@@ -65,12 +65,13 @@ class SomSlotManager;
 struct suffix_id {
     suffix_id(const RoseSuffixInfo &in)
         : g(in.graph.get()), c(in.castle.get()), d(in.rdfa.get()),
-          h(in.haig.get()), dfa_min_width(in.dfa_min_width),
+          h(in.haig.get()), t(in.tamarama.get()),
+          dfa_min_width(in.dfa_min_width),
           dfa_max_width(in.dfa_max_width) {
         assert(!g || g->kind == NFA_SUFFIX);
     }
     bool operator==(const suffix_id &b) const {
-        bool rv = g == b.g && c == b.c && h == b.h && d == b.d;
+        bool rv = g == b.g && c == b.c && h == b.h && d == b.d && t == b.t;
         assert(!rv || dfa_min_width == b.dfa_min_width);
         assert(!rv || dfa_max_width == b.dfa_max_width);
         return rv;
@@ -82,6 +83,7 @@ struct suffix_id {
         ORDER_CHECK(c);
         ORDER_CHECK(d);
         ORDER_CHECK(h);
+        ORDER_CHECK(t);
         return false;
     }
 
@@ -113,6 +115,22 @@ struct suffix_id {
         }
         return c;
     }
+    TamaProto *tamarama() {
+        if (!d && !h) {
+            assert(dfa_min_width == depth(0));
+            assert(dfa_max_width == depth::infinity());
+        }
+        return t;
+    }
+    const TamaProto *tamarama() const {
+        if (!d && !h) {
+            assert(dfa_min_width == depth(0));
+            assert(dfa_max_width == depth::infinity());
+        }
+        return t;
+    }
+
     raw_som_dfa *haig() { return h; }
     const raw_som_dfa *haig() const { return h; }
     raw_dfa *dfa() { return d; }
@@ -125,6 +143,7 @@ private:
     CastleProto *c;
     raw_dfa *d;
    raw_som_dfa *h;
+    TamaProto *t;
     depth dfa_min_width;
     depth dfa_max_width;
 
diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp
index b16e3a69..f430f731 100644
--- a/src/rose/rose_build_misc.cpp
+++ b/src/rose/rose_build_misc.cpp
@@ -34,6 +34,7 @@
 #include "nfa/mcclellancompile_util.h"
 #include "nfa/nfa_api.h"
 #include "nfa/rdfa.h"
+#include "nfa/tamaramacompile.h"
 #include "nfagraph/ng_holder.h"
 #include "nfagraph/ng_limex.h"
 #include "nfagraph/ng_reports.h"
@@ -909,7 +910,7 @@ set<ReportID> all_reports(const OutfixInfo &outfix) {
 
 bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const {
     return top == b.top && graph == b.graph && castle == b.castle &&
-           rdfa == b.rdfa && haig == b.haig;
+           rdfa == b.rdfa && haig == b.haig && tamarama == b.tamarama;
 }
 
 bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const {
@@ -919,6 +920,7 @@ bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const {
     ORDER_CHECK(castle);
     ORDER_CHECK(haig);
     ORDER_CHECK(rdfa);
+    ORDER_CHECK(tamarama);
     assert(a.dfa_min_width == b.dfa_min_width);
     assert(a.dfa_max_width == b.dfa_max_width);
     return false;
@@ -931,13 +933,16 @@ void
 RoseSuffixInfo::reset(void) {
     castle.reset();
     rdfa.reset();
     haig.reset();
+    tamarama.reset();
     dfa_min_width = 0;
     dfa_max_width = depth::infinity();
 }
 
 std::set<ReportID> all_reports(const suffix_id &s) {
     assert(s.graph() || s.castle() || s.haig() || s.dfa());
-    if (s.graph()) {
+    if (s.tamarama()) {
+        return all_reports(*s.tamarama());
+    } else if (s.graph()) {
         return all_reports(*s.graph());
     } else if (s.castle()) {
         return all_reports(*s.castle());
@@ -1149,6 +1154,7 @@ void LeftEngInfo::reset(void) {
     castle.reset();
     dfa.reset();
     haig.reset();
+    tamarama.reset();
     lag = 0;
     leftfix_report = MO_INVALID_IDX;
     dfa_min_width = 0;
diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp
index 1d63c71a..19d8414d 100644
--- a/src/rose/rose_dump.cpp
+++ b/src/rose/rose_dump.cpp
@@ -718,7 +718,7 @@ void dumpNfas(const RoseEngine *t, bool dump_raw, const string &base) {
         FILE *f;
 
         f = fopen(ssdot.str().c_str(), "w");
-        nfaDumpDot(n, f);
+        nfaDumpDot(n, f, base);
         fclose(f);
 
         f = fopen(sstxt.str().c_str(), "w");
@@ -778,7 +778,7 @@ void dumpRevNfas(const RoseEngine *t, bool dump_raw, const string &base) {
         FILE *f;
 
         f = fopen(ssdot.str().c_str(), "w");
-        nfaDumpDot(n, f);
+        nfaDumpDot(n, f, base);
         fclose(f);
 
         f = fopen(sstxt.str().c_str(), "w");
@@ -809,7 +809,7 @@ void dumpAnchored(const RoseEngine *t, const string &base) {
         FILE *f;
 
         f = fopen(ssdot.str().c_str(), "w");
-        nfaDumpDot(n, f);
+        nfaDumpDot(n, f, base);
         fclose(f);
 
         f = fopen(sstxt.str().c_str(), "w");
diff --git a/src/rose/rose_graph.h b/src/rose/rose_graph.h
index b0ac8d11..6abe629b 100644
--- a/src/rose/rose_graph.h
+++ b/src/rose/rose_graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -55,6 +55,7 @@ namespace ue2 {
 struct CastleProto;
 struct raw_dfa;
 struct raw_som_dfa;
+struct TamaProto;
 
 /** \brief Table type for a literal. */
 enum rose_literal_table {
@@ -82,6 +83,7 @@ struct LeftEngInfo {
     std::shared_ptr<CastleProto> castle;
     std::shared_ptr<raw_dfa> dfa;
     std::shared_ptr<raw_som_dfa> haig;
+    std::shared_ptr<TamaProto> tamarama;
     u32 lag = 0U;
     ReportID leftfix_report = MO_INVALID_IDX;
     depth dfa_min_width = 0;
@@ -92,6 +94,7 @@ struct LeftEngInfo {
             && other.castle == castle
             && other.dfa == dfa
             && other.haig == haig
+            && other.tamarama == tamarama
             && other.lag == lag
             && other.leftfix_report == leftfix_report;
     }
@@ -104,6 +107,7 @@ struct LeftEngInfo {
         ORDER_CHECK(castle);
         ORDER_CHECK(dfa);
         ORDER_CHECK(haig);
+        ORDER_CHECK(tamarama);
         ORDER_CHECK(lag);
         ORDER_CHECK(leftfix_report);
         return false;
@@ -121,6 +125,7 @@ struct RoseSuffixInfo {
     std::shared_ptr<CastleProto> castle;
     std::shared_ptr<raw_som_dfa> haig;
     std::shared_ptr<raw_dfa> rdfa;
+    std::shared_ptr<TamaProto> tamarama;
     depth dfa_min_width = 0;
     depth dfa_max_width = depth::infinity();
 
@@ -128,7 +133,7 @@ struct RoseSuffixInfo {
     bool operator!=(const RoseSuffixInfo &b) const { return !(*this == b); }
     bool operator<(const RoseSuffixInfo &b) const;
     void reset(void);
-    operator bool() const { return graph || castle || haig || rdfa; }
+    operator bool() const { return graph || castle || haig || rdfa || tamarama; }
 };
 
 /** \brief Properties attached to each Rose graph vertex.
*/ diff --git a/src/smallwrite/smallwrite_dump.cpp b/src/smallwrite/smallwrite_dump.cpp index 8987e8b3..0db97df5 100644 --- a/src/smallwrite/smallwrite_dump.cpp +++ b/src/smallwrite/smallwrite_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -73,7 +73,7 @@ void smwrDumpNFA(const SmallWriteEngine *smwr, bool dump_raw, FILE *f; f = fopen((base + "smallwrite_nfa.dot").c_str(), "w"); - nfaDumpDot(n, f); + nfaDumpDot(n, f, base); fclose(f); f = fopen((base + "smallwrite_nfa.txt").c_str(), "w"); diff --git a/src/util/clique.cpp b/src/util/clique.cpp new file mode 100644 index 00000000..ea22779c --- /dev/null +++ b/src/util/clique.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief An algorithm to find cliques. 
+ */
+
+#include "clique.h"
+#include "container.h"
+#include "graph_range.h"
+#include "make_unique.h"
+#include "ue2_containers.h"
+
+#include <map>
+#include <set>
+#include <stack>
+
+using namespace std;
+
+namespace ue2 {
+
+static
+vector<u32> getNeighborInfo(const CliqueGraph &g,
+                const CliqueVertex &cv, const set<u32> &group) {
+    u32 id = g[cv].stateId;
+    vector<u32> neighbor;
+    // find neighbors for cv
+    for (const auto &v : adjacent_vertices_range(cv, g)) {
+        if (g[v].stateId != id && contains(group, g[v].stateId)) {
+            neighbor.push_back(g[v].stateId);
+            DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId);
+        }
+    }
+
+    return neighbor;
+}
+
+static
+vector<u32> findCliqueGroup(CliqueGraph &cg) {
+    stack<vector<u32>> gStack;
+
+    // Create mapping between vertex and id
+    map<u32, CliqueVertex> vertexMap;
+    vector<u32> init;
+    for (const auto &v : vertices_range(cg)) {
+        vertexMap[cg[v].stateId] = v;
+        init.push_back(cg[v].stateId);
+    }
+    gStack.push(init);
+
+    // Get the vertex to start from
+    vector<u32> clique;
+    while (!gStack.empty()) {
+        vector<u32> g = move(gStack.top());
+        gStack.pop();
+
+        // Choose a vertex from the graph
+        u32 id = g[0];
+        CliqueVertex &n = vertexMap.at(id);
+        clique.push_back(id);
+        // Corresponding vertex in the original graph
+        set<u32> subgraphId(g.begin(), g.end());
+        auto neighbor = getNeighborInfo(cg, n, subgraphId);
+        // Get graph consisting of neighbors for left branch
+        if (!neighbor.empty()) {
+            gStack.push(neighbor);
+        }
+    }
+
+    return clique;
+}
+
+template<typename Graph>
+bool graph_empty(const Graph &g) {
+    typename Graph::vertex_iterator vi, ve;
+    tie(vi, ve) = vertices(g);
+    return vi == ve;
+}
+
+vector<vector<u32>> removeClique(CliqueGraph &cg) {
+    DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg));
+    vector<vector<u32>> cliquesVec = {findCliqueGroup(cg)};
+    while (!graph_empty(cg)) {
+        const vector<u32> &c = cliquesVec.back();
+        vector<CliqueVertex> dead;
+        for (const auto &v : vertices_range(cg)) {
+            u32 id = cg[v].stateId;
+            if (find(c.begin(), c.end(), id) != c.end()) {
+                dead.push_back(v);
+            }
+        }
+        for (const auto &v : dead) {
+            clear_vertex(v, cg);
+            remove_vertex(v, cg);
+        }
+        if (graph_empty(cg)) {
+            break;
+        }
+        auto clique = findCliqueGroup(cg);
+        cliquesVec.push_back(clique);
+    }
+
+    return cliquesVec;
+}
+
+} // namespace ue2
diff --git a/src/util/clique.h b/src/util/clique.h
new file mode 100644
index 00000000..89c6d4ed
--- /dev/null
+++ b/src/util/clique.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief An algorithm to find cliques.
+ */
+
+#ifndef CLIQUE_H
+#define CLIQUE_H
+
+#include "ue2common.h"
+
+#include <vector>
+
+#include <boost/graph/adjacency_list.hpp>
+
+namespace ue2 {
+
+struct CliqueVertexProps {
+    CliqueVertexProps() {}
+    explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {}
+
+    u32 stateId = ~0U;
+};
+
+typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS,
+                              CliqueVertexProps> CliqueGraph;
+typedef CliqueGraph::vertex_descriptor CliqueVertex;
+
+/** \brief Returns a vector of cliques found in a graph. */
+std::vector<std::vector<u32>> removeClique(CliqueGraph &cg);
+
+} // namespace ue2
+
+#endif
From cf9e40ae1c67820979c4d47f5b6ec65f47c6954b Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Fri, 24 Jun 2016 16:22:43 +1000
Subject: [PATCH 076/166] nfa: unify NfaCallback and SomNfaCallback

Use just one callback type, with both start and end offsets.
---
 src/nfa/callback.h               | 12 ++---
 src/nfa/castle.c                 |  4 +-
 src/nfa/gough.c                  | 46 ++++++++--------
 src/nfa/gough.h                  |  6 +--
 src/nfa/lbr.c                    |  4 +-
 src/nfa/lbr_common_impl.h        |  4 +-
 src/nfa/limex.h                  |  3 +-
 src/nfa/limex_common_impl.h      |  4 +-
 src/nfa/limex_runtime.h          |  2 +-
 src/nfa/limex_runtime_impl.h     |  6 +--
 src/nfa/mcclellan.c              | 36 ++++++-------
 src/nfa/mcclellan.h              |  6 +--
 src/nfa/mpv.c                    |  5 +-
 src/nfa/nfa_api.h                |  7 +--
 src/nfa/nfa_api_dispatch.c       |  5 +-
 src/nfa/nfa_api_queue.h          |  3 +-
 src/nfa/tamarama.c               |  8 ++-
 src/nfa/tamarama.h               |  3 +-
 src/rose/catchup.c               | 92 +++++---------------------
 src/rose/match.c                 | 17 ++----
 src/rose/match.h                 |  6 +--
 src/rose/program_runtime.h       |  8 +--
 src/rose/rose.h                  |  3 +-
 src/rose/rose_build_bytecode.cpp | 26 +--------
 src/rose/rose_internal.h         |  2 -
 src/runtime.c                    |  7 ++-
 src/som/som_runtime.c            |  2 +-
 unit/internal/lbr.cpp            |  3 +-
 unit/internal/limex_nfa.cpp      |  7 +--
 29 files changed, 103 insertions(+), 234 deletions(-)

diff --git a/src/nfa/callback.h b/src/nfa/callback.h
index dfcd1b9f..0284f1d5 100644
--- a/src/nfa/callback.h
+++ b/src/nfa/callback.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -53,14 +53,8 @@
 * are 0, which means 'stop running the engine' or non-zero, which means
 * 'continue matching'.
 */
-typedef int (*NfaCallback)(u64a offset, ReportID id, void *context);
-
-/** \brief The type for an NFA callback which also tracks start of match.
- * - * see \ref NfaCallback - */ -typedef int (*SomNfaCallback)(u64a from_offset, u64a to_offset, ReportID id, - void *context); +typedef int (*NfaCallback)(u64a from_offset, u64a to_offset, ReportID id, + void *context); /** * standard \ref NfaCallback return value indicating that engine execution diff --git a/src/nfa/castle.c b/src/nfa/castle.c index bfdcf6b5..6a72ae31 100644 --- a/src/nfa/castle.c +++ b/src/nfa/castle.c @@ -98,7 +98,7 @@ char subCastleReportCurrent(const struct Castle *c, struct mq *q, if (match == REPEAT_MATCH) { DEBUG_PRINTF("firing match at %llu for sub %u, report %u\n", offset, subIdx, sub->report); - if (q->cb(offset, sub->report, q->context) == MO_HALT_MATCHING) { + if (q->cb(0, offset, sub->report, q->context) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } @@ -457,7 +457,7 @@ char subCastleFireMatch(const struct Castle *c, const void *full_state, i = mmbit_iterate(matching, c->numRepeats, i)) { const struct SubCastle *sub = getSubCastle(c, i); DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, i); - if (cb(offset, sub->report, ctx) == MO_HALT_MATCHING) { + if (cb(0, offset, sub->report, ctx) == MO_HALT_MATCHING) { DEBUG_PRINTF("caller told us to halt\n"); return MO_HALT_MATCHING; } diff --git a/src/nfa/gough.c b/src/nfa/gough.c index 3b7a115d..520aca93 100644 --- a/src/nfa/gough.c +++ b/src/nfa/gough.c @@ -110,7 +110,7 @@ u64a expandSomValue(u32 comp_slot_width, u64a curr_offset, } static really_inline -char doReports(SomNfaCallback cb, void *ctxt, const struct mcclellan *m, +char doReports(NfaCallback cb, void *ctxt, const struct mcclellan *m, const struct gough_som_info *som, u16 s, u64a loc, char eod, u16 * const cached_accept_state, u32 * const cached_accept_id, u32 * const cached_accept_som) { @@ -307,7 +307,7 @@ u16 goughEnableStarts(const struct mcclellan *m, u16 s, u64a som_offset, static really_inline char goughExec16_i(const struct mcclellan *m, struct gough_som_info *som, u16 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **c_final, + NfaCallback cb, void *ctxt, const u8 **c_final, enum MatchMode mode) { assert(ISALIGNED_N(state, 2)); @@ -461,7 +461,7 @@ with_accel: static really_inline char goughExec8_i(const struct mcclellan *m, struct gough_som_info *som, u8 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **c_final, + NfaCallback cb, void *ctxt, const u8 **c_final, enum MatchMode mode) { u8 s = *state; const u8 *c = buf, *c_end = buf + len; @@ -595,7 +595,7 @@ with_accel: static never_inline char goughExec8_i_ni(const struct mcclellan *m, struct gough_som_info *som, u8 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **final_point, + NfaCallback cb, void *ctxt, const u8 **final_point, enum MatchMode mode) { return goughExec8_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, mode); @@ -604,7 +604,7 @@ char goughExec8_i_ni(const struct mcclellan *m, struct gough_som_info *som, static never_inline char goughExec16_i_ni(const struct mcclellan *m, struct gough_som_info *som, u16 *state, const u8 *buf, size_t len, u64a offAdj, - SomNfaCallback cb, void *ctxt, const u8 **final_point, + NfaCallback cb, void *ctxt, const u8 **final_point, enum MatchMode mode) { return goughExec16_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, mode); @@ -622,7 +622,7 @@ const struct gough_som_info *getSomInfoConst(const char *state_base) { static really_inline char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, 
const u8 *buffer, - const u8 *hend, SomNfaCallback cb, void *context, + const u8 *hend, NfaCallback cb, void *context, struct mq *q, s64a end, enum MatchMode mode) { DEBUG_PRINTF("enter\n"); struct gough_som_info *som = getSomInfo(q->state); @@ -755,7 +755,7 @@ char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, static really_inline char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, - const u8 *hend, SomNfaCallback cb, void *context, + const u8 *hend, NfaCallback cb, void *context, struct mq *q, s64a end, enum MatchMode mode) { struct gough_som_info *som = getSomInfo(q->state); assert(n->type == GOUGH_NFA_16); @@ -887,7 +887,7 @@ char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_8); const u8 *hend = q->history + q->hlength; @@ -899,7 +899,7 @@ char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_16); const u8 *hend = q->history + q->hlength; @@ -911,7 +911,7 @@ char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_8); const u8 *hend = q->history + q->hlength; @@ -923,7 +923,7 @@ char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_16); const u8 *hend = q->history + q->hlength; @@ -935,7 +935,7 @@ char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end) { char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_8); const u8 *hend = q->history + q->hlength; @@ -952,7 +952,7 @@ char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report) { char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report) { u64a offset = q->offset; const u8 *buffer = q->buffer; - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *context = q->context; assert(n->type == GOUGH_NFA_16); const u8 *hend = q->history + q->hlength; @@ -994,7 +994,7 @@ char nfaExecGough16_initCompressedState(const struct NFA *nfa, u64a offset, char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) { const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); - SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *ctxt = q->context; u8 s = *(u8 *)q->state; u64a offset = q_cur_offset(q); @@ -1016,7 +1016,7 @@ char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) { char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q) { const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); - 
SomNfaCallback cb = q->som_cb; + NfaCallback cb = q->cb; void *ctxt = q->context; u16 s = *(u16 *)q->state; const struct mstate_aux *aux = get_aux(m, s); @@ -1059,7 +1059,7 @@ char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q) { static char goughCheckEOD(const struct NFA *nfa, u16 s, const struct gough_som_info *som, - u64a offset, SomNfaCallback cb, void *ctxt) { + u64a offset, NfaCallback cb, void *ctxt) { const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); const struct mstate_aux *aux = get_aux(m, s); @@ -1070,21 +1070,19 @@ char goughCheckEOD(const struct NFA *nfa, u16 s, } char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, u64a offset, - UNUSED NfaCallback callback, - SomNfaCallback som_callback, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { const struct gough_som_info *som = getSomInfoConst(state); - return goughCheckEOD(nfa, *(const u8 *)state, som, offset, som_callback, + return goughCheckEOD(nfa, *(const u8 *)state, som, offset, callback, context); } char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, u64a offset, - UNUSED NfaCallback callback, - SomNfaCallback som_callback, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { assert(ISALIGNED_N(state, 8)); const struct gough_som_info *som = getSomInfoConst(state); - return goughCheckEOD(nfa, *(const u16 *)state, som, offset, som_callback, + return goughCheckEOD(nfa, *(const u16 *)state, som, offset, callback, context); } diff --git a/src/nfa/gough.h b/src/nfa/gough.h index 1a7dbd74..a7f48892 100644 --- a/src/nfa/gough.h +++ b/src/nfa/gough.h @@ -39,8 +39,7 @@ struct mq; char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report); @@ -62,8 +61,7 @@ char nfaExecGough8_expandState(const struct NFA *nfa, void *dest, char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report); diff --git a/src/nfa/lbr.c b/src/nfa/lbr.c index 0d69cc2a..07e59239 100644 --- a/src/nfa/lbr.c +++ b/src/nfa/lbr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -293,7 +293,7 @@ char lbrMatchLoop(const struct lbr_common *l, const u64a begin, const u64a end, } DEBUG_PRINTF("firing match at %llu\n", i); - if (cb(i, l->report, ctx) == MO_HALT_MATCHING) { + if (cb(0, i, l->report, ctx) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } diff --git a/src/nfa/lbr_common_impl.h b/src/nfa/lbr_common_impl.h index 4fb8f62a..5ae35431 100644 --- a/src/nfa/lbr_common_impl.h +++ b/src/nfa/lbr_common_impl.h @@ 
-72,7 +72,7 @@ char JOIN(ENGINE_EXEC_NAME, _reportCurrent)(const struct NFA *nfa, const struct lbr_common *l = getImplNfa(nfa); u64a offset = q_cur_offset(q); DEBUG_PRINTF("firing match %u at %llu\n", l->report, offset); - q->cb(offset, l->report, q->context); + q->cb(0, offset, l->report, q->context); return 0; } @@ -215,7 +215,7 @@ char JOIN(ENGINE_EXEC_NAME, _Q_i)(const struct NFA *nfa, struct mq *q, if (q->report_current) { DEBUG_PRINTF("report_current: fire match at %llu\n", q_cur_offset(q)); - int rv = q->cb(q_cur_offset(q), l->report, q->context); + int rv = q->cb(0, q_cur_offset(q), l->report, q->context); q->report_current = 0; if (rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; diff --git a/src/nfa/limex.h b/src/nfa/limex.h index 9266b5de..ad53503c 100644 --- a/src/nfa/limex.h +++ b/src/nfa/limex.h @@ -54,8 +54,7 @@ extern "C" #define GENERATE_NFA_DECL(gf_name) \ char gf_name##_testEOD(const struct NFA *nfa, const char *state, \ const char *streamState, u64a offset, \ - NfaCallback callback, SomNfaCallback som_cb, \ - void *context); \ + NfaCallback callback, void *context); \ char gf_name##_Q(const struct NFA *n, struct mq *q, s64a end); \ char gf_name##_Q2(const struct NFA *n, struct mq *q, s64a end); \ char gf_name##_QR(const struct NFA *n, struct mq *q, ReportID report); \ diff --git a/src/nfa/limex_common_impl.h b/src/nfa/limex_common_impl.h index 68e0c0ad..9523b073 100644 --- a/src/nfa/limex_common_impl.h +++ b/src/nfa/limex_common_impl.h @@ -119,7 +119,7 @@ char PROCESS_ACCEPTS_FN(const IMPL_NFA_T *limex, STATE_T *s, if (TESTBIT_STATE(s, a->state)) { DEBUG_PRINTF("state %u is on, firing report id=%u, offset=%llu\n", a->state, a->externalId, offset); - int rv = callback(offset, a->externalId, context); + int rv = callback(0, offset, a->externalId, context); if (unlikely(rv == MO_HALT_MATCHING)) { return 1; } @@ -150,7 +150,7 @@ char PROCESS_ACCEPTS_NOSQUASH_FN(const STATE_T *s, if (TESTBIT_STATE(s, a->state)) { DEBUG_PRINTF("state %u is on, firing report id=%u, offset=%llu\n", a->state, a->externalId, offset); - int rv = callback(offset, a->externalId, context); + int rv = callback(0, offset, a->externalId, context); if (unlikely(rv == MO_HALT_MATCHING)) { return 1; } diff --git a/src/nfa/limex_runtime.h b/src/nfa/limex_runtime.h index 778d376d..70601e27 100644 --- a/src/nfa/limex_runtime.h +++ b/src/nfa/limex_runtime.h @@ -130,7 +130,7 @@ int limexRunReports(const ReportID *reports, NfaCallback callback, for (; *reports != MO_INVALID_IDX; ++reports) { DEBUG_PRINTF("firing report for id %u at offset %llu\n", *reports, offset); - int rv = callback(offset, *reports, context); + int rv = callback(0, offset, *reports, context); if (rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 19a5ebd3..e6c1c06f 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -809,10 +809,8 @@ char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, } char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, - const char *streamState, u64a offset, - NfaCallback callback, - UNUSED SomNfaCallback som_callback, - void *context) { + const char *streamState, u64a offset, + NfaCallback callback, void *context) { assert(n && state); const IMPL_NFA_T *limex = getImplNfa(n); diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c index ac26c6a1..88da27c0 100644 --- a/src/nfa/mcclellan.c +++ b/src/nfa/mcclellan.c @@ -42,13 +42,13 @@ static really_inline char 
doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, - u16 s, u64a loc, char eod, u16 * const cached_accept_state, - u32 * const cached_accept_id) { + u16 s, u64a loc, char eod, u16 *const cached_accept_state, + u32 *const cached_accept_id) { DEBUG_PRINTF("reporting state = %hu, loc=%llu, eod %hhu\n", (u16)(s & STATE_MASK), loc, eod); if (!eod && s == *cached_accept_state) { - if (cb(loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } @@ -71,7 +71,7 @@ char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, *cached_accept_id = rl->report[0]; DEBUG_PRINTF("reporting %u\n", rl->report[0]); - if (cb(loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } @@ -80,7 +80,7 @@ char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, for (u32 i = 0; i < count; i++) { DEBUG_PRINTF("reporting %u\n", rl->report[i]); - if (cb(loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } } @@ -146,7 +146,7 @@ without_accel: if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, @@ -186,7 +186,7 @@ with_accel: if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; /* termination requested */ } } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, @@ -328,7 +328,7 @@ without_accel: u64a loc = (c - 1) - buf + offAdj + 1; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } else if (doComplexReport(cb, ctxt, m, s, loc, 0, @@ -360,7 +360,7 @@ with_accel: u64a loc = (c - 1) - buf + offAdj + 1; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - if (cb(loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } } else if (doComplexReport(cb, ctxt, m, s, loc, 0, @@ -475,7 +475,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, int rv; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - rv = cb(q_cur_offset(q), m->arb_report, context); + rv = cb(0, q_cur_offset(q), m->arb_report, context); } else { u32 cached_accept_id = 0; u16 cached_accept_state = 0; @@ -632,7 +632,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, int rv; if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - rv = cb(q_cur_offset(q), m->arb_report, context); + rv = cb(0, q_cur_offset(q), m->arb_report, context); } else { u32 cached_accept_id = 0; u16 cached_accept_state = 0; @@ -836,7 +836,7 @@ char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { if (s >= m->accept_limit_8) { if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - cb(offset, m->arb_report, ctxt); + cb(0, offset, m->arb_report, ctxt); } else { u32 
cached_accept_id = 0; u16 cached_accept_state = 0; @@ -864,7 +864,7 @@ char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { if (aux->accept) { if (single) { DEBUG_PRINTF("reporting %u\n", m->arb_report); - cb(offset, m->arb_report, ctxt); + cb(0, offset, m->arb_report, ctxt); } else { u32 cached_accept_id = 0; u16 cached_accept_state = 0; @@ -1073,17 +1073,15 @@ void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, } char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, - u64a offset, NfaCallback callback, - UNUSED SomNfaCallback som_cb, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { return mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback, context); } char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, - UNUSED const char *streamState, - u64a offset, NfaCallback callback, - UNUSED SomNfaCallback som_cb, void *context) { + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { assert(ISALIGNED_N(state, 2)); return mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback, context); diff --git a/src/nfa/mcclellan.h b/src/nfa/mcclellan.h index 677265f5..9c6b3eec 100644 --- a/src/nfa/mcclellan.h +++ b/src/nfa/mcclellan.h @@ -39,8 +39,7 @@ struct NFA; char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecMcClellan8_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report); @@ -63,8 +62,7 @@ char nfaExecMcClellan8_expandState(const struct NFA *nfa, void *dest, char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecMcClellan16_Q(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end); char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report); diff --git a/src/nfa/mpv.c b/src/nfa/mpv.c index 4bae7b18..c6c8cb88 100644 --- a/src/nfa/mpv.c +++ b/src/nfa/mpv.c @@ -131,7 +131,8 @@ char processReports(const struct mpv *m, u8 *reporters, rl_count++; } - if (cb(report_offset, curr->report, ctxt) == MO_HALT_MATCHING) { + if (cb(0, report_offset, curr->report, ctxt) == + MO_HALT_MATCHING) { DEBUG_PRINTF("bailing\n"); return MO_HALT_MATCHING; } @@ -180,7 +181,7 @@ char processReportsForRange(const struct mpv *m, u8 *reporters, for (size_t i = 2; i <= length; i++) { for (u32 j = 0; j < rl_count; j++) { - if (cb(first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) { + if (cb(0, first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) { DEBUG_PRINTF("bailing\n"); return MO_HALT_MATCHING; } diff --git a/src/nfa/nfa_api.h b/src/nfa/nfa_api.h index 3ef6dfca..9e0b6f89 100644 --- a/src/nfa/nfa_api.h +++ b/src/nfa/nfa_api.h @@ -225,6 +225,9 @@ char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID report); * Runs an NFA in reverse from (buf + buflen) to buf and then from (hbuf + hlen) * to hbuf (main buffer and history buffer). * + * Note: provides the match location as the "end" offset when the callback is + * called. 
+ * * @param nfa engine to run * @param offset base offset of buf * @param buf main buffer @@ -249,7 +252,6 @@ char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, * (including br region) * @param offset the offset to return (via the callback) with each match * @param callback the callback to call for each match raised - * @param som_cb the callback to call for each match raised (Haig) * @param context context pointer passed to each callback * * @return @ref MO_HALT_MATCHING if the user instructed us to halt, otherwise @@ -257,8 +259,7 @@ char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, */ char nfaCheckFinalState(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); /** * Indicates if an engine is a zombie. diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index b9c9f2ea..789c3014 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -76,15 +76,14 @@ char nfaCheckFinalState(const struct NFA *nfa, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context) { + NfaCallback callback, void *context) { assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); // Caller should avoid calling us if we can never produce matches. assert(nfaAcceptsEod(nfa)); DISPATCH_BY_NFA_TYPE(_testEOD(nfa, state, streamState, offset, callback, - som_cb, context)); + context)); return 0; } diff --git a/src/nfa/nfa_api_queue.h b/src/nfa/nfa_api_queue.h index 59c18fca..e3579a7e 100644 --- a/src/nfa/nfa_api_queue.h +++ b/src/nfa/nfa_api_queue.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -97,7 +97,6 @@ struct mq { * callback. 
If true, the queue must be located at a * point where MO_MATCHES_PENDING was returned */ NfaCallback cb; /**< callback to trigger on matches */ - SomNfaCallback som_cb; /**< callback with som info; used by haig */ void *context; /**< context to pass along with a callback */ struct mq_item items[MAX_MQE_LEN]; /**< queue items */ }; diff --git a/src/nfa/tamarama.c b/src/nfa/tamarama.c index e8dd7690..b7ba126c 100644 --- a/src/nfa/tamarama.c +++ b/src/nfa/tamarama.c @@ -85,7 +85,6 @@ void copyQueueProperties(const struct mq *q1, struct mq *q2, q2->history = q1->history; q2->hlength = q1->hlength; q2->cb = q1->cb; - q2->som_cb = q1->som_cb; q2->context = q1->context; q2->scratch = q1->scratch; q2->report_current = q1->report_current; @@ -266,8 +265,7 @@ void copyBack(const struct Tamarama *t, struct mq *q, struct mq *q1) { char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context) { + NfaCallback callback, void *context) { const struct Tamarama *t = getImplNfa(n); u32 activeIdx = loadActiveIdx(streamState, t->activeIdxSize); if (activeIdx == t->numSubEngines) { @@ -278,8 +276,8 @@ char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, if (nfaAcceptsEod(sub)) { assert(!isContainerType(sub->type)); const char *subStreamState = streamState + t->activeIdxSize; - return nfaCheckFinalState(sub, state, subStreamState, - offset, callback, som_cb, context); + return nfaCheckFinalState(sub, state, subStreamState, offset, callback, + context); } return MO_CONTINUE_MATCHING; diff --git a/src/nfa/tamarama.h b/src/nfa/tamarama.h index c39639a6..7ccfa5a0 100644 --- a/src/nfa/tamarama.h +++ b/src/nfa/tamarama.h @@ -43,8 +43,7 @@ struct hs_scratch; char nfaExecTamarama0_testEOD(const struct NFA *n, const char *state, const char *streamState, u64a offset, - NfaCallback callback, SomNfaCallback som_cb, - void *context); + NfaCallback callback, void *context); char nfaExecTamarama0_QR(const struct NFA *n, struct mq *q, ReportID report); char nfaExecTamarama0_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecTamarama0_inAccept(const struct NFA *n, ReportID report, diff --git a/src/rose/catchup.c b/src/rose/catchup.c index f61cf390..9a075d17 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -281,14 +281,15 @@ restart: /* for use by mpv (chained) only */ static -int roseNfaFinalBlastAdaptor(u64a offset, ReportID id, void *context) { +int roseNfaFinalBlastAdaptor(u64a som, u64a offset, ReportID id, + void *context) { struct hs_scratch *scratch = context; const struct RoseEngine *t = scratch->core_info.rose; DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", offset, id); - int cb_rv = roseNfaRunProgram(t, scratch, 0, offset, id, 1); + int cb_rv = roseNfaRunProgram(t, scratch, som, offset, id, 1); if (cb_rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { @@ -394,7 +395,6 @@ hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, assert(!q->report_current); q->cb = roseNfaFinalBlastAdaptor; - q->som_cb = NULL; DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", qi, q->cur, q->end, q->items[q->cur].location, loc); @@ -449,7 +449,7 @@ char in_mpv(const struct RoseEngine *rose, const struct hs_scratch *scratch) { } static -int roseNfaBlastAdaptor(u64a offset, ReportID id, void *context) { +int roseNfaBlastAdaptor(u64a som, u64a offset, ReportID id, void *context) { struct hs_scratch 
*scratch = context; struct RoseContext *tctxt = &scratch->tctxt; const struct RoseEngine *t = scratch->core_info.rose; @@ -458,7 +458,7 @@ int roseNfaBlastAdaptor(u64a offset, ReportID id, void *context) { offset, id); const char from_mpv = in_mpv(t, scratch); - int cb_rv = roseNfaRunProgram(t, scratch, 0, offset, id, from_mpv); + int cb_rv = roseNfaRunProgram(t, scratch, som, offset, id, from_mpv); if (cb_rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { @@ -470,65 +470,8 @@ int roseNfaBlastAdaptor(u64a offset, ReportID id, void *context) { } } -static -int roseNfaBlastAdaptorNoInternal(u64a offset, ReportID id, void *context) { - struct hs_scratch *scratch = context; - struct RoseContext *tctxt = &scratch->tctxt; - const struct RoseEngine *t = scratch->core_info.rose; - - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); - - assert(!in_mpv(t, scratch)); - - int cb_rv = roseNfaRunProgram(t, scratch, 0, offset, id, 0); - if (cb_rv == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; - } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return MO_CONTINUE_MATCHING; - } else { - assert(cb_rv == MO_CONTINUE_MATCHING); - return !roseSuffixIsExhausted(t, tctxt->curr_qi, - scratch->core_info.exhaustionVector); - } -} - -static -int roseNfaBlastSomAdaptor(u64a from_offset, u64a offset, ReportID id, - void *context) { - struct hs_scratch *scratch = context; - struct RoseContext *tctxt = &scratch->tctxt; - const struct RoseEngine *t = scratch->core_info.rose; - - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); - - assert(!in_mpv(t, scratch)); - - /* must be a external report as haig cannot directly participate in chain */ - int cb_rv = roseNfaRunProgram(scratch->core_info.rose, scratch, from_offset, - offset, id, 0); - if (cb_rv == MO_HALT_MATCHING) { - return MO_HALT_MATCHING; - } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { - return MO_CONTINUE_MATCHING; - } else { - assert(cb_rv == MO_CONTINUE_MATCHING); - return !roseSuffixIsExhausted(t, tctxt->curr_qi, - scratch->core_info.exhaustionVector); - } -} - -int roseNfaAdaptor(u64a offset, ReportID id, void *context) { - struct hs_scratch *scratch = context; - DEBUG_PRINTF("masky got himself a match @%llu id %u !woot!\n", offset, id); - - return roseNfaRunProgram(scratch->core_info.rose, scratch, 0, offset, id, - 0); -} - -int roseNfaSomAdaptor(u64a from_offset, u64a offset, ReportID id, - void *context) { +int roseNfaAdaptor(u64a from_offset, u64a offset, ReportID id, + void *context) { struct hs_scratch *scratch = context; DEBUG_PRINTF("masky got himself a match @%llu id %u !woot!\n", offset, id); @@ -538,24 +481,15 @@ int roseNfaSomAdaptor(u64a from_offset, u64a offset, ReportID id, } static really_inline -char blast_queue(const struct RoseEngine *t, struct hs_scratch *scratch, - struct mq *q, u32 qi, s64a to_loc, char report_current) { - struct RoseContext *tctxt = &scratch->tctxt; - const struct NfaInfo *info = getNfaInfoByQueue(t, qi); - - tctxt->curr_qi = qi; - if (info->only_external) { - q->cb = roseNfaBlastAdaptorNoInternal; - } else { - q->cb = roseNfaBlastAdaptor; - } +char blast_queue(struct hs_scratch *scratch, struct mq *q, u32 qi, s64a to_loc, + char report_current) { + scratch->tctxt.curr_qi = qi; + q->cb = roseNfaBlastAdaptor; q->report_current = report_current; - q->som_cb = roseNfaBlastSomAdaptor; DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", qi, q->cur, q->end, q_cur_loc(q), 
to_loc); char alive = nfaQueueExec(q->nfa, q, to_loc); q->cb = roseNfaAdaptor; - q->som_cb = roseNfaSomAdaptor; assert(!q->report_current); return alive; @@ -585,7 +519,7 @@ hwlmcb_rv_t buildSufPQ_final(const struct RoseEngine *t, s64a report_ok_loc, ensureEnd(q, a_qi, final_loc); - char alive = blast_queue(t, scratch, q, a_qi, second_place_loc, 0); + char alive = blast_queue(scratch, q, a_qi, second_place_loc, 0); /* We have three possible outcomes: * (1) the nfa died @@ -881,7 +815,7 @@ hwlmcb_rv_t roseCatchUpNfas(const struct RoseEngine *t, s64a loc, continue; } - char alive = blast_queue(t, scratch, q, qi, second_place_loc, 1); + char alive = blast_queue(scratch, q, qi, second_place_loc, 1); if (!alive) { if (can_stop_matching(scratch)) { diff --git a/src/rose/match.c b/src/rose/match.c index bea2b5d2..eb8def9b 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -211,7 +211,7 @@ event_enqueued: return HWLM_CONTINUE_MATCHING; } -int roseAnchoredCallback(u64a end, u32 id, void *ctx) { +int roseAnchoredCallback(u64a som, u64a end, u32 id, void *ctx) { struct hs_scratch *scratch = ctx; struct RoseContext *tctxt = &scratch->tctxt; struct core_info *ci = &scratch->core_info; @@ -243,7 +243,6 @@ int roseAnchoredCallback(u64a end, u32 id, void *ctx) { const u32 *programs = getByOffset(t, t->litProgramOffset); assert(id < t->literalCount); - const u64a som = 0; const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; if (roseRunProgram(t, scratch, programs[id], som, real_end, match_len, flags) == HWLM_TERMINATE_MATCHING) { @@ -648,8 +647,8 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, return MO_CONTINUE_MATCHING; } -static really_inline -int roseReportAdaptor_i(u64a som, u64a offset, ReportID id, void *context) { +int roseReportAdaptor(u64a som, u64a offset, ReportID id, void *context) { + DEBUG_PRINTF("som=%llu, offset=%llu, id=%u\n", som, offset, id); struct hs_scratch *scratch = context; assert(scratch && scratch->magic == SCRATCH_MAGIC); @@ -667,13 +666,3 @@ int roseReportAdaptor_i(u64a som, u64a offset, ReportID id, void *context) { return can_stop_matching(scratch) ? 
MO_HALT_MATCHING : MO_CONTINUE_MATCHING; } - -int roseReportAdaptor(u64a offset, ReportID id, void *context) { - DEBUG_PRINTF("offset=%llu, id=%u\n", offset, id); - return roseReportAdaptor_i(0, offset, id, context); -} - -int roseReportSomAdaptor(u64a som, u64a offset, ReportID id, void *context) { - DEBUG_PRINTF("som=%llu, offset=%llu, id=%u\n", som, offset, id); - return roseReportAdaptor_i(som, offset, id, context); -} diff --git a/src/rose/match.h b/src/rose/match.h index 48866d1f..49afa588 100644 --- a/src/rose/match.h +++ b/src/rose/match.h @@ -48,8 +48,7 @@ /* Callbacks, defined in catchup.c */ -int roseNfaAdaptor(u64a offset, ReportID id, void *context); -int roseNfaSomAdaptor(u64a from_offset, u64a offset, ReportID id, void *context); +int roseNfaAdaptor(u64a from_offset, u64a offset, ReportID id, void *context); /* Callbacks, defined in match.c */ @@ -57,7 +56,7 @@ hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctx); hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctx); hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, void *ctx); -int roseAnchoredCallback(u64a end, u32 id, void *ctx); +int roseAnchoredCallback(u64a som, u64a end, u32 id, void *ctx); /* Common code, used all over Rose runtime */ @@ -82,7 +81,6 @@ void initQueue(struct mq *q, u32 qi, const struct RoseEngine *t, q->history = scratch->core_info.hbuf; q->hlength = scratch->core_info.hlen; q->cb = roseNfaAdaptor; - q->som_cb = roseNfaSomAdaptor; q->context = scratch; q->report_current = 0; diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index 7f8c32e5..e90395fb 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -722,13 +722,13 @@ u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, u64a start = ~0ULL; /* switch the callback + context for a fun one */ - q->som_cb = roseNfaEarliestSom; + q->cb = roseNfaEarliestSom; q->context = &start; nfaReportCurrentMatches(q->nfa, q); /* restore the old callback + context */ - q->som_cb = roseNfaSomAdaptor; + q->cb = roseNfaAdaptor; q->context = NULL; DEBUG_PRINTF("earliest som is %llu\n", start); return start; @@ -779,7 +779,7 @@ hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, } if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, - roseReportAdaptor, roseReportSomAdaptor, + roseReportAdaptor, scratch) == MO_HALT_MATCHING) { DEBUG_PRINTF("user instructed us to stop\n"); return HWLM_TERMINATE_MATCHING; @@ -815,7 +815,7 @@ hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, continue; } if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, - roseReportAdaptor, roseReportSomAdaptor, + roseReportAdaptor, scratch) == MO_HALT_MATCHING) { DEBUG_PRINTF("user instructed us to stop\n"); return HWLM_TERMINATE_MATCHING; diff --git a/src/rose/rose.h b/src/rose/rose.h index ca8bf353..ecf16854 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -49,8 +49,7 @@ void roseStreamEodExec(const struct RoseEngine *t, u64a offset, hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, void *context); -int roseReportAdaptor(u64a offset, ReportID id, void *context); -int roseReportSomAdaptor(u64a som, u64a offset, ReportID id, void *context); +int roseReportAdaptor(u64a som, u64a offset, ReportID id, void *context); int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, u64a stream_offset, struct hs_scratch *scratch); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 
23e025d0..5b3806a2 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -2616,16 +2616,6 @@ bool anyEndfixMpvTriggers(const RoseBuildImpl &tbi) { return false; } -static -bool hasInternalReport(const set &reports, const ReportManager &rm) { - for (ReportID r : reports) { - if (!isExternalReport(rm.getReport(r))) { - return true; - } - } - return false; -} - static void populateNfaInfoBasics(const RoseBuildImpl &build, const build_context &bc, const vector &outfixes, @@ -2643,24 +2633,10 @@ void populateNfaInfoBasics(const RoseBuildImpl &build, const build_context &bc, info.no_retrigger = contains(no_retrigger_queues, qi) ? 1 : 0; } - // Mark outfixes that only trigger external reports. + // Mark outfixes that are in the small block matcher. for (const auto &out : outfixes) { const u32 qi = out.get_queue(); - infos[qi].in_sbmatcher = out.in_sbmatcher; - if (!hasInternalReport(all_reports(out), build.rm)) { - infos[qi].only_external = 1; - } - } - - // Mark suffixes that only trigger external reports. - for (const auto &e : bc.suffixes) { - const suffix_id &s = e.first; - u32 qi = e.second; - - if (!hasInternalReport(all_reports(s), build.rm)) { - infos[qi].only_external = 1; - } } // Mark suffixes triggered by EOD table literals. diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 5b6a9dc6..803810b0 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -156,8 +156,6 @@ struct NfaInfo { u32 fullStateOffset; /* offset in scratch, relative to ??? */ u32 ekeyListOffset; /* suffix, relative to base of rose, 0 if no ekeys */ u8 no_retrigger; /* TODO */ - u8 only_external; /**< does not raise any som internal events or chained - * rose events */ u8 in_sbmatcher; /**< this outfix should not be run in small-block * execution, as it will be handled by the sbmatcher * HWLM table. 
*/ diff --git a/src/runtime.c b/src/runtime.c index 7da41d29..2def17c8 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -217,7 +217,6 @@ void initOutfixQueue(struct mq *q, u32 qi, const struct RoseEngine *t, q->history = scratch->core_info.hbuf; q->hlength = scratch->core_info.hlen; q->cb = roseReportAdaptor; - q->som_cb = roseReportSomAdaptor; q->context = scratch; q->report_current = 0; @@ -257,8 +256,8 @@ void soleOutfixBlockExec(const struct RoseEngine *t, char rv = nfaQueueExec(q->nfa, q, scratch->core_info.len); if (rv && nfaAcceptsEod(nfa) && len == scratch->core_info.len) { - nfaCheckFinalState(nfa, q->state, q->streamState, q->length, - q->cb, q->som_cb, scratch); + nfaCheckFinalState(nfa, q->state, q->streamState, q->length, q->cb, + scratch); } } @@ -568,7 +567,7 @@ void soleOutfixEodExec(hs_stream_t *id, hs_scratch_t *scratch) { assert(nfaAcceptsEod(nfa)); nfaCheckFinalState(nfa, q->state, q->streamState, q->offset, q->cb, - q->som_cb, scratch); + scratch); } static really_inline diff --git a/src/som/som_runtime.c b/src/som/som_runtime.c index 9d0a1390..b9972b2c 100644 --- a/src/som/som_runtime.c +++ b/src/som/som_runtime.c @@ -87,7 +87,7 @@ char ok_and_mark_if_unset(u8 *som_store_valid, struct fatbit *som_set_now, } static -int somRevCallback(u64a offset, ReportID id, void *ctx) { +int somRevCallback(UNUSED u64a som, u64a offset, ReportID id, void *ctx) { DEBUG_PRINTF("offset=%llu, id=%u\n", offset, id); // We use the id to store the offset adjustment (for assertions like a diff --git a/unit/internal/lbr.cpp b/unit/internal/lbr.cpp index f335e184..bd799c0f 100644 --- a/unit/internal/lbr.cpp +++ b/unit/internal/lbr.cpp @@ -71,7 +71,7 @@ struct LbrTestParams { }; static -int onMatch(u64a, ReportID, void *ctx) { +int onMatch(u64a, u64a, ReportID, void *ctx) { unsigned *matches = (unsigned *)ctx; (*matches)++; return MO_CONTINUE_MATCHING; @@ -125,7 +125,6 @@ protected: q.scratch = nullptr; // not needed by LBR q.report_current = 0; q.cb = onMatch; - q.som_cb = nullptr; // only used by Haig q.context = &matches; } diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index eb6ce08b..c3cfb3dd 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -51,7 +51,7 @@ static const string SCAN_DATA = "___foo______\n___foofoo_foo_^^^^^^^^^^^^^^^^^^" static const u32 MATCH_REPORT = 1024; static -int onMatch(u64a, ReportID, void *ctx) { +int onMatch(u64a, u64a, ReportID, void *ctx) { unsigned *matches = (unsigned *)ctx; (*matches)++; return MO_CONTINUE_MATCHING; @@ -105,7 +105,6 @@ protected: q.scratch = nullptr; /* limex does not use scratch */ q.report_current = 0; q.cb = onMatch; - q.som_cb = nullptr; // only used by Haig q.context = &matches; } @@ -293,8 +292,7 @@ TEST_P(LimExModelTest, CheckFinalState) { // Check for EOD matches. 
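    // nfaCheckFinalState reports EOD matches through the single combined
    // NfaCallback (onMatch here), with no separate SOM callback.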
char rv = nfaCheckFinalState(nfa.get(), full_state.get(), - stream_state.get(), end, onMatch, nullptr, - &matches); + stream_state.get(), end, onMatch, &matches); ASSERT_EQ(MO_CONTINUE_MATCHING, rv); } @@ -400,7 +398,6 @@ protected: q.scratch = nullptr; /* limex does not use scratch */ q.report_current = 0; q.cb = onMatch; - q.som_cb = nullptr; // only used by Haig q.context = &matches; } From 9f98f4c7b2325a8b2690b8540c4115fd4b56537b Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 29 Jun 2016 15:04:40 +1000 Subject: [PATCH 077/166] nfa: standardise callback start, end naming --- src/nfa/callback.h | 26 ++++++++++++++------------ src/rose/catchup.c | 31 +++++++++++++++---------------- src/rose/match.c | 12 +++++++----- src/rose/match.h | 4 ++-- src/rose/program_runtime.c | 5 +++-- src/rose/program_runtime.h | 4 ++-- src/rose/rose.h | 2 +- src/som/som_runtime.c | 6 +++--- 8 files changed, 47 insertions(+), 43 deletions(-) diff --git a/src/nfa/callback.h b/src/nfa/callback.h index 0284f1d5..9bdaa8d1 100644 --- a/src/nfa/callback.h +++ b/src/nfa/callback.h @@ -37,24 +37,26 @@ /** \brief The type for an NFA callback. * - * This is a function that takes as arguments the current offset where the - * match occurs, the id of the match and the context pointer that was passed - * into the NFA API function that executed the NFA. + * This is a function that takes as arguments the current start and end offsets + * where the match occurs, the id of the match and the context pointer that was + * passed into the NFA API function that executed the NFA. * - * The offset where the match occurs will be the offset after the character - * that caused the match. Thus, if we have a buffer containing 'abc', then a - * pattern that matches an empty string will have an offset of 0, a pattern - * that matches 'a' will have an offset of 1, and a pattern that matches 'abc' - * will have an offset of 3, which will be a value that is 'beyond' the size of - * the buffer. That is, if we have n characters in the buffer, there are n+1 - * different potential offsets for matches. + * The start offset is the "start of match" (SOM) offset for the match. It is + * only provided by engines that natively support SOM tracking (e.g. Gough). + * + * The end offset will be the offset after the character that caused the match. + * Thus, if we have a buffer containing 'abc', then a pattern that matches an + * empty string will have an offset of 0, a pattern that matches 'a' will have + * an offset of 1, and a pattern that matches 'abc' will have an offset of 3, + * which will be a value that is 'beyond' the size of the buffer. That is, if + * we have n characters in the buffer, there are n+1 different potential + * offsets for matches. * * This function should return an int - currently the possible return values * are 0, which means 'stop running the engine' or non-zero, which means * 'continue matching'. 
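 *
 * As an illustration only (countMatches and its counter are hypothetical,
 * not part of the API), a minimal callback that counts matches and always
 * continues might be:
 *
 *     static
 *     int countMatches(u64a start, u64a end, ReportID id, void *context) {
 *         unsigned *matches = context; // user-supplied counter
 *         (*matches)++;
 *         return MO_CONTINUE_MATCHING; // non-zero: keep matching
 *     }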
*/ -typedef int (*NfaCallback)(u64a from_offset, u64a to_offset, ReportID id, - void *context); +typedef int (*NfaCallback)(u64a start, u64a end, ReportID id, void *context); /** * standard \ref NfaCallback return value indicating that engine execution diff --git a/src/rose/catchup.c b/src/rose/catchup.c index 9a075d17..017a6bf0 100644 --- a/src/rose/catchup.c +++ b/src/rose/catchup.c @@ -281,15 +281,14 @@ restart: /* for use by mpv (chained) only */ static -int roseNfaFinalBlastAdaptor(u64a som, u64a offset, ReportID id, - void *context) { +int roseNfaFinalBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); const struct RoseEngine *t = scratch->core_info.rose; - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); - int cb_rv = roseNfaRunProgram(t, scratch, som, offset, id, 1); + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, 1); if (cb_rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { @@ -449,35 +448,35 @@ char in_mpv(const struct RoseEngine *rose, const struct hs_scratch *scratch) { } static -int roseNfaBlastAdaptor(u64a som, u64a offset, ReportID id, void *context) { +int roseNfaBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; - struct RoseContext *tctxt = &scratch->tctxt; + assert(scratch && scratch->magic == SCRATCH_MAGIC); const struct RoseEngine *t = scratch->core_info.rose; - DEBUG_PRINTF("masky got himself a blasted match @%llu id %u !woot!\n", - offset, id); + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); const char from_mpv = in_mpv(t, scratch); - int cb_rv = roseNfaRunProgram(t, scratch, som, offset, id, from_mpv); + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, from_mpv); if (cb_rv == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { return MO_CONTINUE_MATCHING; } else { assert(cb_rv == MO_CONTINUE_MATCHING); - return !roseSuffixIsExhausted(t, tctxt->curr_qi, + return !roseSuffixIsExhausted(t, scratch->tctxt.curr_qi, scratch->core_info.exhaustionVector); } } -int roseNfaAdaptor(u64a from_offset, u64a offset, ReportID id, - void *context) { +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; - DEBUG_PRINTF("masky got himself a match @%llu id %u !woot!\n", offset, id); + assert(scratch && scratch->magic == SCRATCH_MAGIC); + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); /* must be a external report as haig cannot directly participate in chain */ - return roseNfaRunProgram(scratch->core_info.rose, scratch, from_offset, - offset, id, 0); + return roseNfaRunProgram(scratch->core_info.rose, scratch, start, end, id, + 0); } static really_inline diff --git a/src/rose/match.c b/src/rose/match.c index eb8def9b..2b05fd76 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -211,8 +211,9 @@ event_enqueued: return HWLM_CONTINUE_MATCHING; } -int roseAnchoredCallback(u64a som, u64a end, u32 id, void *ctx) { +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { struct hs_scratch *scratch = ctx; + assert(scratch && scratch->magic == SCRATCH_MAGIC); struct RoseContext *tctxt = &scratch->tctxt; struct core_info *ci = &scratch->core_info; const struct RoseEngine *t = ci->rose; @@ -244,7 +245,7 @@ int 
roseAnchoredCallback(u64a som, u64a end, u32 id, void *ctx) { const u32 *programs = getByOffset(t, t->litProgramOffset); assert(id < t->literalCount); const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; - if (roseRunProgram(t, scratch, programs[id], som, real_end, match_len, + if (roseRunProgram(t, scratch, programs[id], start, real_end, match_len, flags) == HWLM_TERMINATE_MATCHING) { assert(can_stop_matching(scratch)); DEBUG_PRINTF("caller requested termination\n"); @@ -647,11 +648,12 @@ int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, return MO_CONTINUE_MATCHING; } -int roseReportAdaptor(u64a som, u64a offset, ReportID id, void *context) { - DEBUG_PRINTF("som=%llu, offset=%llu, id=%u\n", som, offset, id); +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { struct hs_scratch *scratch = context; assert(scratch && scratch->magic == SCRATCH_MAGIC); + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + const struct RoseEngine *rose = scratch->core_info.rose; // Our match ID is the program offset. @@ -659,7 +661,7 @@ int roseReportAdaptor(u64a som, u64a offset, ReportID id, void *context) { const size_t match_len = 0; // Unused in this path. const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; hwlmcb_rv_t rv = - roseRunProgram(rose, scratch, program, som, offset, match_len, flags); + roseRunProgram(rose, scratch, program, start, end, match_len, flags); if (rv == HWLM_TERMINATE_MATCHING) { return MO_HALT_MATCHING; } diff --git a/src/rose/match.h b/src/rose/match.h index 49afa588..b69ff158 100644 --- a/src/rose/match.h +++ b/src/rose/match.h @@ -48,7 +48,7 @@ /* Callbacks, defined in catchup.c */ -int roseNfaAdaptor(u64a from_offset, u64a offset, ReportID id, void *context); +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context); /* Callbacks, defined in match.c */ @@ -56,7 +56,7 @@ hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *ctx); hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctx); hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, void *ctx); -int roseAnchoredCallback(u64a som, u64a end, u32 id, void *ctx); +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx); /* Common code, used all over Rose runtime */ diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 7669103f..23532d40 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -33,10 +33,11 @@ #include "program_runtime.h" -int roseNfaEarliestSom(u64a from_offset, UNUSED u64a offset, UNUSED ReportID id, +int roseNfaEarliestSom(u64a start, UNUSED u64a end, UNUSED ReportID id, void *context) { + assert(context); u64a *som = context; - *som = MIN(*som, from_offset); + *som = MIN(*som, start); return MO_CONTINUE_MATCHING; } diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index e90395fb..fe71772e 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -702,8 +702,8 @@ int roseCheckLookaround(const struct RoseEngine *t, return 1; } -int roseNfaEarliestSom(u64a from_offset, u64a offset, ReportID id, - void *context); +int roseNfaEarliestSom(u64a start, u64a end, ReportID id, void *context); + static rose_inline u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, const u32 qi, UNUSED const u32 leftfixLag) { diff --git a/src/rose/rose.h b/src/rose/rose.h index ecf16854..280e3bd5 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -49,7 +49,7 @@ void roseStreamEodExec(const struct RoseEngine *t, u64a 
offset, hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, void *context); -int roseReportAdaptor(u64a som, u64a offset, ReportID id, void *context); +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context); int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, u64a stream_offset, struct hs_scratch *scratch); diff --git a/src/som/som_runtime.c b/src/som/som_runtime.c index b9972b2c..1a868efc 100644 --- a/src/som/som_runtime.c +++ b/src/som/som_runtime.c @@ -87,14 +87,14 @@ char ok_and_mark_if_unset(u8 *som_store_valid, struct fatbit *som_set_now, } static -int somRevCallback(UNUSED u64a som, u64a offset, ReportID id, void *ctx) { - DEBUG_PRINTF("offset=%llu, id=%u\n", offset, id); +int somRevCallback(UNUSED u64a start, u64a end, ReportID id, void *ctx) { + DEBUG_PRINTF("offset=%llu, id=%u\n", end, id); // We use the id to store the offset adjustment (for assertions like a // leading \b or multiline mode). assert(id <= 1); u64a *from_offset = ctx; - LIMIT_TO_AT_MOST(from_offset, offset + id); + LIMIT_TO_AT_MOST(from_offset, end + id); return 1; // continue matching. } From 4d6934fc7762d6803e086eb16143f93b7595f41b Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Mon, 6 Jun 2016 11:54:21 +1000 Subject: [PATCH 078/166] Move limex specific shuffle utils and ssse3 funcs --- CMakeLists.txt | 7 +- src/fdr/fdr.c | 1 - src/fdr/teddy.c | 1 - src/fdr/teddy_avx2.c | 1 - src/hwlm/noodle_engine.c | 1 - src/nfa/limex_accel.c | 52 +++--- src/{util/shuffle.h => nfa/limex_shuffle.h} | 37 +++- src/nfa/multishufti_avx2.h | 3 +- src/nfa/multishufti_sse.h | 3 +- src/nfa/multitruffle.c | 3 +- src/nfa/shufti.c | 3 +- src/nfa/shufti_common.h | 3 +- src/nfa/truffle.c | 3 +- src/nfa/truffle_common.h | 3 +- src/rose/counting_miracle.h | 3 +- src/util/masked_move.h | 3 +- src/util/shuffle_ssse3.h | 79 --------- src/util/{simd_utils_ssse3.c => simd_utils.c} | 2 +- src/util/simd_utils.h | 44 ++++- src/util/simd_utils_ssse3.h | 166 ------------------ unit/internal/shuffle.cpp | 134 ++++++++------ unit/internal/simd_utils.cpp | 1 - 22 files changed, 182 insertions(+), 371 deletions(-) rename src/{util/shuffle.h => nfa/limex_shuffle.h} (74%) delete mode 100644 src/util/shuffle_ssse3.h rename src/util/{simd_utils_ssse3.c => simd_utils.c} (98%) delete mode 100644 src/util/simd_utils_ssse3.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 94a54241..67109797 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -445,6 +445,7 @@ set (hs_exec_SRCS src/nfa/limex_internal.h src/nfa/limex_runtime.h src/nfa/limex_runtime_impl.h + src/nfa/limex_shuffle.h src/nfa/limex_state_impl.h src/nfa/mpv.h src/nfa/mpv.c @@ -525,11 +526,8 @@ set (hs_exec_SRCS src/util/pqueue.h src/util/scatter.h src/util/scatter_runtime.h - src/util/shuffle.h - src/util/shuffle_ssse3.h src/util/simd_utils.h - src/util/simd_utils_ssse3.h - src/util/simd_utils_ssse3.c + src/util/simd_utils.c src/util/state_compress.h src/util/state_compress.c src/util/unaligned.h @@ -887,7 +885,6 @@ SET (hs_SRCS src/util/report_manager.cpp src/util/report_manager.h src/util/simd_utils.h - src/util/simd_utils_ssse3.h src/util/target_info.cpp src/util/target_info.h src/util/ue2_containers.h diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index ff69853e..aa9d1c1d 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -36,7 +36,6 @@ #include "teddy.h" #include "teddy_internal.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /** \brief number of bytes processed in each iteration */ #define ITER_BYTES 16 diff --git 
a/src/fdr/teddy.c b/src/fdr/teddy.c index 08b761c0..4ff0b18e 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -36,7 +36,6 @@ #include "teddy_internal.h" #include "teddy_runtime_common.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 33dd8a30..f282c505 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -36,7 +36,6 @@ #include "teddy_internal.h" #include "teddy_runtime_common.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #if defined(__AVX2__) diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index e2f80a59..1d1ab4e6 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -37,7 +37,6 @@ #include "util/compare.h" #include "util/masked_move.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #include #include diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index 2c73f9ff..28f37083 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -35,6 +35,7 @@ #include "accel.h" #include "limex_internal.h" #include "limex_limits.h" +#include "limex_shuffle.h" #include "nfa_internal.h" #include "shufti.h" #include "truffle.h" @@ -44,10 +45,7 @@ #include "ue2common.h" #include "vermicelli.h" #include "util/bitutils.h" -#include "util/shuffle.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" -#include "util/shuffle_ssse3.h" static really_inline size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, @@ -80,7 +78,7 @@ size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, const union AccelAux *aux, const u8 *input, size_t i, size_t end) { - u32 idx = shuffleDynamic32(s, accel); + u32 idx = packedExtract32(s, accel); return accelScanWrapper(accelTable, aux, input, idx, i, end); } @@ -92,7 +90,7 @@ size_t doAccel128(const m128 *state, const struct LimExNFA128 *limex, DEBUG_PRINTF("using PSHUFB for 128-bit shuffle\n"); m128 accelPerm = limex->accelPermute; m128 accelComp = limex->accelCompare; - idx = shufflePshufb128(s, accelPerm, accelComp); + idx = packedExtract128(s, accelPerm, accelComp); return accelScanWrapper(accelTable, aux, input, idx, i, end); } @@ -105,17 +103,13 @@ size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex, m256 accelPerm = limex->accelPermute; m256 accelComp = limex->accelCompare; #if !defined(__AVX2__) - u32 idx1 = shufflePshufb128(s.lo, accelPerm.lo, accelComp.lo); - u32 idx2 = shufflePshufb128(s.hi, accelPerm.hi, accelComp.hi); -#else - // TODO: learn you some avx2 shuffles for great good - u32 idx1 = shufflePshufb128(movdq_lo(s), movdq_lo(accelPerm), - movdq_lo(accelComp)); - u32 idx2 = shufflePshufb128(movdq_hi(s), movdq_hi(accelPerm), - movdq_hi(accelComp)); -#endif + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); assert((idx1 & idx2) == 0); // should be no shared bits idx = idx1 | idx2; +#else + idx = packedExtract256(s, accelPerm, accelComp); +#endif return accelScanWrapper(accelTable, aux, input, idx, i, end); } @@ -127,9 +121,9 @@ size_t doAccel384(const m384 *state, const struct LimExNFA384 *limex, DEBUG_PRINTF("using PSHUFB for 384-bit shuffle\n"); m384 accelPerm = limex->accelPermute; m384 accelComp = limex->accelCompare; - u32 idx1 = 
shufflePshufb128(s.lo, accelPerm.lo, accelComp.lo); - u32 idx2 = shufflePshufb128(s.mid, accelPerm.mid, accelComp.mid); - u32 idx3 = shufflePshufb128(s.hi, accelPerm.hi, accelComp.hi); + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.mid, accelPerm.mid, accelComp.mid); + u32 idx3 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); assert((idx1 & idx2 & idx3) == 0); // should be no shared bits idx = idx1 | idx2 | idx3; return accelScanWrapper(accelTable, aux, input, idx, i, end); @@ -144,21 +138,17 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex, m512 accelPerm = limex->accelPermute; m512 accelComp = limex->accelCompare; #if !defined(__AVX2__) - u32 idx1 = shufflePshufb128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); - u32 idx2 = shufflePshufb128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); - u32 idx3 = shufflePshufb128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); - u32 idx4 = shufflePshufb128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi); -#else - u32 idx1 = shufflePshufb128(movdq_lo(s.lo), movdq_lo(accelPerm.lo), - movdq_lo(accelComp.lo)); - u32 idx2 = shufflePshufb128(movdq_hi(s.lo), movdq_hi(accelPerm.lo), - movdq_hi(accelComp.lo)); - u32 idx3 = shufflePshufb128(movdq_lo(s.hi), movdq_lo(accelPerm.hi), - movdq_lo(accelComp.hi)); - u32 idx4 = shufflePshufb128(movdq_hi(s.hi), movdq_hi(accelPerm.hi), - movdq_hi(accelComp.hi)); -#endif + u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); + u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); + u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); + u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi); assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits idx = idx1 | idx2 | idx3 | idx4; +#else + u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 & idx2) == 0); // should be no shared bits + idx = idx1 | idx2; +#endif return accelScanWrapper(accelTable, aux, input, idx, i, end); } diff --git a/src/util/shuffle.h b/src/nfa/limex_shuffle.h similarity index 74% rename from src/util/shuffle.h rename to src/nfa/limex_shuffle.h index ba85fb5d..40900a65 100644 --- a/src/util/shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,20 +34,19 @@ * be faster and actually correct if these assumptions don't hold true. */ -#ifndef SHUFFLE_H -#define SHUFFLE_H +#ifndef LIMEX_SHUFFLE_H +#define LIMEX_SHUFFLE_H -#include "config.h" -#include "bitutils.h" -#include "simd_utils.h" #include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" #if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) #define HAVE_PEXT #endif static really_inline -u32 shuffleDynamic32(u32 x, u32 mask) { +u32 packedExtract32(u32 x, u32 mask) { #if defined(HAVE_PEXT) // Intel BMI2 can do this operation in one instruction. return _pext_u32(x, mask); @@ -67,7 +66,7 @@ u32 shuffleDynamic32(u32 x, u32 mask) { } static really_inline -u32 shuffleDynamic64(u64a x, u64a mask) { +u32 packedExtract64(u64a x, u64a mask) { #if defined(HAVE_PEXT) && defined(ARCH_64_BIT) // Intel BMI2 can do this operation in one instruction. 
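    // _pext_u64 gathers the bits of x selected by mask and packs them
    // contiguously into the low bits of the result; for example,
    // x = 0xA0 with mask = 0xF0 yields 0xA.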
return _pext_u64(x, mask); @@ -88,4 +87,24 @@ u32 shuffleDynamic64(u64a x, u64a mask) { #undef HAVE_PEXT -#endif // SHUFFLE_H +static really_inline +u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { + m128 shuffled = pshufb(s, permute); + m128 compared = and128(shuffled, compare); + u16 rv = ~cmpmsk8(compared, shuffled); + return (u32)rv; +} + +#if defined(__AVX2__) +static really_inline +u32 packedExtract256(m256 s, const m256 permute, const m256 compare) { + // vpshufb doesn't cross lanes, so this is a bit of a cheat + m256 shuffled = vpshufb(s, permute); + m256 compared = and256(shuffled, compare); + u32 rv = ~movemask256(eq256(compared, shuffled)); + // stitch the lane-wise results back together + return (u32)((rv >> 16) | (rv & 0xffffU)); +} +#endif // AVX2 + +#endif // LIMEX_SHUFFLE_H diff --git a/src/nfa/multishufti_avx2.h b/src/nfa/multishufti_avx2.h index e9980872..042f5570 100644 --- a/src/nfa/multishufti_avx2.h +++ b/src/nfa/multishufti_avx2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,6 @@ #include "ue2common.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" static really_inline const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars, diff --git a/src/nfa/multishufti_sse.h b/src/nfa/multishufti_sse.h index 7ea5946d..0a9b543e 100644 --- a/src/nfa/multishufti_sse.h +++ b/src/nfa/multishufti_sse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,6 @@ #include "ue2common.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /* Normal SSSE3 shufti */ diff --git a/src/nfa/multitruffle.c b/src/nfa/multitruffle.c index 3af6394a..381bda93 100644 --- a/src/nfa/multitruffle.c +++ b/src/nfa/multitruffle.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,7 +32,6 @@ #include "multitruffle.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #include "multiaccel_common.h" diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index b1fec488..5aba9847 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,7 +40,6 @@ #include "shufti_common.h" -#include "util/simd_utils_ssse3.h" /** \brief Naive byte-by-byte implementation. 
*/ static really_inline diff --git a/src/nfa/shufti_common.h b/src/nfa/shufti_common.h index 9c11f2b9..84835665 100644 --- a/src/nfa/shufti_common.h +++ b/src/nfa/shufti_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,6 @@ #include "util/bitutils.h" #include "util/simd_utils.h" #include "util/unaligned.h" -#include "util/simd_utils_ssse3.h" /* * Common stuff for all versions of shufti (single, multi and multidouble) diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index 8863c71a..1eff269a 100644 --- a/src/nfa/truffle.c +++ b/src/nfa/truffle.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,6 @@ #include "truffle.h" #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" #include "truffle_common.h" diff --git a/src/nfa/truffle_common.h b/src/nfa/truffle_common.h index 122f65c4..593a605e 100644 --- a/src/nfa/truffle_common.h +++ b/src/nfa/truffle_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,6 @@ #include "util/bitutils.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /* * Common stuff for all versions of truffle (single, multi and multidouble) diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index d36ed272..cd84d052 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,6 @@ #include "rose_internal.h" #include "nfa/nfa_api_queue.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" /** \brief Maximum number of bytes to scan when looking for a "counting miracle" * stop character. */ diff --git a/src/util/masked_move.h b/src/util/masked_move.h index 93c79e75..09276e80 100644 --- a/src/util/masked_move.h +++ b/src/util/masked_move.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,6 @@ #include "unaligned.h" #include "simd_utils.h" -#include "simd_utils_ssse3.h" #ifdef __cplusplus extern "C" { diff --git a/src/util/shuffle_ssse3.h b/src/util/shuffle_ssse3.h deleted file mode 100644 index d295839b..00000000 --- a/src/util/shuffle_ssse3.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef SHUFFLE_SSSE3_H -#define SHUFFLE_SSSE3_H - -#include "simd_utils_ssse3.h" - -#ifdef DEBUG -#include "compare.h" -static really_inline void shufDumpMsk(m128 msk) { - u8 * mskAsU8 = (u8 *)&msk; - for (int i = 0; i < 16; i++) { - u8 c = mskAsU8[i]; - for (int j = 0; j < 8; j++) { - if ((c >> (7-j)) & 0x1) - printf("1"); - else - printf("0"); - } - printf(" "); - } -} - -static really_inline void shufDumpMskAsChars(m128 msk) { - u8 * mskAsU8 = (u8 *)&msk; - for (int i = 0; i < 16; i++) { - u8 c = mskAsU8[i]; - if (ourisprint(c)) - printf("%c",c); - else - printf("."); - } -} -#endif - -#if !defined(NO_SSSE3) -static really_inline -u32 shufflePshufb128(m128 s, const m128 permute, const m128 compare) { - m128 shuffled = pshufb(s, permute); - m128 compared = and128(shuffled, compare); -#ifdef DEBUG - printf("State: "); shufDumpMsk(s); printf("\n"); - printf("Permute: "); shufDumpMsk(permute); printf("\n"); - printf("Compare: "); shufDumpMsk(compare); printf("\n"); - printf("Shuffled: "); shufDumpMsk(shuffled); printf("\n"); - printf("Compared: "); shufDumpMsk(compared); printf("\n"); -#endif - u16 rv = ~cmpmsk8(compared, shuffled); - return (u32)rv; -} -#endif // NO_SSSE3 - -#endif // SHUFFLE_SSSE3_H diff --git a/src/util/simd_utils_ssse3.c b/src/util/simd_utils.c similarity index 98% rename from src/util/simd_utils_ssse3.c rename to src/util/simd_utils.c index 50cbe007..5f354270 100644 --- a/src/util/simd_utils_ssse3.c +++ b/src/util/simd_utils.c @@ -26,7 +26,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "simd_utils_ssse3.h" +#include "simd_utils.h" const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 99ad7ce5..4ac7b176 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -33,6 +33,10 @@ #ifndef SIMD_UTILS #define SIMD_UTILS +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + #include "config.h" #include // for memcpy @@ -93,6 +97,14 @@ #define assume_aligned(x, y) (x) #endif +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + static really_inline m128 ones128(void) { #if !defined(NO_ASM) // trick from Intel's optimization guide to generate all-ones. 
We have to @@ -160,7 +172,6 @@ static really_inline unsigned short cmpmsk8(m128 a, m128 b) { #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) - // We found that this generated better code with gcc-4.1 and with the default // tuning settings on gcc-4.4 than just using the _mm_set1_epi8() instrinsic. static really_inline m128 set16x8(u8 c) { @@ -318,6 +329,36 @@ char testbit128(const m128 *ptr, unsigned int n) { return !!(bytes[n / 8] & (1 << (n % 8))); } +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 vpshufb(m256 a, m256 b) { +#if defined(__AVX2__) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb(a.lo, b.lo); + rv.hi = pshufb(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb(in, shift_mask); +} + + /**** **** 256-bit Primitives ****/ @@ -735,6 +776,7 @@ m256 shift256Left8Bits(m256 a) { #define extractlow32from256(a) movd(cast256to128(a)) #define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b); #define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b); +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) #endif //AVX2 diff --git a/src/util/simd_utils_ssse3.h b/src/util/simd_utils_ssse3.h deleted file mode 100644 index 6854ade3..00000000 --- a/src/util/simd_utils_ssse3.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief SIMD primitives specifically for Intel SSSE3 platforms. 
- */ - -#ifndef SIMD_UTILS_SSSE3_H_E27DF795C9AA02 -#define SIMD_UTILS_SSSE3_H_E27DF795C9AA02 - -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif - -#include "simd_utils.h" -#include "ue2common.h" - -// we may already have x86intrin.h -#if !defined(USE_X86INTRIN_H) -#if defined(HAVE_C_INTRIN_H) -#include -#elif defined(HAVE_TMMINTRIN_H) -#include // SSSE3 intrinsics -#else -#define I_HAVE_BROKEN_INTRINSICS -#endif -#endif - - -#if !defined(I_HAVE_BROKEN_INTRINSICS) -// newish compilers get this right -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) -#else -// must be inline, even in weak-sauce debug builds. -// oldish compilers either don't have the intrinsic, or force one arg through memory -static really_really_inline -m128 palignr(m128 r, m128 l, const int offset) { - __asm__ ("palignr %2,%1,%0" : "+x"(r) : "x"(l), "i"(offset)); - return r; -} -#endif - - -static really_inline -m128 pshufb(m128 a, m128 b) { - m128 result; -#if !defined(I_HAVE_BROKEN_INTRINSICS) - result = _mm_shuffle_epi8(a, b); -#else - __asm__("pshufb\t%1,%0" : "=x"(result) : "xm"(b), "0"(a)); -#endif - return result; -} - -#ifdef __cplusplus -extern "C" { -#endif -extern const char vbs_mask_data[]; -#ifdef __cplusplus -} -#endif - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb(in, shift_mask); -} - -#if defined(__AVX2__) - -static really_inline -m256 vpshufb(m256 a, m256 b) { - return _mm256_shuffle_epi8(a, b); -} - -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define vpalignr(r, l, offset) ({ \ - m256 res = _mm256_alignr_epi8(r, l, offset); \ - res; \ -}) -#else -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) -#endif - -#else // not __AVX2__ - -static really_inline -m256 vpshufb(m256 a, m256 b) { - m256 rv; - rv.lo = pshufb(a.lo, b.lo); - rv.hi = pshufb(a.hi, b.hi); - return rv; -} - -/* palignr requires the offset to be an immediate, which we can do with a - * compound macro, otherwise we have to enumerate the offsets and hope the - * compiler can throw the rest away. 
*/ -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define vpalignr(r, l, offset) ({ \ - m256 res; \ - res.lo = palignr(r.lo, l.lo, offset); \ - res.hi = palignr(r.hi, l.hi, offset); \ - res; \ -}) -#else -#define VPALIGN_CASE(N) case N: \ - res.lo = palignr(r.lo, l.lo, N); \ - res.hi = palignr(r.hi, l.hi, N); \ - return res; -static really_inline -m256 vpalignr(m256 r, m256 l, const int offset) { - m256 res; - switch (offset) { - VPALIGN_CASE(0) - VPALIGN_CASE(1) - VPALIGN_CASE(2) - VPALIGN_CASE(3) - VPALIGN_CASE(4) - VPALIGN_CASE(5) - VPALIGN_CASE(6) - VPALIGN_CASE(7) - VPALIGN_CASE(8) - VPALIGN_CASE(9) - VPALIGN_CASE(10) - VPALIGN_CASE(11) - VPALIGN_CASE(12) - VPALIGN_CASE(13) - VPALIGN_CASE(14) - VPALIGN_CASE(15) - default: - assert(0); - return zeroes256(); - } -} -#undef VPALIGN_CASE -#endif -#endif // __AVX2__ - -#endif /* SIMD_UTILS_SSSE3_H_E27DF795C9AA02 */ - diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index 58e5a61f..614b641d 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,8 +31,7 @@ #include "gtest/gtest.h" #include "util/simd_utils.h" -#include "util/shuffle.h" -#include "util/shuffle_ssse3.h" +#include "nfa/limex_shuffle.h" namespace { @@ -50,34 +49,34 @@ Mask setbit(unsigned int bit) { return cf.simd; } -TEST(Shuffle, ShuffleDynamic32_1) { +TEST(Shuffle, PackedExtract32_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 32; i++) { // shuffle a single 1 bit to the front u32 mask = 1U << i; - EXPECT_EQ(1U, shuffleDynamic32(mask, mask)); - EXPECT_EQ(1U, shuffleDynamic32(~0U, mask)); + EXPECT_EQ(1U, packedExtract32(mask, mask)); + EXPECT_EQ(1U, packedExtract32(~0U, mask)); // we should get zero out of these cases - EXPECT_EQ(0U, shuffleDynamic32(0, mask)); - EXPECT_EQ(0U, shuffleDynamic32(~mask, mask)); + EXPECT_EQ(0U, packedExtract32(0, mask)); + EXPECT_EQ(0U, packedExtract32(~mask, mask)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 32); j++) { - EXPECT_EQ(0U, shuffleDynamic32((1U << j), mask)); + EXPECT_EQ(0U, packedExtract32((1U << j), mask)); } } } -TEST(Shuffle, ShuffleDynamic32_2) { +TEST(Shuffle, PackedExtract32_2) { // All 32 bits in mask are on u32 mask = ~0U; - EXPECT_EQ(0U, shuffleDynamic32(0, mask)); - EXPECT_EQ(mask, shuffleDynamic32(mask, mask)); + EXPECT_EQ(0U, packedExtract32(0, mask)); + EXPECT_EQ(mask, packedExtract32(mask, mask)); for (unsigned int i = 0; i < 32; i++) { - EXPECT_EQ(1U << i, shuffleDynamic32(1U << i, mask)); + EXPECT_EQ(1U << i, packedExtract32(1U << i, mask)); } } -TEST(Shuffle, ShuffleDynamic32_3) { +TEST(Shuffle, PackedExtract32_3) { // Try setting every second bit u32 mask = 0; for (unsigned int i = 0; i < 32; i += 2) { @@ -85,63 +84,63 @@ TEST(Shuffle, ShuffleDynamic32_3) { } // Test both cases (all even bits, all odd bits) - EXPECT_EQ((1U << 16) - 1, shuffleDynamic32(mask, mask)); - EXPECT_EQ((1U << 16) - 1, shuffleDynamic32(~mask, ~mask)); - EXPECT_EQ(0U, shuffleDynamic32(~mask, mask)); - EXPECT_EQ(0U, shuffleDynamic32(mask, ~mask)); + EXPECT_EQ((1U << 16) - 1, packedExtract32(mask, mask)); + EXPECT_EQ((1U << 16) - 1, packedExtract32(~mask, ~mask)); + EXPECT_EQ(0U, packedExtract32(~mask, mask)); + EXPECT_EQ(0U, packedExtract32(mask, ~mask)); for (unsigned int i = 0; i < 
32; i += 2) { - EXPECT_EQ(1U << (i/2), shuffleDynamic32(1U << i, mask)); - EXPECT_EQ(0U, shuffleDynamic32(1U << i, ~mask)); - EXPECT_EQ(1U << (i/2), shuffleDynamic32(1U << (i+1), ~mask)); - EXPECT_EQ(0U, shuffleDynamic32(1U << (i+1), mask)); + EXPECT_EQ(1U << (i/2), packedExtract32(1U << i, mask)); + EXPECT_EQ(0U, packedExtract32(1U << i, ~mask)); + EXPECT_EQ(1U << (i/2), packedExtract32(1U << (i+1), ~mask)); + EXPECT_EQ(0U, packedExtract32(1U << (i+1), mask)); } } -TEST(Shuffle, ShuffleDynamic64_1) { +TEST(Shuffle, PackedExtract64_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 64; i++) { // shuffle a single 1 bit to the front u64a mask = 1ULL << i; - EXPECT_EQ(1U, shuffleDynamic64(mask, mask)); - EXPECT_EQ(1U, shuffleDynamic64(~0ULL, mask)); + EXPECT_EQ(1U, packedExtract64(mask, mask)); + EXPECT_EQ(1U, packedExtract64(~0ULL, mask)); // we should get zero out of these cases - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0U, shuffleDynamic64(~mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0U, packedExtract64(~mask, mask)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 64); j++) { - EXPECT_EQ(0U, shuffleDynamic64((1ULL << j), mask)); + EXPECT_EQ(0U, packedExtract64((1ULL << j), mask)); } } } -TEST(Shuffle, ShuffleDynamic64_2) { +TEST(Shuffle, PackedExtract64_2) { // Fill first half of mask u64a mask = 0x00000000ffffffffULL; - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); for (unsigned int i = 0; i < 32; i++) { - EXPECT_EQ(1U << i, shuffleDynamic64(1ULL << i, mask)); + EXPECT_EQ(1U << i, packedExtract64(1ULL << i, mask)); } // Fill second half of mask mask = 0xffffffff00000000ULL; - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); for (unsigned int i = 32; i < 64; i++) { - EXPECT_EQ(1U << (i - 32), shuffleDynamic64(1ULL << i, mask)); + EXPECT_EQ(1U << (i - 32), packedExtract64(1ULL << i, mask)); } // Try one in the middle mask = 0x0000ffffffff0000ULL; - EXPECT_EQ(0U, shuffleDynamic64(0, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); + EXPECT_EQ(0U, packedExtract64(0, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); for (unsigned int i = 16; i < 48; i++) { - EXPECT_EQ(1U << (i - 16), shuffleDynamic64(1ULL << i, mask)); + EXPECT_EQ(1U << (i - 16), packedExtract64(1ULL << i, mask)); } } -TEST(Shuffle, ShuffleDynamic64_3) { +TEST(Shuffle, PackedExtract64_3) { // Try setting every second bit (note: 32 bits, the max we can shuffle) u64a mask = 0; for (unsigned int i = 0; i < 64; i += 2) { @@ -149,46 +148,69 @@ TEST(Shuffle, ShuffleDynamic64_3) { } // Test both cases (all even bits, all odd bits) - EXPECT_EQ(0xffffffffU, shuffleDynamic64(mask, mask)); - EXPECT_EQ(0xffffffffU, shuffleDynamic64(~mask, ~mask)); - EXPECT_EQ(0U, shuffleDynamic64(~mask, mask)); - EXPECT_EQ(0U, shuffleDynamic64(mask, ~mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask)); + EXPECT_EQ(0xffffffffU, packedExtract64(~mask, ~mask)); + EXPECT_EQ(0U, packedExtract64(~mask, mask)); + EXPECT_EQ(0U, packedExtract64(mask, ~mask)); for (unsigned int i = 0; i < 64; i += 2) { - EXPECT_EQ(1U << (i/2), shuffleDynamic64(1ULL << i, mask)); - EXPECT_EQ(0U, shuffleDynamic64(1ULL << i, ~mask)); - EXPECT_EQ(1U << 
(i/2), shuffleDynamic64(1ULL << (i+1), ~mask)); - EXPECT_EQ(0U, shuffleDynamic64(1ULL << (i+1), mask)); + EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << i, mask)); + EXPECT_EQ(0U, packedExtract64(1ULL << i, ~mask)); + EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << (i+1), ~mask)); + EXPECT_EQ(0U, packedExtract64(1ULL << (i+1), mask)); } } +template static -void build_pshufb_masks_onebit(unsigned int bit, m128 *permute, m128 *compare) { +void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) { + static_assert(sizeof(T) == sizeof(m128) || sizeof(T) == sizeof(m256), + "should be valid type"); // permute mask has 0x80 in all bytes except the one we care about memset(permute, 0x80, sizeof(*permute)); memset(compare, 0, sizeof(*compare)); char *pmsk = (char *)permute; char *cmsk = (char *)compare; - pmsk[0] = bit/8; - cmsk[0] = ~(1 << (bit % 8)); + u8 off = (bit >= 128) ? 0x10 : 0; + pmsk[off] = bit/8; + cmsk[off] = ~(1 << (bit % 8)); } -TEST(Shuffle, ShufflePshufb128_1) { +TEST(Shuffle, PackedExtract128_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 128; i++) { // shuffle a single 1 bit to the front m128 permute, compare; build_pshufb_masks_onebit(i, &permute, &compare); - EXPECT_EQ(1U, shufflePshufb128(setbit(i), permute, compare)); - EXPECT_EQ(1U, shufflePshufb128(ones128(), permute, compare)); + EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); // we should get zero out of these cases - EXPECT_EQ(0U, shufflePshufb128(zeroes128(), permute, compare)); - EXPECT_EQ(0U, shufflePshufb128(not128(setbit(i)), permute, compare)); + EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); + EXPECT_EQ(0U, packedExtract128(not128(setbit(i)), permute, compare)); // we should get zero out of all the other bit positions for (unsigned int j = 0; (j != i && j < 128); j++) { - EXPECT_EQ(0U, shufflePshufb128(setbit(j), permute, compare)); + EXPECT_EQ(0U, packedExtract128(setbit(j), permute, compare)); } } } +#if defined(__AVX2__) +TEST(Shuffle, PackedExtract256_1) { + // Try all possible one-bit masks + for (unsigned int i = 0; i < 256; i++) { + // shuffle a single 1 bit to the front + m256 permute, compare; + build_pshufb_masks_onebit(i, &permute, &compare); + EXPECT_EQ(1U, packedExtract256(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract256(ones256(), permute, compare)); + // we should get zero out of these cases + EXPECT_EQ(0U, packedExtract256(zeroes256(), permute, compare)); + EXPECT_EQ(0U, packedExtract256(not256(setbit(i)), permute, compare)); + // we should get zero out of all the other bit positions + for (unsigned int j = 0; (j != i && j < 256); j++) { + EXPECT_EQ(0U, packedExtract256(setbit(j), permute, compare)); + } + } +} +#endif } // namespace diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index de0f1eea..e95f7533 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -32,7 +32,6 @@ #include "util/alloc.h" #include "util/make_unique.h" #include "util/simd_utils.h" -#include "util/simd_utils_ssse3.h" using namespace std; using namespace ue2; From adf820bbbaf88fd9c59af1f45878f9fd36f8c644 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Mon, 6 Jun 2016 16:24:55 +1000 Subject: [PATCH 079/166] simd: simplify the set-all-ones util funcs Modern compilers (gcc, icc) get this right, with the benefit of removing our last use of inline asm in this file. 
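As a standalone sketch of the two idioms involved (illustrative only, not
part of the change itself; assumes SSE2 via <emmintrin.h>):

    #include <emmintrin.h>

    /* Portable intrinsic: recent gcc/icc lower this to a single
     * pcmpeqb, so no inline asm is required. */
    static inline __m128i ones_intrinsic(void) {
        return _mm_set1_epi8(0xFF);
    }

    /* The compare-equal trick from the old code path: x == x is true
     * in every byte lane, giving all-ones. */
    static inline __m128i ones_cmpeq(void) {
        __m128i z = _mm_setzero_si128();
        return _mm_cmpeq_epi8(z, z);
    }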
--- src/util/simd_utils.h | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 4ac7b176..90f3893d 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -88,10 +88,6 @@ # endif #endif -#ifdef _WIN32 -#define NO_ASM -#endif - // Fallback to identity case. #ifndef assume_aligned #define assume_aligned(x, y) (x) @@ -106,13 +102,12 @@ extern const char vbs_mask_data[]; #endif static really_inline m128 ones128(void) { -#if !defined(NO_ASM) - // trick from Intel's optimization guide to generate all-ones. We have to - // use inline asm, as there's no intrinsic for this yet. - m128 ret; - __asm__ ("pcmpeqb %0,%0" : "=x"(ret)); - return ret; +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); #else + /* trick from Intel's optimization guide to generate all-ones. + * ICC converts this to the single cmpeq instruction */ return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); #endif } @@ -172,19 +167,8 @@ static really_inline unsigned short cmpmsk8(m128 a, m128 b) { #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) -// We found that this generated better code with gcc-4.1 and with the default -// tuning settings on gcc-4.4 than just using the _mm_set1_epi8() instrinsic. static really_inline m128 set16x8(u8 c) { -#if !defined(__AVX2__) - m128 a = _mm_cvtsi32_si128((int)c); - a = _mm_unpacklo_epi8(a, a); - a = _mm_unpacklo_epi8(a, a); - a = _mm_shuffle_epi32(a, 0); - return a; -#else - // uses a broadcast for much win return _mm_set1_epi8(c); -#endif } static really_inline u32 movd(const m128 in) { @@ -369,8 +353,7 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { static really_inline m256 set32x8(u32 in) { - m128 a = _mm_cvtsi32_si128(in); - return _mm256_broadcastb_epi8(a); + return _mm256_set1_epi8(in); } #define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) @@ -423,12 +406,7 @@ static really_inline m256 zeroes256(void) { static really_inline m256 ones256(void) { #if defined(__AVX2__) - m256 rv; -#if defined(NO_ASM) - rv = eq256(zeroes256(), zeroes256()); -#else - __asm__ ("vpcmpeqb %0,%0,%0" : "=x"(rv)); -#endif + m256 rv = _mm256_set1_epi8(0xFF); #else m256 rv = {ones128(), ones128()}; #endif From 1b3e795fc908a4b804a0af6beed15843a4dc8a29 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 7 Jun 2016 15:44:39 +1000 Subject: [PATCH 080/166] teddy: we only need the upper lane Just use an extract, no need to shuffle first. --- src/fdr/teddy_avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index f282c505..ef06813c 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -204,7 +204,7 @@ do { \ if (unlikely(isnonzero256(var))) { \ u32 arrCnt = 0; \ m128 lo = cast256to128(var); \ - m128 hi = cast256to128(swap128in256(var)); \ + m128 hi = movdq_hi(var); \ bit_array_fast_teddy(lo, bitArr, &arrCnt, offset); \ bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2); \ for (u32 i = 0; i < arrCnt; i++) { \ From 0722b5db5b1e1ce51a8ff9c690d7e83f63586b8d Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 7 Jun 2016 15:45:53 +1000 Subject: [PATCH 081/166] Remove GCC-style compound statements These do not appear to give us benefits over inlining on recent compilers. 
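For reference, a minimal sketch of the two equivalent forms being
consolidated (illustrative only; m256, and128 and really_inline are the
existing helpers from simd_utils.h):

    /* GNU statement-expression macro, the style this patch removes: */
    #define AND256_STMT(a, b) ({                \
        m256 rv_;                               \
        rv_.lo = and128((a).lo, (b).lo);        \
        rv_.hi = and128((a).hi, (b).hi);        \
        rv_;                                    \
    })

    /* Plain inline function; recent compilers emit identical code: */
    static really_inline m256 and256_fn(m256 a, m256 b) {
        m256 rv;
        rv.lo = and128(a.lo, b.lo);
        rv.hi = and128(a.hi, b.hi);
        return rv;
    }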
--- src/util/simd_utils.h | 177 ++---------------------------------------- 1 file changed, 6 insertions(+), 171 deletions(-) diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 90f3893d..90a8aba4 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -72,10 +72,6 @@ #include "ue2common.h" #include "simd_types.h" -#if defined(__GNUC__) -#define USE_GCC_COMPOUND_STATEMENTS -#endif - // Define a common assume_aligned using an appropriate compiler built-in, if // it's available. Note that we need to handle C or C++ compilation. #ifdef __cplusplus @@ -417,13 +413,6 @@ static really_inline m256 ones256(void) { static really_inline m256 and256(m256 a, m256 b) { return _mm256_and_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define and256(a, b) ({ \ - m256 rv_and256; \ - rv_and256.lo = and128((a).lo, (b).lo); \ - rv_and256.hi = and128((a).hi, (b).hi); \ - rv_and256; \ -}) #else static really_inline m256 and256(m256 a, m256 b) { m256 rv; @@ -437,13 +426,6 @@ static really_inline m256 and256(m256 a, m256 b) { static really_inline m256 or256(m256 a, m256 b) { return _mm256_or_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define or256(a, b) ({ \ - m256 rv_or256; \ - rv_or256.lo = or128((a).lo, (b).lo); \ - rv_or256.hi = or128((a).hi, (b).hi); \ - rv_or256; \ -}) #else static really_inline m256 or256(m256 a, m256 b) { m256 rv; @@ -457,13 +439,6 @@ static really_inline m256 or256(m256 a, m256 b) { static really_inline m256 xor256(m256 a, m256 b) { return _mm256_xor_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define xor256(a, b) ({ \ - m256 rv_xor256; \ - rv_xor256.lo = xor128((a).lo, (b).lo); \ - rv_xor256.hi = xor128((a).hi, (b).hi); \ - rv_xor256; \ -}) #else static really_inline m256 xor256(m256 a, m256 b) { m256 rv; @@ -477,13 +452,6 @@ static really_inline m256 xor256(m256 a, m256 b) { static really_inline m256 not256(m256 a) { return _mm256_xor_si256(a, ones256()); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define not256(a) ({ \ - m256 rv_not256; \ - rv_not256.lo = not128((a).lo); \ - rv_not256.hi = not128((a).hi); \ - rv_not256; \ -}) #else static really_inline m256 not256(m256 a) { m256 rv; @@ -497,13 +465,6 @@ static really_inline m256 not256(m256 a) { static really_inline m256 andnot256(m256 a, m256 b) { return _mm256_andnot_si256(a, b); } -#elif defined(USE_GCC_COMPOUND_STATEMENTS) -#define andnot256(a, b) ({ \ - m256 rv_andnot256; \ - rv_andnot256.lo = andnot128((a).lo, (b).lo); \ - rv_andnot256.hi = andnot128((a).hi, (b).hi); \ - rv_andnot256; \ -}) #else static really_inline m256 andnot256(m256 a, m256 b) { m256 rv; @@ -513,19 +474,11 @@ static really_inline m256 andnot256(m256 a, m256 b) { } #endif -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD (using a GNU C extension). 
+// The shift amount is an immediate #if defined(__AVX2__) #define shift256(a, b) _mm256_slli_epi64((a), (b)) -#elif defined(__GNUC__) -#define shift256(a, b) ({ \ - m256 rv_shift256; \ - rv_shift256.lo = shift128(a.lo, b); \ - rv_shift256.hi = shift128(a.hi, b); \ - rv_shift256; \ -}) #else -static really_inline m256 shift256(m256 a, unsigned b) { +static really_really_inline m256 shift256(m256 a, unsigned b) { m256 rv; rv.lo = shift128(a.lo, b); rv.hi = shift128(a.hi, b); @@ -762,15 +715,6 @@ m256 shift256Left8Bits(m256 a) { **** 384-bit Primitives ****/ -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define and384(a, b) ({ \ - m384 rv_and384; \ - rv_and384.lo = and128((a).lo, (b).lo); \ - rv_and384.mid = and128((a).mid, (b).mid); \ - rv_and384.hi = and128((a).hi, (b).hi); \ - rv_and384; \ -}) -#else static really_inline m384 and384(m384 a, m384 b) { m384 rv; rv.lo = and128(a.lo, b.lo); @@ -778,17 +722,7 @@ static really_inline m384 and384(m384 a, m384 b) { rv.hi = and128(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define or384(a, b) ({ \ - m384 rv_or384; \ - rv_or384.lo = or128((a).lo, (b).lo); \ - rv_or384.mid = or128((a).mid, (b).mid); \ - rv_or384.hi = or128((a).hi, (b).hi); \ - rv_or384; \ -}) -#else static really_inline m384 or384(m384 a, m384 b) { m384 rv; rv.lo = or128(a.lo, b.lo); @@ -796,17 +730,7 @@ static really_inline m384 or384(m384 a, m384 b) { rv.hi = or128(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define xor384(a, b) ({ \ - m384 rv_xor384; \ - rv_xor384.lo = xor128((a).lo, (b).lo); \ - rv_xor384.mid = xor128((a).mid, (b).mid); \ - rv_xor384.hi = xor128((a).hi, (b).hi); \ - rv_xor384; \ -}) -#else static really_inline m384 xor384(m384 a, m384 b) { m384 rv; rv.lo = xor128(a.lo, b.lo); @@ -814,17 +738,6 @@ static really_inline m384 xor384(m384 a, m384 b) { rv.hi = xor128(a.hi, b.hi); return rv; } -#endif - -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define not384(a) ({ \ - m384 rv_not384; \ - rv_not384.lo = not128((a).lo); \ - rv_not384.mid = not128((a).mid); \ - rv_not384.hi = not128((a).hi); \ - rv_not384; \ -}) -#else static really_inline m384 not384(m384 a) { m384 rv; rv.lo = not128(a.lo); @@ -832,17 +745,6 @@ static really_inline m384 not384(m384 a) { rv.hi = not128(a.hi); return rv; } -#endif - -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define andnot384(a, b) ({ \ - m384 rv_andnot384; \ - rv_andnot384.lo = andnot128((a).lo, (b).lo); \ - rv_andnot384.mid = andnot128((a).mid, (b).mid); \ - rv_andnot384.hi = andnot128((a).hi, (b).hi); \ - rv_andnot384; \ -}) -#else static really_inline m384 andnot384(m384 a, m384 b) { m384 rv; rv.lo = andnot128(a.lo, b.lo); @@ -850,27 +752,15 @@ static really_inline m384 andnot384(m384 a, m384 b) { rv.hi = andnot128(a.hi, b.hi); return rv; } -#endif -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD (using a GNU C extension). 
-#if defined(__GNUC__) -#define shift384(a, b) ({ \ - m384 rv; \ - rv.lo = shift128(a.lo, b); \ - rv.mid = shift128(a.mid, b); \ - rv.hi = shift128(a.hi, b); \ - rv; \ -}) -#else -static really_inline m384 shift384(m384 a, unsigned b) { +// The shift amount is an immediate +static really_really_inline m384 shift384(m384 a, unsigned b) { m384 rv; rv.lo = shift128(a.lo, b); rv.mid = shift128(a.mid, b); rv.hi = shift128(a.hi, b); return rv; } -#endif static really_inline m384 zeroes384(void) { m384 rv = {zeroes128(), zeroes128(), zeroes128()}; @@ -1000,103 +890,48 @@ char testbit384(const m384 *ptr, unsigned int n) { **** 512-bit Primitives ****/ -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define and512(a, b) ({ \ - m512 rv_and512; \ - rv_and512.lo = and256((a).lo, (b).lo); \ - rv_and512.hi = and256((a).hi, (b).hi); \ - rv_and512; \ -}) -#else static really_inline m512 and512(m512 a, m512 b) { m512 rv; rv.lo = and256(a.lo, b.lo); rv.hi = and256(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define or512(a, b) ({ \ - m512 rv_or512; \ - rv_or512.lo = or256((a).lo, (b).lo); \ - rv_or512.hi = or256((a).hi, (b).hi); \ - rv_or512; \ -}) -#else static really_inline m512 or512(m512 a, m512 b) { m512 rv; rv.lo = or256(a.lo, b.lo); rv.hi = or256(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define xor512(a, b) ({ \ - m512 rv_xor512; \ - rv_xor512.lo = xor256((a).lo, (b).lo); \ - rv_xor512.hi = xor256((a).hi, (b).hi); \ - rv_xor512; \ -}) -#else static really_inline m512 xor512(m512 a, m512 b) { m512 rv; rv.lo = xor256(a.lo, b.lo); rv.hi = xor256(a.hi, b.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define not512(a) ({ \ - m512 rv_not512; \ - rv_not512.lo = not256((a).lo); \ - rv_not512.hi = not256((a).hi); \ - rv_not512; \ -}) -#else static really_inline m512 not512(m512 a) { m512 rv; rv.lo = not256(a.lo); rv.hi = not256(a.hi); return rv; } -#endif -#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define andnot512(a, b) ({ \ - m512 rv_andnot512; \ - rv_andnot512.lo = andnot256((a).lo, (b).lo); \ - rv_andnot512.hi = andnot256((a).hi, (b).hi); \ - rv_andnot512; \ -}) -#else static really_inline m512 andnot512(m512 a, m512 b) { m512 rv; rv.lo = andnot256(a.lo, b.lo); rv.hi = andnot256(a.hi, b.hi); return rv; } -#endif -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD (using a GNU C extension). 
-#if defined(USE_GCC_COMPOUND_STATEMENTS) -#define shift512(a, b) ({ \ - m512 rv_shift512; \ - rv_shift512.lo = shift256(a.lo, b); \ - rv_shift512.hi = shift256(a.hi, b); \ - rv_shift512; \ -}) -#else -static really_inline m512 shift512(m512 a, unsigned b) { +// The shift amount is an immediate +static really_really_inline m512 shift512(m512 a, unsigned b) { m512 rv; rv.lo = shift256(a.lo, b); rv.hi = shift256(a.hi, b); return rv; } -#endif static really_inline m512 zeroes512(void) { m512 rv = {zeroes256(), zeroes256()}; From 9c915cc936c5ad0e1f285cc8913e6e5a8fdab06e Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 14 Jun 2016 11:21:48 +1000 Subject: [PATCH 082/166] remove only use of cmpmsk8 and unused cmpmsk16 --- src/nfa/limex_shuffle.h | 2 +- src/util/simd_utils.h | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index 40900a65..e45e4331 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -91,7 +91,7 @@ static really_inline u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { m128 shuffled = pshufb(s, permute); m128 compared = and128(shuffled, compare); - u16 rv = ~cmpmsk8(compared, shuffled); + u16 rv = ~movemask128(eq128(compared, shuffled)); return (u32)rv; } diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 90a8aba4..107b22af 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -152,12 +152,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { // forward decl static really_inline m128 xor128(m128 a, m128 b); -/** \brief Return msb mask of packet 8 bit compare equal */ -static really_inline unsigned short cmpmsk8(m128 a, m128 b) { - m128 tmp = _mm_cmpeq_epi8(a, b); - return _mm_movemask_epi8(tmp); -} - #define shift2x64(a, b) _mm_slli_epi64((a), (b)) #define rshift2x64(a, b) _mm_srli_epi64((a), (b)) #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) @@ -355,10 +349,6 @@ m256 set32x8(u32 in) { #define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) #define movemask256(a) ((u32)_mm256_movemask_epi8((a))) -static really_inline u32 cmpmsk16(m256 a, m256 b) { - m256 tmp = _mm256_cmpeq_epi8(a, b); - return _mm256_movemask_epi8(tmp); -} static really_inline m256 set2x128(m128 a) { return _mm256_broadcastsi128_si256(a); From c76ff285e7edf396b4cc033f1a96155d5342d97a Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 14 Jun 2016 11:28:00 +1000 Subject: [PATCH 083/166] remove unnecessary function proto --- src/util/simd_utils.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 107b22af..d3dba9a3 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -149,9 +149,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { #endif } -// forward decl -static really_inline m128 xor128(m128 a, m128 b); - #define shift2x64(a, b) _mm_slli_epi64((a), (b)) #define rshift2x64(a, b) _mm_srli_epi64((a), (b)) #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) From e3d416a6ea57c307c43318cc1afd8bb423ef0c60 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Wed, 15 Jun 2016 11:02:42 +1000 Subject: [PATCH 084/166] Apply some consistency to the names we give shifts --- src/fdr/fdr.c | 58 +++++++++++++------------- src/fdr/teddy.c | 16 +++---- src/fdr/teddy_avx2.c | 18 ++++---- src/hwlm/noodle_engine_sse.c | 6 ++- src/nfa/limex_runtime.h | 2 +- src/nfa/shufti.c | 5 +-- src/nfa/shufti_common.h | 4 +- src/nfa/truffle_common.h | 6 +-- src/nfa/vermicelli_sse.h | 27 ++++++------ src/rose/counting_miracle.h | 2 +- 
src/util/simd_utils.h | 81 +++++++++++------------------------- src/util/uniform_ops.h | 14 +++---- unit/internal/simd_utils.cpp | 44 ++++++++++---------- 13 files changed, 127 insertions(+), 156 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index aa9d1c1d..c79db037 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -131,7 +131,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft, u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); tmp &= fdr->domainMask; s = *((const m128 *)ft + tmp); - s = shiftRight8Bits(s); + s = rshiftbyte_m128(s, 1); } else { s = fdr->start; } @@ -185,20 +185,20 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, m128 st14 = *(const m128 *)(ft + v14*8); m128 st15 = *(const m128 *)(ft + v15*8); - st1 = byteShiftLeft128(st1, 1); - st2 = byteShiftLeft128(st2, 2); - st3 = byteShiftLeft128(st3, 3); - st4 = byteShiftLeft128(st4, 4); - st5 = byteShiftLeft128(st5, 5); - st6 = byteShiftLeft128(st6, 6); - st7 = byteShiftLeft128(st7, 7); - st9 = byteShiftLeft128(st9, 1); - st10 = byteShiftLeft128(st10, 2); - st11 = byteShiftLeft128(st11, 3); - st12 = byteShiftLeft128(st12, 4); - st13 = byteShiftLeft128(st13, 5); - st14 = byteShiftLeft128(st14, 6); - st15 = byteShiftLeft128(st15, 7); + st1 = lshiftbyte_m128(st1, 1); + st2 = lshiftbyte_m128(st2, 2); + st3 = lshiftbyte_m128(st3, 3); + st4 = lshiftbyte_m128(st4, 4); + st5 = lshiftbyte_m128(st5, 5); + st6 = lshiftbyte_m128(st6, 6); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st10 = lshiftbyte_m128(st10, 2); + st11 = lshiftbyte_m128(st11, 3); + st12 = lshiftbyte_m128(st12, 4); + st13 = lshiftbyte_m128(st13, 5); + st14 = lshiftbyte_m128(st14, 6); + st15 = lshiftbyte_m128(st15, 7); *s = or128(*s, st0); *s = or128(*s, st1); @@ -209,7 +209,7 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, *s = or128(*s, st6); *s = or128(*s, st7); *conf0 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; *s = or128(*s, st8); @@ -221,7 +221,7 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, *s = or128(*s, st14); *s = or128(*s, st15); *conf8 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; } @@ -252,19 +252,19 @@ void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, m128 st12 = *(const m128 *)(ft + v12*8); m128 st14 = *(const m128 *)(ft + v14*8); - st2 = byteShiftLeft128(st2, 2); - st4 = byteShiftLeft128(st4, 4); - st6 = byteShiftLeft128(st6, 6); - st10 = byteShiftLeft128(st10, 2); - st12 = byteShiftLeft128(st12, 4); - st14 = byteShiftLeft128(st14, 6); + st2 = lshiftbyte_m128(st2, 2); + st4 = lshiftbyte_m128(st4, 4); + st6 = lshiftbyte_m128(st6, 6); + st10 = lshiftbyte_m128(st10, 2); + st12 = lshiftbyte_m128(st12, 4); + st14 = lshiftbyte_m128(st14, 6); *s = or128(*s, st0); *s = or128(*s, st2); *s = or128(*s, st4); *s = or128(*s, st6); *conf0 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; *s = or128(*s, st8); @@ -272,7 +272,7 @@ void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, *s = or128(*s, st12); *s = or128(*s, st14); *conf8 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; } @@ -295,19 +295,19 @@ void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, m128 st8 = *(const m128 *)(ft + v8*8); m128 st12 = *(const m128 *)(ft + v12*8); - st4 = 
byteShiftLeft128(st4, 4); - st12 = byteShiftLeft128(st12, 4); + st4 = lshiftbyte_m128(st4, 4); + st12 = lshiftbyte_m128(st12, 4); *s = or128(*s, st0); *s = or128(*s, st4); *conf0 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; *s = or128(*s, st8); *s = or128(*s, st12); *conf8 = movq(*s); - *s = byteShiftRight128(*s, 8); + *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; } diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 4ff0b18e..2406a167 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -79,7 +79,7 @@ const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { do { \ if (unlikely(isnonzero128(var))) { \ u64a lo = movq(var); \ - u64a hi = movq(byteShiftRight128(var, 8)); \ + u64a hi = movq(rshiftbyte_m128(var, 8)); \ if (unlikely(lo)) { \ conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \ control, &last_match); \ @@ -97,9 +97,9 @@ do { \ do { \ if (unlikely(isnonzero128(var))) { \ u32 part1 = movd(var); \ - u32 part2 = movd(byteShiftRight128(var, 4)); \ - u32 part3 = movd(byteShiftRight128(var, 8)); \ - u32 part4 = movd(byteShiftRight128(var, 12)); \ + u32 part2 = movd(rshiftbyte_m128(var, 4)); \ + u32 part3 = movd(rshiftbyte_m128(var, 8)); \ + u32 part4 = movd(rshiftbyte_m128(var, 12)); \ if (unlikely(part1)) { \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ control, &last_match); \ @@ -128,7 +128,7 @@ static really_inline m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); + m128 hi = and128(rshift64_m128(val, 4), mask); return and128(and128(pshufb(maskBase[0*2], lo), pshufb(maskBase[0*2+1], hi)), p_mask); } @@ -138,7 +138,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); + m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m1(maskBase, p_mask, val); m128 res_1 = and128(pshufb(maskBase[1*2], lo), @@ -153,7 +153,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 p_mask, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); + m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val); m128 res_2 = and128(pshufb(maskBase[2*2], lo), @@ -168,7 +168,7 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 *old_3, m128 p_mask, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); - m128 hi = and128(rshift2x64(val, 4), mask); + m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val); m128 res_3 = and128(pshufb(maskBase[3*2], lo), diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index ef06813c..5ea4e368 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -371,7 +371,7 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { 64 * (offset); *arrCnt += 1; } - u64a part_1 = movq(byteShiftRight128(var, 8)); + u64a part_1 = movq(rshiftbyte_m128(var, 8)); while (unlikely(part_1)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + 64 * (offset + 1); @@ -384,19 +384,19 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { 32 * (offset * 2); *arrCnt += 1; } - u32 part_1 = movd(byteShiftRight128(var, 4)); + u32 part_1 = movd(rshiftbyte_m128(var, 4)); while 
(unlikely(part_1)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + 32 * (offset * 2 + 1); *arrCnt += 1; } - u32 part_2 = movd(byteShiftRight128(var, 8)); + u32 part_2 = movd(rshiftbyte_m128(var, 8)); while (unlikely(part_2)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) + 32 * (offset * 2 + 2); *arrCnt += 1; } - u32 part_3 = movd(byteShiftRight128(var, 12)); + u32 part_3 = movd(rshiftbyte_m128(var, 12)); while (unlikely(part_3)) { bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) + 32 * (offset * 2 + 3); @@ -410,7 +410,7 @@ static really_inline m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); + m256 hi = and256(rshift64_m256(val, 4), mask); return and256(and256(vpshufb(maskBase[0*2], lo), vpshufb(maskBase[0*2+1], hi)), p_mask); } @@ -420,7 +420,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); + m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val); m256 res_1 = and256(vpshufb(maskBase[1*2], lo), @@ -435,7 +435,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 p_mask, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); + m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val); m256 res_2 = and256(vpshufb(maskBase[2*2], lo), @@ -450,7 +450,7 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 *old_3, m256 p_mask, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); + m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val); m256 res_3 = and256(vpshufb(maskBase[3*2], lo), @@ -464,7 +464,7 @@ static really_inline m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi, m256 p_mask) { m256 lo = and256(val, mask); - m256 hi = and256(rshift4x64(val, 4), mask); + m256 hi = and256(rshift64_m256(val, 4), mask); m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); return and256(res, p_mask); } diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index b3673246..40575409 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -115,7 +115,8 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, v = and128(v, caseMask); } - u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v))); + u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); // mask out where we can't match u32 mask = (0xFFFF >> (16 - l)); @@ -142,7 +143,8 @@ hwlm_error_t scanDoubleUnaligned(const u8 *buf, size_t len, size_t offset, v = and128(v, caseMask); } - u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v))); + u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); // mask out where we can't match u32 buf_off = start - offset; diff --git a/src/nfa/limex_runtime.h b/src/nfa/limex_runtime.h index 70601e27..e0c182fc 100644 --- a/src/nfa/limex_runtime.h +++ b/src/nfa/limex_runtime.h @@ -75,7 +75,7 @@ struct proto_cache { // Shift macros for Limited NFAs. Defined in terms of uniform ops. 
// LimExNFAxxx ptr in 'limex' and the current state in 's' #define NFA_EXEC_LIM_SHIFT(nels_type, nels_i) \ - (JOIN(shift_, nels_type)( \ + (JOIN(lshift_, nels_type)( \ JOIN(and_, nels_type)(s, \ JOIN(load_, nels_type)(&limex->shift[nels_i])), \ limex->shiftAmount[nels_i])) diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 5aba9847..903e04da 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -40,7 +40,6 @@ #include "shufti_common.h" - /** \brief Naive byte-by-byte implementation. */ static really_inline const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, @@ -234,7 +233,7 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, m128 c2_lo = pshufb(mask2_lo, chars_lo); m128 c2_hi = pshufb(mask2_hi, chars_hi); - m128 t2 = or128(t, shiftRight8Bits(or128(c2_lo, c2_hi))); + m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); #ifdef DEBUG DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); @@ -471,7 +470,7 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, m256 c2_lo = vpshufb(mask2_lo, chars_lo); m256 c2_hi = vpshufb(mask2_hi, chars_hi); - m256 t2 = or256(t, shift256Right8Bits(or256(c2_lo, c2_hi))); + m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); #ifdef DEBUG DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); diff --git a/src/nfa/shufti_common.h b/src/nfa/shufti_common.h index 84835665..e63ad27a 100644 --- a/src/nfa/shufti_common.h +++ b/src/nfa/shufti_common.h @@ -93,7 +93,7 @@ DUMP_MSK(128) #endif #define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, @@ -119,7 +119,7 @@ DUMP_MSK(256) #endif #define GET_LO_4(chars) and256(chars, low4bits) -#define GET_HI_4(chars) rshift4x64(andnot256(low4bits, chars), 4) +#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) static really_inline u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, diff --git a/src/nfa/truffle_common.h b/src/nfa/truffle_common.h index 593a605e..7368e550 100644 --- a/src/nfa/truffle_common.h +++ b/src/nfa/truffle_common.h @@ -48,7 +48,6 @@ const u8 *firstMatch(const u8 *buf, u32 z) { return NULL; // no match } -#define shift128r(a, b) _mm_srli_epi64((a), (b)) static really_inline u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { @@ -59,7 +58,7 @@ u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { m128 shuf1 = pshufb(shuf_mask_lo_highclear, v); m128 t1 = xor128(v, highconst); m128 shuf2 = pshufb(shuf_mask_lo_highset, t1); - m128 t2 = andnot128(highconst, shift128r(v, 4)); + m128 t2 = andnot128(highconst, rshift64_m128(v, 4)); m128 shuf3 = pshufb(shuf_mask_hi, t2); m128 tmp = and128(or128(shuf1, shuf2), shuf3); m128 tmp2 = eq128(tmp, zeroes128()); @@ -102,7 +101,6 @@ const u8 *firstMatch(const u8 *buf, u32 z) { return NULL; // no match } -#define shift256r(a, b) _mm256_srli_epi64((a), (b)) static really_inline u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { @@ -113,7 +111,7 @@ u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v); m256 t1 = xor256(v, highconst); m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1); - m256 t2 = andnot256(highconst, shift256r(v, 4)); + m256 t2 = andnot256(highconst, rshift64_m256(v, 4)); 
m256 shuf3 = vpshufb(shuf_mask_hi, t2); m256 tmp = and256(or256(shuf1, shuf2), shuf3); m256 tmp2 = eq256(tmp, zeroes256()); diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 1883a44c..0749470f 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -138,7 +138,7 @@ const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, for (; buf + 16 < buf_end; buf += 16) { m128 data = load128(buf); u32 z = movemask128(and128(eq128(chars1, data), - shiftRight8Bits(eq128(chars2, data)))); + rshiftbyte_m128(eq128(chars2, data), 1))); if (buf[15] == c1 && buf[16] == c2) { z |= (1 << 15); } @@ -161,7 +161,7 @@ const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, m128 data = load128(buf); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars1, v), - shiftRight8Bits(eq128(chars2, v)))); + rshiftbyte_m128(eq128(chars2, v), 1))); if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { z |= (1 << 15); } @@ -182,8 +182,10 @@ const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, for (; buf + 16 < buf_end; buf += 16) { m128 data = load128(buf); - u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)), - shiftRight8Bits(eq128(chars2, and128(data, mask2))))); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { z |= (1 << 15); } @@ -201,7 +203,7 @@ static really_inline const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); // unaligned u32 z = movemask128(and128(eq128(chars1, data), - shiftRight8Bits(eq128(chars2, data)))); + rshiftbyte_m128(eq128(chars2, data), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -219,7 +221,7 @@ const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); // unaligned m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars1, v), - shiftRight8Bits(eq128(chars2, v)))); + rshiftbyte_m128(eq128(chars2, v), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -234,8 +236,9 @@ static really_inline const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, m128 mask1, m128 mask2, const u8 *buf) { m128 data = loadu128(buf); // unaligned - u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)), - shiftRight8Bits(eq128(chars2, and128(data, mask2))))); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -324,7 +327,7 @@ const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, for (; buf + 16 < buf_end; buf_end -= 16) { m128 data = load128(buf_end - 16); u32 z = movemask128(and128(eq128(chars2, data), - shiftLeft8Bits(eq128(chars1, data)))); + lshiftbyte_m128(eq128(chars1, data), 1))); if (buf_end[-17] == c1 && buf_end[-16] == c2) { z |= 1; } @@ -345,7 +348,7 @@ const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, m128 data = load128(buf_end - 16); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars2, v), - shiftLeft8Bits(eq128(chars1, v)))); + lshiftbyte_m128(eq128(chars1, v), 1))); if ((buf_end[-17] & CASE_CLEAR) == c1 && (buf_end[-16] & CASE_CLEAR) == c2) { z |= 1; @@ 
-362,7 +365,7 @@ static really_inline const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); u32 z = movemask128(and128(eq128(chars2, data), - shiftLeft8Bits(eq128(chars1, data)))); + lshiftbyte_m128(eq128(chars1, data), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { @@ -380,7 +383,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { m128 data = loadu128(buf); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars2, v), - shiftLeft8Bits(eq128(chars1, v)))); + lshiftbyte_m128(eq128(chars1, v), 1))); /* no fixup of the boundary required - the aligned run will pick it up */ if (unlikely(z)) { return lastMatchOffset(buf + 16, z); diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index cd84d052..76db5a77 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -82,7 +82,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, } #define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) static really_inline u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index d3dba9a3..5f557ba5 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -149,8 +149,8 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { #endif } -#define shift2x64(a, b) _mm_slli_epi64((a), (b)) -#define rshift2x64(a, b) _mm_srli_epi64((a), (b)) +#define lshift64_m128(a, b) _mm_slli_epi64((a), (b)) +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) @@ -172,16 +172,8 @@ static really_inline u64a movq(const m128 in) { #endif } -static really_inline m128 shiftRight8Bits(m128 a) { - return _mm_srli_si128(a,1); -} - -static really_inline m128 shiftLeft8Bits(m128 a) { - return _mm_slli_si128(a,1); -} - -#define byteShiftRight128(a, count_immed) _mm_srli_si128(a, count_immed) -#define byteShiftLeft128(a, count_immed) _mm_slli_si128(a, count_immed) +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) #if !defined(__AVX2__) // TODO: this entire file needs restructuring - this carveout is awful @@ -191,8 +183,8 @@ static really_inline m128 shiftLeft8Bits(m128 a) { #define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) #define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2) #else -#define extract32from256(a, imm) movd(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8)) -#define extract64from256(a, imm) movq(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8)) +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8)) #endif #endif // !AVX2 @@ -213,10 +205,6 @@ static really_inline m128 andnot128(m128 a, m128 b) { return _mm_andnot_si128(a, b); } -// The shift amount is an immediate, so we define these operations as macros on -// Intel SIMD. 
-#define shift128(a, b) _mm_slli_epi64((a), (b)) - // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); @@ -335,8 +323,8 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { ****/ #if defined(__AVX2__) -#define shift4x64(a, b) _mm256_slli_epi64((a), (b)) -#define rshift4x64(a, b) _mm256_srli_epi64((a), (b)) +#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b)) +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) static really_inline m256 set32x8(u32 in) { @@ -354,18 +342,18 @@ m256 set2x128(m128 a) { #else static really_inline -m256 shift4x64(m256 a, int b) { +m256 lshift64_m256(m256 a, int b) { m256 rv = a; - rv.lo = shift2x64(rv.lo, b); - rv.hi = shift2x64(rv.hi, b); + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); return rv; } static really_inline -m256 rshift4x64(m256 a, int b) { +m256 rshift64_m256(m256 a, int b) { m256 rv = a; - rv.lo = rshift2x64(rv.lo, b); - rv.hi = rshift2x64(rv.hi, b); + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); return rv; } static really_inline @@ -461,18 +449,6 @@ static really_inline m256 andnot256(m256 a, m256 b) { } #endif -// The shift amount is an immediate -#if defined(__AVX2__) -#define shift256(a, b) _mm256_slli_epi64((a), (b)) -#else -static really_really_inline m256 shift256(m256 a, unsigned b) { - m256 rv; - rv.lo = shift128(a.lo, b); - rv.hi = shift128(a.hi, b); - return rv; -} -#endif - static really_inline int diff256(m256 a, m256 b) { #if defined(__AVX2__) return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); @@ -673,21 +649,12 @@ m128 movdq_lo(m256 x) { return _mm256_extracti128_si256(x, 0); } -static really_inline -m256 shift256Right8Bits(m256 a) { - return _mm256_srli_si256(a, 1); -} - -static really_inline -m256 shift256Left8Bits(m256 a) { - return _mm256_slli_si256(a, 1); -} #define cast256to128(a) _mm256_castsi256_si128(a) #define cast128to256(a) _mm256_castsi128_si256(a) #define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) #define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define byteShiftRight256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define byteShiftLeft256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) #define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) #define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) #define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) @@ -741,11 +708,12 @@ static really_inline m384 andnot384(m384 a, m384 b) { } // The shift amount is an immediate -static really_really_inline m384 shift384(m384 a, unsigned b) { +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { m384 rv; - rv.lo = shift128(a.lo, b); - rv.mid = shift128(a.mid, b); - rv.hi = shift128(a.hi, b); + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); return rv; } @@ -913,10 +881,11 @@ static really_inline m512 andnot512(m512 a, m512 b) { } // The shift amount is an immediate -static really_really_inline m512 shift512(m512 a, unsigned b) { +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { m512 rv; - rv.lo = shift256(a.lo, b); - rv.hi = shift256(a.hi, b); + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); return rv; } diff --git 
a/src/util/uniform_ops.h b/src/util/uniform_ops.h index 45ea4108..0619c7e4 100644 --- a/src/util/uniform_ops.h +++ b/src/util/uniform_ops.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -125,12 +125,12 @@ #define andnot_m384(a, b) (andnot384(a, b)) #define andnot_m512(a, b) (andnot512(a, b)) -#define shift_u32(a, b) ((a) << (b)) -#define shift_u64a(a, b) ((a) << (b)) -#define shift_m128(a, b) (shift128(a, b)) -#define shift_m256(a, b) (shift256(a, b)) -#define shift_m384(a, b) (shift384(a, b)) -#define shift_m512(a, b) (shift512(a, b)) +#define lshift_u32(a, b) ((a) << (b)) +#define lshift_u64a(a, b) ((a) << (b)) +#define lshift_m128(a, b) (lshift64_m128(a, b)) +#define lshift_m256(a, b) (lshift64_m256(a, b)) +#define lshift_m384(a, b) (lshift64_m384(a, b)) +#define lshift_m512(a, b) (lshift64_m512(a, b)) #define isZero_u8(a) ((a) == 0) #define isZero_u32(a) ((a) == 0) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index e95f7533..3c07b2b0 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -643,50 +643,50 @@ TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 0), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 1), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), variable_byte_shift_m128(in, -1))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 2), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2), variable_byte_shift_m128(in, -2))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 3), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3), variable_byte_shift_m128(in, -3))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 4), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4), variable_byte_shift_m128(in, -4))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 5), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5), variable_byte_shift_m128(in, -5))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 6), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6), variable_byte_shift_m128(in, -6))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 7), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7), variable_byte_shift_m128(in, -7))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 8), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8), variable_byte_shift_m128(in, -8))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 9), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9), variable_byte_shift_m128(in, -9))); - EXPECT_TRUE(!diff128(byteShiftRight128(in, 10), + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10), variable_byte_shift_m128(in, -10))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 0), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 1), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1), variable_byte_shift_m128(in, 1))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 2), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2), variable_byte_shift_m128(in, 2))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 3), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3), variable_byte_shift_m128(in, 3))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 4), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4), variable_byte_shift_m128(in, 4))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 5), + 
EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5), variable_byte_shift_m128(in, 5))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 6), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6), variable_byte_shift_m128(in, 6))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 7), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7), variable_byte_shift_m128(in, 7))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 8), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8), variable_byte_shift_m128(in, 8))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 9), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9), variable_byte_shift_m128(in, 9))); - EXPECT_TRUE(!diff128(byteShiftLeft128(in, 10), + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); From e9cfbae68f69b06bb4fdcd2abd7c1ee5afec0262 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 24 Jun 2016 11:30:07 +1000 Subject: [PATCH 085/166] workaround for freebsd/clang/libc++ build issues Rather than relying on set's constructor from {}, explicitly construct the set. --- src/rose/rose_build_misc.cpp | 2 +- src/rose/rose_build_role_aliasing.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index f430f731..7fbc5f65 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -881,7 +881,7 @@ namespace { class OutfixAllReports : public boost::static_visitor> { public: set operator()(const boost::blank &) const { - return {}; + return set(); } template diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 292e199a..b2f6b385 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -1234,7 +1234,8 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, ReportID new_report = build.getNewNfaReport(); shared_ptr new_graph = cloneHolder(*b_h); duplicateReport(*new_graph, b_left.leftfix_report, new_report); - pruneReportIfUnused(build, new_graph, {}, b_left.leftfix_report); + pruneReportIfUnused(build, new_graph, set(), + b_left.leftfix_report); rai.rev_leftfix[a_left_id].erase(a); rai.rev_leftfix[b_left_id].erase(b); From 373a624badc4b6b280b9b7f951388530d8d9c9e3 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Sat, 4 Jun 2016 02:29:26 +0300 Subject: [PATCH 086/166] simplegrep: open file in binary mode ("rb") Otherwise it hangs on binary files (platform MinGW). --- examples/simplegrep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simplegrep.c b/examples/simplegrep.c index 9e392a8f..d6bd4b39 100644 --- a/examples/simplegrep.c +++ b/examples/simplegrep.c @@ -77,7 +77,7 @@ static int eventHandler(unsigned int id, unsigned long long from, * length with its length. Returns NULL on failure. */ static char *readInputData(const char *inputFN, unsigned int *length) { - FILE *f = fopen(inputFN, "r"); + FILE *f = fopen(inputFN, "rb"); if (!f) { fprintf(stderr, "ERROR: unable to open file \"%s\": %s\n", inputFN, strerror(errno)); From 66c2a53d33aecc7c29f0d9f8797a48655030ac51 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Sun, 29 May 2016 12:27:11 +0300 Subject: [PATCH 087/166] hs_serialized_database_info: check whole input Most lines of hs_serialized_database_info are not needed, as the information is provided by db_decode_header. Moreover, db_decode_header changes its first argument, so it points to end of header after db_decode_header. 
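The decode-and-advance convention described above can be sketched generically as follows; this is a hypothetical illustration (the struct, field names and magic value are invented, not Hyperscan's actual db_decode_header), showing why the caller's pointer ends up just past the header on success:

    #include <stddef.h>
    #include <string.h>

    struct toy_header {
        unsigned magic;
        unsigned version;
    };

    /* Returns 0 on success. On success, *bytes has been advanced past the
     * header, so the caller can read the payload at *bytes directly. */
    static int toy_decode_header(const char **bytes, size_t length,
                                 struct toy_header *out) {
        if (!*bytes || length < sizeof(*out)) {
            return -1;
        }
        memcpy(out, *bytes, sizeof(*out));
        if (out->magic != 0x64626462) { /* invented example magic */
            return -1;
        }
        *bytes += sizeof(*out); /* advance the caller's pointer */
        return 0;
    }

This is what lets the patched hs_serialized_database_info below read the mode field at an offset from bytes immediately after the db_decode_header call.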
--- src/database.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/src/database.c b/src/database.c index 635a3b66..a4e10c22 100644 --- a/src/database.c +++ b/src/database.c @@ -458,33 +458,16 @@ hs_error_t hs_serialized_database_info(const char *bytes, size_t length, } *info = NULL; - if (!bytes || length < sizeof(struct hs_database)) { - return HS_INVALID; + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; } - const u32 *buf = (const u32 *)bytes; + u32 mode = unaligned_load_u32(bytes + offsetof(struct RoseEngine, mode)); - u32 magic = unaligned_load_u32(buf++); - if (magic != HS_DB_MAGIC) { - return HS_INVALID; - } - - u32 version = unaligned_load_u32(buf++); - - buf++; /* length */ - - platform_t plat; - plat = unaligned_load_u64a(buf); - buf += 2; - - buf++; /* crc */ - buf++; /* reserved 0 */ - buf++; /* reserved 1 */ - - const char *t_raw = (const char *)buf; - u32 mode = unaligned_load_u32(t_raw + offsetof(struct RoseEngine, mode)); - - return print_database_string(info, version, plat, mode); + return print_database_string(info, header.version, header.platform, mode); } HS_PUBLIC_API From b73bd9b6e19b150867338ac768752ee1369a392b Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Sun, 29 May 2016 12:29:13 +0300 Subject: [PATCH 088/166] new test: deserializers fail with garbage input --- unit/hyperscan/serialize.cpp | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/unit/hyperscan/serialize.cpp b/unit/hyperscan/serialize.cpp index e13d27b2..7e0fcb7c 100644 --- a/unit/hyperscan/serialize.cpp +++ b/unit/hyperscan/serialize.cpp @@ -483,4 +483,71 @@ TEST(Serialize, DeserializeUnalignedMalloc) { free(bytes); } +TEST(Serialize, DeserializeGarbage) { + hs_database_t *db; + hs_compile_error_t *c_err; + static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + + hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + // determine database size for subsequent hs_deserialize_database_at + size_t db_len; + err = hs_database_size(db, &db_len); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_NE(0, db_len); + + // serialize + char *bytes = nullptr; + size_t bytes_len = 0; + + err = hs_serialize_database(db, &bytes, &bytes_len); + ASSERT_EQ(HS_SUCCESS, err); + ASSERT_NE(0, bytes_len); + + hs_free_database(db); + + // append '\0' byte to the serialized string to spoil it + bytes = (char *)realloc(bytes, bytes_len + 1); + ASSERT_NE(nullptr, bytes); + bytes[bytes_len] = '\0'; + + // create set of invalid serializations + struct Arg { + char *start; + size_t len; + }; + + const Arg invalid_args[] = { + {bytes + 1, bytes_len}, + {bytes + 1, bytes_len - 1}, + {bytes, bytes_len - 1}, + {bytes, bytes_len + 1}, + }; + + for (const Arg &arg : invalid_args) { + hs_database_t *a_db; + err = hs_deserialize_database(arg.start, arg.len, &a_db); + ASSERT_NE(HS_SUCCESS, err); + + char *new_db = (char *)malloc(db_len); + ASSERT_NE(nullptr, new_db); + err = hs_deserialize_database_at(arg.start, arg.len, + (hs_database_t *)(new_db)); + ASSERT_NE(HS_SUCCESS, err); + free(new_db); + + char *info; + err = hs_serialized_database_info(arg.start, arg.len, &info); + ASSERT_NE(HS_SUCCESS, err); + + size_t ser_len; + err = hs_serialized_database_size(arg.start, arg.len, &ser_len); + ASSERT_NE(HS_SUCCESS, err); + } + + free(bytes); +} + } From 
bfaa0acaea38bcf27f6f0f441338bf3f5e315c74 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 10:55:22 +1000 Subject: [PATCH 089/166] rose: preserve lit properties when building masks This fixes a bug with commit 6a6b0e5, which did not preserve the requires_explode and requires_benefits properties when a new literal was generated to add an HWLM and/cmp mask. Also extends the requires_explode handling to allow masked literals. --- src/rose/rose_build_matchers.cpp | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index b66556fc..498af2f0 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -381,7 +381,8 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { continue; } assert(!msk.empty()); - DEBUG_PRINTF("found advisory mask for lit_id=%u\n", id); + DEBUG_PRINTF("found advisory mask for lit_id=%u (%s)\n", id, + dumpString(lit.s).c_str()); u32 new_id = build.getLiteralId(lit.s, msk, cmp, lit.delay, lit.table); assert(new_id != id); DEBUG_PRINTF("replacing with new lit_id=%u\n", new_id); @@ -390,6 +391,8 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { // We assume that this transform is happening prior to group assignment. assert(lit_info.group_mask == 0); auto &new_info = build.literal_info.at(new_id); + + // Move the vertices across. new_info.vertices.insert(begin(lit_info.vertices), end(lit_info.vertices)); for (auto v : lit_info.vertices) { @@ -397,6 +400,10 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { build.g[v].literals.insert(new_id); } lit_info.vertices.clear(); + + // Preserve other properties. + new_info.requires_explode = lit_info.requires_explode; + new_info.requires_benefits = lit_info.requires_benefits; } } @@ -546,17 +553,24 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, if (info.requires_explode) { DEBUG_PRINTF("exploding lit\n"); - const vector empty_msk; // msk/cmp will be empty case_iter cit = caseIterateBegin(lit); case_iter cite = caseIterateEnd(); for (; cit != cite; ++cit) { + string s = *cit; + bool nocase = false; + DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d msk=%s, " "cmp=%s (exploded)\n", - final_id, escapeString(lit.get_string()).c_str(), - 0, noruns, dumpMask(msk).c_str(), - dumpMask(cmp).c_str()); - lits.emplace_back(*cit, false, noruns, final_id, groups, - empty_msk, empty_msk); + final_id, escapeString(s).c_str(), nocase, noruns, + dumpMask(msk).c_str(), dumpMask(cmp).c_str()); + + if (!maskIsConsistent(s, nocase, msk, cmp)) { + DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); + continue; + } + + lits.emplace_back(move(s), nocase, noruns, final_id, groups, + msk, cmp); } } else { const std::string &s = lit.get_string(); From 7cc5346c1152f1e403b2764a1ea96efd982cb8d1 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 2 Jun 2016 14:19:30 +1000 Subject: [PATCH 090/166] assign groups: turn pair into a tuple --- src/rose/rose_build_groups.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index e1a130ef..5e477e3b 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -212,7 +212,7 @@ void assignGroupsToLiterals(RoseBuildImpl &build) { } u32 min_start_group = counter; - priority_queue, u32> > pq; + priority_queue> pq; // Second pass: the other literals. 
for (const auto &e : literals.right) { @@ -225,12 +225,11 @@ void assignGroupsToLiterals(RoseBuildImpl &build) { } assert(!eligibleForAlwaysOnGroup(build, id)); - pq.push(make_pair(make_pair(-(s32)literal_info[id].vertices.size(), - -(s32)lit.s.length()), id)); + pq.emplace(-(s32)info.vertices.size(), -(s32)lit.s.length(), id); } vector long_lits; while (!pq.empty()) { - u32 id = pq.top().second; + u32 id = get<2>(pq.top()); pq.pop(); UNUSED const rose_literal_id &lit = literals.right.at(id); DEBUG_PRINTF("assigning groups to lit %u (v %zu l %zu)\n", id, From 19e79be87d4f3f62a6d0f9e16fe2a36662200172 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 4 Jul 2016 11:27:21 +1000 Subject: [PATCH 091/166] limex: compress repeats before state --- src/nfa/limex_runtime_impl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index e6c1c06f..052ad167 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -371,14 +371,13 @@ void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, } char JOIN(LIMEX_API_ROOT, _queueCompressState)(const struct NFA *n, - const struct mq *q, - s64a loc) { + const struct mq *q, s64a loc) { void *dest = q->streamState; const void *src = q->state; u8 key = queue_prev_byte(q, loc); const IMPL_NFA_T *limex = getImplNfa(n); - COMPRESS_FN(limex, dest, src, key); COMPRESS_REPEATS_FN(limex, dest, src, q->offset + loc); + COMPRESS_FN(limex, dest, src, key); return 0; } From aa54352a3abc73f1e6230104d4e21605b0793243 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 4 Jul 2016 12:41:25 +1000 Subject: [PATCH 092/166] limex: switch off stale repeats during compress --- src/nfa/limex_runtime_impl.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 052ad167..2ea86ed2 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -73,6 +73,7 @@ #define ANDNOT_STATE JOIN(andnot_, STATE_T) #define OR_STATE JOIN(or_, STATE_T) #define TESTBIT_STATE JOIN(testbit_, STATE_T) +#define CLEARBIT_STATE JOIN(clearbit_, STATE_T) #define ZERO_STATE JOIN(zero_, STATE_T) #define ISNONZERO_STATE JOIN(isNonZero_, STATE_T) #define ISZERO_STATE JOIN(isZero_, STATE_T) @@ -349,14 +350,13 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, } static really_inline -void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, +void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, u64a offset) { if (!limex->repeatCount) { return; } - // Note: we compress all repeats, as they may have *just* had their - // cyclic states switched off a moment ago. 
TODO: is this required + STATE_T s = LOAD_STATE(src); const union RepeatControl *ctrl = getRepeatControlBaseConst((const char *)src, sizeof(STATE_T)); @@ -365,15 +365,25 @@ void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, for (u32 i = 0; i < limex->repeatCount; i++) { const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); const struct RepeatInfo *repeat = getRepeatInfo(info); + + if (TESTBIT_STATE(&s, info->cyclicState) && + repeatHasMatch(repeat, &ctrl[i], state_base + info->stateOffset, + offset) == REPEAT_STALE) { + DEBUG_PRINTF("repeat %u is stale\n", i); + CLEARBIT_STATE(&s, info->cyclicState); + } + repeatPack(state_base + info->packedCtrlOffset, repeat, &ctrl[i], offset); } + + STORE_STATE(src, s); } char JOIN(LIMEX_API_ROOT, _queueCompressState)(const struct NFA *n, const struct mq *q, s64a loc) { void *dest = q->streamState; - const void *src = q->state; + void *src = q->state; u8 key = queue_prev_byte(q, loc); const IMPL_NFA_T *limex = getImplNfa(n); COMPRESS_REPEATS_FN(limex, dest, src, q->offset + loc); @@ -952,6 +962,7 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( #undef ANDNOT_STATE #undef OR_STATE #undef TESTBIT_STATE +#undef CLEARBIT_STATE #undef ZERO_STATE #undef ISNONZERO_STATE #undef ISZERO_STATE From ae5e347778c1e717c0a0fd3c6fad99b2afadfaa1 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 4 Jul 2016 12:56:03 +1000 Subject: [PATCH 093/166] limex: only compress active repeats --- src/nfa/limex_runtime_impl.h | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 2ea86ed2..95f18d1b 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -363,16 +363,24 @@ void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, char *state_base = (char *)dest + limex->stateSize; for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); - const struct RepeatInfo *repeat = getRepeatInfo(info); - if (TESTBIT_STATE(&s, info->cyclicState) && - repeatHasMatch(repeat, &ctrl[i], state_base + info->stateOffset, - offset) == REPEAT_STALE) { - DEBUG_PRINTF("repeat %u is stale\n", i); - CLEARBIT_STATE(&s, info->cyclicState); + if (!TESTBIT_STATE(&s, info->cyclicState)) { + DEBUG_PRINTF("is dead\n"); + continue; } + const struct RepeatInfo *repeat = getRepeatInfo(info); + if (repeatHasMatch(repeat, &ctrl[i], state_base + info->stateOffset, + offset) == REPEAT_STALE) { + DEBUG_PRINTF("is stale, clearing state\n"); + CLEARBIT_STATE(&s, info->cyclicState); + continue; + } + + DEBUG_PRINTF("packing state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); repeatPack(state_base + info->packedCtrlOffset, repeat, &ctrl[i], offset); } @@ -398,15 +406,24 @@ void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, return; } - // Note: we expand all repeats, as they may have *just* had their - // cyclic states switched off a moment ago. TODO: is this required? + // Note: state has already been expanded into 'dest'. 
+ STATE_T s = LOAD_STATE(dest); union RepeatControl *ctrl = getRepeatControlBase((char *)dest, sizeof(STATE_T)); const char *state_base = (const char *)src + limex->stateSize; for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + if (!TESTBIT_STATE(&s, info->cyclicState)) { + DEBUG_PRINTF("is dead\n"); + continue; + } + + DEBUG_PRINTF("unpacking state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); const struct RepeatInfo *repeat = getRepeatInfo(info); repeatUnpack(state_base + info->packedCtrlOffset, repeat, offset, &ctrl[i]); From 8435f918d1139046882254476d931d28f88b571f Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 4 Jul 2016 14:19:10 +1000 Subject: [PATCH 094/166] limex: use cyclics mask for repeat compress --- src/nfa/limex_runtime_impl.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index 95f18d1b..d6c28c6f 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -358,6 +358,11 @@ void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, STATE_T s = LOAD_STATE(src); + if (ISZERO_STATE(AND_STATE(s, LOAD_STATE(&limex->repeatCyclicMask)))) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } + const union RepeatControl *ctrl = getRepeatControlBaseConst((const char *)src, sizeof(STATE_T)); char *state_base = (char *)dest + limex->stateSize; @@ -407,7 +412,12 @@ void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, } // Note: state has already been expanded into 'dest'. - STATE_T s = LOAD_STATE(dest); + const STATE_T cyclics = + AND_STATE(LOAD_STATE(dest), LOAD_STATE(&limex->repeatCyclicMask)); + if (ISZERO_STATE(cyclics)) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } union RepeatControl *ctrl = getRepeatControlBase((char *)dest, sizeof(STATE_T)); @@ -417,7 +427,7 @@ void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, DEBUG_PRINTF("repeat %u\n", i); const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); - if (!TESTBIT_STATE(&s, info->cyclicState)) { + if (!TESTBIT_STATE(&cyclics, info->cyclicState)) { DEBUG_PRINTF("is dead\n"); continue; } From d497a1259a1230c47e070ef73084f956dda6b7cd Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 5 Jul 2016 14:37:22 +1000 Subject: [PATCH 095/166] rose: use normal callback for pure-literal cases --- src/rose/match.c | 23 ----------------------- src/rose/rose.h | 3 +-- src/runtime.c | 12 ++++++++++-- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/src/rose/match.c b/src/rose/match.c index 2b05fd76..15d3534c 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -582,29 +582,6 @@ hwlmcb_rv_t roseFloatingCallback(size_t start, size_t end, u32 id, void *ctxt) { return roseCallback_i(start, end, id, ctxt) & t->floating_group_mask; } -/** - * \brief Match callback adaptor used for matches from pure-literal cases. - * - * Literal match IDs in this path run limited Rose programs that do not use - * Rose state (which is not initialised in the pure-literal path). They can - * still, for example, check lookarounds or literal masks. 
- */ -hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, - void *context) { - DEBUG_PRINTF("start=%zu, end=%zu, id=%u\n", start, end, id); - struct hs_scratch *scratch = context; - struct core_info *ci = &scratch->core_info; - const u64a real_end = (u64a)end + ci->buf_offset + 1; - const u64a som = 0; - const size_t match_len = end - start + 1; - const struct RoseEngine *rose = ci->rose; - const u32 *programs = getByOffset(rose, rose->litProgramOffset); - assert(id < rose->literalCount); - const u8 flags = 0; - return roseRunProgram(rose, scratch, programs[id], som, real_end, match_len, - flags); -} - /** * \brief Execute a boundary report program. * diff --git a/src/rose/rose.h b/src/rose/rose.h index 280e3bd5..9a50f0e9 100644 --- a/src/rose/rose.h +++ b/src/rose/rose.h @@ -46,8 +46,7 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch); void roseStreamEodExec(const struct RoseEngine *t, u64a offset, struct hs_scratch *scratch); -hwlmcb_rv_t rosePureLiteralCallback(size_t start, size_t end, u32 id, - void *context); +hwlmcb_rv_t roseCallback(size_t start, size_t end, u32 id, void *context); int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context); diff --git a/src/runtime.c b/src/runtime.c index 2def17c8..fc867b8e 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -198,7 +198,11 @@ void pureLiteralBlockExec(const struct RoseEngine *rose, size_t length = scratch->core_info.len; DEBUG_PRINTF("rose engine %d\n", rose->runtimeImpl); - hwlmExec(ftable, buffer, length, 0, rosePureLiteralCallback, scratch, + // RoseContext values that need to be set for use by roseCallback. + scratch->tctxt.groups = rose->initialGroups; + scratch->tctxt.lit_offset_adjust = 1; + + hwlmExec(ftable, buffer, length, 0, roseCallback, scratch, rose->initialGroups); } @@ -742,11 +746,15 @@ void pureLiteralStreamExec(struct hs_stream *stream_state, DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", stream_state->offset, scratch->core_info.len); + // RoseContext values that need to be set for use by roseCallback. + scratch->tctxt.groups = loadGroups(rose, scratch->core_info.state); + scratch->tctxt.lit_offset_adjust = scratch->core_info.buf_offset + 1; + // Pure literal cases don't have floatingMinDistance set, so we always // start the match region at zero. 
const size_t start = 0; - hwlmExecStreaming(ftable, scratch, len2, start, rosePureLiteralCallback, + hwlmExecStreaming(ftable, scratch, len2, start, roseCallback, scratch, rose->initialGroups, hwlm_stream_state); if (!told_to_stop_matching(scratch) && From 22b451b59b919444f81aceb8af5fdaec42a369b7 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Thu, 7 Jul 2016 14:00:11 +1000 Subject: [PATCH 096/166] Ensure that m256 is 32-aligned on non-avx2 builds --- src/ue2common.h | 7 +++---- src/util/simd_types.h | 4 ++-- src/util/simd_utils.h | 10 ++++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/ue2common.h b/src/ue2common.h index 2de60753..e1f03f72 100644 --- a/src/ue2common.h +++ b/src/ue2common.h @@ -52,6 +52,9 @@ #define ALIGN_ATTR(x) __attribute__((aligned((x)))) #endif +#define ALIGN_DIRECTIVE ALIGN_ATTR(16) +#define ALIGN_AVX_DIRECTIVE ALIGN_ATTR(32) +#define ALIGN_CL_DIRECTIVE ALIGN_ATTR(64) typedef signed char s8; typedef unsigned char u8; @@ -82,10 +85,6 @@ typedef u32 ReportID; #define HS_PUBLIC_API #endif -#define ALIGN_DIRECTIVE ALIGN_ATTR(16) -#define ALIGN_AVX_DIRECTIVE ALIGN_ATTR(32) -#define ALIGN_CL_DIRECTIVE ALIGN_ATTR(64) - #define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) /** \brief Shorthand for the attribute to shut gcc about unused parameters */ diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 63311b10..e4541411 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -65,7 +65,7 @@ typedef __m128i m128; #if defined(__AVX2__) typedef __m256i m256; #else -typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; +typedef ALIGN_AVX_DIRECTIVE struct {m128 lo; m128 hi;} m256; #endif // these should align to 16 and 32 respectively diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 5f557ba5..8cea458e 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -493,11 +493,10 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) { // aligned load static really_inline m256 load256(const void *ptr) { -#if defined(__AVX2__) assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(__AVX2__) return _mm256_load_si256((const m256 *)ptr); #else - assert(ISALIGNED_N(ptr, alignof(m128))); m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; return rv; #endif @@ -517,11 +516,10 @@ static really_inline m256 load2x128(const void *ptr) { // aligned store static really_inline void store256(void *ptr, m256 a) { -#if defined(__AVX2__) assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(__AVX2__) _mm256_store_si256((m256 *)ptr, a); #else - assert(ISALIGNED_16(ptr)); ptr = assume_aligned(ptr, 16); *(m256 *)ptr = a; #endif @@ -943,19 +941,19 @@ static really_inline u32 diffrich64_512(m512 a, m512 b) { // aligned load static really_inline m512 load512(const void *ptr) { - assert(ISALIGNED_16(ptr)); + assert(ISALIGNED_N(ptr, alignof(m256))); m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; return rv; } // aligned store static really_inline void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); #if defined(__AVX2__) m512 *x = (m512 *)ptr; store256(&x->lo, a.lo); store256(&x->hi, a.hi); #else - assert(ISALIGNED_16(ptr)); ptr = assume_aligned(ptr, 16); *(m512 *)ptr = a; #endif From 3a1429a621e99ee6cbc716c9473d482c53021e0e Mon Sep 17 
00:00:00 2001
From: Alex Coyte
Date: Thu, 7 Jul 2016 11:36:05 +1000
Subject: [PATCH 097/166] group_weak_end is no longer used

---
 src/rose/rose_build_bytecode.cpp | 2 --
 src/rose/rose_build_impl.h       | 1 -
 src/rose/rose_build_misc.cpp     | 1 -
 src/rose/rose_dump.cpp           | 1 -
 src/rose/rose_internal.h         | 1 -
 5 files changed, 6 deletions(-)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 5b3806a2..c6f709bc 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -5191,8 +5191,6 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildFinalEngine(u32 minWidth) {
     engine->eagerIterOffset = eagerIterOffset;
     engine->handledKeyCount = bc.handledKeys.size();
 
-    engine->group_weak_end = group_weak_end;
-
     engine->rolesWithStateCount = bc.numStates;
 
     engine->leftOffset = leftOffset;
diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h
index ca1b64e2..a00bc4ea 100644
--- a/src/rose/rose_build_impl.h
+++ b/src/rose/rose_build_impl.h
@@ -547,7 +547,6 @@ public:
     std::map<u32, std::vector<std::unique_ptr<raw_dfa>>> anchored_nfas;
     std::map<simple_anchored_info, std::set<u32>> anchored_simple;
     std::map<u32, std::set<u32> > group_to_literal;
-    u32 group_weak_end;
     u32 group_end;
 
     u32 anchored_base_id;
diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp
index 7fbc5f65..f99c391f 100644
--- a/src/rose/rose_build_misc.cpp
+++ b/src/rose/rose_build_misc.cpp
@@ -76,7 +76,6 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in,
     vertexIndex(0),
     delay_base_id(MO_INVALID_IDX),
     hasSom(false),
-    group_weak_end(0),
     group_end(0),
     anchored_base_id(MO_INVALID_IDX),
     ematcher_region_size(0),
diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp
index 19d8414d..f53ebe61 100644
--- a/src/rose/rose_dump.cpp
+++ b/src/rose/rose_dump.cpp
@@ -1076,7 +1076,6 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) {
     DUMP_U32(t, ematcherRegionSize);
     DUMP_U32(t, somRevCount);
     DUMP_U32(t, somRevOffsetOffset);
-    DUMP_U32(t, group_weak_end);
    DUMP_U32(t, floatingStreamState);
     fprintf(f, "}\n");
     fprintf(f, "sizeof(RoseEngine) = %zu\n", sizeof(RoseEngine));
diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h
index 803810b0..2b646af0 100644
--- a/src/rose/rose_internal.h
+++ b/src/rose/rose_internal.h
@@ -429,7 +429,6 @@ struct RoseEngine {
     u32 ematcherRegionSize; /* max region size to pass to ematcher */
     u32 somRevCount; /**< number of som reverse nfas */
     u32 somRevOffsetOffset; /**< offset to array of offsets to som rev nfas */
-    u32 group_weak_end; /* end of weak groups, debugging only */
     u32 floatingStreamState; // size in bytes
     struct scatter_full_plan state_init;

From 2471b770a8cd4e78fdffcbfac616c4afd5a40f57 Mon Sep 17 00:00:00 2001
From: Alex Coyte
Date: Thu, 18 Jun 2015 14:55:34 +1000
Subject: [PATCH 098/166] we no longer store the history len

---
 src/rose/rose_dump.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp
index f53ebe61..75b831a5 100644
--- a/src/rose/rose_dump.cpp
+++ b/src/rose/rose_dump.cpp
@@ -914,8 +914,7 @@ void roseDumpText(const RoseEngine *t, FILE *f) {
             t->lookaroundTableOffset - t->lookaroundReachOffset);
     fprintf(f, "state space required : %u bytes\n", t->stateOffsets.end);
-    fprintf(f, " - history buffer : %u bytes (+1 for len)\n",
-            t->historyRequired);
+    fprintf(f, " - history buffer : %u bytes\n", t->historyRequired);
     fprintf(f, " - exhaustion vector : %u bytes\n", (t->ekeyCount + 7) / 8);
     fprintf(f, " - role state mmbit : %u bytes\n", t->stateSize);
     fprintf(f, " - floating matcher : %u bytes\n", t->floatingStreamState);
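[Editor's illustration — not part of the patch series. A minimal, self-contained C sketch of the property that the m256 change in patch 096 above establishes: on non-AVX2 builds the emulated two-m128 vector must itself be 32-byte aligned, since the aligned load256()/store256() paths now assert ISALIGNED_N(ptr, alignof(m256)). One hazard the macro reordering in ue2common.h appears to guard against: if the alignment macro is not yet defined at its point of use, "struct ALIGN_AVX_DIRECTIVE { ... }" parses ALIGN_AVX_DIRECTIVE as a plain struct tag and the alignment silently vanishes. The type names below are hypothetical.]

    #include <stdalign.h>

    /* Attribute before the struct body, as in the patched simd_types.h:
     * forces 32-byte alignment of the emulated 256-bit vector. */
    typedef __attribute__((aligned(32))) struct {
        char lo[16];
        char hi[16];
    } m256_emulated;

    /* Without the attribute, alignment follows the members (here 1 byte),
     * so a value of this type need not sit on a 32-byte boundary. */
    typedef struct {
        char lo[16];
        char hi[16];
    } m256_plain;

    _Static_assert(alignof(m256_emulated) == 32, "emulated m256 is 32-aligned");
    _Static_assert(alignof(m256_plain) < 32, "no attribute, no 32-byte alignment");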
From 691b08d170c8d1af89048bb1b4992f2ac65f5caa Mon Sep 17 00:00:00 2001
From: Alex Coyte
Date: Thu, 7 Jul 2016 14:07:12 +1000
Subject: [PATCH 099/166] use NGHolder::foo in favour of NFAGraph::foo

---
 src/nfa/limex_compile.cpp              |  2 +-
 src/nfagraph/ng_anchored_dots.cpp      | 14 ++--
 src/nfagraph/ng_builder.cpp            |  4 +-
 src/nfagraph/ng_execute.cpp            |  2 +-
 src/nfagraph/ng_extparam.cpp           | 16 ++---
 src/nfagraph/ng_haig.cpp               |  6 +-
 src/nfagraph/ng_holder.cpp             |  4 +-
 src/nfagraph/ng_holder.h               | 12 ++--
 src/nfagraph/ng_limex.cpp              |  2 +-
 src/nfagraph/ng_limex_accel.cpp        |  8 +--
 src/nfagraph/ng_mcclellan.cpp          |  4 +-
 src/nfagraph/ng_prefilter.cpp          |  4 +-
 src/nfagraph/ng_puff.cpp               |  2 +-
 src/nfagraph/ng_redundancy.cpp         |  4 +-
 src/nfagraph/ng_repeat.cpp             |  4 +-
 src/nfagraph/ng_rose.cpp               |  2 +-
 src/nfagraph/ng_som.cpp                |  6 +-
 src/nfagraph/ng_squash.cpp             |  6 +-
 src/nfagraph/ng_uncalc_components.cpp  |  6 +-
 src/nfagraph/ng_util.cpp               | 26 +++----
 src/nfagraph/ng_util.h                 |  8 +--
 src/rose/rose_build_anchored.cpp       |  6 +-
 src/rose/rose_build_misc.cpp           |  6 +-
 unit/internal/limex_nfa.cpp            |  6 +-
 unit/internal/nfagraph_equivalence.cpp | 96 +++++++++++---------------
 unit/internal/nfagraph_redundancy.cpp  | 51 ++++++++------
 unit/internal/nfagraph_util.cpp        | 54 +++++++--------
 util/ng_corpus_generator.cpp           |  4 +-
 util/ng_find_matches.cpp               |  2 +-
 29 files changed, 177 insertions(+), 190 deletions(-)

diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp
index 79e6db1c..e0c459aa 100644
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@@ -345,7 +345,7 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
 }
 
 struct AccelBuild {
-    AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0), ma_len1(0),
+    AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0), ma_len1(0),
                    ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {}
     NFAVertex v;
     u32 state;
diff --git a/src/nfagraph/ng_anchored_dots.cpp b/src/nfagraph/ng_anchored_dots.cpp
index 1b6d8826..ba352e60 100644
--- a/src/nfagraph/ng_anchored_dots.cpp
+++ b/src/nfagraph/ng_anchored_dots.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -119,7 +119,7 @@ NFAVertex findReformable(const NGHolder &g, const set<NFAVertex> &starts,
     }
 
     if (dotq.empty()) {
-        return NFAGraph::null_vertex();
+        return NGHolder::null_vertex();
     }
 
     const DotInfo &dot = dotq.top();
@@ -165,10 +165,10 @@ void reformAnchoredRepeatsComponent(NGHolder &g,
         return;
     }
 
-    NFAVertex dotV = NFAGraph::null_vertex();
+    NFAVertex dotV = NGHolder::null_vertex();
     set<NFAVertex> otherV;
     dotV = findReformable(g, compAnchoredStarts, otherV);
-    if (dotV == NFAGraph::null_vertex()) {
+    if (dotV == NGHolder::null_vertex()) {
         DEBUG_PRINTF("no candidate reformable dot found.\n");
         return;
     }
@@ -268,10 +268,10 @@ void reformUnanchoredRepeatsComponent(NGHolder &g,
     }
 
     while (true) {
-        NFAVertex dotV = NFAGraph::null_vertex();
+        NFAVertex dotV = NGHolder::null_vertex();
         set<NFAVertex> otherV;
         dotV = findReformable(g, compUnanchoredStarts, otherV);
-        if (dotV == NFAGraph::null_vertex()) {
+        if (dotV == NGHolder::null_vertex()) {
             DEBUG_PRINTF("no candidate reformable dot found.\n");
             return;
         }
@@ -464,7 +464,7 @@ void collapseVariableDotRepeat(NGHolder &g, NFAVertex start,
 
     // The first of our optional dots must be connected to start. The jump edge
     // past it will be verified in gatherParticipants().
If start is // graph.start, it should not be connected to startDs. - NFAVertex initialDot = NFAGraph::null_vertex(); + NFAVertex initialDot = NGHolder::null_vertex(); for (auto v : adjacent_vertices_range(start, g)) { if (is_special(v, g)) { continue; diff --git a/src/nfagraph/ng_builder.cpp b/src/nfagraph/ng_builder.cpp index 36ce80b0..8a92b7ee 100644 --- a/src/nfagraph/ng_builder.cpp +++ b/src/nfagraph/ng_builder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -131,7 +131,7 @@ NFABuilderImpl::~NFABuilderImpl() { NFAVertex NFABuilderImpl::getVertex(Position pos) const { assert(id2vertex.size() >= pos); const NFAVertex v = id2vertex[pos]; - assert(v != NFAGraph::null_vertex()); + assert(v != NGHolder::null_vertex()); assert(graph->g[v].index == pos); return v; } diff --git a/src/nfagraph/ng_execute.cpp b/src/nfagraph/ng_execute.cpp index 46307cd5..4ffd89c0 100644 --- a/src/nfagraph/ng_execute.cpp +++ b/src/nfagraph/ng_execute.cpp @@ -58,7 +58,7 @@ namespace ue2 { struct StateInfo { StateInfo(NFAVertex v, const CharReach &cr) : vertex(v), reach(cr) {} - StateInfo() : vertex(NFAGraph::null_vertex()) {} + StateInfo() : vertex(NGHolder::null_vertex()) {} NFAVertex vertex; CharReach reach; }; diff --git a/src/nfagraph/ng_extparam.cpp b/src/nfagraph/ng_extparam.cpp index 17d2a513..bc101df2 100644 --- a/src/nfagraph/ng_extparam.cpp +++ b/src/nfagraph/ng_extparam.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -294,21 +294,21 @@ bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, static NFAVertex findSingleCyclic(const NGHolder &g) { - NFAVertex v = NFAGraph::null_vertex(); + NFAVertex v = NGHolder::null_vertex(); for (const auto &e : edges_range(g)) { if (source(e, g) == target(e, g)) { if (source(e, g) == g.startDs) { continue; } - if (v != NFAGraph::null_vertex()) { + if (v != NGHolder::null_vertex()) { // More than one cyclic vertex. - return NFAGraph::null_vertex(); + return NGHolder::null_vertex(); } v = source(e, g); } } - if (v != NFAGraph::null_vertex()) { + if (v != NGHolder::null_vertex()) { DEBUG_PRINTF("cyclic is %u\n", g[v].index); assert(!is_special(v, g)); } @@ -359,11 +359,11 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { // The graph must contain a single cyclic vertex (other than startDs), and // that vertex can have one pred and one successor. NFAVertex cyclic = findSingleCyclic(g); - if (cyclic == NFAGraph::null_vertex()) { + if (cyclic == NGHolder::null_vertex()) { return false; } - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; tie(ai, ae) = adjacent_vertices(g.start, g); if (*ai == g.startDs) { ++ai; @@ -411,7 +411,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { // Check the cyclic state is A-OK. 
    v = getSoleDestVertex(g, cyclic);
-    if (v == NFAGraph::null_vertex()) {
+    if (v == NGHolder::null_vertex()) {
         DEBUG_PRINTF("cyclic has more than one successor\n");
         return false;
     }
diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp
index 8fe4889d..e70b7708 100644
--- a/src/nfagraph/ng_haig.cpp
+++ b/src/nfagraph/ng_haig.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -89,11 +89,11 @@ void populateInit(const NGHolder &g, const flat_set<NFAVertex> &unused,
     }
 
     v_by_index->clear();
-    v_by_index->resize(num_vertices(g), NFAGraph::null_vertex());
+    v_by_index->resize(num_vertices(g), NGHolder::null_vertex());
 
     for (auto v : vertices_range(g)) {
         u32 v_index = g[v].index;
-        assert((*v_by_index)[v_index] == NFAGraph::null_vertex());
+        assert((*v_by_index)[v_index] == NGHolder::null_vertex());
         (*v_by_index)[v_index] = v;
     }
 }
diff --git a/src/nfagraph/ng_holder.cpp b/src/nfagraph/ng_holder.cpp
index fd403378..53566891 100644
--- a/src/nfagraph/ng_holder.cpp
+++ b/src/nfagraph/ng_holder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -154,7 +154,7 @@ void clear_out_edges(NFAVertex v, NGHolder &h) {
 }
 
 void clear_graph(NGHolder &h) {
-    NFAGraph::vertex_iterator vi, ve;
+    NGHolder::vertex_iterator vi, ve;
     for (tie(vi, ve) = vertices(h); vi != ve;) {
         NFAVertex v = *vi;
         ++vi;
diff --git a/src/nfagraph/ng_holder.h b/src/nfagraph/ng_holder.h
index 07f21d0f..5b6a3de8 100644
--- a/src/nfagraph/ng_holder.h
+++ b/src/nfagraph/ng_holder.h
@@ -171,7 +171,7 @@ bool is_special(const NFAVertex v, const GraphT &g) {
 }
 
 static really_inline
-std::pair<NFAGraph::adjacency_iterator, NFAGraph::adjacency_iterator>
+std::pair<NGHolder::adjacency_iterator, NGHolder::adjacency_iterator>
adjacent_vertices(NFAVertex v, const NGHolder &h) {
     return adjacent_vertices(v, h.g);
 }
@@ -182,7 +182,7 @@ std::pair<NFAEdge, bool> edge(NFAVertex u, NFAVertex v, const NGHolder &h) {
 }
 
 static really_inline
-std::pair<NFAGraph::edge_iterator, NFAGraph::edge_iterator>
+std::pair<NGHolder::edge_iterator, NGHolder::edge_iterator>
edges(const NGHolder &h) {
     return edges(h.g);
 }
@@ -193,13 +193,13 @@ size_t in_degree(NFAVertex v, const NGHolder &h) {
 }
 
 static really_inline
-std::pair<NFAGraph::in_edge_iterator, NFAGraph::in_edge_iterator>
+std::pair<NGHolder::in_edge_iterator, NGHolder::in_edge_iterator>
in_edges(NFAVertex v, const NGHolder &h) {
     return in_edges(v, h.g);
 }
 
 static really_inline
-std::pair<NFAGraph::inv_adjacency_iterator, NFAGraph::inv_adjacency_iterator>
+std::pair<NGHolder::inv_adjacency_iterator, NGHolder::inv_adjacency_iterator>
inv_adjacent_vertices(NFAVertex v, const NGHolder &h) {
     return inv_adjacent_vertices(v, h.g);
 }
@@ -210,7 +210,7 @@ size_t out_degree(NFAVertex v, const NGHolder &h) {
 }
 
 static really_inline
-std::pair<NFAGraph::out_edge_iterator, NFAGraph::out_edge_iterator>
+std::pair<NGHolder::out_edge_iterator, NGHolder::out_edge_iterator>
out_edges(NFAVertex v, const NGHolder &h) {
     return out_edges(v, h.g);
 }
@@ -226,7 +226,7 @@ NFAVertex target(const NFAEdge &e, const NGHolder &h) {
 }
 
 static really_inline
-std::pair<NFAGraph::vertex_iterator, NFAGraph::vertex_iterator>
+std::pair<NGHolder::vertex_iterator, NGHolder::vertex_iterator>
vertices(const NGHolder &h) {
     return vertices(h.g);
 }
diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp
index af7779ba..a82d18b6 100644
--- a/src/nfagraph/ng_limex.cpp
+++ b/src/nfagraph/ng_limex.cpp
@@ -164,7 +164,7 @@ void makeTopStates(NGHolder &g, map<u32, NFAVertex> &tops,
 
     assert(!contains(tops, t));
 
-    NFAVertex s = NFAGraph::null_vertex();
+    NFAVertex s = NGHolder::null_vertex();
     flat_set<NFAVertex> succs;
     insert(&succs, top.second);
diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp
index 1f991f19..deaf2ffd 100644
--- a/src/nfagraph/ng_limex_accel.cpp
+++ b/src/nfagraph/ng_limex_accel.cpp
@@ -658,7 +658,7 @@ NFAVertex
get_sds_or_proxy(const NGHolder &g) { return g.startDs; } - NFAVertex v = NFAGraph::null_vertex(); + NFAVertex v = NGHolder::null_vertex(); for (auto w : adjacent_vertices_range(g.start, g)) { if (w != g.startDs) { if (!v) { @@ -693,8 +693,8 @@ NFAVertex get_sds_or_proxy(const NGHolder &g) { static NFAVertex find_next(const NFAVertex v, const NGHolder &g) { - NFAVertex res = NFAGraph::null_vertex(); - for (NFAVertex u : adjacent_vertices_range(v, g)) { + NFAVertex res = NGHolder::null_vertex(); + for (NFAVertex u : adjacent_vertices_range(v, g)) { if (u != v) { res = u; break; @@ -736,7 +736,7 @@ MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g, // find our start vertex NFAVertex cur = find_next(v, g); - if (cur == NFAGraph::null_vertex()) { + if (cur == NGHolder::null_vertex()) { DEBUG_PRINTF("invalid start vertex\n"); return MultibyteAccelInfo(); } diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index 024cf2c1..39788570 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -173,11 +173,11 @@ void populateInit(const NGHolder &g, const flat_set &unused, } v_by_index->clear(); - v_by_index->resize(num_vertices(g), NFAGraph::null_vertex()); + v_by_index->resize(num_vertices(g), NGHolder::null_vertex()); for (auto v : vertices_range(g)) { u32 vert_id = g[v].index; - assert((*v_by_index)[vert_id] == NFAGraph::null_vertex()); + assert((*v_by_index)[vert_id] == NGHolder::null_vertex()); (*v_by_index)[vert_id] = v; } diff --git a/src/nfagraph/ng_prefilter.cpp b/src/nfagraph/ng_prefilter.cpp index c2b9eea9..8abc45b3 100644 --- a/src/nfagraph/ng_prefilter.cpp +++ b/src/nfagraph/ng_prefilter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -135,7 +135,7 @@ void findWidths(const NGHolder &g, // Wire our entries to start and our exits to accept. 
    for (auto v : ri.vertices) {
         NFAVertex v_new = mapping[v];
-        assert(v_new != NFAGraph::null_vertex());
+        assert(v_new != NGHolder::null_vertex());
 
         if (isRegionEntry(g, v, region_map) &&
             !edge(rg.start, v_new, rg).second) {
diff --git a/src/nfagraph/ng_puff.cpp b/src/nfagraph/ng_puff.cpp
index 540f4859..00b2e8ac 100644
--- a/src/nfagraph/ng_puff.cpp
+++ b/src/nfagraph/ng_puff.cpp
@@ -472,7 +472,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a,
     }
 
     NFAVertex puffv = nodes.back();
-    assert(puffv != NFAGraph::null_vertex());
+    assert(puffv != NGHolder::null_vertex());
 
     u32 width = countChain(g, nodes.back());
 
     flat_set<ReportID> chain_reports;
diff --git a/src/nfagraph/ng_redundancy.cpp b/src/nfagraph/ng_redundancy.cpp
index b9b80c5b..26599251 100644
--- a/src/nfagraph/ng_redundancy.cpp
+++ b/src/nfagraph/ng_redundancy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -158,7 +158,7 @@ void populateContainers(const NGHolder &g, VertexInfoMap &infoMap) {
 static
 void inplaceIntersection(vector<NFAVertex> &vset1,
                          const flat_set<NFAVertex> &vset2) {
-    const NFAVertex GONE = NFAGraph::null_vertex();
+    const NFAVertex GONE = NGHolder::null_vertex();
 
     vector<NFAVertex>::iterator it = vset1.begin(), ite = vset1.end();
     flat_set<NFAVertex>::const_iterator jt = vset2.begin(), jte = vset2.end();
diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp
index 80434a0a..bc7e73d3 100644
--- a/src/nfagraph/ng_repeat.cpp
+++ b/src/nfagraph/ng_repeat.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -1202,7 +1202,7 @@ CharReach predReach(const NGHolder &g, NFAVertex v) {
 static
 void filterMap(const NGHolder &subg,
                ue2::unordered_map<NFAVertex, NFAVertex> &vmap) {
-    NFAGraph::vertex_iterator vi, ve;
+    NGHolder::vertex_iterator vi, ve;
     tie(vi, ve) = vertices(subg);
     const ue2::unordered_set<NFAVertex> remaining_verts(vi, ve);
diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp
index 997191d2..9b8f0e9a 100644
--- a/src/nfagraph/ng_rose.cpp
+++ b/src/nfagraph/ng_rose.cpp
@@ -780,7 +780,7 @@ bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) {
     NFAVertex v = g.accept;
 
     for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
-        NFAGraph::inv_adjacency_iterator ai, ae;
+        NGHolder::inv_adjacency_iterator ai, ae;
         tie(ai, ae) = inv_adjacent_vertices(v, g);
         if (ai == ae) {
             assert(0); // no predecessors?
diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index 03a612a0..4af0e20c 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -384,7 +384,7 @@ makePrefix(const NGHolder &g, const ue2::unordered_map ®ions, add_edge(prefix.accept, prefix.acceptEod, prefix); assert(!next_enters.empty()); - assert(next_enters.front() != NFAGraph::null_vertex()); + assert(next_enters.front() != NGHolder::null_vertex()); u32 dead_region = regions.at(next_enters.front()); DEBUG_PRINTF("curr_region %u, dead_region %u\n", regions.at(curr_exits.front()), dead_region); @@ -2537,7 +2537,7 @@ bool doHaigLitHaigSom(NG &ng, NGHolder &g, RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); bool lhs_all_vac = true; - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; for (tie(ai, ae) = adjacent_vertices(lhs->startDs, *lhs); ai != ae && lhs_all_vac; ++ai) { if (!is_special(*ai, *lhs)) { diff --git a/src/nfagraph/ng_squash.cpp b/src/nfagraph/ng_squash.cpp index dd3693e5..6577673f 100644 --- a/src/nfagraph/ng_squash.cpp +++ b/src/nfagraph/ng_squash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -369,7 +369,7 @@ map findSquashers(const NGHolder &g, som_type som) { buildPDomTree(g, pdom_tree); // Build list of vertices by state ID and a set of init states. 
-    vector<NFAVertex> vByIndex(numStates, NFAGraph::null_vertex());
+    vector<NFAVertex> vByIndex(numStates, NGHolder::null_vertex());
     NFAStateSet initStates(numStates);
     smgb_cache cache(g);
@@ -394,7 +394,7 @@ map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
 
     for (u32 i = 0; i < numStates; i++) {
         NFAVertex v = vByIndex[i];
-        assert(v != NFAGraph::null_vertex());
+        assert(v != NGHolder::null_vertex());
         const CharReach &cr = g[v].char_reach;
 
         /* only non-init cyclics can be squashers */
diff --git a/src/nfagraph/ng_uncalc_components.cpp b/src/nfagraph/ng_uncalc_components.cpp
index abba09f9..217183de 100644
--- a/src/nfagraph/ng_uncalc_components.cpp
+++ b/src/nfagraph/ng_uncalc_components.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -184,7 +184,7 @@ u32 commonPrefixLength(const NGHolder &ga,
         size_t a_count = 0;
         size_t b_count = 0;
 
-        NFAGraph::out_edge_iterator ei, ee;
+        NGHolder::out_edge_iterator ei, ee;
         for (tie(ei, ee) = out_edges(a[i], ga); ok && ei != ee; ++ei) {
             u32 sid = a_state_ids.at(target(*ei, ga));
             if (sid == NO_STATE || sid >= max) {
@@ -213,7 +213,7 @@ u32 commonPrefixLength(const NGHolder &ga,
             }
         }
 
-        NFAGraph::adjacency_iterator ai, ae;
+        NGHolder::adjacency_iterator ai, ae;
         for (tie(ai, ae) = adjacent_vertices(b[i], gb); ok && ai != ae;
              ++ai) {
             u32 sid = b_state_ids.at(*ai);
diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp
index bcf0ce29..935a223e 100644
--- a/src/nfagraph/ng_util.cpp
+++ b/src/nfagraph/ng_util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -78,26 +78,26 @@ depth maxDistFromStartOfData(const NFAVertexDepth &vd) {
 }
 
 NFAVertex getSoleDestVertex(const NGHolder &g, NFAVertex a) {
-    assert(a != NFAGraph::null_vertex());
+    assert(a != NGHolder::null_vertex());
 
-    NFAGraph::out_edge_iterator ii, iie;
+    NGHolder::out_edge_iterator ii, iie;
     tie(ii, iie) = out_edges(a, g);
     if (ii == iie) {
-        return NFAGraph::null_vertex();
+        return NGHolder::null_vertex();
     }
     NFAVertex b = target(*ii, g);
     if (a == b) {
         ++ii;
         if (ii == iie) {
-            return NFAGraph::null_vertex();
+            return NGHolder::null_vertex();
         }
 
         b = target(*ii, g);
         if (++ii != iie) {
-            return NFAGraph::null_vertex();
+            return NGHolder::null_vertex();
         }
     } else if (++ii != iie && (target(*ii, g) != a || ++ii != iie)) {
-        return NFAGraph::null_vertex();
+        return NGHolder::null_vertex();
     }
 
     assert(a != b);
@@ -105,23 +105,23 @@ NFAVertex getSoleDestVertex(const NGHolder &g, NFAVertex a) {
 }
 
 NFAVertex getSoleSourceVertex(const NGHolder &g, NFAVertex a) {
-    assert(a != NFAGraph::null_vertex());
+    assert(a != NGHolder::null_vertex());
 
     u32 idegree = in_degree(a, g);
     if (idegree != 1 && !(idegree == 2 && hasSelfLoop(a, g))) {
-        return NFAGraph::null_vertex();
+        return NGHolder::null_vertex();
     }
 
-    NFAGraph::in_edge_iterator ii, iie;
+    NGHolder::in_edge_iterator ii, iie;
     tie(ii, iie) = in_edges(a, g);
     if (ii == iie) {
-        return NFAGraph::null_vertex();
+        return NGHolder::null_vertex();
     }
     NFAVertex b = source(*ii, g);
     if (a == b) {
         ++ii;
         if (ii == iie) {
-            return NFAGraph::null_vertex();
+            return NGHolder::null_vertex();
        }
 
         b = source(*ii, g);
@@ -321,7 +321,7 @@ bool can_match_at_eod(const NGHolder &h) {
 }
 
 bool
can_only_match_at_eod(const NGHolder &g) {
-    NFAGraph::in_edge_iterator ie, ee;
+    NGHolder::in_edge_iterator ie, ee;
     tie(ie, ee) = in_edges(g.accept, g);
 
     return ie == ee;
diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h
index 9eb621e8..955c9b7b 100644
--- a/src/nfagraph/ng_util.h
+++ b/src/nfagraph/ng_util.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -65,7 +65,7 @@ bool is_dot(NFAVertex v, const GraphT &g) {
 template<class U>
 static really_inline
 void succ(const NGHolder &g, NFAVertex v, U *s) {
-    NFAGraph::adjacency_iterator ai, ae;
+    NGHolder::adjacency_iterator ai, ae;
     tie(ai, ae) = adjacent_vertices(v, g);
     s->insert(ai, ae);
 }
@@ -74,14 +74,14 @@ void succ(const NGHolder &g, NFAVertex v, U *s) {
 template<class U>
 static really_inline
 void pred(const NGHolder &g, NFAVertex v, U *p) {
-    NFAGraph::inv_adjacency_iterator it, ite;
+    NGHolder::inv_adjacency_iterator it, ite;
     tie(it, ite) = inv_adjacent_vertices(v, g);
     p->insert(it, ite);
 }
 
 /** returns a vertex with an out edge from v and is not v.
 * v must have exactly one out-edge excluding self-loops.
- * will return NFAGraph::null_vertex() if the preconditions don't hold.
+ * will return NGHolder::null_vertex() if the preconditions don't hold.
 */
 NFAVertex getSoleDestVertex(const NGHolder &g, NFAVertex v);
diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp
index 35ff7138..286cc7ae 100644
--- a/src/rose/rose_build_anchored.cpp
+++ b/src/rose/rose_build_anchored.cpp
@@ -476,7 +476,7 @@ NFAVertex extractLiteral(const NGHolder &h, ue2_literal *lit) {
     }
 
     if (lit_verts.empty()) {
-        return NFAGraph::null_vertex();
+        return NGHolder::null_vertex();
     }
 
     bool nocase = false;
@@ -488,7 +488,7 @@ NFAVertex extractLiteral(const NGHolder &h, ue2_literal *lit) {
         if (cr.isAlpha()) {
             bool cr_nocase = cr.count() != 1;
             if (case_set && cr_nocase != nocase) {
-                return NFAGraph::null_vertex();
+                return NGHolder::null_vertex();
             }
 
             case_set = true;
@@ -511,7 +511,7 @@ bool isSimple(const NGHolder &h, u32 *min_bound, u32 *max_bound,
     DEBUG_PRINTF("looking for simple case\n");
 
     NFAVertex lit_head = extractLiteral(h, lit);
-    if (lit_head == NFAGraph::null_vertex()) {
+    if (lit_head == NGHolder::null_vertex()) {
         DEBUG_PRINTF("no literal found\n");
         return false;
     }
diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp
index f99c391f..38586bcd 100644
--- a/src/rose/rose_build_misc.cpp
+++ b/src/rose/rose_build_misc.cpp
@@ -538,11 +538,11 @@ static
 bool requiresDedupe(const NGHolder &h, const ue2::flat_set<ReportID> &reports,
                     const Grey &grey) {
     /* TODO: tighten */
-    NFAVertex seen_vert = NFAGraph::null_vertex();
+    NFAVertex seen_vert = NGHolder::null_vertex();
 
     for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
         if (has_intersection(h[v].reports, reports)) {
-            if (seen_vert != NFAGraph::null_vertex()) {
+            if (seen_vert != NGHolder::null_vertex()) {
                 return true;
             }
             seen_vert = v;
@@ -551,7 +551,7 @@ bool requiresDedupe(const NGHolder &h, const ue2::flat_set<ReportID> &reports,
 
     for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) {
         if (has_intersection(h[v].reports, reports)) {
-            if (seen_vert != NFAGraph::null_vertex()) {
+            if (seen_vert != NGHolder::null_vertex()) {
                 return true;
             }
             seen_vert = v;
diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp
index c3cfb3dd..926bf6eb 100644
--- a/unit/internal/limex_nfa.cpp
+++ b/unit/internal/limex_nfa.cpp @@ -314,10 +314,8 @@ protected: // Reverse the graph and add some reports on the accept vertices. NGHolder g_rev(NFA_REV_PREFIX); reverseHolder(*g, g_rev); - NFAGraph::inv_adjacency_iterator ai, ae; - for (tie(ai, ae) = inv_adjacent_vertices(g_rev.accept, g_rev); ai != ae; - ++ai) { - g_rev[*ai].reports.insert(0); + for (NFAVertex v : inv_adjacent_vertices_range(g_rev.accept, g_rev)) { + g_rev[v].reports.insert(0); } nfa = constructReversedNFA(g_rev, type, cc); diff --git a/unit/internal/nfagraph_equivalence.cpp b/unit/internal/nfagraph_equivalence.cpp index 3677e1d2..3ca1923f 100644 --- a/unit/internal/nfagraph_equivalence.cpp +++ b/unit/internal/nfagraph_equivalence.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,7 +27,8 @@ */ /** - * Unit tests for checking the removeGraphEquivalences code in nfagraph/ng_equivalence.cpp. + * Unit tests for checking the removeGraphEquivalences code in + * nfagraph/ng_equivalence.cpp. */ #include "config.h" @@ -71,10 +72,9 @@ TEST(NFAGraph, RemoveEquivalence1) { ASSERT_EQ(2U, in_degree(g.accept, g)); // Find a vertex that goes right after startDs - NFAVertex a = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - a = *ai; + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex v : adjacent_vertices_range(g.startDs, g)) { + a = v; if (a == g.startDs) { continue; } @@ -87,8 +87,8 @@ TEST(NFAGraph, RemoveEquivalence1) { ASSERT_TRUE(a != nullptr); // There should be two edges from v to nodes with reachability 'b' and 'c' - NFAVertex b = NFAGraph::null_vertex(); - NFAVertex c = NFAGraph::null_vertex(); + NFAVertex b = NGHolder::null_vertex(); + NFAVertex c = NGHolder::null_vertex(); for (NFAVertex tmp : adjacent_vertices_range(a, g)) { const CharReach &tmpcr = g[tmp].char_reach; ASSERT_EQ(1U, tmpcr.count()); @@ -133,11 +133,9 @@ TEST(NFAGraph, RemoveEquivalence2) { ASSERT_EQ(1U, in_degree(g.accept, g)); // Find a vertex leading to accept - NFAVertex a = NFAGraph::null_vertex(); - NFAGraph::inv_adjacency_iterator ai, ae; - for (tie(ai, ae) = inv_adjacent_vertices(g.accept, g); ai != ae; - ++ai) { - a = *ai; + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) { + a = v; if (a == g.accept) { continue; } @@ -150,8 +148,8 @@ TEST(NFAGraph, RemoveEquivalence2) { ASSERT_TRUE(a != nullptr); // There should be two edges from v to nodes with reachability 'b' and 'c' - NFAVertex b = NFAGraph::null_vertex(); - NFAVertex c = NFAGraph::null_vertex(); + NFAVertex b = NGHolder::null_vertex(); + NFAVertex c = NGHolder::null_vertex(); for (NFAVertex tmp : inv_adjacent_vertices_range(a, g)) { const CharReach &tmpcr = g[tmp].char_reach; ASSERT_EQ(1U, tmpcr.count()); @@ -197,10 +195,9 @@ TEST(NFAGraph, RemoveEquivalence3) { ASSERT_EQ(2U, in_degree(g.accept, g)); // Find a vertex 'a' that goes right after startDs - NFAVertex a = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - a = *ai; + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex v : adjacent_vertices_range(g.startDs, g)) { + a = v; if (a == g.startDs) { continue; } @@ -234,10 +231,9 @@ TEST(NFAGraph, RemoveEquivalence3) { ASSERT_TRUE(edge(dot2, 
dot1, g).second); // now, let's find X and Y nodes - NFAVertex X = NFAGraph::null_vertex(); - NFAVertex Y = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(dot2, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; + NFAVertex X = NGHolder::null_vertex(); + NFAVertex Y = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(dot2, g)) { // we already know about dot1, so skip it if (tmp == dot1) { @@ -290,12 +286,9 @@ TEST(NFAGraph, RemoveEquivalence4) { ASSERT_EQ(1U, in_degree(g.accept, g)); // Find X and Y nodes that are connected to startDs - NFAVertex X = NFAGraph::null_vertex(); - NFAVertex Y = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; - + NFAVertex X = NGHolder::null_vertex(); + NFAVertex Y = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(g.startDs, g)) { // skip startDs if (tmp == g.startDs) { continue; @@ -341,10 +334,8 @@ TEST(NFAGraph, RemoveEquivalence4) { ASSERT_TRUE(edge(dot2, dot1, g).second); // now find 'a' - NFAVertex a = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(dot2, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; - + NFAVertex a = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(dot2, g)) { // skip dot1 if (tmp == dot1) { continue; @@ -392,10 +383,9 @@ TEST(NFAGraph, RemoveEquivalence5) { ASSERT_EQ(1U, in_degree(g.accept, g)); // find first vertex and ensure it has a self loop - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - v = *ai; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(g.startDs, g)) { + v = t; if (v == g.startDs) { continue; } @@ -409,15 +399,13 @@ TEST(NFAGraph, RemoveEquivalence5) { ASSERT_TRUE(v != nullptr); // now, find the vertex leading to accept - NFAVertex v2 = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) { - NFAVertex tmp = *ai; - + NFAVertex v2 = NGHolder::null_vertex(); + for (NFAVertex tmp : adjacent_vertices_range(v, g)) { // skip self-loop if (tmp == v) { continue; } - v2 = *ai; + v2 = tmp; // get char reach const CharReach tmpcr = g[tmp].char_reach; @@ -450,10 +438,9 @@ TEST(NFAGraph, RemoveEquivalence6) { ASSERT_EQ(1U, in_degree(g.accept, g)); // find that vertex and ensure it has no self loops and an edge to accept - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.startDs, g); ai != ae; ++ai) { - v = *ai; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(g.startDs, g)) { + v = t; if (v == g.startDs) { continue; } @@ -492,13 +479,12 @@ TEST(NFAGraph, RemoveEquivalence7) { ASSERT_EQ(1U, in_degree(g.accept, g)); // find that vertex and ensure it's a dot self loop and has one outgoing edge - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(g.start, g); ai != ae; ++ai) { - if (*ai == g.startDs) { + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(g.start, g)) { + if (t == g.startDs) { continue; } - v = *ai; + v = t; // check if it has the right char reach const CharReach &tmpcr = g[v].char_reach; ASSERT_TRUE(tmpcr.all()); @@ -509,13 +495,13 @@ TEST(NFAGraph, RemoveEquivalence7) { ASSERT_TRUE(v != nullptr); // find the next vertex and ensure it has an 
edge to accept - NFAVertex v2 = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) { + NFAVertex v2 = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(v, g)) { // skip self loop - if (*ai == v) { + if (t == v) { continue; } - v2 = *ai; + v2 = t; // check if it has the right char reach const CharReach &tmpcr = g[v2].char_reach; ASSERT_EQ(1U, tmpcr.count()); diff --git a/unit/internal/nfagraph_redundancy.cpp b/unit/internal/nfagraph_redundancy.cpp index 16266453..acb3cc7b 100644 --- a/unit/internal/nfagraph_redundancy.cpp +++ b/unit/internal/nfagraph_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,7 +27,8 @@ */ /** - * Unit tests for checking the removeRedundancy code in nfagraph/ng_redundancy.cpp. + * Unit tests for checking the removeRedundancy code in + * nfagraph/ng_redundancy.cpp. */ #include "config.h" @@ -62,15 +63,17 @@ TEST(NFAGraph, RemoveRedundancy1) { // Our graph should only have two non-special nodes ASSERT_EQ((size_t)N_SPECIALS + 2, num_vertices(*graph)); - // Dot-star start state should be connected to itself and a single other vertex + // Dot-star start state should be connected to itself and a single other + // vertex ASSERT_EQ(2U, out_degree(graph->startDs, g)); // That single vertex should have reachability [ab] - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(graph->startDs, g); ai != ae; ++ai) { - v = *ai; - if (v != graph->startDs) break; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(graph->startDs, g)) { + v = t; + if (v != graph->startDs) { + break; + } } const CharReach &cr = g[v].char_reach; ASSERT_EQ(2U, cr.count()); @@ -103,35 +106,39 @@ TEST(NFAGraph, RemoveRedundancy2) { // Our graph should now have only 3 non-special vertices ASSERT_EQ((size_t)N_SPECIALS + 3, num_vertices(*graph)); - // Dot-star start state should be connected to itself and a single other vertex + // Dot-star start state should be connected to itself and a single other + // vertex ASSERT_EQ(2U, out_degree(graph->startDs, g)); // That single vertex should have reachability [a] - NFAVertex v = NFAGraph::null_vertex(); - NFAGraph::adjacency_iterator ai, ae; - for (tie(ai, ae) = adjacent_vertices(graph->startDs, g); ai != ae; ++ai) { - v = *ai; - if (v != graph->startDs) break; + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(graph->startDs, g)) { + v = t; + if (v != graph->startDs) { + break; + } } const CharReach &cr = g[v].char_reach; ASSERT_EQ(1U, cr.count()); ASSERT_TRUE(cr.test('a')); - // 'a' should have two out edges: one to a dot with a cycle (.*) and one to 'c' + // 'a' should have two out edges: one to a dot with a cycle (.*) and one to + // 'c' ASSERT_EQ(2U, out_degree(v, g)); - NFAVertex dotstar = NFAGraph::null_vertex(), vc = NFAGraph::null_vertex(); - for (tie(ai, ae) = adjacent_vertices(v, g); ai != ae; ++ai) { - const CharReach &cr2 = g[*ai].char_reach; + NFAVertex dotstar = NGHolder::null_vertex(); + NFAVertex vc = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(v, g)) { + const CharReach &cr2 = g[t].char_reach; if (cr2.count() == 1 && cr2.test('c')) { - vc = *ai; + vc = t; } else if (cr2.all()) { - dotstar = *ai; + dotstar = t; } 
else { FAIL(); } } - ASSERT_TRUE(vc != NFAGraph::null_vertex()); - ASSERT_TRUE(dotstar != NFAGraph::null_vertex()); + ASSERT_TRUE(vc != NGHolder::null_vertex()); + ASSERT_TRUE(dotstar != NGHolder::null_vertex()); // Dot-star node should have a self-loop and an edge to vertex 'c' ASSERT_EQ(2U, out_degree(dotstar, g)); diff --git a/unit/internal/nfagraph_util.cpp b/unit/internal/nfagraph_util.cpp index 81dfd682..135276dd 100644 --- a/unit/internal/nfagraph_util.cpp +++ b/unit/internal/nfagraph_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -85,24 +85,23 @@ TEST(NFAGraph, split1) { splitGraph(src, pivot, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(3U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE((cr >= 'a' && cr <= 'c')); } ASSERT_EQ(8U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'b' && cr <= 'i'); } @@ -137,24 +136,23 @@ TEST(NFAGraph, split2) { splitGraph(src, pivot, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(3U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'a' && cr <= 'c'); } ASSERT_EQ(3U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'b' && cr <= 'd'); } @@ -211,24 +209,23 @@ TEST(NFAGraph, split3) { splitGraph(src, pivots, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(7U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE((cr >= 'a' && cr <= 'g')); } ASSERT_EQ(2U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'h' && cr <= 'i'); } @@ -289,13 +286,12 @@ TEST(NFAGraph, split4) { splitGraph(src, pivots, &lhs, &lhs_map, &rhs, &rhs_map); ASSERT_EQ(7U + N_SPECIALS, num_vertices(lhs)); - NFAGraph::vertex_iterator vi, ve; - for (tie(vi, ve) = 
vertices(lhs); vi != ve; ++vi) { - if (is_special(*vi, lhs)) { + for (NFAVertex v : vertices_range(lhs)) { + if (is_special(v, lhs)) { continue; } - u32 cr = lhs[*vi].char_reach.find_first(); + u32 cr = lhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE((cr >= 'a' && cr <= 'g')); } @@ -304,12 +300,12 @@ TEST(NFAGraph, split4) { ASSERT_TRUE(edge(lhs_map[d], lhs_map[d], lhs).second); ASSERT_EQ(2U + N_SPECIALS, num_vertices(rhs) ); - for (tie(vi, ve) = vertices(rhs); vi != ve; ++vi) { - if (is_special(*vi, rhs)) { + for (NFAVertex v : vertices_range(rhs)) { + if (is_special(v, rhs)) { continue; } - u32 cr = rhs[*vi].char_reach.find_first(); + u32 cr = rhs[v].char_reach.find_first(); SCOPED_TRACE(cr); ASSERT_TRUE(cr >= 'h' && cr <= 'i'); } diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index 30629f71..9fa6743e 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -158,7 +158,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps, DEBUG_PRINTF("dequeuing path %s, back %u\n", pathToString(g, *p).c_str(), g[u].index); - NFAGraph::adjacency_iterator ai, ae; + NGHolder::adjacency_iterator ai, ae; for (tie(ai, ae) = adjacent_vertices(u, g); ai != ae; ++ai) { NFAVertex v = *ai; diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp index 4d188d78..60ff0a17 100644 --- a/util/ng_find_matches.cpp +++ b/util/ng_find_matches.cpp @@ -76,7 +76,7 @@ struct fmstate { fmstate(const NGHolder &g, bool som_in, bool utf8_in, bool aSD_in, const ReportManager &rm_in) : num_states(num_vertices(g)), states(num_states), next(num_states), - vertices(num_vertices(g), NFAGraph::null_vertex()), som(som_in), + vertices(num_vertices(g), NGHolder::null_vertex()), som(som_in), utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in), accept(num_states), accept_with_eod(num_states) { // init states From 69933edf680fc9b2304064601a1f86515fc830e8 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Thu, 30 Jun 2016 09:50:08 +1000 Subject: [PATCH 100/166] truffle hwlm accel --- src/hwlm/hwlm.c | 8 +++++++- src/hwlm/hwlm_build.cpp | 12 +++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c index 054f05c4..2e16f1ac 100644 --- a/src/hwlm/hwlm.c +++ b/src/hwlm/hwlm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,7 @@ #include "fdr/fdr.h" #include "nfa/accel.h" #include "nfa/shufti.h" +#include "nfa/truffle.h" #include "nfa/vermicelli.h" #include @@ -64,8 +65,13 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr, case ACCEL_SHUFTI: DEBUG_PRINTF("single shufti\n"); return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end); + case ACCEL_TRUFFLE: + DEBUG_PRINTF("truffle\n"); + return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end); default: /* no acceleration, fall through and return current ptr */ + DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type); + assert(aux->accel_type == ACCEL_NONE); return ptr; } } diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index b3978017..27361c8c 100644 --- 
a/src/hwlm/hwlm_build.cpp
+++ b/src/hwlm/hwlm_build.cpp
@@ -38,6 +38,7 @@
 #include "ue2common.h"
 #include "fdr/fdr_compile.h"
 #include "nfa/shufticompile.h"
+#include "nfa/trufflecompile.h"
 #include "util/alloc.h"
 #include "util/bitutils.h"
 #include "util/charreach.h"
@@ -372,12 +373,9 @@ void findForwardAccelScheme(const vector<hwlmLiteral> &lits,
         for (u32 i = 0; i < MAX_ACCEL_OFFSET && i < lit.s.length(); i++) {
             unsigned char c = lit.s[i];
             if (lit.nocase) {
-                DEBUG_PRINTF("adding %02hhx to %u\n", mytoupper(c), i);
-                DEBUG_PRINTF("adding %02hhx to %u\n", mytolower(c), i);
                 reach[i].set(mytoupper(c));
                 reach[i].set(mytolower(c));
             } else {
-                DEBUG_PRINTF("adding %02hhx to %u\n", c, i);
                 reach[i].set(c);
             }
         }
@@ -397,7 +395,7 @@ void findForwardAccelScheme(const vector<hwlmLiteral> &lits,
     assert(min_offset <= min_len);
 
     if (min_count > MAX_SHUFTI_WIDTH) {
-        DEBUG_PRINTF("min shufti with %u chars is too wide\n", min_count);
+        DEBUG_PRINTF("FAIL: min shufti with %u chars is too wide\n", min_count);
         return;
     }
 
@@ -410,7 +408,11 @@ void findForwardAccelScheme(const vector<hwlmLiteral> &lits,
         return;
     }
 
-    DEBUG_PRINTF("fail\n");
+    truffleBuildMasks(cr, &aux->truffle.mask1, &aux->truffle.mask2);
+    DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n",
+                 describeClass(cr).c_str(), cr.count(), min_offset);
+    aux->truffle.accel_type = ACCEL_TRUFFLE;
+    aux->truffle.offset = verify_u8(min_offset);
 }
 
 static

From c58424cab3bcb82da8c809c66a99982cc017631c Mon Sep 17 00:00:00 2001
From: Alex Coyte
Date: Fri, 1 Jul 2016 14:04:55 +1000
Subject: [PATCH 101/166] reduce character classes for hwlm accel

---
 src/hwlm/hwlm_build.cpp | 39 ++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp
index 27361c8c..42d6bbdf 100644
--- a/src/hwlm/hwlm_build.cpp
+++ b/src/hwlm/hwlm_build.cpp
@@ -347,6 +347,25 @@ void filterLits(const vector<hwlmLiteral> &lits, hwlm_group_t expected_groups,
     }
 }
 
+static
+bool litGuardedByCharReach(const CharReach &cr, const hwlmLiteral &lit,
+                           u32 max_offset) {
+    for (u32 i = 0; i <= max_offset && i < lit.s.length(); i++) {
+        unsigned char c = lit.s[i];
+        if (lit.nocase) {
+            if (cr.test(mytoupper(c)) && cr.test(mytolower(c))) {
+                return true;
+            }
+        } else {
+            if (cr.test(c)) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
 static
 void findForwardAccelScheme(const vector<hwlmLiteral> &lits,
                             hwlm_group_t expected_groups, AccelAux *aux) {
@@ -364,26 +383,33 @@ void findForwardAccelScheme(const vector<hwlmLiteral> &lits,
         return;
     }
 
+    /* look for shufti/truffle */
+
     vector<CharReach> reach(MAX_ACCEL_OFFSET, CharReach());
     for (const auto &lit : lits) {
         if (!(lit.groups & expected_groups)) {
             continue;
         }
 
-        for (u32 i = 0; i < MAX_ACCEL_OFFSET && i < lit.s.length(); i++) {
-            unsigned char c = lit.s[i];
+        for (u32 i = 0; i < MAX_ACCEL_OFFSET; i++) {
+            CharReach &reach_i = reach[i];
+
+            if (litGuardedByCharReach(reach_i, lit, i)) {
+                continue;
+            }
+            unsigned char c = i < lit.s.length() ? lit.s[i] : lit.s.back();
             if (lit.nocase) {
-                reach[i].set(mytoupper(c));
-                reach[i].set(mytolower(c));
+                reach_i.set(mytoupper(c));
+                reach_i.set(mytolower(c));
             } else {
-                reach[i].set(c);
+                reach_i.set(c);
             }
         }
     }
 
     u32 min_count = ~0U;
     u32 min_offset = ~0U;
-    for (u32 i = 0; i < min_len; i++) {
+    for (u32 i = 0; i < MAX_ACCEL_OFFSET; i++) {
         size_t count = reach[i].count();
         DEBUG_PRINTF("offset %u is %s (reach %zu)\n", i,
                      describeClass(reach[i]).c_str(), count);
         if (count < min_count) {
             min_count = count;
             min_offset = i;
         }
     }
-    assert(min_offset <= min_len);
 
     if (min_count > MAX_SHUFTI_WIDTH) {
         DEBUG_PRINTF("FAIL: min shufti with %u chars is too wide\n", min_count);

From 790683b64142d9e79da330cebf2b44baa00eae04 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Wed, 6 Jul 2016 15:49:03 +1000
Subject: [PATCH 102/166] rose: don't always dedupe small-block lit variants

---
 src/rose/rose_build_misc.cpp | 60 ++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp
index 38586bcd..186f4d16 100644
--- a/src/rose/rose_build_misc.cpp
+++ b/src/rose/rose_build_misc.cpp
@@ -581,8 +581,12 @@ public:
     bool requiresDedupeSupport(
         const ue2::flat_set<ReportID> &reports) const override;
 
+private:
+    bool hasSafeMultiReports(const ue2::flat_set<ReportID> &reports) const;
+
     const RoseBuildImpl &tbi;
-    map<ReportID, set<RoseVertex>> vert_map;
+    map<ReportID, set<RoseVertex>> vert_map; //!< ordinary literals
+    map<ReportID, set<RoseVertex>> sb_vert_map; //!< small block literals
     map<ReportID, set<suffix_id>> suffix_map;
     map<ReportID, set<const OutfixInfo *>> outfix_map;
     map<ReportID, set<const raw_puff *>> puff_map;
@@ -602,10 +606,14 @@ RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
     set<suffix_id> suffixes;
 
     for (auto v : vertices_range(g)) {
-        // Literals in the small block table don't count as dupes: although
-        // they have copies in the anchored table, the two are never run in the
-        // same runtime invocation. All other literals count, though.
-        if (!tbi.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK)) {
+        // Literals in the small block table are "shadow" copies of literals in
+        // the other tables that do not run in the same runtime invocation.
+        // Dedupe key assignment will be taken care of by the real literals.
+        if (tbi.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK)) {
+            for (const auto &report_id : g[v].reports) {
+                sb_vert_map[report_id].insert(v);
+            }
+        } else {
             for (const auto &report_id : g[v].reports) {
                 vert_map[report_id].insert(v);
             }
@@ -673,19 +681,54 @@ bool literalsCouldRace(const rose_literal_id &lit1,
     return r.first == smaller->rend();
 }
 
+bool RoseDedupeAuxImpl::hasSafeMultiReports(
+    const flat_set<ReportID> &reports) const {
+    if (reports.size() <= 1) {
+        return true;
+    }
+
+    /* We have more than one ReportID corresponding to the external ID that is
+     * presented to the user. These may differ in offset adjustment, bounds
+     * checks, etc. */
+
+    /* TODO: work out if these differences will actually cause problems */
+
+    /* One common case where we know we don't have a problem is if there are
+     * precisely two reports, one for the main Rose path and one for the
+     * "small block matcher" path. */
*/ + if (reports.size() == 2) { + ReportID id1 = *reports.begin(); + ReportID id2 = *reports.rbegin(); + + bool has_verts_1 = contains(vert_map, id1); + bool has_verts_2 = contains(vert_map, id2); + bool has_sb_verts_1 = contains(sb_vert_map, id1); + bool has_sb_verts_2 = contains(sb_vert_map, id2); + + if (has_verts_1 != has_verts_2 && has_sb_verts_1 != has_sb_verts_2) { + DEBUG_PRINTF("two reports, one full and one small block: ok\n"); + return true; + } + } + + DEBUG_PRINTF("more than one report\n"); + return false; +} + bool RoseDedupeAuxImpl::requiresDedupeSupport( const ue2::flat_set &reports) const { /* TODO: this could be expanded to check for offset or character constraints */ + DEBUG_PRINTF("reports: %s\n", as_string_list(reports).c_str()); + const RoseGraph &g = tbi.g; bool has_suffix = false; bool has_outfix = false; - if (reports.size() > 1) { - /* may have offset adjust */ - /* TODO: work out if the offset adjust will actually cause problems */ + if (!hasSafeMultiReports(reports)) { + DEBUG_PRINTF("multiple reports not safe\n"); return true; } @@ -697,7 +740,6 @@ bool RoseDedupeAuxImpl::requiresDedupeSupport( if (contains(vert_map, r)) { insert(&roles, vert_map.at(r)); } - if (contains(suffix_map, r)) { insert(&suffixes, suffix_map.at(r)); } From 49bb3b5c82d62530b03ca61143e9a788b23dcc05 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 13 May 2016 09:39:26 +1000 Subject: [PATCH 103/166] simd_utils: setbit/clearbit by loading 1-bit mask --- src/util/simd_utils.c | 20 ++++++++++ src/util/simd_utils.h | 85 +++++++++++++++---------------------------- 2 files changed, 50 insertions(+), 55 deletions(-) diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c index 5f354270..a86c568d 100644 --- a/src/util/simd_utils.c +++ b/src/util/simd_utils.c @@ -26,6 +26,10 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/** \file + * \brief Lookup tables to support SIMD operations. + */ + #include "simd_utils.h" const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { @@ -38,3 +42,19 @@ const char vbs_mask_data[] ALIGN_CL_DIRECTIVE = { 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, }; + +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +const u8 simd_onebit_masks[] ALIGN_CL_DIRECTIVE = { + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, +}; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 8cea458e..3544629f 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -245,47 +245,37 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } +extern const u8 simd_onebit_masks[]; + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 31; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + // switches on bit N in the given vector. static really_inline void setbit128(m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. 
- union { - m128 simd; - u8 bytes[sizeof(m128)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b |= 1U << (n % 8); - - *ptr = x.simd; + *ptr = or128(mask1bit128(n), *ptr); } // switches off bit N in the given vector. static really_inline void clearbit128(m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - union { - m128 simd; - u8 bytes[sizeof(m128)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b &= ~(1U << (n % 8)); - - *ptr = x.simd; + *ptr = andnot128(mask1bit128(n), *ptr); } // tests bit N in the given vector. static really_inline char testbit128(const m128 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - const char *bytes = (const char *)ptr; - return !!(bytes[n / 8] & (1 << (n % 8))); + const m128 mask = mask1bit128(n); +#if defined(__SSE4_1__) + return !_mm_testz_si128(mask, *ptr); +#else + return isnonzero128(and128(mask, *ptr)); +#endif } // offset must be an immediate @@ -551,6 +541,14 @@ m256 loadbytes256(const void *ptr, unsigned int n) { return a; } +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 31; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + #if !defined(__AVX2__) // switches on bit N in the given vector. static really_inline @@ -599,42 +597,19 @@ char testbit256(const m256 *ptr, unsigned int n) { // switches on bit N in the given vector. static really_inline void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - // We should be able to figure out a better way than this. - union { - m256 simd; - u8 bytes[sizeof(m256)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b |= 1U << (n % 8); - - *ptr = x.simd; + *ptr = or256(mask1bit256(n), *ptr); } -// TODO: can we do this better in avx-land? static really_inline void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - union { - m256 simd; - u8 bytes[sizeof(m256)]; - } x; - x.simd = *ptr; - - u8 *b = &x.bytes[n / 8]; - *b &= ~(1U << (n % 8)); - - *ptr = x.simd; + *ptr = andnot256(mask1bit256(n), *ptr); } // tests bit N in the given vector. 
static really_inline char testbit256(const m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - const char *bytes = (const char *)ptr; - return !!(bytes[n / 8] & (1 << (n % 8))); + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, *ptr); } static really_really_inline From 85f049edb28f5dda7969fc973b670415c5dd39ed Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 5 Jul 2016 16:36:04 +1000 Subject: [PATCH 104/166] fdr: remove extra control ptr --- src/fdr/fdr.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index c79db037..f973f639 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -312,10 +312,9 @@ void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, } static really_inline -void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal, +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, const u32 *confBase, const struct FDR_Runtime_Args *a, - const u8 *ptr, hwlmcb_rv_t *control, u32 *last_match_id, - struct zone *z) { + const u8 *ptr, u32 *last_match_id, struct zone *z) { const u8 bucket = 8; const u8 pullback = 1; @@ -351,13 +350,13 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal, continue; } *last_match_id = id; - *controlVal = a->cb(ptr_main + byte - a->buf, - ptr_main + byte - a->buf, id, a->ctxt); + *control = a->cb(ptr_main + byte - a->buf, ptr_main + byte - a->buf, + id, a->ctxt); continue; } u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a)); - confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback, - control, last_match_id, confVal); + confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback, control, + last_match_id, confVal); } while (unlikely(!!*conf)); } @@ -680,9 +679,9 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, itPtr += ITER_BYTES) { \ if (unlikely(itPtr > tryFloodDetect)) { \ tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ - &floodBackoff, &controlVal, \ + &floodBackoff, &control, \ ITER_BYTES); \ - if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ return HWLM_TERMINATED; \ } \ } \ @@ -691,11 +690,11 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, u64a conf8; \ get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \ ft, &conf0, &conf8, &s); \ - do_confirm_fdr(&conf0, 0, &controlVal, confBase, a, itPtr, \ - control, &last_match_id, zz); \ - do_confirm_fdr(&conf8, 8, &controlVal, confBase, a, itPtr, \ - control, &last_match_id, zz); \ - if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ + do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ return HWLM_TERMINATED; \ } \ } /* end for loop */ \ @@ -704,8 +703,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; u64a domain_mask_adjusted = fdr->domainMask << 1; From b6a77b73299ebb81dc4f1beade16e8b5cae386ab Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 6 Jul 2016 09:22:31 +1000 Subject: [PATCH 105/166] teddy: remove extra 
control ptr --- src/fdr/teddy.c | 52 ++++++++++------------- src/fdr/teddy_avx2.c | 76 +++++++++++++++------------------- src/fdr/teddy_runtime_common.h | 7 ++-- 3 files changed, 58 insertions(+), 77 deletions(-) diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 2406a167..57fcaef1 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -82,12 +82,12 @@ do { \ u64a hi = movq(rshiftbyte_m128(var, 8)); \ if (unlikely(lo)) { \ conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(hi)) { \ conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -102,22 +102,22 @@ do { \ u32 part4 = movd(rshiftbyte_m128(var, 12)); \ if (unlikely(part1)) { \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part2)) { \ conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part3)) { \ conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part4)) { \ conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -182,8 +182,7 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -229,7 +228,7 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -237,8 +236,7 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -284,7 +282,7 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -292,8 +290,7 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -343,7 +340,7 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, 
do_confWithBitMany_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -351,8 +348,7 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -402,7 +398,7 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -410,8 +406,7 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -464,7 +459,7 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -472,8 +467,7 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -526,7 +520,7 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -534,8 +528,7 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -589,7 +582,7 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -597,8 +590,7 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -652,6 +644,6 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 5ea4e368..8f52027d 100644 --- 
a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -121,22 +121,22 @@ do { \ u64a part4 = extract64from256(r, 1); \ if (unlikely(part1)) { \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part2)) { \ conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part3)) { \ conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part4)) { \ conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -158,41 +158,41 @@ do { \ u32 part8 = extract32from256(r, 3); \ if (unlikely(part1)) { \ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part2)) { \ conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ } \ if (unlikely(part3)) { \ conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part4)) { \ conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part5)) { \ conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part6)) { \ conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part7)) { \ conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ if (unlikely(part8)) { \ conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \ - control, &last_match); \ + &control, &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } \ @@ -208,7 +208,7 @@ do { \ bit_array_fast_teddy(lo, bitArr, &arrCnt, offset); \ bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2); \ for (u32 i = 0; i < arrCnt; i++) { \ - conf_fn(bitArr[i], confBase, reason, a, ptr, control, \ + conf_fn(bitArr[i], confBase, reason, a, ptr, &control, \ &last_match); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ @@ -484,8 +484,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -532,7 +531,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -540,8 +539,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; 
const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -588,7 +586,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -596,8 +594,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -647,7 +644,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -655,8 +652,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -706,7 +702,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -714,8 +710,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -768,7 +763,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -776,8 +771,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -830,7 +824,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -838,8 +832,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = 
a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -896,7 +889,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -904,8 +897,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -962,7 +954,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -970,8 +962,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -1032,7 +1023,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } @@ -1040,8 +1031,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, const struct FDR_Runtime_Args *a) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t *control = &controlVal; + hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -1102,7 +1092,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); } - *a->groups = controlVal; + *a->groups = control; return HWLM_SUCCESS; } diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index c50b4d16..6ccaeb05 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -51,8 +51,8 @@ extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; #define CHECK_HWLM_TERMINATE_MATCHING \ do { \ - if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ - *a->groups = controlVal; \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + *a->groups = control; \ return HWLM_TERMINATED; \ } \ } while (0); @@ -61,8 +61,7 @@ do { \ do { \ if (unlikely(ptr > tryFloodDetect)) { \ tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \ - &floodBackoff, &controlVal, \ - iterBytes); \ + &floodBackoff, &control, iterBytes); \ CHECK_HWLM_TERMINATE_MATCHING; \ } \ } while (0); From 42f23c2c91e4740155a473af54cbb236ed20e493 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 6 Jul 2016 09:26:42 +1000 Subject: [PATCH 106/166] teddy: no need to write control out at the end --- 
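The deleted write-backs have no reader: once the callback returns HWLM_TERMINATE_MATCHING the scan is abandoned, and the following patch removes the groups pointer from the runtime args entirely, so neither the terminate path nor the *a->groups = control at each function tail is ever observed. A simplified before/after sketch of CHECK_HWLM_TERMINATE_MATCHING (restating the hunk below, no new logic):

    /* before: final group state written back even though the scan stops */
    if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
        *a->groups = control;
        return HWLM_TERMINATED;
    }

    /* after: control stays in a local and is simply dropped */
    if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
        return HWLM_TERMINATED;
    }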
src/fdr/teddy.c | 16 ++++++++-------- src/fdr/teddy_avx2.c | 20 ++++++++++---------- src/fdr/teddy_runtime_common.h | 1 - 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 57fcaef1..462b57df 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -228,7 +228,7 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -282,7 +282,7 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -340,7 +340,7 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -398,7 +398,7 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -459,7 +459,7 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -520,7 +520,7 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -582,7 +582,7 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -644,6 +644,6 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 8f52027d..52e75cb4 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -531,7 +531,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -586,7 +586,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -644,7 +644,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -702,7 +702,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -763,7 +763,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - 
*a->groups = control; + return HWLM_SUCCESS; } @@ -824,7 +824,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -889,7 +889,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -954,7 +954,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, &res_old_3, p_mask, val_0); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -1023,7 +1023,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } @@ -1092,7 +1092,7 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); } - *a->groups = control; + return HWLM_SUCCESS; } diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index 6ccaeb05..dc65c70a 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -52,7 +52,6 @@ extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; #define CHECK_HWLM_TERMINATE_MATCHING \ do { \ if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - *a->groups = control; \ return HWLM_TERMINATED; \ } \ } while (0); From 9346a9090e03c0bcd9a6091d01d99d6f0c11fe7b Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 7 Jul 2016 12:53:09 +1000 Subject: [PATCH 107/166] fdr: remove groups from struct FDR_Runtime_Args --- src/fdr/fdr.c | 15 ++++++----- src/fdr/fdr_internal.h | 1 - src/fdr/teddy.c | 32 +++++++++++------------ src/fdr/teddy.h | 59 ++++++++++++++++++++++++++++-------------- src/fdr/teddy_avx2.c | 40 ++++++++++++++-------------- 5 files changed, 84 insertions(+), 63 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index f973f639..4230c2b1 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -702,8 +702,8 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { - hwlmcb_rv_t control = *a->groups; + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; u64a domain_mask_adjusted = fdr->domainMask << 1; @@ -768,7 +768,10 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, #define ONLY_AVX2(func) NULL #endif -typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a); +typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + static const FDRFUNCTYPE funcs[] = { fdr_engine_exec, ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast), @@ -811,7 +814,6 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, start, cb, ctxt, - &groups, nextFloodDetect(buf, len, FLOOD_BACKOFF_START), 0 }; @@ -819,7 +821,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, return HWLM_SUCCESS; } else { assert(funcs[fdr->engineID]); - return funcs[fdr->engineID](fdr, &a); + return funcs[fdr->engineID](fdr, &a, groups); } } @@ -837,7 +839,6 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, start, cb, ctxt, 
- &groups, nextFloodDetect(buf, len, FLOOD_BACKOFF_START), /* we are guaranteed to always have 16 initialised bytes at the end of * the history buffer (they may be garbage). */ @@ -850,7 +851,7 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, ret = HWLM_SUCCESS; } else { assert(funcs[fdr->engineID]); - ret = funcs[fdr->engineID](fdr, &a); + ret = funcs[fdr->engineID](fdr, &a, groups); } fdrPackState(fdr, &a, stream_state); diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index cde13f6c..6272b69e 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -105,7 +105,6 @@ struct FDR_Runtime_Args { size_t start_offset; HWLMCallback cb; void *ctxt; - hwlm_group_t *groups; const u8 *firstFloodDetect; const u64a histBytes; }; diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 462b57df..9f8b5104 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -179,10 +179,10 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, } hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -233,10 +233,10 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -287,10 +287,10 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -345,10 +345,10 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -403,10 +403,10 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -464,10 +464,10 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 
*ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -525,10 +525,10 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -587,10 +587,10 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h index f3902723..e2936723 100644 --- a/src/fdr/teddy.h +++ b/src/fdr/teddy.h @@ -33,64 +33,85 @@ #ifndef TEDDY_H_ #define TEDDY_H_ +#include "hwlm/hwlm.h" // for hwlm_group_t + struct FDR; // forward declaration from fdr_internal.h struct FDR_Runtime_Args; hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); #if defined(__AVX2__) hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t 
fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); + const struct FDR_Runtime_Args *a, + hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a); +hwlm_error_t +fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); #endif /* __AVX2__ */ diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 52e75cb4..428c9446 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -481,10 +481,10 @@ const u32 * getConfBase_avx2(const struct Teddy *teddy, u8 numMask) { } hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -536,10 +536,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -591,10 +591,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -649,10 +649,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -707,10 +707,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -768,10 +768,10 @@ hwlm_error_t 
fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -829,10 +829,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -894,10 +894,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -959,10 +959,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; @@ -1028,10 +1028,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, } hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a) { + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { const u8 *buf_end = a->buf + a->len; const u8 *ptr = a->buf + a->start_offset; - hwlmcb_rv_t control = *a->groups; u32 floodBackoff = FLOOD_BACKOFF_START; const u8 *tryFloodDetect = a->firstFloodDetect; u32 last_match = (u32)-1; From 39c6a0c7bfea6d8e4378b06bb8d13ec54b827b63 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 7 Jul 2016 10:25:49 +1000 Subject: [PATCH 108/166] rose: check literal bounds when building SB table Literals that cannot lead to a report in the first ROSE_SMALL_BLOCK_LEN bytes may be dropped from the small block table. 
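As a worked example (hypothetical pattern, chosen for illustration): in /^.{100}foobar/ the literal "foobar" is only six bytes, so the old length-only filter would keep it, yet its earliest possible report is at offset 106; whenever that bound exceeds ROSE_SMALL_BLOCK_LEN, the literal can now be dropped. The filter added to fillHamsterLiteralList() is essentially:

    // sketch of the new per-literal check (names abridged from the patch)
    u64a min_report = literalMinReportOffset(build, lit, info); // 106 here
    if (min_report > max_offset) { // max_offset == ROSE_SMALL_BLOCK_LEN
        continue; // cannot report within a small block; drop the literal
    }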
--- src/rose/rose_build_dump.cpp | 20 +---- src/rose/rose_build_matchers.cpp | 136 ++++++++++++++++++++++++++++--- src/rose/rose_build_matchers.h | 8 +- 3 files changed, 133 insertions(+), 31 deletions(-) diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 2c3f326e..fc60af4c 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -458,18 +458,6 @@ void dumpTestLiterals(const string &filename, const vector &lits) { of.close(); } -namespace { -struct LongerThanLimit { - explicit LongerThanLimit(size_t len) : max_len(len) {} - bool operator()(const hwlmLiteral &lit) const { - return lit.s.length() > max_len; - } - - private: - size_t max_len; -}; -} - static void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED); @@ -481,12 +469,10 @@ void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED); dumpTestLiterals(base + "rose_eod_test_literals.txt", lits); - lits = fillHamsterLiteralList(build, ROSE_FLOATING); - auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); + lits = fillHamsterLiteralList(build, ROSE_FLOATING, ROSE_SMALL_BLOCK_LEN); + auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, + ROSE_SMALL_BLOCK_LEN); lits.insert(end(lits), begin(lits2), end(lits2)); - lits.erase(remove_if(lits.begin(), lits.end(), - LongerThanLimit(ROSE_SMALL_BLOCK_LEN)), - lits.end()); dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); } diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 498af2f0..7b20bd1c 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -38,12 +38,14 @@ #include "hwlm/hwlm_build.h" #include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" +#include "nfa/nfa_api_queue.h" #include "util/charreach_util.h" #include "util/compile_context.h" #include "util/compile_error.h" #include "util/dump_charclass.h" #include "util/report.h" #include "util/report_manager.h" +#include "util/verify_types.h" #include "ue2common.h" #include @@ -519,8 +521,111 @@ bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id, return true; } +static +const raw_puff &getChainedPuff(const RoseBuildImpl &build, + const Report &report) { + DEBUG_PRINTF("chained report, event %u\n", report.onmatch); + + // MPV has already been moved to the outfixes vector. + assert(!build.mpv_outfix); + + auto mpv_outfix_it = find_if( + begin(build.outfixes), end(build.outfixes), + [](const OutfixInfo &outfix) { return outfix.is_nonempty_mpv(); }); + assert(mpv_outfix_it != end(build.outfixes)); + const auto *mpv = mpv_outfix_it->mpv(); + + u32 puff_index = report.onmatch - MQE_TOP_FIRST; + assert(puff_index < mpv->triggered_puffettes.size()); + return mpv->triggered_puffettes.at(puff_index); +} + +/** + * \brief Returns a conservative estimate of the minimum offset at which the + * given literal can lead to a report. + * + * TODO: This could be made more precise by calculating a "distance to accept" + * for every vertex in the graph; right now we're only accurate for leaf nodes. 
+ */ +static +u64a literalMinReportOffset(const RoseBuildImpl &build, + const rose_literal_id &lit, + const rose_literal_info &info) { + const auto &g = build.g; + + const u32 lit_len = verify_u32(lit.elength()); + + u64a lit_min_offset = UINT64_MAX; + + for (const auto &v : info.vertices) { + DEBUG_PRINTF("vertex %zu min_offset=%u\n", g[v].idx, g[v].min_offset); + + u64a vert_offset = g[v].min_offset; + + if (vert_offset >= lit_min_offset) { + continue; + } + + u64a min_offset = UINT64_MAX; + + for (const auto &id : g[v].reports) { + const Report &report = build.rm.getReport(id); + DEBUG_PRINTF("report id %u, min offset=%llu\n", id, + report.minOffset); + if (report.type == INTERNAL_ROSE_CHAIN) { + // This vertex triggers an MPV, which will fire reports after + // repeating for a while. + assert(report.minOffset == 0); // Should not have bounds. + const auto &puff = getChainedPuff(build, report); + DEBUG_PRINTF("chained puff repeats=%u\n", puff.repeats); + const Report &puff_report = build.rm.getReport(puff.report); + DEBUG_PRINTF("puff report %u, min offset=%llu\n", puff.report, + puff_report.minOffset); + min_offset = min(min_offset, max(vert_offset + puff.repeats, + puff_report.minOffset)); + } else { + DEBUG_PRINTF("report min offset=%llu\n", report.minOffset); + min_offset = min(min_offset, max(vert_offset, + report.minOffset)); + } + } + + if (g[v].suffix) { + depth suffix_width = findMinWidth(g[v].suffix, g[v].suffix.top); + assert(suffix_width.is_reachable()); + DEBUG_PRINTF("suffix with width %s\n", suffix_width.str().c_str()); + min_offset = min(min_offset, vert_offset + suffix_width); + } + + if (!isLeafNode(v, g) || min_offset == UINT64_MAX) { + min_offset = vert_offset; + } + + lit_min_offset = min(lit_min_offset, min_offset); + } + + // If this literal is the undelayed literal corresponding to some delayed + // literals, we must take their minimum offsets into account. + for (const u32 &delayed_id : info.delayed_ids) { + const auto &delayed_lit = build.literals.right.at(delayed_id); + const auto &delayed_info = build.literal_info.at(delayed_id); + u64a delayed_min_offset = literalMinReportOffset(build, delayed_lit, + delayed_info); + DEBUG_PRINTF("delayed_id=%u, min_offset = %llu\n", delayed_id, + delayed_min_offset); + lit_min_offset = min(lit_min_offset, delayed_min_offset); + } + + // If we share a vertex with a shorter literal, our min offset might dip + // below the length of this one.
+ lit_min_offset = max(lit_min_offset, u64a{lit_len}); + + return lit_min_offset; +} + vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table) { + rose_literal_table table, + u32 max_offset) { vector lits; for (const auto &e : build.literals.right) { @@ -546,6 +651,15 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str()); + if (max_offset != ROSE_BOUND_INF) { + u64a min_report = literalMinReportOffset(build, e.second, info); + if (min_report > max_offset) { + DEBUG_PRINTF("min report offset=%llu exceeds max_offset=%u\n", + min_report, max_offset); + continue; + } + } + const vector &msk = e.second.msk; const vector &cmp = e.second.cmp; @@ -664,7 +778,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto lits = fillHamsterLiteralList(build, ROSE_FLOATING); + auto lits = fillHamsterLiteralList(build, ROSE_FLOATING, + ROSE_SMALL_BLOCK_LEN); if (lits.empty()) { DEBUG_PRINTF("no floating table\n"); return nullptr; @@ -673,8 +788,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto anchored_lits = - fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); + auto anchored_lits = fillHamsterLiteralList(build, + ROSE_ANCHORED_SMALL_BLOCK, ROSE_SMALL_BLOCK_LEN); if (anchored_lits.empty()) { DEBUG_PRINTF("no small-block anchored literals\n"); return nullptr; @@ -682,15 +797,10 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end()); - // Remove literals that are longer than our small block length, as they can - // never match. TODO: improve by removing literals that have a min match - // offset greater than ROSE_SMALL_BLOCK_LEN, which will catch anchored cases - // with preceding dots that put them over the limit. - auto longer_than_limit = [](const hwlmLiteral &lit) { - return lit.s.length() > ROSE_SMALL_BLOCK_LEN; - }; - lits.erase(remove_if(lits.begin(), lits.end(), longer_than_limit), - lits.end()); + // None of our literals should be longer than the small block limit. + assert(all_of(begin(lits), end(lits), [](const hwlmLiteral &lit) { + return lit.s.length() <= ROSE_SMALL_BLOCK_LEN; + })); if (lits.empty()) { DEBUG_PRINTF("no literals shorter than small block len\n"); diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h index 7d5c9283..2a225bf5 100644 --- a/src/rose/rose_build_matchers.h +++ b/src/rose/rose_build_matchers.h @@ -44,8 +44,14 @@ namespace ue2 { struct hwlmLiteral; +/** + * \brief Build up a vector of literals for the given table. + * + * If max_offset is specified (and not ROSE_BOUND_INF), then literals that can + * only lead to a pattern match after max_offset may be excluded. 
+ */ std::vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table); + rose_literal_table table, u32 max_offset = ROSE_BOUND_INF); aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, rose_group *fgroups, From 6ec93a54c4f71c8e0537c2c972293fd5a82de8df Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 8 Jul 2016 11:36:10 +1000 Subject: [PATCH 109/166] rose: only dump small-block table in block mode --- src/rose/rose_build_dump.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index fc60af4c..5e176c30 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -469,11 +469,14 @@ void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED); dumpTestLiterals(base + "rose_eod_test_literals.txt", lits); - lits = fillHamsterLiteralList(build, ROSE_FLOATING, ROSE_SMALL_BLOCK_LEN); - auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, - ROSE_SMALL_BLOCK_LEN); - lits.insert(end(lits), begin(lits2), end(lits2)); - dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); + if (!build.cc.streaming) { + lits = fillHamsterLiteralList(build, ROSE_FLOATING, + ROSE_SMALL_BLOCK_LEN); + auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, + ROSE_SMALL_BLOCK_LEN); + lits.insert(end(lits), begin(lits2), end(lits2)); + dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); + } } void dumpRose(const RoseBuild &build_base, const RoseEngine *t, From f55e968692f233ef29f2ef438f932c6f5d03166d Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Thu, 30 Jun 2016 11:32:24 +1000 Subject: [PATCH 110/166] tamarama dump: use correct base offset --- src/nfa/tamarama_dump.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/nfa/tamarama_dump.cpp b/src/nfa/tamarama_dump.cpp index ed2f1cb1..181fa9af 100644 --- a/src/nfa/tamarama_dump.cpp +++ b/src/nfa/tamarama_dump.cpp @@ -54,12 +54,11 @@ void nfaExecTamarama0_dumpDot(const struct NFA *nfa, UNUSED FILE *f, const u32 *subOffset = (const u32 *)((const char *)t + sizeof(struct Tamarama) + t->numSubEngines * sizeof(u32)); - const char *offset = (const char *)nfa; for (u32 i = 0; i < t->numSubEngines; i++) { std::stringstream ssdot; ssdot << base << "rose_nfa_" << nfa->queueIndex << "_sub_" << i << ".dot"; - const NFA *sub = (const struct NFA *)(offset + subOffset[i]); + const NFA *sub = (const struct NFA *)((const char *)t + subOffset[i]); FILE *f1 = fopen(ssdot.str().c_str(), "w"); nfaDumpDot(sub, f1, base); fclose(f1); @@ -80,10 +79,9 @@ void nfaExecTamarama0_dumpText(const struct NFA *nfa, FILE *f) { const u32 *subOffset = (const u32 *)((const char *)t + sizeof(struct Tamarama) + t->numSubEngines * sizeof(u32)); - const char *offset = (const char *)nfa; for (u32 i = 0; i < t->numSubEngines; i++) { fprintf(f, "Sub %u:\n", i); - const NFA *sub = (const struct NFA *)(offset + subOffset[i]); + const NFA *sub = (const struct NFA *)((const char *)t + subOffset[i]); nfaDumpText(sub, f); fprintf(f, "\n"); } From 8d316075569ab9d6aab764626e19eea06e1316fe Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 11 Jul 2016 11:44:57 +1000 Subject: [PATCH 111/166] rose: only use anch history when there are bounds --- src/rose/rose_build_compile.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp 
index d59d4d4f..472de156 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -453,7 +453,8 @@ RoseRoleHistory findHistoryScheme(const RoseBuildImpl &tbi, const RoseEdge &e) { return ROSE_ROLE_HISTORY_NONE; } - if (g[u].fixedOffset()) { + if (g[u].fixedOffset() && + (g[e].minBound || g[e].maxBound != ROSE_BOUND_INF)) { DEBUG_PRINTF("fixed offset -> anch\n"); return ROSE_ROLE_HISTORY_ANCH; } From 3e96cd48efb89b29667020750586c132917a19fd Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 11 Jul 2016 11:40:32 +1000 Subject: [PATCH 112/166] rose: sanity check CHECK_BOUNDS instruction --- src/rose/rose_build_bytecode.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index c6f709bc..ecdda146 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -3766,6 +3766,10 @@ void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v, max_bound = MAX_OFFSET; } + // This instruction should be doing _something_ -- bounds should be tighter + // than just {length, inf}. + assert(min_bound > lit_length || max_bound < MAX_OFFSET); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_BOUNDS, JumpTarget::NEXT_BLOCK); ri.u.checkBounds.min_bound = min_bound; ri.u.checkBounds.max_bound = max_bound; From 1bab10698f6f14dacec0cfa36fdd993535e06ab7 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 12 Jul 2016 09:36:49 +1000 Subject: [PATCH 113/166] rose_build_convert: improve history selection Fixes assertion failures introduced by last commit. --- src/rose/rose_build_convert.cpp | 64 ++++++++++++++++----------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src/rose/rose_build_convert.cpp b/src/rose/rose_build_convert.cpp index f5e99c23..1578dda1 100644 --- a/src/rose/rose_build_convert.cpp +++ b/src/rose/rose_build_convert.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -651,6 +651,26 @@ CharReach getReachOfNormalVertex(const NGHolder &g) { return CharReach(); } +/** + * \brief Set the edge bounds and appropriate history on the given edge in the + * Rose graph. 
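+ *
+ * Only bounds tighter than {0, inf} require ANCH history; a {0, inf}
+ * edge is given ROSE_ROLE_HISTORY_NONE.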
+ */ +static +void setEdgeBounds(RoseGraph &g, const RoseEdge &e, u32 min_bound, + u32 max_bound) { + assert(min_bound <= max_bound); + assert(max_bound <= ROSE_BOUND_INF); + + g[e].minBound = min_bound; + g[e].maxBound = max_bound; + + if (min_bound || max_bound < ROSE_BOUND_INF) { + g[e].history = ROSE_ROLE_HISTORY_ANCH; + } else { + g[e].history = ROSE_ROLE_HISTORY_NONE; + } +} + static bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, const RoseEdge &e_old, RoseVertex ar, @@ -686,18 +706,13 @@ bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, if (source(e_old, g) == ar) { assert(g[e_old].minBound <= bound_min); assert(g[e_old].maxBound >= bound_max); - g[e_old].minBound = bound_min; - g[e_old].maxBound = bound_max; - g[e_old].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e_old, bound_min, bound_max); } else { RoseEdge e_new; UNUSED bool added; tie(e_new, added) = add_edge(ar, v, g); assert(added); - g[e_new].minBound = bound_min; - g[e_new].maxBound = bound_max; - g[e_new].history = ROSE_ROLE_HISTORY_ANCH; - + setEdgeBounds(g, e_new, bound_min, bound_max); to_delete->push_back(e_old); } @@ -751,9 +766,7 @@ bool handleStartDsPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, /* update bounds on edge */ assert(g[e].minBound <= repeatCount); - g[e].minBound = repeatCount; - g[e].maxBound = ROSE_BOUND_INF; - g[e].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e, repeatCount, ROSE_BOUND_INF); g[v].left.reset(); /* clear the prefix info */ @@ -893,26 +906,19 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, } if (source(e_old, g) == ar) { - g[e_old].minBound = ri.repeatMin + width; - g[e_old].maxBound = ri.repeatMax + width; - g[e_old].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e_old, ri.repeatMin + width, ri.repeatMax + width); } else { RoseEdge e_new; UNUSED bool added; tie(e_new, added) = add_edge(ar, v, g); assert(added); - g[e_new].minBound = ri.repeatMin + width; - g[e_new].maxBound = ri.repeatMax + width; - g[e_new].history = ROSE_ROLE_HISTORY_ANCH; - + setEdgeBounds(g, e_new, ri.repeatMin + width, ri.repeatMax + width); to_delete->push_back(e_old); } } else { assert(g[e_old].minBound <= ri.repeatMin + width); - g[e_old].minBound = ri.repeatMin + width; - g[e_old].maxBound = ROSE_BOUND_INF; - g[e_old].history = ROSE_ROLE_HISTORY_ANCH; + setEdgeBounds(g, e_old, ri.repeatMin + width, ROSE_BOUND_INF); } g[v].left.dfa.reset(); @@ -1110,19 +1116,9 @@ void convertAnchPrefixToBounds(RoseBuildImpl &tbi) { bounds.min -= delay_adj; } bounds.max -= delay_adj; - - g[e].minBound = bounds.min; - g[e].maxBound = - bounds.max.is_finite() ? (u32)bounds.max : ROSE_BOUND_INF; - - // It's possible that a (0,inf) case might sneak through here, in which - // case we don't need ANCH history at all. - if (g[e].minBound == 0 && g[e].maxBound == ROSE_BOUND_INF) { - g[e].history = ROSE_ROLE_HISTORY_NONE; - } else { - g[e].history = ROSE_ROLE_HISTORY_ANCH; - } - + setEdgeBounds(g, e, bounds.min, bounds.max.is_finite() + ? (u32)bounds.max + : ROSE_BOUND_INF); g[v].left.reset(); } } From 4d7469392dcc335516ec3cb0151d28e4aed5243a Mon Sep 17 00:00:00 2001 From: "Xu, Chi" Date: Fri, 13 May 2016 08:52:43 +0800 Subject: [PATCH 114/166] rose: add CHECK_BYTE/CHECK_MASK instructions These instructions are specialisations of the "lookaround" code for performance. 
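As an illustration (the byte class below is hypothetical, not taken from
this change): a single lookaround entry whose reach is exactly the class
[0x60-0x67] can be tested with CHECK_BYTE using and_mask 0xf8 and cmp_mask
0x60, since (c & 0xf8) == 0x60 holds precisely for those eight bytes; a
reach whose complement has this power-of-two structure sets the negation
flag instead. Up to eight such entries falling within an 8-byte window are
packed one byte per position into the 64-bit and_mask/cmp_mask/neg_mask of
a single CHECK_MASK instruction.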
--- CMakeLists.txt | 1 + src/rose/program_runtime.h | 173 ++++++++++++++++++ src/rose/rose_build_bytecode.cpp | 110 +++++++++++ src/rose/rose_dump.cpp | 18 ++ src/rose/rose_program.h | 20 ++ src/rose/validate_mask.h | 77 ++++++++ unit/CMakeLists.txt | 1 + unit/internal/rose_mask.cpp | 302 +++++++++++++++++++++++++++++++ 8 files changed, 702 insertions(+) create mode 100644 src/rose/validate_mask.h create mode 100644 unit/internal/rose_mask.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 67109797..7dff1c51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -512,6 +512,7 @@ set (hs_exec_SRCS src/rose/rose_program.h src/rose/rose_types.h src/rose/rose_common.h + src/rose/validate_mask.h src/util/bitutils.h src/util/exhaust.h src/util/fatbit.h diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index fe71772e..fdaa2e07 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -44,6 +44,7 @@ #include "rose_internal.h" #include "rose_program.h" #include "rose_types.h" +#include "validate_mask.h" #include "runtime.h" #include "scratch.h" #include "ue2common.h" @@ -608,6 +609,154 @@ int reachHasBit(const u8 *reach, u8 c) { return !!(reach[c / 8U] & (u8)1U << (c % 8U)); } +/* + * Generate a 8-byte valid_mask with #high bytes 0 from the highest side + * and #low bytes 0 from the lowest side + * and (8 - high - low) bytes '0xff' in the middle. + */ +static rose_inline +u64a generateValidMask(const s32 high, const s32 low) { + assert(high + low < 8); + DEBUG_PRINTF("high %d low %d\n", high, low); + const u64a ones = ~0ull; + return (ones << ((high + low) * 8)) >> (high * 8); +} + +/* + * Do the single-byte check if only one lookaround entry exists + * and it's a single mask. + * Return success if the byte is in the future or before history + * (offset is greater than (history) buffer length). 
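+ *
+ * Worked example (illustrative): with end == 10, ci->buf_offset == 4 and
+ * checkOffset == -3, the byte tested is ci->buf[3], i.e. stream offset 7
+ * (assuming the current buffer holds at least 4 bytes).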
+ */
+static rose_inline
+int roseCheckByte(const struct core_info *ci, u8 and_mask, u8 cmp_mask,
+ u8 negation, s32 checkOffset, u64a end) {
+ DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end,
+ ci->buf_offset, ci->buf_offset + ci->len);
+ if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
+ DEBUG_PRINTF("too early, fail\n");
+ return 0;
+ }
+
+ const s64a base_offset = end - ci->buf_offset;
+ s64a offset = base_offset + checkOffset;
+ DEBUG_PRINTF("checkOffset=%d offset=%lld\n", checkOffset, offset);
+ u8 c;
+ if (offset >= 0) {
+ if (offset >= (s64a)ci->len) {
+ DEBUG_PRINTF("in the future\n");
+ return 1;
+ } else {
+ assert(offset < (s64a)ci->len);
+ DEBUG_PRINTF("check byte in buffer\n");
+ c = ci->buf[offset];
+ }
+ } else {
+ if (offset >= -(s64a)ci->hlen) {
+ DEBUG_PRINTF("check byte in history\n");
+ c = ci->hbuf[ci->hlen + offset];
+ } else {
+ DEBUG_PRINTF("before history and return\n");
+ return 1;
+ }
+ }
+
+ if (((and_mask & c) != cmp_mask) ^ negation) {
+ DEBUG_PRINTF("char 0x%02x at offset %lld failed byte check\n",
+ c, offset);
+ return 0;
+ }
+
+ DEBUG_PRINTF("real offset=%lld char=%02x\n", offset, c);
+ DEBUG_PRINTF("OK :)\n");
+ return 1;
+}
+
+static rose_inline
+int roseCheckMask(const struct core_info *ci, u64a and_mask, u64a cmp_mask,
+ u64a neg_mask, s32 checkOffset, u64a end) {
+ const s64a base_offset = (s64a)end - ci->buf_offset;
+ s64a offset = base_offset + checkOffset;
+ DEBUG_PRINTF("rel offset %lld\n", base_offset);
+ DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
+ if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
+ DEBUG_PRINTF("too early, fail\n");
+ return 0;
+ }
+
+ u64a data = 0;
+ u64a valid_data_mask = ~0ULL; // mask for validity check.
+ // A 0xff byte means that this byte is in the buffer.
+ s32 shift_l = 0; // number of bytes in the future.
+ s32 shift_r = 0; // number of bytes before the history.
+ s32 h_len = 0; // number of bytes in the history buffer.
+ s32 c_len = 8; // number of bytes in the current buffer.
+ if (offset < 0) {
+ // in or before history buffer.
+ if (offset + 8 <= -(s64a)ci->hlen) {
+ DEBUG_PRINTF("before history and return\n");
+ return 1;
+ }
+ const u8 *h_start = ci->hbuf; // start pointer in history buffer.
+ if (offset < -(s64a)ci->hlen) {
+ // some bytes are before history.
+ shift_r = -(offset + (s64a)ci->hlen);
+ DEBUG_PRINTF("shift_r %d\n", shift_r);
+ } else {
+ h_start += ci->hlen + offset;
+ }
+ if (offset + 7 < 0) {
+ DEBUG_PRINTF("all in history buffer\n");
+ data = partial_load_u64a(h_start, 8 - shift_r);
+ } else {
+ // history part
+ c_len = offset + 8;
+ h_len = -offset - shift_r;
+ DEBUG_PRINTF("%d bytes in history\n", h_len);
+ s64a data_h = 0;
+ data_h = partial_load_u64a(h_start, h_len);
+ // current part
+ if (c_len > (s64a)ci->len) {
+ shift_l = c_len - ci->len;
+ c_len = ci->len;
+ }
+ data = partial_load_u64a(ci->buf, c_len);
+ data <<= h_len << 3;
+ data |= data_h;
+ }
+ if (shift_r) {
+ data <<= shift_r << 3;
+ }
+ } else {
+ // current buffer.
+ if (offset + c_len > (s64a)ci->len) {
+ if (offset >= (s64a)ci->len) {
+ DEBUG_PRINTF("all in the future\n");
+ return 1;
+ }
+ // some bytes in the future.
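+ // Illustrative example: with ci->len == 5 and offset == 2, an
+ // 8-byte read would overrun the buffer by 5 bytes, so only the
+ // 3 valid bytes are loaded and the top 5 bytes are marked
+ // invalid via shift_l.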
+ shift_l = offset + c_len - ci->len; + c_len = ci->len - offset; + data = partial_load_u64a(ci->buf + offset, c_len); + } else { + data = unaligned_load_u64a(ci->buf + offset); + } + } + + if (shift_l || shift_r) { + valid_data_mask = generateValidMask(shift_l, shift_r); + } + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); + + if (validateMask(data, valid_data_mask, + and_mask, cmp_mask, neg_mask)) { + DEBUG_PRINTF("check mask successfully\n"); + return 1; + } else { + return 0; + } +} /** * \brief Scan around a literal, checking that that "lookaround" reach masks * are satisfied. @@ -1026,6 +1175,30 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BYTE) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, end)) { diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index ecdda146..b8301dbd 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -77,6 +77,7 @@ #include "util/make_unique.h" #include "util/multibit_build.h" #include "util/order_check.h" +#include "util/popcount.h" #include "util/queue_index_factory.h" #include "util/report_manager.h" #include "util/ue2string.h" @@ -197,6 +198,8 @@ public: case ROSE_INSTR_CHECK_BOUNDS: return &u.checkBounds; case ROSE_INSTR_CHECK_NOT_HANDLED: return &u.checkNotHandled; case ROSE_INSTR_CHECK_LOOKAROUND: return &u.checkLookaround; + case ROSE_INSTR_CHECK_MASK: return &u.checkMask; + case ROSE_INSTR_CHECK_BYTE: return &u.checkByte; case ROSE_INSTR_CHECK_INFIX: return &u.checkInfix; case ROSE_INSTR_CHECK_PREFIX: return &u.checkPrefix; case ROSE_INSTR_ANCHORED_DELAY: return &u.anchoredDelay; @@ -246,6 +249,8 @@ public: case ROSE_INSTR_CHECK_BOUNDS: return sizeof(u.checkBounds); case ROSE_INSTR_CHECK_NOT_HANDLED: return sizeof(u.checkNotHandled); case ROSE_INSTR_CHECK_LOOKAROUND: return sizeof(u.checkLookaround); + case ROSE_INSTR_CHECK_MASK: return sizeof(u.checkMask); + case ROSE_INSTR_CHECK_BYTE: return sizeof(u.checkByte); case ROSE_INSTR_CHECK_INFIX: return sizeof(u.checkInfix); case ROSE_INSTR_CHECK_PREFIX: return sizeof(u.checkPrefix); case ROSE_INSTR_ANCHORED_DELAY: return sizeof(u.anchoredDelay); @@ -294,6 +299,8 @@ public: ROSE_STRUCT_CHECK_BOUNDS checkBounds; ROSE_STRUCT_CHECK_NOT_HANDLED checkNotHandled; ROSE_STRUCT_CHECK_LOOKAROUND checkLookaround; + ROSE_STRUCT_CHECK_MASK checkMask; + ROSE_STRUCT_CHECK_BYTE checkByte; ROSE_STRUCT_CHECK_INFIX checkInfix; ROSE_STRUCT_CHECK_PREFIX checkPrefix; ROSE_STRUCT_ANCHORED_DELAY anchoredDelay; @@ -2809,6 +2816,12 @@ flattenProgram(const vector> &programs) { case ROSE_INSTR_CHECK_LOOKAROUND: ri.u.checkLookaround.fail_jump = jump_val; break; + case ROSE_INSTR_CHECK_MASK: + ri.u.checkMask.fail_jump = jump_val; + break; + case ROSE_INSTR_CHECK_BYTE: + ri.u.checkByte.fail_jump = jump_val; + break; case ROSE_INSTR_CHECK_INFIX: 
ri.u.checkInfix.fail_jump = jump_val; break; @@ -3162,6 +3175,95 @@ u32 addLookaround(build_context &bc, const vector &look) { return verify_u32(idx); } +static +bool checkReachMask(const CharReach &cr, u8 &andmask, u8 &cmpmask) { + size_t reach_size = cr.count(); + assert(reach_size > 0); + // check whether entry_size is some power of 2. + if ((reach_size - 1) & reach_size) { + return false; + } + make_and_cmp_mask(cr, &andmask, &cmpmask); + if ((1 << popcount32((u8)(~andmask))) ^ reach_size) { + return false; + } + return true; +} + +static +bool checkReachWithFlip(const CharReach &cr, u8 &andmask, + u8 &cmpmask, u8 &flip) { + if (checkReachMask(cr, andmask, cmpmask)) { + flip = 0; + return true; + } + if (checkReachMask(~cr, andmask, cmpmask)) { + flip = 1; + return true; + } + return false; +} + +static +bool makeRoleByte(const vector &look, + vector &program) { + if (look.size() == 1) { + const auto &entry = look[0]; + u8 andmask_u8, cmpmask_u8; + u8 flip; + if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) { + return false; + } + s32 checkbyte_offset = verify_s32(entry.offset); + DEBUG_PRINTF("CHECK BYTE offset=%d\n", checkbyte_offset); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_BYTE, + JumpTarget::NEXT_BLOCK); + ri.u.checkByte.and_mask = andmask_u8; + ri.u.checkByte.cmp_mask = cmpmask_u8; + ri.u.checkByte.negation = flip; + ri.u.checkByte.offset = checkbyte_offset; + program.push_back(ri); + return true; + } + return false; +} + +static +bool makeRoleMask(const vector &look, + vector &program) { + if (look.back().offset < look.front().offset + 8) { + s32 base_offset = verify_s32(look.front().offset); + u64a and_mask = 0; + u64a cmp_mask = 0; + u64a neg_mask = 0; + for (const auto &entry : look) { + u8 andmask_u8, cmpmask_u8, flip; + if (!checkReachWithFlip(entry.reach, andmask_u8, + cmpmask_u8, flip)) { + return false; + } + DEBUG_PRINTF("entry offset %d\n", entry.offset); + u32 shift = (entry.offset - base_offset) << 3; + and_mask |= (u64a)andmask_u8 << shift; + cmp_mask |= (u64a)cmpmask_u8 << shift; + if (flip) { + neg_mask |= 0xffLLU << shift; + } + } + DEBUG_PRINTF("CHECK MASK and_mask=%llx cmp_mask=%llx\n", + and_mask, cmp_mask); + auto ri = RoseInstruction(ROSE_INSTR_CHECK_MASK, + JumpTarget::NEXT_BLOCK); + ri.u.checkMask.and_mask = and_mask; + ri.u.checkMask.cmp_mask = cmp_mask; + ri.u.checkMask.neg_mask = neg_mask; + ri.u.checkMask.offset = base_offset; + program.push_back(ri); + return true; + } + return false; +} + static void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v, vector &program) { @@ -3187,6 +3289,14 @@ void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v, return; } + if (makeRoleByte(look, program)) { + return; + } + + if (makeRoleMask(look, program)) { + return; + } + DEBUG_PRINTF("role has lookaround\n"); u32 look_idx = addLookaround(bc, look); u32 look_count = verify_u32(look.size()); diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index 75b831a5..1e1c36ca 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -290,6 +290,24 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK) { + os << " and_mask " << std::hex << ri->and_mask << endl; + os << " cmp_mask " << ri->cmp_mask << endl; + os << " neg_mask " << ri->neg_mask << std::dec<< endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + 
PROGRAM_CASE(CHECK_BYTE) { + os << " and_mask " << std::hex << ri->and_mask << endl; + os << " cmp_mask " << ri->cmp_mask << std::dec << endl; + os << " negation " << ri->negation << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { os << " queue " << ri->queue << endl; os << " lag " << ri->lag << endl; diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index cc3d07b0..6ca117ea 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -50,6 +50,8 @@ enum RoseInstructionCode { ROSE_INSTR_CHECK_BOUNDS, //!< Bounds on distance from offset 0. ROSE_INSTR_CHECK_NOT_HANDLED, //!< Test & set role in "handled". ROSE_INSTR_CHECK_LOOKAROUND, //!< Lookaround check. + ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check. + ROSE_INSTR_CHECK_BYTE, //!< Single Byte check. ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. @@ -165,6 +167,24 @@ struct ROSE_STRUCT_CHECK_LOOKAROUND { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_CHECK_MASK { + u8 code; //!< From enum roseInstructionCode. + u64a and_mask; //!< 64-bits and mask. + u64a cmp_mask; //!< 64-bits cmp mask. + u64a neg_mask; //!< 64-bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_BYTE { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask; //!< 8-bits and mask. + u8 cmp_mask; //!< 8-bits cmp mask. + u8 negation; //!< Flag about negation. + s32 offset; //!< The relative offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_INFIX { u8 code; //!< From enum RoseInstructionCode. u32 queue; //!< Queue of leftfix to check. diff --git a/src/rose/validate_mask.h b/src/rose/validate_mask.h new file mode 100644 index 00000000..b2c2f5d6 --- /dev/null +++ b/src/rose/validate_mask.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ue2common.h" + +// check positive bytes in cmp_result. +// return one if the check passed, zero otherwise. +static really_inline +int posValidateMask(const u64a cmp_result, const u64a pos_mask) { + return !(cmp_result & pos_mask); +} + +/* + * check negative bytes in cmp_result. + * return one if any byte in cmp_result is not 0, zero otherwise. + * check lowest 7 bits and highest bit of every byte respectively. + */ +static really_inline +int negValidateMask(const u64a cmp_result, const u64a neg_mask) { + const u64a count_mask = 0x7f7f7f7f7f7f7f7f; + // check lowest 7 bits of every byte. + // the highest bit should be 1 if check passed. + u64a check_low = (cmp_result & count_mask) + count_mask; + // check the highest bit of every byte. + // combine the highest bit and 0x7f to 0xff if check passes. + // flip all 0xff to 0x00 and 0x7f to 0x80. + u64a check_all = ~(check_low | cmp_result | count_mask); + return !(check_all & neg_mask); +} + +static really_inline +int validateMask(u64a data, u64a valid_data_mask, u64a and_mask, + u64a cmp_mask, u64a neg_mask) { + // skip some byte where valid_data_mask is 0x00 there. + and_mask &= valid_data_mask; + cmp_mask &= valid_data_mask; + neg_mask &= valid_data_mask; + u64a cmp_result = (data & and_mask) ^ cmp_mask; + /* do the positive check first since it's cheaper */ + if (posValidateMask(cmp_result, ~neg_mask) + && negValidateMask(cmp_result, neg_mask)) { + return 1; + } else { + DEBUG_PRINTF("data %llx valid_data_mask(vdm) %llx\n", + data, valid_data_mask); + DEBUG_PRINTF("and_mask & vdm %llx cmp_mask & vdm %llx\n", and_mask, + cmp_mask); + DEBUG_PRINTF("cmp_result %llx neg_mask & vdm %llx\n", + cmp_result, neg_mask); + return 0; + } +} diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index a893d3d5..8209c277 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -65,6 +65,7 @@ set(unit_internal_SOURCES internal/pqueue.cpp internal/repeat.cpp internal/rose_build_merge.cpp + internal/rose_mask.cpp internal/rvermicelli.cpp internal/simd_utils.cpp internal/shuffle.cpp diff --git a/unit/internal/rose_mask.cpp b/unit/internal/rose_mask.cpp new file mode 100644 index 00000000..e6be00f3 --- /dev/null +++ b/unit/internal/rose_mask.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "rose/validate_mask.h" +#include "gtest/gtest.h" + +#define ONES64 0xffffffffffffffffULL + +/* valid_data_mask is flexible, don't need to be fixed in Info */ +struct ValidateMaskTestInfo { + u64a data; + u64a and_mask; + u64a cmp_mask; + u64a neg_mask; +}; + +static const ValidateMaskTestInfo testBasic[] = { + /* data is randomly picked */ + {0x1234abcd4321dcbaULL, 0xff09bbdd7f7ffeffULL, + 0x1200abcd4561dcbbULL, 0xffff00ffULL}, + /* data = "VaLiDaTe" */ + {0x56614c6944615465ULL, 0xe0feffffdf7b5480ULL, + 0x40614c6946615400ULL, 0xff0000ff000000ULL}, + /* data = "\0\0\0MASK\0" */ + {0x4d41534b00ULL, 0xfffffefebfdf002cULL, + 0x5536344c0173002cULL, 0xffffff0000ff00ffULL}, + /* data = "FOo14foo" */ + {0x464f6f3134666f6fULL, 0xdfdffffef8c0f000ULL, + 0x46466f3030406000ULL, 0xff000000000000ULL}, + /* data = "FOo14foo" with different cmp_mask and neg_mask*/ + {0x464f6f3134666f6fULL, 0xdfdffffef8c0f000ULL, + 0x44464f3034606f60ULL, 0xffffff00ffffffffULL}, +}; + +/* + * generate 37 different valid_data_mask + * 8 from 0xff to 0xff00000000000000 + * 7 from 0xffff to 0xffff000000000000 + * ... 
+ * 0xffffffffffffffff and 0 + */ +static int initLegalValidMasks(u64a validMasks[]) { + u64a data = ONES64; + int num = 0; + for (int i = 0; i < 64; i += 8) { + for (int j = 0; j <= i; j += 8) { + validMasks[num] = data << j; + num++; + } + data >>= 8; + } + validMasks[num] = 0; + num++; + return num; +} + +/* + * generate all 256 neg_masks + * including 0, 0xff, 0xff00,..., 0xffffffffffffffff + */ +static int initLegalNegMasks(u64a negMasks[]) { + u64a data = 0; + u64a offset; + int num = 0; + while (data != ONES64) { + negMasks[num] = data; + num++; + offset = (data | (data +1)) ^ data; + data += 0xfeULL * offset + 1; + } + negMasks[num] = data; + num++; + return num; +} + + +/* + * check all legal valid_mask(37 different) for validateMask[] + */ +TEST(ValidateMask, ValidMaskTests) { + u64a validMasks[256]; + int num = initLegalValidMasks(validMasks); + + for (const auto &t : testBasic) { + for (int i = 0; i < num; i++) { + EXPECT_EQ(1, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + t.neg_mask)); + } + } +} + +/* + * fix neg_mask to 0 and ONES64, + * check output of ValidateMask on different valid_mask, + * for neg_mask = 0, + */ +TEST(ValidateMask, AdvancedValidMaskTests) { + u64a validMasks[256]; + int num = initLegalValidMasks(validMasks); + int bool_result; + for (const auto &t: testBasic) { + for (int i = 0; i < num; i++) { + bool_result = !(validMasks[i] & t.neg_mask); + EXPECT_EQ(bool_result, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + 0)); + bool_result = (validMasks[i] | t.neg_mask) == t.neg_mask; + EXPECT_EQ(bool_result, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + ONES64)); + } + } +} + +/* + * test every pair of valid_data_mask and neg_mask + * and compute the expect output by a formula + */ +TEST(ValidateMask, FullTests) { + u64a validMasks[256]; + u64a negMasks[256]; + int vm_num = initLegalValidMasks(validMasks); + int nm_num = initLegalNegMasks(negMasks); + int bool_result; + for (const auto &t: testBasic) { + for (int i = 0; i < vm_num; i++) { + for (int j = 0; j < nm_num; j++) { + /* + * treat t.neg_mask as a truthtable (a negative truthtable) + * we expect validateMask output 1 if and only if + * the truthtable(tt) and neg_mask(nm) looks same + * under "&" operation with valid_data_mask(vdm) + * that is + * output = (tt & vdm) == (nm & vdm) ? 
1 : 0; + */ + bool_result = (t.neg_mask & validMasks[i]) == + (negMasks[j] & validMasks[i]); + EXPECT_EQ(bool_result, validateMask(t.data, + validMasks[i], + t.and_mask, + t.cmp_mask, + negMasks[j])); + } + } + } +} + +/* + * drop the original validateMask[].neg_mask + * and test more neg_mask and valid_mask manually + */ +TEST(ValidateMask, ManualTest_0) { + const auto &t = testBasic[0]; + EXPECT_EQ(1, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, 0xffff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 16) >> 8, + t.and_mask, t.cmp_mask, 0xffff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xffffff00ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 24, + t.and_mask, t.cmp_mask, 0xff00ffffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffffffff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 40, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, 0, + t.and_mask, t.cmp_mask, ONES64)); + EXPECT_EQ(1, validateMask(t.data, 0, + t.and_mask, t.cmp_mask, ~t.neg_mask)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, 0xffff0000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, 0xffffffffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ffULL)); +} + +TEST(ValidateMask, ManualTest_1) { + const auto &t = testBasic[1]; + EXPECT_EQ(1, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ff00ffffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 32, + t.and_mask, t.cmp_mask, 0xff000000000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 32, + t.and_mask, t.cmp_mask, 0xff0000ffff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 8, + t.and_mask, t.cmp_mask, 0xffff0000ff000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 16, + t.and_mask, t.cmp_mask, 0xff000000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 32) >> 16, + t.and_mask, t.cmp_mask, 0xff00ff00)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 40, + t.and_mask, t.cmp_mask, 0xff00000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 48, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0xff00000000000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 16, + t.and_mask, t.cmp_mask, 0xff0000ffff0000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 8, + t.and_mask, t.cmp_mask, 0xff000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 16, + t.and_mask, t.cmp_mask, 0xffff000000ULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 40) >> 16, + t.and_mask, t.cmp_mask, 0xff000000000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, ONES64)); +} + +TEST(ValidateMask, ManualTest_2) { + const auto &t = testBasic[2]; + EXPECT_EQ(1, validateMask(t.data, ONES64 << 24, + t.and_mask, t.cmp_mask, 0xffffff0000000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0xff00000000000000ULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0xff00ffffff00ffffULL)); + EXPECT_EQ(1, validateMask(t.data, 
0, + t.and_mask, t.cmp_mask, ONES64)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 24, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffff00ff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 56) >> 40, + t.and_mask, t.cmp_mask, 0xff0000ULL)); + EXPECT_EQ(1, validateMask(t.data, (ONES64 << 56) >> 32, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(1, validateMask(t.data, ONES64 >> 40, + t.and_mask, t.cmp_mask, 0xffffffff00ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64, + t.and_mask, t.cmp_mask, ONES64)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 56, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 48, + t.and_mask, t.cmp_mask, 0xff00000000000000ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 << 8, + t.and_mask, t.cmp_mask, 0xffffff00000000ffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffff00ULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 32, + t.and_mask, t.cmp_mask, 0xffffffULL)); + EXPECT_EQ(0, validateMask(t.data, ONES64 >> 16, + t.and_mask, t.cmp_mask, 0xff00ffULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xffffff00000000ffULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 32) >> 24, + t.and_mask, t.cmp_mask, 0xffffff000000ff00ULL)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 56) >> 40, + t.and_mask, t.cmp_mask, 0)); + EXPECT_EQ(0, validateMask(t.data, (ONES64 << 56) >> 48, + t.and_mask, t.cmp_mask, 0xff00ULL)); +} From aed2e721f4e4f7a3776528015f8f7f821a29a634 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 11 Jul 2016 14:42:23 +1000 Subject: [PATCH 115/166] rose: tidy up CHECK_{BYTE,MASK} dump code --- src/rose/rose_dump.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index 1e1c36ca..c483443c 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -291,18 +291,25 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(CHECK_MASK) { - os << " and_mask " << std::hex << ri->and_mask << endl; - os << " cmp_mask " << ri->cmp_mask << endl; - os << " neg_mask " << ri->neg_mask << std::dec<< endl; + os << " and_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->and_mask << std::dec << endl; + os << " cmp_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->cmp_mask << std::dec << endl; + os << " neg_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->neg_mask << std::dec << endl; os << " offset " << ri->offset << endl; os << " fail_jump " << offset + ri->fail_jump << endl; } PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(CHECK_BYTE) { - os << " and_mask " << std::hex << ri->and_mask << endl; - os << " cmp_mask " << ri->cmp_mask << std::dec << endl; - os << " negation " << ri->negation << endl; + os << " and_mask 0x" << std::hex << std::setw(2) + << std::setfill('0') << u32{ri->and_mask} << std::dec + << endl; + os << " cmp_mask 0x" << std::hex << std::setw(2) + << std::setfill('0') << u32{ri->cmp_mask} << std::dec + << endl; + 
os << " negation " << u32{ri->negation} << endl; os << " offset " << ri->offset << endl; os << " fail_jump " << offset + ri->fail_jump << endl; } From e95a25193520e97e729181feb5c5a649c5102782 Mon Sep 17 00:00:00 2001 From: Xiang Wang Date: Mon, 11 Jul 2016 10:08:40 -0400 Subject: [PATCH 116/166] UE-2991: avoid copying one unnecessary subqueue item --- src/nfa/tamarama.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nfa/tamarama.c b/src/nfa/tamarama.c index b7ba126c..4286a67e 100644 --- a/src/nfa/tamarama.c +++ b/src/nfa/tamarama.c @@ -243,13 +243,15 @@ void copyBack(const struct Tamarama *t, struct mq *q, struct mq *q1) { u32 numItems = q1->end > q1->cur + 1 ? q1->end - q1->cur - 1 : 1; // Also need to copy MQE_END if the main queue is empty if (q->cur == q->end) { - numItems++; + assert(q->cur > 1 && q1->items[q1->end - 1].type == MQE_END); + q->items[--q->cur] = q1->items[q1->end - 1]; } u32 cur = q->cur - numItems; q->items[cur] = q1->items[q1->cur++]; q->items[cur].type = MQE_START; q->cur = cur++; for (u32 i = 0; i < numItems - 1; ++i) { + assert(q1->cur < q1->end); u32 type = q1->items[q1->cur].type; if (type > MQE_END) { q1->items[q1->cur].type = type - event_base + base; From 981b59fd053c002b67e81e23d07fd5e102b6d8fb Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Tue, 12 Jul 2016 14:01:51 +1000 Subject: [PATCH 117/166] minor eager prefixes improvements - count eager prefixes as always run engine when comparing with smwr - only check if a prefix is vacuous after adding back literal fragments --- src/rose/rose_build_bytecode.cpp | 7 ++++--- src/rose/rose_build_misc.cpp | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index b8301dbd..37e6ae13 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -1274,9 +1274,6 @@ bool checkSuitableForEager(bool is_prefix, const left_id &left, if (proper_out_degree(g.startDs, g)) { return false; /* not purely anchored */ } - if (is_match_vertex(g.start, g)) { - return false; /* vacuous (todo: handle?) */ - } ei.new_graph = cloneHolder(*left.graph()); auto gg = ei.new_graph; @@ -1284,6 +1281,10 @@ bool checkSuitableForEager(bool is_prefix, const left_id &left, ei.lag_adjust = decreaseLag(build, *gg, succs); + if (is_match_vertex(gg->start, *gg)) { + return false; /* should not still be vacuous as lag decreased */ + } + if (!can_die_early(*gg, EAGER_DIE_BEFORE_LIMIT)) { DEBUG_PRINTF("not eager as stuck alive\n"); return false; diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index 186f4d16..f7d49cbe 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -1234,6 +1234,11 @@ u32 roseQuality(const RoseEngine *t) { always_run++; } + if (t->eagerIterOffset) { + /* eager prefixes are always run */ + always_run++; + } + const HWLM *ftable = getFLiteralMatcher(t); if (ftable) { /* TODO: ignore conditional ftables, or ftables beyond smwr region */ From 4dbbc4eaa57987d1f962653348220e2a64829963 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 14 Jul 2016 10:05:47 +1000 Subject: [PATCH 118/166] rose: add RECORD_ANCHORED instruction to program Moves recordAnchoredLiteralMatch from an unconditional call in the anchored callback to being driven by a program instruction. 
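Sketch (not literal dump output): the literal program for such an anchored
literal now carries an explicit step,

    RECORD_ANCHORED id=<final_id>

which is only emitted when the literal's maximum match offset can exceed
floatingMinLiteralMatchOffset; anchored literals that always match earlier
never touch the anchored log at all.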
--- src/rose/match.c | 26 -------------- src/rose/program_runtime.h | 32 +++++++++++++++++ src/rose/rose_build_bytecode.cpp | 60 ++++++++++++++++++++++++++++++-- src/rose/rose_dump.cpp | 5 +++ src/rose/rose_program.h | 6 ++++ 5 files changed, 100 insertions(+), 29 deletions(-) diff --git a/src/rose/match.c b/src/rose/match.c index 15d3534c..95cb141e 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -112,28 +112,6 @@ hwlmcb_rv_t ensureMpvQueueFlushed(const struct RoseEngine *t, return ensureQueueFlushed_i(t, scratch, qi, loc, 1, in_chained); } -static rose_inline -void recordAnchoredLiteralMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 literal_id, - u64a end) { - assert(end); - struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch); - - DEBUG_PRINTF("record %u @ %llu\n", literal_id, end); - - if (!bf64_set(&scratch->al_log_sum, end - 1)) { - // first time, clear row - DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count); - fatbit_clear(anchoredLiteralRows[end - 1]); - } - - u32 rel_idx = literal_id - t->anchored_base_id; - DEBUG_PRINTF("record %u @ %llu index %u/%u\n", literal_id, end, rel_idx, - t->anchored_count); - assert(rel_idx < t->anchored_count); - fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx); -} - hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, struct hs_scratch *scratch, u32 event, u64a top_squash_distance, u64a end, @@ -254,10 +232,6 @@ int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); - if (real_end > t->floatingMinLiteralMatchOffset) { - recordAnchoredLiteralMatch(t, scratch, id, real_end); - } - return MO_CONTINUE_MATCHING; } diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index fdaa2e07..fef41269 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -165,6 +165,33 @@ void rosePushDelayedMatch(const struct RoseEngine *t, fatbit_set(slot, delay_count, delay_index); } +static rose_inline +void recordAnchoredLiteralMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 literal_id, + u64a end) { + assert(end); + + if (end <= t->floatingMinLiteralMatchOffset) { + return; + } + + struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch); + + DEBUG_PRINTF("record %u @ %llu\n", literal_id, end); + + if (!bf64_set(&scratch->al_log_sum, end - 1)) { + // first time, clear row + DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count); + fatbit_clear(anchoredLiteralRows[end - 1]); + } + + u32 rel_idx = literal_id - t->anchored_base_id; + DEBUG_PRINTF("record %u @ %llu index %u/%u\n", literal_id, end, rel_idx, + t->anchored_count); + assert(rel_idx < t->anchored_count); + fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx); +} + static rose_inline char roseLeftfixCheckMiracles(const struct RoseEngine *t, const struct LeftNfaInfo *left, @@ -1226,6 +1253,11 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(RECORD_ANCHORED) { + recordAnchoredLiteralMatch(t, scratch, ri->id, end); + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CATCH_UP) { if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { return HWLM_TERMINATE_MATCHING; diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 37e6ae13..5cd8161b 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -204,6 +204,7 @@ public: case ROSE_INSTR_CHECK_PREFIX: return 
&u.checkPrefix; case ROSE_INSTR_ANCHORED_DELAY: return &u.anchoredDelay; case ROSE_INSTR_PUSH_DELAYED: return &u.pushDelayed; + case ROSE_INSTR_RECORD_ANCHORED: return &u.recordAnchored; case ROSE_INSTR_CATCH_UP: return &u.catchUp; case ROSE_INSTR_CATCH_UP_MPV: return &u.catchUpMpv; case ROSE_INSTR_SOM_ADJUST: return &u.somAdjust; @@ -255,6 +256,7 @@ public: case ROSE_INSTR_CHECK_PREFIX: return sizeof(u.checkPrefix); case ROSE_INSTR_ANCHORED_DELAY: return sizeof(u.anchoredDelay); case ROSE_INSTR_PUSH_DELAYED: return sizeof(u.pushDelayed); + case ROSE_INSTR_RECORD_ANCHORED: return sizeof(u.recordAnchored); case ROSE_INSTR_CATCH_UP: return sizeof(u.catchUp); case ROSE_INSTR_CATCH_UP_MPV: return sizeof(u.catchUpMpv); case ROSE_INSTR_SOM_ADJUST: return sizeof(u.somAdjust); @@ -305,6 +307,7 @@ public: ROSE_STRUCT_CHECK_PREFIX checkPrefix; ROSE_STRUCT_ANCHORED_DELAY anchoredDelay; ROSE_STRUCT_PUSH_DELAYED pushDelayed; + ROSE_STRUCT_RECORD_ANCHORED recordAnchored; ROSE_STRUCT_CATCH_UP catchUp; ROSE_STRUCT_CATCH_UP_MPV catchUpMpv; ROSE_STRUCT_SOM_ADJUST somAdjust; @@ -4432,6 +4435,49 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id, program.push_back(move(ri)); } +static +u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 max_offset = 0; + for (const auto &v : lit_vertices) { + max_offset = max(max_offset, build.g[v].max_offset); + } + + return max_offset; +} + +static +void makeRecordAnchoredInstruction(const RoseBuildImpl &build, + build_context &bc, u32 final_id, + vector &program) { + assert(contains(build.final_id_to_literal, final_id)); + const auto &lit_ids = build.final_id_to_literal.at(final_id); + + // Must be anchored. + assert(!lit_ids.empty()); + if (build.literals.right.at(*begin(lit_ids)).table != ROSE_ANCHORED) { + return; + } + + // If this anchored literal can never match past + // floatingMinLiteralMatchOffset, we will never have to record it. + u32 max_offset = 0; + for (u32 lit_id : lit_ids) { + assert(build.literals.right.at(lit_id).table == ROSE_ANCHORED); + max_offset = max(max_offset, findMaxOffset(build, lit_id)); + } + + if (max_offset <= bc.floatingMinLiteralMatchOffset) { + return; + } + + auto ri = RoseInstruction(ROSE_INSTR_RECORD_ANCHORED); + ri.u.recordAnchored.id = final_id; + program.push_back(move(ri)); +} + static u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { const auto &lit_vertices = build.literal_info.at(lit_id).vertices; @@ -4589,10 +4635,18 @@ vector buildLiteralProgram(RoseBuildImpl &build, root_programs.push_back(role_prog); } - // Literal may squash groups. if (final_id != MO_INVALID_IDX) { - root_programs.push_back({}); - makeGroupSquashInstruction(build, final_id, root_programs.back()); + vector prog; + + // Literal may squash groups. + makeGroupSquashInstruction(build, final_id, prog); + + // Literal may be anchored and need to be recorded. 
+ makeRecordAnchoredInstruction(build, bc, final_id, prog); + + if (!prog.empty()) { + root_programs.push_back(move(prog)); + } } vector root_program; diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index c483443c..dedd8fcf 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -337,6 +337,11 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(RECORD_ANCHORED) { + os << " id " << ri->id << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CATCH_UP) {} PROGRAM_NEXT_INSTRUCTION diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index 6ca117ea..545e190f 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -55,6 +55,7 @@ enum RoseInstructionCode { ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. + ROSE_INSTR_RECORD_ANCHORED, //!< Record an anchored literal match. ROSE_INSTR_CATCH_UP, //!< Catch up engines, anchored matches. ROSE_INSTR_CATCH_UP_MPV, //!< Catch up the MPV. ROSE_INSTR_SOM_ADJUST, //!< Set SOM from a distance to EOM. @@ -207,6 +208,11 @@ struct ROSE_STRUCT_PUSH_DELAYED { u32 index; // Delay literal index (relative to first delay lit). }; +struct ROSE_STRUCT_RECORD_ANCHORED { + u8 code; //!< From enum RoseInstructionCode. + u32 id; //!< Literal ID. +}; + struct ROSE_STRUCT_CATCH_UP { u8 code; //!< From enum RoseInstructionCode. }; From 8754cbbd2468d495c536107fa005c7187e54c5a1 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 14 Jul 2016 11:40:49 +1000 Subject: [PATCH 119/166] rose: use program offset, not final_id, in atable This removes the need to look up the program offset in a table when handling an anchored literal match. --- src/rose/match.c | 5 ++--- src/rose/rose_build_anchored.cpp | 28 +++++++++++++++++++++++++++- src/rose/rose_build_anchored.h | 5 ++++- src/rose/rose_build_bytecode.cpp | 13 +++++++++---- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/rose/match.c b/src/rose/match.c index 95cb141e..b641e39d 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -220,10 +220,9 @@ int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { tctxt->lastEndOffset = real_end; } - const u32 *programs = getByOffset(t, t->litProgramOffset); - assert(id < t->literalCount); + // Note that the "id" we have been handed is the program offset. const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; - if (roseRunProgram(t, scratch, programs[id], start, real_end, match_len, + if (roseRunProgram(t, scratch, id, start, real_end, match_len, flags) == HWLM_TERMINATE_MATCHING) { assert(can_stop_matching(scratch)); DEBUG_PRINTF("caller requested termination\n"); diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index 286cc7ae..befd0bad 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -204,6 +204,28 @@ void remapAnchoredReports(RoseBuildImpl &tbi) { } } +static +void remapIds(flat_set &reports, const vector &litPrograms) { + flat_set new_reports; + for (auto id : reports) { + assert(id < litPrograms.size()); + new_reports.insert(litPrograms.at(id)); + } + reports = move(new_reports); +} + +/** + * \brief Replace the reports (which are literal final_ids) in the given + * raw_dfa with program offsets. 
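+ *
+ * After remapping, a report raised by the anchored matcher is itself the
+ * offset of the literal program to run, so no litPrograms[] lookup is
+ * needed at match time.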
+ */ +static +void remapIdsToPrograms(raw_dfa &rdfa, const vector &litPrograms) { + for (dstate &ds : rdfa.states) { + remapIds(ds.reports, litPrograms); + remapIds(ds.reports_eod, litPrograms); + } +} + static void populate_holder(const simple_anchored_info &sai, const set &exit_ids, NGHolder *h_in) { @@ -826,7 +848,7 @@ vector buildAnchoredDfas(RoseBuildImpl &build) { aligned_unique_ptr buildAnchoredMatcher(RoseBuildImpl &build, vector &dfas, - size_t *asize) { + const vector &litPrograms, size_t *asize) { const CompileContext &cc = build.cc; if (dfas.empty()) { @@ -835,6 +857,10 @@ buildAnchoredMatcher(RoseBuildImpl &build, vector &dfas, return nullptr; } + for (auto &rdfa : dfas) { + remapIdsToPrograms(rdfa, litPrograms); + } + vector> nfas; vector start_offset; // start offset for each dfa (dots removed) size_t total_size = buildNfas(dfas, &nfas, &start_offset, cc, build.rm); diff --git a/src/rose/rose_build_anchored.h b/src/rose/rose_build_anchored.h index a5317f89..579b26d7 100644 --- a/src/rose/rose_build_anchored.h +++ b/src/rose/rose_build_anchored.h @@ -56,10 +56,13 @@ std::vector buildAnchoredDfas(RoseBuildImpl &build); /** * \brief Construct an anchored_matcher_info runtime structure from the given * set of DFAs. + * + * Remap the literal final_ids used for raw_dfa reports to the program offsets + * given in litPrograms. */ aligned_unique_ptr buildAnchoredMatcher(RoseBuildImpl &build, std::vector &dfas, - size_t *asize); + const std::vector &litPrograms, size_t *asize); u32 anchoredStateSize(const anchored_matcher_info &atable); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 5cd8161b..f451b8ea 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -417,6 +417,10 @@ struct build_context : boost::noncopyable { * that have already been pushed into the engine_blob. */ ue2::unordered_map engineOffsets; + /** \brief Literal programs, indexed by final_id, after they have been + * written to the engine_blob. */ + vector litPrograms; + /** \brief Minimum offset of a match from the floating table. */ u32 floatingMinLiteralMatchOffset = 0; @@ -4736,20 +4740,20 @@ pair buildLiteralPrograms(RoseBuildImpl &build, build_context &bc) { const u32 num_literals = build.final_id_to_literal.size(); auto lit_edge_map = findEdgesByLiteral(build); - vector litPrograms(num_literals); + bc.litPrograms.resize(num_literals); vector delayRebuildPrograms(num_literals); for (u32 finalId = 0; finalId != num_literals; ++finalId) { const auto &lit_edges = lit_edge_map[finalId]; - litPrograms[finalId] = + bc.litPrograms[finalId] = writeLiteralProgram(build, bc, finalId, lit_edges); delayRebuildPrograms[finalId] = buildDelayRebuildProgram(build, bc, finalId); } u32 litProgramsOffset = - add_to_engine_blob(bc, begin(litPrograms), end(litPrograms)); + add_to_engine_blob(bc, begin(bc.litPrograms), end(bc.litPrograms)); u32 delayRebuildProgramsOffset = add_to_engine_blob( bc, begin(delayRebuildPrograms), end(delayRebuildPrograms)); @@ -5206,7 +5210,8 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { // Build anchored matcher. 
size_t asize = 0; u32 amatcherOffset = 0; - auto atable = buildAnchoredMatcher(*this, anchored_dfas, &asize); + auto atable = buildAnchoredMatcher(*this, anchored_dfas, bc.litPrograms, + &asize); if (atable) { currOffset = ROUNDUP_CL(currOffset); amatcherOffset = currOffset; From 210246af018a0ba50a492e6343f4124c6d177884 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 14 Jul 2016 11:50:23 +1000 Subject: [PATCH 120/166] rose_build_anchored: remove unused forward decls --- src/rose/rose_build_anchored.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/rose/rose_build_anchored.h b/src/rose/rose_build_anchored.h index 579b26d7..ef06fcbb 100644 --- a/src/rose/rose_build_anchored.h +++ b/src/rose/rose_build_anchored.h @@ -39,13 +39,10 @@ #include struct anchored_matcher_info; -struct RoseEngine; namespace ue2 { -class NGHolder; class RoseBuildImpl; -struct Grey; struct raw_dfa; /** From a427a2843b2cc73ae512c350bbf9e1a4e89124ba Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 14 Jul 2016 13:34:56 +1000 Subject: [PATCH 121/166] rose_build_anchored: clean up remapping Note that there are no EOD reports in the anchored matcher raw_dfas. --- src/rose/rose_build_anchored.cpp | 66 ++++++++++++++++---------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index befd0bad..60732ff9 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -173,47 +173,36 @@ void mergeAnchoredDfas(vector> &dfas, } static -void translateReportSet(flat_set *rset, const RoseBuildImpl &tbi) { - flat_set old; - old.swap(*rset); - for (auto report_id : old) { - DEBUG_PRINTF("updating %u -> %u\n", report_id, - tbi.literal_info[report_id].final_id); - rset->insert(tbi.literal_info[report_id].final_id); +void remapAnchoredReports(raw_dfa &rdfa, const RoseBuildImpl &build) { + for (dstate &ds : rdfa.states) { + assert(ds.reports_eod.empty()); // Not used in anchored matcher. + if (ds.reports.empty()) { + continue; + } + + flat_set new_reports; + for (auto id : ds.reports) { + assert(id < build.literal_info.size()); + new_reports.insert(build.literal_info.at(id).final_id); + } + ds.reports = move(new_reports); } } +/** + * \brief Replaces the report ids currently in the dfas (rose graph literal + * ids) with the final id for each literal. + */ static -void remapAnchoredReports(raw_dfa &dfa, const RoseBuildImpl &tbi) { - for (dstate &ds : dfa.states) { - translateReportSet(&ds.reports, tbi); - translateReportSet(&ds.reports_eod, tbi); - } -} - -/* Replaces the report ids currently in the dfas (rose graph literal ids) with - * the final id used by the runtime. */ -static -void remapAnchoredReports(RoseBuildImpl &tbi) { - for (auto it = tbi.anchored_nfas.begin(); it != tbi.anchored_nfas.end(); - ++it) { - for (auto &rdfa : it->second) { +void remapAnchoredReports(RoseBuildImpl &build) { + for (auto &m : build.anchored_nfas) { + for (auto &rdfa : m.second) { assert(rdfa); - remapAnchoredReports(*rdfa, tbi); + remapAnchoredReports(*rdfa, build); } } } -static -void remapIds(flat_set &reports, const vector &litPrograms) { - flat_set new_reports; - for (auto id : reports) { - assert(id < litPrograms.size()); - new_reports.insert(litPrograms.at(id)); - } - reports = move(new_reports); -} - /** * \brief Replace the reports (which are literal final_ids) in the given * raw_dfa with program offsets. 
@@ -221,8 +210,17 @@ void remapIds(flat_set &reports, const vector &litPrograms) { static void remapIdsToPrograms(raw_dfa &rdfa, const vector &litPrograms) { for (dstate &ds : rdfa.states) { - remapIds(ds.reports, litPrograms); - remapIds(ds.reports_eod, litPrograms); + assert(ds.reports_eod.empty()); // Not used in anchored matcher. + if (ds.reports.empty()) { + continue; + } + + flat_set new_reports; + for (auto id : ds.reports) { + assert(id < litPrograms.size()); + new_reports.insert(litPrograms.at(id)); + } + ds.reports = move(new_reports); } } From 9eb349a343c05d41a3577454d1a5d495d1d4be11 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 15 Jul 2016 09:50:08 +1000 Subject: [PATCH 122/166] rose: expose smwr builder, tidy up engine build --- src/compiler/compiler.cpp | 12 +-------- src/nfagraph/ng.cpp | 9 ++++--- src/nfagraph/ng.h | 2 +- src/rose/rose_build.h | 7 +++--- src/rose/rose_build_bytecode.cpp | 39 ++++++++++++++++++++++++++++++ src/rose/rose_build_impl.h | 4 ++- src/rose/rose_build_misc.cpp | 35 ++++++--------------------- unit/internal/rose_build_merge.cpp | 19 ++++++--------- 8 files changed, 68 insertions(+), 59 deletions(-) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index ce5f8723..d56aff88 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,7 +52,6 @@ #include "parser/shortcut_literal.h" #include "parser/unsupported.h" #include "parser/utf8_validate.h" -#include "smallwrite/smallwrite_build.h" #include "rose/rose_build.h" #include "rose/rose_build_dump.h" #include "som/slot_manager_dump.h" @@ -304,15 +303,6 @@ aligned_unique_ptr generateRoseEngine(NG &ng) { return nullptr; } - /* avoid building a smwr if just a pure floating case. 
*/ - if (!roseIsPureLiteral(rose.get())) { - u32 qual = roseQuality(rose.get()); - auto smwr = ng.smwr->build(qual); - if (smwr) { - rose = roseAddSmallWrite(rose.get(), smwr.get()); - } - } - dumpRose(*ng.rose, rose.get(), ng.cc.grey); dumpReportManager(ng.rm, ng.cc.grey); dumpSomSlotManager(ng.ssm, ng.cc.grey); diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index 5d4f1b97..5023cbef 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -62,8 +62,8 @@ #include "ng_width.h" #include "ue2common.h" #include "nfa/goughcompile.h" -#include "smallwrite/smallwrite_build.h" #include "rose/rose_build.h" +#include "smallwrite/smallwrite_build.h" #include "util/compile_error.h" #include "util/container.h" #include "util/depth.h" @@ -82,8 +82,8 @@ NG::NG(const CompileContext &in_cc, size_t num_patterns, rm(in_cc.grey), ssm(in_somPrecision), cc(in_cc), - rose(makeRoseBuilder(rm, ssm, cc, boundary)), - smwr(makeSmallWriteBuilder(num_patterns, rm, cc)) { + smwr(makeSmallWriteBuilder(num_patterns, rm, cc)), + rose(makeRoseBuilder(rm, ssm, *smwr, cc, boundary)) { } NG::~NG() { @@ -580,7 +580,8 @@ bool NG::addLiteral(const ue2_literal &literal, u32 expr_index, minWidth = min(minWidth, depth(literal.length())); - smwr->add(literal, id); /* inform small write handler about this literal */ + /* inform small write handler about this literal */ + smwr->add(literal, id); return true; } diff --git a/src/nfagraph/ng.h b/src/nfagraph/ng.h index 95936fcc..4aa6a7dc 100644 --- a/src/nfagraph/ng.h +++ b/src/nfagraph/ng.h @@ -119,8 +119,8 @@ public: BoundaryReports boundary; const CompileContext cc; - const std::unique_ptr rose; //!< Rose builder. const std::unique_ptr smwr; //!< SmallWrite builder. + const std::unique_ptr rose; //!< Rose builder. }; /** \brief Run graph reduction passes. diff --git a/src/rose/rose_build.h b/src/rose/rose_build.h index bef2114f..c71671fa 100644 --- a/src/rose/rose_build.h +++ b/src/rose/rose_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -65,6 +65,7 @@ struct raw_som_dfa; class CharReach; class NGHolder; class ReportManager; +class SmallWriteBuild; class SomSlotManager; class RoseDedupeAux { @@ -128,6 +129,7 @@ public: // Construct a usable Rose builder. std::unique_ptr makeRoseBuilder(ReportManager &rm, SomSlotManager &ssm, + SmallWriteBuild &smwr, const CompileContext &cc, const BoundaryReports &boundary); @@ -140,9 +142,6 @@ size_t roseSize(const RoseEngine *t); * intended to indicate a lightweight rose. 
*/ u32 roseQuality(const RoseEngine *t); -ue2::aligned_unique_ptr -roseAddSmallWrite(const RoseEngine *t, const SmallWriteEngine *smwr); - bool roseIsPureLiteral(const RoseEngine *t); size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index f451b8ea..a8440916 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -64,6 +64,7 @@ #include "nfagraph/ng_stop.h" #include "nfagraph/ng_util.h" #include "nfagraph/ng_width.h" +#include "smallwrite/smallwrite_build.h" #include "som/slot_manager.h" #include "util/alloc.h" #include "util/bitutils.h" @@ -5114,6 +5115,41 @@ u32 buildEagerQueueIter(const set &eager, u32 leftfixBeginQueue, return addIteratorToTable(bc, iter); } +static +aligned_unique_ptr addSmallWriteEngine(RoseBuildImpl &build, + aligned_unique_ptr rose) { + assert(rose); + + if (roseIsPureLiteral(rose.get())) { + DEBUG_PRINTF("pure literal case, not adding smwr\n"); + return rose; + } + + u32 qual = roseQuality(rose.get()); + auto smwr_engine = build.smwr.build(qual); + if (!smwr_engine) { + DEBUG_PRINTF("no smwr built\n"); + return rose; + } + + const size_t mainSize = roseSize(rose.get()); + const size_t smallWriteSize = smwrSize(smwr_engine.get()); + DEBUG_PRINTF("adding smwr engine, size=%zu\n", smallWriteSize); + + const size_t smwrOffset = ROUNDUP_CL(mainSize); + const size_t newSize = smwrOffset + smallWriteSize; + + auto rose2 = aligned_zmalloc_unique(newSize); + char *ptr = (char *)rose2.get(); + memcpy(ptr, rose.get(), mainSize); + memcpy(ptr + smwrOffset, smwr_engine.get(), smallWriteSize); + + rose2->smallWriteOffset = verify_u32(smwrOffset); + rose2->size = verify_u32(newSize); + + return rose2; +} + aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { DerivedBoundaryReports dboundary(boundary); @@ -5467,6 +5503,9 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { // after we copied it into the engine bytecode. assert(byte_length(bc.engine_blob) == engineBlobSize); + // Add a small write engine if appropriate. 
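+    // (Editor's note: addSmallWriteEngine() above lays the combined
+    // allocation out as
+    //     [ RoseEngine | pad to cache line | SmallWriteEngine ]
+    // recording smallWriteOffset = ROUNDUP_CL(roseSize()), so a reader can
+    // recover the sub-engine along the lines of
+    //     (const SmallWriteEngine *)((const char *)rose + rose->smallWriteOffset)
+    // -- an illustrative cast, not necessarily the runtime's exact accessor.)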
+ engine = addSmallWriteEngine(*this, move(engine)); + DEBUG_PRINTF("rose done %p\n", engine.get()); return engine; } diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index a00bc4ea..19f803b2 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -60,6 +60,7 @@ struct BoundaryReports; struct CastleProto; struct CompileContext; class ReportManager; +class SmallWriteBuild; class SomSlotManager; struct suffix_id { @@ -415,7 +416,7 @@ std::set all_reports(const OutfixInfo &outfix); // Concrete impl class class RoseBuildImpl : public RoseBuild { public: - RoseBuildImpl(ReportManager &rm, SomSlotManager &ssm, + RoseBuildImpl(ReportManager &rm, SomSlotManager &ssm, SmallWriteBuild &smwr, const CompileContext &cc, const BoundaryReports &boundary); ~RoseBuildImpl() override; @@ -584,6 +585,7 @@ public: QueueIndexFactory qif; ReportManager &rm; SomSlotManager &ssm; + SmallWriteBuild &smwr; const BoundaryReports &boundary; private: diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index f7d49cbe..c2f9f580 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -67,7 +67,9 @@ namespace ue2 { // just to get it out of the header RoseBuild::~RoseBuild() { } -RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in, +RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, + SomSlotManager &ssm_in, + SmallWriteBuild &smwr_in, const CompileContext &cc_in, const BoundaryReports &boundary_in) : cc(cc_in), @@ -83,6 +85,7 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in, max_rose_anchored_floating_overlap(0), rm(rm_in), ssm(ssm_in), + smwr(smwr_in), boundary(boundary_in), next_nfa_report(0) { // add root vertices to graph @@ -233,10 +236,12 @@ size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const { } // RoseBuild factory -unique_ptr makeRoseBuilder(ReportManager &rm, SomSlotManager &ssm, +unique_ptr makeRoseBuilder(ReportManager &rm, + SomSlotManager &ssm, + SmallWriteBuild &smwr, const CompileContext &cc, const BoundaryReports &boundary) { - return ue2::make_unique(rm, ssm, cc, boundary); + return ue2::make_unique(rm, ssm, smwr, cc, boundary); } size_t roseSize(const RoseEngine *t) { @@ -1279,30 +1284,6 @@ u32 roseQuality(const RoseEngine *t) { return 1; } -/** \brief Add a SMWR engine to the given RoseEngine. */ -aligned_unique_ptr roseAddSmallWrite(const RoseEngine *t, - const SmallWriteEngine *smwr) { - assert(t); - assert(smwr); - - const u32 mainSize = roseSize(t); - const u32 smallWriteSize = smwrSize(smwr); - - u32 smwrOffset = ROUNDUP_CL(mainSize); - u32 newSize = smwrOffset + smallWriteSize; - - aligned_unique_ptr t2 = - aligned_zmalloc_unique(newSize); - char *ptr = (char *)t2.get(); - memcpy(ptr, t, mainSize); - memcpy(ptr + smwrOffset, smwr, smallWriteSize); - - t2->smallWriteOffset = smwrOffset; - t2->size = newSize; - - return t2; -} - #ifndef NDEBUG /** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose * graph are implementable. 
*/ diff --git a/unit/internal/rose_build_merge.cpp b/unit/internal/rose_build_merge.cpp index ad6b0176..3f5a8382 100644 --- a/unit/internal/rose_build_merge.cpp +++ b/unit/internal/rose_build_merge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,20 +39,12 @@ #include "util/compile_context.h" #include "util/graph_range.h" #include "util/make_unique.h" +#include "smallwrite/smallwrite_build.h" #include "som/slot_manager.h" using std::vector; using namespace ue2; -static -std::unique_ptr constructBuilder(const Grey &grey) { - CompileContext cc(true, false, get_current_target(), grey); - ReportManager rm(cc.grey); - SomSlotManager ssm(8); // som precision - BoundaryReports boundary; - return makeRoseBuilder(rm, ssm, cc, boundary); -} - static std::unique_ptr makeSuffixGraph(ReportID report) { auto h = ue2::make_unique(NFA_SUFFIX); @@ -100,7 +92,12 @@ size_t numUniqueSuffixGraphs(const RoseGraph &g) { TEST(RoseMerge, uncalcLeaves_nonleaf) { Grey grey; - auto build_base = constructBuilder(grey); + CompileContext cc(true, false, get_current_target(), grey); + ReportManager rm(cc.grey); + SomSlotManager ssm(8); // som precision + auto smwr = makeSmallWriteBuilder(1, rm, cc); + BoundaryReports boundary; + auto build_base = makeRoseBuilder(rm, ssm, *smwr, cc, boundary); ASSERT_NE(nullptr, build_base); RoseBuildImpl &build = static_cast(*build_base); From b13a90e5d2122225d87f79bcede5235984d5e760 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Thu, 30 Jun 2016 09:57:18 +1000 Subject: [PATCH 123/166] compiledump: allow disabling of early graphs for large compiles --- src/rose/rose_build_dump.cpp | 8 ++++++-- src/rose/rose_in_dump.cpp | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 5e176c30..a33e653a 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -273,14 +273,18 @@ void dumpRoseGraph(const RoseBuild &build_base, const RoseEngine *t, const RoseBuildImpl &build = dynamic_cast(build_base); const Grey &grey = build.cc.grey; - if (!grey.dumpFlags) { + + /* "early" rose graphs should only be dumped if we are dumping intermediate + * graphs. Early graphs can be identified by the lack of a RoseEngine. */ + u32 flag_test = t ? 
Grey::DUMP_IMPL : Grey::DUMP_INT_GRAPH; + + if (!(grey.dumpFlags & flag_test)) { return; } stringstream ss; ss << grey.dumpPath << filename; - DEBUG_PRINTF("dumping graph to %s\n", ss.str().c_str()); ofstream os(ss.str()); diff --git a/src/rose/rose_in_dump.cpp b/src/rose/rose_in_dump.cpp index 899e50c4..97aefdc4 100644 --- a/src/rose/rose_in_dump.cpp +++ b/src/rose/rose_in_dump.cpp @@ -51,7 +51,7 @@ namespace ue2 { void dumpPreRoseGraph(const RoseInGraph &ig, const Grey &grey, const char *filename) { - if (!grey.dumpFlags) { + if (!(grey.dumpFlags & Grey::DUMP_INT_GRAPH)) { return; } From 5c5ec905cc8658df46b03e8eb164b84fc2e77b19 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Fri, 15 Jul 2016 13:07:00 +1000 Subject: [PATCH 124/166] violet: initial implementation --- CMakeLists.txt | 2 + src/grey.cpp | 26 +- src/grey.h | 10 + src/hwlm/hwlm_build.cpp | 5 + src/nfa/nfa_kind.h | 28 + src/nfagraph/ng.cpp | 9 + src/nfagraph/ng_holder.h | 10 + src/nfagraph/ng_literal_analysis.cpp | 103 +- src/nfagraph/ng_literal_analysis.h | 31 +- src/nfagraph/ng_literal_component.cpp | 4 +- src/nfagraph/ng_rose.cpp | 69 +- src/nfagraph/ng_rose.h | 12 +- src/nfagraph/ng_som.cpp | 3 +- src/nfagraph/ng_split.cpp | 14 +- src/nfagraph/ng_util.cpp | 16 +- src/nfagraph/ng_util.h | 4 + src/nfagraph/ng_violet.cpp | 2642 +++++++++++++++++++++++++ src/nfagraph/ng_violet.h | 52 + src/parser/shortcut_literal.cpp | 4 +- src/rose/rose_build_add.cpp | 198 +- src/rose/rose_build_add_mask.cpp | 17 +- src/rose/rose_build_compile.cpp | 38 +- src/rose/rose_build_dump.cpp | 22 - src/rose/rose_build_lookaround.cpp | 6 +- src/rose/rose_build_util.h | 3 + src/rose/rose_in_dump.cpp | 5 +- src/rose/rose_in_graph.h | 8 +- src/rose/rose_in_util.h | 5 + src/rose/stream.c | 3 +- src/scratch.h | 12 +- src/util/ue2string.h | 11 +- 31 files changed, 3171 insertions(+), 201 deletions(-) create mode 100644 src/nfagraph/ng_violet.cpp create mode 100644 src/nfagraph/ng_violet.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7dff1c51..e748e955 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -748,6 +748,8 @@ SET (hs_SRCS src/nfagraph/ng_util.h src/nfagraph/ng_vacuous.cpp src/nfagraph/ng_vacuous.h + src/nfagraph/ng_violet.cpp + src/nfagraph/ng_violet.h src/nfagraph/ng_width.cpp src/nfagraph/ng_width.h src/parser/AsciiComponentClass.cpp diff --git a/src/grey.cpp b/src/grey.cpp index f4a67677..bba5198a 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -34,7 +34,7 @@ #include #include -#define DEFAULT_MAX_HISTORY 60 +#define DEFAULT_MAX_HISTORY 110 using namespace std; @@ -51,7 +51,9 @@ Grey::Grey(void) : allowLbr(true), allowMcClellan(true), allowPuff(true), + allowLiteral(true), allowRose(true), + allowViolet(true), allowExtendedNFA(true), /* bounded repeats of course */ allowLimExNFA(true), allowAnchoredAcyclic(true), @@ -60,6 +62,13 @@ Grey::Grey(void) : allowDecoratedLiteral(true), allowNoodle(true), fdrAllowTeddy(true), + violetAvoidSuffixes(true), + violetAvoidWeakInfixes(true), + violetDoubleCut(true), + violetExtractStrongLiterals(true), + violetLiteralChains(true), + violetDoubleCutLiteralLen(3), + violetEarlyCleanLiteralLen(6), puffImproveHead(true), castleExclusive(true), mergeSEP(true), /* short exhaustible passthroughs */ @@ -206,7 +215,9 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowLbr); G_UPDATE(allowMcClellan); G_UPDATE(allowPuff); + G_UPDATE(allowLiteral); G_UPDATE(allowRose); + G_UPDATE(allowViolet); G_UPDATE(allowExtendedNFA); G_UPDATE(allowLimExNFA); G_UPDATE(allowAnchoredAcyclic); @@ -215,6 +226,13 @@ 
void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowDecoratedLiteral); G_UPDATE(allowNoodle); G_UPDATE(fdrAllowTeddy); + G_UPDATE(violetAvoidSuffixes); + G_UPDATE(violetAvoidWeakInfixes); + G_UPDATE(violetDoubleCut); + G_UPDATE(violetExtractStrongLiterals); + G_UPDATE(violetLiteralChains); + G_UPDATE(violetDoubleCutLiteralLen); + G_UPDATE(violetEarlyCleanLiteralLen); G_UPDATE(puffImproveHead); G_UPDATE(castleExclusive); G_UPDATE(mergeSEP); @@ -315,7 +333,9 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowLitHaig = false; g->allowMcClellan = false; g->allowPuff = false; + g->allowLiteral = false; g->allowRose = false; + g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; done = true; @@ -331,7 +351,9 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowLitHaig = false; g->allowMcClellan = true; g->allowPuff = false; + g->allowLiteral = false; g->allowRose = false; + g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; done = true; @@ -347,7 +369,9 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowLitHaig = false; g->allowMcClellan = true; g->allowPuff = false; + g->allowLiteral = false; g->allowRose = false; + g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; done = true; diff --git a/src/grey.h b/src/grey.h index 03e40ed5..1714a0eb 100644 --- a/src/grey.h +++ b/src/grey.h @@ -51,7 +51,9 @@ struct Grey { bool allowLbr; bool allowMcClellan; bool allowPuff; + bool allowLiteral; bool allowRose; + bool allowViolet; bool allowExtendedNFA; bool allowLimExNFA; bool allowAnchoredAcyclic; @@ -62,6 +64,14 @@ struct Grey { bool allowNoodle; bool fdrAllowTeddy; + u32 violetAvoidSuffixes; /* 0=never, 1=sometimes, 2=always */ + bool violetAvoidWeakInfixes; + bool violetDoubleCut; + bool violetExtractStrongLiterals; + bool violetLiteralChains; + u32 violetDoubleCutLiteralLen; + u32 violetEarlyCleanLiteralLen; + bool puffImproveHead; bool castleExclusive; // enable castle mutual exclusion analysis diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index 42d6bbdf..7ba82fcc 100644 --- a/src/hwlm/hwlm_build.cpp +++ b/src/hwlm/hwlm_build.cpp @@ -35,6 +35,7 @@ #include "hwlm_internal.h" #include "noodle_engine.h" #include "noodle_build.h" +#include "scratch.h" #include "ue2common.h" #include "fdr/fdr_compile.h" #include "nfa/shufticompile.h" @@ -493,6 +494,10 @@ bool isNoodleable(const vector &lits, stream_control->history_max); return false; } + if (2 * lits.front().s.length() - 2 > FDR_TEMP_BUF_SIZE) { + assert(0); + return false; + } } if (!lits.front().msk.empty()) { diff --git a/src/nfa/nfa_kind.h b/src/nfa/nfa_kind.h index adc7045f..f2ac6189 100644 --- a/src/nfa/nfa_kind.h +++ b/src/nfa/nfa_kind.h @@ -37,6 +37,8 @@ #include "ue2common.h" +#include + namespace ue2 { /** \brief Specify the use-case for an nfa engine. 
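 * (Editor's note: the include added in the hunk above is evidently <string>
 * for the new to_string(nfa_kind) helper below, which is compiled only under
 * DEBUG/DUMP_SUPPORT and also covers the NFA_EAGER_PREFIX kind.)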
*/ @@ -115,6 +117,32 @@ bool has_managed_reports(enum nfa_kind k) { } } +#if defined(DEBUG) || defined(DUMP_SUPPORT) + +inline +std::string to_string(nfa_kind k) { + switch (k) { + case NFA_PREFIX: + return "PREFIX"; + case NFA_INFIX: + return "INFIX"; + case NFA_SUFFIX: + return "SUFFIX"; + case NFA_OUTFIX: + return "OUTFIX"; + case NFA_REV_PREFIX: + return "REV_PREFIX"; + case NFA_OUTFIX_RAW: + return "OUTFIX_RAW"; + case NFA_EAGER_PREFIX: + return "EAGER_PREFIX"; + } + assert(0); + return "?"; +} + +#endif + } // namespace ue2 #endif diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index 5023cbef..35b2eb35 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -57,6 +57,7 @@ #include "ng_small_literal_set.h" #include "ng_som.h" #include "ng_vacuous.h" +#include "ng_violet.h" #include "ng_utf8.h" #include "ng_util.h" #include "ng_width.h" @@ -244,6 +245,10 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, return true; } + if (doViolet(*ng.rose, g, w.prefilter, cc)) { + return true; + } + if (splitOffRose(*ng.rose, g, w.prefilter, cc)) { return true; } @@ -261,6 +266,10 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, return true; } + if (doViolet(*ng.rose, g, w.prefilter, cc)) { + return true; + } + if (splitOffRose(*ng.rose, g, w.prefilter, cc)) { return true; } diff --git a/src/nfagraph/ng_holder.h b/src/nfagraph/ng_holder.h index 5b6a3de8..f0a387d0 100644 --- a/src/nfagraph/ng_holder.h +++ b/src/nfagraph/ng_holder.h @@ -239,6 +239,16 @@ vertices(const NGHolder &h) { */ void clear_graph(NGHolder &h); +inline +void renumber_edges(NGHolder &h) { + h.renumberEdges(); +} + +inline +void renumber_vertices(NGHolder &h) { + h.renumberVertices(); +} + /* * \brief Clear and remove all of the vertices pointed to by the given iterator * range. diff --git a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index 9cb0091e..9229457c 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -339,6 +339,12 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e, g[source(e, g)].index, g[target(e, g)].index, s.size()); } +bool bad_mixed_sensitivity(const ue2_literal &s) { + /* TODO: if the mixed cases is entirely within MAX_MASK2_WIDTH of the end, + * we should be able to handle it */ + return mixed_sensitivity(s) && s.length() > MAX_MASK2_WIDTH; +} + static u64a litUniqueness(const string &s) { CharReach seen(s); @@ -624,6 +630,48 @@ u64a compressAndScore(set &s) { return score; } +/* like compressAndScore, but replaces long mixed sensitivity literals with + * something weaker. */ +u64a sanitizeAndCompressAndScore(set &lits) { + const size_t maxExploded = 8; // only case-explode this far + + /* TODO: the whole compression thing could be made better by systematically + * considering replacing literal sets not just by common suffixes but also + * by nocase literals. */ + + vector replacements; + + for (auto it = lits.begin(); it != lits.end();) { + auto jt = it; + ++it; + + if (!bad_mixed_sensitivity(*jt)) { + continue; + } + + /* we have to replace *jt with something... 
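+         * (Editor's worked example: a mixed literal with k nocase positions
+         * case-explodes into 2^k sensitive variants; "fooBAR" with
+         * 'f','o','o' nocase yields 2^3 = 8 variants (fooBAR, FooBAR, ...,
+         * FOOBAR), exactly maxExploded, so it is exploded. One further
+         * nocase position would give 16 variants and instead fall through
+         * to make_nocase() below, weakening the literal to a nocase
+         * "foobar".)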
*/ + ue2_literal s = *jt; + lits.erase(jt); + + vector exploded; + for (auto cit = caseIterateBegin(s); cit != caseIterateEnd(); ++cit) { + exploded.emplace_back(*cit, false); + if (exploded.size() > maxExploded) { + goto dont_explode; + } + } + insert(&replacements, replacements.end(), exploded); + + continue; + dont_explode: + make_nocase(&s); + replacements.push_back(s); + } + + insert(&lits, replacements); + return compressAndScore(lits); +} + u64a scoreSet(const set &s) { if (s.empty()) { return NO_LITERAL_AT_EDGE_SCORE; @@ -674,7 +722,7 @@ set getLiteralSet(const NGHolder &g, const NFAVertex &v, return s; } -vector scoreEdges(const NGHolder &g) { +vector scoreEdges(const NGHolder &g, const flat_set &known_bad) { assert(hasCorrectlyNumberedEdges(g)); vector scores(num_edges(g)); @@ -682,8 +730,12 @@ vector scoreEdges(const NGHolder &g) { for (const auto &e : edges_range(g)) { u32 eidx = g[e].index; assert(eidx < scores.size()); - set ls = getLiteralSet(g, e); - scores[eidx] = compressAndScore(ls); + if (contains(known_bad, e)) { + scores[eidx] = NO_LITERAL_AT_EDGE_SCORE; + } else { + set ls = getLiteralSet(g, e); + scores[eidx] = compressAndScore(ls); + } } return scores; @@ -842,4 +894,49 @@ bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) { return true; } +bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) { + NFAVertex v = g.accept; + + for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { + NGHolder::inv_adjacency_iterator ai, ae; + tie(ai, ae) = inv_adjacent_vertices(v, g); + if (ai == ae) { + assert(0); // no predecessors? + return false; + } + v = *ai++; + if (ai != ae) { + DEBUG_PRINTF("branch, fail\n"); + return false; + } + + if (is_special(v, g)) { + DEBUG_PRINTF("special found, fail\n"); + return false; + } + + const CharReach &cr_g = g[v].char_reach; + const CharReach &cr_l = *it; + + if (!cr_l.isSubsetOf(cr_g)) { + /* running over the prefix is needed to prevent false postives */ + DEBUG_PRINTF("reach fail\n"); + return false; + } + } + + // Our last value for v should have only start states for predecessors. + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (!is_any_start(u, g)) { + DEBUG_PRINTF("pred is not start\n"); + return false; + } + } + + assert(num_vertices(g) == lit.length() + N_SPECIALS); + + DEBUG_PRINTF("ok\n"); + return true; +} + } // namespace ue2 diff --git a/src/nfagraph/ng_literal_analysis.h b/src/nfagraph/ng_literal_analysis.h index 4fa72b9f..6fd9c525 100644 --- a/src/nfagraph/ng_literal_analysis.h +++ b/src/nfagraph/ng_literal_analysis.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,9 +42,7 @@ namespace ue2 { #define NO_LITERAL_AT_EDGE_SCORE 10000000ULL - -/* Score for special-to-special edges */ -#define INVALID_EDGE_CAP 100000000ULL +#define INVALID_EDGE_CAP 100000000ULL /* special-to-special score */ class NGHolder; @@ -59,9 +57,20 @@ std::set getLiteralSet(const NGHolder &g, const NFAVertex &v, bool only_first_encounter = true); std::set getLiteralSet(const NGHolder &g, const NFAEdge &e); -/** Score all the edges in the given graph, returning them in \p scores indexed +/** + * Returns true if we are unable to use a mixed sensitivity literal in rose (as + * our literal matchers are generally either case sensitive or not). 
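+ * (Editor's note: concretely, "bad" here means mixed_sensitivity(s) with
+ * s.length() > MAX_MASK2_WIDTH, per the implementation in
+ * ng_literal_analysis.cpp above.)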
+ * + * Shortish mixed sensitivity literals can be handled by confirm checks in rose + * and are not flagged as bad. + */ +bool bad_mixed_sensitivity(const ue2_literal &s); + +/** + * Score all the edges in the given graph, returning them in \p scores indexed * by edge_index. */ -std::vector scoreEdges(const NGHolder &h); +std::vector scoreEdges(const NGHolder &h, + const flat_set &known_bad = {}); /** Returns a score for a literal set. Lower scores are better. */ u64a scoreSet(const std::set &s); @@ -69,6 +78,12 @@ u64a scoreSet(const std::set &s); /** Compress a literal set to fewer literals. */ u64a compressAndScore(std::set &s); +/** + * Compress a literal set to fewer literals and replace any long mixed + * sensitivity literals with supported literals. + */ +u64a sanitizeAndCompressAndScore(std::set &s); + bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, NGHolder *rhs); @@ -77,6 +92,10 @@ bool splitOffAnchoredLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out); +/** \brief Returns true if the given literal is the only thing in the graph, + * from (start or startDs) to accept. */ +bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit); + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_literal_component.cpp b/src/nfagraph/ng_literal_component.cpp index 9ee4f151..871c8ac7 100644 --- a/src/nfagraph/ng_literal_component.cpp +++ b/src/nfagraph/ng_literal_component.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -186,7 +186,7 @@ bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored, /** \brief Split off literals. True if any changes were made to the graph. */ bool splitOffLiterals(NG &ng, NGWrapper &g) { - if (!ng.cc.grey.allowRose) { + if (!ng.cc.grey.allowLiteral) { return false; } diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp index 9b8f0e9a..4b16364a 100644 --- a/src/nfagraph/ng_rose.cpp +++ b/src/nfagraph/ng_rose.cpp @@ -773,51 +773,6 @@ unique_ptr LitCollection::pickNext() { } -/** \brief Returns true if the given literal is the only thing in the graph, - * from start to accept. */ -static -bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) { - NFAVertex v = g.accept; - - for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { - NGHolder::inv_adjacency_iterator ai, ae; - tie(ai, ae) = inv_adjacent_vertices(v, g); - if (ai == ae) { - assert(0); // no predecessors? - return false; - } - v = *ai++; - if (ai != ae) { - DEBUG_PRINTF("branch, fail\n"); - return false; - } - - if (is_special(v, g)) { - DEBUG_PRINTF("special found, fail\n"); - return false; - } - - const CharReach &cr = g[v].char_reach; - if (cr != *it) { - DEBUG_PRINTF("reach fail\n"); - return false; - } - } - - // Our last value for v should have only start states for predecessors. 
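(Editor's note: this removed ng_rose.cpp copy required exact reach equality at every vertex, cr != *it, whereas the relocated version in ng_literal_analysis.cpp above relaxes the check to cr_l.isSubsetOf(cr_g), tolerating graph vertices whose reach is wider than the literal's character.)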
- for (auto u : inv_adjacent_vertices_range(v, g)) { - if (!is_any_start(u, g)) { - DEBUG_PRINTF("pred is not start\n"); - return false; - } - } - - assert(num_vertices(g) == lit.length() + N_SPECIALS); - - DEBUG_PRINTF("ok\n"); - return true; -} - static bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) { set curr, next; @@ -933,20 +888,11 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, return delay; } -static void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 delay) { + u32 delay, const vector &preds) { assert(delay <= lit.length()); DEBUG_PRINTF("adding on '%s' %u\n", ((const string &)lit).c_str(), delay); - vector preds; - insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); - clear_in_edges(g.accept, g); - - for (auto v : preds) { - g[v].reports.clear(); /* clear report from old accepts */ - } - NFAVertex prev = g.accept; auto it = lit.rbegin(); while (delay--) { @@ -972,6 +918,19 @@ void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, assert(allMatchStatesHaveReports(g)); } +void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 delay) { + vector preds; + insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); + clear_in_edges(g.accept, g); + + for (auto v : preds) { + g[v].reports.clear(); /* clear report from old accepts */ + } + + restoreTrailingLiteralStates(g, lit, delay, preds); +} + /* return false if we should get rid of the edge altogether */ static bool removeLiteralFromLHS(RoseInGraph &ig, const RoseInEdge &lhs, diff --git a/src/nfagraph/ng_rose.h b/src/nfagraph/ng_rose.h index 4e16a3c4..d180e8a5 100644 --- a/src/nfagraph/ng_rose.h +++ b/src/nfagraph/ng_rose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,8 +33,11 @@ #ifndef NG_ROSE_H #define NG_ROSE_H +#include "ng_holder.h" #include "ue2common.h" +#include + namespace ue2 { class NGHolder; @@ -65,6 +68,13 @@ bool checkRose(const ReportManager &rm, const NGHolder &h, bool prefilter, u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, u32 max_delay, bool overhang_ok = true); +void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 delay); + +void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 delay, + const std::vector &preds); + } // namespace ue2 #endif // NG_ROSE_H diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index 4af0e20c..ed2942bb 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -2064,8 +2064,7 @@ sombe_rv doHaigLitSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, ReportManager &rm = ng.rm; SomSlotManager &ssm = ng.ssm; - // This approach relies on Rose. 
- if (!cc.grey.allowRose) { + if (!cc.grey.allowHaigLit) { return SOMBE_FAIL; } diff --git a/src/nfagraph/ng_split.cpp b/src/nfagraph/ng_split.cpp index 75150136..bce638c0 100644 --- a/src/nfagraph/ng_split.cpp +++ b/src/nfagraph/ng_split.cpp @@ -100,7 +100,12 @@ void splitLHS(const NGHolder &base, const vector &pivots, add_edge((*lhs_map)[pivot], lhs->accept, *lhs); } - pruneUseless(*lhs); + /* should do the renumbering unconditionally as we know edges are already + * misnumbered */ + pruneUseless(*lhs, false); + renumber_edges(*lhs); + renumber_vertices(*lhs); + filterSplitMap(*lhs, lhs_map); switch (base.kind) { @@ -148,7 +153,12 @@ void splitRHS(const NGHolder &base, const vector &pivots, assert(contains(*rhs_map, pivot)); add_edge(rhs->start, (*rhs_map)[pivot], *rhs); } - pruneUseless(*rhs); + + /* should do the renumbering unconditionally as we know edges are already + * misnumbered */ + pruneUseless(*rhs, false); + renumber_edges(*rhs); + renumber_vertices(*rhs); filterSplitMap(*rhs, rhs_map); switch (base.kind) { diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp index 935a223e..e9f6be55 100644 --- a/src/nfagraph/ng_util.cpp +++ b/src/nfagraph/ng_util.cpp @@ -209,6 +209,15 @@ bool isAnchored(const NGHolder &g) { return true; } +bool isFloating(const NGHolder &g) { + for (auto v : adjacent_vertices_range(g.start, g)) { + if (v != g.startDs && !edge(g.startDs, v, g).second) { + return false; + } + } + return true; +} + bool isAcyclic(const NGHolder &g) { try { depth_first_search( @@ -657,7 +666,8 @@ bool hasCorrectlyNumberedVertices(const NGHolder &g) { } ids[id] = true; } - return find(ids.begin(), ids.end(), false) == ids.end(); + return find(ids.begin(), ids.end(), false) == ids.end() + && num_vertices(g) == num_vertices(g.g); } /** Assertion: returns true if the edges in this graph are contiguously (and @@ -672,8 +682,10 @@ bool hasCorrectlyNumberedEdges(const NGHolder &g) { } ids[id] = true; } - return find(ids.begin(), ids.end(), false) == ids.end(); + return find(ids.begin(), ids.end(), false) == ids.end() + && num_edges(g) == num_edges(g.g); } + #endif // NDEBUG } // namespace ue2 diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h index 955c9b7b..833523c7 100644 --- a/src/nfagraph/ng_util.h +++ b/src/nfagraph/ng_util.h @@ -228,6 +228,10 @@ bool isVacuous(const NGHolder &h); * proper successors). */ bool isAnchored(const NGHolder &h); +/** \brief True if the graph contains no anchored vertices (start has no + * successors aside from startDs or vertices connected to startDs). */ +bool isFloating(const NGHolder &h); + /** True if the graph contains no back-edges at all, other than the * startDs self-loop. */ bool isAcyclic(const NGHolder &g); diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp new file mode 100644 index 00000000..0d1c1c12 --- /dev/null +++ b/src/nfagraph/ng_violet.cpp @@ -0,0 +1,2642 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "ng_violet.h" + +#include "grey.h" +#include "ng_depth.h" +#include "ng_dominators.h" +#include "ng_dump.h" +#include "ng_equivalence.h" +#include "ng_holder.h" +#include "ng_is_equal.h" +#include "ng_literal_analysis.h" +#include "ng_netflow.h" +#include "ng_prune.h" +#include "ng_redundancy.h" +#include "ng_region.h" +#include "ng_reports.h" +#include "ng_rose.h" +#include "ng_split.h" +#include "ng_util.h" +#include "ng_width.h" +#include "rose/rose_build.h" +#include "rose/rose_build_util.h" +#include "rose/rose_in_dump.h" +#include "rose/rose_in_graph.h" +#include "rose/rose_in_util.h" +#include "util/compare.h" +#include "util/compile_context.h" +#include "util/container.h" +#include "util/graph.h" +#include "util/graph_range.h" +#include "util/make_unique.h" +#include "util/order_check.h" +#include "util/target_info.h" +#include "util/ue2string.h" +#include "util/ue2_containers.h" + +#include +#include +#include +#include +#include +#include +#include + +#define STAGE_DEBUG_PRINTF DEBUG_PRINTF + +using namespace std; +using boost::adaptors::map_values; + +namespace ue2 { + +/* createsAnchoredLHS() is conservative as the depths take into account + * back edges that come from beyond the split point and would be missing after + * the graph is split. */ +static +bool createsAnchoredLHS(const NGHolder &g, const vector &vv, + const vector &depths, + const Grey &grey, depth max_depth = depth::infinity()) { + max_depth = min(max_depth, depth(grey.maxAnchoredRegion)); + + for (auto v : vv) { + /* avoid issues of self loops blowing out depths: + * look at preds, add 1 */ + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == v) { + continue; + } + + u32 idx = g[u].index; + assert(idx < depths.size()); + if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) { + return false; + } + } + } + return true; +} + +/* createsTransientLHS() is conservative as the depths take into account + * back edges that come from beyond the split point and would be missing after + * the graph is split. 
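+ * (Editor's note: "conservative" errs toward rejecting a split: a vertex
+ * set is deemed transient only if every predecessor's maximum distance
+ * from init stays below grey.maxHistoryAvailable, mirroring the
+ * maxAnchoredRegion test in createsAnchoredLHS() above.)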
*/ +static +bool createsTransientLHS(const NGHolder &g, const vector &vv, + const vector &depths, + const Grey &grey) { + const depth max_depth(grey.maxHistoryAvailable); + + for (auto v : vv) { + /* avoid issues of self loops blowing out depths: + * look at preds, add 1 */ + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == v) { + continue; + } + + u32 idx = g[u].index; + assert(idx < depths.size()); + if (maxDistFromInit(depths.at(idx)) >= max_depth) { + return false; + } + } + } + return true; +} + +namespace { +/** + * Information on a cut: vertices and literals. + */ +struct VertLitInfo { + VertLitInfo() {} + VertLitInfo(NFAVertex v, const set &litlit, bool c_anch, + bool c_tran = false) + : vv(vector(1, v)), lit(litlit), creates_anchored(c_anch), + creates_transient(c_tran) {} + VertLitInfo(const vector &vv_in, const set &lit_in, + bool c_anch) + : vv(vv_in), lit(lit_in), creates_anchored(c_anch) {} + vector vv; + set lit; + + bool creates_anchored = false; + bool creates_transient = false; +}; + +/** + * \brief Comparator class for sorting LitCollection::lits. + * + * This is separated out from LitCollection itself as passing LitCollection to + * std::sort() would incur a (potentially expensive) copy. + */ +class LitComparator { +public: + LitComparator(const NGHolder &g_in, bool sa, bool st) + : g(g_in), seeking_anchored(sa), seeking_transient(st) {} + bool operator()(const unique_ptr &a, + const unique_ptr &b) const { + assert(a && b); + + if (seeking_anchored) { + if (a->creates_anchored != b->creates_anchored) { + return a->creates_anchored < b->creates_anchored; + } + } + + if (seeking_transient) { + if (a->creates_transient != b->creates_transient) { + return a->creates_transient < b->creates_transient; + } + } + + u64a score_a = scoreSet(a->lit); + u64a score_b = scoreSet(b->lit); + + if (score_a != score_b) { + return score_a > score_b; + } + + /* vertices should only be in one candidate cut */ + assert(a->vv == b->vv || a->vv.front() != b->vv.front()); + return g[a->vv.front()].index > g[b->vv.front()].index; + } + +private: + const NGHolder &g; /**< graph on which cuts are found */ + + bool seeking_anchored; + bool seeking_transient; +}; +} + +static +size_t shorter_than(const set &s, size_t limit) { + size_t count = 0; + + for (const auto &lit : s) { + if (lit.length() < limit) { + count++; + } + } + + return count; +} + +static +u32 min_len(const set &s) { + u32 rv = ~0U; + + for (const auto &lit : s) { + rv = min(rv, (u32)lit.length()); + } + + return rv; +} + +static +u32 min_period(const set &s) { + u32 rv = ~0U; + + for (const auto &lit : s) { + rv = min(rv, (u32)minStringPeriod(lit)); + } + DEBUG_PRINTF("min period %u\n", rv); + return rv; +} + +#define MIN_ANCHORED_LEN 2 + +static +bool validateRoseLiteralSetQuality(const set &s, u64a score, + bool anchored, u32 min_allowed_floating_len, + bool desperation) { + u32 min_allowed_len = anchored ? 
MIN_ANCHORED_LEN + : min_allowed_floating_len; + + assert(none_of(begin(s), end(s), bad_mixed_sensitivity)); + + if (score >= NO_LITERAL_AT_EDGE_SCORE) { + DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size()); + return false; + } + + assert(!s.empty()); + if (s.empty()) { + DEBUG_PRINTF("candidate is too bad/something went wrong\n"); + return false; + } + + u32 s_min_len = min_len(s); + u32 s_min_period = min_period(s); + size_t short_count = shorter_than(s, 5); + + DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u" + " short_count=%zu desp=%d\n", + dumpString(*s.begin()).c_str(), score, s.size(), s_min_len, + s_min_period, short_count, (int)desperation); + + bool ok = true; + + if (s.size() > 10 /* magic number is magic */ + || s_min_len < min_allowed_len + || (s_min_period <= 1 && min_allowed_len != 1)) { + ok = false; + } + + if (!ok && desperation + && s.size() <= 20 /* more magic numbers are magical */ + && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10)) + && s_min_period > 1) { + DEBUG_PRINTF("candidate is ok\n"); + ok = true; + } + + if (!ok && desperation + && s.size() <= 50 /* more magic numbers are magical */ + && s_min_len > 10 + && s_min_period > 1) { + DEBUG_PRINTF("candidate is ok\n"); + ok = true; + } + + if (!ok) { + DEBUG_PRINTF("candidate is too shitty\n"); + return false; + } + + return true; +} + +static UNUSED +void dumpRoseLiteralSet(const set &s) { + for (UNUSED const auto &lit : s) { + DEBUG_PRINTF(" lit: %s\n", dumpString(lit).c_str()); + } +} + +static +void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored, + const vector *depths, + const set &a_dom, + vector> *lits, + u32 min_allowed_len, bool desperation, + const CompileContext &cc) { + assert(depths || !seeking_anchored); + + map scores; + map> lit_info; + set s; + + for (auto v : a_dom) { + s = getLiteralSet(g, v, true); /* RHS will take responsibility for any + revisits to the target vertex */ + + if (s.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + continue; + } + + DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size()); + dumpRoseLiteralSet(s); + u64a score = sanitizeAndCompressAndScore(s); + + bool anchored = false; + if (seeking_anchored) { + anchored = createsAnchoredLHS(g, {v}, *depths, cc.grey); + } + + if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, + desperation)) { + continue; + } + + DEBUG_PRINTF("candidate is a candidate\n"); + scores[v] = score; + lit_info[v] = make_unique(v, s, anchored); + } + + /* try to filter out cases where appending some characters produces worse + * literals. Only bother to look back one byte, TODO make better */ + for (auto u : a_dom) { + if (out_degree(u, g) != 1 || !scores[u]) { + continue; + } + NFAVertex v = *adjacent_vertices(u, g).first; + if (contains(scores, v) && scores[v] >= scores[u]) { + DEBUG_PRINTF("killing off v as score %llu >= %llu\n", + scores[v], scores[u]); + lit_info.erase(v); + } + } + + lits->reserve(lit_info.size()); + for (auto &m : lit_info) { + lits->push_back(move(m.second)); + } + DEBUG_PRINTF("%zu candidate literal sets\n", lits->size()); +} + +static +void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored, + const vector *depths, + const set &bad, + const set *allowed, + vector> *lits, + u32 min_allowed_len, bool desperation, + const CompileContext &cc) { + /* This allows us to get more places to split the graph as we are not + limited to points where there is a single vertex to split at. 
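+       (Editor's note: e.g. a region with several exit vertices can still
+       supply a cut, so long as every exit agrees on its reports and on
+       whether it has edges to accept/acceptEod -- see the exit-validation
+       checks below.)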
*/ + + assert(depths || !seeking_anchored); + + /* TODO: operate over 'proto-regions' which ignore back edges */ + auto regions = assignRegions(g); + + set mand, optional; + map > exits; + + for (auto v : vertices_range(g)) { + u32 region = regions[v]; + if (is_any_start(v, g) || region == 0) { + continue; + } + + if (is_any_accept(v, g)) { + continue; + } + + if (!generates_callbacks(g) && is_match_vertex(v, g)) { + /* we cannot leave a completely vacuous infix */ + continue; + } + + if (isRegionExit(g, v, regions)) { + exits[region].push_back(v); + } + + if (isRegionEntry(g, v, regions)) { + // Determine whether this region is mandatory or optional. We only + // need to do this check for the first entry vertex we encounter + // for this region. + if (!contains(mand, region) && !contains(optional, region)) { + if (isOptionalRegion(g, v, regions)) { + optional.insert(region); + } else { + mand.insert(region); + } + } + } + } + + for (const auto &m : exits) { + if (false) { + next_cand: + continue; + } + + const u32 region = m.first; + const vector &vv = m.second; + assert(!vv.empty()); + + if (!contains(mand, region)) { + continue; + } + + for (auto v : vv) { + /* if an exit is in bad, the region is already handled well + * by getSimpleRoseLiterals or is otherwise bad */ + if (contains(bad, v)) { + goto next_cand; + } + /* if we are only allowed to consider some vertices, v must be in + the list; */ + if (allowed && !contains(*allowed, v)) { + goto next_cand; + } + } + + /* the final region may not have a neat exit. validate that all exits + * have an edge to each accept or none do */ + bool edge_to_a = edge(vv[0], g.accept, g).second; + bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second; + const auto &reports = g[vv[0]].reports; + for (auto v : vv) { + if (edge_to_a != edge(v, g.accept, g).second) { + goto next_cand; + } + + if (edge_to_aeod != edge(v, g.acceptEod, g).second) { + goto next_cand; + } + + if (g[v].reports != reports) { + goto next_cand; + } + } + + DEBUG_PRINTF("inspecting region %u\n", region); + set s; + for (auto v : vv) { + DEBUG_PRINTF(" exit vertex: %u\n", g[v].index); + /* Note: RHS can not be depended on to take all subsequent revisits + * to this vertex */ + set ss = getLiteralSet(g, v, false); + if (ss.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + goto next_cand; + } + insert(&s, ss); + } + + assert(!s.empty()); + + DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size()); + dumpRoseLiteralSet(s); + u64a score = sanitizeAndCompressAndScore(s); + + DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size()); + dumpRoseLiteralSet(s); + + bool anchored = false; + if (seeking_anchored) { + anchored = createsAnchoredLHS(g, vv, *depths, cc.grey); + } + + if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, + desperation)) { + goto next_cand; + } + + DEBUG_PRINTF("candidate is a candidate\n"); + lits->push_back(make_unique(vv, s, anchored)); + } +} + +static +void filterCandPivots(const NGHolder &g, const set &cand_raw, + set *out) { + for (auto u : cand_raw) { + const CharReach &u_cr = g[u].char_reach; + if (u_cr.count() > 40) { + continue; /* too wide to be plausible */ + } + + if (u_cr.count() > 2) { + /* include u as a candidate as successor may have backed away from + * expanding through it */ + out->insert(u); + continue; + } + + NFAVertex v = getSoleDestVertex(g, u); + if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) { + const CharReach &v_cr = g[v].char_reach; + if (v_cr.count() == 1 || v_cr.isCaselessChar()) { + 
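+                /* (Editor's note: a single in/out-degree chain into a
+                 * one-char or caseless vertex means v dominates u as a
+                 * literal endpoint, so u is dropped from the candidates.) */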
continue; /* v will always generate better literals */ + } + } + + out->insert(u); + } +} + +/* cand_raw is the candidate set before filtering points which are clearly + * a bad idea. */ +static +void getCandidatePivots(const NGHolder &g, set *cand, + set *cand_raw) { + ue2::unordered_map dominators = findDominators(g); + + set accepts; + + for (auto v : inv_adjacent_vertices_range(g.accept, g)) { + if (is_special(v, g)) { + continue; + } + accepts.insert(v); + } + for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { + if (is_special(v, g)) { + continue; + } + accepts.insert(v); + } + + assert(!accepts.empty()); + + vector dom_trace; + auto ait = accepts.begin(); + assert(ait != accepts.end()); + NFAVertex curr = *ait; + while (curr && !is_special(curr, g)) { + dom_trace.push_back(curr); + curr = dominators[curr]; + } + reverse(dom_trace.begin(), dom_trace.end()); + for (++ait; ait != accepts.end(); ++ait) { + curr = *ait; + vector dom_trace2; + while (curr && !is_special(curr, g)) { + dom_trace2.push_back(curr); + curr = dominators[curr]; + } + reverse(dom_trace2.begin(), dom_trace2.end()); + auto dti = dom_trace.begin(), dtie = dom_trace.end(); + auto dtj = dom_trace2.begin(), dtje = dom_trace2.end(); + while (dti != dtie && dtj != dtje && *dti == *dtj) { + ++dti; + ++dtj; + } + dom_trace.erase(dti, dtie); + } + + cand_raw->insert(dom_trace.begin(), dom_trace.end()); + + filterCandPivots(g, *cand_raw, cand); +} + +static +unique_ptr findBestSplit(const NGHolder &g, + const vector *depths, + bool for_prefix, u32 min_len, + const set *allowed_cand, + const set *disallowed_cand, + const CompileContext &cc) { + assert(!for_prefix || depths); + + /* look for a single simple split point */ + set cand; + set cand_raw; + + getCandidatePivots(g, &cand, &cand_raw); + + if (allowed_cand) { + set cand2; + set cand2_raw; + set_intersection(allowed_cand->begin(), allowed_cand->end(), + cand.begin(), cand.end(), + inserter(cand2, cand2.begin())); + + set_intersection(allowed_cand->begin(), allowed_cand->end(), + cand_raw.begin(), cand_raw.end(), + inserter(cand2_raw, cand2_raw.begin())); + + cand = std::move(cand2); + cand_raw = std::move(cand2_raw); + } + if (disallowed_cand) { + DEBUG_PRINTF("%zu disallowed candidates\n", disallowed_cand->size()); + DEBUG_PRINTF("|old cand| = %zu\n", cand.size()); + erase_all(&cand, *disallowed_cand); + insert(&cand_raw, *disallowed_cand); + } + + if (!generates_callbacks(g)) { + /* not output exposed so must leave some RHS */ + for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) { + cand.erase(v); + cand_raw.erase(v); + } + + for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) { + cand.erase(v); + cand_raw.erase(v); + } + } + + DEBUG_PRINTF("|cand| = %zu\n", cand.size()); + + bool seeking_anchored = for_prefix; + bool seeking_transient = for_prefix; //cc.streaming; + + /* TODO: revisit when backstop goes away */ + bool desperation = for_prefix && cc.streaming; + + vector> lits; /**< sorted list of potential cuts */ + + getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len, + desperation, cc); + getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand, + &lits, min_len, desperation, cc); + + if (lits.empty()) { + DEBUG_PRINTF("no literals found\n"); + return nullptr; + } + + if (seeking_transient) { + for (auto &a : lits) { + a->creates_transient + = createsTransientLHS(g, a->vv, *depths, cc.grey); + } + } + + auto cmp = LitComparator(g, seeking_anchored, seeking_transient); + + unique_ptr best = 
move(lits.back()); + lits.pop_back(); + while (!lits.empty()) { + if (cmp(best, lits.back())) { + best = move(lits.back()); + } + lits.pop_back(); + } + + DEBUG_PRINTF("best is '%s' %u a%d t%d\n", + ((const string &)*best->lit.begin()).c_str(), + g[best->vv.front()].index, + depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0, + depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0); + + return best; +} + +static +void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ, + bool overhang_ok, flat_set &bad) { + DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n", + num_vertices(h), succ.length()); + + map > curr; + for (const auto &e : in_edges_range(h.accept, h)) { + curr[source(e, h)].insert(e); + } + + map > next; + for (auto it = succ.rbegin(); it != succ.rend(); ++it) { + for (const auto &path : curr) { + NFAVertex u = path.first; + const auto &path_set = path.second; + if (u == h.start && overhang_ok) { + DEBUG_PRINTF("poisoning early %zu [overhang]\n", + path_set.size()); + insert(&bad, path_set); + continue; + } + if (overlaps(h[u].char_reach, *it)) { + for (const auto &e : in_edges_range(u, h)) { + auto &new_path_set = next[source(e, h)]; + insert(&new_path_set, path_set); + new_path_set.insert(e); + } + } + } + DEBUG_PRINTF("succ char matches at %zu paths\n", next.size()); + assert(overhang_ok || !curr.empty()); + swap(curr, next); + next.clear(); + } + + assert(overhang_ok || !curr.empty()); + for (const auto &path : curr) { + insert(&bad, path.second); + DEBUG_PRINTF("poisoning %zu vertices\n", path.second.size()); + } +} + +static +void poisonForGoodPrefix(const NGHolder &h, + const vector &depths, + flat_set &bad, const Grey &grey) { + for (const auto &v : vertices_range(h)) { + if (!createsAnchoredLHS(h, {v}, depths, grey) + && !createsTransientLHS(h, {v}, depths, grey)) { + insert(&bad, in_edges_range(v, h)); + } + } +} + +static +flat_set poisonEdges(const NGHolder &h, + const vector *depths, + const RoseInGraph &vg, const vector &ee, + bool for_prefix, const Grey &grey) { + DEBUG_PRINTF("poisoning edges %zu successor edges\n", ee.size()); + + /* poison edges covered by successor literal */ + + set > succs; + for (const RoseInEdge &ve : ee) { + if (vg[target(ve, vg)].type != RIV_LITERAL) { + /* nothing to poison in suffixes/outfixes */ + assert(vg[target(ve, vg)].type == RIV_ACCEPT); + continue; + } + succs.insert({vg[target(ve, vg)].s, + vg[source(ve, vg)].type == RIV_LITERAL}); + + } + + DEBUG_PRINTF("poisoning edges %zu successor literals\n", succs.size()); + + flat_set bad; + for (const auto &p : succs) { + poisonFromSuccessor(h, p.first, p.second, bad); + } + + /* poison edges which don't significantly improve a prefix */ + + if (for_prefix) { + poisonForGoodPrefix(h, *depths, bad, grey); + } + + return bad; +} + +static +set poisonVertices(const NGHolder &h, const RoseInGraph &vg, + const vector &ee, const Grey &grey) { + flat_set bad_edges = poisonEdges(h, nullptr, vg, ee, false, grey); + set bad_vertices; + for (const NFAEdge &e : bad_edges) { + bad_vertices.insert(target(e, h)); + DEBUG_PRINTF("bad: %u->%u\n", h[source(e, h)].index, + h[target(e, h)].index); + } + + return bad_vertices; +} + +static +unique_ptr findBestNormalSplit(const NGHolder &g, + const RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX); + set bad_vertices = poisonVertices(g, vg, ee, cc.grey); + + return findBestSplit(g, nullptr, false, 
cc.grey.minRoseLiteralLength, + nullptr, &bad_vertices, cc); +} + +static +unique_ptr findSimplePrefixSplit(const NGHolder &g, + const CompileContext &cc) { + DEBUG_PRINTF("looking for simple prefix split\n"); + bool anchored = !proper_out_degree(g.startDs, g); + NFAVertex u = anchored ? g.start : g.startDs; + + if (out_degree(u, g) != 2) { /* startDs + succ */ + return nullptr; + } + + NFAVertex v = NGHolder::null_vertex(); + for (NFAVertex t : adjacent_vertices_range(u, g)) { + if (t != g.startDs) { + assert(!v); + v = t; + } + } + assert(v); + + if (!anchored) { + if (out_degree(g.start, g) > 2) { + return nullptr; + } + if (out_degree(g.start, g) == 2 && !edge(g.start, v, g).second) { + return nullptr; + } + } + + NFAVertex best_v = NGHolder::null_vertex(); + ue2_literal best_lit; + + u32 limit = cc.grey.maxHistoryAvailable; + if (anchored) { + LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion); + } + + ue2_literal curr_lit; + for (u32 i = 0; i < limit; i++) { + const auto &v_cr = g[v].char_reach; + if (v_cr.count() == 1 || v_cr.isCaselessChar()) { + curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar()); + } else { + curr_lit.clear(); + } + + if (curr_lit.length() > best_lit.length()) { + best_lit = curr_lit; + best_v = v; + } + + if (out_degree(v, g) != 1) { + break; + } + v = *adjacent_vertices(v, g).first; + } + + if (best_lit.length() < cc.grey.minRoseLiteralLength) { + return nullptr; + } + + set best_lit_set({best_lit}); + if (bad_mixed_sensitivity(best_lit)) { + sanitizeAndCompressAndScore(best_lit_set); + } + + return ue2::make_unique(best_v, best_lit_set, anchored, true); +} + +static +unique_ptr findBestPrefixSplit(const NGHolder &g, + const vector &depths, + const RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + assert(g.kind == NFA_PREFIX); + set bad_vertices = poisonVertices(g, vg, ee, cc.grey); + auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength, + nullptr, &bad_vertices, cc); + + /* large back edges may prevent us identifing anchored or transient cases + * properly - use a simple walk instead */ + if (!rv || !(rv->creates_transient || rv->creates_anchored)) { + auto rv2 = findSimplePrefixSplit(g, cc); + if (rv2) { + return rv2; + } + } + + return rv; +} + +static +unique_ptr findBestCleanSplit(const NGHolder &g, + const CompileContext &cc) { + assert(g.kind != NFA_PREFIX); + set cleanSplits; + for (NFAVertex v : vertices_range(g)) { + if (!g[v].char_reach.all() || !edge(v, v, g).second) { + continue; + } + insert(&cleanSplits, inv_adjacent_vertices(v, g)); + cleanSplits.erase(v); + } + cleanSplits.erase(g.start); + if (cleanSplits.empty()) { + return nullptr; + } + return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen, + &cleanSplits, nullptr, cc); +} + +static +bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) { + set curr, next; + curr.insert(g.accept); + + for (auto it = lit.rbegin(); it != lit.rend(); ++it) { + next.clear(); + + for (auto v : curr) { + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == g.start) { + if (overhang_ok) { + DEBUG_PRINTF("bail\n"); + return true; + } else { + continue; /* it is not possible for a lhs literal to + * overhang the start */ + } + } + + const CharReach &cr = g[u].char_reach; + if (!overlaps(*it, cr)) { + continue; + } + + next.insert(u); + } + } + + curr.swap(next); + } + + return !curr.empty(); +} + +static +bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, + const vector &ee, const VertLitInfo &split) { + 
const vector &splitters = split.vv; + assert(!splitters.empty()); + + shared_ptr lhs = make_shared(); + shared_ptr rhs = make_shared(); + + ue2::unordered_map lhs_map; + ue2::unordered_map rhs_map; + + splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map); + DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n", + to_string(base_graph.kind).c_str(), num_vertices(base_graph), + to_string(lhs->kind).c_str(), num_vertices(*lhs), + to_string(rhs->kind).c_str(), num_vertices(*rhs)); + + bool suffix = vg[target(ee.front(), vg)].type == RIV_ACCEPT; + + if (is_triggered(base_graph)) { + /* if we are already guarded, check if the split reduces the size of + * the problem before continuing with the split */ + if (num_vertices(*lhs) >= num_vertices(base_graph) + && !(suffix && isVacuous(*rhs))) { + DEBUG_PRINTF("split's lhs is no smaller\n"); + return false; + } + + if (num_vertices(*rhs) >= num_vertices(base_graph)) { + DEBUG_PRINTF("split's rhs is no smaller\n"); + return false; + } + } + + bool do_accept = false; + bool do_accept_eod = false; + assert(rhs); + if (isVacuous(*rhs) && suffix) { + if (edge(rhs->start, rhs->accept, *rhs).second) { + DEBUG_PRINTF("rhs has a cliche\n"); + do_accept = true; + remove_edge(rhs->start, rhs->accept, *rhs); + } + + if (edge(rhs->start, rhs->acceptEod, *rhs).second) { + DEBUG_PRINTF("rhs has an eod cliche\n"); + do_accept_eod = true; + remove_edge(rhs->start, rhs->acceptEod, *rhs); + } + + renumber_edges(*rhs); + } + + /* check if we still have a useful graph left over */ + bool do_norm = out_degree(rhs->start, *rhs) != 1; + + set splitter_reports; + for (auto v : splitters) { + insert(&splitter_reports, base_graph[v].reports); + } + + /* find the targets of each source vertex */ + map > images; + for (const RoseInEdge &e : ee) { + RoseInVertex src = source(e, vg); + RoseInVertex dest = target(e, vg); + images[src].insert(dest); + remove_edge(e, vg); + } + + map, vector > verts_by_image; + + for (const auto &elem : images) { + RoseInVertex u = elem.first; + const auto &image = elem.second; + + if (contains(verts_by_image, image)) { + for (RoseInVertex v : verts_by_image[image]) { + add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); + } + continue; + } + + for (const auto &lit : split.lit) { + assert(!bad_mixed_sensitivity(lit)); + + /* don't allow overhang in can_match() as literals should + * correspond to the edge graph being split; overhanging the graph + * would indicate a false path.*/ + if (!can_match(*lhs, lit, false)) { + DEBUG_PRINTF("'%s' did not match lhs\n", + escapeString(lit).c_str()); + continue; + } + + DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str()); + auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); + + /* work out delay later */ + if (do_accept) { + DEBUG_PRINTF("rhs has a cliche\n"); + auto tt = add_vertex(RoseInVertexProps::makeAccept( + splitter_reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + + if (do_accept_eod) { + DEBUG_PRINTF("rhs has an eod cliche\n"); + auto tt = add_vertex(RoseInVertexProps::makeAcceptEod( + splitter_reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + + if (do_norm) { + assert(out_degree(rhs->start, *rhs) > 1); + for (RoseInVertex dest : image) { + add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg); + } + } + verts_by_image[image].push_back(v); + } + } + + assert(hasCorrectlyNumberedVertices(*rhs)); + assert(hasCorrectlyNumberedEdges(*rhs)); + assert(hasCorrectlyNumberedVertices(*lhs)); + 
assert(hasCorrectlyNumberedEdges(*lhs)); + + return true; +} + +#define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */ +#define MAX_LEN_2_LITERALS_PER_CUT 3 + +static +bool checkValidNetflowLits(NGHolder &h, const vector &scores, + const map> &cut_lits, + u32 min_allowed_length) { + DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(), + min_allowed_length); + if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) { + return false; + } + + u32 len_2_count = 0; + + for (const auto &cut : cut_lits) { + if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) { + DEBUG_PRINTF("cut uses a forbidden edge\n"); + return false; + } + + if (min_len(cut.second) < min_allowed_length) { + DEBUG_PRINTF("cut uses a bad literal\n"); + return false; + } + + for (const auto &lit : cut.second) { + if (lit.length() == 2) { + len_2_count++; + } + } + } + + if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) { + return false; + } + + return true; +} + +static +void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, + const vector &to_cut, + const vector &cut, + const map > &cut_lits) { + set sources; + for (const RoseInEdge &ve : to_cut) { + assert(&h == &*vg[ve].graph); + sources.insert(source(ve, vg)); + } + + DEBUG_PRINTF("splitting %s:\n", to_string(h.kind).c_str()); + + /* create literal vertices and connect preds */ + map > > verts_by_source; + for (RoseInVertex src : sources) { + /* iterate over cut for determinism */ + for (const auto &e : cut) { + NFAVertex prev_v = source(e, h); + NFAVertex pivot = target(e, h); + + DEBUG_PRINTF("splitting on pivot %u\n", h[pivot].index); + ue2::unordered_map temp_map; + shared_ptr new_lhs = make_shared(); + splitLHS(h, pivot, new_lhs.get(), &temp_map); + + /* want to cut off paths to pivot from things other than the pivot - + * makes a more svelte graphy */ + clear_in_edges(temp_map[pivot], *new_lhs); + add_edge(temp_map[prev_v], temp_map[pivot], *new_lhs); + + pruneUseless(*new_lhs, false); + renumber_vertices(*new_lhs); + renumber_edges(*new_lhs); + + DEBUG_PRINTF(" into lhs %s\n", to_string(new_lhs->kind).c_str()); + + assert(hasCorrectlyNumberedVertices(*new_lhs)); + assert(hasCorrectlyNumberedEdges(*new_lhs)); + + const set &lits = cut_lits.at(e); + for (const auto &lit : lits) { + if (!can_match(*new_lhs, lit, is_triggered(h))) { + continue; + } + + RoseInVertex v + = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + + /* if this is a prefix/infix an edge directly to accept should + * represent a false path as we have poisoned vertices covered + * by the literals. 
*/ + if (generates_callbacks(h)) { + if (edge(pivot, h.accept, h).second) { + DEBUG_PRINTF("adding acceptEod\n"); + /* literal has a direct connection to accept */ + const flat_set &reports = h[pivot].reports; + auto tt = add_vertex( + RoseInVertexProps::makeAccept(reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + + if (edge(pivot, h.acceptEod, h).second) { + assert(generates_callbacks(h)); + DEBUG_PRINTF("adding acceptEod\n"); + /* literal has a direct connection to accept */ + const flat_set &reports = h[pivot].reports; + auto tt = add_vertex( + RoseInVertexProps::makeAcceptEod(reports), vg); + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + } + } + + add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg); + verts_by_source[src].push_back({v, pivot}); + } + } + } + + /* wire the literal vertices up to successors */ + map, shared_ptr > done_rhs; + for (const RoseInEdge &ve : to_cut) { + RoseInVertex src = source(ve, vg); + RoseInVertex dest = target(ve, vg); + + /* iterate over cut for determinism */ + for (const auto &elem : verts_by_source[src]) { + NFAVertex pivot = elem.second; + RoseInVertex v = elem.first; + + vector adj; + insert(&adj, adj.end(), adjacent_vertices(pivot, h)); + /* we can ignore presence of accept, accepteod in adj as it is best + effort */ + + if (!contains(done_rhs, adj)) { + ue2::unordered_map temp_map; + shared_ptr new_rhs = make_shared(); + splitRHS(h, adj, new_rhs.get(), &temp_map); + remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); + remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); + renumber_edges(*new_rhs); + DEBUG_PRINTF(" into rhs %s\n", + to_string(new_rhs->kind).c_str()); + done_rhs.emplace(adj, new_rhs); + } + + assert(done_rhs[adj].get()); + shared_ptr new_rhs = done_rhs[adj]; + + assert(hasCorrectlyNumberedVertices(*new_rhs)); + assert(hasCorrectlyNumberedEdges(*new_rhs)); + + if (vg[dest].type == RIV_LITERAL + && !can_match(*new_rhs, vg[dest].s, true)) { + continue; + } + + if (out_degree(new_rhs->start, *new_rhs) != 1) { + add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg); + } + } + + remove_edge(ve, vg); + } +} + +static +bool doNetflowCut(NGHolder &h, + const vector *depths, + RoseInGraph &vg, + const vector &ee, bool for_prefix, + const Grey &grey, u32 min_allowed_length = 0U) { + ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength); + + DEBUG_PRINTF("doing netflow cut\n"); + /* TODO: we should really get literals/scores from the full graph as this + * allows us to overlap with previous cuts. */ + assert(!ee.empty()); + assert(&h == &*vg[ee.front()].graph); + assert(!for_prefix || depths); + + if (num_edges(h) > grey.maxRoseNetflowEdges) { + /* We have a limit on this because scoring edges and running netflow + * gets very slow for big graphs. 
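+         * (Both the per-edge literal scoring and the max-flow pass itself
+         * grow super-linearly with graph size, so this cap mostly bounds
+         * compile time on pathological patterns.)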
*/ + DEBUG_PRINTF("too many edges, skipping netflow cut\n"); + return false; + } + + assert(hasCorrectlyNumberedVertices(h)); + assert(hasCorrectlyNumberedEdges(h)); + + auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey); + + /* Step 1: Get scores for all edges */ + vector scores = scoreEdges(h, known_bad); /* scores by edge_index */ + + /* Step 2: Find cutset based on scores */ + vector cut = findMinCut(h, scores); + + /* Step 3: Get literals corresponding to cut edges */ + map> cut_lits; + for (const auto &e : cut) { + set lits = getLiteralSet(h, e); + sanitizeAndCompressAndScore(lits); + + cut_lits[e] = lits; + + DEBUG_PRINTF("cut lit '%s' %u->%u\n", + ((const string &)*cut_lits[e].begin()).c_str(), + h[source(e, h)].index, h[target(e, h)].index); + } + + /* if literals are underlength bail or if it involves a forbidden edge*/ + if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) { + return false; + } + DEBUG_PRINTF("splitting\n"); + + /* Step 4: Split graph based on cuts */ + splitEdgesByCut(h, vg, ee, cut, cut_lits); + + return true; +} + +static +bool deanchorIfNeeded(NGHolder &g) { + DEBUG_PRINTF("hi\n"); + if (proper_out_degree(g.startDs, g)) { + return false; + } + + /* look for a non-special dot with a loop following start */ + set succ_g; + insert(&succ_g, adjacent_vertices(g.start, g)); + succ_g.erase(g.startDs); + + for (auto v : adjacent_vertices_range(g.start, g)) { + DEBUG_PRINTF("inspecting cand %u || = %zu\n", g[v].index, + g[v].char_reach.count()); + + if (v == g.startDs || !g[v].char_reach.all()) { + continue; + } + + set succ_v; + insert(&succ_v, adjacent_vertices(v, g)); + + if (succ_v == succ_g) { + DEBUG_PRINTF("found ^.*\n"); + for (auto succ : succ_g) { + add_edge(g.startDs, succ, g); + } + clear_vertex(v, g); + remove_vertex(v, g); + renumber_vertices(g); + return true; + } + + if (succ_g.size() == 1 && hasSelfLoop(v, g)) { + DEBUG_PRINTF("found ^.+\n"); + add_edge(g.startDs, v, g); + remove_edge(v, v, g); + return true; + } + } + + return false; +} + +static +RoseInGraph populateTrivialGraph(const NGHolder &h) { + RoseInGraph g; + shared_ptr root_g = cloneHolder(h); + bool orig_anch = isAnchored(*root_g); + orig_anch |= deanchorIfNeeded(*root_g); + + DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch); + + auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g); + auto accept = add_vertex(RoseInVertexProps::makeAccept(set()), g); + + add_edge(start, accept, RoseInEdgeProps(root_g, 0), g); + + return g; +} + +static +void avoidOutfixes(RoseInGraph &vg, const CompileContext &cc) { + STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n"); + if (num_vertices(vg) > 2) { + /* must be at least one literal aside from start and accept */ + return; + } + + RoseInEdge e = *edges(vg).first; + + NGHolder &h = *vg[e].graph; + + renumber_vertices(h); + renumber_edges(h); + + unique_ptr split = findBestNormalSplit(h, vg, {e}, cc); + + if (split && splitRoseEdge(h, vg, {e}, *split)) { + DEBUG_PRINTF("split on simple literal\n"); + } else { + doNetflowCut(h, nullptr, vg, {e}, false, cc.grey); + } +} + +static +void removeRedundantPrefixes(RoseInGraph &g) { + STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n"); + + for (const RoseInEdge &e : edges_range(g)) { + RoseInVertex s = source(e, g); + RoseInVertex t = target(e, g); + + if (g[s].type != RIV_START || g[t].type != RIV_LITERAL) { + continue; + } + + if (!g[e].graph) { + continue; + } + + assert(!g[t].delay); + const ue2_literal &lit = g[t].s; + + if (!literalIsWholeGraph(*g[e].graph, lit)) { + 
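+            /* the graph matches more than just the literal, so the prefix
+             * still carries real state and is not redundant */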
DEBUG_PRINTF("not whole graph\n"); + continue; + } + + if (!isFloating(*g[e].graph)) { + DEBUG_PRINTF("not floating\n"); + continue; + } + g[e].graph.reset(); + } +} + +static +u32 maxDelay(const CompileContext &cc) { + if (!cc.streaming) { + return MO_INVALID_IDX; + } + return cc.grey.maxHistoryAvailable; +} + +static +void removeRedundantLiteralsFromPrefixes(RoseInGraph &g, + const CompileContext &cc) { + STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n"); + + vector to_anchor; + for (const RoseInEdge &e : edges_range(g)) { + RoseInVertex s = source(e, g); + RoseInVertex t = target(e, g); + + if (g[s].type != RIV_START && g[s].type != RIV_ANCHORED_START) { + continue; + } + + if (g[t].type != RIV_LITERAL) { + continue; + } + + if (!g[e].graph) { + continue; + } + + assert(!g[t].delay); + const ue2_literal &lit = g[t].s; + + DEBUG_PRINTF("removing states for literal: %s\n", + dumpString(lit).c_str()); + + unique_ptr h = cloneHolder(*g[e].graph); + const u32 max_delay = maxDelay(cc); + + u32 delay = removeTrailingLiteralStates(*h, lit, max_delay, + false /* can't overhang start */); + + DEBUG_PRINTF("got delay %u (max allowed %u)\n", delay, max_delay); + + if (edge(h->startDs, h->accept, *h).second) { + /* we should have delay == lit.length(), but in really complex + * cases we may fail to identify that we can remove the whole + * graph. Regardless, the fact that sds is wired to accept means the + * graph serves no purpose. */ + DEBUG_PRINTF("whole graph\n"); + g[e].graph.reset(); + continue; + } + + if (delay == lit.length() && edge(h->start, h->accept, *h).second + && num_vertices(*h) == N_SPECIALS) { + to_anchor.push_back(e); + continue; + } + + /* if we got here we should still have an interesting graph */ + assert(delay == max_delay || num_vertices(*h) > N_SPECIALS); + + if (delay && delay != MO_INVALID_IDX) { + DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get()); + + g[e].graph = move(h); + g[e].graph_lag = delay; + } + } + + if (!to_anchor.empty()) { + RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g); + + for (RoseInEdge e : to_anchor) { + DEBUG_PRINTF("rehoming to anchor\n"); + RoseInVertex v = target(e, g); + add_edge(anch, v, g); + remove_edge(e, g); + } + } +} + +static +bool isStarCliche(const NGHolder &g) { + DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g)); + + bool nonspecials_seen = false; + + for (auto v : vertices_range(g)) { + if (is_special(v, g)) { + continue; + } + + if (nonspecials_seen) { + return false; + } + nonspecials_seen = true; + + if (!g[v].char_reach.all()) { + return false; + } + + if (!hasSelfLoop(v, g)) { + return false; + } + if (!edge(v, g.accept, g).second) { + return false; + } + } + + if (!nonspecials_seen) { + return false; + } + + if (!edge(g.start, g.accept, g).second) { + return false; + } + + return true; +} + +static +void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, + const vector &ee, + const CompileContext &cc) { + /* TODO: This could be better by not creating a separate graph for each + * successor literal. This would require using distinct report ids and also + * taking into account overlap of successor literals. 
+     */
+
+    set<ue2_literal> preds;
+    for (const RoseInEdge &e : ee) {
+        RoseInVertex u = source(e, ig);
+        assert(ig[u].type == RIV_LITERAL);
+        assert(!ig[e].graph_lag);
+        assert(!ig[u].delay);
+        preds.insert(ig[u].s);
+    }
+
+    set<ue2_literal> succs;
+    for (const RoseInEdge &e : ee) {
+        RoseInVertex v = target(e, ig);
+        assert(ig[v].type == RIV_LITERAL);
+        assert(!ig[v].delay);
+        succs.insert(ig[v].s);
+    }
+
+    map<ue2_literal, pair<shared_ptr<NGHolder>, u32> > graphs; /* graph and
+                                                                * its delay */
+
+    for (const ue2_literal &right : succs) {
+        size_t max_overlap = 0;
+        for (const ue2_literal &left : preds) {
+            size_t overlap = maxOverlap(left, right, 0);
+            ENSURE_AT_LEAST(&max_overlap, overlap);
+        }
+
+        u32 max_allowed_delay = right.length() - max_overlap;
+
+        if (cc.streaming) {
+            LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable);
+        }
+
+        if (!max_allowed_delay) {
+            continue;
+        }
+
+        shared_ptr<NGHolder> h_new = cloneHolder(h);
+
+        u32 delay = removeTrailingLiteralStates(*h_new, right,
+                                                max_allowed_delay);
+
+        if (delay == MO_INVALID_IDX) {
+            /* successor literal could not match infix -> ignore false path */
+            assert(0);
+            continue;
+        }
+
+        graphs[right] = make_pair(h_new, delay);
+    }
+
+    for (const RoseInEdge &e : ee) {
+        RoseInVertex v = target(e, ig);
+        const ue2_literal &succ = ig[v].s;
+        if (!contains(graphs, succ)) {
+            continue;
+        }
+
+        ig[e].graph = graphs[succ].first;
+        ig[e].graph_lag = graphs[succ].second;
+
+        if (isStarCliche(*ig[e].graph)) {
+            DEBUG_PRINTF("is a X star!\n");
+            ig[e].graph.reset();
+            ig[e].graph_lag = 0;
+        }
+    }
+}
+
+static
+void removeRedundantLiteralsFromInfixes(RoseInGraph &g,
+                                        const CompileContext &cc) {
+    map<NGHolder *, vector<RoseInEdge> > infixes;
+
+    for (const RoseInEdge &e : edges_range(g)) {
+        RoseInVertex s = source(e, g);
+        RoseInVertex t = target(e, g);
+
+        if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) {
+            continue;
+        }
+
+        if (!g[e].graph) {
+            continue;
+        }
+
+        assert(!g[t].delay);
+        infixes[&*g[e].graph].push_back(e);
+    }
+
+    for (const auto &info : infixes) {
+        removeRedundantLiteralsFromInfix(*info.first, g, info.second, cc);
+    }
+}
+
+static
+void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) {
+    removeRedundantLiteralsFromPrefixes(g, cc);
+    removeRedundantLiteralsFromInfixes(g, cc);
+}
+
+static
+RoseInVertex getStart(RoseInGraph &vg) {
+    for (RoseInVertex v : vertices_range(vg)) {
+        if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) {
+            return v;
+        }
+    }
+    assert(0);
+    return RoseInGraph::null_vertex();
+}
+
+/* Finds the initial accept vertex, created for suffixes/outfixes to be
+ * attached to. */
+static
+RoseInVertex getPrimaryAccept(RoseInGraph &vg) {
+    for (RoseInVertex v : vertices_range(vg)) {
+        if (vg[v].type == RIV_ACCEPT && vg[v].reports.empty()) {
+            return v;
+        }
+    }
+    assert(0);
+    return RoseInGraph::null_vertex();
+}
+
+static
+bool willBeTransient(const depth &max_depth, const CompileContext &cc) {
+    if (!cc.streaming) {
+        return max_depth <= depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH);
+    } else {
+        return max_depth <= depth(cc.grey.maxHistoryAvailable + 1);
+    }
+}
+
+static
+bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) {
+    return max_depth <= depth(grey.maxAnchoredRegion);
+}
+
+static
+unique_ptr<NGHolder> make_chain(u32 count) {
+    assert(count);
+
+    auto rv = make_unique<NGHolder>(NFA_INFIX);
+
+    NGHolder &h = *rv;
+
+    NFAVertex u = h.start;
+    for (u32 i = 0; i < count; i++) {
+        NFAVertex v = add_vertex(h);
+        h[v].char_reach = CharReach::dot();
+        add_edge(u, v, h);
+        u = v;
+    }
+    h[u].reports.insert(0);
+    add_edge(u, h.accept, h);
+
+    return rv;
+}
+
+#define SHORT_TRIGGER_LEN 16
+
+static
+bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + /* check max width and literal lengths to see if possible */ + size_t min_lit = ~0ULL; + for (const RoseInEdge &e : ee) { + RoseInVertex v = target(e, vg); + LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length()); + } + + if (min_lit <= SHORT_TRIGGER_LEN || min_lit >= UINT_MAX) { + return false; + } + + depth max_width = findMaxWidth(h); + + u32 delta = min_lit - SHORT_TRIGGER_LEN; + + if (!willBeTransient(max_width - depth(delta), cc) + && !willBeAnchoredTable(max_width - depth(delta), cc.grey)) { + return false; + } + + DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n", min_lit); + DEBUG_PRINTF("delta = %u\n", delta); + + /* try split */ + map > graphs; + for (const RoseInEdge &e : ee) { + RoseInVertex v = target(e, vg); + + shared_ptr h_new = cloneHolder(h); + + u32 delay = removeTrailingLiteralStates(*h_new, vg[v].s, delta); + + DEBUG_PRINTF("delay %u\n", delay); + + if (delay != delta) { + DEBUG_PRINTF("unable to trim literal\n"); + return false; + } + + if (in_degree(v, vg) != 1) { + DEBUG_PRINTF("complicated\n"); + return false; + } + + DEBUG_PRINTF("new mw = %u\n", (u32)findMaxWidth(*h_new)); + assert(willBeTransient(findMaxWidth(*h_new), cc) + || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey)); + + graphs[v] = h_new; + } + + /* add .{repeats} from prefixes to long literals */ + for (const RoseInEdge &e : ee) { + RoseInVertex s = source(e, vg); + RoseInVertex t = target(e, vg); + + remove_edge(e, vg); + const ue2_literal &orig_lit = vg[t].s; + + ue2_literal lit(orig_lit.begin(), orig_lit.end() - delta); + + ue2_literal lit2(orig_lit.end() - delta, orig_lit.end()); + + assert(lit.length() + delta == orig_lit.length()); + + vg[t].s = lit2; + + RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + add_edge(s, v, RoseInEdgeProps(graphs[t], 0), vg); + add_edge(v, t, RoseInEdgeProps(make_chain(delta), 0), vg); + } + + DEBUG_PRINTF("success\n"); + /* TODO: alter split point to avoid pathological splits */ + return true; +} + +static +bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, + const CompileContext &cc) { + DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h, + num_vertices(h)); + + renumber_vertices(h); + renumber_edges(h); + + vector depths; + calcDepths(h, depths); + + /* If the reason the prefix is not transient is due to a very long literal + * following, we can make it transient by restricting ourselves to using + * just the head of the literal. 
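+     *
+     * Hypothetical example of the rewrite done by
+     * makeTransientFromLongLiteral() (values are illustrative only): with
+     * SHORT_TRIGGER_LEN == 16 and a 20 byte literal,
+     *
+     *     delta = 20 - 16 == 4
+     *     (prefix) --> "0123456789abcdefghij"
+     * becomes
+     *     (trimmed prefix) --> "0123456789abcdef" --> .{4} --> "ghij"
+     *
+     * so the prefix is only required to reach the head of the literal and
+     * its effective width shrinks by delta.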
*/ + if (makeTransientFromLongLiteral(h, vg, ee, cc)) { + return true; + } + + unique_ptr split = findBestPrefixSplit(h, depths, vg, ee, cc); + + if (split && (split->creates_transient || split->creates_anchored) + && splitRoseEdge(h, vg, ee, *split)) { + DEBUG_PRINTF("split on simple literal\n"); + return true; + } + + /* large back edges may prevent us identifing anchored or transient cases + * properly - use a simple walk instead */ + + if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) { + return true; + } + + if (split && splitRoseEdge(h, vg, ee, *split)) { + /* use the simple split even though it doesn't create a transient + * prefix */ + DEBUG_PRINTF("split on simple literal\n"); + return true; + } + + /* look for netflow cuts which don't produce good prefixes */ + if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) { + return true; + } + + if (ee.size() > 1) { + DEBUG_PRINTF("split the prefix apart based on succ literals\n"); + unordered_map, vector >, + NGHolderHasher, NGHolderEqual> trimmed; + + for (const auto &e : ee) { + shared_ptr hh = cloneHolder(h); + auto succ_lit = vg[target(e, vg)].s; + u32 delay = removeTrailingLiteralStates(*hh, succ_lit, + succ_lit.length(), + false /* can't overhang start */); + if (!delay) { + DEBUG_PRINTF("could not remove any literal, skip over\n"); + continue; + } + + trimmed[hh].emplace_back(e, delay); + } + + if (trimmed.size() == 1) { + return false; + } + + /* shift the contents to a vector so we can modify the graphs without + * violating the map's invariants. */ + vector, vector > > > + trimmed_vec(trimmed.begin(), trimmed.end()); + trimmed.clear(); + for (auto &elem : trimmed_vec) { + shared_ptr &hp = elem.first; + NGHolder &h = *hp; + + vector base_states; + insert(&base_states, base_states.end(), + inv_adjacent_vertices(h.accept, h)); + clear_in_edges(h.accept, h); + + for (auto v : base_states) { + h[v].reports.clear(); /* clear report from old accepts */ + } + + for (const auto &edge_delay : elem.second) { + const RoseInEdge &e = edge_delay.first; + u32 delay = edge_delay.second; + auto succ_lit = vg[target(e, vg)].s; + + vg[e].graph = hp; + assert(delay <= succ_lit.length()); + restoreTrailingLiteralStates(*vg[e].graph, succ_lit, delay, + base_states); + } + } + return true; + } + + return false; +} + +#define MAX_FIND_BETTER_PREFIX_GEN 4 +#define MAX_FIND_BETTER_PREFIX_COUNT 100 + +static +void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { + STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n"); + RoseInVertex start = getStart(vg); + + bool changed; + u32 gen = 0; + do { + DEBUG_PRINTF("gen %u\n", gen); + changed = false; + vector seen_order; + map > prefixes; + + /* find prefixes */ + for (const RoseInEdge &e : out_edges_range(start, vg)) { + /* outfixes shouldn't have made it this far */ + assert(vg[target(e, vg)].type == RIV_LITERAL); + if (vg[e].graph) { + NGHolder *h = vg[e].graph.get(); + if (!contains(prefixes, h)) { + seen_order.push_back(h); + } + prefixes[h].push_back(e); + } + } + + if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) { + break; + } + + /* look for bad prefixes and try to split */ + for (NGHolder *h : seen_order) { + depth max_width = findMaxWidth(*h); + if (willBeTransient(max_width, cc) + || willBeAnchoredTable(max_width, cc.grey)) { + continue; + } + + changed = improvePrefix(*h, vg, prefixes[h], cc); + } + } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN); +} + +#define STRONG_LITERAL_LENGTH 20 +#define MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10 + +static +bool extractStrongLiteral(NGHolder 
&h, RoseInGraph &vg,
+                          const vector<RoseInEdge> &ee,
+                          const CompileContext &cc) {
+    DEBUG_PRINTF("looking for strong literal\n");
+    unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
+
+    if (split && min_len(split->lit) >= STRONG_LITERAL_LENGTH) {
+        DEBUG_PRINTF("splitting simple literal\n");
+        return splitRoseEdge(h, vg, ee, *split);
+    }
+
+    return false;
+}
+
+static
+void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) {
+    if (!cc.grey.violetExtractStrongLiterals) {
+        return;
+    }
+    STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n");
+    set<NGHolder *> stuck;
+
+    bool changed;
+    do {
+        changed = false;
+
+        vector<NGHolder *> seen_order;
+        map<NGHolder *, vector<RoseInEdge> > edges_by_graph;
+        for (const RoseInEdge &ve : edges_range(vg)) {
+            if (vg[source(ve, vg)].type != RIV_LITERAL) {
+                continue;
+            }
+            if (vg[ve].graph) {
+                if (!contains(edges_by_graph, vg[ve].graph.get())) {
+                    seen_order.push_back(vg[ve].graph.get());
+                }
+                edges_by_graph[vg[ve].graph.get()].push_back(ve);
+            }
+        }
+
+        if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) {
+            DEBUG_PRINTF("too many graphs, stopping\n");
+            return;
+        }
+
+        for (NGHolder *g : seen_order) {
+            if (contains(stuck, g)) {
+                DEBUG_PRINTF("already known to be bad\n");
+                continue;
+            }
+            bool rv = extractStrongLiteral(*g, vg, edges_by_graph[g], cc);
+            if (rv) {
+                changed = true;
+            } else {
+                stuck.insert(g);
+            }
+        }
+    } while (changed);
+}
+
+#define INFIX_STRONG_GUARD_LEN 8
+#define INFIX_MIN_SPLIT_LITERAL_LEN 12
+
+static
+bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
+                  const CompileContext &cc) {
+    unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
+
+    if (split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN
+        && splitRoseEdge(h, vg, ee, *split)) {
+        DEBUG_PRINTF("splitting simple literal\n");
+        return true;
+    }
+
+    DEBUG_PRINTF("trying for a netflow cut\n");
+    /* look for netflow cuts which don't produce good prefixes */
+    bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8);
+
+    DEBUG_PRINTF("did netflow cut? = %d\n", (int)rv);
+
+    return rv;
+}
+
+/**
+ * Infixes which are weakly guarded can, in effect, act like prefixes as they
+ * will often be live. We should try to split these infixes further if they
+ * contain strong literals so that we are at least running smaller weak infixes
+ * which can hopefully be accelerated/miracled.
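+ *
+ * Illustrative example (not from this patch): in
+ * /ab.{0,30}literal12345.{0,30}xy/ the infix between "ab" and "xy" is guarded
+ * only by the two byte literal "ab" (below INFIX_STRONG_GUARD_LEN), so it
+ * will be live for much of the scan; splitting at the strong literal
+ * "literal12345" leaves two smaller weak infixes in its place.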
+ */ +static +void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetAvoidWeakInfixes) { + return; + } + STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n"); + + RoseInVertex start = getStart(vg); + + set weak; + vector ordered_weak; + + for (RoseInVertex vv : adjacent_vertices_range(start, vg)) { + /* outfixes shouldn't have made it this far */ + assert(vg[vv].type == RIV_LITERAL); + if (vg[vv].s.length() >= INFIX_STRONG_GUARD_LEN) { + continue; + } + + for (const RoseInEdge &e : out_edges_range(vv, vg)) { + if (vg[target(e, vg)].type != RIV_LITERAL || !vg[e].graph) { + continue; + } + + NGHolder *h = vg[e].graph.get(); + DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[vv].s).c_str(), h); + if (!contains(weak, h)) { + weak.insert(h); + ordered_weak.push_back(h); + } + } + } + + map > weak_edges; + for (const RoseInEdge &ve : edges_range(vg)) { + if (contains(weak, vg[ve].graph.get())) { + weak_edges[vg[ve].graph.get()].push_back(ve); + } + } + + for (NGHolder *h : ordered_weak) { + improveInfix(*h, vg, weak_edges[h], cc); + } +} + +static +void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg, + const vector &ee, const VertLitInfo &split, + bool eod, const flat_set &reports) { + const vector &splitters = split.vv; + assert(!splitters.empty()); + + shared_ptr lhs = make_shared(); + unordered_map v_map; + cloneHolder(*lhs, base_graph, &v_map); + lhs->kind = NFA_INFIX; + clear_in_edges(lhs->accept, *lhs); + clear_in_edges(lhs->acceptEod, *lhs); + add_edge(lhs->accept, lhs->acceptEod, *lhs); + clearReports(*lhs); + for (NFAVertex v : splitters) { + add_edge(v_map[v], lhs->accept, *lhs); + (*lhs)[v_map[v]].reports.insert(0); + } + pruneUseless(*lhs); + + /* create literal vertices and connect preds */ + for (const auto &lit : split.lit) { + if (!can_match(*lhs, lit, is_triggered(*lhs))) { + continue; + } + + DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str()); + RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); + + RoseInVertex tt; + if (eod) { + DEBUG_PRINTF("doing eod\n"); + tt = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg); + } else { + DEBUG_PRINTF("doing non-eod\n"); + tt = add_vertex(RoseInVertexProps::makeAccept(reports), vg); + } + add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); + + for (const RoseInEdge &e : ee) { + RoseInVertex u = source(e, vg); + assert(!edge(u, v, vg).second); + add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); + } + } +} + +#define MIN_SUFFIX_LEN 6 + +static +bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg, + const vector &suffix_edges, + const CompileContext &cc) { + DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n", &h, + suffix_edges.size()); + /* + * We would, in general, rather not have output exposed engines because + * once they are triggered, they must be run while infixes only have to run + * if the successor literal is seen. Matches from output exposed engines + * also have to be placed in a priority queue and interleaved with matches + * from other sources. + * + * Note: + * - if the LHS is extremely unlikely we may be better off leaving + * a suffix unguarded. + * + * - limited width suffixes may be less bad as they won't be continuously + * active, we may want to have (a) stronger controls on if we want to pick + * a trailing literal in these cases and/or (b) look also for literals + * near accept as well as right on accept + * + * TODO: improve heuristics, splitting logic. 
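+     *
+     * Hypothetical example of the split attempted below: for a suffix
+     * matching /.*foobar0123/, the trailing literal "foobar0123" is cut off;
+     * the /.*/ part becomes an ordinary infix that only runs once the
+     * literal is seen, and the report is raised at the literal match rather
+     * than by an output exposed engine.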
+ */ + + /* we may do multiple splits corresponding to different report behaviour */ + set seen; + map >, VertLitInfo> by_reports; /* eod, rep */ + + for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) { + set ss = getLiteralSet(h, v, false); + if (ss.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + return false; + } + + VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)]; + insert(&vli.lit, ss); + vli.vv.push_back(v); + seen.insert(v); + } + + seen.insert(h.accept); + for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) { + if (contains(seen, v)) { + continue; + } + + set ss = getLiteralSet(h, v, false); + if (ss.empty()) { + DEBUG_PRINTF("candidate is too shitty\n"); + return false; + } + + VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)]; + insert(&vli.lit, ss); + vli.vv.push_back(v); + } + + assert(!by_reports.empty()); + + /* TODO: how strong a min len do we want here ? */ + u32 min_len = cc.grey.minRoseLiteralLength; + ENSURE_AT_LEAST(&min_len, MIN_SUFFIX_LEN); + + for (auto &vli : by_reports | map_values) { + u64a score = sanitizeAndCompressAndScore(vli.lit); + + if (vli.lit.empty() + || !validateRoseLiteralSetQuality(vli.lit, score, false, min_len, + false)) { + return false; + } + } + + for (const auto &info : by_reports) { + DEBUG_PRINTF("splitting on simple literals\n"); + splitEdgesForSuffix(h, vg, suffix_edges, info.second, + info.first.first /* eod */, + info.first.second /* reports */); + } + + for (const RoseInEdge &e : suffix_edges) { + remove_edge(e, vg); + } + return true; +} + +static +void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetAvoidSuffixes) { + return; + } + + STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n"); + + RoseInVertex accept = getPrimaryAccept(vg); + map > suffixes; + vector ordered_suffixes; + + /* find suffixes */ + for (const RoseInEdge &e : in_edges_range(accept, vg)) { + /* outfixes shouldn't have made it this far */ + assert(vg[source(e, vg)].type == RIV_LITERAL); + assert(vg[e].graph); /* non suffix paths should be wired to other + accepts */ + const NGHolder *h = vg[e].graph.get(); + if (!contains(suffixes, h)) { + ordered_suffixes.push_back(h); + } + suffixes[h].push_back(e); + } + + /* look at suffixes and try to split */ + for (const NGHolder *h : ordered_suffixes) { + replaceSuffixWithInfix(*h, vg, suffixes[h], cc); + } +} + +static +bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) { + if (out_degree(h.start, h) != 3) { + return false; + } + + NFAVertex v = NGHolder::null_vertex(); + NFAVertex ds = NGHolder::null_vertex(); + + for (NFAVertex a : adjacent_vertices_range(h.start, h)) { + if (a == h.startDs) { + continue; + } + if (h[a].char_reach.all()) { + ds = a; + if (out_degree(ds, h) != 2 || !edge(ds, ds, h).second) { + return false; + } + } else { + v = a; + } + } + + if (!v || !ds || !edge(ds, v, h).second) { + return false; + } + + if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { + return false; + } + + ue2_literal lit; + lit.push_back(h[v].char_reach.find_first(), + h[v].char_reach.isCaselessChar()); + while (out_degree(v, h) == 1) { + NFAVertex vv = *adjacent_vertices(v, h).first; + if (h[vv].char_reach.count() != 1 + && !h[vv].char_reach.isCaselessChar()) { + break; + } + + v = vv; + + lit.push_back(h[v].char_reach.find_first(), + h[v].char_reach.isCaselessChar()); + } + + if (is_match_vertex(v, h) && h.kind != NFA_SUFFIX) { + /* we have rediscovered the post-infix literal */ + return false; + } + + if 
(bad_mixed_sensitivity(lit)) { + make_nocase(&lit); + } + + DEBUG_PRINTF("%u found %s\n", h[v].index, dumpString(lit).c_str()); + out->vv = {v}; + out->lit = {lit}; + return true; +} + +static +bool lookForDoubleCut(const NGHolder &h, const vector &ee, + RoseInGraph &vg, const Grey &grey) { + VertLitInfo info; + if (!leadingDotStartLiteral(h, &info) + || min_len(info.lit) < grey.violetDoubleCutLiteralLen) { + return false; + } + DEBUG_PRINTF("performing split\n"); + return splitRoseEdge(h, vg, ee, {info}); +} + +static +void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetDoubleCut) { + return; + } + + map > right_edges; + vector ordered_graphs; + for (const RoseInEdge &ve : edges_range(vg)) { + if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { + const NGHolder *h = vg[ve].graph.get(); + if (!contains(right_edges, h)) { + ordered_graphs.push_back(h); + } + right_edges[h].push_back(ve); + } + } + + for (const NGHolder *h : ordered_graphs) { + lookForDoubleCut(*h, right_edges[h], vg, cc.grey); + } +} + +static +pair findLiteralBefore(const NGHolder &h, NFAVertex v) { + ue2_literal lit; + if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { + return {v, std::move(lit) }; + } + lit.push_back(h[v].char_reach.find_first(), + h[v].char_reach.isCaselessChar()); + + while (in_degree(v, h) == 1) { + NFAVertex vv = *inv_adjacent_vertices(v, h).first; + if (h[vv].char_reach.count() != 1 + && !h[vv].char_reach.isCaselessChar()) { + break; + } + + lit.push_back(h[vv].char_reach.find_first(), + h[vv].char_reach.isCaselessChar()); + v = vv; + } + + return {v, std::move(lit) }; +} + +static +bool lookForDotStarPred(NFAVertex v, const NGHolder &h, + NFAVertex *u, NFAVertex *ds) { + *u = NGHolder::null_vertex(); + *ds = NGHolder::null_vertex(); + for (NFAVertex a : inv_adjacent_vertices_range(v, h)) { + if (h[a].char_reach.all()) { + if (!edge(a, a, h).second) { + return false; + } + + if (*ds) { + return false; + } + + *ds = a; + } else { + if (*u) { + return false; + } + *u = a; + } + } + + if (!*u || !*ds) { + return false; + } + + return true; +} + +static +bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) { + /* Note: there is no delay yet - so the final literal is the already + * discovered successor literal - we are in fact interested in the literal + * before it. 
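+     *
+     * e.g. for a graph of the form /.*foo.*bar/ where "bar" is the successor
+     * literal already split off, the walk below skips back over the states
+     * for "bar" and its .* predecessor; "foo" is the literal we actually
+     * want to split on (illustrative pattern, not from this patch).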
*/ + + if (in_degree(h.accept, h) != 1) { + return false; + } + + if (in_degree(h.acceptEod, h) != 1) { + assert(0); + return false; + } + + NFAVertex v + = findLiteralBefore(h, *inv_adjacent_vertices(h.accept, h).first).first; + + NFAVertex u; + NFAVertex ds; + + if (!lookForDotStarPred(v, h, &u, &ds)) { + return false; + } + + v = u; + auto rv = findLiteralBefore(h, v); + + if (!lookForDotStarPred(v, h, &u, &ds)) { + return false; + } + + ue2_literal lit = reverse_literal(rv.second); + DEBUG_PRINTF("%u found %s\n", h[v].index, dumpString(lit).c_str()); + + if (bad_mixed_sensitivity(lit)) { + make_nocase(&lit); + } + + out->vv = {v}; + out->lit = {lit}; + return true; +} + +static +bool lookForTrailingLiteralDotStar(const NGHolder &h, + const vector &ee, + RoseInGraph &vg, const Grey &grey) { + VertLitInfo info; + if (!trailingDotStarLiteral(h, &info) + || min_len(info.lit) < grey.violetDoubleCutLiteralLen) { + return false; + } + DEBUG_PRINTF("performing split\n"); + return splitRoseEdge(h, vg, ee, info); +} + +/* In streaming mode, active engines have to be caught up at stream boundaries + * and have to be stored in stream state, so we prefer to decompose patterns + * in to literals with no state between them if possible. */ +static +void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) { + if (!cc.grey.violetLiteralChains) { + return; + } + + bool changed; + do { + changed = false; + + map > right_edges; + vector ordered_graphs; + for (const RoseInEdge &ve : edges_range(vg)) { + if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { + const NGHolder *h = vg[ve].graph.get(); + if (!contains(right_edges, h)) { + ordered_graphs.push_back(h); + } + right_edges[h].push_back(ve); + } + } + + for (const NGHolder *h : ordered_graphs) { + const vector &ee = right_edges[h]; + bool rv = lookForDoubleCut(*h, ee, vg, cc.grey); + if (!rv && h->kind != NFA_SUFFIX) { + rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey); + } + changed |= rv; + } + } while (changed); +} + +static +bool lookForCleanSplit(const NGHolder &h, const vector &ee, + RoseInGraph &vg, const CompileContext &cc) { + unique_ptr split = findBestCleanSplit(h, cc); + + if (split) { + return splitRoseEdge(h, vg, {ee}, *split); + } + + return false; +} + +#define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4 + +static +void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) { + u32 gen = 0; + set prev = {getStart(vg)}; + + while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) { + set curr; + for (RoseInVertex u : prev) { + insert(&curr, adjacent_vertices(u, vg)); + } + + map > rightfixes; + vector ordered_graphs; + for (RoseInVertex v : curr) { + for (const RoseInEdge &e : out_edges_range(v, vg)) { + if (vg[e].graph) { + NGHolder *h = vg[e].graph.get(); + if (!contains(rightfixes, h)) { + ordered_graphs.push_back(h); + } + rightfixes[h].push_back(e); + } + } + } + + for (const NGHolder *h : ordered_graphs) { + lookForCleanSplit(*h, rightfixes[h], vg, cc); + } + + prev = curr; + gen++; + } +} + +static +void rehomeEodSuffixes(RoseInGraph &vg) { + // Find edges to accept with EOD-anchored graphs that we can move over to + // acceptEod. 
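+    //
+    // A graph qualifies when in_degree(h.accept) == 0, i.e. all of its
+    // reports are wired through acceptEod (e.g. a pattern such as /foo$/),
+    // so the engine only ever needs to run as part of EOD processing.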
+ vector acc_edges; + for (const auto &e : edges_range(vg)) { + if (vg[target(e, vg)].type != RIV_ACCEPT) { + continue; + } + if (vg[e].haig || !vg[e].graph) { + continue; + } + + const NGHolder &h = *vg[e].graph; + + if (in_degree(h.accept, h)) { + DEBUG_PRINTF("graph isn't eod anchored\n"); + continue; + } + + acc_edges.push_back(e); + } + + for (const RoseInEdge &e : acc_edges) { + // Move this edge from accept to acceptEod + RoseInVertex w = add_vertex(RoseInVertexProps::makeAcceptEod(), vg); + add_edge(source(e, vg), w, vg[e], vg); + remove_edge(e, vg); + } + + /* old accept vertices will be tidied up by final pruneUseless() call */ +} + +bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, + const CompileContext &cc) { + assert(!can_never_match(h)); + + if (!cc.grey.allowViolet) { + return false; + } + + DEBUG_PRINTF("hello world\n"); + + RoseInGraph vg = populateTrivialGraph(h); + + /* Step 1: avoid outfixes as we always have to run them. */ + avoidOutfixes(vg, cc); + + if (num_vertices(vg) <= 2) { + /* only have an outfix; leave for ng_rose for now */ + return false; + } + + removeRedundantPrefixes(vg); + dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot"); + + /* Step 2: avoid non-transient prefixes (esp in streaming mode) */ + findBetterPrefixes(vg, cc); + + dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot"); + + extractStrongLiterals(vg, cc); + dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot"); + improveWeakInfixes(vg, cc); + dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot"); + + /* Step 3: avoid output exposed engines if there is a strong trailing + literal) */ + avoidSuffixes(vg, cc); + + /* Step 4: look for infixes/suffixes with leading .*literals + * This can reduce the amount of work a heavily picked literal has to do and + * reduce the amount of state used as .* is handled internally to rose. */ + lookForDoubleCut(vg, cc); + + if (cc.streaming) { + lookForCleanEarlySplits(vg, cc); + decomposeLiteralChains(vg, cc); + } + + /* Step 5: avoid unimplementable, or overly large engines if possible */ + /* TODO: later - ng_rose is currently acting as a backstop */ + + /* Step 6: send to rose */ + rehomeEodSuffixes(vg); + removeRedundantLiterals(vg, cc); + + pruneUseless(vg); + dumpPreRoseGraph(vg, cc.grey); + calcVertexOffsets(vg); + bool rv = rose.addRose(vg, prefilter); + DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail"); + return rv; +} + +} diff --git a/src/nfagraph/ng_violet.h b/src/nfagraph/ng_violet.h new file mode 100644 index 00000000..fb62bfc0 --- /dev/null +++ b/src/nfagraph/ng_violet.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Violet method of rose construction from NGHolder. + */ + +#ifndef NG_VIOLET_H +#define NG_VIOLET_H + +#include "ue2common.h" + +namespace ue2 { + +class NGHolder; +class RoseBuild; + +struct CompileContext; + +/** \brief Attempt to consume the entire pattern in graph \a h with Rose. + * Returns true if successful. */ +bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, + const CompileContext &cc); + +} // namespace ue2 + +#endif diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index f6f5d383..3f58d752 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -162,7 +162,7 @@ ConstructLiteralVisitor::~ConstructLiteralVisitor() {} bool shortcutLiteral(NG &ng, const ParsedExpression &expr) { assert(expr.component); - if (!ng.cc.grey.allowRose) { + if (!ng.cc.grey.allowLiteral) { return false; } diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index fe2c259e..195c4bad 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -651,26 +651,94 @@ floating: } static -unique_ptr makeRoseEodPrefix(const NGHolder &h, - ReportID prefix_report) { +unique_ptr makeRoseEodPrefix(const NGHolder &h, RoseBuildImpl &build, + map, ReportID> &remap) { assert(generates_callbacks(h)); - auto g = cloneHolder(h); - g->kind = is_triggered(h) ? NFA_INFIX : NFA_PREFIX; - setReportId(*g, prefix_report); + assert(!in_degree(h.accept, h)); + auto gg = cloneHolder(h); + NGHolder &g = *gg; + g.kind = is_triggered(h) ? NFA_INFIX : NFA_PREFIX; // Move acceptEod edges over to accept. vector dead; - for (const auto &e : in_edges_range(g->acceptEod, *g)) { - NFAVertex u = source(e, *g); - if (u == g->accept) { + for (const auto &e : in_edges_range(g.acceptEod, g)) { + NFAVertex u = source(e, g); + if (u == g.accept) { continue; } - add_edge_if_not_present(u, g->accept, *g); + add_edge_if_not_present(u, g.accept, g); dead.push_back(e); + + if (!contains(remap, g[u].reports)) { + remap[g[u].reports] = build.getNewNfaReport(); + } + + g[u].reports = { remap[g[u].reports] }; } - remove_edges(dead, *g); - return g; + remove_edges(dead, g); + return gg; +} + +static +u32 getEodEventID(RoseBuildImpl &build) { + // Allocate the EOD event if it hasn't been already. 
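+    //
+    // The EOD event is a zero-length "literal" in the ROSE_EVENT table which
+    // fires during EOD processing; one id is shared by all EOD event
+    // leftfixes, so allocation here is lazy and idempotent.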
+ if (build.eod_event_literal_id == MO_INVALID_IDX) { + build.eod_event_literal_id = build.getLiteralId({}, 0, ROSE_EVENT); + } + + return build.eod_event_literal_id; +} + +static +void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, + const NGHolder &h) { + assert(!build.isInETable(u)); + + RoseGraph &g = build.g; + map, ReportID> report_remap; + shared_ptr eod_leftfix + = makeRoseEodPrefix(h, build, report_remap); + + u32 eod_event = getEodEventID(build); + + for (const auto &report_mapping : report_remap) { + RoseVertex v = add_vertex(g); + g[v].idx = build.vertexIndex++; + g[v].literals.insert(eod_event); + build.literal_info[eod_event].vertices.insert(v); + + map > report_remap; + g[v].left.graph = eod_leftfix; + g[v].left.leftfix_report = report_mapping.second; + g[v].left.lag = 0; + RoseEdge e1 = add_edge(u, v, g).first; + g[e1].minBound = 0; + g[e1].maxBound = ROSE_BOUND_INF; + g[v].min_offset = add_rose_depth(g[u].min_offset, + findMinWidth(*g[v].left.graph)); + g[v].max_offset = ROSE_BOUND_INF; + + depth max_width = findMaxWidth(*g[v].left.graph); + if (u != build.root && max_width.is_finite() + && (!build.isAnyStart(u) || isPureAnchored(*g[v].left.graph))) { + g[e1].maxBound = max_width; + g[v].max_offset = add_rose_depth(g[u].max_offset, max_width); + } + + g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix + RoseVertex w = add_vertex(g); + g[w].idx = build.vertexIndex++; + g[w].eod_accept = true; + g[w].reports = report_mapping.first; + g[w].min_offset = g[v].min_offset; + g[w].max_offset = g[v].max_offset; + RoseEdge e = add_edge(v, w, g).first; + g[e].minBound = 0; + g[e].maxBound = 0; + g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; + DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + } } static @@ -686,8 +754,20 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, RoseVertex u = pv.first; const RoseInEdgeProps &edge_props = bd.ig[pv.second]; + /* We need to duplicate the parent vertices if: + * + * 1) It already has a suffix, etc as we are going to add the specified + * suffix, etc to the parents and we do not want to overwrite the + * existing information. + * + * 2) We are making the an EOD accept and the vertex already has other + * out-edges - The LAST_BYTE history used for EOD accepts is + * incompatible with normal successors. As accepts are processed last we + * do not need to worry about other normal successors being added later. 
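+         *
+         * Illustrative sketch of case 2 (not from this patch): if u already
+         * has a normal successor, wiring an EOD accept directly onto u would
+         * force LAST_BYTE history, which squashes the role's state byte (see
+         * the assert added in findHistoryScheme); a duplicate of u is used
+         * for the EOD path instead.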
+ */ if (g[u].suffix || !g[u].reports.empty() - /* also poss accept eod edge: TODO check properly */ + || (ig[iv].type == RIV_ACCEPT_EOD && out_degree(u, g) + && !edge_props.graph) || (!isLeafNode(u, g) && !tbi->isAnyStart(u))) { DEBUG_PRINTF("duplicating for parent %zu\n", g[u].idx); assert(!tbi->isAnyStart(u)); @@ -719,74 +799,37 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, } } else { assert(ig[iv].type == RIV_ACCEPT_EOD); + assert(!edge_props.haig); - if (edge_props.graph && tbi->isInETable(u)) { + if (!edge_props.graph) { + RoseVertex w = add_vertex(g); + g[w].idx = tbi->vertexIndex++; + g[w].eod_accept = true; + g[w].reports = ig[iv].reports; + g[w].min_offset = g[u].min_offset; + g[w].max_offset = g[u].max_offset; + RoseEdge e = add_edge(u, w, g).first; + g[e].minBound = 0; + g[e].maxBound = 0; + g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; + DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + continue; + } + + const NGHolder &h = *edge_props.graph; + assert(!in_degree(h.accept, h)); + assert(generates_callbacks(h)); + + if (tbi->isInETable(u)) { + assert(h.kind == NFA_SUFFIX); assert(!tbi->isAnyStart(u)); /* etable can't/shouldn't use eod event */ DEBUG_PRINTF("adding suffix to i%zu\n", g[u].idx); g[u].suffix.graph = edge_props.graph; - assert(g[u].suffix.graph->kind == NFA_SUFFIX); - dumpHolder(*g[u].suffix.graph, 98, "eod_suffix", tbi->cc.grey); - assert(!in_degree(g[u].suffix.graph->accept, - *g[u].suffix.graph)); - set reports = all_reports(*g[u].suffix.graph); - tbi->rm.getReport(*reports.begin()); - assert(reports.size() == 1); - /* TODO: set dfa_(min|max)_width */ continue; - } else if (edge_props.graph) { - assert(!edge_props.haig); - assert(!tbi->isInETable(u)); - - // Allocate the EOD event if it hasn't been already. - if (tbi->eod_event_literal_id == MO_INVALID_IDX) { - tbi->eod_event_literal_id = - tbi->getLiteralId(ue2_literal(), 0, ROSE_EVENT); - } - - RoseVertex v = add_vertex(g); - g[v].idx = tbi->vertexIndex++; - g[v].literals.insert(tbi->eod_event_literal_id); - tbi->literal_info[tbi->eod_event_literal_id].vertices.insert(v); - - ReportID prefix_report = tbi->getNewNfaReport(); - g[v].left.graph - = makeRoseEodPrefix(*edge_props.graph, prefix_report); - g[v].left.leftfix_report = prefix_report; - g[v].left.lag = 0; - RoseEdge e1 = add_edge(u, v, g).first; - g[e1].minBound = 0; - g[e1].maxBound = ROSE_BOUND_INF; - g[v].min_offset = add_rose_depth( - g[u].min_offset, findMinWidth(*g[v].left.graph)); - g[v].max_offset = ROSE_BOUND_INF; - - DEBUG_PRINTF("hi\n"); - depth max_width = findMaxWidth(*g[v].left.graph); - if (u != tbi->root - && max_width.is_finite() - && (!tbi->isAnyStart(u) - || isPureAnchored(*g[v].left.graph))) { - g[e1].maxBound = max_width; - g[v].max_offset = add_rose_depth(g[u].max_offset, max_width); - } - - g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix - u = v; } - assert(!edge_props.haig); - RoseVertex w = add_vertex(g); - g[w].idx = tbi->vertexIndex++; - g[w].eod_accept = true; - g[w].reports = ig[iv].reports; - g[w].min_offset = g[u].min_offset; - g[w].max_offset = g[u].max_offset; - RoseEdge e = add_edge(u, w, g).first; - g[e].minBound = 0; - g[e].maxBound = 0; - g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; - DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); + makeEodEventLeftfix(*tbi, u, h); } } } @@ -887,7 +930,8 @@ bool suitableForEod(const RoseInGraph &ig, vector topo, ENSURE_AT_LEAST(&v_depth, (u32)max_width); } - if (v_depth == ROSE_BOUND_INF || v_depth > cc.grey.maxHistoryAvailable) { + if (v_depth == 
ROSE_BOUND_INF + || v_depth > cc.grey.maxHistoryAvailable) { DEBUG_PRINTF("not suitable for eod table %u\n", v_depth); return false; } @@ -900,6 +944,13 @@ bool suitableForEod(const RoseInGraph &ig, vector topo, return true; } +static +void shift_accepts_to_end(const RoseInGraph &ig, + vector &topo_order) { + stable_partition(begin(topo_order), end(topo_order), + [&](RoseInVertex v){ return !is_any_accept(v, ig); }); +} + static void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; @@ -912,6 +963,7 @@ void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { map > vertex_map; vector v_order = topo_order(ig); + shift_accepts_to_end(ig, v_order); u32 eod_space_required; bool use_eod_table = suitableForEod(ig, v_order, &eod_space_required, diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp index d8eb939a..45333a38 100644 --- a/src/rose/rose_build_add_mask.cpp +++ b/src/rose/rose_build_add_mask.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -336,7 +336,8 @@ void buildLiteralMask(const vector &mask, vector &msk, } static -bool validateTransientMask(const vector &mask, bool eod, const Grey &grey) { +bool validateTransientMask(const vector &mask, bool anchored, + bool eod, const Grey &grey) { assert(!mask.empty()); // An EOD anchored mask requires that everything fit into history, while an @@ -348,6 +349,12 @@ bool validateTransientMask(const vector &mask, bool eod, const Grey & return false; } + /* although anchored masks cannot be transient, short masks may be placed + * into the atable. */ + if (anchored && mask.size() > grey.maxAnchoredRegion) { + return false; + } + vector lits; u32 lit_minBound; /* minBound of each literal in lit */ u32 lit_length; /* length of each literal in lit */ @@ -703,7 +710,7 @@ bool checkAllowMask(const vector &mask, ue2_literal *lit, bool RoseBuildImpl::add(bool anchored, const vector &mask, const ue2::flat_set &reports) { - if (validateTransientMask(mask, false, cc.grey)) { + if (validateTransientMask(mask, anchored, false, cc.grey)) { bool eod = false; addTransientMask(*this, mask, reports, anchored, eod); return true; @@ -726,8 +733,8 @@ bool RoseBuildImpl::add(bool anchored, const vector &mask, bool RoseBuildImpl::validateMask(const vector &mask, UNUSED const ue2::flat_set &reports, - UNUSED bool anchored, bool eod) const { - return validateTransientMask(mask, eod, cc.grey); + bool anchored, bool eod) const { + return validateTransientMask(mask, anchored, eod, cc.grey); } static diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 472de156..d327193f 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -433,6 +433,9 @@ RoseRoleHistory findHistoryScheme(const RoseBuildImpl &tbi, const RoseEdge &e) { // If the bounds are {0,0}, this role can only match precisely at EOD. if (minBound == 0 && maxBound == 0) { + /* last byte history will squash the state byte so cannot have other + * succ */ + assert(out_degree(u, g) == 1); return ROSE_ROLE_HISTORY_LAST_BYTE; } @@ -915,19 +918,32 @@ void RoseBuildImpl::findTransientLeftfixes(void) { continue; } - u32 his = g[v].left.lag + max_width; + if (cc.streaming) { + /* STREAMING: transient prefixes must be able to run using history + * rather than storing state. 
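+             *
+             * Worked example (illustrative numbers only): with
+             * maxHistoryAvailable = 110, a prefix with lag 2 and max width 8
+             * gives
+             *
+             *     his = lag + max_width = 2 + 8 = 10 <= 110 + 1
+             *
+             * so the leftfix is transient: the trigger and the whole of any
+             * prefix match fit within the history buffer.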
*/ + u32 his = g[v].left.lag + max_width; - // If this vertex has an event literal, we need to add one to cope - // with it. - if (hasLiteralInTable(v, ROSE_EVENT)) { - his++; - } + // If this vertex has an event literal, we need to add one to cope + // with it. + if (hasLiteralInTable(v, ROSE_EVENT)) { + his++; + } - /* +1 as trigger must appear in main buffer and no byte is needed to - * decompress the state */ - if (his <= cc.grey.maxHistoryAvailable + 1) { - transient.insert(left); - DEBUG_PRINTF("a transient leftfix has been spotted his=%u\n", his); + /* +1 as trigger must appear in main buffer and no byte is needed to + * decompress the state */ + if (his <= cc.grey.maxHistoryAvailable + 1) { + transient.insert(left); + DEBUG_PRINTF("a transient leftfix spotted his=%u\n", his); + } + } else { + /* BLOCK: transientness is less important and more fuzzy, ideally + * it should be quick to calculate the state. No need to worry about + * history (and hence lag). */ + if (max_width < depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH)) { + transient.insert(left); + DEBUG_PRINTF("a transient block leftfix spotted [%u]\n", + (u32)max_width); + } } } } diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index a33e653a..5fb27c55 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -61,28 +61,6 @@ using namespace std; namespace ue2 { -static -string to_string(nfa_kind k) { - switch (k) { - case NFA_PREFIX: - return "PREFIX"; - case NFA_INFIX: - return "INFIX"; - case NFA_SUFFIX: - return "SUFFIX"; - case NFA_OUTFIX: - return "OUTFIX"; - case NFA_REV_PREFIX: - return "REV_PREFIX"; - case NFA_OUTFIX_RAW: - return "OUTFIX_RAW"; - case NFA_EAGER_PREFIX: - return "EAGER_PREFIX"; - } - assert(0); - return "?"; -} - /** \brief Return the kind of a left_id or a suffix_id. */ template string render_kind(const Graph &g) { diff --git a/src/rose/rose_build_lookaround.cpp b/src/rose/rose_build_lookaround.cpp index a06bacef..ba77b402 100644 --- a/src/rose/rose_build_lookaround.cpp +++ b/src/rose/rose_build_lookaround.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -684,6 +684,10 @@ bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v, lookaround.reserve(look.size()); for (const auto &m : look) { + if (m.first < -128 || m.first > 127) { + DEBUG_PRINTF("range too big\n"); + return false; + } s8 offset = verify_s8(m.first); lookaround.emplace_back(offset, m.second); } diff --git a/src/rose/rose_build_util.h b/src/rose/rose_build_util.h index 536b031a..85cfc010 100644 --- a/src/rose/rose_build_util.h +++ b/src/rose/rose_build_util.h @@ -36,6 +36,9 @@ namespace ue2 { +/** Max allowed width for transient graphs in block mode */ +#define ROSE_BLOCK_TRANSIENT_MAX_WIDTH 255U + // Comparator for vertices using their index property. 
struct VertexIndexComp { VertexIndexComp(const RoseGraph &gg) : g(gg) {} diff --git a/src/rose/rose_in_dump.cpp b/src/rose/rose_in_dump.cpp index 97aefdc4..fbd6858b 100644 --- a/src/rose/rose_in_dump.cpp +++ b/src/rose/rose_in_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -107,7 +107,8 @@ void dumpPreRoseGraph(const RoseInGraph &ig, const Grey &grey, size_t id = graph_ids.size(); graph_ids[&*ig[e].graph] = id; } - fprintf(f, "graph %zu", graph_ids[&*ig[e].graph]); + fprintf(f, "graph %zu\n%s", graph_ids[&*ig[e].graph], + to_string(ig[e].graph->kind).c_str()); } if (ig[e].haig) { fprintf(f, "haig "); diff --git a/src/rose/rose_in_graph.h b/src/rose/rose_in_graph.h index 2c00a418..14d4d9b2 100644 --- a/src/rose/rose_in_graph.h +++ b/src/rose/rose_in_graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -106,6 +106,12 @@ public: ROSE_BOUND_INF); } + /* for when there is a suffix graph which handles the reports */ + static RoseInVertexProps makeAcceptEod() { + return RoseInVertexProps(RIV_ACCEPT_EOD, ue2_literal(), 0, + ROSE_BOUND_INF); + } + static RoseInVertexProps makeStart(bool anchored) { DEBUG_PRINTF("making %s\n", anchored ? "anchored start" : "start"); if (anchored) { diff --git a/src/rose/rose_in_util.h b/src/rose/rose_in_util.h index 7c74554a..1f3c4ef7 100644 --- a/src/rose/rose_in_util.h +++ b/src/rose/rose_in_util.h @@ -46,6 +46,11 @@ void calcVertexOffsets(RoseInGraph &ig); enum nfa_kind whatRoseIsThis(const RoseInGraph &in, const RoseInEdge &e); void pruneUseless(RoseInGraph &g); +inline +bool is_any_accept(RoseInVertex v, const RoseInGraph &g) { + return g[v].type == RIV_ACCEPT || g[v].type == RIV_ACCEPT_EOD; +} + } #endif diff --git a/src/rose/stream.c b/src/rose/stream.c index c0b69f4c..b934f98f 100644 --- a/src/rose/stream.c +++ b/src/rose/stream.c @@ -510,7 +510,8 @@ void runEagerPrefixesStream(const struct RoseEngine *t, } void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { - DEBUG_PRINTF("OH HAI\n"); + DEBUG_PRINTF("OH HAI [%llu, %llu)\n", scratch->core_info.buf_offset, + scratch->core_info.buf_offset + (u64a)scratch->core_info.len); assert(t); assert(scratch->core_info.hbuf); assert(scratch->core_info.buf); diff --git a/src/scratch.h b/src/scratch.h index f8e322f8..a2f02503 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -45,7 +45,7 @@ extern "C" #endif UNUSED static const u32 SCRATCH_MAGIC = 0x544F4259; -#define FDR_TEMP_BUF_SIZE 200 +#define FDR_TEMP_BUF_SIZE 220 struct fatbit; struct hs_scratch; @@ -141,7 +141,6 @@ struct match_deduper { struct ALIGN_CL_DIRECTIVE hs_scratch { u32 magic; u8 in_use; /**< non-zero when being used by an API call. 
*/ - char *scratch_alloc; /* user allocated scratch object */ u32 queueCount; u32 bStateSize; /**< sizeof block mode states */ u32 tStateSize; /**< sizeof transient rose states */ @@ -161,10 +160,6 @@ struct ALIGN_CL_DIRECTIVE hs_scratch { struct match_deduper deduper; u32 anchored_literal_region_len; u32 anchored_literal_count; - u32 delay_count; - u32 scratchSize; - u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE]; - u32 handledKeyCount; struct fatbit *handled_roles; /**< fatbit of ROLES (not states) already * handled by this literal */ u64a *som_store; /**< array of som locations */ @@ -176,6 +171,11 @@ struct ALIGN_CL_DIRECTIVE hs_scratch { * location had been writable */ u64a som_set_now_offset; /**< offset at which som_set_now represents */ u32 som_store_count; + u32 handledKeyCount; + u32 delay_count; + u32 scratchSize; + char *scratch_alloc; /* user allocated scratch object */ + u8 ALIGN_DIRECTIVE fdr_temp_buf[FDR_TEMP_BUF_SIZE]; }; /* array of fatbit ptr; TODO: why not an array of fatbits? */ diff --git a/src/util/ue2string.h b/src/util/ue2string.h index 88695ea9..3c7be473 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -55,8 +55,6 @@ size_t maxStringSelfOverlap(const std::string &a, bool nocase); /// Compares two strings, returns non-zero if they're different. u32 cmp(const char *a, const char *b, size_t len, bool nocase); -class CharReach; - struct ue2_literal { public: /// Single element proxy, pointed to by our const_iterator. @@ -124,6 +122,13 @@ public: ue2_literal &operator=(const ue2_literal &) = default; ue2_literal &operator=(ue2_literal &&) = default; + template + ue2_literal(InputIt b, InputIt e) { + for (; b != e; ++b) { + push_back(*b); + } + } + size_type length() const { return s.length(); } bool empty() const { return s.empty(); } ue2_literal substr(size_type pos, size_type n = std::string::npos) const; From 34289eb3b40470ecc50f6d6d0e1dbcc3608b90fc Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Fri, 15 Jul 2016 14:32:21 +1000 Subject: [PATCH 125/166] violet: 32bit fix --- src/nfagraph/ng_violet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 0d1c1c12..c4089f7d 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1718,7 +1718,7 @@ bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg, const vector &ee, const CompileContext &cc) { /* check max width and literal lengths to see if possible */ - size_t min_lit = ~0ULL; + size_t min_lit = (size_t)~0ULL; for (const RoseInEdge &e : ee) { RoseInVertex v = target(e, vg); LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length()); From d574557200cac26faa89ee24f4f1a479c9a3aa2b Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 18 Jul 2016 11:33:13 +1000 Subject: [PATCH 126/166] take mask overhang into account for hwlm accel, float min dist --- src/hwlm/hwlm_build.cpp | 47 ++++++++++++++++++++++++++++---- src/rose/rose_build_bytecode.cpp | 7 +++-- src/rose/rose_build_impl.h | 11 ++++++++ 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index 7ba82fcc..b1814245 100644 --- a/src/hwlm/hwlm_build.cpp +++ b/src/hwlm/hwlm_build.cpp @@ -64,6 +64,28 @@ namespace ue2 { static const unsigned int 
MAX_ACCEL_OFFSET = 16; static const unsigned int MAX_SHUFTI_WIDTH = 240; +static +size_t mask_overhang(const hwlmLiteral &lit) { + size_t msk_true_size = lit.msk.size(); + assert(msk_true_size <= HWLM_MASKLEN); + assert(HWLM_MASKLEN <= MAX_ACCEL_OFFSET); + for (u8 c : lit.msk) { + if (!c) { + msk_true_size--; + } else { + break; + } + } + + if (lit.s.length() >= msk_true_size) { + return 0; + } + + /* only short literals should be able to have a mask which overhangs */ + assert(lit.s.length() < MAX_ACCEL_OFFSET); + return msk_true_size - lit.s.length(); +} + static bool findDVerm(const vector &lits, AccelAux *aux) { const hwlmLiteral &first = *lits.front(); @@ -169,7 +191,8 @@ bool findDVerm(const vector &lits, AccelAux *aux) { } if (found) { - curr.max_offset = MAX(curr.max_offset, j); + assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET); + ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit)); break; } } @@ -290,8 +313,8 @@ bool findSVerm(const vector &lits, AccelAux *aux) { } if (found) { - curr.max_offset = MAX(curr.max_offset, j); - break; + assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET); + ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit)); } } } @@ -392,13 +415,25 @@ void findForwardAccelScheme(const vector &lits, continue; } - for (u32 i = 0; i < MAX_ACCEL_OFFSET; i++) { + u32 overhang = mask_overhang(lit); + for (u32 i = 0; i < overhang; i++) { + /* this offset overhangs the start of the real literal; look at the + * msk/cmp */ + for (u32 j = 0; j < N_CHARS; j++) { + if ((j & lit.msk[i]) == lit.cmp[i]) { + reach[i].set(j); + } + } + } + for (u32 i = overhang; i < MAX_ACCEL_OFFSET; i++) { CharReach &reach_i = reach[i]; + u32 i_effective = i - overhang; - if (litGuardedByCharReach(reach_i, lit, i)) { + if (litGuardedByCharReach(reach_i, lit, i_effective)) { continue; } - unsigned char c = i < lit.s.length() ? lit.s[i] : lit.s.back(); + unsigned char c = i_effective < lit.s.length() ? lit.s[i_effective] + : lit.s.back(); if (lit.nocase) { reach_i.set(mytoupper(c)); reach_i.set(mytolower(c)); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index a8440916..c0d8d0a7 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -5031,6 +5031,9 @@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { u32 max_d = g[v].max_offset; u32 min_d = g[v].min_offset; + DEBUG_PRINTF("checking %u: elen %zu min/max %u/%u\n", lit_id, + key.elength_including_mask(), min_d, max_d); + if (build.literal_info[lit_id].undelayed_id != lit_id) { /* this is a delayed match; need to update delay properties */ /* TODO: can delayed literals ever be in another table ? 
*/ @@ -5050,9 +5053,9 @@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { switch (key.table) { case ROSE_FLOATING: ENSURE_AT_LEAST(&engine->floatingDistance, max_d); - if (min_d >= key.elength()) { + if (min_d >= key.elength_including_mask()) { LIMIT_TO_AT_MOST(&engine->floatingMinDistance, - min_d - (u32)key.elength()); + min_d - (u32)key.elength_including_mask()); } else { /* overlapped literals from rose + anchored table can * cause us to underflow due to sloppiness in diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 19f803b2..15047491 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -277,6 +277,17 @@ struct rose_literal_id { u32 distinctiveness; size_t elength(void) const { return s.length() + delay; } + size_t elength_including_mask(void) const { + size_t mask_len = msk.size(); + for (u8 c : msk) { + if (!c) { + mask_len--; + } else { + break; + } + } + return MAX(mask_len, s.length()) + delay; + } }; static inline From 3d9a60d0234a8636834c8348fff0b09a0d98efaa Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 18 Jul 2016 11:32:18 +1000 Subject: [PATCH 127/166] teddy: apply poison mask after prep_conf_ work This simplifies the code, and removes all the all-ones p_mask uses, which we were otherwise trusting the optimizer to remove. --- src/fdr/teddy.c | 124 ++++++++++++++++--------------- src/fdr/teddy_avx2.c | 171 +++++++++++++++++++++---------------------- 2 files changed, 148 insertions(+), 147 deletions(-) diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 9f8b5104..e7a0fccd 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -125,36 +125,34 @@ do { \ #endif static really_inline -m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) { +m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); - return and128(and128(pshufb(maskBase[0*2], lo), - pshufb(maskBase[0*2+1], hi)), p_mask); + return and128(pshufb(maskBase[0*2], lo), pshufb(maskBase[0*2+1], hi)); } static really_inline -m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask, - m128 val) { +m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); - m128 r = prep_conf_teddy_m1(maskBase, p_mask, val); + m128 r = prep_conf_teddy_m1(maskBase, val); m128 res_1 = and128(pshufb(maskBase[1*2], lo), pshufb(maskBase[1*2+1], hi)); m128 res_shifted_1 = palignr(res_1, *old_1, 16-1); *old_1 = res_1; - return and128(and128(r, p_mask), res_shifted_1); + return and128(r, res_shifted_1); } static really_inline m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, - m128 p_mask, m128 val) { + m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); - m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val); + m128 r = prep_conf_teddy_m2(maskBase, old_1, val); m128 res_2 = and128(pshufb(maskBase[2*2], lo), pshufb(maskBase[2*2+1], hi)); @@ -165,11 +163,11 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, static really_inline m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, - m128 *old_3, m128 p_mask, m128 val) { + m128 *old_3, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); - m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, 
p_mask, val); + m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); m128 res_3 = and128(pshufb(maskBase[3*2], lo), pshufb(maskBase[3*2+1], hi)); @@ -201,13 +199,14 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } @@ -215,9 +214,9 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); } @@ -225,7 +224,8 @@ hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); } @@ -255,13 +255,14 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -269,9 +270,9 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -279,7 +280,8 @@ hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m1(maskBase, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } @@ -310,14 +312,14 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, 
&res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -325,11 +327,9 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -337,7 +337,8 @@ hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } @@ -368,14 +369,14 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -383,11 +384,9 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr)); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), - load128(ptr + 16)); + m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -395,7 +394,8 @@ hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } @@ -428,14 +428,15 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, 
do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -444,10 +445,10 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr + 16)); + load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -455,8 +456,8 @@ hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } @@ -489,14 +490,15 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -505,10 +507,10 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr)); + load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones128(), load128(ptr + 16)); + load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -516,8 +518,8 @@ hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, m128 p_mask; m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); - m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } @@ -551,14 +553,15 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -567,10 +570,10 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), 
load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr + 16)); + &res_old_3, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -579,7 +582,8 @@ hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); } @@ -613,14 +617,15 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -629,10 +634,10 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr)); + &res_old_3, load128(ptr)); CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones128(), load128(ptr + 16)); + &res_old_3, load128(ptr + 16)); CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -641,7 +646,8 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and128(r_0, p_mask); CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); } diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 428c9446..e4a836d4 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -407,36 +407,35 @@ void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { } static really_inline -m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) { +m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - return and256(and256(vpshufb(maskBase[0*2], lo), - vpshufb(maskBase[0*2+1], hi)), p_mask); + return and256(vpshufb(maskBase[0*2], lo), + vpshufb(maskBase[0*2+1], hi)); } static really_inline -m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask, - m256 val) { +m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val); + m256 r = prep_conf_fat_teddy_m1(maskBase, val); m256 res_1 = and256(vpshufb(maskBase[1*2], lo), vpshufb(maskBase[1*2+1], hi)); m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1); *old_1 = res_1; - return and256(and256(r, p_mask), res_shifted_1); + return and256(r, 
res_shifted_1); } static really_inline m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, - m256 p_mask, m256 val) { + m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val); + m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); m256 res_2 = and256(vpshufb(maskBase[2*2], lo), vpshufb(maskBase[2*2+1], hi)); @@ -447,11 +446,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, static really_inline m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, - m256 *old_3, m256 p_mask, m256 val) { + m256 *old_3, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val); + m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); m256 res_3 = and256(vpshufb(maskBase[3*2], lo), vpshufb(maskBase[3*2+1], hi)); @@ -461,12 +460,10 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, } static really_inline -m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi, - m256 p_mask) { +m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi) { m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); - return and256(res, p_mask); + return and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); } static really_inline @@ -503,13 +500,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); ptr += 16; } @@ -517,10 +515,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), - load2x128(ptr + 16)); + m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); } @@ -528,7 +525,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); } @@ -558,13 +556,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = 
prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -572,10 +571,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), - load2x128(ptr + 16)); + m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -583,7 +581,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 1); - m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } @@ -614,14 +613,14 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -629,10 +628,9 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -641,7 +639,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } @@ -672,25 +671,24 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, 
&res_old_1, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); - ptr += 16; + ptr += 16; } for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), - load2x128(ptr)); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); - m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -699,7 +697,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 2); - m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } @@ -732,14 +731,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -748,10 +748,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr + 16)); + load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -760,7 +760,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } @@ -793,14 +794,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -809,10 
+811,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr)); + load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - ones256(), load2x128(ptr + 16)); + load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -821,7 +823,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 3); m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, - p_mask, val_0); + val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } @@ -855,15 +858,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); ptr += 16; } @@ -872,12 +875,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr + 16)); + &res_old_3, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); } @@ -886,7 +887,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); } @@ -920,15 +922,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } if (ptr + 16 < buf_end) { m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); ptr += 16; } @@ -937,12 +939,10 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, __builtin_prefetch(ptr + (iterBytes*4)); CHECK_FLOOD; m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr)); + &res_old_3, load2x128(ptr)); CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, 
do_confWithBit_teddy); m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, ones256(), - load2x128(ptr + 16)); + &res_old_3, load2x128(ptr + 16)); CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); } @@ -951,7 +951,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, a->buf_history, a->len_history, 4); m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, - &res_old_3, p_mask, val_0); + &res_old_3, val_0); + r_0 = and256(r_0, p_mask); CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); } @@ -986,16 +987,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); ptr += 32; } if (ptr + 32 < buf_end) { m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); ptr += 32; } @@ -1005,13 +1005,11 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, CHECK_FLOOD; m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi, - ones256()); + m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); } @@ -1019,8 +1017,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); } @@ -1055,16 +1053,15 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); ptr += 32; } if (ptr + 32 < buf_end) { m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); ptr += 32; } @@ -1074,13 +1071,11 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, CHECK_FLOOD; m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - ones256()); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); 
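/* [Editor's note -- illustrative sketch, not part of the patch. This is the
 * pattern the change applies throughout teddy.c and teddy_avx2.c: cautious
 * (VECTORING) paths run the now mask-free prep_conf_* helpers and apply the
 * poison mask once afterwards, while hot NOT_CAUTIOUS paths such as the loop
 * above skip the mask entirely, rather than passing ones256() and trusting
 * the optimizer to remove it. */
static really_inline
m256 prep_then_poison(const m256 *maskBase, m256 p_mask, m256 val) {
    m256 r = prep_conf_fat_teddy_m1(maskBase, val); /* no mask inside prep */
    return and256(r, p_mask);                       /* poison applied once */
}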
CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy); m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi, - ones256()); + m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy); } @@ -1088,8 +1083,8 @@ hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, m256 p_mask; m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi, - p_mask); + m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); + res_0 = and256(res_0, p_mask); CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); } From 4ce268af47aa74f8dcd0ce50eb06646af2d208b5 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 18 Jul 2016 12:41:31 +1000 Subject: [PATCH 128/166] ng: ensure that only match states have reports Ensure (and assert) that vertices without an edge to {accept, acceptEod} do not have reports set. --- src/nfagraph/ng.cpp | 5 +++++ src/nfagraph/ng_asserts.cpp | 3 ++- src/nfagraph/ng_calc_components.cpp | 6 ++++++ src/nfagraph/ng_limex.cpp | 10 +++++++--- src/nfagraph/ng_rose.cpp | 1 + src/nfagraph/ng_util.cpp | 20 ++++++++++++++------ src/nfagraph/ng_util.h | 22 ++++++++++++++++++---- unit/internal/lbr.cpp | 2 ++ unit/internal/limex_nfa.cpp | 11 ++++++++--- unit/internal/nfagraph_repeat.cpp | 10 +++++++--- 10 files changed, 70 insertions(+), 20 deletions(-) diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index 35b2eb35..deca3fd5 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -105,6 +105,7 @@ bool addComponentSom(NG &ng, NGHolder &g, const NGWrapper &w, DEBUG_PRINTF("doing som\n"); dumpComponent(g, "03_presom", w.expressionIndex, comp_id, ng.cc.grey); assert(hasCorrectlyNumberedVertices(g)); + assert(allMatchStatesHaveReports(w)); // First, we try the "SOM chain" support in ng_som.cpp. @@ -208,6 +209,8 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, dumpComponent(g, "01_begin", w.expressionIndex, comp_id, ng.cc.grey); + assert(allMatchStatesHaveReports(w)); + reduceGraph(g, som, w.utf8, cc); dumpComponent(g, "02_reduced", w.expressionIndex, comp_id, ng.cc.grey); @@ -232,6 +235,8 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, } } + assert(allMatchStatesHaveReports(w)); + if (splitOffAnchoredAcyclic(*ng.rose, g, cc)) { return true; } diff --git a/src/nfagraph/ng_asserts.cpp b/src/nfagraph/ng_asserts.cpp index 2d02751f..e9e39345 100644 --- a/src/nfagraph/ng_asserts.cpp +++ b/src/nfagraph/ng_asserts.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -553,6 +553,7 @@ void ensureCodePointStart(ReportManager &rm, NGWrapper &g) { add_edge(g.startDs, v_4, g); remove_edge(orig, g); g.renumberEdges(); + clearReports(g); } } diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index 9365cfb3..658e7001 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -363,6 +363,12 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, *shell_comp = true; } + // Ensure that only vertices with accept edges have reports. 
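/* [Editor's note -- illustrative sketch, not part of the patch. It states the
 * invariant being established here and checked by the sanityCheckGraph()
 * change below; the assumption is that clearReports() drops reports from
 * vertices lacking an edge to accept or acceptEod, which is how the new call
 * sites use it. */
static bool onlyMatchStatesHaveReports(const NGHolder &g) {
    for (auto v : vertices_range(g)) {
        bool match_state = is_match_vertex(v, g) && v != g.accept;
        if (!match_state && !g[v].reports.empty()) {
            return false; /* non-match vertex carrying reports */
        }
    }
    return true;
}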
+ for (auto &gc : comps) { + assert(gc); + clearReports(*gc); + } + // We should never produce empty component graphs. assert(all_of(begin(comps), end(comps), [](const unique_ptr &g_comp) { diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index a82d18b6..a8a5113d 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -79,13 +79,17 @@ bool sanityCheckGraph(const NGHolder &g, } } - // Vertices with edges to accept or acceptEod must have reports. + // Vertices with edges to accept or acceptEod must have reports and + // other vertices must not have them. if (is_match_vertex(v, g) && v != g.accept) { if (g[v].reports.empty()) { - DEBUG_PRINTF("vertex %u has no reports\n", - g[v].index); + DEBUG_PRINTF("vertex %u has no reports\n", g[v].index); return false; } + } else if (!g[v].reports.empty()) { + DEBUG_PRINTF("vertex %u has reports but no accept edge\n", + g[v].index); + return false; } // Participant vertices should have distinct state indices. diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp index 4b16364a..aba4a7c3 100644 --- a/src/nfagraph/ng_rose.cpp +++ b/src/nfagraph/ng_rose.cpp @@ -872,6 +872,7 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, } clear_in_edges(g.accept, g); + clearReports(g); vector verts(pred.begin(), pred.end()); sort(verts.begin(), verts.end(), VertexIndexOrdering(g)); diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp index e9f6be55..c629d553 100644 --- a/src/nfagraph/ng_util.cpp +++ b/src/nfagraph/ng_util.cpp @@ -631,16 +631,18 @@ unique_ptr cloneHolder(const NGHolder &in) { } #ifndef NDEBUG -/** \brief Used in sanity-checking assertions: returns true if all vertices - * leading to accept or acceptEod have at least one report ID. */ + bool allMatchStatesHaveReports(const NGHolder &g) { + unordered_set reporters; for (auto v : inv_adjacent_vertices_range(g.accept, g)) { if (g[v].reports.empty()) { DEBUG_PRINTF("vertex %u has no reports!\n", g[v].index); return false; } + reporters.insert(v); } + for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { if (v == g.accept) { continue; // stylised edge @@ -650,12 +652,20 @@ bool allMatchStatesHaveReports(const NGHolder &g) { g[v].index); return false; } + reporters.insert(v); } + + for (auto v : vertices_range(g)) { + if (!contains(reporters, v) && !g[v].reports.empty()) { + DEBUG_PRINTF("vertex %u is not a match state, but has reports!\n", + g[v].index); + return false; + } + } + return true; } -/** Assertion: returns true if the vertices in this graph are contiguously (and - * uniquely) numbered from zero. */ bool hasCorrectlyNumberedVertices(const NGHolder &g) { size_t count = num_vertices(g); vector ids(count, false); @@ -670,8 +680,6 @@ bool hasCorrectlyNumberedVertices(const NGHolder &g) { && num_vertices(g) == num_vertices(g.g); } -/** Assertion: returns true if the edges in this graph are contiguously (and - * uniquely) numbered from zero. */ bool hasCorrectlyNumberedEdges(const NGHolder &g) { size_t count = num_edges(g); vector ids(count, false); diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h index 833523c7..4f58dc45 100644 --- a/src/nfagraph/ng_util.h +++ b/src/nfagraph/ng_util.h @@ -297,15 +297,29 @@ void clearReports(NGHolder &g); void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new); #ifndef NDEBUG -// Assertions: only available in internal builds -/** \brief Used in sanity-checking assertions: returns true if all vertices - * leading to accept or acceptEod have at least one report ID. 
*/ +// Assertions: only available in internal builds. + +/** + * Used in sanity-checking assertions: returns true if all vertices + * with edges to accept or acceptEod have at least one report ID. Additionally, + * checks that ONLY vertices with edges to accept or acceptEod have reports. + */ bool allMatchStatesHaveReports(const NGHolder &g); +/** + * Assertion: returns true if the vertices in this graph are contiguously (and + * uniquely) numbered from zero. + */ bool hasCorrectlyNumberedVertices(const NGHolder &g); + +/** + * Assertion: returns true if the edges in this graph are contiguously (and + * uniquely) numbered from zero. + */ bool hasCorrectlyNumberedEdges(const NGHolder &g); -#endif + +#endif // NDEBUG } // namespace ue2 diff --git a/unit/internal/lbr.cpp b/unit/internal/lbr.cpp index bd799c0f..e40bda02 100644 --- a/unit/internal/lbr.cpp +++ b/unit/internal/lbr.cpp @@ -36,6 +36,7 @@ #include "nfa/nfa_internal.h" #include "nfa/nfa_api_util.h" #include "nfagraph/ng_lbr.h" +#include "nfagraph/ng_util.h" #include "util/alloc.h" #include "util/compile_context.h" #include "grey.h" @@ -97,6 +98,7 @@ protected: ParsedExpression parsed(0, pattern.c_str(), flags, 0); unique_ptr<NGWrapper> g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); ASSERT_TRUE(isLBR(*g, grey)); diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index 926bf6eb..6bb4fcb9 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -31,14 +31,15 @@ #include "grey.h" #include "compiler/compiler.h" -#include "nfagraph/ng.h" -#include "nfagraph/ng_limex.h" -#include "nfagraph/ng_restructuring.h" #include "nfa/limex_context.h" #include "nfa/limex_internal.h" #include "nfa/nfa_api.h" #include "nfa/nfa_api_util.h" #include "nfa/nfa_internal.h" +#include "nfagraph/ng.h" +#include "nfagraph/ng_limex.h" +#include "nfagraph/ng_restructuring.h" +#include "nfagraph/ng_util.h" #include "util/alloc.h" #include "util/target_info.h" @@ -76,6 +77,7 @@ protected: ParsedExpression parsed(0, expr.c_str(), flags, 0); unique_ptr<NGWrapper> g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); rm.setProgramOffset(0, MATCH_REPORT); @@ -310,10 +312,12 @@ protected: ParsedExpression parsed(0, expr.c_str(), flags, 0); unique_ptr<NGWrapper> g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); // Reverse the graph and add some reports on the accept vertices.
NGHolder g_rev(NFA_REV_PREFIX); reverseHolder(*g, g_rev); + clearReports(g_rev); for (NFAVertex v : inv_adjacent_vertices_range(g_rev.accept, g_rev)) { g_rev[v].reports.insert(0); } @@ -367,6 +371,7 @@ protected: ReportManager rm(cc.grey); unique_ptr g = buildWrapper(rm, cc, parsed); ASSERT_TRUE(g != nullptr); + clearReports(*g); rm.setProgramOffset(0, MATCH_REPORT); diff --git a/unit/internal/nfagraph_repeat.cpp b/unit/internal/nfagraph_repeat.cpp index 2473d755..b34d1271 100644 --- a/unit/internal/nfagraph_repeat.cpp +++ b/unit/internal/nfagraph_repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,6 +32,7 @@ #include "gtest/gtest.h" #include "nfagraph/ng_repeat.h" +#include "nfagraph/ng_util.h" #include "util/depth.h" #include "hs_compile.h" @@ -89,12 +90,15 @@ static const PureRepeatTest pureRepeatTests[] = { { "^..?..?..?..?..?", 5, 10 } }; -INSTANTIATE_TEST_CASE_P(PureRepeat, NFAPureRepeatTest, ValuesIn(pureRepeatTests)); +INSTANTIATE_TEST_CASE_P(PureRepeat, NFAPureRepeatTest, + ValuesIn(pureRepeatTests)); TEST_P(NFAPureRepeatTest, Check) { const PureRepeatTest &t = GetParam(); SCOPED_TRACE(testing::Message() << "Pattern: " << t.pattern); - unique_ptr w(constructGraph(t.pattern, HS_FLAG_ALLOWEMPTY)); + auto w = constructGraph(t.pattern, HS_FLAG_ALLOWEMPTY); + ASSERT_TRUE(w != nullptr); + clearReports(*w); PureRepeat repeat; bool result = isPureRepeat(*w, repeat); From 7f49958824853b2f4d6fa49c5edc41dd2b7c41ee Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 14 Jul 2016 16:17:06 +1000 Subject: [PATCH 129/166] rose: only write out report programs if in use These programs are only used by output-exposed engines. --- src/rose/rose_build_bytecode.cpp | 53 +++++++++++++++++++++++------ src/rose/rose_internal.h | 9 +++-- src/smallwrite/smallwrite_build.cpp | 16 +++++++++ src/smallwrite/smallwrite_build.h | 4 +++ 4 files changed, 70 insertions(+), 12 deletions(-) diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index c0d8d0a7..6eb42d4c 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -4761,27 +4761,57 @@ pair buildLiteralPrograms(RoseBuildImpl &build, build_context &bc) { return {litProgramsOffset, delayRebuildProgramsOffset}; } +/** + * \brief Returns all reports used by output-exposed engines, for which we need + * to generate programs. + */ static -u32 buildReportPrograms(RoseBuildImpl &build, build_context &bc) { - const auto &rm = build.rm; - const u32 numReports = verify_u32(rm.numReports()); - vector programs(numReports); +set findEngineReports(const RoseBuildImpl &build) { + set reports; + + // The small write engine uses these engine report programs. 
+ insert(&reports, build.smwr.all_reports()); + + for (const auto &outfix : build.outfixes) { + insert(&reports, all_reports(outfix)); + } + + const auto &g = build.g; + for (auto v : vertices_range(g)) { + if (g[v].suffix) { + insert(&reports, all_reports(g[v].suffix)); + } + } + + DEBUG_PRINTF("%zu engine reports (of %zu)\n", reports.size(), + build.rm.numReports()); + return reports; +} + +static +pair buildReportPrograms(RoseBuildImpl &build, build_context &bc) { + const auto reports = findEngineReports(build); + vector programs; + programs.reserve(reports.size()); vector program; - for (ReportID id = 0; id < numReports; id++) { + for (ReportID id : reports) { program.clear(); const bool has_som = false; makeCatchupMpv(build, bc, id, program); makeReport(build, id, has_som, program); program = flattenProgram({program}); applyFinalSpecialisation(program); - programs[id] = writeProgram(bc, program); - build.rm.setProgramOffset(id, programs[id]); + u32 offset = writeProgram(bc, program); + programs.push_back(offset); + build.rm.setProgramOffset(id, offset); DEBUG_PRINTF("program for report %u @ %u (%zu instructions)\n", id, programs.back(), program.size()); } - return add_to_engine_blob(bc, begin(programs), end(programs)); + u32 offset = add_to_engine_blob(bc, begin(programs), end(programs)); + u32 count = verify_u32(programs.size()); + return {offset, count}; } static @@ -5174,7 +5204,10 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { auto boundary_out = makeBoundaryPrograms(*this, bc, boundary, dboundary); - u32 reportProgramOffset = buildReportPrograms(*this, bc); + u32 reportProgramOffset; + u32 reportProgramCount; + tie(reportProgramOffset, reportProgramCount) = + buildReportPrograms(*this, bc); // Build NFAs set no_retrigger_queues; @@ -5394,7 +5427,7 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { engine->litProgramOffset = litProgramOffset; engine->litDelayRebuildProgramOffset = litDelayRebuildProgramOffset; engine->reportProgramOffset = reportProgramOffset; - engine->reportProgramCount = verify_u32(rm.reports().size()); + engine->reportProgramCount = reportProgramCount; engine->runtimeImpl = pickRuntimeImpl(*this, bc, outfixEndQueue); engine->mpvTriggeredByLeaf = anyEndfixMpvTriggers(*this); diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 2b646af0..51913984 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -347,10 +347,15 @@ struct RoseEngine { * literals. */ u32 litDelayRebuildProgramOffset; - /** \brief Offset of u32 array of program offsets for internal reports. */ + /** + * \brief Offset of u32 array of program offsets for reports used by + * output-exposed engines. + */ u32 reportProgramOffset; - /** \brief Number of programs for internal reports. */ + /** + * \brief Number of programs for reports used by output-exposed engines. 
+ */ u32 reportProgramCount; /** diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 7fb54440..1cffe514 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -74,6 +74,8 @@ public: void add(const NGWrapper &w) override; void add(const ue2_literal &literal, ReportID r) override; + set all_reports() const override; + bool determiniseLiterals(); const ReportManager &rm; @@ -413,6 +415,20 @@ SmallWriteBuildImpl::build(u32 roseQuality) { return smwr; } +set SmallWriteBuildImpl::all_reports() const { + set reports; + if (poisoned) { + return reports; + } + if (rdfa) { + insert(&reports, ::ue2::all_reports(*rdfa)); + } + for (const auto &cand : cand_literals) { + reports.insert(cand.second); + } + return reports; +} + size_t smwrSize(const SmallWriteEngine *smwr) { assert(smwr); return smwr->size; diff --git a/src/smallwrite/smallwrite_build.h b/src/smallwrite/smallwrite_build.h index 59a8528a..84c6df3a 100644 --- a/src/smallwrite/smallwrite_build.h +++ b/src/smallwrite/smallwrite_build.h @@ -38,6 +38,8 @@ #include "ue2common.h" #include "util/alloc.h" +#include + #include struct SmallWriteEngine; @@ -61,6 +63,8 @@ public: virtual void add(const NGWrapper &w) = 0; virtual void add(const ue2_literal &literal, ReportID r) = 0; + + virtual std::set all_reports() const = 0; }; // Construct a usable SmallWrite builder. From 1ff17a2ea38756d4e32ec71a2c8c851b8eea7e63 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 21 Jul 2016 10:33:21 +1000 Subject: [PATCH 130/166] rose: clear reports in removeFalsePaths --- src/rose/rose_build_add.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 195c4bad..0a91a76a 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -418,6 +418,7 @@ void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { } pruneUseless(g); + clearReports(g); assert(in_degree(g.accept, g) || in_degree(g.acceptEod, g) > 1); assert(allMatchStatesHaveReports(g)); From de543bec24f388aeca1bee09307973db935c78f5 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 21 Jul 2016 12:46:07 +1000 Subject: [PATCH 131/166] ng_violet: use dumpString for debug output --- src/nfagraph/ng_violet.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index c4089f7d..fe917c77 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -672,7 +672,7 @@ unique_ptr findBestSplit(const NGHolder &g, } DEBUG_PRINTF("best is '%s' %u a%d t%d\n", - ((const string &)*best->lit.begin()).c_str(), + dumpString(*best->lit.begin()).c_str(), g[best->vv.front()].index, depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0, depths ? 
(int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0); @@ -1287,7 +1287,7 @@ bool doNetflowCut(NGHolder &h, cut_lits[e] = lits; DEBUG_PRINTF("cut lit '%s' %u->%u\n", - ((const string &)*cut_lits[e].begin()).c_str(), + dumpString(*cut_lits[e].begin()).c_str(), h[source(e, h)].index, h[target(e, h)].index); } From 04634f2e871b7b6bf7bb03e55c847d07e98c7b4c Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 21 Jul 2016 13:25:26 +1000 Subject: [PATCH 132/166] flood_compile: escape chars in debugging --- src/fdr/flood_compile.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/fdr/flood_compile.cpp b/src/fdr/flood_compile.cpp index 2ee5a1c5..62693c30 100644 --- a/src/fdr/flood_compile.cpp +++ b/src/fdr/flood_compile.cpp @@ -69,7 +69,7 @@ static void updateFloodSuffix(vector &tmpFlood, u8 c, u32 suffix) { FDRFlood &fl = tmpFlood[c]; fl.suffix = MAX(fl.suffix, suffix + 1); - DEBUG_PRINTF("Updated Flood Suffix for char '%c' to %u\n", c, fl.suffix); + DEBUG_PRINTF("Updated Flood Suffix for char 0x%02x to %u\n", c, fl.suffix); } static @@ -125,8 +125,9 @@ setupFDRFloodControl(const vector &lits, for (u32 i = 0; i < iEnd; i++) { if (i < litSize) { if (isDifferent(c, lit.s[litSize - i - 1], lit.nocase)) { - DEBUG_PRINTF("non-flood char in literal[%u] %c != %c\n", - i, c, lit.s[litSize - i - 1]); + DEBUG_PRINTF("non-flood char in literal[%u]: " + "0x%02x != 0x%02x\n", + i, c, lit.s[litSize - i - 1]); upSuffix = MIN(upSuffix, i); loSuffix = MIN(loSuffix, i); // makes sense only for case-less break; From 55c2d20e2c8257d9482eee6da35d13083566b9e0 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 21 Jul 2016 13:32:55 +1000 Subject: [PATCH 133/166] rose: use dumpString for debug output --- src/nfagraph/ng_rose.cpp | 10 +++++----- src/rose/rose_build_add.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp index aba4a7c3..24570a01 100644 --- a/src/nfagraph/ng_rose.cpp +++ b/src/nfagraph/ng_rose.cpp @@ -750,7 +750,7 @@ unique_ptr LitCollection::pickNext() { for (auto v : lits.back()->vv) { if (contains(poisoned, v)) { DEBUG_PRINTF("skipping '%s' as overlapped\n", - ((const string &)*lits.back()->lit.begin()).c_str()); + dumpString(*(lits.back()->lit.begin())).c_str()); lits.pop_back(); goto next_lit; } @@ -760,7 +760,7 @@ unique_ptr LitCollection::pickNext() { lits.pop_back(); poisonCandidates(*rv); DEBUG_PRINTF("best is '%s' %u a%d t%d\n", - ((const string &)*rv->lit.begin()).c_str(), + dumpString(*(rv->lit.begin())).c_str(), g[rv->vv.front()].index, (int)createsAnchoredLHS(g, rv->vv, depths, grey), (int)createsTransientLHS(g, rv->vv, depths, grey)); @@ -815,7 +815,7 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, max_delay--; } - DEBUG_PRINTF("killing off '%s'\n", ((const string &)lit).c_str()); + DEBUG_PRINTF("killing off '%s'\n", dumpString(lit).c_str()); set curr, next; curr.insert(g.accept); @@ -892,7 +892,7 @@ u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, u32 delay, const vector &preds) { assert(delay <= lit.length()); - DEBUG_PRINTF("adding on '%s' %u\n", ((const string &)lit).c_str(), delay); + DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay); NFAVertex prev = g.accept; auto it = lit.rbegin(); @@ -1786,7 +1786,7 @@ bool doNetflowCut(RoseInGraph &ig, const vector &to_cut, cut_lits[e] = lits; DEBUG_PRINTF("cut lit '%s'\n", - ((const string 
&)*cut_lits[e].begin()).c_str()); + dumpString(*cut_lits[e].begin()).c_str()); } /* if literals are underlength bail or if it involves a forbidden edge*/ diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 0a91a76a..18968e10 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -375,7 +375,7 @@ void createVertices(RoseBuildImpl *tbi, /* ensure the holder does not accept any paths which do not end with lit */ static void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { - DEBUG_PRINTF("strip '%s'\n", ((const string &)lit).c_str()); + DEBUG_PRINTF("strip '%s'\n", dumpString(lit).c_str()); set curr, next; curr.insert(g.accept); curr.insert(g.acceptEod); From 63528f1cd2724b0e00a7e8e7e1b363f758bdace9 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 21 Jul 2016 12:57:12 +1000 Subject: [PATCH 134/166] ng_violet: iterate in edge order --- src/nfagraph/ng_violet.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index fe917c77..3c79dbc3 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1622,7 +1622,8 @@ void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, static void removeRedundantLiteralsFromInfixes(RoseInGraph &g, const CompileContext &cc) { - map > infixes; + vector seen_order; + map> infixes; for (const RoseInEdge &e : edges_range(g)) { RoseInVertex s = source(e, g); @@ -1637,11 +1638,16 @@ void removeRedundantLiteralsFromInfixes(RoseInGraph &g, } assert(!g[t].delay); - infixes[&*g[e].graph].push_back(e); + + NGHolder *h = g[e].graph.get(); + if (!contains(infixes, h)) { + seen_order.push_back(h); + } + infixes[h].push_back(e); } - for (const auto &info : infixes) { - removeRedundantLiteralsFromInfix(*info.first, g, info.second, cc); + for (NGHolder *h : seen_order) { + removeRedundantLiteralsFromInfix(*h, g, infixes[h], cc); } } From 68ae4cc7c84ae7d618519ecbb6d8a7dda2d41c22 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 21 Jul 2016 14:22:05 +1000 Subject: [PATCH 135/166] ng_violet: det. 
vertex ordering in splitRoseEdge --- src/nfagraph/ng_violet.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 3c79dbc3..27f58d0a 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1007,20 +1007,24 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, insert(&splitter_reports, base_graph[v].reports); } - /* find the targets of each source vertex */ - map > images; + /* find the targets of each source vertex; note the use of vectors to + * preserve deterministic ordering */ + vector sources; + map> images; for (const RoseInEdge &e : ee) { RoseInVertex src = source(e, vg); RoseInVertex dest = target(e, vg); - images[src].insert(dest); + if (!contains(images, src)) { + sources.push_back(src); + } + images[src].push_back(dest); remove_edge(e, vg); } - map, vector > verts_by_image; + map, vector> verts_by_image; - for (const auto &elem : images) { - RoseInVertex u = elem.first; - const auto &image = elem.second; + for (const auto &u : sources) { + const auto &image = images[u]; if (contains(verts_by_image, image)) { for (RoseInVertex v : verts_by_image[image]) { From cbd115f7fef451be3650d847eccbcef0e8bf398a Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Mon, 25 Jul 2016 15:33:40 +1000 Subject: [PATCH 136/166] Don't shadow names --- CMakeLists.txt | 2 +- src/fdr/fdr_streaming_compile.cpp | 6 +-- src/fdr/teddy_compile.cpp | 4 +- src/nfa/goughcompile.cpp | 4 +- src/nfa/multiaccel_compilehelper.cpp | 8 +-- src/nfagraph/ng_violet.cpp | 8 +-- src/parser/Parser.rl | 14 ++--- src/rose/rose_build_add.cpp | 1 - src/rose/rose_build_bytecode.cpp | 6 +-- src/rose/rose_build_compile.cpp | 4 +- src/rose/rose_build_matchers.cpp | 4 +- unit/internal/shufti.cpp | 80 ++++++++++++++-------------- 12 files changed, 70 insertions(+), 71 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e748e955..e1f27562 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,7 +174,7 @@ else() # set compiler flags - more are tested and added later set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") - set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") + set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") if (NOT RELEASE_BUILD) # -Werror is most useful during development, don't potentially break # release builds diff --git a/src/fdr/fdr_streaming_compile.cpp b/src/fdr/fdr_streaming_compile.cpp index f3001743..f84e3ad6 100644 --- a/src/fdr/fdr_streaming_compile.cpp +++ b/src/fdr/fdr_streaming_compile.cpp @@ -195,18 +195,18 @@ struct OffsetIDFromEndOrder { static void fillHashes(const vector &long_lits, size_t max_len, - FDRSHashEntry *tab, size_t numEntries, MODES m, + FDRSHashEntry *tab, size_t numEntries, MODES mode, map &litToOffsetVal) { const u32 nbits = lg2(numEntries); map > > bucketToLitOffPairs; map bucketToBitfield; for (const auto &lit : long_lits) { - if ((m == CASELESS) != lit.nocase) { + if ((mode == CASELESS) != lit.nocase) { continue; } for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { - u32 h = hashLit(lit, j, max_len, m); + u32 h = hashLit(lit, j, max_len, mode); u32 h_ent = h & ((1U << nbits) - 1); u32 h_low = (h >> nbits) & 63; bucketToLitOffPairs[h_ent].emplace_back(lit.id, j); diff --git a/src/fdr/teddy_compile.cpp 
b/src/fdr/teddy_compile.cpp index 729c9c1f..15b9665b 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -266,9 +266,9 @@ bool TeddyCompiler::pack(map &accel_info) - : mcclellan_build_strat(r, rm), rdfa(r), gg(g), + : mcclellan_build_strat(r, rm_in), rdfa(r), gg(g), accel_gough_info(accel_info) {} unique_ptr gatherReports(vector &reports /* out */, vector &reports_eod /* out */, diff --git a/src/nfa/multiaccel_compilehelper.cpp b/src/nfa/multiaccel_compilehelper.cpp index f1cf2a4c..4c1f8101 100644 --- a/src/nfa/multiaccel_compilehelper.cpp +++ b/src/nfa/multiaccel_compilehelper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -347,9 +347,9 @@ void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) { } } -MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, u32 off, - unsigned max_len) : - cr(ref_cr), offset(off), max_len(max_len) { +MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, + u32 off, unsigned max_length) + : cr(ref_cr), offset(off), max_len(max_length) { int accel_num = (int) MultibyteAccelInfo::MAT_MAX; accels.resize(accel_num); diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 27f58d0a..e806d5c7 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1880,15 +1880,15 @@ bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, trimmed.clear(); for (auto &elem : trimmed_vec) { shared_ptr &hp = elem.first; - NGHolder &h = *hp; + NGHolder &eh = *hp; vector base_states; insert(&base_states, base_states.end(), - inv_adjacent_vertices(h.accept, h)); - clear_in_edges(h.accept, h); + inv_adjacent_vertices(eh.accept, eh)); + clear_in_edges(eh.accept, eh); for (auto v : base_states) { - h[v].reports.clear(); /* clear report from old accepts */ + eh[v].reports.clear(); /* clear report from old accepts */ } for (const auto &edge_delay : elem.second) { diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 65cd7c1a..9bd4d96d 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1447,12 +1447,12 @@ unichar readUtf8CodePoint4c(const u8 *ts) { // Otherwise, we interpret the first three digits as an // octal escape, and the remaining characters stand for // themselves as literals. - const u8 *p = ts; + const u8 *s = ts; unsigned int accum = 0; unsigned int oct_digits = 0; - assert(*p == '\\'); // token starts at backslash - for (++p; p < te && oct_digits < 3; ++oct_digits, ++p) { - u8 digit = *p - '0'; + assert(*s == '\\'); // token starts at backslash + for (++s; s < te && oct_digits < 3; ++oct_digits, ++s) { + u8 digit = *s - '0'; if (digit < 8) { accum = digit + accum * 8; } else { @@ -1465,8 +1465,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) { } // And then the rest of the digits, if any, are literal. 
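            // (For example, "\0777" is parsed as the octal escape \077
            // followed by a literal '7', matching PCRE's behaviour.)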
- for (; p < te; ++p) { - addLiteral(currentSeq, *p, mode); + for (; s < te; ++s) { + addLiteral(currentSeq, *s, mode); } } }; diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 18968e10..0f0e8d18 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -709,7 +709,6 @@ void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, g[v].literals.insert(eod_event); build.literal_info[eod_event].vertices.insert(v); - map > report_remap; g[v].left.graph = eod_leftfix; g[v].left.leftfix_report = report_mapping.second; g[v].left.lag = 0; diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 6eb42d4c..085aca79 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -4906,11 +4906,11 @@ void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, assert(contains(bc.roleStateIndices, u)); u32 predStateIdx = bc.roleStateIndices.at(u); - auto program = makeEodAnchorProgram(build, bc, e, multiple_preds); - if (program.empty()) { + auto prog = makeEodAnchorProgram(build, bc, e, multiple_preds); + if (prog.empty()) { continue; } - predProgramLists[predStateIdx].push_back(program); + predProgramLists[predStateIdx].push_back(prog); } } diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index d327193f..8545ca70 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -511,8 +511,8 @@ bool RoseBuildImpl::isDirectReport(u32 id) const { } // Use the program to handle cases that aren't external reports. - for (const ReportID &id : g[v].reports) { - if (!isExternalReport(rm.getReport(id))) { + for (const ReportID &rid : g[v].reports) { + if (!isExternalReport(rm.getReport(rid))) { return false; } } diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 7b20bd1c..2eb70f60 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -416,8 +416,8 @@ bool isDirectHighlander(const RoseBuildImpl &build, const u32 id, return false; } - auto is_simple_exhaustible = [&build](ReportID id) { - const Report &report = build.rm.getReport(id); + auto is_simple_exhaustible = [&build](ReportID rid) { + const Report &report = build.rm.getReport(rid); return isSimpleExhaustible(report); }; diff --git a/unit/internal/shufti.cpp b/unit/internal/shufti.cpp index b8d77d37..81495a9c 100644 --- a/unit/internal/shufti.cpp +++ b/unit/internal/shufti.cpp @@ -283,9 +283,9 @@ TEST(DoubleShufti, BuildMask1) { lits.insert(make_pair('a', 'B')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -326,9 +326,9 @@ TEST(DoubleShufti, BuildMask2) { lits.insert(make_pair('a','z')); lits.insert(make_pair('B','z')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -354,9 +354,9 @@ TEST(DoubleShufti, BuildMask4) { lits.insert(make_pair('A','z')); lits.insert(make_pair('b','z')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -383,9 +383,9 @@ TEST(DoubleShufti, 
BuildMask5) { CharReach bytes; bytes.set('X'); - bool rv = shuftiBuildDoubleMasks(bytes, lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(bytes, lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -421,9 +421,9 @@ TEST(DoubleShufti, BuildMask6) { lits.insert(make_pair('A','x')); lits.insert(make_pair('b','x')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1m, &hi1m, &lo2m, &hi2m); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); u8 *lo1 = (u8 *)&lo1m; u8 *lo2 = (u8 *)&lo2m; @@ -485,9 +485,9 @@ TEST(DoubleShufti, ExecNoMatch1) { lits.insert(make_pair('a','b')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -506,8 +506,8 @@ TEST(DoubleShufti, ExecNoMatch1b) { lits.insert(make_pair('b','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -527,8 +527,8 @@ TEST(DoubleShufti, ExecNoMatch2) { lits.insert(make_pair('a','b')); lits.insert(make_pair('B','b')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -548,8 +548,8 @@ TEST(DoubleShufti, ExecNoMatch2b) { lits.insert(make_pair('b','a')); lits.insert(make_pair('b','B')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -568,8 +568,8 @@ TEST(DoubleShufti, ExecNoMatch3) { lits.insert(make_pair('V','e')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; @@ -588,8 +588,8 @@ TEST(DoubleShufti, ExecNoMatch3b) { lits.insert(make_pair('e','V')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; @@ -608,8 +608,8 @@ TEST(DoubleShufti, ExecMatch1) { lits.insert(make_pair('a','b')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -629,8 +629,8 @@ TEST(DoubleShufti, ExecMatch2) { lits.insert(make_pair('a','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 
0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -651,8 +651,8 @@ TEST(DoubleShufti, ExecMatch3) { lits.insert(make_pair('B','a')); lits.insert(make_pair('a','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbBaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -675,8 +675,8 @@ TEST(DoubleShufti, ExecMatch4) { lits.insert(make_pair('C','a')); lits.insert(make_pair('c','a')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbAaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -717,8 +717,8 @@ TEST(DoubleShufti, ExecMatch4b) { lits.insert(make_pair('a','C')); lits.insert(make_pair('a','c')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); /* 0123456789012345678901234567890 */ char t1[] = "bbbbbbbbbbbbbbbbbaAaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb"; @@ -756,8 +756,8 @@ TEST(DoubleShufti, ExecMatch5) { lits.insert(make_pair('a','A')); - bool rv = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(CharReach(), lits, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -780,8 +780,8 @@ TEST(DoubleShufti, ExecMatchMixed1) { // just one one-byte literal onebyte.set('a'); - bool rv = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -804,8 +804,8 @@ TEST(DoubleShufti, ExecMatchMixed2) { onebyte.set('a'); twobyte.insert(make_pair('x', 'y')); - bool rv = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; char t2[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -838,8 +838,8 @@ TEST(DoubleShufti, ExecMatchMixed3) { onebyte.set('a'); twobyte.insert(make_pair('x', 'y')); - bool rv = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); - ASSERT_TRUE(rv); + bool ret = shuftiBuildDoubleMasks(onebyte, twobyte, &lo1, &hi1, &lo2, &hi2); + ASSERT_TRUE(ret); const int len = 420; char t1[len + 1]; From 89ddb85637cfc5be067fbb127c4b800e067ca243 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Mon, 25 Jul 2016 16:06:37 +1000 Subject: [PATCH 137/166] Remove enum typedef --- src/fdr/fdr_streaming_compile.cpp | 10 +++++----- src/fdr/fdr_streaming_internal.h | 18 +++++++++--------- src/fdr/fdr_streaming_runtime.h | 13 +++++++------ 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/fdr/fdr_streaming_compile.cpp b/src/fdr/fdr_streaming_compile.cpp index f84e3ad6..b2e1656c 100644 --- 
a/src/fdr/fdr_streaming_compile.cpp +++ b/src/fdr/fdr_streaming_compile.cpp @@ -147,7 +147,7 @@ void analyzeLits(const vector &long_lits, size_t max_len, } for (const auto &lit : long_lits) { - MODES m = lit.nocase ? CASELESS : CASEFUL; + Modes m = lit.nocase ? CASELESS : CASEFUL; for (u32 j = 1; j < lit.s.size() - max_len + 1; j++) { hashedPositions[m]++; } @@ -162,7 +162,7 @@ void analyzeLits(const vector &long_lits, size_t max_len, #ifdef DEBUG_COMPILE printf("analyzeLits:\n"); - for (MODES m = CASEFUL; m < MAX_MODES; m++) { + for (Modes m = CASEFUL; m < MAX_MODES; m++) { printf("mode %s boundary %d positions %d hashedPositions %d " "hashEntries %d\n", (m == CASEFUL) ? "caseful" : "caseless", boundaries[m], @@ -173,7 +173,7 @@ void analyzeLits(const vector &long_lits, size_t max_len, } static -u32 hashLit(const hwlmLiteral &l, u32 offset, size_t max_len, MODES m) { +u32 hashLit(const hwlmLiteral &l, u32 offset, size_t max_len, Modes m) { return streaming_hash((const u8 *)l.s.c_str() + offset, max_len, m); } @@ -195,7 +195,7 @@ struct OffsetIDFromEndOrder { static void fillHashes(const vector &long_lits, size_t max_len, - FDRSHashEntry *tab, size_t numEntries, MODES mode, + FDRSHashEntry *tab, size_t numEntries, Modes mode, map &litToOffsetVal) { const u32 nbits = lg2(numEntries); map > > bucketToLitOffPairs; @@ -412,7 +412,7 @@ fdrBuildTableStreaming(const vector &lits, ptr = secondaryTable.get() + htOffset[CASEFUL]; for (u32 m = CASEFUL; m < MAX_MODES; ++m) { fillHashes(long_lits, max_len, (FDRSHashEntry *)ptr, hashEntries[m], - (MODES)m, litToOffsetVal); + (Modes)m, litToOffsetVal); ptr += htSize[m]; } diff --git a/src/fdr/fdr_streaming_internal.h b/src/fdr/fdr_streaming_internal.h index 26602ce1..11b07b56 100644 --- a/src/fdr/fdr_streaming_internal.h +++ b/src/fdr/fdr_streaming_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,11 +41,11 @@ // hash table (caseful) (FDRSHashEntry) // hash table (caseless) (FDRSHashEntry) -typedef enum { +enum Modes { CASEFUL = 0, CASELESS = 1, MAX_MODES = 2 -} MODES; +}; // We have one of these structures hanging off the 'link' of our secondary // FDR table that handles streaming strings @@ -91,12 +91,12 @@ struct FDRSHashEntry { }; static really_inline -u32 get_start_lit_idx(const struct FDRSTableHeader * h, MODES m) { +u32 get_start_lit_idx(const struct FDRSTableHeader * h, enum Modes m) { return m == CASEFUL ? 
0 : h->boundary[m-1]; } static really_inline -u32 get_end_lit_idx(const struct FDRSTableHeader * h, MODES m) { +u32 get_end_lit_idx(const struct FDRSTableHeader * h, enum Modes m) { return h->boundary[m]; } @@ -107,17 +107,17 @@ const struct FDRSLiteral * getLitTab(const struct FDRSTableHeader * h) { } static really_inline -u32 getBaseOffsetOfLits(const struct FDRSTableHeader * h, MODES m) { +u32 getBaseOffsetOfLits(const struct FDRSTableHeader * h, enum Modes m) { return getLitTab(h)[get_start_lit_idx(h, m)].offset; } static really_inline -u32 packStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) { +u32 packStateVal(const struct FDRSTableHeader * h, enum Modes m, u32 v) { return v - getBaseOffsetOfLits(h, m) + 1; } static really_inline -u32 unpackStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) { +u32 unpackStateVal(const struct FDRSTableHeader * h, enum Modes m, u32 v) { return v + getBaseOffsetOfLits(h, m) - 1; } @@ -127,7 +127,7 @@ u32 has_bit(const struct FDRSHashEntry * ent, u32 bit) { } static really_inline -u32 streaming_hash(const u8 *ptr, UNUSED size_t len, MODES mode) { +u32 streaming_hash(const u8 *ptr, UNUSED size_t len, enum Modes mode) { const u64a CASEMASK = 0xdfdfdfdfdfdfdfdfULL; const u64a MULTIPLIER = 0x0b4e0ef37bc32127ULL; assert(len >= 32); diff --git a/src/fdr/fdr_streaming_runtime.h b/src/fdr/fdr_streaming_runtime.h index fa5843c5..8e264c76 100644 --- a/src/fdr/fdr_streaming_runtime.h +++ b/src/fdr/fdr_streaming_runtime.h @@ -143,7 +143,7 @@ u32 fdrStreamStateActive(const struct FDR * fdr, const u8 * stream_state) { // binary search for the literal index that contains the current state static really_inline u32 findLitTabEntry(const struct FDRSTableHeader * streamingTable, - u32 stateValue, MODES m) { + u32 stateValue, enum Modes m) { const struct FDRSLiteral * litTab = getLitTab(streamingTable); u32 lo = get_start_lit_idx(streamingTable, m); u32 hi = get_end_lit_idx(streamingTable, m); @@ -175,7 +175,7 @@ void fdrUnpackStateMode(struct FDR_Runtime_Args *a, const struct FDRSTableHeader *streamingTable, const struct FDRSLiteral * litTab, const u32 *state_table, - const MODES m) { + const enum Modes m) { if (!state_table[m]) { return; } @@ -213,8 +213,9 @@ void fdrUnpackState(const struct FDR * fdr, struct FDR_Runtime_Args * a, } static really_inline -u32 do_single_confirm(const struct FDRSTableHeader * streamingTable, - const struct FDR_Runtime_Args * a, u32 hashState, MODES m) { +u32 do_single_confirm(const struct FDRSTableHeader *streamingTable, + const struct FDR_Runtime_Args *a, u32 hashState, + enum Modes m) { const struct FDRSLiteral * litTab = getLitTab(streamingTable); u32 idx = findLitTabEntry(streamingTable, hashState, m); size_t found_offset = litTab[idx].offset; @@ -279,7 +280,7 @@ void fdrFindStreamingHash(const struct FDR_Runtime_Args *a, static really_inline const struct FDRSHashEntry *getEnt(const struct FDRSTableHeader *streamingTable, - u32 h, const MODES m) { + u32 h, const enum Modes m) { u32 nbits = streamingTable->hashNBits[m]; if (!nbits) { return NULL; @@ -303,7 +304,7 @@ const struct FDRSHashEntry *getEnt(const struct FDRSTableHeader *streamingTable, static really_inline void fdrPackStateMode(u32 *state_table, const struct FDR_Runtime_Args *a, const struct FDRSTableHeader *streamingTable, - const struct FDRSHashEntry *ent, const MODES m) { + const struct FDRSHashEntry *ent, const enum Modes m) { assert(ent); assert(streamingTable->hashNBits[m]); From 151810b4fc534d63880d3b82763f073bdab388c1 Mon Sep 17 00:00:00 2001 From: 
Matthew Barr Date: Mon, 25 Jul 2016 16:24:30 +1000 Subject: [PATCH 138/166] Older gcc doesn't like shadowing the function --- src/rose/rose_build_impl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 15047491..d239a698 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -338,7 +338,7 @@ struct OutfixInfo { template explicit OutfixInfo(std::unique_ptr x) : proto(std::move(x)) {} - explicit OutfixInfo(MpvProto mpv) : proto(std::move(mpv)) {} + explicit OutfixInfo(MpvProto mpv_in) : proto(std::move(mpv_in)) {} u32 get_queue(QueueIndexFactory &qif); @@ -348,14 +348,14 @@ struct OutfixInfo { } bool is_nonempty_mpv() const { - auto *mpv = boost::get(&proto); - return mpv && !mpv->empty(); + auto *m = boost::get(&proto); + return m && !m->empty(); } bool is_dead() const { - auto *mpv = boost::get(&proto); - if (mpv) { - return mpv->empty(); + auto *m = boost::get(&proto); + if (m) { + return m->empty(); } return boost::get(&proto) != nullptr; } From a8cceeeddc40b0fec0001ae23e6129e88f8391da Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 27 Jul 2016 09:29:39 +1000 Subject: [PATCH 139/166] ng_violet: fix non-determinism in splitEdgesByCut --- src/nfagraph/ng_violet.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index e806d5c7..8843196f 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1126,18 +1126,19 @@ static void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, const vector &to_cut, const vector &cut, - const map > &cut_lits) { - set sources; - for (const RoseInEdge &ve : to_cut) { - assert(&h == &*vg[ve].graph); - sources.insert(source(ve, vg)); - } - + const map> &cut_lits) { DEBUG_PRINTF("splitting %s:\n", to_string(h.kind).c_str()); /* create literal vertices and connect preds */ - map > > verts_by_source; - for (RoseInVertex src : sources) { + unordered_set done_sources; + map>> verts_by_source; + for (const RoseInEdge &ve : to_cut) { + assert(&h == &*vg[ve].graph); + RoseInVertex src = source(ve, vg); + if (!done_sources.insert(src).second) { + continue; /* already processed */ + } + /* iterate over cut for determinism */ for (const auto &e : cut) { NFAVertex prev_v = source(e, h); From f3ccbf19b88be78bced19e0d3d8c74edd90d3bd8 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 27 Jul 2016 10:23:42 +1000 Subject: [PATCH 140/166] ng_violet: fix non-determinism in deanchorIfNeeded --- src/nfagraph/ng_violet.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 8843196f..27f84e99 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1333,7 +1333,10 @@ bool deanchorIfNeeded(NGHolder &g) { if (succ_v == succ_g) { DEBUG_PRINTF("found ^.*\n"); - for (auto succ : succ_g) { + for (auto succ : adjacent_vertices_range(g.start, g)) { + if (succ == g.startDs) { + continue; + } add_edge(g.startDs, succ, g); } clear_vertex(v, g); From d119dd95fd020b27cff21c8a602e0ea6e7404605 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 27 Jul 2016 10:42:46 +1000 Subject: [PATCH 141/166] ng_violet: trivial typo fixes --- src/nfagraph/ng_violet.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 27f84e99..757582f5 100644 --- a/src/nfagraph/ng_violet.cpp +++ 
b/src/nfagraph/ng_violet.cpp @@ -878,7 +878,7 @@ unique_ptr findBestPrefixSplit(const NGHolder &g, auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength, nullptr, &bad_vertices, cc); - /* large back edges may prevent us identifing anchored or transient cases + /* large back edges may prevent us identifying anchored or transient cases * properly - use a simple walk instead */ if (!rv || !(rv->creates_transient || rv->creates_anchored)) { auto rv2 = findSimplePrefixSplit(g, cc); @@ -1601,7 +1601,7 @@ void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, max_allowed_delay); if (delay == MO_INVALID_IDX) { - /* successor literal could not match infix -> ignore flase path */ + /* successor literal could not match infix -> ignore false path */ assert(0); continue; } @@ -1677,7 +1677,9 @@ RoseInVertex getStart(RoseInGraph &vg) { return RoseInGraph::null_vertex(); } -/* Finds the intial accept vertex created to which suffix/outfixes are attached +/** + * Finds the initial accept vertex created to which suffix/outfixes are + * attached. */ static RoseInVertex getPrimaryAccept(RoseInGraph &vg) { From c58d9d04a12cc10aba6a457cf93d6f0ad76aad74 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 27 Jul 2016 11:19:58 +1000 Subject: [PATCH 142/166] ng_violet: fix non-det in lookForCleanEarlySplits --- src/nfagraph/ng_violet.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 757582f5..26fb0ef5 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -2529,15 +2529,22 @@ bool lookForCleanSplit(const NGHolder &h, const vector &ee, static void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) { u32 gen = 0; - set prev = {getStart(vg)}; + + vector prev = {getStart(vg)}; while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) { - set curr; + /* collect vertices in edge order for determinism */ + vector curr; + set curr_seen; for (RoseInVertex u : prev) { - insert(&curr, adjacent_vertices(u, vg)); + for (auto v : adjacent_vertices_range(u, vg)) { + if (curr_seen.insert(v).second) { + curr.push_back(v); + } + } } - map > rightfixes; + map> rightfixes; vector ordered_graphs; for (RoseInVertex v : curr) { for (const RoseInEdge &e : out_edges_range(v, vg)) { From 67e450115aa85c86fcfeaa356143f3d5d2487d66 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 28 Jul 2016 12:38:26 +1000 Subject: [PATCH 143/166] parser: ignore \E that is not preceded by \Q This conforms to PCRE's behaviour, where an isolated \E that is not preceded by \Q is ignored. --- src/parser/Parser.rl | 5 ++--- unit/hyperscan/bad_patterns.txt | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 9bd4d96d..53130ddf 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1226,9 +1226,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) { '\\Q' => { fgoto readQuotedLiteral; }; - '\\E' => { - throw LocatedParseError("Unmatched \\E"); - }; + # An \E that is not preceded by a \Q is ignored + '\\E' => { /* noop */ }; # Match any character '\.' => { currentSeq->addComponent(generateComponent(CLASS_ANY, false, mode)); diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 9fc3a413..1a33210d 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -32,7 +32,6 @@ 31:/\B/W #\B unsupported in UCP mode at index 0. 
32:/foo(?{print "Hello world\n";})bar/ #Embedded code is not supported at index 3. 33:/the (\S+)(?{ $color = $^N }) (\S+)(?{ $animal = $^N })/i #Embedded code is not supported at index 9. -34:/foobar\E/s #Unmatched \E at index 6. 35:/\X/8 #\X unsupported at index 0. 36:/\B+/ #Invalid repeat at index 2. 37:/\B?/ #Invalid repeat at index 2. From 2aaa292aaed79b44a13d1f15cc84b71d9fa17530 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 26 Jul 2016 09:45:57 +1000 Subject: [PATCH 144/166] dump: offsets for SOM_REL, SOM_ABS reports --- src/nfagraph/ng_dump.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/nfagraph/ng_dump.cpp b/src/nfagraph/ng_dump.cpp index 60122cf3..57668caf 100644 --- a/src/nfagraph/ng_dump.cpp +++ b/src/nfagraph/ng_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -450,7 +450,13 @@ void dumpReportManager(const ReportManager &rm, const Grey &grey) { fprintf(f, " reverse nfa: %u", report.revNfaIndex); } if (isSomRelSetReport(report)) { - fprintf(f, " set, adjust: %lld", report.somDistance); + fprintf(f, " set, adjust: %llu", report.somDistance); + } + if (report.type == EXTERNAL_CALLBACK_SOM_REL) { + fprintf(f, " relative: %llu", report.somDistance); + } + if (report.type == EXTERNAL_CALLBACK_SOM_ABS) { + fprintf(f, " absolute: %llu", report.somDistance); } fprintf(f, "\n"); } From cded5552c2f2570ce4f517cabf7d50c6817be9ff Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Fri, 29 Jul 2016 15:47:55 +1000 Subject: [PATCH 145/166] rose: don't leave stray reports when copying the subgraph leading to a report --- src/rose/rose_build_role_aliasing.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index b2f6b385..2c7568f4 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -1234,8 +1234,7 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a, ReportID new_report = build.getNewNfaReport(); shared_ptr new_graph = cloneHolder(*b_h); duplicateReport(*new_graph, b_left.leftfix_report, new_report); - pruneReportIfUnused(build, new_graph, set(), - b_left.leftfix_report); + pruneAllOtherReports(*new_graph, new_report); rai.rev_leftfix[a_left_id].erase(a); rai.rev_leftfix[b_left_id].erase(b); From b8d33732b5248bdedaf7d37e5e7e75a0e48f6bf0 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Wed, 27 Jul 2016 16:55:28 +1000 Subject: [PATCH 146/166] Check for misaligned memory in compile error code We now check that mem alloc for error message is aligned, and fail with an appropriate message in the compile error. 
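For illustration, a sketch of the sort of allocator this guards against
(the two_aligned_malloc helper used by the new unit tests may be defined
differently):

    #include <stdlib.h>

    /* Returns pointers offset by two bytes, which are not suitably
     * aligned for the largest representable data type. */
    static void *two_aligned_malloc(size_t n) {
        char *p = (char *)malloc(n + 2);
        return p ? p + 2 : NULL;
    }

With such an allocator installed via hs_set_misc_allocator(), a failing
compile now returns a static error structure carrying the message
"Allocator returned misaligned memory." rather than constructing the
error message in misaligned memory.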
--- src/compiler/error.cpp | 19 ++++++++++++-- src/hs_compile.h | 8 +++++- unit/hyperscan/allocators.cpp | 48 ++++++++++++++++++++++++++++++++++- 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/src/compiler/error.cpp b/src/compiler/error.cpp index e806b7a0..07db9819 100644 --- a/src/compiler/error.cpp +++ b/src/compiler/error.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,7 @@ using std::string; static const char failureNoMemory[] = "Unable to allocate memory."; static const char failureInternal[] = "Internal error."; +static const char failureBadAlloc[] = "Allocator returned misaligned memory."; extern const hs_compile_error_t hs_enomem = { const_cast(failureNoMemory), 0 @@ -49,6 +50,9 @@ extern const hs_compile_error_t hs_enomem = { extern const hs_compile_error_t hs_einternal = { const_cast(failureInternal), 0 }; +extern const hs_compile_error_t hs_badalloc = { + const_cast(failureBadAlloc), 0 +}; namespace ue2 { @@ -56,8 +60,18 @@ hs_compile_error_t *generateCompileError(const string &err, int expression) { hs_compile_error_t *ret = (struct hs_compile_error *)hs_misc_alloc(sizeof(hs_compile_error_t)); if (ret) { + hs_error_t e = hs_check_alloc(ret); + if (e != HS_SUCCESS) { + hs_misc_free(ret); + return const_cast(&hs_badalloc); + } char *msg = (char *)hs_misc_alloc(err.size() + 1); if (msg) { + e = hs_check_alloc(msg); + if (e != HS_SUCCESS) { + hs_misc_free(msg); + return const_cast(&hs_badalloc); + } memcpy(msg, err.c_str(), err.size() + 1); ret->message = msg; } else { @@ -83,7 +97,8 @@ void freeCompileError(hs_compile_error_t *error) { if (!error) { return; } - if (error == &hs_enomem || error == &hs_einternal) { + if (error == &hs_enomem || error == &hs_einternal || + error == &hs_badalloc) { // These are not allocated. return; } diff --git a/src/hs_compile.h b/src/hs_compile.h index 48168cc2..c5212cbe 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -98,6 +98,12 @@ extern "C" * The library was unable to allocate temporary storage used during * compilation time. * + * - *Allocator returned misaligned memory* + * + * The memory allocator (either malloc() or the allocator set with @ref + * hs_set_allocator()) did not correctly return memory suitably aligned + * for the largest representable data type on this platform. 
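+ *      (The error structure returned in this case is statically allocated;
+ *      @ref hs_free_compile_error() recognises it and will not attempt to
+ *      free it.)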
+ * * - *Internal error* * * An unexpected error occurred: if this error is reported, please contact diff --git a/unit/hyperscan/allocators.cpp b/unit/hyperscan/allocators.cpp index 66c456ee..40c45072 100644 --- a/unit/hyperscan/allocators.cpp +++ b/unit/hyperscan/allocators.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,9 @@ #include "test_util.h" #include +#include + +using std::string; static void *null_malloc(size_t) { return nullptr; } @@ -83,6 +86,22 @@ TEST(CustomAllocator, TwoAlignedCompile) { hs_set_database_allocator(nullptr, nullptr); } +TEST(CustomAllocator, TwoAlignedCompileError) { + hs_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + const hs_platform_info_t *platform = nullptr; + hs_error_t err = + hs_compile("\\1", 0, HS_MODE_BLOCK, platform, &db, &compile_err); + ASSERT_EQ(HS_COMPILER_ERROR, err); + ASSERT_EQ(nullptr, db); + ASSERT_NE(nullptr, compile_err); + EXPECT_STREQ("Allocator returned misaligned memory.", compile_err->message); + hs_free_compile_error(compile_err); + hs_set_database_allocator(nullptr, nullptr); +} + TEST(CustomAllocator, TwoAlignedDatabaseInfo) { hs_database_t *db = buildDB("foobar", 0, 0, HS_MODE_BLOCK); ASSERT_TRUE(db != nullptr); @@ -149,3 +168,30 @@ TEST(CustomAllocator, TwoAlignedAllocScratch) { hs_set_scratch_allocator(nullptr, nullptr); hs_free_database(db); } + +TEST(CustomAllocator, NullMallocExpressionInfo) { + hs_set_allocator(null_malloc, nullptr); + + string pattern = "foobar"; + hs_expr_info_t *info = nullptr; + hs_compile_error_t *c_err = nullptr; + hs_error_t err = hs_expression_info(pattern.c_str(), 0, &info, &c_err); + ASSERT_EQ(HS_COMPILER_ERROR, err); + ASSERT_NE(nullptr, c_err); + hs_free_compile_error(c_err); + hs_set_allocator(nullptr, nullptr); +} + +TEST(CustomAllocator, TwoAlignedExpressionInfo) { + hs_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + string pattern = "\\1"; + hs_expr_info_t *info = nullptr; + hs_compile_error_t *c_err = nullptr; + hs_error_t err = hs_expression_info(pattern.c_str(), 0, &info, &c_err); + ASSERT_EQ(HS_COMPILER_ERROR, err); + ASSERT_NE(nullptr, c_err); + EXPECT_STREQ("Allocator returned misaligned memory.", c_err->message); + hs_free_compile_error(c_err); + hs_set_allocator(nullptr, nullptr); +} From 093029b5d1494a1a3872ea0d9681651f0f14e4dc Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 1 Aug 2016 12:43:13 +1000 Subject: [PATCH 147/166] add a csv version of rose_components --- src/rose/rose_dump.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp index dedd8fcf..a3d00943 100644 --- a/src/rose/rose_dump.cpp +++ b/src/rose/rose_dump.cpp @@ -40,6 +40,7 @@ #include "nfa/nfa_build_util.h" #include "nfa/nfa_dump_api.h" #include "nfa/nfa_internal.h" +#include "nfa/nfa_kind.h" #include "util/dump_charclass.h" #include "util/multibit_internal.h" #include "util/multibit.h" @@ -697,6 +698,76 @@ void dumpComponentInfo(const RoseEngine *t, const string &base) { } } + +static +void dumpComponentInfoCsv(const RoseEngine *t, const string &base) { + FILE *f = fopen((base +"rose_components.csv").c_str(), "w"); + + fprintf(f, "Index, Offset,Engine Type,States,Stream State,Bytecode Size," + 
"Kind,Notes\n"); + + for (u32 i = 0; i < t->queueCount; i++) { + const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); + const NFA *n = getNfaByInfo(t, nfa_info); + nfa_kind kind; + stringstream notes; + + if (i < t->outfixBeginQueue) { + notes << "chained;"; + } + + if (nfa_info->eod) { + notes << "eod;"; + } + + if (i < t->outfixEndQueue) { + kind = NFA_OUTFIX; + } else if (i < t->leftfixBeginQueue) { + kind = NFA_SUFFIX; + } else { + const LeftNfaInfo *left = getLeftInfoByQueue(t, i); + if (left->eager) { + notes << "eager;"; + } + if (left->transient) { + notes << "transient " << (u32)left->transient << ";"; + } + if (left->infix) { + kind = NFA_INFIX; + u32 maxQueueLen = left->maxQueueLen; + if (maxQueueLen != (u32)(-1)) { + notes << "maxqlen=" << maxQueueLen << ";"; + } + } else { + kind = NFA_PREFIX; + } + notes << "maxlag=" << left->maxLag << ";"; + if (left->stopTable) { + notes << "miracles;"; + } + if (left->countingMiracleOffset) { + auto cm = (const RoseCountingMiracle *) + ((const char *)t + left->countingMiracleOffset); + notes << "counting_miracle:" << (int)cm->count + << (cm->shufti ? "s" : "v") << ";"; + } + if (nfaSupportsZombie(n)) { + notes << " zombie;"; + } + if (left->eod_check) { + notes << "left_eod;"; + } + } + + fprintf(f, "%u,%zd,\"%s\",%u,%u,%u,%s,%s\n", i, + (const char *)n - (const char *)t, describe(*n).c_str(), + n->nPositions, n->streamStateSize, n->length, + to_string(kind).c_str(), notes.str().c_str()); + } + fclose(f); +} + + static void dumpExhaust(const RoseEngine *t, const string &base) { stringstream sstxt; @@ -1113,6 +1184,7 @@ void roseDumpStructRaw(const RoseEngine *t, FILE *f) { void roseDumpComponents(const RoseEngine *t, bool dump_raw, const string &base) { dumpComponentInfo(t, base); + dumpComponentInfoCsv(t, base); dumpNfas(t, dump_raw, base); dumpAnchored(t, base); dumpRevComponentInfo(t, base); From a9fddbc400c9594514214bcdd27f9fb3809c0c18 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 2 Aug 2016 14:49:38 +1000 Subject: [PATCH 148/166] nfa: delete largely-unused struct LimExNFABase --- src/nfa/limex_internal.h | 20 +--- src/nfa/nfa_build_util.cpp | 192 ++++++++++++++++++++++++++----------- 2 files changed, 137 insertions(+), 75 deletions(-) diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index 1483a911..6bc9a597 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -99,24 +99,6 @@ enum LimExSquash { LIMEX_SQUASH_REPORT = 3 //!< squash when report is raised }; -struct LimExNFABase { - u8 reachMap[N_CHARS]; - u32 reachSize; - u32 accelCount; - u32 accelTableOffset; - u32 accelAuxCount; - u32 accelAuxOffset; - u32 acceptCount; - u32 acceptOffset; - u32 acceptEodCount; - u32 acceptEodOffset; - u32 exceptionCount; - u32 exceptionOffset; - u32 exReportOffset; - u32 repeatCount; - u32 repeatOffset; -}; - /* uniform looking types for the macros */ typedef u8 u_8; typedef u16 u_16; @@ -137,7 +119,7 @@ struct NFAException##size { \ u8 trigger; /**< from enum LimExTrigger */ \ }; \ \ -struct LimExNFA##size { /* MUST align with LimExNFABase */ \ +struct LimExNFA##size { \ u8 reachMap[N_CHARS]; /**< map of char -> entry in reach[] */ \ u32 reachSize; /**< number of reach masks */ \ u32 accelCount; /**< number of entries in accel table */ \ diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 9244dcfb..ce473196 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -78,7 +78,7 @@ struct DISPATCH_BY_NFA_TYPE_INT { decltype(arg), (NFAEngineType)0>::doOp(i, arg) } -typedef 
bool (*has_accel_fn)(const NFA *nfa); +typedef bool (*nfa_dispatch_fn)(const NFA *nfa); template static @@ -87,8 +87,37 @@ bool has_accel_limex(const NFA *nfa) { return limex->accelCount; } +template static -bool has_accel_generic(const NFA *) { +bool has_repeats_limex(const NFA *nfa) { + const T *limex = (const T *)getImplNfa(nfa); + return limex->repeatCount; +} + + +template +static +bool has_repeats_other_than_firsts_limex(const NFA *nfa) { + const T *limex = (const T *)getImplNfa(nfa); + const char *ptr = (const char *)limex; + + const u32 *repeatOffset = (const u32 *)(ptr + limex->repeatOffset); + + for (u32 i = 0; i < limex->repeatCount; i++) { + u32 offset = repeatOffset[i]; + const NFARepeatInfo *info = (const NFARepeatInfo *)(ptr + offset); + const RepeatInfo *repeat = + (const RepeatInfo *)((const char *)info + sizeof(*info)); + if (repeat->type != REPEAT_FIRST) { + return true; + } + } + + return false; +} + +static +bool dispatch_false(const NFA *) { return false; } @@ -146,13 +175,20 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER}; static const NFACategory category = NFA_LIMEX; \ typedef LimExNFA##mlt_size implNFA_t; \ typedef u_##mlt_size tableRow_t; \ - static const has_accel_fn has_accel; \ + static const nfa_dispatch_fn has_accel; \ + static const nfa_dispatch_fn has_repeats; \ + static const nfa_dispatch_fn has_repeats_other_than_firsts; \ static const u32 stateAlign = \ MAX(alignof(tableRow_t), alignof(RepeatControl)); \ static const bool fast = mlt_size <= 64; \ }; \ - const has_accel_fn NFATraits::has_accel \ + const nfa_dispatch_fn NFATraits::has_accel \ = has_accel_limex; \ + const nfa_dispatch_fn NFATraits::has_repeats \ + = has_repeats_limex; \ + const nfa_dispatch_fn \ + NFATraits::has_repeats_other_than_firsts \ + = has_repeats_other_than_firsts_limex; \ DO_IF_DUMP_SUPPORT( \ const char *NFATraits::name \ = "LimEx "#mlt_size; \ @@ -173,9 +209,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "McClellan 8"; #endif @@ -185,9 +225,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 2; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "McClellan 16"; #endif @@ -197,9 +241,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn 
has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Goughfish 8"; #endif @@ -209,9 +257,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Goughfish 16"; #endif @@ -221,9 +273,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Mega-Puff-Vac"; #endif @@ -233,9 +289,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Castle"; #endif @@ -245,9 +305,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (D)"; #endif @@ -257,9 +321,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn 
has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (V)"; #endif @@ -269,9 +337,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (NV)"; #endif @@ -281,9 +353,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (S)"; #endif @@ -293,9 +369,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Lim Bounded Repeat (M)"; #endif @@ -305,9 +385,13 @@ template<> struct NFATraits { static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 32; static const bool fast = true; - static const has_accel_fn has_accel; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const has_accel_fn NFATraits::has_accel = has_accel_generic; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) const char *NFATraits::name = "Tamarama"; #endif @@ -362,42 +446,39 @@ struct is_limex { }; } +namespace { +template +struct has_repeats_other_than_firsts_dispatch { + static nfa_dispatch_fn call(const void *) { + return NFATraits::has_repeats_other_than_firsts; + } +}; +} + bool has_bounded_repeats_other_than_firsts(const NFA &nfa) { - if 
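
The dispatch structs here and below turn each query into a table lookup: DISPATCH_BY_NFA_TYPE switches on the runtime NFA::type, instantiates the dispatch struct for the matching NFATraits specialization, and hands back the stored function pointer for the caller to invoke. A simplified model with only two stand-in engine types (the real macro enumerates every NFAEngineType):

    // Simplified model of the DISPATCH_BY_NFA_TYPE pattern: the runtime type
    // tag selects a statically-known trait, so "does this NFA have repeats?"
    // becomes one switch plus an indirect call. All names here are stand-ins.
    struct ToyNFA { unsigned type; };
    typedef bool (*toy_dispatch_fn)(const ToyNFA *);

    enum ToyEngineType { TOY_LIMEX, TOY_MCCLELLAN };

    template<ToyEngineType t> struct ToyTraits;
    template<> struct ToyTraits<TOY_LIMEX> {
        static bool has_repeats(const ToyNFA *) { return true; }  // placeholder
    };
    template<> struct ToyTraits<TOY_MCCLELLAN> {
        static bool has_repeats(const ToyNFA *) { return false; } // cf. dispatch_false
    };

    static toy_dispatch_fn lookup_has_repeats(const ToyNFA &n) {
        switch (n.type) {
        case TOY_LIMEX:     return ToyTraits<TOY_LIMEX>::has_repeats;
        case TOY_MCCLELLAN: return ToyTraits<TOY_MCCLELLAN>::has_repeats;
        default:            return nullptr;
        }
    }

    static bool toy_has_bounded_repeats(const ToyNFA &n) {
        return lookup_has_repeats(n)(&n); // mirrors DISPATCH_BY_NFA_TYPE(...)(&nfa)
    }
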
(!DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, is_limex, &nfa)) { - return false; + return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, + has_repeats_other_than_firsts_dispatch, + &nfa)(&nfa); +} + +namespace { +template +struct has_repeats_dispatch { + static nfa_dispatch_fn call(const void *) { + return NFATraits::has_repeats; } - - const LimExNFABase *limex = (const LimExNFABase *)getImplNfa(&nfa); - const char *ptr = (const char *)limex; - - const u32 *repeatOffset = (const u32 *)(ptr + limex->repeatOffset); - - for (u32 i = 0; i < limex->repeatCount; i++) { - u32 offset = repeatOffset[i]; - const NFARepeatInfo *info = (const NFARepeatInfo *)(ptr + offset); - const RepeatInfo *repeat = - (const RepeatInfo *)((const char *)info + sizeof(*info)); - if (repeat->type != REPEAT_FIRST) { - return true; - } - } - - return false; +}; } bool has_bounded_repeats(const NFA &nfa) { - if (!DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, is_limex, &nfa)) { - return false; - } - - const LimExNFABase *limex = (const LimExNFABase *)getImplNfa(&nfa); - return limex->repeatCount; + return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, has_repeats_dispatch, + &nfa)(&nfa); } namespace { template struct has_accel_dispatch { - static has_accel_fn call(const void *) { + static nfa_dispatch_fn call(const void *) { return NFATraits::has_accel; } }; @@ -405,8 +486,7 @@ struct has_accel_dispatch { bool has_accel(const NFA &nfa) { return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, has_accel_dispatch, - &nfa) - (&nfa); + &nfa)(&nfa); } bool requires_decompress_key(const NFA &nfa) { From 56bf25b0917b6faf6b7ac20b373f0b3faa38549f Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Thu, 19 May 2016 14:16:35 +0100 Subject: [PATCH 149/166] McClellan refactor Taking dfa strat out of McClellan, to be reused by other DFAs --- CMakeLists.txt | 6 +- ...le_accel.cpp => accel_dfa_build_strat.cpp} | 413 ++++++++++++------ ...ompile_accel.h => accel_dfa_build_strat.h} | 47 +- src/nfa/dfa_build_strat.cpp | 211 +++++++++ src/nfa/dfa_build_strat.h | 68 +++ src/nfa/mcclellancompile.cpp | 157 +------ src/nfa/mcclellancompile.h | 41 +- src/nfa/mcclellancompile_util.cpp | 59 --- src/nfa/mcclellancompile_util.h | 2 - 9 files changed, 624 insertions(+), 380 deletions(-) rename src/nfa/{mcclellancompile_accel.cpp => accel_dfa_build_strat.cpp} (58%) mode change 100644 => 100755 rename src/nfa/{mcclellancompile_accel.h => accel_dfa_build_strat.h} (60%) mode change 100644 => 100755 create mode 100755 src/nfa/dfa_build_strat.cpp create mode 100644 src/nfa/dfa_build_strat.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e1f27562..1719ec2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -597,11 +597,15 @@ SET (hs_SRCS src/hwlm/noodle_build.h src/hwlm/noodle_internal.h src/nfa/accel.h + src/nfa/accel_dfa_build_strat.cpp + src/nfa/accel_dfa_build_strat.h src/nfa/accelcompile.cpp src/nfa/accelcompile.h src/nfa/callback.h src/nfa/castlecompile.cpp src/nfa/castlecompile.h + src/nfa/dfa_build_strat.cpp + src/nfa/dfa_build_strat.h src/nfa/dfa_min.cpp src/nfa/dfa_min.h src/nfa/goughcompile.cpp @@ -613,8 +617,6 @@ SET (hs_SRCS src/nfa/mcclellan_internal.h src/nfa/mcclellancompile.cpp src/nfa/mcclellancompile.h - src/nfa/mcclellancompile_accel.cpp - src/nfa/mcclellancompile_accel.h src/nfa/mcclellancompile_util.cpp src/nfa/mcclellancompile_util.h src/nfa/limex_compile.cpp diff --git a/src/nfa/mcclellancompile_accel.cpp b/src/nfa/accel_dfa_build_strat.cpp old mode 100644 new mode 100755 similarity index 58% rename from 
src/nfa/mcclellancompile_accel.cpp rename to src/nfa/accel_dfa_build_strat.cpp index c5325fcc..ba21adc7 --- a/src/nfa/mcclellancompile_accel.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,18 +26,20 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "mcclellancompile_accel.h" - -#include "mcclellancompile_util.h" +#include "accel_dfa_build_strat.h" +#include "accel.h" #include "grey.h" #include "nfagraph/ng_limex_accel.h" +#include "shufticompile.h" +#include "trufflecompile.h" #include "util/charreach.h" #include "util/container.h" #include "util/dump_charclass.h" +#include "util/verify_types.h" -#include #include +#include #define PATHS_LIMIT 500 @@ -46,14 +48,13 @@ using namespace std; namespace ue2 { namespace { - struct path { vector reach; dstate_id_t dest = DEAD_STATE; - explicit path(dstate_id_t base) : dest(base) {} + explicit path(dstate_id_t base) : dest(base) { + } +}; }; - -} static UNUSED string describeClasses(const vector &v) { @@ -85,8 +86,8 @@ bool is_useful_path(const vector &good, const path &p) { goto next; } } - DEBUG_PRINTF("better: [%s] -> %u\n", - describeClasses(g.reach).c_str(), g.dest); + DEBUG_PRINTF("better: [%s] -> %u\n", describeClasses(g.reach).c_str(), + g.dest); return false; next:; @@ -106,8 +107,7 @@ path append(const path &orig, const CharReach &cr, u32 new_dest) { static void extend(const raw_dfa &rdfa, const path &p, - map > &all, - vector &out) { + map> &all, vector &out) { dstate s = rdfa.states[p.dest]; if (!p.reach.empty() && p.reach.back().none()) { @@ -147,17 +147,17 @@ void extend(const raw_dfa &rdfa, const path &p, } DEBUG_PRINTF("----good: [%s] -> %u\n", - describeClasses(pp.reach).c_str(), pp.dest); + describeClasses(pp.reach).c_str(), pp.dest); all[e.first].push_back(pp); out.push_back(pp); } } static -vector > generate_paths(const raw_dfa &rdfa, dstate_id_t base, - u32 len) { - vector paths{ path(base) }; - map > all; +vector> generate_paths(const raw_dfa &rdfa, + dstate_id_t base, u32 len) { + vector paths{path(base)}; + map> all; all[base].push_back(path(base)); for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) { vector next_gen; @@ -170,7 +170,7 @@ vector > generate_paths(const raw_dfa &rdfa, dstate_id_t base, dump_paths(paths); - vector > rv; + vector> rv; for (auto &p : paths) { rv.push_back(move(p.reach)); } @@ -181,16 +181,58 @@ static AccelScheme look_for_offset_accel(const raw_dfa &rdfa, dstate_id_t base, u32 max_allowed_accel_offset) { DEBUG_PRINTF("looking for accel for %hu\n", base); - vector > paths = generate_paths(rdfa, base, - max_allowed_accel_offset + 1); + vector> paths = + generate_paths(rdfa, base, max_allowed_accel_offset + 1); AccelScheme as = findBestAccelScheme(paths, CharReach(), true); DEBUG_PRINTF("found %s + %u\n", describeClass(as.cr).c_str(), as.offset); return as; } +static UNUSED +bool better(const AccelScheme &a, const AccelScheme &b) { + if (!a.double_byte.empty() && b.double_byte.empty()) { + return true; + } + + if (!b.double_byte.empty()) { + return false; + } + + return a.cr.count() < b.cr.count(); +} + +static +vector reverse_alpha_remapping(const raw_dfa &rdfa) { + vector rv(rdfa.alpha_size - 1); /* TOP not required */ + + for (u32 i = 0; i < N_CHARS; i++) { + rv.at(rdfa.alpha_remap[i]).set(i); + } + + return rv; +} + +static +bool 
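
A note on the data model used by generate_paths() above: a path records, per scanned byte, the set of input characters consumed along one walk out of the base state, so the result handed to findBestAccelScheme() is a vector of CharReach sequences, with the frontier capped at PATHS_LIMIT (500) and dominated paths pruned by is_useful_path(). A hand-built illustration of one such path, not taken from the patch:

    // For a toy DFA matching /ab[cd]/ from its start state, the single
    // surviving path of depth 3 is the class sequence {a}, {b}, {c,d}.
    // CharReach is the real ue2 class type; the construction below is by
    // hand purely for clarity.
    #include <vector>
    // assumes "util/charreach.h" for ue2::CharReach

    std::vector<std::vector<ue2::CharReach>> example_paths() {
        std::vector<ue2::CharReach> p;
        p.push_back(ue2::CharReach('a')); // state 0 -> 1 on 'a'
        p.push_back(ue2::CharReach('b')); // state 1 -> 2 on 'b'
        ue2::CharReach cd;
        cd.set('c');
        cd.set('d');
        p.push_back(cd);                  // state 2 -> accept on [cd]
        return {p}; // findBestAccelScheme() scores byte classes and offsets
    }
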
double_byte_ok(const AccelScheme &info) { + return !info.double_byte.empty() && + info.double_cr.count() < info.double_byte.size() && + info.double_cr.count() <= 2 && !info.double_byte.empty(); +} + +static +bool has_self_loop(dstate_id_t s, const raw_dfa &raw) { + u16 top_remap = raw.alpha_remap[TOP]; + for (u32 i = 0; i < raw.states[s].next.size(); i++) { + if (i != top_remap && raw.states[s].next[i] == s) { + return true; + } + } + return false; +} + static vector find_nonexit_symbols(const raw_dfa &rdfa, - const CharReach &escape) { + const CharReach &escape) { set rv; CharReach nonexit = ~escape; for (auto i = nonexit.find_first(); i != CharReach::npos; @@ -201,9 +243,58 @@ vector find_nonexit_symbols(const raw_dfa &rdfa, return vector(rv.begin(), rv.end()); } +static +dstate_id_t get_sds_or_proxy(const raw_dfa &raw) { + if (raw.start_floating != DEAD_STATE) { + DEBUG_PRINTF("has floating start\n"); + return raw.start_floating; + } + + DEBUG_PRINTF("looking for SDS proxy\n"); + + dstate_id_t s = raw.start_anchored; + + if (has_self_loop(s, raw)) { + return s; + } + + u16 top_remap = raw.alpha_remap[TOP]; + + ue2::unordered_set seen; + while (true) { + seen.insert(s); + DEBUG_PRINTF("basis %hu\n", s); + + /* check if we are connected to a state with a self loop */ + for (u32 i = 0; i < raw.states[s].next.size(); i++) { + dstate_id_t t = raw.states[s].next[i]; + if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) { + return t; + } + } + + /* find a neighbour to use as a basis for looking for the sds proxy */ + dstate_id_t t = DEAD_STATE; + for (u32 i = 0; i < raw.states[s].next.size(); i++) { + dstate_id_t tt = raw.states[s].next[i]; + if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) { + t = tt; + break; + } + } + + if (t == DEAD_STATE) { + /* we were unable to find a state to use as a SDS proxy */ + return DEAD_STATE; + } + + s = t; + } +} + static set find_region(const raw_dfa &rdfa, dstate_id_t base, - const AccelScheme &ei) { + const AccelScheme &ei) { DEBUG_PRINTF("looking for region around %hu\n", base); set region = {base}; @@ -236,98 +327,10 @@ set find_region(const raw_dfa &rdfa, dstate_id_t base, return region; } -static -bool better(const AccelScheme &a, const AccelScheme &b) { - if (!a.double_byte.empty() && b.double_byte.empty()) { - return true; - } - - if (!b.double_byte.empty()) { - return false; - } - - return a.cr.count() < b.cr.count(); -} - -static -vector reverse_alpha_remapping(const raw_dfa &rdfa) { - vector rv(rdfa.alpha_size - 1); /* TOP not required */ - - for (u32 i = 0; i < N_CHARS; i++) { - rv.at(rdfa.alpha_remap[i]).set(i); - } - - return rv; -} - -map populateAccelerationInfo(const raw_dfa &rdfa, - const dfa_build_strat &strat, - const Grey &grey) { - map rv; - if (!grey.accelerateDFA) { - return rv; - } - - dstate_id_t sds_proxy = get_sds_or_proxy(rdfa); - DEBUG_PRINTF("sds %hu\n", sds_proxy); - - for (size_t i = 0; i < rdfa.states.size(); i++) { - if (i == DEAD_STATE) { - continue; - } - - /* Note on report acceleration states: While we can't accelerate while we - * are spamming out callbacks, the QR code paths don't raise reports - * during scanning so they can accelerate report states. */ - if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) { - continue; - } - - size_t single_limit = i == sds_proxy ? 
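
get_sds_or_proxy() above deserves a gloss: when the DFA has no floating start state, the walk hunts for the nearest state with a non-TOP self-loop reachable from the anchored start, and that state then stands in for the start-of-data state when acceleration budgets are assigned later. A compressed model of the walk follows; it folds the successor-first loop check into a plain scan and ignores the TOP remapping, so it is illustrative only:

    #include <cstdint>
    #include <set>
    #include <vector>

    typedef uint16_t toy_state_id;
    static const toy_state_id TOY_DEAD = 0;

    static bool toy_self_loop(toy_state_id s,
                              const std::vector<std::vector<toy_state_id>> &next) {
        for (toy_state_id t : next[s]) {
            if (t == s) return true;
        }
        return false;
    }

    static toy_state_id toy_sds_proxy(toy_state_id start,
                                      const std::vector<std::vector<toy_state_id>> &next) {
        std::set<toy_state_id> seen;
        toy_state_id s = start;
        while (true) {
            seen.insert(s);
            if (toy_self_loop(s, next)) return s; // proxy found
            toy_state_id step = TOY_DEAD;
            for (toy_state_id t : next[s]) {      // pick an unseen neighbour
                if (t != TOY_DEAD && !seen.count(t)) { step = t; break; }
            }
            if (step == TOY_DEAD) return TOY_DEAD; // no proxy exists
            s = step;
        }
    }
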
ACCEL_DFA_MAX_FLOATING_STOP_CHAR - : ACCEL_DFA_MAX_STOP_CHAR; - DEBUG_PRINTF("inspecting %zu/%hu: %zu\n", i, sds_proxy, single_limit); - - AccelScheme ei = strat.find_escape_strings(i); - if (ei.cr.count() > single_limit) { - DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i, - ei.cr.count()); - continue; - } - - DEBUG_PRINTF("state %zu should be accelerable %zu\n", - i, ei.cr.count()); - - rv[i] = ei; - } - - /* provide accleration states to states in the region of sds */ - if (contains(rv, sds_proxy)) { - AccelScheme sds_ei = rv[sds_proxy]; - sds_ei.double_byte.clear(); /* region based on single byte scheme - * may differ from double byte */ - DEBUG_PRINTF("looking to expand offset accel to nearby states, %zu\n", - sds_ei.cr.count()); - auto sds_region = find_region(rdfa, sds_proxy, sds_ei); - for (auto s : sds_region) { - if (!contains(rv, s) || better(sds_ei, rv[s])) { - rv[s] = sds_ei; - } - } - } - - return rv; -} - -static -bool double_byte_ok(const AccelScheme &info) { - return !info.double_byte.empty() - && info.double_cr.count() < info.double_byte.size() - && info.double_cr.count() <= 2 && !info.double_byte.empty(); -} - -AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx, - u32 max_allowed_accel_offset) { +AccelScheme +accel_dfa_build_strat::find_escape_strings(dstate_id_t this_idx) const { AccelScheme rv; + const raw_dfa &rdfa = get_raw(); rv.cr.clear(); rv.offset = 0; const dstate &raw = rdfa.states[this_idx]; @@ -354,7 +357,7 @@ AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx if (!raw_next.reports.empty() && generates_callbacks(rdfa.kind)) { DEBUG_PRINTF("leads to report\n"); - outs2_broken = true; /* cannot accelerate over reports */ + outs2_broken = true; /* cannot accelerate over reports */ continue; } succs[next_id] |= cr_i; @@ -402,14 +405,12 @@ AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx DEBUG_PRINTF("this %u, sds proxy %hu\n", this_idx, get_sds_or_proxy(rdfa)); DEBUG_PRINTF("broken %d\n", outs2_broken); - if (!double_byte_ok(rv) && !is_triggered(rdfa.kind) - && this_idx == rdfa.start_floating - && this_idx != DEAD_STATE) { + if (!double_byte_ok(rv) && !is_triggered(rdfa.kind) && + this_idx == rdfa.start_floating && this_idx != DEAD_STATE) { DEBUG_PRINTF("looking for offset accel at %u\n", this_idx); - auto offset = look_for_offset_accel(rdfa, this_idx, - max_allowed_accel_offset); - DEBUG_PRINTF("width %zu vs %zu\n", offset.cr.count(), - rv.cr.count()); + auto offset = + look_for_offset_accel(rdfa, this_idx, max_allowed_offset_accel()); + DEBUG_PRINTF("width %zu vs %zu\n", offset.cr.count(), rv.cr.count()); if (double_byte_ok(offset) || offset.cr.count() < rv.cr.count()) { DEBUG_PRINTF("using offset accel\n"); rv = offset; @@ -419,4 +420,172 @@ AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, dstate_id_t this_idx return rv; } +void +accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx, + const AccelScheme &info, + void *accel_out) { + AccelAux *accel = (AccelAux *)accel_out; + + DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset, + info.double_offset); + accel->generic.offset = verify_u8(info.offset); + + if (double_byte_ok(info) && info.double_cr.none() && + info.double_byte.size() == 1) { + accel->accel_type = ACCEL_DVERM; + accel->dverm.c1 = info.double_byte.begin()->first; + accel->dverm.c2 = info.double_byte.begin()->second; + accel->dverm.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("state %hu is double 
vermicelli\n", this_idx); + return; + } + + if (double_byte_ok(info) && info.double_cr.none() && + (info.double_byte.size() == 2 || info.double_byte.size() == 4)) { + bool ok = true; + + assert(!info.double_byte.empty()); + u8 firstC = info.double_byte.begin()->first & CASE_CLEAR; + u8 secondC = info.double_byte.begin()->second & CASE_CLEAR; + + for (const pair &p : info.double_byte) { + if ((p.first & CASE_CLEAR) != firstC || + (p.second & CASE_CLEAR) != secondC) { + ok = false; + break; + } + } + + if (ok) { + accel->accel_type = ACCEL_DVERM_NOCASE; + accel->dverm.c1 = firstC; + accel->dverm.c2 = secondC; + accel->dverm.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); + return; + } + + u8 m1; + u8 m2; + if (buildDvermMask(info.double_byte, &m1, &m2)) { + accel->accel_type = ACCEL_DVERM_MASKED; + accel->dverm.offset = verify_u8(info.double_offset); + accel->dverm.c1 = info.double_byte.begin()->first & m1; + accel->dverm.c2 = info.double_byte.begin()->second & m2; + accel->dverm.m1 = m1; + accel->dverm.m2 = m2; + DEBUG_PRINTF( + "building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", + accel->dverm.c1, accel->dverm.c2); + return; + } + } + + if (double_byte_ok(info) && + shuftiBuildDoubleMasks(info.double_cr, info.double_byte, + &accel->dshufti.lo1, &accel->dshufti.hi1, + &accel->dshufti.lo2, &accel->dshufti.hi2)) { + accel->accel_type = ACCEL_DSHUFTI; + accel->dshufti.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("state %hu is double shufti\n", this_idx); + return; + } + + if (info.cr.none()) { + accel->accel_type = ACCEL_RED_TAPE; + DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape" + " from which there is no escape\n", + this_idx); + return; + } + + if (info.cr.count() == 1) { + accel->accel_type = ACCEL_VERM; + accel->verm.c = info.cr.find_first(); + DEBUG_PRINTF("state %hu is vermicelli\n", this_idx); + return; + } + + if (info.cr.count() == 2 && info.cr.isCaselessChar()) { + accel->accel_type = ACCEL_VERM_NOCASE; + accel->verm.c = info.cr.find_first() & CASE_CLEAR; + DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx); + return; + } + + if (info.cr.count() > max_floating_stop_char()) { + accel->accel_type = ACCEL_NONE; + DEBUG_PRINTF("state %hu is too broad\n", this_idx); + return; + } + + accel->accel_type = ACCEL_SHUFTI; + if (-1 != shuftiBuildMasks(info.cr, &accel->shufti.lo, &accel->shufti.hi)) { + DEBUG_PRINTF("state %hu is shufti\n", this_idx); + return; + } + + assert(!info.cr.none()); + accel->accel_type = ACCEL_TRUFFLE; + truffleBuildMasks(info.cr, &accel->truffle.mask1, &accel->truffle.mask2); + DEBUG_PRINTF("state %hu is truffle\n", this_idx); } + +map +accel_dfa_build_strat::getAccelInfo(const Grey &grey) { + map rv; + raw_dfa &rdfa = get_raw(); + if (!grey.accelerateDFA) { + return rv; + } + + dstate_id_t sds_proxy = get_sds_or_proxy(rdfa); + DEBUG_PRINTF("sds %hu\n", sds_proxy); + + for (size_t i = 0; i < rdfa.states.size(); i++) { + if (i == DEAD_STATE) { + continue; + } + + /* Note on report acceleration states: While we can't accelerate while + * we + * are spamming out callbacks, the QR code paths don't raise reports + * during scanning so they can accelerate report states. */ + if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) { + continue; + } + + size_t single_limit = + i == sds_proxy ? 
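
buildAccel() above is a ladder from cheapest scheme to most general: the double-byte vermicelli variants first, then double shufti, then the single-byte schemes (red tape for an empty escape class, vermicelli and its caseless twin, shufti when masks can be built, truffle as the unconditional fallback). Some worked examples of how escape classes map to schemes, with invented classes:

    // "Escape class" is AccelScheme::cr, the set of bytes that leave the
    // accelerable region; smaller classes admit cheaper scan loops.
    //
    //   cr = {}              -> ACCEL_RED_TAPE    (nothing escapes)
    //   cr = {'\n'}          -> ACCEL_VERM        (single-byte vermicelli)
    //   cr = {'a','A'}       -> ACCEL_VERM_NOCASE (caseless pair)
    //   cr = [a-f]           -> ACCEL_SHUFTI      (small class, masks build)
    //   cr = awkward/sparse  -> ACCEL_TRUFFLE     (always works, slower)
    //
    // A toy classifier restating just the single-byte rungs of the ladder:
    #include <cstddef>
    // assumes "util/charreach.h" for ue2::CharReach

    static const char *toy_single_scheme(const ue2::CharReach &cr, size_t limit) {
        if (cr.none())            return "ACCEL_RED_TAPE";
        if (cr.count() == 1)      return "ACCEL_VERM";
        if (cr.count() == 2 && cr.isCaselessChar()) return "ACCEL_VERM_NOCASE";
        if (cr.count() > limit)   return "ACCEL_NONE (class too broad)";
        return "ACCEL_SHUFTI if masks build, else ACCEL_TRUFFLE";
    }
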
max_floating_stop_char() : max_stop_char(); + DEBUG_PRINTF("inspecting %zu/%hu: %zu\n", i, sds_proxy, single_limit); + + AccelScheme ei = find_escape_strings(i); + if (ei.cr.count() > single_limit) { + DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i, + ei.cr.count()); + continue; + } + + DEBUG_PRINTF("state %zu should be accelerable %zu\n", i, ei.cr.count()); + + rv[i] = ei; + } + + /* provide accleration states to states in the region of sds */ + if (contains(rv, sds_proxy)) { + AccelScheme sds_ei = rv[sds_proxy]; + sds_ei.double_byte.clear(); /* region based on single byte scheme + * may differ from double byte */ + DEBUG_PRINTF("looking to expand offset accel to nearby states, %zu\n", + sds_ei.cr.count()); + auto sds_region = find_region(rdfa, sds_proxy, sds_ei); + for (auto s : sds_region) { + if (!contains(rv, s) || better(sds_ei, rv[s])) { + rv[s] = sds_ei; + } + } + } + + return rv; +} +}; diff --git a/src/nfa/mcclellancompile_accel.h b/src/nfa/accel_dfa_build_strat.h old mode 100644 new mode 100755 similarity index 60% rename from src/nfa/mcclellancompile_accel.h rename to src/nfa/accel_dfa_build_strat.h index 427267d7..3cfaf272 --- a/src/nfa/mcclellancompile_accel.h +++ b/src/nfa/accel_dfa_build_strat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,36 +26,35 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef MCCLELLANCOMPILE_ACCEL_H -#define MCCLELLANCOMPILE_ACCEL_H +#ifndef ACCEL_DFA_BUILD_STRAT_H +#define ACCEL_DFA_BUILD_STRAT_H -#include "mcclellancompile.h" +#include "rdfa.h" +#include "dfa_build_strat.h" +#include "ue2common.h" +#include "util/accel_scheme.h" #include namespace ue2 { +class ReportManager; struct Grey; -#define ACCEL_DFA_MAX_OFFSET_DEPTH 4 +class accel_dfa_build_strat : public dfa_build_strat { +public: + explicit accel_dfa_build_strat(const ReportManager &rm_in) + : dfa_build_strat(rm_in) {} + virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const; + virtual size_t accelSize(void) const = 0; + virtual u32 max_allowed_offset_accel() const = 0; + virtual u32 max_stop_char() const = 0; + virtual u32 max_floating_stop_char() const = 0; + virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info, + void *accel_out); + virtual std::map getAccelInfo(const Grey &grey); +}; -/** Maximum tolerated number of escape character from an accel state. - * This is larger than nfa, as we don't have a budget and the nfa cheats on stop - * characters for sets of states */ -#define ACCEL_DFA_MAX_STOP_CHAR 160 +} // namespace ue2 -/** Maximum tolerated number of escape character from a sds accel state. Larger - * than normal states as accelerating sds is important. 
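
The header above completes the strategy split: accel_dfa_build_strat owns the shared analysis (find_escape_strings, buildAccel, getAccelInfo), while each concrete DFA supplies its raw_dfa, its accel sizing and the three tuning limits. A hypothetical minimal subclass, to show the required surface; the limit values are invented and the template arguments are written out where the listing above elides them:

    #include "accel_dfa_build_strat.h"
    // assumes "accel.h" for AccelAux
    #include <memory>
    #include <vector>

    namespace ue2 {

    class toy_build_strat : public accel_dfa_build_strat {
    public:
        toy_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in)
            : accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {}

        raw_dfa &get_raw() const override { return rdfa; }
        size_t accelSize(void) const override { return sizeof(AccelAux); }
        u32 max_allowed_offset_accel() const override { return 4; }
        u32 max_stop_char() const override { return 160; }
        u32 max_floating_stop_char() const override { return 192; }

        std::unique_ptr<raw_report_info>
        gatherReports(std::vector<u32> &reports, std::vector<u32> &reports_eod,
                      u8 *isSingleReport, ReportID *arbReport) const override {
            // dfa_build_strat declares this pure but also provides a shared
            // definition (see dfa_build_strat.cpp below), so a qualified
            // call reuses it.
            return dfa_build_strat::gatherReports(reports, reports_eod,
                                                  isSingleReport, arbReport);
        }

    private:
        raw_dfa &rdfa;
    };

    } // namespace ue2
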
Matches NFA value */ -#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192 - -std::map populateAccelerationInfo(const raw_dfa &rdfa, - const dfa_build_strat &strat, - const Grey &grey); - -AccelScheme find_mcclellan_escape_info(const raw_dfa &rdfa, - dstate_id_t this_idx, - u32 max_allowed_accel_offset); - -} - -#endif +#endif // ACCEL_DFA_BUILD_STRAT_H diff --git a/src/nfa/dfa_build_strat.cpp b/src/nfa/dfa_build_strat.cpp new file mode 100755 index 00000000..1d31feb1 --- /dev/null +++ b/src/nfa/dfa_build_strat.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "dfa_build_strat.h" + +#include "accel.h" +#include "accelcompile.h" +#include "grey.h" +#include "mcclellan_internal.h" +#include "mcclellancompile_util.h" +#include "nfa_internal.h" +#include "shufticompile.h" +#include "trufflecompile.h" +#include "ue2common.h" +#include "util/alloc.h" +#include "util/bitutils.h" +#include "util/charreach.h" +#include "util/compare.h" +#include "util/compile_context.h" +#include "util/container.h" +#include "util/make_unique.h" +#include "util/order_check.h" +#include "util/report_manager.h" +#include "util/ue2_containers.h" +#include "util/unaligned.h" +#include "util/verify_types.h" + +#include + +using namespace std; + +namespace ue2 { + +// prevent weak vtables for raw_report_info, dfa_build_strat and raw_dfa +raw_report_info::~raw_report_info() { +} + +dfa_build_strat::~dfa_build_strat() { +} + +raw_dfa::~raw_dfa() { +} + +namespace { + +struct raw_report_list { + flat_set reports; + + raw_report_list(const flat_set &reports_in, + const ReportManager &rm, bool do_remap) { + if (do_remap) { + for (auto &id : reports_in) { + reports.insert(rm.getProgramOffset(id)); + } + } else { + reports = reports_in; + } + } + + bool operator<(const raw_report_list &b) const { + return reports < b.reports; + } +}; + +struct raw_report_info_impl : public raw_report_info { + vector rl; + u32 getReportListSize() const override; + size_t size() const override; + void fillReportLists(NFA *n, size_t base_offset, + std::vector &ro /* out */) const override; +}; +} + +unique_ptr +dfa_build_strat::gatherReports(vector &reports, vector &reports_eod, + u8 *isSingleReport, ReportID *arbReport) const { + auto &rdfa = get_raw(); + DEBUG_PRINTF("gathering reports\n"); + + const bool remap_reports = has_managed_reports(rdfa.kind); + + auto ri = ue2::make_unique(); + map rev; + + for (const dstate &s : rdfa.states) { + if (s.reports.empty()) { + reports.push_back(MO_INVALID_IDX); + continue; + } + + raw_report_list rrl(s.reports, rm, remap_reports); + DEBUG_PRINTF("non empty r\n"); + if (rev.find(rrl) != rev.end()) { + reports.push_back(rev[rrl]); + } else { + DEBUG_PRINTF("adding to rl %zu\n", ri->size()); + rev[rrl] = ri->size(); + reports.push_back(ri->size()); + ri->rl.push_back(rrl); + } + } + + for (const dstate &s : rdfa.states) { + if (s.reports_eod.empty()) { + reports_eod.push_back(MO_INVALID_IDX); + continue; + } + + DEBUG_PRINTF("non empty r eod\n"); + raw_report_list rrl(s.reports_eod, rm, remap_reports); + if (rev.find(rrl) != rev.end()) { + reports_eod.push_back(rev[rrl]); + continue; + } + + DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size()); + rev[rrl] = ri->size(); + reports_eod.push_back(ri->size()); + ri->rl.push_back(rrl); + } + + assert(!ri->rl.empty()); /* all components should be able to generate + reports */ + if (!ri->rl.empty()) { + *arbReport = *ri->rl.begin()->reports.begin(); + } else { + *arbReport = 0; + } + + /* if we have only a single report id generated from all accepts (not eod) + * we can take some short cuts */ + set reps; + + for (u32 rl_index : reports) { + if (rl_index == MO_INVALID_IDX) { + continue; + } + assert(rl_index < ri->size()); + insert(&reps, ri->rl[rl_index].reports); + } + + if (reps.size() == 1) { + *isSingleReport = 1; + *arbReport = *reps.begin(); + DEBUG_PRINTF("single -- %u\n", *arbReport); + } else { + *isSingleReport = 0; + } + + return move(ri); +} + +u32 raw_report_info_impl::getReportListSize() const { + u32 rv = 0; + + for (const auto &reps : rl) { + rv += sizeof(report_list); + rv += 
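
dfa_build_strat::gatherReports() above deduplicates report lists across states: each state's flat_set of reports (optionally remapped to program offsets) keys a reverse map, so identical sets share a single index into the list table, and a single-report shortcut is detected afterwards. The dedup idiom in isolation, with stand-in types:

    // States with identical report sets share one list index. The real code
    // keys the map on raw_report_list and additionally handles EOD lists
    // and the single-report fast path.
    #include <map>
    #include <set>
    #include <vector>

    typedef unsigned ToyReportID; // stand-in for ReportID

    static std::vector<size_t>
    dedup_report_lists(const std::vector<std::set<ToyReportID>> &per_state,
                       std::vector<std::set<ToyReportID>> &table /* out */) {
        std::map<std::set<ToyReportID>, size_t> rev;
        std::vector<size_t> index_of_state;
        for (const auto &reports : per_state) {
            auto it = rev.find(reports);
            if (it == rev.end()) {
                it = rev.emplace(reports, table.size()).first;
                table.push_back(reports);
            }
            index_of_state.push_back(it->second);
        }
        return index_of_state;
    }
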
sizeof(ReportID) * reps.reports.size(); + } + + return rv; +} + +size_t raw_report_info_impl::size() const { + return rl.size(); +} + +void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset, + vector &ro) const { + for (const auto &reps : rl) { + ro.push_back(base_offset); + + report_list *p = (report_list *)((char *)n + base_offset); + + u32 i = 0; + for (const ReportID report : reps.reports) { + p->report[i++] = report; + } + p->count = verify_u32(reps.reports.size()); + + base_offset += sizeof(report_list); + base_offset += sizeof(ReportID) * reps.reports.size(); + } +} + +} // namespace ue2 diff --git a/src/nfa/dfa_build_strat.h b/src/nfa/dfa_build_strat.h new file mode 100644 index 00000000..cda00162 --- /dev/null +++ b/src/nfa/dfa_build_strat.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DFA_BUILD_STRAT_H +#define DFA_BUILD_STRAT_H + +#include "rdfa.h" +#include "ue2common.h" + +#include +#include + +struct NFA; + +namespace ue2 { + +class ReportManager; + +struct raw_report_info { + virtual ~raw_report_info(); + virtual u32 getReportListSize() const = 0; /* in bytes */ + virtual size_t size() const = 0; /* number of lists */ + virtual void fillReportLists(NFA *n, size_t base_offset, + std::vector &ro /* out */) const = 0; +}; + +class dfa_build_strat { +public: + explicit dfa_build_strat(const ReportManager &rm_in) : rm(rm_in) {} + virtual ~dfa_build_strat(); + virtual raw_dfa &get_raw() const = 0; + virtual std::unique_ptr gatherReports( + std::vector &reports /* out */, + std::vector &reports_eod /* out */, + u8 *isSingleReport /* out */, + ReportID *arbReport /* out */) const = 0; +protected: + const ReportManager &rm; +}; + +} // namespace ue2 + +#endif // DFA_BUILD_STRAT_H diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index a9fbce94..c26fb904 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -32,7 +32,6 @@ #include "accelcompile.h" #include "grey.h" #include "mcclellan_internal.h" -#include "mcclellancompile_accel.h" #include "mcclellancompile_util.h" #include "nfa_internal.h" #include "shufticompile.h" @@ -65,6 +64,17 @@ using namespace std; using boost::adaptors::map_keys; +#define ACCEL_DFA_MAX_OFFSET_DEPTH 4 + +/** Maximum tolerated number of escape character from an accel state. + * This is larger than nfa, as we don't have a budget and the nfa cheats on stop + * characters for sets of states */ +#define ACCEL_DFA_MAX_STOP_CHAR 160 + +/** Maximum tolerated number of escape character from a sds accel state. Larger + * than normal states as accelerating sds is important. 
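
For reference, fillReportLists() in dfa_build_strat.cpp above serializes each deduplicated list into the engine blob as a count followed by the ReportIDs, recording each list's byte offset in ro; getReportListSize() is the matching sizing pass. The layout restated with a toy struct (the real report_list lives in mcclellan_internal.h; the flexible array member is the same C-style convention used across the engine headers, a compiler extension in C++):

    #include <cstdint>

    struct toy_report_list {
        uint32_t count;
        uint32_t report[]; // flexible array member, filled in place
    };

    // Matches the accounting in getReportListSize(): header plus one
    // ReportID per entry, lists packed back to back from base_offset.
    static size_t toy_report_list_bytes(uint32_t count) {
        return sizeof(toy_report_list) + count * sizeof(uint32_t);
    }
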
Matches NFA value */ +#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192 + namespace ue2 { namespace /* anon */ { @@ -75,7 +85,7 @@ struct dstate_extra { }; struct dfa_info { - dfa_build_strat &strat; + accel_dfa_build_strat &strat; raw_dfa &raw; vector &states; vector extra; @@ -85,7 +95,7 @@ struct dfa_info { u8 getAlphaShift() const; - explicit dfa_info(dfa_build_strat &s) + explicit dfa_info(accel_dfa_build_strat &s) : strat(s), raw(s.get_raw()), states(raw.states), @@ -128,13 +138,6 @@ mstate_aux *getAux(NFA *n, dstate_id_t i) { return aux; } -static -bool double_byte_ok(const AccelScheme &info) { - return !info.double_byte.empty() - && info.double_cr.count() < info.double_byte.size() - && info.double_cr.count() <= 2 && !info.double_byte.empty(); -} - static void markEdges(NFA *n, u16 *succ_table, const dfa_info &info) { assert((size_t)succ_table % 2 == 0); @@ -190,120 +193,12 @@ u32 mcclellan_build_strat::max_allowed_offset_accel() const { return ACCEL_DFA_MAX_OFFSET_DEPTH; } -AccelScheme mcclellan_build_strat::find_escape_strings(dstate_id_t this_idx) - const { - return find_mcclellan_escape_info(rdfa, this_idx, - max_allowed_offset_accel()); +u32 mcclellan_build_strat::max_stop_char() const { + return ACCEL_DFA_MAX_STOP_CHAR; } -/** builds acceleration schemes for states */ -void mcclellan_build_strat::buildAccel(UNUSED dstate_id_t this_idx, - const AccelScheme &info, - void *accel_out) { - AccelAux *accel = (AccelAux *)accel_out; - - DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset, - info.double_offset); - accel->generic.offset = verify_u8(info.offset); - - if (double_byte_ok(info) && info.double_cr.none() - && info.double_byte.size() == 1) { - accel->accel_type = ACCEL_DVERM; - accel->dverm.c1 = info.double_byte.begin()->first; - accel->dverm.c2 = info.double_byte.begin()->second; - accel->dverm.offset = verify_u8(info.double_offset); - DEBUG_PRINTF("state %hu is double vermicelli\n", this_idx); - return; - } - - if (double_byte_ok(info) && info.double_cr.none() - && (info.double_byte.size() == 2 || info.double_byte.size() == 4)) { - bool ok = true; - - assert(!info.double_byte.empty()); - u8 firstC = info.double_byte.begin()->first & CASE_CLEAR; - u8 secondC = info.double_byte.begin()->second & CASE_CLEAR; - - for (const pair &p : info.double_byte) { - if ((p.first & CASE_CLEAR) != firstC - || (p.second & CASE_CLEAR) != secondC) { - ok = false; - break; - } - } - - if (ok) { - accel->accel_type = ACCEL_DVERM_NOCASE; - accel->dverm.c1 = firstC; - accel->dverm.c2 = secondC; - accel->dverm.offset = verify_u8(info.double_offset); - DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); - return; - } - - u8 m1; - u8 m2; - if (buildDvermMask(info.double_byte, &m1, &m2)) { - accel->accel_type = ACCEL_DVERM_MASKED; - accel->dverm.offset = verify_u8(info.double_offset); - accel->dverm.c1 = info.double_byte.begin()->first & m1; - accel->dverm.c2 = info.double_byte.begin()->second & m2; - accel->dverm.m1 = m1; - accel->dverm.m2 = m2; - DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", - accel->dverm.c1, accel->dverm.c2); - return; - } - } - - if (double_byte_ok(info) - && shuftiBuildDoubleMasks(info.double_cr, info.double_byte, - &accel->dshufti.lo1, &accel->dshufti.hi1, - &accel->dshufti.lo2, &accel->dshufti.hi2)) { - accel->accel_type = ACCEL_DSHUFTI; - accel->dshufti.offset = verify_u8(info.double_offset); - DEBUG_PRINTF("state %hu is double shufti\n", this_idx); - return; - } - - if (info.cr.none()) { - accel->accel_type = ACCEL_RED_TAPE; - 
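
The two limits re-homed above are budgets on the escape class measured against the full 256-byte alphabet: an ordinary state may keep up to 160 escape bytes (62.5%) and still accelerate, while the SDS proxy is allowed 192 (75%) because scan time concentrates at the floating start. How getAccelInfo() applies them, reduced to a predicate:

    // cf. getAccelInfo(): a state is accelerable only if its escape class is
    // no wider than the limit selected by whether it is the SDS proxy.
    #include <cstddef>

    static bool toy_accelerable(size_t escape_count, bool is_sds_proxy) {
        const size_t limit = is_sds_proxy ? 192 /* FLOATING_STOP_CHAR */
                                          : 160 /* STOP_CHAR */;
        return escape_count <= limit;
    }
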
DEBUG_PRINTF("state %hu is a dead end full of bureaucratic red tape" - " from which there is no escape\n", this_idx); - return; - } - - if (info.cr.count() == 1) { - accel->accel_type = ACCEL_VERM; - accel->verm.c = info.cr.find_first(); - DEBUG_PRINTF("state %hu is vermicelli\n", this_idx); - return; - } - - if (info.cr.count() == 2 && info.cr.isCaselessChar()) { - accel->accel_type = ACCEL_VERM_NOCASE; - accel->verm.c = info.cr.find_first() & CASE_CLEAR; - DEBUG_PRINTF("state %hu is caseless vermicelli\n", this_idx); - return; - } - - if (info.cr.count() > ACCEL_DFA_MAX_FLOATING_STOP_CHAR) { - accel->accel_type = ACCEL_NONE; - DEBUG_PRINTF("state %hu is too broad\n", this_idx); - return; - } - - accel->accel_type = ACCEL_SHUFTI; - if (-1 != shuftiBuildMasks(info.cr, &accel->shufti.lo, - &accel->shufti.hi)) { - DEBUG_PRINTF("state %hu is shufti\n", this_idx); - return; - } - - assert(!info.cr.none()); - accel->accel_type = ACCEL_TRUFFLE; - truffleBuildMasks(info.cr, &accel->truffle.mask1, &accel->truffle.mask2); - DEBUG_PRINTF("state %hu is truffle\n", this_idx); +u32 mcclellan_build_strat::max_floating_stop_char() const { + return ACCEL_DFA_MAX_FLOATING_STOP_CHAR; } static @@ -343,15 +238,6 @@ void populateBasicInfo(size_t state_size, const dfa_info &info, } } -raw_dfa::~raw_dfa() { -} - -raw_report_info::raw_report_info() { -} - -raw_report_info::~raw_report_info() { -} - namespace { struct raw_report_list { @@ -592,7 +478,7 @@ aligned_unique_ptr mcclellanCompile16(dfa_info &info, auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); map accel_escape_info - = populateAccelerationInfo(info.raw, info.strat, cc.grey); + = info.strat.getAccelInfo(cc.grey); size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16) * count_real_states; @@ -811,7 +697,7 @@ aligned_unique_ptr mcclellanCompile8(dfa_info &info, auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); map accel_escape_info - = populateAccelerationInfo(info.raw, info.strat, cc.grey); + = info.strat.getAccelInfo(cc.grey); size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * info.size(); size_t aux_size = sizeof(mstate_aux) * info.size(); @@ -1053,7 +939,7 @@ bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) { return false; } -aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat, +aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, const CompileContext &cc, set *accel_states) { u16 total_daddy = 0; @@ -1128,7 +1014,4 @@ bool has_accel_dfa(const NFA *nfa) { return m->has_accel; } -dfa_build_strat::~dfa_build_strat() { -} - } // namespace ue2 diff --git a/src/nfa/mcclellancompile.h b/src/nfa/mcclellancompile.h index ba519cac..e5e56cc8 100644 --- a/src/nfa/mcclellancompile.h +++ b/src/nfa/mcclellancompile.h @@ -29,6 +29,7 @@ #ifndef MCCLELLANCOMPILE_H #define MCCLELLANCOMPILE_H +#include "accel_dfa_build_strat.h" #include "rdfa.h" #include "ue2common.h" #include "util/accel_scheme.h" @@ -47,48 +48,20 @@ namespace ue2 { class ReportManager; struct CompileContext; -struct raw_report_info { - raw_report_info(); - virtual ~raw_report_info(); - virtual u32 getReportListSize() const = 0; /* in bytes */ - virtual size_t size() const = 0; /* number of lists */ - virtual void fillReportLists(NFA *n, size_t base_offset, - std::vector &ro /* out */) const = 0; -}; - -class dfa_build_strat { -public: - explicit dfa_build_strat(const ReportManager &rm_in) : rm(rm_in) {} - virtual ~dfa_build_strat(); - virtual raw_dfa &get_raw() const = 0; - 
virtual std::unique_ptr gatherReports( - std::vector &reports /* out */, - std::vector &reports_eod /* out */, - u8 *isSingleReport /* out */, - ReportID *arbReport /* out */) const = 0; - virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const = 0; - virtual size_t accelSize(void) const = 0; - virtual void buildAccel(dstate_id_t this_idx, const AccelScheme &info, - void *accel_out) = 0; -protected: - const ReportManager &rm; -}; - -class mcclellan_build_strat : public dfa_build_strat { +class mcclellan_build_strat : public accel_dfa_build_strat { public: mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in) - : dfa_build_strat(rm_in), rdfa(rdfa_in) {} + : accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {} raw_dfa &get_raw() const override { return rdfa; } std::unique_ptr gatherReports( std::vector &reports /* out */, std::vector &reports_eod /* out */, u8 *isSingleReport /* out */, ReportID *arbReport /* out */) const override; - AccelScheme find_escape_strings(dstate_id_t this_idx) const override; size_t accelSize(void) const override; - void buildAccel(dstate_id_t this_idx,const AccelScheme &info, - void *accel_out) override; - virtual u32 max_allowed_offset_accel() const; + u32 max_allowed_offset_accel() const override; + u32 max_stop_char() const override; + u32 max_floating_stop_char() const override; private: raw_dfa &rdfa; @@ -103,7 +76,7 @@ mcclellanCompile(raw_dfa &raw, const CompileContext &cc, /* used internally by mcclellan/haig/gough compile process */ ue2::aligned_unique_ptr -mcclellanCompile_i(raw_dfa &raw, dfa_build_strat &strat, +mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, const CompileContext &cc, std::set *accel_states = nullptr); diff --git a/src/nfa/mcclellancompile_util.cpp b/src/nfa/mcclellancompile_util.cpp index 2f1ffa02..a61a19ab 100644 --- a/src/nfa/mcclellancompile_util.cpp +++ b/src/nfa/mcclellancompile_util.cpp @@ -336,65 +336,6 @@ size_t hash_dfa(const raw_dfa &rdfa) { return v; } -static -bool has_self_loop(dstate_id_t s, const raw_dfa &raw) { - u16 top_remap = raw.alpha_remap[TOP]; - for (u32 i = 0; i < raw.states[s].next.size(); i++) { - if (i != top_remap && raw.states[s].next[i] == s) { - return true; - } - } - return false; -} - -dstate_id_t get_sds_or_proxy(const raw_dfa &raw) { - if (raw.start_floating != DEAD_STATE) { - DEBUG_PRINTF("has floating start\n"); - return raw.start_floating; - } - - DEBUG_PRINTF("looking for SDS proxy\n"); - - dstate_id_t s = raw.start_anchored; - - if (has_self_loop(s, raw)) { - return s; - } - - u16 top_remap = raw.alpha_remap[TOP]; - - ue2::unordered_set seen; - while (true) { - seen.insert(s); - DEBUG_PRINTF("basis %hu\n", s); - - /* check if we are connected to a state with a self loop */ - for (u32 i = 0; i < raw.states[s].next.size(); i++) { - dstate_id_t t = raw.states[s].next[i]; - if (i != top_remap && t != DEAD_STATE && has_self_loop(t, raw)) { - return t; - } - } - - /* find a neighbour to use as a basis for looking for the sds proxy */ - dstate_id_t t = DEAD_STATE; - for (u32 i = 0; i < raw.states[s].next.size(); i++) { - dstate_id_t tt = raw.states[s].next[i]; - if (i != top_remap && tt != DEAD_STATE && !contains(seen, tt)) { - t = tt; - break; - } - } - - if (t == DEAD_STATE) { - /* we were unable to find a state to use as a SDS proxy */ - return DEAD_STATE; - } - - s = t; - } -} - static bool can_die_early(const raw_dfa &raw, dstate_id_t s, map &visited, u32 age_limit) { diff --git a/src/nfa/mcclellancompile_util.h b/src/nfa/mcclellancompile_util.h index 
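
With this interface in place, mcclellanCompile() plausibly reduces to constructing a mcclellan_build_strat over the caller's raw_dfa and delegating to mcclellanCompile_i(), and Haig/Gough can substitute their own accel_dfa_build_strat the same way. A sketch of that call sequence, not the verbatim wrapper body:

    #include "mcclellancompile.h"

    // Sketch: the strategy object carries policy (limits, reports, accel),
    // while mcclellanCompile_i() owns the bytecode layout. The helper name
    // is invented; accel_states and error handling are omitted.
    ue2::aligned_unique_ptr<NFA>
    build_dfa_engine(ue2::raw_dfa &rdfa, const ue2::CompileContext &cc,
                     const ue2::ReportManager &rm) {
        ue2::mcclellan_build_strat strat(rdfa, rm);
        return ue2::mcclellanCompile_i(rdfa, strat, cc);
    }
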
3d3ee2e7..554c1efd 100644 --- a/src/nfa/mcclellancompile_util.h +++ b/src/nfa/mcclellancompile_util.h @@ -55,8 +55,6 @@ size_t hash_dfa_no_reports(const raw_dfa &rdfa); /** \brief Compute a simple hash of this raw_dfa, including its reports. */ size_t hash_dfa(const raw_dfa &rdfa); -dstate_id_t get_sds_or_proxy(const raw_dfa &raw); - bool can_die_early(const raw_dfa &raw, u32 age_limit); } // namespace ue2 From 6331da4e29a740331f0a450e558b4bc55dce1cf2 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Thu, 10 Mar 2016 09:57:41 +0000 Subject: [PATCH 150/166] dfa: adding new Sheng engine A new shuffle-based DFA engine, complete with acceleration and smallwrite. --- CMakeLists.txt | 11 + src/grey.cpp | 4 + src/grey.h | 2 + src/nfa/mcclellancompile.cpp | 2 +- src/nfa/mcclellancompile.h | 2 +- src/nfa/nfa_api.h | 3 + src/nfa/nfa_api_dispatch.c | 2 + src/nfa/nfa_build_util.cpp | 25 +- src/nfa/nfa_dump_dispatch.cpp | 2 + src/nfa/nfa_internal.h | 14 +- src/nfa/sheng.c | 676 ++++++++++++++++++++++++++++ src/nfa/sheng.h | 61 +++ src/nfa/sheng_defs.h | 353 +++++++++++++++ src/nfa/sheng_impl.h | 97 ++++ src/nfa/sheng_impl4.h | 284 ++++++++++++ src/nfa/sheng_internal.h | 70 +++ src/nfa/shengcompile.cpp | 541 ++++++++++++++++++++++ src/nfa/shengcompile.h | 80 ++++ src/nfa/shengdump.cpp | 265 +++++++++++ src/nfa/shengdump.h | 49 ++ src/nfagraph/ng_limex.cpp | 3 + src/rose/rose_build_bytecode.cpp | 37 +- src/runtime.c | 8 +- src/smallwrite/smallwrite_build.cpp | 25 +- 24 files changed, 2591 insertions(+), 25 deletions(-) create mode 100644 src/nfa/sheng.c create mode 100644 src/nfa/sheng.h create mode 100644 src/nfa/sheng_defs.h create mode 100644 src/nfa/sheng_impl.h create mode 100644 src/nfa/sheng_impl4.h create mode 100644 src/nfa/sheng_internal.h create mode 100644 src/nfa/shengcompile.cpp create mode 100644 src/nfa/shengcompile.h create mode 100644 src/nfa/shengdump.cpp create mode 100644 src/nfa/shengdump.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1719ec2d..8834d4d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -476,6 +476,12 @@ set (hs_exec_SRCS src/nfa/repeat.c src/nfa/repeat.h src/nfa/repeat_internal.h + src/nfa/sheng.c + src/nfa/sheng.h + src/nfa/sheng_defs.h + src/nfa/sheng_impl.h + src/nfa/sheng_impl4.h + src/nfa/sheng_internal.h src/nfa/shufti_common.h src/nfa/shufti.c src/nfa/shufti.h @@ -641,6 +647,9 @@ SET (hs_SRCS src/nfa/repeat_internal.h src/nfa/repeatcompile.cpp src/nfa/repeatcompile.h + src/nfa/sheng_internal.h + src/nfa/shengcompile.cpp + src/nfa/shengcompile.h src/nfa/shufticompile.cpp src/nfa/shufticompile.h src/nfa/tamaramacompile.cpp @@ -927,6 +936,8 @@ set(hs_dump_SRCS src/nfa/nfa_dump_dispatch.cpp src/nfa/nfa_dump_internal.cpp src/nfa/nfa_dump_internal.h + src/nfa/shengdump.cpp + src/nfa/shengdump.h src/nfa/tamarama_dump.cpp src/nfa/tamarama_dump.h src/parser/dump.cpp diff --git a/src/grey.cpp b/src/grey.cpp index bba5198a..bad56b56 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -50,6 +50,7 @@ Grey::Grey(void) : allowLitHaig(true), allowLbr(true), allowMcClellan(true), + allowSheng(true), allowPuff(true), allowLiteral(true), allowRose(true), @@ -127,6 +128,7 @@ Grey::Grey(void) : equivalenceEnable(true), allowSmallWrite(true), // McClellan dfas for small patterns + allowSmallWriteSheng(false), // allow use of Sheng for SMWR smallWriteLargestBuffer(70), // largest buffer that can be // considered a small write @@ -214,6 +216,7 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowLitHaig); G_UPDATE(allowLbr); G_UPDATE(allowMcClellan); + 
G_UPDATE(allowSheng); G_UPDATE(allowPuff); G_UPDATE(allowLiteral); G_UPDATE(allowRose); @@ -290,6 +293,7 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(miracleHistoryBonus); G_UPDATE(equivalenceEnable); G_UPDATE(allowSmallWrite); + G_UPDATE(allowSmallWriteSheng); G_UPDATE(smallWriteLargestBuffer); G_UPDATE(smallWriteLargestBufferBad); G_UPDATE(limitSmallWriteOutfixSize); diff --git a/src/grey.h b/src/grey.h index 1714a0eb..90f5f826 100644 --- a/src/grey.h +++ b/src/grey.h @@ -50,6 +50,7 @@ struct Grey { bool allowLitHaig; bool allowLbr; bool allowMcClellan; + bool allowSheng; bool allowPuff; bool allowLiteral; bool allowRose; @@ -149,6 +150,7 @@ struct Grey { // SmallWrite engine bool allowSmallWrite; + bool allowSmallWriteSheng; u32 smallWriteLargestBuffer; // largest buffer that can be small write u32 smallWriteLargestBufferBad;// largest buffer that can be small write u32 limitSmallWriteOutfixSize; //!< max total size of outfix DFAs diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index c26fb904..09006d5b 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1009,7 +1009,7 @@ u32 mcclellanStartReachSize(const raw_dfa *raw) { return out.count(); } -bool has_accel_dfa(const NFA *nfa) { +bool has_accel_mcclellan(const NFA *nfa) { const mcclellan *m = (const mcclellan *)getImplNfa(nfa); return m->has_accel; } diff --git a/src/nfa/mcclellancompile.h b/src/nfa/mcclellancompile.h index e5e56cc8..e6f548a7 100644 --- a/src/nfa/mcclellancompile.h +++ b/src/nfa/mcclellancompile.h @@ -87,7 +87,7 @@ u32 mcclellanStartReachSize(const raw_dfa *raw); std::set all_reports(const raw_dfa &rdfa); -bool has_accel_dfa(const NFA *nfa); +bool has_accel_mcclellan(const NFA *nfa); } // namespace ue2 diff --git a/src/nfa/nfa_api.h b/src/nfa/nfa_api.h index 9e0b6f89..e3f7f743 100644 --- a/src/nfa/nfa_api.h +++ b/src/nfa/nfa_api.h @@ -127,6 +127,9 @@ char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end); */ char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end); +/** Return value indicating that the engine is dead. */ +#define MO_DEAD 0 + /** Return value indicating that the engine is alive. 
*/ #define MO_ALIVE 1 diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index 789c3014..c67103b3 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -42,6 +42,7 @@ #include "limex.h" #include "mcclellan.h" #include "mpv.h" +#include "sheng.h" #include "tamarama.h" #define DISPATCH_CASE(dc_ltype, dc_ftype, dc_subtype, dc_func_call) \ @@ -69,6 +70,7 @@ DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ + DISPATCH_CASE(SHENG, Sheng, 0, dbnt_func); \ DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ default: \ assert(0); \ diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index ce473196..93376b01 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -30,6 +30,7 @@ #include "limex_internal.h" #include "mcclellancompile.h" +#include "shengcompile.h" #include "nfa_internal.h" #include "repeat_internal.h" #include "ue2common.h" @@ -213,7 +214,7 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) @@ -229,7 +230,7 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) @@ -245,7 +246,7 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) @@ -261,7 +262,7 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; -const nfa_dispatch_fn NFATraits::has_accel = has_accel_dfa; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcclellan; const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; #if defined(DUMP_SUPPORT) @@ -380,6 +381,22 @@ const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = d const char *NFATraits::name = "Lim Bounded Repeat (M)"; #endif +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char 
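
Taken together, the Sheng hunks above enumerate every registration point a new engine needs; condensed here, with the traits piece sketched out:

    // Registration points for a new engine, from the hunks above:
    //   1. an NFAEngineType value (SHENG_NFA_0 in nfa_internal.h), plus the
    //      isShengType()/isDfaType() predicates where applicable
    //   2. DISPATCH_CASE(SHENG, Sheng, 0, ...) in both nfa_api_dispatch.c
    //      and nfa_dump_dispatch.cpp
    //   3. an NFATraits specialization in nfa_build_util.cpp, as below
    //   4. the new sources listed in CMakeLists.txt (runtime and compiler)
    //
    // The traits specialization, reproduced in spirit; template brackets
    // are written out where the listing above elides them:
    template<> struct NFATraits<SHENG_NFA_0> {
        UNUSED static const char *name;
        static const NFACategory category = NFA_OTHER;
        static const u32 stateAlign = 1;
        static const bool fast = true;
        static const nfa_dispatch_fn has_accel;   // -> has_accel_sheng
        static const nfa_dispatch_fn has_repeats; // -> dispatch_false
        static const nfa_dispatch_fn has_repeats_other_than_firsts;
    };
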
*NFATraits::name = "Sheng"; +#endif + template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index cf2aa7f5..388ac003 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -40,6 +40,7 @@ #include "limex.h" #include "mcclellandump.h" #include "mpv_dump.h" +#include "shengdump.h" #include "tamarama_dump.h" #ifndef DUMP_SUPPORT @@ -74,6 +75,7 @@ namespace ue2 { DISPATCH_CASE(LBR, Lbr, Shuf, dbnt_func); \ DISPATCH_CASE(LBR, Lbr, Truf, dbnt_func); \ DISPATCH_CASE(CASTLE, Castle, 0, dbnt_func); \ + DISPATCH_CASE(SHENG, Sheng, 0, dbnt_func); \ DISPATCH_CASE(TAMARAMA, Tamarama, 0, dbnt_func); \ default: \ assert(0); \ diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index a3703cb5..41fee73e 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -67,6 +67,7 @@ enum NFAEngineType { LBR_NFA_Shuf, /**< magic pseudo nfa */ LBR_NFA_Truf, /**< magic pseudo nfa */ CASTLE_NFA_0, /**< magic pseudo nfa */ + SHENG_NFA_0, /**< magic pseudo nfa */ TAMARAMA_NFA_0, /**< magic nfa container */ /** \brief bogus NFA - not used */ INVALID_NFA @@ -146,10 +147,17 @@ static really_inline int isGoughType(u8 t) { return t == GOUGH_NFA_8 || t == GOUGH_NFA_16; } -/** \brief True if the given type (from NFA::type) is a McClellan or Gough DFA. - * */ +/** \brief True if the given type (from NFA::type) is a Sheng DFA. */ +static really_inline int isShengType(u8 t) { + return t == SHENG_NFA_0; +} + +/** + * \brief True if the given type (from NFA::type) is a McClellan, Gough or + * Sheng DFA. + */ static really_inline int isDfaType(u8 t) { - return isMcClellanType(t) || isGoughType(t); + return isMcClellanType(t) || isGoughType(t) || isShengType(t); } /** \brief True if the given type (from NFA::type) is an NFA. */ diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c new file mode 100644 index 00000000..bbbf1f20 --- /dev/null +++ b/src/nfa/sheng.c @@ -0,0 +1,676 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sheng.h" + +#include "accel.h" +#include "sheng_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/join.h" +#include "util/simd_utils.h" + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct sheng *get_sheng(const struct NFA *n) { + return (const struct sheng *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux(const struct sheng *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel(const struct sheng *sh, u8 id) { + const struct sstate_aux *saux = get_aux(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char shengHasAccept(const struct sheng *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireSingleReport(NfaCallback cb, void *ctxt, ReportID r, u64a loc) { + DEBUG_PRINTF("reporting %u\n", r); + if (cb(0, loc, r, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return 
MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux(sh, state); + const struct report_list *rl = eod ? get_eod_rl(sh, aux) : get_rl(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +/* include Sheng function definitions */ +#include "sheng_defs.h" + +static really_inline +char runShengCb(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } else { + if (has_accel) { + rv = sheng4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runShengNm(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, end, + scanned); + } else { + sheng4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, 
end, + scanned); + } + sheng_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runShengSam(const struct sheng *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, + end, scanned); + } else { + if (has_accel) { + rv = sheng4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng(const struct sheng *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng execution in state %u\n", + state & SHENG_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + 
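+    /* one loop, three behaviours:
+     *  - CALLBACK_OUTPUT: scan everything, firing each match through q->cb;
+     *  - STOP_AT_MATCH: rewind the queue and return MO_MATCHES_PENDING as
+     *    soon as an accept state is reached;
+     *  - NO_MATCHES: only advance the state (used when catching up through
+     *    history, and by nfaExecSheng0_QR, which just needs the end state).
+     */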
DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + /* if we're in nomatch mode or if we're scanning history buffer */ + if (mode == NO_MATCHES || + (cur_start < 0 && mode == CALLBACK_OUTPUT)) { + runShengNm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runShengCb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runShengSam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", 
state & SHENG_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + if (can_die) { + return (state & SHENG_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG_STATE_MASK, + new_state & SHENG_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng0_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng\n"); + assert(n->type == SHENG_NFA_0); + const struct sheng *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runShengCb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG_STATE_MASK); + + const struct sstate_aux *aux = get_aux(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports(sh, cb, context, state, end + offset, &cached_accept_state, + &cached_accept_id, 1); + } + + return state & SHENG_STATE_DEAD ? 
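+           /* MO_DEAD tells the caller this engine can never match again */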
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng0_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng0_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng0_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng0_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng0_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept) { + return 0; + } + + return shengHasAccept(sh, aux, report); +} + +char nfaExecSheng0_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + return !!aux->accept; +} + +char nfaExecSheng0_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng *sh = get_sheng(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng0_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng *sh = (const struct sheng *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 1); + } + } + + return 0; +} + +char nfaExecSheng0_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng *sh = get_sheng(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
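+             /* the anchored start state is only valid at stream offset 0 */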
sh->floating: sh->anchored; + return !(*s & SHENG_STATE_DEAD); +} + +char nfaExecSheng0_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng *sh = get_sheng(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng0_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng0_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h new file mode 100644 index 00000000..46ead180 --- /dev/null +++ b/src/nfa/sheng.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SHENG_H_ +#define SHENG_H_ + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +#define nfaExecSheng0_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng0_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng0_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng0_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng0_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng0_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecSheng0_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng0_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng0_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecSheng0_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng0_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng0_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng0_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng0_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h new file mode 100644 index 00000000..26bdbcee --- /dev/null +++ b/src/nfa/sheng_defs.h @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SHENG_DEFS_H +#define SHENG_DEFS_H + +/* + * Utility functions used by various versions of Sheng engine + */ +static really_inline +u8 isDeadState(const u8 a) { + return a & SHENG_STATE_DEAD; +} + +static really_inline +u8 isAcceptState(const u8 a) { + return a & SHENG_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState(const u8 a) { + return a & SHENG_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); +} + +/* these functions should be optimized out, used by NO_MATCHES mode */ +static really_inline +u8 dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c, + UNUSED const u8 d) { + return 0; +} + +static really_inline +u8 dummyFunc(UNUSED const u8 a) { + return 0; +} + +/* + * Sheng function definitions for single byte loops + */ +/* callback output, can die */ +#define SHENG_IMPL sheng_cod +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* callback output, can't die */ +#define SHENG_IMPL sheng_co +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can die */ +#define SHENG_IMPL sheng_samd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can't die */ +#define SHENG_IMPL sheng_sam +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no match, can die */ +#define SHENG_IMPL sheng_nmd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no match, can't die */ +#define SHENG_IMPL sheng_nm +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* + * Sheng function definitions for 4-byte loops + */ +/* callback output, can die, accelerated */ +#define SHENG_IMPL sheng4_coda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* callback output, can die, not accelerated */ +#define SHENG_IMPL sheng4_cod +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef 
STOP_AT_MATCH + +/* callback output, can't die, accelerated */ +#define SHENG_IMPL sheng4_coa +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* callback output, can't die, not accelerated */ +#define SHENG_IMPL sheng4_co +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can die, accelerated */ +#define SHENG_IMPL sheng4_samda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can die, not accelerated */ +#define SHENG_IMPL sheng4_samd +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can't die, accelerated */ +#define SHENG_IMPL sheng4_sama +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* stop at match, can't die, not accelerated */ +#define SHENG_IMPL sheng4_sam +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no-match have interesting func as dummy, and die/accel checks are outer */ + +/* no match, can die, accelerated */ +#define SHENG_IMPL sheng4_nmda +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define INNER_ACCEL_FUNC 
dummyFunc +#define OUTER_ACCEL_FUNC isAccelState +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* no match, can die, not accelerated */ +#define SHENG_IMPL sheng4_nmd +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +/* there is no performance benefit in accelerating a no-match case that can't + * die */ + +/* no match, can't die */ +#define SHENG_IMPL sheng4_nm +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#undef STOP_AT_MATCH + +#endif // SHENG_DEFS_H diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h new file mode 100644 index 00000000..fc3e54aa --- /dev/null +++ b/src/nfa/sheng_impl.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * In order to use this macro, the following things need to be defined: + * + * - SHENG_IMPL (name of the Sheng implementation function) + * - DEAD_FUNC (name of the function checking for dead states) + * - ACCEPT_FUNC (name of the function checking for accept state) + * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match) + */ + +/* byte-by-byte version. we don't do byte-by-byte death checking as it's + * pretty pointless to do it over a buffer that's at most 3 bytes long */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m128 shuffle_mask = masks[c]; + cur_state = pshufb(shuffle_mask, cur_state); + const u8 tmp = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", tmp, (tmp & 0xF0) >> 4, + tmp & 0xF); + + if (unlikely(ACCEPT_FUNC(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h new file mode 100644 index 00000000..2561e52d --- /dev/null +++ b/src/nfa/sheng_impl4.h @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * In order to use this macro, the following things need to be defined: + * + * - SHENG_IMPL (name of the Sheng implementation function) + * - INTERESTING_FUNC (name of the function checking for accept, accel or dead + * states) + * - INNER_DEAD_FUNC (name of the inner function checking for dead states) + * - OUTER_DEAD_FUNC (name of the outer function checking for dead states) + * - INNER_ACCEL_FUNC (name of the inner function checking for accel states) + * - OUTER_ACCEL_FUNC (name of the outer function checking for accel states) + * - ACCEPT_FUNC (name of the function checking for accept state) + * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match) + */ + +/* unrolled 4-byte-at-a-time version. + * + * we put innerDeadFunc inside interestingFunc() block so that we don't pay for + * dead states checking. however, if interestingFunc is dummy, innerDeadFunc + * gets lost with it, so we need an additional check outside the + * interestingFunc() branch - it's normally dummy so we don't pay for it, but + * when interestingFunc is dummy, outerDeadFunc should be set if we want to + * check for dead states. + * + * also, deadFunc only checks the last known state, but since we can't ever get + * out of the dead state and we don't really care where we died, it's not a + * problem. 
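+ *
+ * a sketch of the underlying transition (example values only): suppose the
+ * current state is 2. cur_state holds 2 in every byte lane, and for input
+ * byte c, masks[c] is a 16-entry table whose i-th byte is the next state
+ * (with the accept/dead/accel flags already baked into its high bits) for
+ * state i. pshufb(masks[c], cur_state) therefore looks up entry 2 in each
+ * lane, and movd extracts the resulting state byte.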
+ */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC(*state) || OUTER_ACCEL_FUNC(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = get_accel(s, *state & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC(*state) || OUTER_DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m128 shuffle_mask1 = masks[c1]; + cur_state = pshufb(shuffle_mask1, cur_state); + const u8 a1 = movd(cur_state); + + const m128 shuffle_mask2 = masks[c2]; + cur_state = pshufb(shuffle_mask2, cur_state); + const u8 a2 = movd(cur_state); + + const m128 shuffle_mask3 = masks[c3]; + cur_state = pshufb(shuffle_mask3, cur_state); + const u8 a3 = movd(cur_state); + + const m128 shuffle_mask4 = masks[c4]; + cur_state = pshufb(shuffle_mask4, cur_state); + const u8 a4 = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a1, (a1 & 0xF0) >> 4, a1 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a2, (a2 & 0xF0) >> 4, a2 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a3, (a3 & 0xF0) >> 4, a3 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? 
c4 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a4, (a4 & 0xF0) >> 4, a4 & 0xF); + + if (unlikely(INTERESTING_FUNC(a1, a2, a3, a4))) { + if (ACCEPT_FUNC(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + 
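+            /* restart the main loop from the new offset; the penalty added
+             * to min_accel_dist above keeps us from re-entering acceleration
+             * immediately if this attempt made little progress */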
continue; + } + } + if (OUTER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} diff --git a/src/nfa/sheng_internal.h b/src/nfa/sheng_internal.h new file mode 100644 index 00000000..046eb759 --- /dev/null +++ b/src/nfa/sheng_internal.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef SHENG_INTERNAL_H_
+#define SHENG_INTERNAL_H_
+
+#include "ue2common.h"
+#include "util/simd_utils.h"
+
+#define SHENG_STATE_ACCEPT 0x10
+#define SHENG_STATE_DEAD 0x20
+#define SHENG_STATE_ACCEL 0x40
+#define SHENG_STATE_MASK 0xF
+#define SHENG_STATE_FLAG_MASK 0x70
+
+#define SHENG_FLAG_SINGLE_REPORT 0x1
+#define SHENG_FLAG_CAN_DIE 0x2
+#define SHENG_FLAG_HAS_ACCEL 0x4
+
+struct report_list {
+    u32 count;
+    ReportID report[];
+};
+
+struct sstate_aux {
+    u32 accept;
+    u32 accept_eod;
+    u32 accel;
+    u32 top;
+};
+
+struct sheng {
+    m128 shuffle_masks[256];
+    u32 length;
+    u32 aux_offset;
+    u32 report_offset;
+    u32 accel_offset;
+    u8 n_states;
+    u8 anchored;
+    u8 floating;
+    u8 flags;
+    ReportID report;
+};
+
+#endif /* SHENG_INTERNAL_H_ */
diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp
new file mode 100644
index 00000000..911f6d70
--- /dev/null
+++ b/src/nfa/shengcompile.cpp
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "shengcompile.h"
+
+#include "accel.h"
+#include "accelcompile.h"
+#include "shufticompile.h"
+#include "trufflecompile.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/charreach.h"
+#include "util/compare.h"
+#include "util/container.h"
+#include "util/order_check.h"
+#include "util/report_manager.h"
+#include "util/unaligned.h"
+
+#include "grey.h"
+#include "nfa_internal.h"
+#include "sheng_internal.h"
+#include "ue2common.h"
+#include "util/compile_context.h"
+#include "util/make_unique.h"
+#include "util/verify_types.h"
+#include "util/simd_utils.h"
+
+#include <map>
+#include <vector>
+#include <sstream>
+
+#include <boost/range/adaptor/map.hpp>
+
+using namespace std;
+using boost::adaptors::map_keys;
+
+namespace ue2 {
+
+#define ACCEL_DFA_MAX_OFFSET_DEPTH 4
+
+/** Maximum tolerated number of escape characters from an accel state.
+ * This is larger than nfa, as we don't have a budget and the nfa cheats on stop
+ * characters for sets of states */
+#define ACCEL_DFA_MAX_STOP_CHAR 160
+
+/** Maximum tolerated number of escape characters from an sds accel state.
+ * Larger than normal states as accelerating sds is important. Matches NFA
+ * value */
+#define ACCEL_DFA_MAX_FLOATING_STOP_CHAR 192
+
+struct dfa_info {
+    accel_dfa_build_strat &strat;
+    raw_dfa &raw;
+    vector<dstate> &states;
+    dstate &floating;
+    dstate &anchored;
+    bool can_die;
+
+    explicit dfa_info(accel_dfa_build_strat &s)
+        : strat(s), raw(strat.get_raw()), states(raw.states),
+          floating(states[raw.start_floating]),
+          anchored(states[raw.start_anchored]), can_die(dfaCanDie(raw)) {}
+
+    // returns adjusted size
+    size_t size() const {
+        return can_die ? states.size() : states.size() - 1;
+    }
+    // expects adjusted index
+    dstate &operator[](dstate_id_t idx) {
+        return states[raw_id(idx)];
+    }
+    dstate &top(dstate_id_t idx) {
+        if (isDead(idx)) {
+            return floating;
+        }
+        return next(idx, TOP);
+    }
+    dstate &next(dstate_id_t idx, u16 chr) {
+        auto &src = (*this)[idx];
+        auto next_id = src.next[raw.alpha_remap[chr]];
+        return states[next_id];
+    }
+    // get original idx from adjusted idx
+    dstate_id_t raw_id(dstate_id_t idx) {
+        assert(idx < size());
+        // if DFA can't die, shift all indices left by 1
+        return can_die ? idx : idx + 1;
+    }
+    bool isDead(dstate &state) {
+        return raw_id(state.impl_id) == DEAD_STATE;
+    }
+    bool isDead(dstate_id_t idx) {
+        return raw_id(idx) == DEAD_STATE;
+    }
+
+private:
+    static bool dfaCanDie(raw_dfa &rdfa) {
+        for (unsigned chr = 0; chr < 256; chr++) {
+            for (dstate_id_t state = 0; state < rdfa.states.size(); state++) {
+                auto succ = rdfa.states[state].next[rdfa.alpha_remap[chr]];
+                if (succ == DEAD_STATE) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+};
+
+namespace {
+
+struct raw_report_list {
+    flat_set<ReportID> reports;
+
+    raw_report_list(const flat_set<ReportID> &reports_in,
+                    const ReportManager &rm, bool do_remap) {
+        if (do_remap) {
+            for (auto &id : reports_in) {
+                reports.insert(rm.getProgramOffset(id));
+            }
+        } else {
+            reports = reports_in;
+        }
+    }
+
+    bool operator<(const raw_report_list &b) const {
+        return reports < b.reports;
+    }
+};
+
+struct raw_report_info_impl : public raw_report_info {
+    vector<raw_report_list> rl;
+    u32 getReportListSize() const override;
+    size_t size() const override;
+    void fillReportLists(NFA *n, size_t base_offset,
+                         std::vector<u32> &ro /* out */) const override;
+};
+}
+
+u32 raw_report_info_impl::getReportListSize() const {
+    u32 rv = 0;
+
+    for (const auto &reps : rl) {
+        rv += sizeof(report_list);
+        rv += sizeof(ReportID) * reps.reports.size();
+    }
+
+    return rv;
+}
+
+size_t raw_report_info_impl::size() const {
+    return rl.size();
+}
+
+void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset,
+                                           vector<u32> &ro) const {
+    for (const auto &reps : rl) {
+        ro.push_back(base_offset);
+
+        report_list *p = (report_list *)((char *)n + base_offset);
+
+        u32 i = 0;
+        for (const ReportID report : reps.reports) {
+            p->report[i++] = report;
+        }
+        p->count = verify_u32(reps.reports.size());
+
+        base_offset += sizeof(report_list);
+        base_offset += sizeof(ReportID) * reps.reports.size();
+    }
+}
+
+unique_ptr<raw_report_info> sheng_build_strat::gatherReports(
+        vector<u32> &reports, vector<u32> &reports_eod, u8 *isSingleReport,
+        ReportID *arbReport) const {
+    DEBUG_PRINTF("gathering reports\n");
+
+    const bool remap_reports = has_managed_reports(rdfa.kind);
+
+    auto ri = ue2::make_unique<raw_report_info_impl>();
+    map<raw_report_list, u32> rev;
+
+    for (const dstate &s : rdfa.states) {
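+        /* reportless states get MO_INVALID_IDX; identical report lists are
+         * shared, with rev mapping each list to its index in ri->rl */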
+        if (s.reports.empty()) {
+            reports.push_back(MO_INVALID_IDX);
+            continue;
+        }
+
+        raw_report_list rrl(s.reports, rm, remap_reports);
+        DEBUG_PRINTF("non empty r\n");
+        if (rev.find(rrl) != rev.end()) {
+            reports.push_back(rev[rrl]);
+        } else {
+            DEBUG_PRINTF("adding to rl %zu\n", ri->size());
+            rev[rrl] = ri->size();
+            reports.push_back(ri->size());
+            ri->rl.push_back(rrl);
+        }
+    }
+
+    for (const dstate &s : rdfa.states) {
+        if (s.reports_eod.empty()) {
+            reports_eod.push_back(MO_INVALID_IDX);
+            continue;
+        }
+
+        DEBUG_PRINTF("non empty r eod\n");
+        raw_report_list rrl(s.reports_eod, rm, remap_reports);
+        if (rev.find(rrl) != rev.end()) {
+            reports_eod.push_back(rev[rrl]);
+            continue;
+        }
+
+        DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size());
+        rev[rrl] = ri->size();
+        reports_eod.push_back(ri->size());
+        ri->rl.push_back(rrl);
+    }
+
+    assert(!ri->rl.empty()); /* all components should be able to generate
+                              * reports */
+    if (!ri->rl.empty()) {
+        *arbReport = *ri->rl.begin()->reports.begin();
+    } else {
+        *arbReport = 0;
+    }
+
+    /* if we have only a single report id generated from all accepts (not eod)
+     * we can take some short cuts */
+    set<ReportID> reps;
+
+    for (u32 rl_index : reports) {
+        if (rl_index == MO_INVALID_IDX) {
+            continue;
+        }
+        assert(rl_index < ri->size());
+        insert(&reps, ri->rl[rl_index].reports);
+    }
+
+    if (reps.size() == 1) {
+        *isSingleReport = 1;
+        *arbReport = *reps.begin();
+        DEBUG_PRINTF("single -- %u\n", *arbReport);
+    } else {
+        *isSingleReport = 0;
+    }
+
+    return move(ri);
+}
+
+u32 sheng_build_strat::max_allowed_offset_accel() const {
+    return ACCEL_DFA_MAX_OFFSET_DEPTH;
+}
+
+u32 sheng_build_strat::max_stop_char() const {
+    return ACCEL_DFA_MAX_STOP_CHAR;
+}
+
+u32 sheng_build_strat::max_floating_stop_char() const {
+    return ACCEL_DFA_MAX_FLOATING_STOP_CHAR;
+}
+
+size_t sheng_build_strat::accelSize() const {
+    return sizeof(AccelAux);
+}
+
+#ifdef DEBUG
+static really_inline
+void dumpShuffleMask(const u8 chr, const u8 *buf, unsigned sz) {
+    stringstream o;
+
+    for (unsigned i = 0; i < sz; i++) {
+        o.width(2);
+        o << (buf[i] & SHENG_STATE_MASK) << " ";
+    }
+    DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str());
+}
+#endif
+
+static
+void fillAccelOut(const map<dstate_id_t, AccelScheme> &accel_escape_info,
+                  set<dstate_id_t> *accel_states) {
+    for (dstate_id_t i : accel_escape_info | map_keys) {
+        accel_states->insert(i);
+    }
+}
+
+static
+u8 getShengState(dstate &state, dfa_info &info,
+                 map<dstate_id_t, AccelScheme> &accelInfo) {
+    u8 s = state.impl_id;
+    if (!state.reports.empty()) {
+        s |= SHENG_STATE_ACCEPT;
+    }
+    if (info.isDead(state)) {
+        s |= SHENG_STATE_DEAD;
+    }
+    if (accelInfo.find(info.raw_id(state.impl_id)) != accelInfo.end()) {
+        s |= SHENG_STATE_ACCEL;
+    }
+    return s;
+}
+
+static
+void fillAccelAux(struct NFA *n, dfa_info &info,
+                  map<dstate_id_t, AccelScheme> &accelInfo) {
+    DEBUG_PRINTF("Filling accel aux structures\n");
+    sheng *s = (sheng *)getMutableImplNfa(n);
+    u32 offset = s->accel_offset;
+
+    for (dstate_id_t i = 0; i < info.size(); i++) {
+        dstate_id_t state_id = info.raw_id(i);
+        if (accelInfo.find(state_id) != accelInfo.end()) {
+            s->flags |= SHENG_FLAG_HAS_ACCEL;
+            AccelAux *aux = (AccelAux *)((char *)n + offset);
+            info.strat.buildAccel(state_id, accelInfo[state_id], aux);
+            sstate_aux *saux =
+                (sstate_aux *)((char *)n + s->aux_offset) + state_id;
+            saux->accel = offset;
+            DEBUG_PRINTF("Accel offset: %u\n", offset);
+            offset += ROUNDUP_N(sizeof(AccelAux), alignof(AccelAux));
+        }
+    }
+}
+
+static
+void populateBasicInfo(struct NFA *n, dfa_info &info,
+                       map<dstate_id_t, AccelScheme> &accelInfo,
+                       u32 aux_offset, u32 report_offset, u32 accel_offset,
+                       u32 total_size, u32 dfa_size) {
+    n->length = total_size;
+    n->scratchStateSize = 1;
+    n->streamStateSize = 1;
+    n->nPositions = info.size();
+    n->type = SHENG_NFA_0;
+    n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0;
+
+    sheng *s = (sheng *)getMutableImplNfa(n);
+    s->aux_offset = aux_offset;
+    s->report_offset = report_offset;
+    s->accel_offset = accel_offset;
+    s->n_states = info.size();
+    s->length = dfa_size;
+    s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0;
+
+    s->anchored = getShengState(info.anchored, info, accelInfo);
+    s->floating = getShengState(info.floating, info, accelInfo);
+}
+
+static
+void fillTops(NFA *n, dfa_info &info, dstate_id_t id,
+              map<dstate_id_t, AccelScheme> &accelInfo) {
+    sheng *s = (sheng *)getMutableImplNfa(n);
+    u32 aux_base = s->aux_offset;
+
+    DEBUG_PRINTF("Filling tops for state %u\n", id);
+
+    sstate_aux *aux = (sstate_aux *)((char *)n + aux_base) + id;
+
+    DEBUG_PRINTF("Aux structure for state %u, offset %zd\n", id,
+                 (char *)aux - (char *)n);
+
+    /* we could conceivably end up in an accept/dead state on a top event,
+     * so mark top as accept/dead state if it indeed is.
+     */
+    auto &top_state = info.top(id);
+
+    DEBUG_PRINTF("Top transition for state %u: %u\n", id, top_state.impl_id);
+
+    aux->top = getShengState(top_state, info, accelInfo);
+}
+
+static
+void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector<u32> &reports,
+             vector<u32> &reports_eod, vector<u32> &report_offsets) {
+    sheng *s = (sheng *)getMutableImplNfa(n);
+    u32 aux_base = s->aux_offset;
+    auto raw_id = info.raw_id(id);
+
+    auto &state = info[id];
+
+    sstate_aux *aux = (sstate_aux *)((char *)n + aux_base) + id;
+
+    DEBUG_PRINTF("Filling aux and report structures for state %u\n", id);
+    DEBUG_PRINTF("Aux structure for state %u, offset %zd\n", id,
+                 (char *)aux - (char *)n);
+
+    aux->accept = state.reports.empty() ? 0 : report_offsets[reports[raw_id]];
+    aux->accept_eod =
+        state.reports_eod.empty() ? 0 : report_offsets[reports_eod[raw_id]];
+
+    DEBUG_PRINTF("Report list offset: %u\n", aux->accept);
+    DEBUG_PRINTF("EOD report list offset: %u\n", aux->accept_eod);
+}
+
+static
+void fillSingleReport(NFA *n, ReportID r_id) {
+    sheng *s = (sheng *)getMutableImplNfa(n);
+
+    DEBUG_PRINTF("Single report ID: %u\n", r_id);
+    s->report = r_id;
+    s->flags |= SHENG_FLAG_SINGLE_REPORT;
+}
+
+static
+void createShuffleMasks(sheng *s, dfa_info &info,
+                        map<dstate_id_t, AccelScheme> &accelInfo) {
+    for (u16 chr = 0; chr < 256; chr++) {
+        u8 buf[16] = {0};
+
+        for (dstate_id_t idx = 0; idx < info.size(); idx++) {
+            auto &succ_state = info.next(idx, chr);
+
+            buf[idx] = getShengState(succ_state, info, accelInfo);
+        }
+#ifdef DEBUG
+        dumpShuffleMask(chr, buf, sizeof(buf));
+#endif
+        m128 mask = loadu128(buf);
+        s->shuffle_masks[chr] = mask;
+    }
+}
+
+bool has_accel_sheng(const NFA *nfa) {
+    const sheng *s = (const sheng *)getImplNfa(nfa);
+    return s->flags & SHENG_FLAG_HAS_ACCEL;
+}
+
+aligned_unique_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc,
+                                     const ReportManager &rm,
+                                     set<dstate_id_t> *accel_states) {
+    if (!cc.grey.allowSheng) {
+        DEBUG_PRINTF("Sheng is not allowed!\n");
+        return nullptr;
+    }
+
+    sheng_build_strat strat(raw, rm);
+    dfa_info info(strat);
+
+    DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size());
+
+    DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n",
+                 raw.start_anchored, raw.start_floating);
+
+    DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n",
"can" : "cannot", info.size()); + if (info.size() > 16) { + DEBUG_PRINTF("Too many states\n"); + return nullptr; + } + + if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming + * mode with our semantics */ + raw.stripExtraEodReports(); + } + auto accelInfo = strat.getAccelInfo(cc.grey); + + // set impl_id of each dfa state + for (dstate_id_t i = 0; i < info.size(); i++) { + info[i].impl_id = i; + } + + DEBUG_PRINTF("Anchored start state: %u, floating start state: %u\n", + info.anchored.impl_id, info.floating.impl_id); + + u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(sheng)); + vector reports, eod_reports, report_offsets; + u8 isSingle = 0; + ReportID single_report = 0; + + auto ri = + strat.gatherReports(reports, eod_reports, &isSingle, &single_report); + + u32 total_aux = sizeof(sstate_aux) * info.size(); + u32 total_accel = strat.accelSize() * accelInfo.size(); + u32 total_reports = ri->getReportListSize(); + + u32 reports_offset = nfa_size + total_aux; + u32 accel_offset = + ROUNDUP_N(reports_offset + total_reports, alignof(AccelAux)); + u32 total_size = ROUNDUP_N(accel_offset + total_accel, 64); + + DEBUG_PRINTF("NFA: %u, aux: %u, reports: %u, accel: %u, total: %u\n", + nfa_size, total_aux, total_reports, total_accel, total_size); + + aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + + populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, reports_offset, + accel_offset, total_size, total_size - sizeof(NFA)); + + DEBUG_PRINTF("Setting up aux and report structures\n"); + + ri->fillReportLists(nfa.get(), reports_offset, report_offsets); + + for (dstate_id_t idx = 0; idx < info.size(); idx++) { + fillTops(nfa.get(), info, idx, accelInfo); + fillAux(nfa.get(), info, idx, reports, eod_reports, report_offsets); + } + if (isSingle) { + fillSingleReport(nfa.get(), single_report); + } + + fillAccelAux(nfa.get(), info, accelInfo); + + if (accel_states) { + fillAccelOut(accelInfo, accel_states); + } + + createShuffleMasks((sheng *)getMutableImplNfa(nfa.get()), info, accelInfo); + + return nfa; +} + +} // namespace ue2 diff --git a/src/nfa/shengcompile.h b/src/nfa/shengcompile.h new file mode 100644 index 00000000..873b7c75 --- /dev/null +++ b/src/nfa/shengcompile.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SHENGCOMPILE_H_
+#define SHENGCOMPILE_H_
+
+#include "accel_dfa_build_strat.h"
+#include "rdfa.h"
+#include "util/alloc.h"
+#include "util/charreach.h"
+#include "util/ue2_containers.h"
+
+struct NFA;
+
+namespace ue2 {
+
+class ReportManager;
+struct CompileContext;
+struct raw_dfa;
+
+class sheng_build_strat : public accel_dfa_build_strat {
+public:
+ sheng_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in)
+ : accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {}
+ raw_dfa &get_raw() const override { return rdfa; }
+ std::unique_ptr<raw_report_info> gatherReports(
+ std::vector<u32> &reports /* out */,
+ std::vector<u32> &reports_eod /* out */,
+ u8 *isSingleReport /* out */,
+ ReportID *arbReport /* out */) const override;
+ size_t accelSize(void) const override;
+ u32 max_allowed_offset_accel() const override;
+ u32 max_stop_char() const override;
+ u32 max_floating_stop_char() const override;
+
+private:
+ raw_dfa &rdfa;
+};
+
+aligned_unique_ptr<NFA>
+shengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm,
+ std::set<dstate_id_t> *accel_states = nullptr);
+
+struct sheng_escape_info {
+ CharReach outs;
+ CharReach outs2_single;
+ flat_set<std::pair<u8, u8>> outs2;
+ bool outs2_broken = false;
+};
+
+bool has_accel_sheng(const NFA *nfa);
+
+} // namespace ue2
+
+#endif /* SHENGCOMPILE_H_ */
diff --git a/src/nfa/shengdump.cpp b/src/nfa/shengdump.cpp
new file mode 100644
index 00000000..037dfb05
--- /dev/null
+++ b/src/nfa/shengdump.cpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "shengdump.h" + +#include "accel_dump.h" +#include "nfa_dump_internal.h" +#include "nfa_internal.h" +#include "sheng_internal.h" +#include "rdfa.h" +#include "ue2common.h" +#include "util/charreach.h" +#include "util/dump_charclass.h" +#include "util/simd_utils.h" + + +#ifndef DUMP_SUPPORT +#error No dump support! +#endif + +using namespace std; + +namespace ue2 { + +static +const sstate_aux *get_aux(const NFA *n, dstate_id_t i) { + assert(n && isShengType(n->type)); + + const sheng *s = (const sheng *)getImplNfa(n); + const sstate_aux *aux_base = + (const sstate_aux *)((const char *)n + s->aux_offset); + + const sstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)s + s->length); + + return aux; +} + +static +void dumpHeader(FILE *f, const sheng *s) { + fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, + s->length); + fprintf(f, "aux base offset: %u, reports base offset: %u, " + "accel offset: %u\n", + s->aux_offset, s->report_offset, s->accel_offset); + fprintf(f, "anchored start state: %u, floating start state: %u\n", + s->anchored & SHENG_STATE_MASK, s->floating & SHENG_STATE_MASK); + fprintf(f, "has accel: %u can die: %u single report: %u\n", + !!(s->flags & SHENG_FLAG_HAS_ACCEL), + !!(s->flags & SHENG_FLAG_CAN_DIE), + !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); +} + +static +void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { + fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " + "accel offset: %u, top: %u\n", + state, aux->accept, aux->accept_eod, aux->accel, + aux->top & SHENG_STATE_MASK); +} + +static +void dumpReports(FILE *f, const report_list *rl) { + fprintf(f, "reports count: %u\n", rl->count); + for (u32 i = 0; i < rl->count; i++) { + fprintf(f, " report: %u, report ID: %u\n", i, rl->report[i]); + } +} + +static +void dumpMasks(FILE *f, const sheng *s) { + for (u32 chr = 0; chr < 256; chr++) { + u8 buf[16]; + m128 shuffle_mask = s->shuffle_masks[chr]; + store128(buf, shuffle_mask); + + fprintf(f, "%3u: ", chr); + for (u32 pos = 0; pos < 16; pos++) { + u8 c = buf[pos]; + if (c & SHENG_STATE_FLAG_MASK) { + fprintf(f, "%2u* ", c & SHENG_STATE_MASK); + } else { + fprintf(f, "%2u ", c & SHENG_STATE_MASK); + } + } + fprintf(f, "\n"); + } +} + +void nfaExecSheng0_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_0); + const sheng *s = (const sheng *)getImplNfa(nfa); + + fprintf(f, "sheng DFA\n"); + dumpHeader(f, s); + + for (u32 state = 0; state < s->n_states; state++) { + const sstate_aux *aux = get_aux(nfa, state); + dumpAux(f, state, aux); + if (aux->accept) { + fprintf(f, "report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept); + dumpReports(f, rl); + } + if (aux->accept_eod) { + fprintf(f, "EOD report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept_eod); + dumpReports(f, rl); + } + if (aux->accel) { + fprintf(f, "accel:\n"); + 
const AccelAux *accel = + (const AccelAux *)((const char *)nfa + aux->accel); + dumpAccelInfo(f, *accel); + } + } + + fprintf(f, "\n"); + + dumpMasks(f, s); + + fprintf(f, "\n"); +} + +static +void dumpDotPreambleDfa(FILE *f) { + dumpDotPreamble(f); + + // DFA specific additions. + fprintf(f, "STARTF [style=invis];\n"); + fprintf(f, "STARTA [style=invis];\n"); + fprintf(f, "0 [style=invis];\n"); +} + +static +void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { + const sstate_aux *aux = get_aux(n, i); + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u\" ]; \n", + i, i); + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && (aux->top & SHENG_STATE_MASK) != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top & SHENG_STATE_MASK); + } + + if (i == (s->anchored & SHENG_STATE_MASK)) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == (s->floating & SHENG_STATE_MASK)) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } +} + +static +void describeEdge(FILE *f, const u16 *t, u16 i) { + for (u16 s = 0; s < N_CHARS; s++) { + if (!t[s]) { + continue; + } + + u16 ss; + for (ss = 0; ss < s; ss++) { + if (t[s] == t[ss]) { + break; + } + } + + if (ss != s) { + continue; + } + + CharReach reach; + for (ss = s; ss < 256; ss++) { + if (t[s] == t[ss]) { + reach.set(ss); + } + } + + fprintf(f, "%u -> %u [ label = \"", i, t[s]); + + describeClass(f, reach, 5, CC_OUT_DOT); + + fprintf(f, "\" ];\n"); + } +} + +static +void shengGetTransitions(const NFA *n, u16 state, u16 *t) { + assert(isShengType(n->type)); + const sheng *s = (const sheng *)getImplNfa(n); + const sstate_aux *aux = get_aux(n, state); + + for (unsigned i = 0; i < N_CHARS; i++) { + u8 buf[16]; + m128 shuffle_mask = s->shuffle_masks[i]; + + store128(buf, shuffle_mask); + + t[i] = buf[state] & SHENG_STATE_MASK; + } + + t[TOP] = aux->top & SHENG_STATE_MASK; +} + +void nfaExecSheng0_dumpDot(const NFA *nfa, FILE *f, const string &) { + assert(nfa->type == SHENG_NFA_0); + const sheng *s = (const sheng *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < s->n_states; i++) { + describeNode(nfa, s, i, f); + + u16 t[ALPHABET_SIZE]; + + shengGetTransitions(nfa, i, t); + + describeEdge(f, t, i); + } + + fprintf(f, "}\n"); +} + +} // namespace ue2 diff --git a/src/nfa/shengdump.h b/src/nfa/shengdump.h new file mode 100644 index 00000000..5334894f --- /dev/null +++ b/src/nfa/shengdump.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SHENGDUMP_H_
+#define SHENGDUMP_H_
+
+#ifdef DUMP_SUPPORT
+
+#include <cstdio>
+#include <string>
+
+struct NFA;
+
+namespace ue2 {
+
+void nfaExecSheng0_dumpDot(const struct NFA *nfa, FILE *file,
+ const std::string &base);
+void nfaExecSheng0_dumpText(const struct NFA *nfa, FILE *file);
+
+} // namespace ue2
+
+#endif // DUMP_SUPPORT
+
+#endif /* SHENGDUMP_H_ */
diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp
index a8a5113d..72efa43a 100644
--- a/src/nfagraph/ng_limex.cpp
+++ b/src/nfagraph/ng_limex.cpp
@@ -505,6 +505,9 @@ aligned_unique_ptr<NFA> constructReversedNFA(const NGHolder &h_in, u32 hint,
 u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm,
 const CompileContext &cc) {
+ if (!cc.grey.allowLimExNFA) {
+ return false;
+ }
 // Quick check: we can always implement an NFA with less than NFA_MAX_STATES
 // states. Note that top masks can generate extra states, so we account for
 // those here too.
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 085aca79..c39c3401 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -50,6 +50,7 @@
 #include "nfa/nfa_api_queue.h"
 #include "nfa/nfa_build_util.h"
 #include "nfa/nfa_internal.h"
+#include "nfa/shengcompile.h"
 #include "nfa/shufticompile.h"
 #include "nfa/tamaramacompile.h"
 #include "nfa/tamarama_internal.h"
@@ -863,13 +864,18 @@ aligned_unique_ptr<NFA> pickImpl(aligned_unique_ptr<NFA> dfa_impl,
 aligned_unique_ptr<NFA> nfa_impl) {
 assert(nfa_impl);
 assert(dfa_impl);
- assert(isMcClellanType(dfa_impl->type));
+ assert(isDfaType(dfa_impl->type));
 // If our NFA is an LBR, it always wins.
 if (isLbrType(nfa_impl->type)) {
 return nfa_impl;
 }
+ // if our DFA is an accelerated Sheng, it always wins.
+ if (isShengType(dfa_impl->type) && has_accel(*dfa_impl)) {
+ return dfa_impl;
+ }
+
 bool d_accel = has_accel(*dfa_impl);
 bool n_accel = has_accel(*nfa_impl);
 bool d_big = dfa_impl->type == MCCLELLAN_NFA_16;
@@ -922,6 +928,18 @@ buildRepeatEngine(const CastleProto &proto,
 return castle_nfa;
 }
+static
+aligned_unique_ptr<NFA> getDfa(raw_dfa &rdfa, const CompileContext &cc,
+ const ReportManager &rm) {
+ // Unleash the Sheng!!
+ auto dfa = shengCompile(rdfa, cc, rm);
+ if (!dfa) {
+ // Sheng wasn't successful, so unleash McClellan!
+ dfa = mcclellanCompile(rdfa, cc, rm); + } + return dfa; +} + /* builds suffix nfas */ static aligned_unique_ptr @@ -942,7 +960,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, } if (suff.dfa()) { - auto d = mcclellanCompile(*suff.dfa(), cc, rm); + auto d = getDfa(*suff.dfa(), cc, rm); assert(d); return d; } @@ -971,7 +989,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, auto rdfa = buildMcClellan(holder, &rm, false, triggers.at(0), cc.grey); if (rdfa) { - auto d = mcclellanCompile(*rdfa, cc, rm); + auto d = getDfa(*rdfa, cc, rm); assert(d); if (cc.grey.roseMcClellanSuffix != 2) { n = pickImpl(move(d), move(n)); @@ -1091,12 +1109,13 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, } if (left.dfa()) { - n = mcclellanCompile(*left.dfa(), cc, rm); + n = getDfa(*left.dfa(), cc, rm); } else if (left.graph() && cc.grey.roseMcClellanPrefix == 2 && is_prefix && !is_transient) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { - n = mcclellanCompile(*rdfa, cc, rm); + n = getDfa(*rdfa, cc, rm); + assert(n); } } @@ -1122,7 +1141,7 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, && (!n || !has_bounded_repeats_other_than_firsts(*n) || !is_fast(*n))) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { - auto d = mcclellanCompile(*rdfa, cc, rm); + auto d = getDfa(*rdfa, cc, rm); assert(d); n = pickImpl(move(d), move(n)); } @@ -1857,8 +1876,8 @@ public: }; aligned_unique_ptr operator()(unique_ptr &rdfa) const { - // Unleash the McClellan! - return mcclellanCompile(*rdfa, build.cc, build.rm); + // Unleash the mighty DFA! + return getDfa(*rdfa, build.cc, build.rm); } aligned_unique_ptr operator()(unique_ptr &haig) const { @@ -1886,7 +1905,7 @@ public: !has_bounded_repeats_other_than_firsts(*n)) { auto rdfa = buildMcClellan(h, &rm, cc.grey); if (rdfa) { - auto d = mcclellanCompile(*rdfa, cc, rm); + auto d = getDfa(*rdfa, cc, rm); if (d) { n = pickImpl(move(d), move(n)); } diff --git a/src/runtime.c b/src/runtime.c index fc867b8e..35a11634 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -43,6 +43,7 @@ #include "nfa/nfa_api_util.h" #include "nfa/nfa_internal.h" #include "nfa/nfa_rev_api.h" +#include "nfa/sheng.h" #include "smallwrite/smallwrite_internal.h" #include "rose/rose.h" #include "rose/runtime.h" @@ -286,13 +287,16 @@ void runSmallWriteEngine(const struct SmallWriteEngine *smwr, size_t local_alen = length - smwr->start_offset; const u8 *local_buffer = buffer + smwr->start_offset; - assert(isMcClellanType(nfa->type)); + assert(isDfaType(nfa->type)); if (nfa->type == MCCLELLAN_NFA_8) { nfaExecMcClellan8_B(nfa, smwr->start_offset, local_buffer, local_alen, roseReportAdaptor, scratch); - } else { + } else if (nfa->type == MCCLELLAN_NFA_16){ nfaExecMcClellan16_B(nfa, smwr->start_offset, local_buffer, local_alen, roseReportAdaptor, scratch); + } else { + nfaExecSheng0_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); } } diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 1cffe514..90770ba5 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -34,6 +34,7 @@ #include "nfa/mcclellancompile_util.h" #include "nfa/nfa_internal.h" #include "nfa/rdfa_merge.h" +#include "nfa/shengcompile.h" #include "nfagraph/ng.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_mcclellan.h" @@ -312,6 +313,20 @@ bool is_slow(const raw_dfa &rdfa, const set &accel, return true; } +static +aligned_unique_ptr 
getDfa(raw_dfa &rdfa, const CompileContext &cc, + const ReportManager &rm, + set &accel_states) { + aligned_unique_ptr dfa = nullptr; + if (cc.grey.allowSmallWriteSheng) { + dfa = shengCompile(rdfa, cc, rm, &accel_states); + } + if (!dfa) { + dfa = mcclellanCompile(rdfa, cc, rm, &accel_states); + } + return dfa; +} + static aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, const CompileContext &cc, @@ -322,9 +337,9 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, // Unleash the McClellan! set accel_states; - auto nfa = mcclellanCompile(rdfa, cc, rm, &accel_states); + auto nfa = getDfa(rdfa, cc, rm, accel_states); if (!nfa) { - DEBUG_PRINTF("mcclellan compile failed for smallwrite NFA\n"); + DEBUG_PRINTF("DFA compile failed for smallwrite NFA\n"); return nullptr; } @@ -340,9 +355,9 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, return nullptr; } - nfa = mcclellanCompile(rdfa, cc, rm, &accel_states); + nfa = getDfa(rdfa, cc, rm, accel_states); if (!nfa) { - DEBUG_PRINTF("mcclellan compile failed for smallwrite NFA\n"); + DEBUG_PRINTF("DFA compile failed for smallwrite NFA\n"); assert(0); /* able to build orig dfa but not the trimmed? */ return nullptr; } @@ -351,7 +366,7 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, *small_region = cc.grey.smallWriteLargestBuffer; } - assert(isMcClellanType(nfa->type)); + assert(isDfaType(nfa->type)); if (nfa->length > cc.grey.limitSmallWriteOutfixSize || nfa->length > cc.grey.limitDFASize) { DEBUG_PRINTF("smallwrite outfix size too large\n"); From 702f256b39bbe5431a67b96a7de1ad13d08b17c6 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Wed, 3 Aug 2016 14:41:34 +1000 Subject: [PATCH 151/166] remove exceptionMap from LimExNFA --- src/nfa/limex_compile.cpp | 55 ++++++++++++++++++++---------------- src/nfa/limex_dump.cpp | 19 +++++++++++-- src/nfa/limex_exceptional.h | 30 ++++++++++++++------ src/nfa/limex_internal.h | 1 - src/nfa/limex_native.c | 3 +- src/nfa/limex_runtime_impl.h | 24 ++++++++-------- src/util/bitutils.h | 16 +++++++++++ unit/internal/bitutils.cpp | 24 ++++++++++++++++ 8 files changed, 120 insertions(+), 52 deletions(-) diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index e0c459aa..77754e0b 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -1172,12 +1172,13 @@ u32 getReportListIndex(const flat_set &reports, } static -void buildExceptionMap(const build_info &args, - const ue2::unordered_set &exceptional, - map > &exceptionMap, - vector &exceptionReports) { +u32 buildExceptionMap(const build_info &args, + const ue2::unordered_set &exceptional, + map > &exceptionMap, + vector &exceptionReports) { const NGHolder &h = args.h; const u32 num_states = args.num_states; + u32 exceptionCount = 0; ue2::unordered_map pos_trigger; ue2::unordered_map tug_trigger; @@ -1307,10 +1308,13 @@ void buildExceptionMap(const build_info &args, assert(e.succ_states.size() == num_states); assert(e.squash_states.size() == num_states); exceptionMap[e].push_back(i); + exceptionCount++; } } - DEBUG_PRINTF("%zu unique exceptions found.\n", exceptionMap.size()); + DEBUG_PRINTF("%u exceptions found (%zu unique)\n", exceptionCount, + exceptionMap.size()); + return exceptionCount; } static @@ -1642,19 +1646,25 @@ struct Factory { implNFA_t *limex, const u32 exceptionsOffset) { DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset); - // to make testing easier, we pre-set the exceptionMap to all invalid - // values - memset(limex->exceptionMap, 0xff, 
sizeof(limex->exceptionMap)); - exception_t *etable = (exception_t *)((char *)limex + exceptionsOffset); assert(ISALIGNED(etable)); - u32 ecount = 0; + map exception_by_state; for (const auto &m : exceptionMap) { const ExceptionProto &proto = m.first; const vector &states = m.second; - DEBUG_PRINTF("exception %u, triggered by %zu states.\n", ecount, - states.size()); + for (u32 i : states) { + assert(!contains(exception_by_state, i)); + exception_by_state.emplace(i, proto); + } + } + + u32 ecount = 0; + for (const auto &m : exception_by_state) { + const ExceptionProto &proto = m.second; + u32 state_id = m.first; + DEBUG_PRINTF("exception %u, triggered by state %u\n", ecount, + state_id); // Write the exception entry. exception_t &e = etable[ecount]; @@ -1668,13 +1678,10 @@ struct Factory { : repeatOffsets[proto.repeat_index]; e.repeatOffset = repeat_offset; - // for each state that can switch it on - for (auto state_id : states) { - // set this bit in the exception mask - maskSetBit(limex->exceptionMask, state_id); - // set this index in the exception map - limex->exceptionMap[state_id] = ecount; - } + // for the state that can switch it on + // set this bit in the exception mask + maskSetBit(limex->exceptionMask, state_id); + ecount++; } @@ -1882,12 +1889,10 @@ struct Factory { map > exceptionMap; vector exceptionReports; - buildExceptionMap(args, exceptional, exceptionMap, exceptionReports); + u32 exceptionCount = buildExceptionMap(args, exceptional, exceptionMap, + exceptionReports); - if (exceptionMap.size() > ~0U) { - DEBUG_PRINTF("too many exceptions!\n"); - return nullptr; - } + assert(exceptionCount <= args.num_states); // Build reach table and character mapping. vector reach; @@ -1942,7 +1947,7 @@ struct Factory { offset = ROUNDUP_CL(offset); const u32 exceptionsOffset = offset; - offset += sizeof(exception_t) * exceptionMap.size(); + offset += sizeof(exception_t) * exceptionCount; const u32 exceptionReportsOffset = offset; offset += sizeof(ReportID) * exceptionReports.size(); diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index 207769a0..2c215feb 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -80,6 +80,21 @@ void dumpMask(FILE *f, const char *name, const u8 *mask, u32 mask_bits) { fprintf(f, "MSK %-20s %s\n", name, dumpMask(mask, mask_bits).c_str()); } +template +static +u32 rank_in_mask(mask_t mask, u32 bit) { + u32 chunks[sizeof(mask)/sizeof(u32)]; + memcpy(chunks, &mask, sizeof(mask)); + u32 base_rank = 0; + for (u32 i = 0; i < bit / 32; i++) { + base_rank += popcount32(chunks[i]); + } + u32 chunk = chunks[bit / 32]; + u32 local_bit = bit % 32; + assert(chunk & (1U << local_bit)); + return base_rank + popcount32(chunk & ((1U << local_bit) - 1)); +} + template static void dumpRepeats(const limex_type *limex, u32 model_size, FILE *f) { @@ -338,7 +353,7 @@ struct limex_labeller : public nfa_labeller { return; } - u32 ex_index = limex->exceptionMap[state]; + u32 ex_index = rank_in_mask(limex->exceptionMask, state); const typename limex_traits::exception_type *e = &exceptions[ex_index]; @@ -409,7 +424,7 @@ void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) { const typename limex_traits::exception_type *exceptions = getExceptionTable(limex); - u32 ex_index = limex->exceptionMap[state]; + u32 ex_index = rank_in_mask(limex->exceptionMask, state); const typename limex_traits::exception_type *e = &exceptions[ex_index]; diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index 26c5e5a5..175ca393 100644 --- 
a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -79,9 +79,13 @@ #ifdef ARCH_64_BIT #define CHUNK_T u64a #define FIND_AND_CLEAR_FN findAndClearLSB_64 +#define POPCOUNT_FN popcount64 +#define RANK_IN_MASK_FN rank_in_mask64 #else #define CHUNK_T u32 #define FIND_AND_CLEAR_FN findAndClearLSB_32 +#define POPCOUNT_FN popcount32 +#define RANK_IN_MASK_FN rank_in_mask32 #endif /** \brief Process a single exception. Returns 1 if exception handling should @@ -206,13 +210,13 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, #ifndef RUN_EXCEPTION_FN_ONLY -/** \brief Process all of the exceptions associated with the states in the \a estate. */ +/** \brief Process all of the exceptions associated with the states in the \a + * estate. */ static really_inline int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, - const struct IMPL_NFA_T *limex, - const u32 *exceptionMap, const EXCEPTION_T *exceptions, - const ReportID *exReports, - u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) { + const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, + const ReportID *exReports, u64a offset, struct CONTEXT_T *ctx, + char in_rev, char flags) { assert(diffmask > 0); // guaranteed by caller macro if (EQ_STATE(estate, LOAD_STATE(&ctx->cached_estate))) { @@ -237,15 +241,23 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, // A copy of the estate as an array of GPR-sized chunks. CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; #ifdef ESTATE_ON_STACK memcpy(chunks, &estate, sizeof(STATE_T)); #else memcpy(chunks, estatep, sizeof(STATE_T)); #endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); struct proto_cache new_cache = {0, NULL}; enum CacheResult cacheable = CACHE_RESULT; + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (u32 i = 0; i < ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + do { u32 t = findAndClearLSB_32(&diffmask); #ifdef ARCH_64_BIT @@ -254,10 +266,10 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, assert(t < ARRAY_LENGTH(chunks)); CHUNK_T word = chunks[t]; assert(word != 0); - u32 base = t * sizeof(CHUNK_T) * 8; do { - u32 bit = FIND_AND_CLEAR_FN(&word) + base; - u32 idx = exceptionMap[bit]; + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; const EXCEPTION_T *e = &exceptions[idx]; if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index 6bc9a597..c37f5f40 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -135,7 +135,6 @@ struct LimExNFA##size { \ u32 exReportOffset; /* rel. to start of LimExNFA */ \ u32 repeatCount; \ u32 repeatOffset; \ - u32 exceptionMap[size]; \ u32 squashOffset; /* rel. 
to start of LimExNFA; for accept squashing */ \ u32 squashCount; \ u32 topCount; \ diff --git a/src/nfa/limex_native.c b/src/nfa/limex_native.c index e156cb81..8a0a8acd 100644 --- a/src/nfa/limex_native.c +++ b/src/nfa/limex_native.c @@ -74,7 +74,6 @@ static really_inline int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, const struct LimExNFA32 *limex, - const u32 *exceptionMap, const struct NFAException32 *exceptions, const ReportID *exReports, u64a offset, struct NFAContext32 *ctx, char in_rev, char flags) { @@ -104,7 +103,7 @@ int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, do { u32 bit = findAndClearLSB_32(&estate); - u32 idx = exceptionMap[bit]; + u32 idx = rank_in_mask32(limex->exceptionMask, bit); const struct NFAException32 *e = &exceptions[idx]; if (!runException32(e, s, succ, &local_succ, limex, exReports, offset, ctx, &new_cache, &cacheable, in_rev, flags)) { diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h index d6c28c6f..881e41fd 100644 --- a/src/nfa/limex_runtime_impl.h +++ b/src/nfa/limex_runtime_impl.h @@ -105,8 +105,8 @@ // continue, 1 if an accept was fired and the user instructed us to halt. static really_inline char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, - const ReportID *exReports, const u32 *exceptionMap, - STATE_T s, const STATE_T emask, size_t i, u64a offset, + const ReportID *exReports, STATE_T s, + const STATE_T emask, size_t i, u64a offset, STATE_T *succ, u64a *final_loc, struct CONTEXT_T *ctx, const char flags, const char in_rev, const char first_match) { @@ -133,8 +133,8 @@ char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, char localflags = (!i && !in_rev) ? NO_OUTPUT | FIRST_BYTE : flags; int rv = JOIN(processExceptional, SIZE)( - pass_state, pass_estate, diffmask, succ, limex, exceptionMap, - exceptions, exReports, callback_offset, ctx, in_rev, localflags); + pass_state, pass_estate, diffmask, succ, limex, exceptions, exReports, + callback_offset, ctx, in_rev, localflags); if (rv == PE_RV_HALT) { return 1; // Halt matching. 
} @@ -176,7 +176,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, (const union AccelAux *)((const char *)limex + limex->accelAuxOffset); const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); const ReportID *exReports = getExReports(limex); - const u32 *exceptionMap = limex->exceptionMap; STATE_T s = LOAD_STATE(&ctx->s); /* assert(ISALIGNED_16(exceptions)); */ @@ -204,9 +203,9 @@ without_accel: STATE_T succ; NFA_EXEC_GET_LIM_SUCC(STATE_T); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, - EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, - flags, 0, first_match)) { + if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, + i, offset, &succ, final_loc, ctx, flags, 0, + first_match)) { return MO_HALT_MATCHING; } @@ -255,9 +254,9 @@ with_accel: STATE_T succ; NFA_EXEC_GET_LIM_SUCC(STATE_T); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, - EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, - flags, 0, first_match)) { + if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, + i, offset, &succ, final_loc, ctx, flags, 0, + first_match)) { return MO_HALT_MATCHING; } @@ -301,7 +300,6 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, #endif const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); const ReportID *exReports = getExReports(limex); - const u32 *exceptionMap = limex->exceptionMap; STATE_T s = LOAD_STATE(&ctx->s); /* assert(ISALIGNED_16(exceptions)); */ @@ -321,7 +319,7 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, STATE_T succ; NFA_EXEC_GET_LIM_SUCC(STATE_T); - if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, exceptionMap, s, + if (RUN_EXCEPTIONS_FN(limex, exceptions, exReports, s, EXCEPTION_MASK, i, offset, &succ, final_loc, ctx, flags, 1, 0)) { return MO_HALT_MATCHING; diff --git a/src/util/bitutils.h b/src/util/bitutils.h index c863fba9..6f1bcd09 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -454,4 +454,20 @@ void bf64_unset(u64a *bitfield, u32 i) { *bitfield &= ~(1ULL << i); } +static really_inline +u32 rank_in_mask32(u32 mask, u32 bit) { + assert(bit < sizeof(u32) * 8); + assert(mask & (u32)(1U << bit)); + mask &= (u32)(1U << bit) - 1; + return popcount32(mask); +} + +static really_inline +u32 rank_in_mask64(u64a mask, u32 bit) { + assert(bit < sizeof(u64a) * 8); + assert(mask & (u64a)(1ULL << bit)); + mask &= (u64a)(1ULL << bit) - 1; + return popcount64(mask); +} + #endif // BITUTILS_H diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp index e13270dc..4d476932 100644 --- a/unit/internal/bitutils.cpp +++ b/unit/internal/bitutils.cpp @@ -412,3 +412,27 @@ TEST(BitUtils, bf_it_1) { ASSERT_EQ(~0U, bf64_iterate(1ULL << 63, 63)); } +TEST(BitUtils, rank_in_mask32) { + for (u32 i = 0; i < 32; i++) { + ASSERT_EQ(i, rank_in_mask32(0xffffffff, i)); + ASSERT_EQ(0, rank_in_mask32(1U << i, i)); + } + ASSERT_EQ(0, rank_in_mask32(0xf0f0f0f0, 4)); + ASSERT_EQ(1, rank_in_mask32(0xf0f0f0f0, 5)); + ASSERT_EQ(3, rank_in_mask32(0xf0f0f0f0, 7)); + ASSERT_EQ(7, rank_in_mask32(0xf0f0f0f0, 15)); + ASSERT_EQ(15, rank_in_mask32(0xf0f0f0f0, 31)); +} + +TEST(BitUtils, rank_in_mask64) { + for (u32 i = 0; i < 64; i++) { + ASSERT_EQ(i, rank_in_mask64(0xffffffffffffffffULL, i)); + ASSERT_EQ(0, rank_in_mask64(1ULL << i, i)); + } + ASSERT_EQ(0, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 4)); + ASSERT_EQ(1, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 5)); + ASSERT_EQ(3, 
rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 7)); + ASSERT_EQ(7, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 15)); + ASSERT_EQ(15, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 31)); + ASSERT_EQ(31, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 63)); +} From 01ee4c94e3a7635645a62ef75baa64e765e5a7f8 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 4 Aug 2016 10:36:09 +1000 Subject: [PATCH 152/166] dfa_build_strat: remove dead pure virtual impl --- src/nfa/dfa_build_strat.cpp | 177 +----------------------------------- 1 file changed, 3 insertions(+), 174 deletions(-) diff --git a/src/nfa/dfa_build_strat.cpp b/src/nfa/dfa_build_strat.cpp index 1d31feb1..d4d418aa 100755 --- a/src/nfa/dfa_build_strat.cpp +++ b/src/nfa/dfa_build_strat.cpp @@ -28,184 +28,13 @@ #include "dfa_build_strat.h" -#include "accel.h" -#include "accelcompile.h" -#include "grey.h" -#include "mcclellan_internal.h" -#include "mcclellancompile_util.h" -#include "nfa_internal.h" -#include "shufticompile.h" -#include "trufflecompile.h" -#include "ue2common.h" -#include "util/alloc.h" -#include "util/bitutils.h" -#include "util/charreach.h" -#include "util/compare.h" -#include "util/compile_context.h" -#include "util/container.h" -#include "util/make_unique.h" -#include "util/order_check.h" -#include "util/report_manager.h" -#include "util/ue2_containers.h" -#include "util/unaligned.h" -#include "util/verify_types.h" - -#include - -using namespace std; - namespace ue2 { // prevent weak vtables for raw_report_info, dfa_build_strat and raw_dfa -raw_report_info::~raw_report_info() { -} +raw_report_info::~raw_report_info() {} -dfa_build_strat::~dfa_build_strat() { -} +dfa_build_strat::~dfa_build_strat() {} -raw_dfa::~raw_dfa() { -} - -namespace { - -struct raw_report_list { - flat_set reports; - - raw_report_list(const flat_set &reports_in, - const ReportManager &rm, bool do_remap) { - if (do_remap) { - for (auto &id : reports_in) { - reports.insert(rm.getProgramOffset(id)); - } - } else { - reports = reports_in; - } - } - - bool operator<(const raw_report_list &b) const { - return reports < b.reports; - } -}; - -struct raw_report_info_impl : public raw_report_info { - vector rl; - u32 getReportListSize() const override; - size_t size() const override; - void fillReportLists(NFA *n, size_t base_offset, - std::vector &ro /* out */) const override; -}; -} - -unique_ptr -dfa_build_strat::gatherReports(vector &reports, vector &reports_eod, - u8 *isSingleReport, ReportID *arbReport) const { - auto &rdfa = get_raw(); - DEBUG_PRINTF("gathering reports\n"); - - const bool remap_reports = has_managed_reports(rdfa.kind); - - auto ri = ue2::make_unique(); - map rev; - - for (const dstate &s : rdfa.states) { - if (s.reports.empty()) { - reports.push_back(MO_INVALID_IDX); - continue; - } - - raw_report_list rrl(s.reports, rm, remap_reports); - DEBUG_PRINTF("non empty r\n"); - if (rev.find(rrl) != rev.end()) { - reports.push_back(rev[rrl]); - } else { - DEBUG_PRINTF("adding to rl %zu\n", ri->size()); - rev[rrl] = ri->size(); - reports.push_back(ri->size()); - ri->rl.push_back(rrl); - } - } - - for (const dstate &s : rdfa.states) { - if (s.reports_eod.empty()) { - reports_eod.push_back(MO_INVALID_IDX); - continue; - } - - DEBUG_PRINTF("non empty r eod\n"); - raw_report_list rrl(s.reports_eod, rm, remap_reports); - if (rev.find(rrl) != rev.end()) { - reports_eod.push_back(rev[rrl]); - continue; - } - - DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size()); - rev[rrl] = ri->size(); - reports_eod.push_back(ri->size()); - ri->rl.push_back(rrl); - } - - 
assert(!ri->rl.empty()); /* all components should be able to generate - reports */ - if (!ri->rl.empty()) { - *arbReport = *ri->rl.begin()->reports.begin(); - } else { - *arbReport = 0; - } - - /* if we have only a single report id generated from all accepts (not eod) - * we can take some short cuts */ - set reps; - - for (u32 rl_index : reports) { - if (rl_index == MO_INVALID_IDX) { - continue; - } - assert(rl_index < ri->size()); - insert(&reps, ri->rl[rl_index].reports); - } - - if (reps.size() == 1) { - *isSingleReport = 1; - *arbReport = *reps.begin(); - DEBUG_PRINTF("single -- %u\n", *arbReport); - } else { - *isSingleReport = 0; - } - - return move(ri); -} - -u32 raw_report_info_impl::getReportListSize() const { - u32 rv = 0; - - for (const auto &reps : rl) { - rv += sizeof(report_list); - rv += sizeof(ReportID) * reps.reports.size(); - } - - return rv; -} - -size_t raw_report_info_impl::size() const { - return rl.size(); -} - -void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset, - vector &ro) const { - for (const auto &reps : rl) { - ro.push_back(base_offset); - - report_list *p = (report_list *)((char *)n + base_offset); - - u32 i = 0; - for (const ReportID report : reps.reports) { - p->report[i++] = report; - } - p->count = verify_u32(reps.reports.size()); - - base_offset += sizeof(report_list); - base_offset += sizeof(ReportID) * reps.reports.size(); - } -} +raw_dfa::~raw_dfa() {} } // namespace ue2 From e03375b6441e722a1ccee808983c9a86ad051fbd Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 4 Aug 2016 10:40:35 +1000 Subject: [PATCH 153/166] program_runtime: remove commented-out code --- src/rose/program_runtime.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index fef41269..8bf41715 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -718,7 +718,6 @@ int roseCheckMask(const struct core_info *ci, u64a and_mask, u64a cmp_mask, s32 shift_r = 0; // size of bytes before the history. s32 h_len = 0; // size of bytes in the history buffer. s32 c_len = 8; // size of bytes in the current buffer. - //s64a c_start = offset; // offset of start pointer in current buffer. if (offset < 0) { // in or before history buffer. if (offset + 8 <= -(s64a)ci->hlen) { From cec57d7e903942bfc05c1f0a2d1b073a7d38d71c Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 9 Aug 2016 15:53:21 +1000 Subject: [PATCH 154/166] rose: ensure anch small block literals have bounds --- src/rose/rose_build_compile.cpp | 40 +++++---------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 8545ca70..3f82a9cc 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -1256,7 +1256,8 @@ void addSmallBlockLiteral(RoseBuildImpl &tbi, const simple_anchored_info &sai, assert(old_id < tbi.literal_info.size()); const rose_literal_info &li = tbi.literal_info[old_id]; - // For compile determinism, operate over literal vertices in index order. + // For compile determinism, operate over literal vertices in index + // order. 
vector<RoseVertex> lit_verts(begin(li.vertices), end(li.vertices));
+ sort(begin(lit_verts), end(lit_verts), VertexIndexComp(g));
@@ -1270,40 +1271,9 @@ void addSmallBlockLiteral(RoseBuildImpl &tbi, const simple_anchored_info &sai,
 g[v].max_offset = sai.max_bound + sai.literal.length();
 lit_info.vertices.insert(v);
- assert(!g[v].reports.empty());
-
- bool doDirectReports = true;
- for (ReportID report_id : g[v].reports) {
- const Report &old_rep = tbi.rm.getReport(report_id);
- if (!isExternalReport(old_rep) || old_rep.hasBounds()) {
- doDirectReports = false;
- break;
- }
- }
-
- if (doDirectReports) {
- flat_set<ReportID> dr_reports;
- for (ReportID report_id : g[v].reports) {
- // These new literal roles can be made direct reports, with
- // their bounds handled by the bounds on their Report
- // structures.
- Report rep(tbi.rm.getReport(report_id)); // copy
- assert(!rep.hasBounds());
- rep.minOffset = sai.literal.length() + sai.min_bound;
- rep.maxOffset = sai.literal.length() + sai.max_bound;
- dr_reports.insert(tbi.rm.getInternalId(rep));
- }
- g[v].reports = dr_reports;
- RoseEdge e = add_edge(tbi.root, v, g).first;
- g[e].minBound = 0; // handled by internal_report
- g[e].maxBound = ROSE_BOUND_INF; // handled by internal_report
- } else {
- // If we have a complex internal report, these must become
- // anchored literals with their own roles.
- RoseEdge e = add_edge(anchored_root, v, g).first;
- g[e].minBound = sai.min_bound;
- g[e].maxBound = sai.max_bound;
- }
+ RoseEdge e = add_edge(anchored_root, v, g).first;
+ g[e].minBound = sai.min_bound;
+ g[e].maxBound = sai.max_bound;
 }
 }
}

From ae141874623705576c7768302fdced0e75a4ec51 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Tue, 9 Aug 2016 14:38:58 +1000
Subject: [PATCH 155/166] rose: use min of max_offset in left merges

Be more careful with max_offset, since we rely on it for ANCH history
cases. Also adds tighter assertions.
---
 src/rose/rose_build_bytecode.cpp | 2 +-
 src/rose/rose_build_role_aliasing.cpp | 74 +++++++++++++++++----------
 2 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index c39c3401..56591de8 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -3887,7 +3887,7 @@ void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v,
 : g[e].maxBound + lit_length;
 if (g[e].history == ROSE_ROLE_HISTORY_ANCH) {
- assert(g[u].max_offset != ROSE_BOUND_INF);
+ assert(g[u].fixedOffset());
 // Make offsets absolute.
 min_bound += g[u].max_offset;
 if (max_bound != ROSE_BOUND_INF) {
diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp
index 2c7568f4..c2366f0e 100644
--- a/src/rose/rose_build_role_aliasing.cpp
+++ b/src/rose/rose_build_role_aliasing.cpp
@@ -596,37 +596,67 @@ void updateAliasingInfo(RoseBuildImpl &build, RoseAliasingInfo &rai,
 }
 }
-// Merge role 'a' into 'b'.
+/** \brief Common role merge code used by variants below. */
 static
-void mergeVertices(RoseVertex a, RoseVertex b, RoseBuildImpl &build,
- RoseAliasingInfo &rai) {
+void mergeCommon(RoseBuildImpl &build, RoseAliasingInfo &rai, RoseVertex a,
+ RoseVertex b) {
 RoseGraph &g = build.g;
- DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx);
- // Merge role properties.
 assert(g[a].eod_accept == g[b].eod_accept);
 assert(g[a].left == g[b].left);
-
- insert(&g[b].reports, g[a].reports);
+ assert(!g[a].suffix || g[a].suffix == g[b].suffix);
 // In some situations (ghost roles etc), we can have different groups.
assert(!g[a].groups && !g[b].groups); /* current structure means groups * haven't been assigned yet */ g[b].groups |= g[a].groups; - g[b].min_offset = min(g[a].min_offset, g[b].min_offset); - g[b].max_offset = max(g[a].max_offset, g[b].max_offset); - mergeLiteralSets(a, b, build); + updateAliasingInfo(build, rai, a, b); + + // Our min and max_offsets should be sane. + assert(g[b].min_offset <= g[b].max_offset); + + // Safety check: we should not have created through a merge a vertex that + // has an out-edge with ANCH history but is not fixed-offset. + assert(!hasAnchHistorySucc(g, b) || g[b].fixedOffset()); +} + +/** \brief Merge role 'a' into 'b', left merge path. */ +static +void mergeVerticesLeft(RoseVertex a, RoseVertex b, RoseBuildImpl &build, + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; + DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); + + insert(&g[b].reports, g[a].reports); + + // Since it is a left merge (identical LHS) we should pick the tighter + // bound. + g[b].min_offset = max(g[a].min_offset, g[b].min_offset); + g[b].max_offset = min(g[a].max_offset, g[b].max_offset); if (!g[b].suffix) { g[b].suffix = g[a].suffix; - } else { - assert(!g[a].suffix || g[b].suffix == g[a].suffix); } mergeEdges(a, b, g); - updateAliasingInfo(build, rai, a, b); + mergeCommon(build, rai, a, b); +} + +/** \brief Merge role 'a' into 'b', right merge path. */ +static +void mergeVerticesRight(RoseVertex a, RoseVertex b, RoseBuildImpl &build, + RoseAliasingInfo &rai) { + RoseGraph &g = build.g; + DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); + + insert(&g[b].reports, g[a].reports); + g[b].min_offset = min(g[a].min_offset, g[b].min_offset); + g[b].max_offset = max(g[a].max_offset, g[b].max_offset); + + mergeEdges(a, b, g); + mergeCommon(build, rai, a, b); } /** @@ -639,23 +669,15 @@ void mergeVerticesDiamond(RoseVertex a, RoseVertex b, RoseBuildImpl &build, RoseGraph &g = build.g; DEBUG_PRINTF("merging vertex %zu into %zu\n", g[a].idx, g[b].idx); - // Merge role properties. For a diamond merge, most properties are already - // the same (with the notable exception of the literal set). - assert(g[a].eod_accept == g[b].eod_accept); - assert(g[a].left == g[b].left); + // For a diamond merge, most properties are already the same (with the + // notable exception of the literal set). assert(g[a].reports == g[b].reports); assert(g[a].suffix == g[b].suffix); - // In some situations (ghost roles etc), we can have different groups. 
- assert(!g[a].groups && !g[b].groups); /* current structure means groups - * haven't been assigned yet */ - g[b].groups |= g[a].groups; - g[b].min_offset = min(g[a].min_offset, g[b].min_offset); g[b].max_offset = max(g[a].max_offset, g[b].max_offset); - mergeLiteralSets(a, b, build); - updateAliasingInfo(build, rai, a, b); + mergeCommon(build, rai, a, b); } static never_inline @@ -1709,7 +1731,7 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build, continue; } - mergeVertices(a, b, build, rai); + mergeVerticesLeft(a, b, build, rai); dead->push_back(a); candidates.erase(ait); } @@ -1924,7 +1946,7 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &build, } RoseVertex b = *jt; - mergeVertices(a, b, build, rai); + mergeVerticesRight(a, b, build, rai); dead->push_back(a); candidates.erase(ait); } From 85e2ba75558b451012ac8975edfa1dc967740de6 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 28 Jun 2016 11:59:14 +1000 Subject: [PATCH 156/166] cmake: take control of our compiler flags --- CMakeLists.txt | 80 +++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8834d4d6..0b86b2c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,8 @@ cmake_minimum_required (VERSION 2.8.11) + +# don't use the built-in default configs +set (CMAKE_NOT_USING_CONFIG_FLAGS TRUE) + project (Hyperscan C CXX) set (HS_MAJOR_VERSION 4) @@ -6,6 +10,10 @@ set (HS_MINOR_VERSION 2) set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) +# since we are doing this manually, we only have three types +set (CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo" + CACHE STRING "" FORCE) + string (TIMESTAMP BUILD_DATE "%Y-%m-%d") set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) @@ -24,7 +32,7 @@ find_package(PkgConfig QUIET) if (NOT CMAKE_BUILD_TYPE) message(STATUS "Default build type 'Release with debug info'") - set(CMAKE_BUILD_TYPE "RELWITHDEBINFO") + set(CMAKE_BUILD_TYPE RELWITHDEBINFO CACHE STRING "" FORCE ) else() string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) message(STATUS "Build type ${CMAKE_BUILD_TYPE}") @@ -121,13 +129,7 @@ endif() CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF) -option(DISABLE_ASSERTS "Disable assert(); enabled in debug builds, disabled in release builds" FALSE) - -if (DISABLE_ASSERTS) - if (CMAKE_BUILD_TYPE STREQUAL "DEBUG") - add_definitions(-DNDEBUG) - endif() -endif() +CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF) @@ -139,18 +141,26 @@ if(MSVC OR MSVC_IDE) if (MSVC_VERSION LESS 1700) message(FATAL_ERROR "The project requires C++11 features.") else() + # set base flags + set(CMAKE_C_FLAGS "/DWIN32 /D_WINDOWS /W3") + set(CMAKE_C_FLAGS_DEBUG "/D_DEBUG /MDd /Zi /Od") + set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /Oi") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/Zi /MD /O2 /Ob2 /Oi") + + set(CMAKE_CXX_FLAGS "/DWIN32 /D_WINDOWS /W3 /GR /EHsc") + set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Zi /Od") + set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /Oi") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /MD /O2 /Ob2 /Oi") + if (WINDOWS_ICC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99 /Qrestrict /QxHost /O3 /wd4267 /Qdiag-disable:remark") - 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qstd=c++11 /Qrestrict /QxHost /O2 /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99 /Qrestrict /QxHost /wd4267 /Qdiag-disable:remark") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") else() #TODO: don't hardcode arch - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX /O2 /wd4267") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX /O2 /wd4244 /wd4267 /wd4800 /wd2586 /wd1170 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX /wd4267") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX /wd4244 /wd4267 /wd4800 /wd2586 /wd1170 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") endif() - string(REGEX REPLACE "/RTC1" "" - CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" ) - string(REGEX REPLACE "/RTC1" "" - CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}" ) + endif() @@ -172,15 +182,33 @@ else() unset(_GXX_OUTPUT) endif() + if(OPTIMISE) + set(OPT_C_FLAG "-O3") + set(OPT_CXX_FLAG "-O2") + else() + set(OPT_C_FLAG "-O0") + set(OPT_CXX_FLAG "-O0") + endif(OPTIMISE) + + # set up base flags for build types + set(CMAKE_C_FLAGS_DEBUG "-g ${OPT_C_FLAG} -Werror") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "-g ${OPT_C_FLAG}") + set(CMAKE_C_FLAGS_RELEASE "${OPT_C_FLAG}") + + set(CMAKE_CXX_FLAGS_DEBUG "-g ${OPT_CXX_FLAG} -Werror") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g ${OPT_CXX_FLAG}") + set(CMAKE_CXX_FLAGS_RELEASE "${OPT_CXX_FLAG}") + + if (DISABLE_ASSERTS) + # usually true for release builds, false for debug + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DNDEBUG") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG") + endif() + + # set compiler flags - more are tested and added later set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") - if (NOT RELEASE_BUILD) - # -Werror is most useful during development, don't potentially break - # release builds - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") - endif() if (NOT CMAKE_C_FLAGS MATCHES .*march.*) message(STATUS "Building for current host CPU") @@ -199,14 +227,6 @@ else() set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0 -Wno-unused-local-typedefs -Wno-maybe-uninitialized") endif() - if(OPTIMISE) - set(EXTRA_C_FLAGS "-O3 ${EXTRA_C_FLAGS}") - set(EXTRA_CXX_FLAGS "-O2 ${EXTRA_CXX_FLAGS}") - else() - set(EXTRA_C_FLAGS "-O0 ${EXTRA_C_FLAGS}") - set(EXTRA_CXX_FLAGS "-O0 ${EXTRA_CXX_FLAGS}") - endif(OPTIMISE) - if (NOT(ARCH_IA32 AND RELEASE_BUILD)) set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") From e3c9bc7edf33dbbeb124ae7de71d6d1932fea80e Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 19 Jul 2016 09:23:57 +1000 Subject: [PATCH 157/166] GCC 6 warns about unused attributes - disable the warning --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b86b2c1..24061c42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -317,6 +317,11 @@ if (CXX_UNUSED_CONST_VAR) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable") endif() +# gcc 6 complains about type attributes that get 
ignored, like alignment +CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR) +if (CXX_IGNORED_ATTR) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") +endif() # note this for later # g++ doesn't have this flag but clang does From ead869992c736682f87be56c3d14573c1c2a5b7e Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 19 Jul 2016 09:28:46 +1000 Subject: [PATCH 158/166] unit: Disable warnings --- unit/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 8209c277..63f3a9ac 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -25,6 +25,11 @@ if(CXX_WUNUSED_VARIABLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") endif() +if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-array-bounds") +endif() + add_library(gtest STATIC ${gtest_SOURCES}) add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR}) From f06f5d0702e6bebc4f65d82630434728d64c5953 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Wed, 10 Aug 2016 11:14:54 +1000 Subject: [PATCH 159/166] Use SOURCE_DATE_EPOCH for timestamp if present The Debian reproducible builds effort suggests using this environment variable for timestamps. --- CMakeLists.txt | 14 ++++++++++++-- cmake/formatdate.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100755 cmake/formatdate.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 24061c42..d80b3d4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,8 +14,6 @@ set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set (CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo" CACHE STRING "" FORCE) -string (TIMESTAMP BUILD_DATE "%Y-%m-%d") - set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) @@ -98,6 +96,18 @@ else() message(FATAL_ERROR "No python interpreter found") endif() +# allow for reproducible builds - python for portability +if (DEFINED ENV{SOURCE_DATE_EPOCH}) + execute_process( + COMMAND "${PYTHON}" "${CMAKE_MODULE_PATH}/formatdate.py" "$ENV{SOURCE_DATE_EPOCH}" + OUTPUT_VARIABLE BUILD_DATE + OUTPUT_STRIP_TRAILING_WHITESPACE) +else () + string (TIMESTAMP BUILD_DATE "%Y-%m-%d") +endif () +message(STATUS "Build date: ${BUILD_DATE}") + + if(${RAGEL} STREQUAL "RAGEL-NOTFOUND") message(FATAL_ERROR "Ragel state machine compiler not found") endif() diff --git a/cmake/formatdate.py b/cmake/formatdate.py new file mode 100755 index 00000000..1b9c62d2 --- /dev/null +++ b/cmake/formatdate.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +from __future__ import print_function +import os +import sys +import datetime + +def usage(): + print("Usage:", os.path.basename(sys.argv[0]), "") + +if len(sys.argv) != 2: + usage() + sys.exit(1) + +ts = sys.argv[1] + +build_date = datetime.datetime.utcfromtimestamp(int(ts)) + +print(build_date.strftime("%Y-%m-%d")) From 147f9655b6442e722e98abda17ff7f19b166327a Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Thu, 11 Aug 2016 13:04:26 +1000 Subject: [PATCH 160/166] Add assertion indicating valid range of rank_in_mask's bit param Coverity CID 141632 --- src/nfa/limex_dump.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index 2c215feb..c52adc46 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -83,6 +83,8 @@ void dumpMask(FILE *f, const char *name, const u8 *mask, u32 mask_bits) { template static u32 
From e6c05d5a559e6c98bb84b8fff08f61e981e05e21 Mon Sep 17 00:00:00 2001
From: Alex Coyte
Date: Thu, 11 Aug 2016 13:10:34 +1000
Subject: [PATCH 161/166] set an appropriate default value for RoleInfo::score

Coverity CID 131843
---
 src/rose/rose_build_exclusive.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rose/rose_build_exclusive.h b/src/rose/rose_build_exclusive.h
index a6772f7f..9cabb1d2 100644
--- a/src/rose/rose_build_exclusive.h
+++ b/src/rose/rose_build_exclusive.h
@@ -89,7 +89,7 @@ struct RoleInfo {
     CharReach cr;       // reach of engine graph
     const role_id role; // infix or suffix info
     const u32 id;       // infix or suffix id
-    u32 score;          // score for exclusive analysis
+    u32 score = ~0U;    // score for exclusive analysis
 };

 /**
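The one-line fix above relies on a C++11 non-static data member initializer: any constructor that does not explicitly assign score now leaves it at a well-defined sentinel (~0U) rather than reading back indeterminate storage, which is what Coverity flagged. An illustrative reduction with a hypothetical type, assuming a platform where unsigned int is 32 bits wide:

    #include <cassert>
    #include <cstdint>

    struct Role {
        const uint32_t id;    // always set by the constructor
        uint32_t score = ~0U; // default applies when a ctor does not set it
        explicit Role(uint32_t id_in) : id(id_in) {}
    };

    int main(void) {
        Role r(7);
        assert(r.score == UINT32_MAX); // well-defined, never uninitialized
        return 0;
    }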
From 87e32c90372a22140c67db23ab8935b2c6be35de Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Mon, 15 Aug 2016 09:58:06 +1000
Subject: [PATCH 162/166] tamarama: check for match halt in _Q2 as well

---
 src/nfa/tamarama.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/nfa/tamarama.c b/src/nfa/tamarama.c
index 4286a67e..b5f90e85 100644
--- a/src/nfa/tamarama.c
+++ b/src/nfa/tamarama.c
@@ -431,6 +431,9 @@ char nfaExecTamarama0_Q2(const struct NFA *n,
         rv = nfaQueueExec2_raw(q1.nfa, &q1, end);
         q->report_current = q1.report_current;
         copy = 1;
+        if (can_stop_matching(q->scratch)) {
+            break;
+        }
     }
     if (copy) {
         copyBack(t, q, &q1);

From 0a1491d907dab0e3c59fd5b5c1ade3a7028f8d99 Mon Sep 17 00:00:00 2001
From: Matthew Barr
Date: Mon, 15 Aug 2016 11:55:08 +1000
Subject: [PATCH 163/166] Remove problematic debug output

---
 src/nfagraph/ng_rose.cpp   | 3 ---
 src/nfagraph/ng_violet.cpp | 4 ----
 2 files changed, 7 deletions(-)

diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp
index 24570a01..137ac5cc 100644
--- a/src/nfagraph/ng_rose.cpp
+++ b/src/nfagraph/ng_rose.cpp
@@ -1784,9 +1784,6 @@ bool doNetflowCut(RoseInGraph &ig, const vector<RoseInEdge> &to_cut,
         set<ue2_literal> lits = getLiteralSet(h, e);
         compressAndScore(lits);
         cut_lits[e] = lits;
-
-        DEBUG_PRINTF("cut lit '%s'\n",
-                     dumpString(*cut_lits[e].begin()).c_str());
     }

     /* if literals are underlength bail or if it involves a forbidden edge*/
diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp
index 26fb0ef5..94e0a998 100644
--- a/src/nfagraph/ng_violet.cpp
+++ b/src/nfagraph/ng_violet.cpp
@@ -1290,10 +1290,6 @@ bool doNetflowCut(NGHolder &h,
         sanitizeAndCompressAndScore(lits);

         cut_lits[e] = lits;
-
-        DEBUG_PRINTF("cut lit '%s' %u->%u\n",
-                     dumpString(*cut_lits[e].begin()).c_str(),
-                     h[source(e, h)].index, h[target(e, h)].index);
     }

     /* if literals are underlength bail or if it involves a forbidden edge*/

From 34d6a0d6831b73b0f295af3409f0ce0075dab54a Mon Sep 17 00:00:00 2001
From: Matthew Barr
Date: Mon, 22 Aug 2016 15:54:18 +1000
Subject: [PATCH 164/166] Change SONAME to only use the major version number

Hyperscan will only break ABI on major version changes, and the SONAME
used for shared library versions should reflect this.
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d80b3d4c..f6e03b0d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1000,7 +1000,7 @@ endif()
 # choose which ones to build

 set (LIB_VERSION ${HS_VERSION})
-set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION})
+set (LIB_SOVERSION ${HS_MAJOR_VERSION})

 add_library(hs_exec OBJECT ${hs_exec_SRCS})

From 896618fda1db228cc468ec58a5b857bf6b5808f7 Mon Sep 17 00:00:00 2001
From: Justin Viiret
Date: Wed, 24 Aug 2016 14:21:57 +1000
Subject: [PATCH 165/166] changelog: updates for 4.3 release

---
 CHANGELOG.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19a37b8e..7dc0fd79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,36 @@
 This is a list of notable changes to Hyperscan, in reverse chronological order.

+## [4.3.0] 2016-08-24
+- Introduce a new analysis pass ("Violet") used for decomposition of patterns
+  into literals and smaller engines.
+- Introduce a new container engine ("Tamarama") for infix and suffix engines
+  that can be proven to run exclusively of one another. This reduces stream
+  state for pattern sets with many such engines.
+- Introduce a new shuffle-based DFA engine ("Sheng"). This improves scanning
+  performance for pattern sets where small engines are generated.
+- Improve the analysis used to extract extra mask information from short
+  literals.
+- Reduce compile time spent in equivalence class analysis.
+- Build: frame pointers are now only omitted for 32-bit release builds.
+- Build: Workaround for C++ issues reported on FreeBSD/libc++ platforms.
+  (github issue #27)
+- Simplify the LimEx NFA with a unified "variable shift" model, which reduces
+  the number of different NFA code paths to one per model size.
+- Allow some anchored prefixes that may squash the literal to which they are
+  attached to run eagerly. This improves scanning performance for some
+  patterns.
+- Simplify and improve EOD ("end of data") matching, using the interpreter for
+  all operations.
+- Elide unnecessary instructions in the Rose interpreter at compile time.
+- Reduce the number of inlined instantiations of the Rose interpreter in order
+  to reduce instruction cache pressure.
+- Small improvements to literal matcher acceleration.
+- Parser: ignore `\E` metacharacters that are not preceded by `\Q`. This
+  conforms to PCRE's behaviour, rather than returning a compile error.
+- Check for misaligned memory when allocating an error structure in Hyperscan's
+  compile path and return an appropriate error if detected.
+
 ## [4.2.0] 2016-05-31
 - Introduce an interpreter for many complex actions to replace the use of
   internal reports within the core of Hyperscan (the "Rose" engine). This

From 8cf0c415525e40d9548fbcfa6d5267f7eae0b266 Mon Sep 17 00:00:00 2001
From: Matthew Barr
Date: Wed, 24 Aug 2016 14:27:59 +1000
Subject: [PATCH 166/166] Bump version number for release

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6e03b0d..abbfe53b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ set (CMAKE_NOT_USING_CONFIG_FLAGS TRUE)
 project (Hyperscan C CXX)

 set (HS_MAJOR_VERSION 4)
-set (HS_MINOR_VERSION 2)
+set (HS_MINOR_VERSION 3)
 set (HS_PATCH_VERSION 0)

 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
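With the SONAME tied to the major version alone, an application linked against libhs.so.4 should keep working across 4.x minor releases such as the 4.3.0 prepared above. One way for a consumer to confirm which build the dynamic linker actually loaded is the library's public hs_version() call; a small sketch, assuming a conventional install layout for the include path:

    #include <cstdio>
    #include <hs/hs.h>

    int main(void) {
        // hs_version() reports the linked library's version string and
        // build date, e.g. "4.3.0 2016-08-24" for this release.
        printf("Hyperscan runtime: %s\n", hs_version());
        return 0;
    }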