diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp
index 11ae2000..b76078f9 100644
--- a/src/nfa/castlecompile.cpp
+++ b/src/nfa/castlecompile.cpp
@@ -978,11 +978,6 @@ unique_ptr<NGHolder> makeHolder(const CastleProto &proto,
     auto g = ue2::make_unique<NGHolder>(proto.kind);
 
     for (const auto &m : proto.repeats) {
-        if (m.first >= NFA_MAX_TOP_MASKS) {
-            DEBUG_PRINTF("top %u too big for an NFA\n", m.first);
-            return nullptr;
-        }
-
         addToHolder(*g, m.first, m.second);
     }
 
diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp
index 53a003e3..2c164090 100644
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@@ -41,7 +41,6 @@
 #include "nfagraph/ng_holder.h"
 #include "nfagraph/ng_limex_accel.h"
 #include "nfagraph/ng_repeat.h"
-#include "nfagraph/ng_restructuring.h"
 #include "nfagraph/ng_squash.h"
 #include "nfagraph/ng_util.h"
 #include "ue2common.h"
@@ -74,6 +73,12 @@ using boost::adaptors::map_values;
 
 namespace ue2 {
 
+/**
+ * \brief Special state index value meaning that the vertex will not
+ * participate in an (NFA/DFA/etc) implementation.
+ */
+static constexpr u32 NO_STATE = ~0;
+
 namespace {
 
 struct precalcAccel {
@@ -91,7 +96,7 @@ struct precalcAccel {
 struct limex_accel_info {
     ue2::unordered_set<NFAVertex> accelerable;
     map<NFAStateSet, precalcAccel> precalc;
-    ue2::unordered_map<NFAVertex, flat_set<NFAVertex> > friends;
+    ue2::unordered_map<NFAVertex, flat_set<NFAVertex>> friends;
     ue2::unordered_map<NFAVertex, AccelScheme> accel_map;
 };
 
@@ -134,7 +139,7 @@ struct build_info {
                const vector<BoundedRepeatData> &ri,
                const map<NFAVertex, NFAStateSet> &rsmi,
                const map<NFAVertex, NFAStateSet> &smi,
-               const map<u32, NFAVertex> &ti, const set<NFAVertex> &zi,
+               const map<u32, set<NFAVertex>> &ti, const set<NFAVertex> &zi,
                bool dai, bool sci, const CompileContext &cci,
                u32 nsi)
         : h(hi), state_ids(states_in), repeats(ri), tops(ti), zombies(zi),
@@ -160,7 +165,7 @@ struct build_info {
     map<NFAVertex, NFAStateSet> reportSquashMap;
     map<NFAVertex, NFAStateSet> squashMap;
 
-    const map<u32, NFAVertex> &tops;
+    const map<u32, set<NFAVertex>> &tops;
     ue2::unordered_set<NFAVertex> tugs;
     map<NFAVertex, BoundedRepeatSummary> br_cyclic;
     const set<NFAVertex> &zombies;
@@ -522,20 +527,25 @@ struct fas_visitor : public boost::default_bfs_visitor {
 };
 
 static
-void filterAccelStates(NGHolder &g, const map<u32, NFAVertex> &tops,
+void filterAccelStates(NGHolder &g, const map<u32, set<NFAVertex>> &tops,
                        ue2::unordered_map<NFAVertex, AccelScheme> *accel_map) {
     /* We want the NFA_MAX_ACCEL_STATES best acceleration states, everything
      * else should be ditched. We use a simple BFS to choose accel states near
      * the start. */
 
-    // Temporarily wire start to each top for the BFS.
-    vector<NFAEdge> topEdges;
-    wireStartToTops(g, tops, topEdges);
+    vector<NFAEdge> tempEdges;
+    for (const auto &vv : tops | map_values) {
+        for (NFAVertex v : vv) {
+            if (!edge(g.start, v, g).second) {
+                tempEdges.push_back(add_edge(g.start, v, g).first);
+            }
+        }
+    }
 
     // Similarly, connect (start, startDs) if necessary.
     if (!edge(g.start, g.startDs, g).second) {
         auto e = add_edge(g.start, g.startDs, g).first;
-        topEdges.push_back(e); // Remove edge later.
+        tempEdges.push_back(e); // Remove edge later.
     }
 
     ue2::unordered_map<NFAVertex, AccelScheme> out;
@@ -551,7 +561,7 @@ void filterAccelStates(NGHolder &g, const map<u32, NFAVertex> &tops,
         ; /* found max accel_states */
     }
 
-    remove_edges(topEdges, g);
+    remove_edges(tempEdges, g);
 
     assert(out.size() <= NFA_MAX_ACCEL_STATES);
     accel_map->swap(out);
@@ -705,7 +715,7 @@ void fillAccelInfo(build_info &bi) {
 
 /** The AccelAux structure has large alignment specified, and this makes some
  * compilers do odd things unless we specify a custom allocator. */
-typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)> >
+typedef vector<AccelAux, AlignedAllocator<AccelAux, alignof(AccelAux)>>
     AccelAuxVector;
 
 #define IMPOSSIBLE_ACCEL_MASK (~0U)
@@ -1122,19 +1132,20 @@ void buildTopMasks(const build_info &args, vector<NFAStateSet> &topMasks) {
 
     u32 numMasks = args.tops.rbegin()->first + 1; // max mask index
     DEBUG_PRINTF("we have %u top masks\n", numMasks);
-    assert(numMasks <= NFA_MAX_TOP_MASKS);
 
     topMasks.assign(numMasks, NFAStateSet(args.num_states)); // all zeroes
 
     for (const auto &m : args.tops) {
         u32 mask_idx = m.first;
-        u32 state_id = args.state_ids.at(m.second);
-        DEBUG_PRINTF("state %u is in top mask %u\n", state_id, mask_idx);
+        for (NFAVertex v : m.second) {
+            u32 state_id = args.state_ids.at(v);
+            DEBUG_PRINTF("state %u is in top mask %u\n", state_id, mask_idx);
 
-        assert(mask_idx < numMasks);
-        assert(state_id != NO_STATE);
+            assert(mask_idx < numMasks);
+            assert(state_id != NO_STATE);
 
-        topMasks[mask_idx].set(state_id);
+            topMasks[mask_idx].set(state_id);
+        }
     }
 }
 
@@ -2123,7 +2134,7 @@ struct Factory {
         u32 maxShift = findMaxVarShift(args, shiftCount);
         findExceptionalTransitions(args, exceptional, maxShift);
 
-        map<ExceptionProto, vector<u32> > exceptionMap;
+        map<ExceptionProto, vector<u32>> exceptionMap;
         vector<ReportID> reportList;
 
         u32 exceptionCount = buildExceptionMap(args, reports_cache, exceptional,
@@ -2315,13 +2326,13 @@ MAKE_LIMEX_TRAITS(512)
 #ifndef NDEBUG
 // Some sanity tests, called by an assertion in generate().
 static UNUSED
-bool isSane(const NGHolder &h, const map<u32, NFAVertex> &tops,
+bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
             const ue2::unordered_map<NFAVertex, u32> &state_ids,
             u32 num_states) {
     ue2::unordered_set<u32> seen;
     ue2::unordered_set<NFAVertex> top_starts;
-    for (const auto &m : tops) {
-        top_starts.insert(m.second);
+    for (const auto &vv : tops | map_values) {
+        insert(&top_starts, vv);
     }
 
     for (auto v : vertices_range(h)) {
@@ -2385,7 +2396,7 @@ aligned_unique_ptr<NFA> generate(NGHolder &h,
                          const vector<BoundedRepeatData> &repeats,
                          const map<NFAVertex, NFAStateSet> &reportSquashMap,
                          const map<NFAVertex, NFAStateSet> &squashMap,
-                         const map<u32, NFAVertex> &tops,
+                         const map<u32, set<NFAVertex>> &tops,
                          const set<NFAVertex> &zombies,
                          bool do_accel,
                          bool stateCompression,
@@ -2457,7 +2468,7 @@ u32 countAccelStates(NGHolder &h,
                      const vector<BoundedRepeatData> &repeats,
                      const map<NFAVertex, NFAStateSet> &reportSquashMap,
                      const map<NFAVertex, NFAStateSet> &squashMap,
-                     const map<u32, NFAVertex> &tops,
+                     const map<u32, set<NFAVertex>> &tops,
                      const set<NFAVertex> &zombies,
                      const CompileContext &cc) {
     const u32 num_states = max_state(states) + 1;
diff --git a/src/nfa/limex_compile.h b/src/nfa/limex_compile.h
index 62a07e10..21cb7608 100644
--- a/src/nfa/limex_compile.h
+++ b/src/nfa/limex_compile.h
@@ -71,7 +71,7 @@ aligned_unique_ptr<NFA> generate(NGHolder &g,
                         const std::vector<BoundedRepeatData> &repeats,
                         const std::map<NFAVertex, NFAStateSet> &reportSquashMap,
                         const std::map<NFAVertex, NFAStateSet> &squashMap,
-                        const std::map<u32, NFAVertex> &tops,
+                        const std::map<u32, std::set<NFAVertex>> &tops,
                         const std::set<NFAVertex> &zombies,
                         bool do_accel,
                         bool stateCompression,
@@ -89,7 +89,7 @@ u32 countAccelStates(NGHolder &h,
                      const std::vector<BoundedRepeatData> &repeats,
                      const std::map<NFAVertex, NFAStateSet> &reportSquashMap,
                      const std::map<NFAVertex, NFAStateSet> &squashMap,
-                     const std::map<u32, NFAVertex> &tops,
+                     const std::map<u32, std::set<NFAVertex>> &tops,
                      const std::set<NFAVertex> &zombies,
                      const CompileContext &cc);
 
diff --git a/src/nfa/limex_limits.h b/src/nfa/limex_limits.h
index 9b35b115..f4df54a4 100644
--- a/src/nfa/limex_limits.h
+++ b/src/nfa/limex_limits.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,6 +31,5 @@
 
 #define NFA_MAX_STATES       512 /**< max states in an NFA */
 #define NFA_MAX_ACCEL_STATES   8 /**< max accel states in a NFA */
-#define NFA_MAX_TOP_MASKS     32 /**< max number of MQE_TOP_N event types */
 
 #endif
diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp
index e70b7708..143dca16 100644
--- a/src/nfagraph/ng_haig.cpp
+++ b/src/nfagraph/ng_haig.cpp
@@ -35,7 +35,6 @@
 #include "nfa/goughcompile.h"
 #include "ng_holder.h"
 #include "ng_mcclellan_internal.h"
-#include "ng_restructuring.h"
 #include "ng_som_util.h"
 #include "ng_squash.h"
 #include "ng_util.h"
@@ -118,11 +117,11 @@ public:
     using StateMap = typename Automaton_Traits::StateMap;
 
 protected:
-    Automaton_Base(const NGHolder &graph_in,
-                   const flat_set<NFAVertex> &unused_in, som_type som,
+    Automaton_Base(const NGHolder &graph_in, som_type som,
                    const vector<vector<CharReach>> &triggers,
                    bool unordered_som)
-        : graph(graph_in), numStates(num_vertices(graph)), unused(unused_in),
+        : graph(graph_in), numStates(num_vertices(graph)),
+          unused(getRedundantStarts(graph_in)),
           init(Automaton_Traits::init_states(numStates)),
           initDS(Automaton_Traits::init_states(numStates)),
           squash(Automaton_Traits::init_states(numStates)),
@@ -210,7 +209,7 @@ public:
 
     const NGHolder &graph;
     const u32 numStates;
-    const flat_set<NFAVertex> &unused;
+    const flat_set<NFAVertex> unused;
 
     array<u16, ALPHABET_SIZE> alpha;
     array<u16, ALPHABET_SIZE> unalpha;
@@ -251,10 +250,9 @@ struct Big_Traits {
 
 class Automaton_Big : public Automaton_Base<Big_Traits> {
 public:
-    Automaton_Big(const NGHolder &graph_in,
-                  const flat_set<NFAVertex> &unused_in, som_type som,
+    Automaton_Big(const NGHolder &graph_in, som_type som,
                   const vector<vector<CharReach>> &triggers, bool unordered_som)
-        : Automaton_Base(graph_in, unused_in, som, triggers, unordered_som) {}
+        : Automaton_Base(graph_in, som, triggers, unordered_som) {}
 };
 
 struct Graph_Traits {
@@ -278,11 +276,10 @@ struct Graph_Traits {
 
 class Automaton_Graph : public Automaton_Base<Graph_Traits> {
 public:
-    Automaton_Graph(const NGHolder &graph_in,
-                    const flat_set<NFAVertex> &unused_in, som_type som,
+    Automaton_Graph(const NGHolder &graph_in, som_type som,
                     const vector<vector<CharReach>> &triggers,
                     bool unordered_som)
-        : Automaton_Base(graph_in, unused_in, som, triggers, unordered_som) {}
+        : Automaton_Base(graph_in, som, triggers, unordered_som) {}
 };
 
 class Automaton_Haig_Merge {
@@ -512,15 +509,14 @@ void haig_note_starts(const NGHolder &g, map<u32, u32> *out) {
 
 template<class Auto>
 static
-bool doHaig(const NGHolder &g,
-            const flat_set<NFAVertex> &unused,
-            som_type som, const vector<vector<CharReach>> &triggers,
-            bool unordered_som, raw_som_dfa *rdfa) {
+bool doHaig(const NGHolder &g, som_type som,
+            const vector<vector<CharReach>> &triggers, bool unordered_som,
+            raw_som_dfa *rdfa) {
     u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from
                                                      a fight */
     typedef typename Auto::StateSet StateSet;
     vector<StateSet> nfa_state_map;
-    Auto n(g, unused, som, triggers, unordered_som);
+    Auto n(g, som, triggers, unordered_som);
     try {
         if (determinise(n, rdfa->states, state_limit, &nfa_state_map)) {
             DEBUG_PRINTF("state limit exceeded\n");
@@ -550,9 +546,9 @@ bool doHaig(const NGHolder &g,
         haig_do_preds(g, source_states, n.v_by_index,
                       rdfa->state_som.back().preds);
 
-        haig_do_report(g, unused, g.accept, source_states, n.v_by_index,
+        haig_do_report(g, n.unused, g.accept, source_states, n.v_by_index,
                        rdfa->state_som.back().reports);
-        haig_do_report(g, unused, g.acceptEod, source_states, n.v_by_index,
+        haig_do_report(g, n.unused, g.acceptEod, source_states, n.v_by_index,
                        rdfa->state_som.back().reports_eod);
     }
 
@@ -577,8 +573,6 @@ attemptToBuildHaig(const NGHolder &g, som_type som, u32 somPrecision,
     assert(allMatchStatesHaveReports(g));
     assert(hasCorrectlyNumberedVertices(g));
 
-    auto unused = findUnusedStates(g);
-
     u32 numStates = num_vertices(g);
     if (numStates > HAIG_MAX_NFA_STATE) {
         DEBUG_PRINTF("giving up... looks too big\n");
@@ -592,12 +586,11 @@ attemptToBuildHaig(const NGHolder &g, som_type som, u32 somPrecision,
     bool rv;
     if (numStates <= NFA_STATE_LIMIT) {
         /* fast path */
-        rv = doHaig<Automaton_Graph>(g, unused, som, triggers, unordered_som,
+        rv = doHaig<Automaton_Graph>(g, som, triggers, unordered_som,
                                      rdfa.get());
     } else {
         /* not the fast path */
-        rv = doHaig<Automaton_Big>(g, unused, som, triggers, unordered_som,
-                                   rdfa.get());
+        rv = doHaig<Automaton_Big>(g, som, triggers, unordered_som, rdfa.get());
     }
 
     if (!rv) {
diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp
index c6e4c24e..66494c77 100644
--- a/src/nfagraph/ng_limex.cpp
+++ b/src/nfagraph/ng_limex.cpp
@@ -54,10 +54,15 @@
 #include "util/ue2_containers.h"
 #include "util/verify_types.h"
 
+#include <algorithm>
 #include <map>
 #include <vector>
 
+#include <boost/range/adaptor/map.hpp>
+
 using namespace std;
+using boost::adaptors::map_values;
+using boost::adaptors::map_keys;
 
 namespace ue2 {
 
@@ -146,78 +151,310 @@ void dropRedundantStartEdges(NGHolder &g) {
 }
 
 static
-void makeTopStates(NGHolder &g, map<u32, NFAVertex> &tops,
-                   const map<u32, CharReach> &top_reach) {
-    /* TODO: more intelligent creation of top states */
-    map<u32, vector<NFAVertex>> top_succs;
-    for (const auto &e : out_edges_range(g.start, g)) {
-        NFAVertex v = target(e, g);
-        if (v == g.startDs) {
-            continue;
-        }
-        for (u32 t : g[e].tops) {
-            top_succs[t].push_back(v);
-        }
-    }
-
-    for (const auto &top : top_succs) {
-        u32 t = top.first;
-
-        CharReach top_cr;
+CharReach calcTopVertexReach(const flat_set<u32> &tops,
+                             const map<u32, CharReach> &top_reach) {
+    CharReach top_cr;
+    for (u32 t : tops) {
         if (contains(top_reach, t)) {
-            top_cr = top_reach.at(t);
+            top_cr |= top_reach.at(t);
         } else {
             top_cr = CharReach::dot();
-        }
-
-        assert(!contains(tops, t));
-
-        NFAVertex s = NGHolder::null_vertex();
-        flat_set<NFAVertex> succs;
-        insert(&succs, top.second);
-
-        for (auto v : top.second) {
-            if (!top_cr.isSubsetOf(g[v].char_reach)) {
-                continue;
-            }
-
-            flat_set<NFAVertex> vsuccs;
-            insert(&vsuccs, adjacent_vertices(v, g));
-
-            if (succs != vsuccs) {
-                continue;
-            }
-
-            if (g[v].reports != g[g.start].reports) {
-                continue;
-            }
-            s = v;
             break;
         }
+    }
+    return top_cr;
+}
 
-        if (!s) {
-            s = add_vertex(g[g.start], g);
-            g[s].char_reach = top_cr;
-            for (auto v : top.second) {
-                add_edge(s, v, g);
+static
+NFAVertex makeTopStartVertex(NGHolder &g, const flat_set<u32> &tops,
+                             const flat_set<NFAVertex> &succs,
+                             const map<u32, CharReach> &top_reach) {
+    assert(!succs.empty());
+    assert(!tops.empty());
+
+    bool reporter = false;
+
+    NFAVertex u = add_vertex(g[g.start], g);
+    CharReach top_cr = calcTopVertexReach(tops, top_reach);
+    g[u].char_reach = top_cr;
+    for (auto v : succs) {
+        if (v == g.accept || v == g.acceptEod) {
+            reporter = true;
+        }
+        add_edge(u, v, g);
+    }
+
+    // Only retain reports (which we copied on add_vertex above) for new top
+    // vertices connected to accepts.
+    if (!reporter) {
+        g[u].reports.clear();
+    }
+
+    return u;
+}
+
+static
+void pickNextTopStateToHandle(const map<u32, flat_set<NFAVertex>> &top_succs,
+                              const map<NFAVertex, flat_set<u32>> &succ_tops,
+                              flat_set<u32> *picked_tops,
+                              flat_set<NFAVertex> *picked_succs) {
+    /* pick top or vertex we want to handle */
+    if (top_succs.size() < succ_tops.size()) {
+        auto best = top_succs.end();
+        for (auto it = top_succs.begin(); it != top_succs.end(); ++it) {
+            if (best == top_succs.end()
+                || it->second.size() < best->second.size()) {
+                best = it;
             }
         }
-        tops[t] = s;
+        assert(best != top_succs.end());
+        assert(!best->second.empty()); /* should already have been pruned */
+
+        *picked_tops = { best->first };
+        *picked_succs = best->second;
+    } else {
+        auto best = succ_tops.end();
+        for (auto it = succ_tops.begin(); it != succ_tops.end(); ++it) {
+            /* have to worry about determinism for this one */
+            if (best == succ_tops.end()
+                || it->second.size() < best->second.size()
+                || (it->second.size() == best->second.size()
+                    && it->second < best->second)) {
+                best = it;
+            }
+        }
+        assert(best != succ_tops.end());
+        assert(!best->second.empty()); /* should already have been pruned */
+
+        *picked_succs = { best->first };
+        *picked_tops = best->second;
     }
+}
+
+static
+void expandCbsByTops(const map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                     const map<u32, flat_set<NFAVertex>> &top_succs,
+                     const map<NFAVertex, flat_set<u32>> &succ_tops,
+                     flat_set<u32> &picked_tops,
+                     flat_set<NFAVertex> &picked_succs) {
+    NFAVertex v = *picked_succs.begin(); /* arbitrary successor - all equiv */
+    const auto &cand_tops = succ_tops.at(v);
+
+    for (u32 t : cand_tops) {
+        if (!contains(unhandled_top_succs, t)) {
+            continue;
+        }
+        if (!has_intersection(unhandled_top_succs.at(t), picked_succs)) {
+            continue; /* not adding any useful work that hasn't already been
+                       * done */
+        }
+        if (!is_subset_of(picked_succs, top_succs.at(t))) {
+            continue; /* will not form a cbs */
+        }
+        picked_tops.insert(t);
+    }
+}
+
+static
+void expandCbsBySuccs(const map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                      const map<u32, flat_set<NFAVertex>> &top_succs,
+                      const map<NFAVertex, flat_set<u32>> &succ_tops,
+                      flat_set<u32> &picked_tops,
+                      flat_set<NFAVertex> &picked_succs) {
+    u32 t = *picked_tops.begin(); /* arbitrary top - all equiv */
+    const auto &cand_succs = top_succs.at(t);
+
+    for (NFAVertex v : cand_succs) {
+        if (!contains(unhandled_succ_tops, v)) {
+            continue;
+        }
+        if (!has_intersection(unhandled_succ_tops.at(v), picked_tops)) {
+            continue; /* not adding any useful work that hasn't already been
+                       * done */
+        }
+        if (!is_subset_of(picked_tops, succ_tops.at(v))) {
+            continue; /* will not form a cbs */
+        }
+        picked_succs.insert(v);
+    }
+}
+
+/* See if we can expand the complete bipartite subgraph (cbs) specified by the
+ * picked tops/succs by adding more to either of the tops or succs.
+ */
+static
+void expandTopSuccCbs(const map<u32, flat_set<NFAVertex>> &top_succs,
+                      const map<NFAVertex, flat_set<u32>> &succ_tops,
+                      const map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                      const map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                      flat_set<u32> &picked_tops,
+                      flat_set<NFAVertex> &picked_succs) {
+    /* Note: all picked (tops|succs) are equivalent */
+
+    /* Try to expand first (as we are more likely to succeed) on the side
+     * with fewest remaining things to be handled */
+
+    if (unhandled_top_succs.size() < unhandled_succ_tops.size()) {
+        expandCbsByTops(unhandled_top_succs, top_succs, succ_tops,
+                        picked_tops, picked_succs);
+        expandCbsBySuccs(unhandled_succ_tops, top_succs, succ_tops,
+                        picked_tops, picked_succs);
+    } else {
+        expandCbsBySuccs(unhandled_succ_tops, top_succs, succ_tops,
+                        picked_tops, picked_succs);
+        expandCbsByTops(unhandled_top_succs, top_succs, succ_tops,
+                        picked_tops, picked_succs);
+    }
+}
+
+static
+void markTopSuccAsHandled(NFAVertex start_v,
+                          const flat_set<u32> &handled_tops,
+                          const flat_set<NFAVertex> &handled_succs,
+                          map<u32, set<NFAVertex>> &tops_out,
+                          map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                          map<NFAVertex, flat_set<u32>> &unhandled_succ_tops) {
+    for (u32 t : handled_tops) {
+        tops_out[t].insert(start_v);
+        assert(contains(unhandled_top_succs, t));
+        erase_all(&unhandled_top_succs[t], handled_succs);
+        if (unhandled_top_succs[t].empty()) {
+            unhandled_top_succs.erase(t);
+        }
+    }
+
+    for (NFAVertex v : handled_succs) {
+        assert(contains(unhandled_succ_tops, v));
+        erase_all(&unhandled_succ_tops[v], handled_tops);
+        if (unhandled_succ_tops[v].empty()) {
+            unhandled_succ_tops.erase(v);
+        }
+    }
+}
+
+static
+void attemptToUseAsStart(const NGHolder &g,  NFAVertex u,
+                         const map<u32, CharReach> &top_reach,
+                         map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                         map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                         map<u32, set<NFAVertex>> &tops_out) {
+    flat_set<u32> top_inter = unhandled_succ_tops.at(u);
+    flat_set<NFAVertex> succs;
+    for (NFAVertex v : adjacent_vertices_range(u, g)) {
+        if (!contains(unhandled_succ_tops, v)) {
+            return;
+        }
+        const flat_set<u32> &v_tops = unhandled_succ_tops.at(v);
+        flat_set<u32> new_inter;
+        auto ni_inserter = inserter(new_inter, new_inter.end());
+        set_intersection(top_inter.begin(), top_inter.end(),
+                         v_tops.begin(), v_tops.end(), ni_inserter);
+        top_inter = move(new_inter);
+        succs.insert(v);
+    }
+
+    if (top_inter.empty()) {
+        return;
+    }
+
+    auto top_cr = calcTopVertexReach(top_inter, top_reach);
+    if (!top_cr.isSubsetOf(g[u].char_reach)) {
+        return;
+    }
+
+    markTopSuccAsHandled(u, top_inter, succs, tops_out, unhandled_top_succs,
+                         unhandled_succ_tops);
+}
+
+/* We may have cases where a top triggers something that starts with a .* (or
+ * similar state). In these cases we can make use of that state as a start
+ * state.
+ */
+static
+void reusePredsAsStarts(const NGHolder &g, const map<u32, CharReach> &top_reach,
+                        map<u32, flat_set<NFAVertex>> &unhandled_top_succs,
+                        map<NFAVertex, flat_set<u32>> &unhandled_succ_tops,
+                        map<u32, set<NFAVertex>> &tops_out) {
+    /* create list of candidates first, to avoid issues of iter invalidation
+     * and determinism */
+    vector<NFAVertex> cand_starts;
+    for (NFAVertex u : unhandled_succ_tops | map_keys) {
+        if (hasSelfLoop(u, g)) {
+            cand_starts.push_back(u);
+        }
+    }
+    sort(cand_starts.begin(), cand_starts.end(), make_index_ordering(g));
+
+    for (NFAVertex u : cand_starts) {
+        if (!contains(unhandled_succ_tops, u)) {
+            continue;
+        }
+        attemptToUseAsStart(g, u, top_reach, unhandled_top_succs,
+                            unhandled_succ_tops, tops_out);
+     }
+}
+
+static
+void makeTopStates(NGHolder &g, map<u32, set<NFAVertex>> &tops_out,
+                   const map<u32, CharReach> &top_reach) {
+    /* Ideally, we want to add the smallest number of states to the graph for
+     * tops to turn on so that they can accurately trigger their successors.
+     *
+     * The relationship between tops and their successors forms a bipartite
+     * graph. Finding the optimal number of start states to add is equivalent to
+     * finding a minimal biclique covering. Unfortunately, this is known to be
+     * NP-complete.
+     *
+     * Given this, we will just do something simple to avoid creating something
+     * truly wasteful:
+     * 1) Try to find any cyclic states which can act as their own start states
+     * 2) Pick a top or a succ to create a start state for and then try to find
+     *    the largest complete bipartite subgraph that it is part of.
+     */
+
+    map<u32, flat_set<NFAVertex>> top_succs;
+    map<NFAVertex, flat_set<u32>> succ_tops;
+    for (const auto &e : out_edges_range(g.start, g)) {
+        NFAVertex v = target(e, g);
+        for (u32 t : g[e].tops) {
+            top_succs[t].insert(v);
+            succ_tops[v].insert(t);
+        }
+    }
+
+    auto unhandled_top_succs = top_succs;
+    auto unhandled_succ_tops = succ_tops;
+
+    reusePredsAsStarts(g, top_reach, unhandled_top_succs, unhandled_succ_tops,
+                       tops_out);
+
+    /* Note: there may be successors which are equivalent (in terms of
+       top-triggering), it may be more efficient to discover this and treat them
+       as a unit. TODO */
+
+    while (!unhandled_succ_tops.empty()) {
+        assert(!unhandled_top_succs.empty());
+        flat_set<u32> u_tops;
+        flat_set<NFAVertex> u_succs;
+        pickNextTopStateToHandle(unhandled_top_succs, unhandled_succ_tops,
+                                 &u_tops, &u_succs);
+
+        expandTopSuccCbs(top_succs, succ_tops, unhandled_top_succs,
+                         unhandled_succ_tops, u_tops, u_succs);
+
+        /* create start vertex to handle this top/succ combination */
+        NFAVertex u = makeTopStartVertex(g, u_tops, u_succs, top_reach);
+
+        /* update maps */
+        markTopSuccAsHandled(u, u_tops, u_succs, tops_out, unhandled_top_succs,
+                             unhandled_succ_tops);
+    }
+    assert(unhandled_top_succs.empty());
 
     // We are completely replacing the start vertex, so clear its reports.
     clear_out_edges(g.start, g);
     add_edge(g.start, g.startDs, g);
     g[g.start].reports.clear();
-
-    // Only retain reports (which we copied on add_vertex above) for new top
-    // vertices connected to accepts.
-    for (const auto &m : tops) {
-        NFAVertex v = m.second;
-        if (!edge(v, g.accept, g).second && !edge(v, g.acceptEod, g).second) {
-            g[v].reports.clear();
-        }
-    }
 }
 
 static
@@ -325,7 +562,8 @@ prepareGraph(const NGHolder &h_in, const ReportManager *rm,
              const map<u32, vector<vector<CharReach>>> &triggers,
              bool impl_test_only, const CompileContext &cc,
              ue2::unordered_map<NFAVertex, u32> &state_ids,
-             vector<BoundedRepeatData> &repeats, map<u32, NFAVertex> &tops) {
+             vector<BoundedRepeatData> &repeats,
+             map<u32, set<NFAVertex>> &tops) {
     assert(is_triggered(h_in) || fixed_depth_tops.empty());
 
     unique_ptr<NGHolder> h = cloneHolder(h_in);
@@ -335,15 +573,19 @@ prepareGraph(const NGHolder &h_in, const ReportManager *rm,
                    impl_test_only, cc.grey);
 
     // If we're building a rose/suffix, do the top dance.
+    flat_set<NFAVertex> topVerts;
     if (is_triggered(*h)) {
         makeTopStates(*h, tops, findTopReach(triggers));
+
+        for (const auto &vv : tops | map_values) {
+            insert(&topVerts, vv);
+        }
     }
 
     dropRedundantStartEdges(*h);
 
     // Do state numbering
-    state_ids = numberStates(*h, tops);
-    dropUnusedStarts(*h, state_ids);
+    state_ids = numberStates(*h, topVerts);
 
     // In debugging, we sometimes like to reverse the state numbering to stress
     // the NFA construction code.
@@ -389,14 +631,14 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm,
 
     ue2::unordered_map<NFAVertex, u32> state_ids;
     vector<BoundedRepeatData> repeats;
-    map<u32, NFAVertex> tops;
+    map<u32, set<NFAVertex>> tops;
     unique_ptr<NGHolder> h
         = prepareGraph(h_in, rm, fixed_depth_tops, triggers, impl_test_only, cc,
                        state_ids, repeats, tops);
 
     // Quick exit: if we've got an embarrassment of riches, i.e. more states
     // than we can implement in our largest NFA model, bail here.
-    u32 numStates = countStates(*h, state_ids, false);
+    u32 numStates = countStates(state_ids);
     if (numStates > NFA_MAX_STATES) {
         DEBUG_PRINTF("Can't build an NFA with %u states\n", numStates);
         return nullptr;
@@ -469,13 +711,11 @@ aligned_unique_ptr<NFA> constructReversedNFA_i(const NGHolder &h_in, u32 hint,
     assert(h.kind == NFA_REV_PREFIX); /* triggered, raises internal callbacks */
 
     // Do state numbering.
-    auto state_ids = numberStates(h);
-
-    dropUnusedStarts(h, state_ids);
+    auto state_ids = numberStates(h, {});
 
     // Quick exit: if we've got an embarrassment of riches, i.e. more states
     // than we can implement in our largest NFA model, bail here.
-    u32 numStates = countStates(h, state_ids, false);
+    u32 numStates = countStates(state_ids);
     if (numStates > NFA_MAX_STATES) {
         DEBUG_PRINTF("Can't build an NFA with %u states\n", numStates);
         return nullptr;
@@ -483,7 +723,7 @@ aligned_unique_ptr<NFA> constructReversedNFA_i(const NGHolder &h_in, u32 hint,
 
     assert(sanityCheckGraph(h, state_ids));
 
-    map<u32, NFAVertex> tops; /* only the standards tops for nfas */
+    map<u32, set<NFAVertex>> tops; /* only the standard tops for nfas */
     set<NFAVertex> zombies;
     vector<BoundedRepeatData> repeats;
     map<NFAVertex, NFAStateSet> reportSquashMap;
@@ -518,7 +758,7 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm,
     // Quick check: we can always implement an NFA with less than NFA_MAX_STATES
     // states. Note that top masks can generate extra states, so we account for
     // those here too.
-    if (num_vertices(g) + NFA_MAX_TOP_MASKS < NFA_MAX_STATES) {
+    if (num_vertices(g) + getTops(g).size() < NFA_MAX_STATES) {
         return true;
     }
 
@@ -539,12 +779,12 @@ u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm,
 
     ue2::unordered_map<NFAVertex, u32> state_ids;
     vector<BoundedRepeatData> repeats;
-    map<u32, NFAVertex> tops;
+    map<u32, set<NFAVertex>> tops;
     unique_ptr<NGHolder> h
         = prepareGraph(g, rm, fixed_depth_tops, triggers, impl_test_only, cc,
                        state_ids, repeats, tops);
     assert(h);
-    u32 numStates = countStates(*h, state_ids, false);
+    u32 numStates = countStates(state_ids);
     if (numStates <= NFA_MAX_STATES) {
         return numStates;
     }
@@ -586,12 +826,12 @@ u32 countAccelStates(const NGHolder &g, const ReportManager *rm,
 
     ue2::unordered_map<NFAVertex, u32> state_ids;
     vector<BoundedRepeatData> repeats;
-    map<u32, NFAVertex> tops;
+    map<u32, set<NFAVertex>> tops;
     unique_ptr<NGHolder> h
         = prepareGraph(g, rm, fixed_depth_tops, triggers, impl_test_only, cc,
                        state_ids, repeats, tops);
 
-    if (!h || countStates(*h, state_ids, false) > NFA_MAX_STATES) {
+    if (!h || countStates(state_ids) > NFA_MAX_STATES) {
         DEBUG_PRINTF("not constructible\n");
         return NFA_MAX_ACCEL_STATES + 1;
     }
diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp
index 39788570..71c9a05e 100644
--- a/src/nfagraph/ng_mcclellan.cpp
+++ b/src/nfagraph/ng_mcclellan.cpp
@@ -36,7 +36,6 @@
 #include "nfa/rdfa.h"
 #include "ng_holder.h"
 #include "ng_mcclellan_internal.h"
-#include "ng_restructuring.h"
 #include "ng_squash.h"
 #include "ng_util.h"
 #include "ue2common.h"
@@ -348,10 +347,11 @@ public:
     using StateMap = typename Automaton_Traits::StateMap;
 
     Automaton_Base(const ReportManager *rm_in, const NGHolder &graph_in,
-                   const flat_set<NFAVertex> &unused_in, bool single_trigger,
+                   bool single_trigger,
                    const vector<vector<CharReach>> &triggers, bool prunable_in)
         : rm(rm_in), graph(graph_in), numStates(num_vertices(graph)),
-          unused(unused_in), init(Automaton_Traits::init_states(numStates)),
+          unused(getRedundantStarts(graph_in)),
+          init(Automaton_Traits::init_states(numStates)),
           initDS(Automaton_Traits::init_states(numStates)),
           squash(Automaton_Traits::init_states(numStates)),
           accept(Automaton_Traits::init_states(numStates)),
@@ -444,7 +444,7 @@ private:
 public:
     const NGHolder &graph;
     u32 numStates;
-    const flat_set<NFAVertex> &unused;
+    const flat_set<NFAVertex> unused;
     vector<NFAVertex> v_by_index;
     vector<CharReach> cr_by_index; /* pre alpha'ed */
     StateSet init;
@@ -482,9 +482,9 @@ struct Big_Traits {
 class Automaton_Big : public Automaton_Base<Big_Traits> {
 public:
     Automaton_Big(const ReportManager *rm_in, const NGHolder &graph_in,
-                  const flat_set<NFAVertex> &unused_in, bool single_trigger,
+                  bool single_trigger,
                   const vector<vector<CharReach>> &triggers, bool prunable_in)
-        : Automaton_Base(rm_in, graph_in, unused_in, single_trigger, triggers,
+        : Automaton_Base(rm_in, graph_in, single_trigger, triggers,
                          prunable_in) {}
 };
 
@@ -510,14 +510,36 @@ struct Graph_Traits {
 class Automaton_Graph : public Automaton_Base<Graph_Traits> {
 public:
     Automaton_Graph(const ReportManager *rm_in, const NGHolder &graph_in,
-                  const flat_set<NFAVertex> &unused_in, bool single_trigger,
-                  const vector<vector<CharReach>> &triggers, bool prunable_in)
-        : Automaton_Base(rm_in, graph_in, unused_in, single_trigger, triggers,
+                    bool single_trigger,
+                    const vector<vector<CharReach>> &triggers, bool prunable_in)
+        : Automaton_Base(rm_in, graph_in, single_trigger, triggers,
                          prunable_in) {}
 };
 
 } // namespace
 
+static
+bool startIsRedundant(const NGHolder &g) {
+    set<NFAVertex> start;
+    set<NFAVertex> startDs;
+
+    insert(&start, adjacent_vertices(g.start, g));
+    insert(&startDs, adjacent_vertices(g.startDs, g));
+
+    return start == startDs;
+}
+
+flat_set<NFAVertex> getRedundantStarts(const NGHolder &g) {
+    flat_set<NFAVertex> dead;
+    if (startIsRedundant(g)) {
+        dead.insert(g.start);
+    }
+    if (proper_out_degree(g.startDs, g) == 0) {
+        dead.insert(g.startDs);
+    }
+    return dead;
+}
+
 unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
                                    const ReportManager *rm, bool single_trigger,
                                    const vector<vector<CharReach>> &triggers,
@@ -526,8 +548,6 @@ unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
         return nullptr;
     }
 
-    auto unused = findUnusedStates(graph);
-
     DEBUG_PRINTF("attempting to build ?%d? mcclellan\n", (int)graph.kind);
     assert(allMatchStatesHaveReports(graph));
 
@@ -553,8 +573,7 @@ unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
     if (numStates <= NFA_STATE_LIMIT) {
         /* Fast path. Automaton_Graph uses a bitfield internally to represent
          * states and is quicker than Automaton_Big. */
-        Automaton_Graph n(rm, graph, unused, single_trigger, triggers,
-                          prunable);
+        Automaton_Graph n(rm, graph, single_trigger, triggers, prunable);
         if (determinise(n, rdfa->states, state_limit)) {
             DEBUG_PRINTF("state limit exceeded\n");
             return nullptr; /* over state limit */
@@ -566,7 +585,7 @@ unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
         rdfa->alpha_remap = n.alpha;
     } else {
         /* Slow path. Too many states to use Automaton_Graph. */
-        Automaton_Big n(rm, graph, unused, single_trigger, triggers, prunable);
+        Automaton_Big n(rm, graph, single_trigger, triggers, prunable);
         if (determinise(n, rdfa->states, state_limit)) {
             DEBUG_PRINTF("state limit exceeded\n");
             return nullptr; /* over state limit */
diff --git a/src/nfagraph/ng_mcclellan_internal.h b/src/nfagraph/ng_mcclellan_internal.h
index 22fcf01e..b78dac3b 100644
--- a/src/nfagraph/ng_mcclellan_internal.h
+++ b/src/nfagraph/ng_mcclellan_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,7 +36,6 @@
 #include "ue2common.h"
 #include "nfa/mcclellancompile.h"
 #include "nfagraph/ng_holder.h"
-#include "nfagraph/ng_restructuring.h" // for NO_STATE
 #include "util/charreach.h"
 #include "util/graph_range.h"
 #include "util/ue2_containers.h"
@@ -69,6 +68,13 @@ void markToppableStarts(const NGHolder &g, const flat_set<NFAVertex> &unused,
                         const std::vector<std::vector<CharReach>> &triggers,
                         boost::dynamic_bitset<> *out);
 
+/**
+ * \brief Returns a set of start vertices that will not participate in an
+ * implementation of this graph. These are either starts with no successors or
+ * starts which are redundant with startDs.
+ */
+flat_set<NFAVertex> getRedundantStarts(const NGHolder &g);
+
 template<typename autom>
 void transition_graph(autom &nfa, const std::vector<NFAVertex> &vByStateId,
                       const typename autom::StateSet &in,
diff --git a/src/nfagraph/ng_restructuring.cpp b/src/nfagraph/ng_restructuring.cpp
index c85860c7..46990330 100644
--- a/src/nfagraph/ng_restructuring.cpp
+++ b/src/nfagraph/ng_restructuring.cpp
@@ -49,37 +49,71 @@ namespace ue2 {
 /** Connect the start vertex to each of the vertices in \p tops. This is useful
  * temporarily for when we need to run a graph algorithm that expects a single
  * source vertex. */
-void wireStartToTops(NGHolder &g, const map<u32, NFAVertex> &tops,
-                     vector<NFAEdge> &topEdges) {
-    for (const auto &top : tops) {
-        NFAVertex v = top.second;
+static
+void wireStartToTops(NGHolder &g, const flat_set<NFAVertex> &tops,
+                     vector<NFAEdge> &tempEdges) {
+    for (NFAVertex v : tops) {
         assert(!isLeafNode(v, g));
 
         const NFAEdge &e = add_edge(g.start, v, g).first;
-        topEdges.push_back(e);
+        tempEdges.push_back(e);
     }
 }
 
+/**
+ * Returns true if start's successors (aside from startDs) are a subset of
+ * startDs's proper successors or if start has no successors other than startDs.
+ */
 static
-void getStateOrdering(NGHolder &g, const map<u32, NFAVertex> &tops,
+bool startIsRedundant(const NGHolder &g) {
+    /* We ignore startDs as the self-loop may have been stripped as an
+     * optimisation for repeats (improveLeadingRepeats()). */
+    set<NFAVertex> start;
+    insert(&start,  adjacent_vertices_range(g.start, g));
+    start.erase(g.startDs);
+
+    // Trivial case: start has no successors other than startDs.
+    if (start.empty()) {
+        DEBUG_PRINTF("start has no out-edges other than to startDs\n");
+        return true;
+    }
+
+    set<NFAVertex> startDs;
+    insert(&startDs,  adjacent_vertices_range(g.startDs, g));
+    startDs.erase(g.startDs);
+
+    if (!is_subset_of(start, startDs)) {
+        DEBUG_PRINTF("out-edges of start and startDs aren't equivalent\n");
+        return false;
+    }
+
+    return true;
+}
+
+static
+void getStateOrdering(NGHolder &g, const flat_set<NFAVertex> &tops,
                       vector<NFAVertex> &ordering) {
     // First, wire up our "tops" to start so that we have a single source,
     // which will give a nicer topo order.
-    vector<NFAEdge> topEdges;
-    wireStartToTops(g, tops, topEdges);
+    vector<NFAEdge> tempEdges;
+    wireStartToTops(g, tops, tempEdges);
 
     renumberGraphVertices(g);
 
     vector<NFAVertex> temp = getTopoOrdering(g);
 
-    remove_edges(topEdges, g);
+    remove_edges(tempEdges, g);
 
     // Move {start, startDs} to the end, so they'll be first when we reverse
-    // the ordering.
+    // the ordering (if they are required).
     temp.erase(remove(temp.begin(), temp.end(), g.startDs));
     temp.erase(remove(temp.begin(), temp.end(), g.start));
-    temp.push_back(g.startDs);
-    temp.push_back(g.start);
+    if (proper_out_degree(g.startDs, g)) {
+        temp.push_back(g.startDs);
+    }
+    if (!startIsRedundant(g)) {
+        temp.push_back(g.start);
+    }
 
     // Walk ordering, remove vertices that shouldn't be participating in state
     // numbering, such as accepts.
@@ -149,16 +183,15 @@ void optimiseTightLoops(const NGHolder &g, vector<NFAVertex> &ordering) {
             continue;
         }
 
-        DEBUG_PRINTF("moving vertex %u next to %u\n",
-                     g[v].index, g[u].index);
+        DEBUG_PRINTF("moving vertex %u next to %u\n", g[v].index, g[u].index);
 
         ordering.erase(v_it);
         ordering.insert(++u_it, v);
     }
 }
 
-ue2::unordered_map<NFAVertex, u32>
-numberStates(NGHolder &h, const map<u32, NFAVertex> &tops) {
+unordered_map<NFAVertex, u32>
+numberStates(NGHolder &h, const flat_set<NFAVertex> &tops) {
     DEBUG_PRINTF("numbering states for holder %p\n", &h);
 
     vector<NFAVertex> ordering;
@@ -166,15 +199,10 @@ numberStates(NGHolder &h, const map<u32, NFAVertex> &tops) {
 
     optimiseTightLoops(h, ordering);
 
-    ue2::unordered_map<NFAVertex, u32> states = getStateIndices(h, ordering);
-
-    return states;
+    return getStateIndices(h, ordering);
 }
 
-u32 countStates(const NGHolder &g,
-                const ue2::unordered_map<NFAVertex, u32> &state_ids,
-                bool addTops) {
-    /* TODO: smarter top state allocation, move to limex? */
+u32 countStates(const unordered_map<NFAVertex, u32> &state_ids) {
     if (state_ids.empty()) {
         return 0;
     }
@@ -185,168 +213,9 @@ u32 countStates(const NGHolder &g,
             max_state = max(m.second, max_state);
         }
     }
-
     u32 num_states = max_state + 1;
 
-    assert(contains(state_ids, g.start));
-    if (addTops && is_triggered(g) && state_ids.at(g.start) != NO_STATE) {
-        num_states--;
-        set<u32> tops;
-        for (auto e : out_edges_range(g.start, g)) {
-            insert(&tops, g[e].tops);
-        }
-        num_states += tops.size();
-    }
-
     return num_states;
 }
 
-/**
- * Returns true if start leads to all of startDs's proper successors or if
- * start has no successors other than startDs.
- */
-static
-bool startIsRedundant(const NGHolder &g) {
-    set<NFAVertex> start, startDs;
-
-    for (const auto &e : out_edges_range(g.start, g)) {
-        NFAVertex v = target(e, g);
-        if (v == g.startDs) {
-            continue;
-        }
-        start.insert(v);
-    }
-
-    for (const auto &e : out_edges_range(g.startDs, g)) {
-        NFAVertex v = target(e, g);
-        if (v == g.startDs) {
-            continue;
-        }
-        startDs.insert(v);
-    }
-
-    // Trivial case: start has no successors other than startDs.
-    if (start.empty()) {
-        DEBUG_PRINTF("start has no out-edges other than to startDs\n");
-        return true;
-    }
-
-    if (start != startDs) {
-        DEBUG_PRINTF("out-edges of start and startDs aren't equivalent\n");
-        return false;
-    }
-
-    return true;
-}
-
-/** One final, FINAL optimisation. Drop either start or startDs if it's unused
- * in this graph. We leave this until this late because having both vertices in
- * the graph, with fixed state indices, is useful for merging and other
- * analyses. */
-void dropUnusedStarts(NGHolder &g, ue2::unordered_map<NFAVertex, u32> &states) {
-    u32 adj = 0;
-
-    if (startIsRedundant(g)) {
-        DEBUG_PRINTF("dropping unused start\n");
-        states[g.start] = NO_STATE;
-        adj++;
-    }
-
-    if (proper_out_degree(g.startDs, g) == 0) {
-        DEBUG_PRINTF("dropping unused startDs\n");
-        states[g.startDs] = NO_STATE;
-        adj++;
-    }
-
-    if (!adj) {
-        DEBUG_PRINTF("both start and startDs must remain\n");
-        return;
-    }
-
-    // We have removed one or both of the starts. Walk the non-special vertices
-    // in the graph with state indices assigned to them and subtract
-    // adj from all of them.
-    for (auto v : vertices_range(g)) {
-        u32 &state = states[v]; // note ref
-        if (state == NO_STATE) {
-            continue;
-        }
-        if (is_any_start(v, g)) {
-            assert(state <= 1);
-            state = 0; // one start remains
-        } else {
-            assert(!is_special(v, g));
-            assert(state >= adj);
-            state -= adj;
-        }
-    }
-}
-
-flat_set<NFAVertex> findUnusedStates(const NGHolder &g) {
-    flat_set<NFAVertex> dead;
-    if (startIsRedundant(g)) {
-        dead.insert(g.start);
-    }
-    if (proper_out_degree(g.startDs, g) == 0) {
-        dead.insert(g.startDs);
-    }
-    return dead;
-}
-
-/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to
- * accepts. */
-void reverseHolder(const NGHolder &g_in, NGHolder &g) {
-    // Make the BGL do the grunt work.
-    ue2::unordered_map<NFAVertex, NFAVertex> vertexMap;
-    boost::transpose_graph(g_in.g, g.g,
-                orig_to_copy(boost::make_assoc_property_map(vertexMap)).
-                vertex_index_map(get(&NFAGraphVertexProps::index, g_in.g)));
-
-    // The transpose_graph operation will have created extra copies of our
-    // specials. We have to rewire their neighbours to the 'real' specials and
-    // delete them.
-    NFAVertex start = vertexMap[g_in.acceptEod];
-    NFAVertex startDs = vertexMap[g_in.accept];
-    NFAVertex accept = vertexMap[g_in.startDs];
-    NFAVertex acceptEod = vertexMap[g_in.start];
-
-    // Successors of starts.
-    for (const auto &e : out_edges_range(start, g)) {
-        NFAVertex v = target(e, g);
-        add_edge(g.start, v, g[e], g);
-    }
-    for (const auto &e : out_edges_range(startDs, g)) {
-        NFAVertex v = target(e, g);
-        add_edge(g.startDs, v, g[e], g);
-    }
-
-    // Predecessors of accepts.
-    for (const auto &e : in_edges_range(accept, g)) {
-        NFAVertex u = source(e, g);
-        add_edge(u, g.accept, g[e], g);
-    }
-    for (const auto &e : in_edges_range(acceptEod, g)) {
-        NFAVertex u = source(e, g);
-        add_edge(u, g.acceptEod, g[e], g);
-    }
-
-    // Remove our impostors.
-    clear_vertex(start, g);
-    remove_vertex(start, g);
-    clear_vertex(startDs, g);
-    remove_vertex(startDs, g);
-    clear_vertex(accept, g);
-    remove_vertex(accept, g);
-    clear_vertex(acceptEod, g);
-    remove_vertex(acceptEod, g);
-
-    // Renumber so that g's properties (number of vertices, edges) are
-    // accurate.
-    g.renumberVertices();
-    g.renumberEdges();
-
-    assert(num_vertices(g) == num_vertices(g_in));
-    assert(num_edges(g) == num_edges(g_in));
-}
-
 } // namespace ue2
diff --git a/src/nfagraph/ng_restructuring.h b/src/nfagraph/ng_restructuring.h
index 5e244bf6..bbd478d5 100644
--- a/src/nfagraph/ng_restructuring.h
+++ b/src/nfagraph/ng_restructuring.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,23 +37,8 @@
 #include "ue2common.h"
 #include "util/ue2_containers.h"
 
-#include <map>
-#include <vector>
-
 namespace ue2 {
 
-class NGHolder;
-
-/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to
- * accepts. */
-void reverseHolder(const NGHolder &g, NGHolder &out);
-
-/** Connect the start vertex to each of the vertices in \p tops. This is useful
- * temporarily for when we need to run a graph algorithm that expects a single
- * source vertex. */
-void wireStartToTops(NGHolder &g, const std::map<u32, NFAVertex> &tops,
-                     std::vector<NFAEdge> &topEdges);
-
 /**
  * \brief Special state index value meaning that the vertex will not
  * participate in an (NFA/DFA/etc) implementation.
@@ -63,30 +48,14 @@ static constexpr u32 NO_STATE = ~0;
 /**
  * \brief Gives each participating vertex in the graph a unique state index.
  */
-ue2::unordered_map<NFAVertex, u32>
-numberStates(NGHolder &h,
-             const std::map<u32, NFAVertex> &tops = std::map<u32, NFAVertex>{});
+unordered_map<NFAVertex, u32>
+numberStates(NGHolder &h, const flat_set<NFAVertex> &tops);
 
 /**
  * \brief Counts the number of states (vertices with state indices) in the
  * graph.
- *
- * If addTops is true, also accounts for states that will be constructed for
- * each unique top.
  */
-u32 countStates(const NGHolder &g,
-                const ue2::unordered_map<NFAVertex, u32> &state_ids,
-                bool addTops = true);
-
-/** Optimisation: drop unnecessary start states. */
-void dropUnusedStarts(NGHolder &g, ue2::unordered_map<NFAVertex, u32> &states);
-
-/**
- * \brief Returns a set of vertices that will not participate in an
- * implementation (NFA, DFA etc) of this graph. For example, starts with no
- * successors.
- */
-flat_set<NFAVertex> findUnusedStates(const NGHolder &g);
+u32 countStates(const unordered_map<NFAVertex, u32> &state_ids);
 
 } // namespace ue2
 
diff --git a/src/nfagraph/ng_uncalc_components.cpp b/src/nfagraph/ng_uncalc_components.cpp
index fd6dfc3e..3326d6f4 100644
--- a/src/nfagraph/ng_uncalc_components.cpp
+++ b/src/nfagraph/ng_uncalc_components.cpp
@@ -39,7 +39,6 @@
 #include "ng_limex.h"
 #include "ng_redundancy.h"
 #include "ng_region.h"
-#include "ng_restructuring.h"
 #include "ng_uncalc_components.h"
 #include "ng_util.h"
 #include "ue2common.h"
@@ -55,42 +54,52 @@
 #include <set>
 #include <vector>
 
+#include <boost/range/adaptor/map.hpp>
+
 using namespace std;
+using boost::adaptors::map_values;
 
 namespace ue2 {
 
 static const u32 FAST_STATE_LIMIT = 256; /**< largest possible desirable NFA */
 
 /** Sentinel value meaning no component has yet been selected. */
-static const u32 NO_COMPONENT = 0xffffffffu;
+static const u32 NO_COMPONENT = ~0U;
 
-static
-vector<NFAVertex> getSortedVA(const NGHolder &g,
-            const ue2::unordered_map<NFAVertex, u32> &state_ids) {
-    vector<NFAVertex> out;
-    out.reserve(num_vertices(g));
+static const u32 UNUSED_STATE = ~0U;
 
-    for (auto v : vertices_range(g)) {
-        assert(contains(state_ids, v));
-        if (state_ids.at(v) == NO_STATE) {
-            continue;
+namespace {
+struct ranking_info {
+    explicit ranking_info(const NGHolder &h) : to_vertex(getTopoOrdering(h)) {
+        u32 rank = 0;
+
+        reverse(to_vertex.begin(), to_vertex.end());
+
+        for (NFAVertex v : to_vertex) {
+            to_rank[v] = rank++;
+        }
+
+        for (NFAVertex v : vertices_range(h)) {
+            if (!contains(to_rank, v)) {
+                to_rank[v] = UNUSED_STATE;
+            }
         }
-        out.push_back(v);
     }
 
-    // Order vertices by their state indices.
-    sort(begin(out), end(out), [&state_ids](NFAVertex a, NFAVertex b) {
-        return state_ids.at(a) < state_ids.at(b);
-    });
-
-#ifndef NDEBUG
-    // State indices should match vector indices.
-    for (u32 i = 0; i < out.size(); i++) {
-        assert(state_ids.at(out.at(i)) == i);
+    NFAVertex at(u32 ranking) const { return to_vertex.at(ranking); }
+    u32 get(NFAVertex v) const { return to_rank.at(v); }
+    u32 size() const { return (u32)to_vertex.size(); }
+    u32 add_to_tail(NFAVertex v) {
+        u32 rank = size();
+        to_rank[v] = rank;
+        to_vertex.push_back(v);
+        return rank;
     }
-#endif
 
-    return out;
+private:
+    vector<NFAVertex> to_vertex;
+    unordered_map<NFAVertex, u32> to_rank;
+};
 }
 
 static never_inline
@@ -122,9 +131,9 @@ bool cplVerticesMatch(const NGHolder &ga, NFAVertex va,
 }
 
 static never_inline
-u32 cplCommonReachAndSimple(const NGHolder &ga, const vector<NFAVertex> &a,
-                            const NGHolder &gb, const vector<NFAVertex> &b) {
-    u32 ml = min(a.size(), b.size());
+u32 cplCommonReachAndSimple(const NGHolder &ga, const ranking_info &a_ranking,
+                            const NGHolder &gb, const ranking_info &b_ranking) {
+    u32 ml = min(a_ranking.size(), b_ranking.size());
     if (ml > 65535) {
         ml = 65535;
     }
@@ -133,7 +142,7 @@ u32 cplCommonReachAndSimple(const NGHolder &ga, const vector<NFAVertex> &a,
     // "startedness" properties.
     u32 max = 0;
     for (; max < ml; max++) {
-        if (!cplVerticesMatch(ga, a[max], gb, b[max])) {
+        if (!cplVerticesMatch(ga, a_ranking.at(max), gb, b_ranking.at(max))) {
             break;
         }
     }
@@ -141,34 +150,30 @@ u32 cplCommonReachAndSimple(const NGHolder &ga, const vector<NFAVertex> &a,
     return max;
 }
 
-u32 commonPrefixLength(const NGHolder &ga,
-                       const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
-                       const NGHolder &gb,
-                       const ue2::unordered_map<NFAVertex, u32> &b_state_ids) {
-    vector<NFAVertex> a = getSortedVA(ga, a_state_ids);
-    vector<NFAVertex> b = getSortedVA(gb, b_state_ids);
-
+static
+u32 commonPrefixLength(const NGHolder &ga, const ranking_info &a_ranking,
+                       const NGHolder &gb, const ranking_info &b_ranking) {
     /* upper bound on the common region based on local properties */
-    u32 max = cplCommonReachAndSimple(ga, a, gb, b);
+    u32 max = cplCommonReachAndSimple(ga, a_ranking, gb, b_ranking);
     DEBUG_PRINTF("cpl upper bound %u\n", max);
 
     while (max > 0) {
-        bool ok = true;
-
         /* shrink max region based on in-edges from outside the region */
         for (size_t j = max; j > 0; j--) {
-            for (auto u : inv_adjacent_vertices_range(a[j - 1], ga)) {
-                u32 state_id = a_state_ids.at(u);
-                if (state_id != NO_STATE && state_id >= max) {
+            NFAVertex a_v = a_ranking.at(j - 1);
+            NFAVertex b_v = b_ranking.at(j - 1);
+            for (auto u : inv_adjacent_vertices_range(a_v, ga)) {
+                u32 state_id = a_ranking.get(u);
+                if (state_id != UNUSED_STATE && state_id >= max) {
                     max = j - 1;
                     DEBUG_PRINTF("lowering max to %u\n", max);
                     goto next_vertex;
                 }
             }
 
-            for (auto u : inv_adjacent_vertices_range(b[j - 1], gb)) {
-                u32 state_id = b_state_ids.at(u);
-                if (state_id != NO_STATE && state_id >= max) {
+            for (auto u : inv_adjacent_vertices_range(b_v, gb)) {
+                u32 state_id = b_ranking.get(u);
+                if (state_id != UNUSED_STATE && state_id >= max) {
                     max = j - 1;
                     DEBUG_PRINTF("lowering max to %u\n", max);
                     goto next_vertex;
@@ -180,14 +185,13 @@ u32 commonPrefixLength(const NGHolder &ga,
 
         /* Ensure that every pair of vertices has same out-edges to vertices in
            the region. */
-        for (size_t i = 0; ok && i < max; i++) {
+        for (size_t i = 0; i < max; i++) {
             size_t a_count = 0;
             size_t b_count = 0;
 
-            NGHolder::out_edge_iterator ei, ee;
-            for (tie(ei, ee) = out_edges(a[i], ga); ok && ei != ee; ++ei) {
-                u32 sid = a_state_ids.at(target(*ei, ga));
-                if (sid == NO_STATE || sid >= max) {
+            for (NFAEdge a_edge : out_edges_range(a_ranking.at(i), ga)) {
+                u32 sid = a_ranking.get(target(a_edge, ga));
+                if (sid == UNUSED_STATE || sid >= max) {
                     continue;
                 }
 
@@ -195,28 +199,26 @@ u32 commonPrefixLength(const NGHolder &ga,
 
                 NFAEdge b_edge;
                 bool has_b_edge;
-                tie(b_edge, has_b_edge) = edge(b[i], b[sid], gb);
+                tie(b_edge, has_b_edge) = edge(b_ranking.at(i),
+                                               b_ranking.at(sid), gb);
 
                 if (!has_b_edge) {
                     max = i;
-                    ok = false;
                     DEBUG_PRINTF("lowering max to %u due to edge %zu->%u\n",
                                  max, i, sid);
-                    break;
+                    goto try_smaller;
                 }
 
-                if (ga[*ei].tops != gb[b_edge].tops) {
+                if (ga[a_edge].tops != gb[b_edge].tops) {
                     max = i;
-                    ok = false;
                     DEBUG_PRINTF("tops don't match on edge %zu->%u\n", i, sid);
+                    goto try_smaller;
                 }
             }
 
-            NGHolder::adjacency_iterator ai, ae;
-            for (tie(ai, ae) = adjacent_vertices(b[i], gb); ok && ai != ae;
-                 ++ai) {
-                u32 sid = b_state_ids.at(*ai);
-                if (sid == NO_STATE || sid >= max) {
+            for (NFAVertex b_v : adjacent_vertices_range(b_ranking.at(i), gb)) {
+                u32 sid = b_ranking.get(b_v);
+                if (sid == UNUSED_STATE || sid >= max) {
                     continue;
                 }
 
@@ -225,28 +227,32 @@ u32 commonPrefixLength(const NGHolder &ga,
 
             if (a_count != b_count) {
                 max = i;
-                DEBUG_PRINTF("lowering max to %u due to a,b count "
-                             "(a_count=%zu, b_count=%zu)\n", max, a_count,
-                             b_count);
-                ok = false;
+                DEBUG_PRINTF("lowering max to %u due to a,b count (a_count=%zu,"
+                             " b_count=%zu)\n", max, a_count, b_count);
+                goto try_smaller;
             }
         }
 
-        if (ok) {
-            DEBUG_PRINTF("survived checks, returning cpl %u\n", max);
-            return max;
-        }
+        DEBUG_PRINTF("survived checks, returning cpl %u\n", max);
+        return max;
+    try_smaller:;
     }
 
     DEBUG_PRINTF("failed to find any common region\n");
     return 0;
 }
 
+u32 commonPrefixLength(const NGHolder &ga, const NGHolder &gb) {
+    return commonPrefixLength(ga, ranking_info(ga), gb, ranking_info(gb));
+}
+
 static never_inline
-void mergeNfa(NGHolder &dest, vector<NFAVertex> &destStateMap,
-              ue2::unordered_map<NFAVertex, u32> &dest_state_ids,
-              NGHolder &vic, vector<NFAVertex> &vicStateMap,
-              size_t common_len) {
+void mergeNfaComponent(NGHolder &dest, const NGHolder &vic, size_t common_len) {
+    assert(&dest != &vic);
+
+    auto dest_info = ranking_info(dest);
+    auto vic_info = ranking_info(vic);
+
     map<NFAVertex, NFAVertex> vmap; // vic -> dest
 
     vmap[vic.start]     = dest.start;
@@ -255,22 +261,20 @@ void mergeNfa(NGHolder &dest, vector<NFAVertex> &destStateMap,
     vmap[vic.acceptEod] = dest.acceptEod;
     vmap[nullptr] = nullptr;
 
-    u32 stateNum = countStates(dest, dest_state_ids);
-
     // For vertices in the common len, add to vmap and merge in the reports, if
     // any.
     for (u32 i = 0; i < common_len; i++) {
-        NFAVertex v_old = vicStateMap[i], v = destStateMap[i];
+        NFAVertex v_old = vic_info.at(i);
+        NFAVertex v = dest_info.at(i);
         vmap[v_old] = v;
 
         const auto &reports = vic[v_old].reports;
         dest[v].reports.insert(reports.begin(), reports.end());
     }
 
-    // Add in vertices beyond the common len, giving them state numbers
-    // starting at stateNum.
-    for (u32 i = common_len; i < vicStateMap.size(); i++) {
-        NFAVertex v_old = vicStateMap[i];
+    // Add in vertices beyond the common len
+    for (u32 i = common_len; i < vic_info.size(); i++) {
+        NFAVertex v_old = vic_info.at(i);
 
         if (is_special(v_old, vic)) {
             // Dest already has start vertices, just merge the reports.
@@ -282,15 +286,17 @@ void mergeNfa(NGHolder &dest, vector<NFAVertex> &destStateMap,
         }
 
         NFAVertex v = add_vertex(vic[v_old], dest);
-        dest_state_ids[v] = stateNum++;
+        dest_info.add_to_tail(v);
         vmap[v_old] = v;
     }
 
     /* add edges */
     DEBUG_PRINTF("common_len=%zu\n", common_len);
     for (const auto &e : edges_range(vic)) {
-        NFAVertex u_old = source(e, vic), v_old = target(e, vic);
-        NFAVertex u = vmap[u_old], v = vmap[v_old];
+        NFAVertex u_old = source(e, vic);
+        NFAVertex v_old = target(e, vic);
+        NFAVertex u = vmap[u_old];
+        NFAVertex v = vmap[v_old];
         bool uspecial = is_special(u, dest);
         bool vspecial = is_special(v, dest);
 
@@ -301,15 +307,14 @@ void mergeNfa(NGHolder &dest, vector<NFAVertex> &destStateMap,
 
         // We're in the common region if v's state ID is low enough, unless v
         // is a special (an accept), in which case we use u's state ID.
-        assert(contains(dest_state_ids, v));
-        bool in_common_region = dest_state_ids.at(v) < common_len;
-        if (vspecial && dest_state_ids.at(u) < common_len) {
+        bool in_common_region = dest_info.get(v) < common_len;
+        if (vspecial && dest_info.get(u) < common_len) {
             in_common_region = true;
         }
 
         DEBUG_PRINTF("adding idx=%u (state %u) -> idx=%u (state %u)%s\n",
-                     dest[u].index, dest_state_ids.at(u),
-                     dest[v].index, dest_state_ids.at(v),
+                     dest[u].index, dest_info.get(u),
+                     dest[v].index, dest_info.get(v),
                      in_common_region ? " [common]" : "");
 
         if (in_common_region) {
@@ -337,18 +342,6 @@ void mergeNfa(NGHolder &dest, vector<NFAVertex> &destStateMap,
     dest.renumberVertices();
 }
 
-static never_inline
-void mergeNfaComponent(NGHolder &pholder, NGHolder &vholder, size_t cpl) {
-    assert(&pholder != &vholder);
-
-    auto v_state_ids = numberStates(vholder);
-    auto p_state_ids = numberStates(pholder);
-    auto vhvmap = getSortedVA(vholder, v_state_ids);
-    auto phvmap = getSortedVA(pholder, p_state_ids);
-
-    mergeNfa(pholder, phvmap, p_state_ids, vholder, vhvmap, cpl);
-}
-
 namespace {
 struct NfaMergeCandidateH {
     NfaMergeCandidateH(size_t cpl_in, NGHolder *first_in, NGHolder *second_in,
@@ -373,14 +366,19 @@ struct NfaMergeCandidateH {
 
 /** Returns true if graphs \p h1 and \p h2 can (and should) be merged. */
 static
-bool shouldMerge(NGHolder &ha,
-                 const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
-                 NGHolder &hb,
-                 const ue2::unordered_map<NFAVertex, u32> &b_state_ids,
-                 size_t cpl, const ReportManager *rm,
-                 const CompileContext &cc) {
-    size_t combinedStateCount =
-        countStates(ha, a_state_ids) + countStates(hb, b_state_ids) - cpl;
+bool shouldMerge(const NGHolder &ha, const NGHolder &hb, size_t cpl,
+                 const ReportManager *rm, const CompileContext &cc) {
+    size_t combinedStateCount = num_vertices(ha) + num_vertices(hb) - cpl;
+
+    combinedStateCount -= 2 * 2; /* discount accepts from both */
+
+    if (is_triggered(ha)) {
+        /* allow for a state for each top, ignore existing starts */
+        combinedStateCount -= 2; /* for start, startDs */
+        auto tops = getTops(ha);
+        insert(&tops, getTops(hb));
+        combinedStateCount += tops.size();
+    }
 
     if (combinedStateCount > FAST_STATE_LIMIT) {
         // More complex implementability check.
@@ -423,11 +421,13 @@ void buildNfaMergeQueue(const vector<NGHolder *> &cluster,
 
     // First, make sure all holders have numbered states and collect their
     // counts.
-    vector<ue2::unordered_map<NFAVertex, u32>> states_map(cs);
+    vector<ranking_info> states_map;
+    states_map.reserve(cs);
     for (size_t i = 0; i < cs; i++) {
         assert(cluster[i]);
-        NGHolder &g = *(cluster[i]);
-        states_map[i] = numberStates(g);
+        assert(states_map.size() == i);
+        const NGHolder &g = *(cluster[i]);
+        states_map.emplace_back(g);
     }
 
     vector<u16> seen_cpl(cs * cs, 0);
@@ -536,11 +536,9 @@ bool mergeableStarts(const NGHolder &h1, const NGHolder &h2) {
 }
 
 /** Merge graph \p ga into graph \p gb. Returns false on failure. */
-bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm,
+bool mergeNfaPair(const NGHolder &ga, NGHolder &gb, const ReportManager *rm,
                   const CompileContext &cc) {
     assert(ga.kind == gb.kind);
-    auto a_state_ids = numberStates(ga);
-    auto b_state_ids = numberStates(gb);
 
     // Vacuous NFAs require special checks on their starts to ensure that tops
     // match, and that reports match for mixed-accept cases.
@@ -549,14 +547,13 @@ bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm,
         return false;
     }
 
-    u32 cpl = commonPrefixLength(ga, a_state_ids, gb, b_state_ids);
-    if (!shouldMerge(gb, b_state_ids, ga, a_state_ids, cpl, rm, cc)) {
+    u32 cpl = commonPrefixLength(ga, gb);
+    if (!shouldMerge(gb, ga, cpl, rm, cc)) {
         return false;
     }
 
     mergeNfaComponent(gb, ga, cpl);
     reduceImplementableGraph(gb, SOM_NONE, rm, cc);
-    b_state_ids = numberStates(gb);
     return true;
 }
 
diff --git a/src/nfagraph/ng_uncalc_components.h b/src/nfagraph/ng_uncalc_components.h
index 5f341961..ddab8825 100644
--- a/src/nfagraph/ng_uncalc_components.h
+++ b/src/nfagraph/ng_uncalc_components.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -52,10 +52,7 @@ class ReportManager;
  * The CPL is calculated based the topological ordering given by the state
  * indices for each graph.
  */
-u32 commonPrefixLength(const NGHolder &ga,
-                       const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
-                       const NGHolder &gb,
-                       const ue2::unordered_map<NFAVertex, u32> &b_state_ids);
+u32 commonPrefixLength(const NGHolder &ga, const NGHolder &gb);
 
 /**
  * \brief Merge the group of graphs in \p cluster where possible.
@@ -73,7 +70,7 @@ void mergeNfaCluster(const std::vector<NGHolder *> &cluster,
  * Returns false on failure. On success, \p gb is reduced via \ref
  * reduceImplementableGraph and renumbered.
  */
-bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm,
+bool mergeNfaPair(const NGHolder &ga, NGHolder &gb, const ReportManager *rm,
                   const CompileContext &cc);
 
 } // namespace ue2
diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp
index da9c2438..71eef7eb 100644
--- a/src/nfagraph/ng_util.cpp
+++ b/src/nfagraph/ng_util.cpp
@@ -343,6 +343,47 @@ bool is_virtual_start(NFAVertex v, const NGHolder &g) {
     return g[v].assert_flags & POS_FLAG_VIRTUAL_START;
 }
 
+static
+void reorderSpecials(const NGHolder &g, vector<NFAVertex> &topoOrder) {
+    // Start is last element of reverse topo ordering.
+    auto it = find(topoOrder.begin(), topoOrder.end(), g.start);
+    if (it != topoOrder.end() - 1) {
+        DEBUG_PRINTF("repositioning start\n");
+        assert(it != topoOrder.end());
+        topoOrder.erase(it);
+        topoOrder.insert(topoOrder.end(), g.start);
+    }
+
+    // StartDs is second-to-last element of reverse topo ordering.
+    it = find(topoOrder.begin(), topoOrder.end(), g.startDs);
+    if (it != topoOrder.end() - 2) {
+        DEBUG_PRINTF("repositioning start ds\n");
+        assert(it != topoOrder.end());
+        topoOrder.erase(it);
+        topoOrder.insert(topoOrder.end() - 1, g.startDs);
+    }
+
+    // AcceptEOD is first element of reverse topo ordering.
+    it = find(topoOrder.begin(), topoOrder.end(), g.acceptEod);
+    if (it != topoOrder.begin()) {
+        DEBUG_PRINTF("repositioning accept\n");
+        assert(it != topoOrder.end());
+        topoOrder.erase(it);
+        topoOrder.insert(topoOrder.begin(), g.acceptEod);
+    }
+
+    // Accept is second element of reverse topo ordering, if it's connected.
+    it = find(topoOrder.begin(), topoOrder.end(), g.accept);
+    if (it != topoOrder.begin() + 1) {
+        DEBUG_PRINTF("repositioning accept\n");
+        assert(it != topoOrder.end());
+        topoOrder.erase(it);
+        if (in_degree(g.accept, g) != 0) {
+            topoOrder.insert(topoOrder.begin() + 1, g.accept);
+        }
+    }
+}
+
 vector<NFAVertex> getTopoOrdering(const NGHolder &g) {
     assert(hasCorrectlyNumberedVertices(g));
 
@@ -372,6 +413,8 @@ vector<NFAVertex> getTopoOrdering(const NGHolder &g) {
         color_map(make_iterator_property_map(colour.begin(), index_map))
             .vertex_index_map(index_map));
 
+    reorderSpecials(g, ordering);
+
     return ordering;
 }
 
@@ -629,6 +672,60 @@ unique_ptr<NGHolder> cloneHolder(const NGHolder &in) {
     return h;
 }
 
+void reverseHolder(const NGHolder &g_in, NGHolder &g) {
+    // Make the BGL do the grunt work.
+    ue2::unordered_map<NFAVertex, NFAVertex> vertexMap;
+    boost::transpose_graph(g_in.g, g.g,
+                orig_to_copy(boost::make_assoc_property_map(vertexMap)).
+                vertex_index_map(get(&NFAGraphVertexProps::index, g_in.g)));
+
+    // The transpose_graph operation will have created extra copies of our
+    // specials. We have to rewire their neighbours to the 'real' specials and
+    // delete them.
+    NFAVertex start = vertexMap[g_in.acceptEod];
+    NFAVertex startDs = vertexMap[g_in.accept];
+    NFAVertex accept = vertexMap[g_in.startDs];
+    NFAVertex acceptEod = vertexMap[g_in.start];
+
+    // Successors of starts.
+    for (const auto &e : out_edges_range(start, g)) {
+        NFAVertex v = target(e, g);
+        add_edge(g.start, v, g[e], g);
+    }
+    for (const auto &e : out_edges_range(startDs, g)) {
+        NFAVertex v = target(e, g);
+        add_edge(g.startDs, v, g[e], g);
+    }
+
+    // Predecessors of accepts.
+    for (const auto &e : in_edges_range(accept, g)) {
+        NFAVertex u = source(e, g);
+        add_edge(u, g.accept, g[e], g);
+    }
+    for (const auto &e : in_edges_range(acceptEod, g)) {
+        NFAVertex u = source(e, g);
+        add_edge(u, g.acceptEod, g[e], g);
+    }
+
+    // Remove our impostors.
+    clear_vertex(start, g);
+    remove_vertex(start, g);
+    clear_vertex(startDs, g);
+    remove_vertex(startDs, g);
+    clear_vertex(accept, g);
+    remove_vertex(accept, g);
+    clear_vertex(acceptEod, g);
+    remove_vertex(acceptEod, g);
+
+    // Renumber so that g's properties (number of vertices, edges) are
+    // accurate.
+    g.renumberVertices();
+    g.renumberEdges();
+
+    assert(num_vertices(g) == num_vertices(g_in));
+    assert(num_edges(g) == num_edges(g_in));
+}
+
 #ifndef NDEBUG
 
 bool allMatchStatesHaveReports(const NGHolder &g) {
diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h
index 1c6dd461..6c6907a3 100644
--- a/src/nfagraph/ng_util.h
+++ b/src/nfagraph/ng_util.h
@@ -174,7 +174,11 @@ bool is_match_vertex(NFAVertex v, const GraphT &g) {
 }
 
 /** Generate a reverse topological ordering for a back-edge filtered version of
- * our graph (as it must be a DAG and correctly numbered) */
+ * our graph (as it must be a DAG and correctly numbered).
+ *
+ * Note: we ensure that we produce a topo ordering that begins with acceptEod
+ * and accept (if present) and ends with startDs followed by start.
+ */
 std::vector<NFAVertex> getTopoOrdering(const NGHolder &g);
 
 /** Comparison functor used to sort by vertex_index. */
@@ -300,6 +304,10 @@ void clearReports(NGHolder &g);
  * r_old. */
 void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new);
 
+/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to
+ * accepts. */
+void reverseHolder(const NGHolder &g, NGHolder &out);
+
 #ifndef NDEBUG
 
 // Assertions: only available in internal builds.
diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp
index 6b19549b..38c488be 100644
--- a/src/rose/rose_build_compile.cpp
+++ b/src/rose/rose_build_compile.cpp
@@ -47,6 +47,7 @@
 #include "nfagraph/ng_is_equal.h"
 #include "nfagraph/ng_limex.h"
 #include "nfagraph/ng_mcclellan.h"
+#include "nfagraph/ng_prune.h"
 #include "nfagraph/ng_repeat.h"
 #include "nfagraph/ng_reports.h"
 #include "nfagraph/ng_stop.h"
@@ -788,19 +789,230 @@ void RoseBuildImpl::findTransientLeftfixes(void) {
 
 /** Find all the different roses and their associated literals. */
 static
-map<left_id, vector<RoseVertex>> findLeftSucc(RoseBuildImpl &tbi) {
+map<left_id, vector<RoseVertex>> findLeftSucc(const RoseBuildImpl &build) {
     map<left_id, vector<RoseVertex>> leftfixes;
-    for (auto v : vertices_range(tbi.g)) {
-        if (tbi.g[v].left) {
-            const LeftEngInfo &lei = tbi.g[v].left;
+    for (auto v : vertices_range(build.g)) {
+        if (build.g[v].left) {
+            const LeftEngInfo &lei = build.g[v].left;
             leftfixes[lei].push_back(v);
         }
     }
     return leftfixes;
 }
 
+namespace {
+struct infix_info {
+    set<RoseVertex> preds;
+    set<RoseVertex> succs;
+};
+}
+
 static
-bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
+map<NGHolder *, infix_info> findInfixGraphInfo(const RoseBuildImpl &build) {
+    map<NGHolder *, infix_info> rv;
+
+    for (auto v : vertices_range(build.g)) {
+        if (!build.g[v].left) {
+            continue;
+        }
+
+        if (build.isRootSuccessor(v)) {
+            DEBUG_PRINTF("a prefix is never an infix\n");
+            continue;
+        }
+
+        /* ensure only proper nfas */
+        const LeftEngInfo &lei = build.g[v].left;
+        if (!lei.graph) {
+            continue;
+        }
+        if (lei.haig || lei.dfa) {
+            continue;
+        }
+        assert(!lei.castle);
+        infix_info &info = rv[lei.graph.get()];
+        insert(&info.preds, inv_adjacent_vertices_range(v, build.g));
+        info.succs.insert(v);
+    }
+
+    return rv;
+}
+
+static
+map<u32, flat_set<NFAEdge>> getTopInfo(const NGHolder &h) {
+    map<u32, flat_set<NFAEdge>> rv;
+    for (NFAEdge e : out_edges_range(h.start, h)) {
+        for (u32 t : h[e].tops) {
+            rv[t].insert(e);
+        }
+    }
+    return rv;
+}
+
+static
+u32 findUnusedTop(const map<u32, flat_set<NFAEdge>> &tops) {
+    u32 i = 0;
+    while (contains(tops, i)) {
+        i++;
+    }
+    return i;
+}
+
+static
+bool reduceTopTriggerLoad(RoseBuildImpl &build, NGHolder &h, RoseVertex u) {
+    RoseGraph &g = build.g;
+
+    set<u32> tops; /* tops triggered by u */
+    for (RoseEdge e : out_edges_range(u, g)) {
+        RoseVertex v = target(e, g);
+        if (g[v].left.graph.get() != &h) {
+            continue;
+        }
+        tops.insert(g[e].rose_top);
+    }
+
+    assert(!tops.empty());
+    if (tops.size() <= 1) {
+        return false;
+    }
+    DEBUG_PRINTF("%zu triggers %zu tops for %p\n", build.g[u].idx, tops.size(),
+                 &h);
+
+    auto h_top_info = getTopInfo(h);
+    flat_set<NFAEdge> edges_to_trigger;
+    for (u32 t : tops) {
+        insert(&edges_to_trigger, h_top_info[t]);
+    }
+
+    u32 new_top = ~0U;
+    /* check if there is already a top with the right successor set */
+    for (const auto &elem : h_top_info) {
+        if (elem.second == edges_to_trigger) {
+            new_top = elem.first;
+            break;
+        }
+    }
+
+    /* if no existing suitable top, add a new top for us */
+    if (new_top == ~0U) {
+        new_top = findUnusedTop(h_top_info);
+
+        /* add top to edges out of start */
+        for (NFAEdge e : out_edges_range(h.start, h)) {
+            if (has_intersection(tops, h[e].tops)) {
+                h[e].tops.insert(new_top);
+            }
+        }
+
+        /* check still implementable if we add a new top */
+        if (!isImplementableNFA(h, nullptr, build.cc)) {
+            DEBUG_PRINTF("unable to add new top\n");
+            for (NFAEdge e : out_edges_range(h.start, h)) {
+                h[e].tops.erase(new_top);
+            }
+            /* we should be back to the original graph */
+            assert(isImplementableNFA(h, nullptr, build.cc));
+            return false;
+        }
+    }
+
+    DEBUG_PRINTF("using new merged top %u\n", new_top);
+    assert(new_top != ~0U);
+    for (RoseEdge e: out_edges_range(u, g)) {
+        RoseVertex v = target(e, g);
+        if (g[v].left.graph.get() != &h) {
+            continue;
+        }
+        g[e].rose_top = new_top;
+    }
+
+    return true;
+}
+
+static
+void packInfixTops(NGHolder &h, RoseGraph &g,
+                   const set<RoseVertex> &verts) {
+    if (!is_triggered(h)) {
+        DEBUG_PRINTF("not triggered, no tops\n");
+        return;
+    }
+    assert(isCorrectlyTopped(h));
+    DEBUG_PRINTF("pruning unused tops\n");
+    flat_set<u32> used_tops;
+    for (auto v : verts) {
+        assert(g[v].left.graph.get() == &h);
+
+        for (const auto &e : in_edges_range(v, g)) {
+            u32 top = g[e].rose_top;
+            used_tops.insert(top);
+        }
+    }
+
+    map<u32, u32> top_mapping;
+    for (u32 t : used_tops) {
+        u32 new_top = top_mapping.size();
+        top_mapping[t] = new_top;
+    }
+
+    for (auto v : verts) {
+        assert(g[v].left.graph.get() == &h);
+
+        for (const auto &e : in_edges_range(v, g)) {
+            g[e].rose_top = top_mapping.at(g[e].rose_top);
+        }
+    }
+
+    vector<NFAEdge> dead;
+    for (const auto &e : out_edges_range(h.start, h)) {
+        NFAVertex v = target(e, h);
+        if (v == h.startDs) {
+            continue; // stylised edge, leave it alone.
+        }
+        flat_set<u32> updated_tops;
+        for (u32 t : h[e].tops) {
+            if (contains(top_mapping, t)) {
+                updated_tops.insert(top_mapping.at(t));
+            }
+        }
+        h[e].tops = move(updated_tops);
+        if (h[e].tops.empty()) {
+            DEBUG_PRINTF("edge (start,%u) has only unused tops\n", h[v].index);
+            dead.push_back(e);
+        }
+    }
+
+    if (dead.empty()) {
+        return;
+    }
+
+    remove_edges(dead, h);
+    pruneUseless(h);
+    clearReports(h); // As we may have removed vacuous edges.
+}
+
+static
+void reduceTopTriggerLoad(RoseBuildImpl &build) {
+    auto infixes = findInfixGraphInfo(build);
+
+    for (auto &p : infixes) {
+        if (onlyOneTop(*p.first)) {
+            continue;
+        }
+
+        bool changed = false;
+        for (RoseVertex v : p.second.preds) {
+            changed |= reduceTopTriggerLoad(build, *p.first, v);
+        }
+
+        if (changed) {
+            packInfixTops(*p.first, build.g, p.second.succs);
+            reduceImplementableGraph(*p.first, SOM_NONE, nullptr, build.cc);
+        }
+    }
+}
+
+static
+bool triggerKillsRoseGraph(const RoseBuildImpl &build, const left_id &left,
                            const set<ue2_literal> &all_lits,
                            const RoseEdge &e) {
     assert(left.graph());
@@ -816,8 +1028,8 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
 
     /* check each pred literal to see if they all kill previous graph
      * state */
-    for (u32 lit_id : tbi.g[source(e, tbi.g)].literals) {
-        const rose_literal_id &pred_lit = tbi.literals.right.at(lit_id);
+    for (u32 lit_id : build.g[source(e, build.g)].literals) {
+        const rose_literal_id &pred_lit = build.literals.right.at(lit_id);
         const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s);
 
         DEBUG_PRINTF("running graph %zu\n", states.size());
@@ -833,7 +1045,7 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
 }
 
 static
-bool triggerKillsRose(const RoseBuildImpl &tbi, const left_id &left,
+bool triggerKillsRose(const RoseBuildImpl &build, const left_id &left,
                       const set<ue2_literal> &all_lits, const RoseEdge &e) {
     if (left.haig()) {
         /* TODO: To allow this for som-based engines we would also need to
@@ -843,32 +1055,30 @@ bool triggerKillsRose(const RoseBuildImpl &tbi, const left_id &left,
     }
 
     if (left.graph()) {
-        return triggerKillsRoseGraph(tbi, left, all_lits, e);
+        return triggerKillsRoseGraph(build, left, all_lits, e);
     }
 
     if (left.castle()) {
-        return triggerKillsRoseCastle(tbi, left, all_lits, e);
+        return triggerKillsRoseCastle(build, left, all_lits, e);
     }
 
     return false;
 }
 
+/* Sometimes the arrival of a top for a rose infix can ensure that the nfa would
+ * be dead at that time. In the case of multiple trigger literals, we can only
+ * base our decision on that portion of literal after any overlapping literals.
+ */
 static
-void inspectRoseTops(RoseBuildImpl &tbi) {
-    /* Sometimes the arrival of a top for a rose infix can ensure that the nfa
-     * would be dead at that time. In the case of multiple trigger literals we
-     * can only base our decision on that portion of literal after any
-     * overlapping literals */
+void findTopTriggerCancels(RoseBuildImpl &build) {
+    auto left_succ = findLeftSucc(build); /* leftfixes -> succ verts */
 
-    map<left_id, vector<RoseVertex>> roses =
-        findLeftSucc(tbi); /* rose -> succ verts */
-
-    for (const auto &r : roses) {
+    for (const auto &r : left_succ) {
         const left_id &left = r.first;
         const vector<RoseVertex> &succs = r.second;
 
         assert(!succs.empty());
-        if (tbi.isRootSuccessor(*succs.begin())) {
+        if (build.isRootSuccessor(*succs.begin())) {
             /* a prefix is never an infix */
             continue;
         }
@@ -878,10 +1088,10 @@ void inspectRoseTops(RoseBuildImpl &tbi) {
         set<u32> pred_lit_ids;
 
         for (auto v : succs) {
-            for (const auto &e : in_edges_range(v, tbi.g)) {
-                RoseVertex u = source(e, tbi.g);
-                tops_seen.insert(tbi.g[e].rose_top);
-                insert(&pred_lit_ids, tbi.g[u].literals);
+            for (const auto &e : in_edges_range(v, build.g)) {
+                RoseVertex u = source(e, build.g);
+                tops_seen.insert(build.g[e].rose_top);
+                insert(&pred_lit_ids, build.g[u].literals);
                 rose_edges.insert(e);
             }
         }
@@ -893,7 +1103,7 @@ void inspectRoseTops(RoseBuildImpl &tbi) {
         }
 
         for (u32 lit_id : pred_lit_ids) {
-            const rose_literal_id &p_lit = tbi.literals.right.at(lit_id);
+            const rose_literal_id &p_lit = build.literals.right.at(lit_id);
             if (p_lit.delay || p_lit.table == ROSE_ANCHORED) {
                 goto next_rose;
             }
@@ -905,15 +1115,22 @@ void inspectRoseTops(RoseBuildImpl &tbi) {
                      all_lits.size(), rose_edges.size());
 
         for (const auto &e : rose_edges) {
-            if (triggerKillsRose(tbi, left, all_lits, e)) {
+            if (triggerKillsRose(build, left, all_lits, e)) {
                 DEBUG_PRINTF("top will override previous rose state\n");
-                tbi.g[e].rose_cancel_prev_top = true;
+                build.g[e].rose_cancel_prev_top = true;
             }
         }
     next_rose:;
     }
 }
 
+static
+void optimiseRoseTops(RoseBuildImpl &build) {
+    reduceTopTriggerLoad(build);
+    /* prune unused tops ? */
+    findTopTriggerCancels(build);
+}
+
 static
 void buildRoseSquashMasks(RoseBuildImpl &tbi) {
     /* Rose nfa squash masks are applied to the groups when the nfa can no
@@ -1492,7 +1709,7 @@ aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildRose(u32 minWidth) {
 
     /* final prep work */
     remapCastleTops(*this);
-    inspectRoseTops(*this);
+    optimiseRoseTops(*this);
     buildRoseSquashMasks(*this);
 
     rm.assignDkeys(this);
diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp
index 01134736..054dd12f 100644
--- a/src/rose/rose_build_merge.cpp
+++ b/src/rose/rose_build_merge.cpp
@@ -53,7 +53,6 @@
 #include "nfagraph/ng_redundancy.h"
 #include "nfagraph/ng_repeat.h"
 #include "nfagraph/ng_reports.h"
-#include "nfagraph/ng_restructuring.h"
 #include "nfagraph/ng_stop.h"
 #include "nfagraph/ng_uncalc_components.h"
 #include "nfagraph/ng_util.h"
@@ -1457,11 +1456,7 @@ bool hasReformedStartDotStar(const NGHolder &h, const Grey &grey) {
 static
 u32 commonPrefixLength(left_id &r1, left_id &r2) {
     if (r1.graph() && r2.graph()) {
-        auto &g1 = *r1.graph();
-        auto &g2 = *r2.graph();
-        auto state_ids_1 = numberStates(g1);
-        auto state_ids_2 = numberStates(g2);
-        return commonPrefixLength(g1, state_ids_1, g2, state_ids_2);
+        return commonPrefixLength(*r1.graph(), *r2.graph());
     } else if (r1.castle() && r2.castle()) {
         return min(findMinWidth(*r1.castle()), findMinWidth(*r2.castle()));
     }
@@ -1750,7 +1745,6 @@ u32 findUnusedTop(const ue2::flat_set<u32> &tops) {
     while (contains(tops, i)) {
         i++;
     }
-    assert(i < NFA_MAX_TOP_MASKS);
     return i;
 }
 
@@ -1779,11 +1773,6 @@ bool setDistinctTops(NGHolder &h1, const NGHolder &h2,
     DEBUG_PRINTF("before: h1 has %zu tops, h2 has %zu tops\n", tops1.size(),
                  tops2.size());
 
-    if (tops1.size() + tops2.size() > NFA_MAX_TOP_MASKS) {
-        DEBUG_PRINTF("too many tops!\n");
-        return false;
-    }
-
     // If our tops don't intersect, we're OK to merge with no changes.
     if (!has_intersection(tops1, tops2)) {
         DEBUG_PRINTF("tops don't intersect\n");
@@ -1856,11 +1845,6 @@ bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
     return true;
 }
 
-static
-bool hasMaxTops(const NGHolder &h) {
-    return getTops(h).size() == NFA_MAX_TOP_MASKS;
-}
-
 /** \brief Estimate the number of accel states in the given graph when built as
  * an NFA.
  *
@@ -1899,11 +1883,6 @@ void mergeNfaLeftfixes(RoseBuildImpl &tbi, RoseBouquet &roses) {
                          "with %p (%zu verts)\n",
                          r1.graph(), verts1.size(), r2.graph(), verts2.size());
 
-            if (hasMaxTops(*r1.graph())) {
-                DEBUG_PRINTF("h1 has hit max tops\n");
-                break; // next h1
-            }
-
             u32 accel1 = accel_count[r1];
             if (accel1 >= NFA_MAX_ACCEL_STATES) {
                 DEBUG_PRINTF("h1 has hit max accel\n");
@@ -2203,11 +2182,6 @@ void mergeSuffixes(RoseBuildImpl &tbi, SuffixBouquet &suffixes,
             const deque<RoseVertex> &verts2 = suffixes.vertices(s2);
             assert(s2.graph() && s2.graph()->kind == NFA_SUFFIX);
 
-            if (hasMaxTops(*s1.graph())) {
-                DEBUG_PRINTF("h1 has hit max tops\n");
-                break; // next h1
-            }
-
             if (!acyclic) {
                 u32 accel1 = accel_count[s1];
                 if (accel1 >= NFA_MAX_ACCEL_STATES) {