vectorscan/src/rose/rose_build_merge.cpp
Justin Viiret 36150bbc19 Rose: replace internal_report with program
Replace the use of the internal_report structure (for reports from
engines, MPV etc) with the Rose program interpreter.

SOM processing was reworked to use a new som_operation structure that is
embedded in the appropriate instructions.
2016-04-20 13:34:57 +10:00

2951 lines
93 KiB
C++

/*
* Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Rose Build: functions for reducing the size of the Rose graph
* through merging.
*/
#include "rose_build_merge.h"
#include "grey.h"
#include "rose_build.h"
#include "rose_build_impl.h"
#include "rose_build_util.h"
#include "ue2common.h"
#include "nfa/castlecompile.h"
#include "nfa/goughcompile.h"
#include "nfa/limex_limits.h"
#include "nfa/mcclellancompile.h"
#include "nfa/nfa_build_util.h"
#include "nfa/rdfa_merge.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_haig.h"
#include "nfagraph/ng_is_equal.h"
#include "nfagraph/ng_lbr.h"
#include "nfagraph/ng_limex.h"
#include "nfagraph/ng_mcclellan.h"
#include "nfagraph/ng_puff.h"
#include "nfagraph/ng_redundancy.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_reports.h"
#include "nfagraph/ng_restructuring.h"
#include "nfagraph/ng_stop.h"
#include "nfagraph/ng_uncalc_components.h"
#include "nfagraph/ng_util.h"
#include "nfagraph/ng_width.h"
#include "util/bitutils.h"
#include "util/charreach.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2string.h"
#include <algorithm>
#include <functional>
#include <list>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>
#include <utility>
#include <boost/functional/hash/hash_fwd.hpp>
#include <boost/range/adaptor/map.hpp>
using namespace std;
using boost::adaptors::map_values;
using boost::hash_combine;
namespace ue2 {

/** \brief NOTE(review): used by merge passes outside this view; presumably
 * bounds the number of "narrow start" states tolerated - confirm usage. */
static const size_t NARROW_START_MAX = 10;

/** \brief Vertex-count bound for "small" engine merges in streaming mode
 * (selected by \ref small_merge_max_vertices). */
static const size_t SMALL_MERGE_MAX_VERTICES_STREAM = 128;

/** \brief Vertex-count bound for "small" engine merges in block mode. */
static const size_t SMALL_MERGE_MAX_VERTICES_BLOCK = 64;

/** \brief "Small rose" threshold in streaming mode (selected by
 * \ref small_rose_threshold). */
static const size_t SMALL_ROSE_THRESHOLD_STREAM = 32;

/** \brief "Small rose" threshold in block mode. */
static const size_t SMALL_ROSE_THRESHOLD_BLOCK = 10;

/** \brief Max engines handed to a single mergeNfaCluster call; larger
 * clusters are processed in batches of this size (see \ref mergeCluster). */
static const size_t MERGE_GROUP_SIZE_MAX = 200;

/** \brief As above, but for castle merge groups; usage is outside this
 * view. */
static const size_t MERGE_CASTLE_GROUP_SIZE_MAX = 1000;

/** \brief Max number of DFAs (McClellan, Haig) to pairwise merge together. */
static const size_t DFA_CHUNK_SIZE_MAX = 200;

/** \brief Max DFA states in a merged DFA. */
static const size_t DFA_MERGE_MAX_STATES = 8000;

/** \brief An LBR must have at least this many vertices to be protected from
 * merging with other graphs. */
static const size_t LARGE_LBR_MIN_VERTICES = 32;

/** \brief In block mode, merge two prefixes even if they don't have identical
 * literal sets if they have fewer than this many states and the merged graph
 * is also small. */
static constexpr size_t MAX_BLOCK_PREFIX_MERGE_VERTICES = 32;
/** \brief Vertex-count bound for "small" merges, dependent on mode. */
static
size_t small_merge_max_vertices(const CompileContext &cc) {
    // Streaming mode can tolerate a larger bound than block mode.
    if (cc.streaming) {
        return SMALL_MERGE_MAX_VERTICES_STREAM;
    }
    return SMALL_MERGE_MAX_VERTICES_BLOCK;
}
/** \brief "Small rose" threshold, dependent on mode. */
static
size_t small_rose_threshold(const CompileContext &cc) {
    // Streaming mode uses the larger threshold.
    if (cc.streaming) {
        return SMALL_ROSE_THRESHOLD_STREAM;
    }
    return SMALL_ROSE_THRESHOLD_BLOCK;
}
/** \brief True if \a g is an LBR with at least LARGE_LBR_MIN_VERTICES
 * vertices (such engines are protected from merging). */
static
bool isLargeLBR(const NGHolder &g, const Grey &grey) {
    // Cheap size test first; only run the LBR check on big enough graphs.
    return num_vertices(g) >= LARGE_LBR_MIN_VERTICES && isLBR(g, grey);
}
namespace {
/** \brief Key describing the properties of a leaf vertex that must match for
 * two leaves to be considered duplicates by \ref mergeDupeLeaves: literal
 * set, report set, EOD accept flag, suffix, leftfix engine info and SOM
 * adjustment. */
struct DupeLeafKey {
    explicit DupeLeafKey(const RoseVertexProps &litv)
        : literals(litv.literals), reports(litv.reports),
          eod_accept(litv.eod_accept), suffix(litv.suffix), left(litv.left),
          som_adjust(litv.som_adjust) {
        DEBUG_PRINTF("eod_accept %d\n", (int)eod_accept);
        DEBUG_PRINTF("report %u\n", left.leftfix_report);
        DEBUG_PRINTF("lag %u\n", left.lag);
    }

    /** \brief Field-by-field ordering (via ORDER_CHECK) so the key can be
     * used in a map. */
    bool operator<(const DupeLeafKey &b) const {
        const DupeLeafKey &a = *this;
        ORDER_CHECK(literals);
        ORDER_CHECK(eod_accept);
        ORDER_CHECK(suffix);
        ORDER_CHECK(reports);
        ORDER_CHECK(som_adjust);
        ORDER_CHECK(left.leftfix_report);
        ORDER_CHECK(left.lag);
        return false;
    }

    flat_set<u32> literals;
    flat_set<ReportID> reports;
    bool eod_accept;
    suffix_id suffix;
    LeftEngInfo left;
    u32 som_adjust;
};

/** \brief Key describing a leaf vertex for \ref uncalcLeaves: literal set,
 * (predecessor vertex, edge properties) pairs and leftfix info. Leaves with
 * equal keys are triggered by identical roles, so their suffixes may be
 * merged. */
struct UncalcLeafKey {
    UncalcLeafKey(const RoseGraph &g, RoseVertex v)
        : literals(g[v].literals), rose(g[v].left) {
        // Record each predecessor together with the edge properties joining
        // it to v.
        for (const auto &e : in_edges_range(v, g)) {
            RoseVertex u = source(e, g);
            preds.insert(make_pair(u, g[e]));
        }
    }

    /** \brief Field-by-field ordering (via ORDER_CHECK) so the key can be
     * used in a map. */
    bool operator<(const UncalcLeafKey &b) const {
        const UncalcLeafKey &a = *this;
        ORDER_CHECK(literals);
        ORDER_CHECK(preds);
        ORDER_CHECK(rose);
        return false;
    }

    flat_set<u32> literals;
    flat_set<pair<RoseVertex, RoseEdgeProps>> preds;
    LeftEngInfo rose;
};
} // namespace
/**
* This function merges leaf vertices with the same literals and report
* id/suffix. The leaf vertices of the graph are inspected and a mapping of
* leaf vertex properties to vertices is built. If the same set of leaf
* properties has already been seen when we inspect a vertex, we attempt to
* merge the vertex in with the previously seen vertex. This process can fail
* if the vertices share a common predecessor vertex but have a differing,
* incompatible relationship (different bounds or infix) with the predecessor.
*
* This takes place after \ref dedupeSuffixes to increase effectiveness as the
* same suffix is required for a merge to occur.
*/
void mergeDupeLeaves(RoseBuildImpl &tbi) {
    map<DupeLeafKey, RoseVertex> leaves; // first-seen vertex for each key
    vector<RoseVertex> changed;          // vertices that lost in-edges

    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        if (in_degree(v, g) == 0) {
            assert(tbi.isAnyStart(v));
            continue;
        }

        DEBUG_PRINTF("inspecting vertex idx=%zu in_degree %zu out_degree %zu\n",
                     g[v].idx, in_degree(v, g), out_degree(v, g));

        // Vertex must be a reporting leaf node
        if (g[v].reports.empty() || !isLeafNode(v, g)) {
            continue;
        }

        // At the moment, we ignore all successors of root or anchored_root,
        // since many parts of our runtime assume that these have in-degree 1.
        if (tbi.isRootSuccessor(v)) {
            continue;
        }

        DupeLeafKey dupe(g[v]);
        if (leaves.find(dupe) == leaves.end()) {
            // First time we've seen this combination of properties.
            leaves.insert(make_pair(dupe, v));
            continue;
        }

        // v is a dupe of t: try to rehome v's in-edges onto t.
        RoseVertex t = leaves.find(dupe)->second;
        DEBUG_PRINTF("found two leaf dupe roles, idx=%zu,%zu\n", g[v].idx,
                     g[t].idx);

        vector<RoseEdge> deadEdges;
        for (const auto &e : in_edges_range(v, g)) {
            RoseVertex u = source(e, g);
            DEBUG_PRINTF("u idx=%zu\n", g[u].idx);
            RoseEdge et;
            bool exists;
            tie (et, exists) = edge(u, t, g);
            if (exists) {
                // An edge u->t already exists: we may only drop v's edge if
                // its bounds lie within the existing edge's bounds (i.e. it
                // is more constrained).
                if (g[et].minBound <= g[e].minBound
                    && g[et].maxBound >= g[e].maxBound) {
                    DEBUG_PRINTF("remove more constrained edge\n");
                    deadEdges.push_back(e);
                }
            } else {
                DEBUG_PRINTF("rehome edge: add %zu->%zu\n", g[u].idx, g[t].idx);
                add_edge(u, t, g[e], g);
                deadEdges.push_back(e);
            }
        }

        if (!deadEdges.empty()) {
            for (auto &e : deadEdges) {
                remove_edge(e, g);
            }
            changed.push_back(v);
            // Widen t's offset bounds to cover v's.
            g[t].min_offset = min(g[t].min_offset, g[v].min_offset);
            g[t].max_offset = max(g[t].max_offset, g[v].max_offset);
        }
    }
    DEBUG_PRINTF("find loop done\n");

    // Remove any vertices that now have no in-edges.
    size_t countRemovals = 0;
    for (size_t i = 0; i < changed.size(); i++) {
        RoseVertex v = changed[i];
        if (in_degree(v, g) == 0) {
            DEBUG_PRINTF("remove vertex\n");
            if (!tbi.isVirtualVertex(v)) {
                // Unlink v from the literal -> vertex-set mapping.
                for (u32 lit_id : g[v].literals) {
                    tbi.literal_info[lit_id].vertices.erase(v);
                }
            }
            remove_vertex(v, g);
            countRemovals++;
        }
    }

    // if we've removed anything, we need to renumber vertices
    if (countRemovals) {
        tbi.renumberVertices();
        DEBUG_PRINTF("removed %zu vertices.\n", countRemovals);
    }
}
/** Merges the suffixes on the (identical) vertices in \a vcluster, used by
* \ref uncalcLeaves. */
static
void mergeCluster(RoseGraph &g, const ReportManager &rm,
                  const vector<RoseVertex> &vcluster,
                  vector<RoseVertex> &dead, const CompileContext &cc) {
    if (vcluster.size() <= 1) {
        return; // Nothing to merge with.
    }

    // Process the cluster in batches of at most MERGE_GROUP_SIZE_MAX
    // suffixes, as the merge pass is slow on very large groups.
    auto batch_begin = vcluster.begin();
    const auto cluster_end = vcluster.end();

    while (batch_begin != cluster_end) {
        vector<NGHolder *> batch;             // graphs in this batch
        map<NGHolder *, RoseVertex> owner;    // graph -> owning vertex

        auto batch_end = batch_begin;
        for (; batch_end != cluster_end && batch.size() < MERGE_GROUP_SIZE_MAX;
             ++batch_end) {
            RoseVertex v = *batch_end;
            NGHolder *h = g[v].suffix.graph.get();
            assert(!g[v].suffix.haig); // haig suffixes are filtered earlier
            owner[h] = v;
            batch.push_back(h);
        }
        batch_begin = batch_end;

        DEBUG_PRINTF("merging cluster %zu\n", batch.size());
        map<NGHolder *, NGHolder *> merged; // victim -> winner
        mergeNfaCluster(batch, &rm, merged, cc);
        DEBUG_PRINTF("done\n");

        // Fold each victim's properties into its winner and mark the victim
        // vertex for removal.
        for (const auto &m : merged) {
            NGHolder *victim_h = m.first;
            NGHolder *winner_h = m.second;
            RoseVertex victim = owner[victim_h];
            RoseVertex winner = owner[winner_h];
            LIMIT_TO_AT_MOST(&g[winner].min_offset, g[victim].min_offset);
            ENSURE_AT_LEAST(&g[winner].max_offset, g[victim].max_offset);
            insert(&g[winner].reports, g[victim].reports);
            dead.push_back(victim);
        }
    }
}
/**
 * \brief Collect leaf vertices that are candidates for suffix merging by
 * \ref uncalcLeaves.
 *
 * Clusters leaf vertices by UncalcLeafKey (literals, predecessors, leftfix)
 * in \a clusters, and records keys in first-seen order in \a ordered so that
 * later iteration is deterministic. Vertices whose suffix graph is shared
 * with another vertex are skipped, as are suffixes without an NGHolder graph
 * (e.g. Haig/McClellan).
 */
static
void findUncalcLeavesCandidates(RoseBuildImpl &tbi,
                           map<UncalcLeafKey, vector<RoseVertex> > &clusters,
                           deque<UncalcLeafKey> &ordered) {
    const RoseGraph &g = tbi.g;

    // Fix: these are Rose graph vertices, so use RoseVertex rather than
    // NFAVertex. The previous declaration only compiled because both
    // descriptor types happen to be interchangeable.
    vector<RoseVertex> suffix_vertices; // vertices with suffix graphs
    ue2::unordered_map<const NGHolder *, u32> fcount; // ref count per graph

    for (auto v : vertices_range(g)) {
        if (g[v].suffix) {
            if (!g[v].suffix.graph) {
                continue; /* cannot uncalc (haig/mcclellan); TODO */
            }
            assert(g[v].suffix.graph->kind == NFA_SUFFIX);

            // Ref count all suffixes, as we don't want to merge a suffix
            // that happens to be shared with a non-leaf vertex somewhere.
            DEBUG_PRINTF("vertex %zu has suffix %p\n", g[v].idx,
                         g[v].suffix.graph.get());
            fcount[g[v].suffix.graph.get()]++;

            // Vertex must be a reporting pseudo accept
            if (!isLeafNode(v, g)) {
                continue;
            }
            suffix_vertices.push_back(v);
        }
    }

    for (auto v : suffix_vertices) {
        if (in_degree(v, g) == 0) {
            assert(tbi.isAnyStart(v));
            continue;
        }

        const NGHolder *h = g[v].suffix.graph.get();
        assert(h);
        DEBUG_PRINTF("suffix %p\n", h);

        // We can't easily merge suffixes shared with other vertices, and
        // creating a unique copy to do so may just mean we end up tracking
        // more NFAs. Better to leave shared suffixes alone.
        if (fcount[h] != 1) {
            DEBUG_PRINTF("skipping shared suffix\n");
            continue;
        }

        UncalcLeafKey key(g, v);
        vector<RoseVertex> &vec = clusters[key];
        if (vec.empty()) {
            ordered.push_back(key); // remember first-seen order
        }
        vec.push_back(v);
    }

    DEBUG_PRINTF("find loop done\n");
}
/**
* This function attempts to combine identical roles (same literals, same
* predecessors, etc) with different suffixes into a single role which
* activates a larger suffix. The leaf vertices of the graph with a suffix are
* grouped into clusters which have members triggered by identical roles. The
* \ref mergeNfaCluster function (from ng_uncalc_components) is then utilised
* to build a set of larger (and still implementable) suffixes. The graph is
* then updated to point to the new suffixes and any unneeded roles are
* removed.
*
* Note: suffixes which are shared amongst multiple roles are not considered
* for this pass as the individual suffixes would have to continue to exist for
* the other roles to trigger resulting in the transformation not producing any
* savings.
*
* Note: as \ref mergeNfaCluster is slow when the cluster sizes are large,
* clusters of more than \ref MERGE_GROUP_SIZE_MAX roles are split into smaller
* chunks for processing.
*/
void uncalcLeaves(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("uncalcing\n");

    // Cluster leaf vertices by trigger properties; 'ordered' preserves
    // first-seen key order so iteration is deterministic.
    map<UncalcLeafKey, vector<RoseVertex>> clusters;
    deque<UncalcLeafKey> ordered;
    findUncalcLeavesCandidates(tbi, clusters, ordered);

    vector<RoseVertex> defunct; // vertices subsumed by a merge
    for (const auto &key : ordered) {
        DEBUG_PRINTF("cluster of size %zu\n", clusters[key].size());
        mergeCluster(tbi.g, tbi.rm, clusters[key], defunct, tbi.cc);
    }
    tbi.removeVertices(defunct);
}
/**
* Returns a loose hash of a leftfix for use in dedupeLeftfixes. Note that
* reports should not contribute to the hash.
*/
static
size_t hashLeftfix(const LeftEngInfo &left) {
    size_t h = 0;

    if (left.castle) {
        // Castle: combine the reach with each (top, repeat bounds) pair.
        hash_combine(h, left.castle->reach());
        for (const auto &entry : left.castle->repeats) {
            hash_combine(h, entry.first); // top
            hash_combine(h, entry.second.bounds);
        }
    } else if (left.graph) {
        // Graph: combine a structural hash of the holder.
        hash_combine(h, hash_holder(*left.graph));
    }

    return h;
}
namespace {
/** Key used to group sets of leftfixes by the dedupeLeftfixes path. */
struct RoseGroup {
    RoseGroup(const RoseBuildImpl &build, RoseVertex v)
        : left_hash(hashLeftfix(build.g[v].left)),
          lag(build.g[v].left.lag), eod_table(build.isInETable(v)) {
        const RoseGraph &g = build.g;
        assert(in_degree(v, g) == 1); // enforced by dedupeLeftfixes
        RoseVertex u = *inv_adjacent_vertices(v, g).first; // sole predecessor
        parent = g[u].idx;
    }

    /** \brief Field-by-field ordering (via ORDER_CHECK) so the group can be
     * used as a map key. */
    bool operator<(const RoseGroup &b) const {
        const RoseGroup &a = *this;
        ORDER_CHECK(parent);
        ORDER_CHECK(left_hash);
        ORDER_CHECK(lag);
        ORDER_CHECK(eod_table);
        return false;
    }

private:
    /** Parent vertex index. We must use the index, rather than the descriptor,
     * for compile determinism. */
    size_t parent;

    /** Quick hash of the leftfix itself. Must be identical for a given pair of
     * graphs if is_equal would return true. */
    size_t left_hash;

    /** Leftfix lag value. */
    u32 lag;

    /** True if associated vertex (successor) is in the EOD table. We don't
     * allow sharing of leftfix engines between "normal" and EOD operation. */
    bool eod_table;
};

/**
 * Trivial Rose comparator intended to find graphs that are identical except
 * for their report IDs. Relies on vertex and edge indices to pick up graphs
 * that have been messily put together in different orderings...
 */
struct RoseComparator {
    explicit RoseComparator(const RoseGraph &g_in) : g(g_in) {}

    /** \brief True if the leftfixes of \a u and \a v are equivalent (modulo
     * report IDs). */
    bool operator()(const RoseVertex u, const RoseVertex v) const {
        const LeftEngInfo &u_left = g[u].left;
        const LeftEngInfo &v_left = g[v].left;

        if (u_left.castle && v_left.castle) {
            return is_equal(*u_left.castle, u_left.leftfix_report,
                            *v_left.castle, v_left.leftfix_report);
        }

        // Mixed castle/graph pairs (or missing graphs) are never equal.
        if (!u_left.graph || !v_left.graph) {
            return false;
        }

        return is_equal(*u_left.graph, u_left.leftfix_report, *v_left.graph,
                        v_left.leftfix_report);
    }

private:
    const RoseGraph &g;
};
} // namespace
/**
* This pass performs work similar to \ref dedupeSuffixes - it removes
* duplicate prefix/infixes (that is, leftfixes) which are identical graphs and
* share the same trigger vertex and lag. Leftfixes are first grouped by
* parent role and lag to reduce the number of candidates to be inspected
* for each leftfix. The graphs in each cluster are then compared with each
* other and the graph is updated to only refer to a canonical version of each
* graph.
*
* Note: only roles with a single predecessor vertex are considered for this
* transform - it should probably be generalised to work for roles which share
* the same set of predecessor roles as for \ref dedupeLeftfixesVariableLag or it
* should be retired entirely.
*/
bool dedupeLeftfixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("deduping leftfixes\n");
    map<RoseGroup, deque<RoseVertex>> roses;
    bool work_done = false;

    /* Note: a leftfix's transientness will not be altered by deduping */

    // Collect leftfixes into groups.
    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue; // no leftfix on this vertex
        }
        const left_id left(g[v].left);
        if (left.haig()) {
            /* TODO: allow merging of identical haigs */
            continue;
        }
        // Only single-predecessor vertices are considered (see the note in
        // the doc comment above).
        if (in_degree(v, g) != 1) {
            continue;
        }
        roses[RoseGroup(tbi, v)].push_back(v);
    }
    DEBUG_PRINTF("collected %zu rose groups\n", roses.size());

    const RoseComparator rosecmp(g);

    // Walk groups and dedupe the roses therein.
    for (deque<RoseVertex> &verts : roses | map_values) {
        DEBUG_PRINTF("group has %zu vertices\n", verts.size());

        ue2::unordered_set<left_id> seen;

        for (auto jt = verts.begin(), jte = verts.end(); jt != jte; ++jt) {
            RoseVertex v = *jt;
            left_id left(g[v].left);

            // Skip cases we've already handled, and mark as seen otherwise.
            if (!seen.insert(left).second) {
                continue;
            }

            // Scan the rest of the list for dupes.
            for (auto kt = next(jt); kt != jte; ++kt) {
                // Skip pairs that already share a leftfix (nothing to do) or
                // are not equivalent.
                if (g[v].left == g[*kt].left || !rosecmp(v, *kt)) {
                    continue;
                }

                // Dupe found.
                DEBUG_PRINTF("rose at vertex %zu is a dupe of %zu\n",
                             g[*kt].idx, g[v].idx);
                assert(g[v].left.lag == g[*kt].left.lag);
                g[*kt].left = g[v].left; // redirect to canonical copy
                work_done = true;
            }
        }
    }

    return work_done;
}
/**
* \brief Returns a numeric key that can be used to group this suffix with
* others that may be its duplicate.
*/
/**
 * \brief Returns a numeric key that can be used to group this suffix with
 * others that may be its duplicate.
 */
static
size_t suffix_size_key(const suffix_id &s) {
    // Graphs group by vertex count, castles by repeat count; anything else
    // gets a zero key.
    if (s.graph()) {
        return num_vertices(*s.graph());
    }
    return s.castle() ? s.castle()->repeats.size() : 0;
}
/** \brief True if two suffixes are structurally identical (graph-vs-graph or
 * castle-vs-castle only). */
static
bool is_equal(const suffix_id &s1, const suffix_id &s2) {
    if (s1.graph() && s2.graph()) {
        return is_equal(*s1.graph(), *s2.graph());
    }
    if (s1.castle() && s2.castle()) {
        return is_equal(*s1.castle(), *s2.castle());
    }
    // Mixed or unsupported engine types never compare equal.
    return false;
}
/**
* This function simply looks for suffix NGHolder graphs which are identical
* and updates the roles in the RoseGraph to refer to only a single copy. This
* obviously has benefits in terms of both performance (as we don't run
* multiple engines doing the same work) and stream state. This function first
* groups all suffixes by number of vertices and report set to restrict the set
* of possible candidates. Each group is then walked to find duplicates using
* the \ref is_equal comparator for NGHolders and updating the RoseGraph as it
* goes.
*
* Note: does not dedupe suffixes of vertices in the EOD table.
*/
void dedupeSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("deduping suffixes\n");

    ue2::unordered_map<suffix_id, set<RoseVertex>> suffix_map;
    // Partition key: (size key, report set) -> candidate suffixes.
    map<pair<size_t, set<ReportID>>, vector<suffix_id>> part;

    // Collect suffixes into groups.
    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        // EOD-table vertices are not deduped (see doc comment above).
        if (!g[v].suffix || tbi.isInETable(v)) {
            continue;
        }

        const suffix_id s(g[v].suffix);

        if (!(s.graph() || s.castle())) {
            continue; // e.g. Haig
        }

        set<RoseVertex> &verts = suffix_map[s];
        if (verts.empty()) {
            // First sighting of this suffix: file it in its partition.
            part[make_pair(suffix_size_key(s), all_reports(s))].push_back(s);
        }
        verts.insert(v);
    }

    DEBUG_PRINTF("collected %zu groups\n", part.size());

    for (const auto &cand : part | map_values) {
        if (cand.size() <= 1) {
            continue; // nothing to compare against
        }
        DEBUG_PRINTF("deduping cand set of size %zu\n", cand.size());

        for (auto jt = cand.begin(); jt != cand.end(); ++jt) {
            if (suffix_map[*jt].empty()) {
                continue; // already folded into an earlier suffix
            }
            for (auto kt = next(jt); kt != cand.end(); ++kt) {
                if (suffix_map[*kt].empty() || !is_equal(*jt, *kt)) {
                    continue;
                }

                // Dupe found: point all of kt's vertices at jt's suffix
                // engine, then retire kt.
                DEBUG_PRINTF("found dupe\n");
                for (auto v : suffix_map[*kt]) {
                    RoseVertex dupe = *suffix_map[*jt].begin();
                    assert(dupe != v);
                    g[v].suffix.graph = g[dupe].suffix.graph;
                    g[v].suffix.castle = g[dupe].suffix.castle;
                    assert(suffix_id(g[v].suffix) ==
                           suffix_id(g[dupe].suffix));
                    suffix_map[*jt].insert(v);
                }
                suffix_map[*kt].clear();
            }
        }
    }
}
namespace {
/**
 * This class stores a mapping from an engine reference (left_id, suffix_id,
 * etc) to a list of vertices, and also allows us to iterate over the set of
 * engine references in insertion order -- we add to the mapping in vertex
 * iteration order, so this allows us to provide a consistent ordering.
 */
template<class EngineRef>
class Bouquet {
private:
    list<EngineRef> ordering; // Unique list in insert order.
    typedef ue2::unordered_map<EngineRef, deque<RoseVertex> > BouquetMap;
    BouquetMap bouquet; // engine -> vertices using it

public:
    /** \brief Add a single vertex under engine \a h. */
    void insert(const EngineRef &h, RoseVertex v) {
        typename BouquetMap::iterator f = bouquet.find(h);
        if (f == bouquet.end()) {
            ordering.push_back(h); // first sighting of h
            bouquet[h].push_back(v);
        } else {
            f->second.push_back(v);
        }
    }

    /** \brief Add a batch of vertices under engine \a h. */
    void insert(const EngineRef &h, const deque<RoseVertex> &verts) {
        typename BouquetMap::iterator f = bouquet.find(h);
        if (f == bouquet.end()) {
            ordering.push_back(h);
            bouquet.insert(make_pair(h, verts));
        } else {
            f->second.insert(f->second.end(), verts.begin(), verts.end());
        }
    }

    /** \brief Vertices associated with engine \a h, which must be present. */
    const deque<RoseVertex> &vertices(const EngineRef &h) const {
        typename BouquetMap::const_iterator it = bouquet.find(h);
        assert(it != bouquet.end()); // must be present
        return it->second;
    }

    /** \brief Remove engine \a h and its vertices. */
    void erase(const EngineRef &h) {
        assert(bouquet.find(h) != bouquet.end());
        bouquet.erase(h);
        ordering.remove(h); // linear scan of the ordering list
    }

    /** Remove all the elements in the given iterator range. */
    template <class Iter>
    void erase_all(Iter erase_begin, Iter erase_end) {
        for (Iter it = erase_begin; it != erase_end; ++it) {
            bouquet.erase(*it);
        }

        // Use a quick-lookup container so that we only have to traverse the
        // 'ordering' list once.
        const set<EngineRef> dead(erase_begin, erase_end);
        for (iterator it = begin(); it != end(); /* incremented inside */) {
            if (contains(dead, *it)) {
                ordering.erase(it++);
            } else {
                ++it;
            }
        }
    }

    void clear() {
        ordering.clear();
        bouquet.clear();
    }

    size_t size() const { return bouquet.size(); }

    // iterate over holders in insert order
    typedef typename list<EngineRef>::iterator iterator;
    iterator begin() { return ordering.begin(); }
    iterator end() { return ordering.end(); }

    // const iterate over holders in insert order
    typedef typename list<EngineRef>::const_iterator const_iterator;
    const_iterator begin() const { return ordering.begin(); }
    const_iterator end() const { return ordering.end(); }
};

typedef Bouquet<left_id> RoseBouquet;
typedef Bouquet<suffix_id> SuffixBouquet;
} // namespace
/**
* Split a \ref Bouquet of some type into several smaller ones.
*/
template <class EngineRef>
static void chunkBouquets(const Bouquet<EngineRef> &in,
                          deque<Bouquet<EngineRef>> &out,
                          const size_t chunk_size) {
    // Small enough already: pass through unchanged.
    if (in.size() <= chunk_size) {
        out.push_back(in);
        return;
    }

    // Walk engines in insertion order, starting a fresh output bouquet
    // whenever the current one reaches chunk_size.
    out.emplace_back();
    for (const auto &engine : in) {
        if (out.back().size() >= chunk_size) {
            out.emplace_back();
        }
        out.back().insert(engine, in.vertices(engine));
    }
}
static
bool stringsCanFinishAtSameSpot(const ue2_literal &u,
ue2_literal::const_iterator v_b,
ue2_literal::const_iterator v_e) {
ue2_literal::const_iterator u_e = u.end();
ue2_literal::const_iterator u_b = u.begin();
while (u_e != u_b && v_e != v_b) {
--u_e;
--v_e;
if (!overlaps(*u_e, *v_e)) {
return false;
}
}
return true;
}
/**
* Check that if after u has been seen, that it is impossible for the arrival of
* v to require the inspection of an engine earlier than u did.
*
* Let delta be the earliest that v can be seen after u (may be zero)
*
* ie, we require u_loc - ulag <= v_loc - vlag (v_loc = u_loc + delta)
* ==> - ulag <= delta - vlag
* ==> vlag - ulag <= delta
*/
static
bool checkPrefix(const rose_literal_id &ul, const u32 ulag,
                 const rose_literal_id &vl, const u32 vlag) {
    DEBUG_PRINTF("'%s'-%u '%s'-%u\n", escapeString(ul.s).c_str(), ulag,
                 escapeString(vl.s).c_str(), vlag);

    if (vl.delay || ul.delay) {
        /* engine related literals should not be delayed anyway */
        return false;
    }

    if (ulag >= vlag) {
        // v never requires an engine inspection earlier than u's (see the
        // derivation in the comment above: vlag - ulag <= 0 <= delta).
        assert(maxOverlap(ul, vl) <= vl.elength() - vlag + ulag);
        return true;
    }

    size_t min_allowed_delta = vlag - ulag;
    // Fixed typo in debug message (was "min allow distace").
    DEBUG_PRINTF("min allowed distance %zu\n", min_allowed_delta);

    // Reject if v can arrive closer than min_allowed_delta after u.
    for (size_t i = 0; i < min_allowed_delta; i++) {
        if (stringsCanFinishAtSameSpot(ul.s, vl.s.begin(), vl.s.end() - i)) {
            DEBUG_PRINTF("v can follow u at a (too close) distance of %zu\n", i);
            return false;
        }
    }

    DEBUG_PRINTF("OK\n");
    return true;
}
/** \brief True if the leftfixes of the two vertices are implementation-
 * compatible for merging purposes. */
static
bool hasSameEngineType(const RoseVertexProps &u_prop,
                       const RoseVertexProps &v_prop) {
    const left_id u_left(u_prop.left);
    const left_id v_left(v_prop.left);

    // If either side is a Haig, both must refer to the very same graph.
    if ((u_left.haig() || v_left.haig()) &&
        u_left.graph() != v_left.graph()) {
        return false;
    }

    // Likewise for DFA leftfixes.
    if ((u_left.dfa() || v_left.dfa()) &&
        u_left.graph() != v_left.graph()) {
        return false;
    }

    // Castles only pair with castles.
    if ((u_left.castle() || v_left.castle()) &&
        !(u_left.castle() && v_left.castle())) {
        return false;
    }

    return true;
}
/** \brief True if the (literal, lag) pairs of two engines permit their merge
 * without violating the engines' ordered-access requirement. */
static
bool compatibleLiteralsForMerge(
                     const vector<pair<const rose_literal_id *, u32>> &ulits,
                     const vector<pair<const rose_literal_id *, u32>> &vlits) {
    assert(!ulits.empty());
    assert(!vlits.empty());

    // We cannot merge engines that prefix literals in different tables.
    if (ulits[0].first->table != vlits[0].first->table) {
        DEBUG_PRINTF("literals in different tables\n");
        return false;
    }

    /* An engine requires that all accesses to it are ordered by offsets. (ie,
       we can not check an engine's state at offset Y, if we have already
       checked its status at offset X and X > Y). If we can not establish that
       the literals used for triggering will statisfy this property, then it is
       not safe to merge the engine. */
    for (const auto &ue : ulits) {
        const rose_literal_id &ul = *ue.first;
        u32 ulag = ue.second;

        if (ul.delay) {
            return false; // We don't handle delayed cases yet.
        }

        // Check every (u, v) pairing in both directions.
        for (const auto &ve : vlits) {
            const rose_literal_id &vl = *ve.first;
            u32 vlag = ve.second;

            if (vl.delay) {
                return false; // We don't handle delayed cases yet.
            }

            if (!checkPrefix(ul, ulag, vl, vlag)
                || !checkPrefix(vl, vlag, ul, ulag)) {
                DEBUG_PRINTF("prefix check failed\n");
                return false;
            }
        }
    }

    return true;
}
/**
* True if this graph has few enough accel states to be implemented as an NFA
* with all of those states actually becoming accel schemes.
*/
static
bool isAccelerableLeftfix(const RoseBuildImpl &build, const NGHolder &g) {
u32 num = countAccelStates(g, &build.rm, build.cc);
DEBUG_PRINTF("graph with %zu vertices has %u accel states\n",
num_vertices(g), num);
return num <= NFA_MAX_ACCEL_STATES;
}
/**
* In block mode, we want to be a little more selective, We will only merge
* prefix engines when the literal sets are the same, or if the merged graph
* has only grown by a small amount.
*/
static
bool safeBlockModeMerge(const RoseBuildImpl &build, RoseVertex u,
                        RoseVertex v) {
    assert(!build.cc.streaming);
    assert(build.isRootSuccessor(u) == build.isRootSuccessor(v));

    // Always merge infixes if we can (subject to the other criteria in
    // mergeableRoseVertices).
    if (!build.isRootSuccessor(u)) {
        return true;
    }

    const RoseGraph &g = build.g;

    // Merge prefixes with identical literal sets (as we'd have to run them
    // both when we see those literals anyway).
    if (g[u].literals == g[v].literals) {
        return true;
    }

    // The rest of this function only deals with the case when both vertices
    // have graph leftfixes.
    if (!g[u].left.graph || !g[v].left.graph) {
        return false;
    }

    const size_t u_count = num_vertices(*g[u].left.graph);
    const size_t v_count = num_vertices(*g[v].left.graph);
    DEBUG_PRINTF("u prefix has %zu vertices, v prefix has %zu vertices\n",
                 u_count, v_count);
    if (u_count > MAX_BLOCK_PREFIX_MERGE_VERTICES ||
        v_count > MAX_BLOCK_PREFIX_MERGE_VERTICES) {
        DEBUG_PRINTF("prefixes too big already\n");
        return false;
    }

    // Speculatively merge into a scratch copy of v's graph to measure the
    // size of the result; the original graphs are left untouched.
    DEBUG_PRINTF("trying merge\n");
    NGHolder h;
    cloneHolder(h, *g[v].left.graph);
    if (!mergeNfaPair(*g[u].left.graph, h, nullptr, build.cc)) {
        DEBUG_PRINTF("couldn't merge\n");
        return false;
    }

    const size_t merged_count = num_vertices(h);
    DEBUG_PRINTF("merged result has %zu vertices\n", merged_count);
    if (merged_count > MAX_BLOCK_PREFIX_MERGE_VERTICES) {
        DEBUG_PRINTF("exceeded limit\n");
        return false;
    }

    // We want to only perform merges that take advantage of some
    // commonality in the two input graphs, so we check that the number of
    // vertices has only grown a small amount: somewhere between the sum
    // (no commonality) and the max (no growth at all) of the vertex counts
    // of the input graphs.
    const size_t max_size = u_count + v_count;
    const size_t min_size = max(u_count, v_count);
    const size_t max_growth = ((max_size - min_size) * 25) / 100;
    if (merged_count > min_size + max_growth) {
        DEBUG_PRINTF("grew too much\n");
        return false;
    }

    // We don't want to squander any chances at accelerating.
    if (!isAccelerableLeftfix(build, h) &&
        (isAccelerableLeftfix(build, *g[u].left.graph) ||
         isAccelerableLeftfix(build, *g[v].left.graph))) {
        DEBUG_PRINTF("would lose accel property\n");
        return false;
    }

    DEBUG_PRINTF("safe to merge\n");
    return true;
}
bool mergeableRoseVertices(const RoseBuildImpl &tbi, RoseVertex u,
                           RoseVertex v) {
    assert(u != v);

    // Engines of incompatible implementation types cannot be merged.
    if (!hasSameEngineType(tbi.g[u], tbi.g[v])) {
        return false;
    }

    // Block mode applies stricter criteria (see safeBlockModeMerge).
    if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, u, v)) {
        return false;
    }

    /* We cannot merge prefixes/vertices if they are successors of different
     * root vertices */
    if (tbi.isRootSuccessor(u)) {
        assert(tbi.isRootSuccessor(v));
        set<RoseVertex> u_preds;
        set<RoseVertex> v_preds;
        insert(&u_preds, inv_adjacent_vertices(u, tbi.g));
        insert(&v_preds, inv_adjacent_vertices(v, tbi.g));

        if (u_preds != v_preds) {
            return false;
        }
    }

    // Gather (literal, lag) pairs for both vertices so the ordered-access
    // requirement can be checked across the merge.
    u32 ulag = tbi.g[u].left.lag;
    vector<pair<const rose_literal_id *, u32>> ulits;
    ulits.reserve(tbi.g[u].literals.size());
    for (u32 id : tbi.g[u].literals) {
        ulits.push_back(make_pair(&tbi.literals.right.at(id), ulag));
    }

    u32 vlag = tbi.g[v].left.lag;
    vector<pair<const rose_literal_id *, u32>> vlits;
    vlits.reserve(tbi.g[v].literals.size());
    for (u32 id : tbi.g[v].literals) {
        vlits.push_back(make_pair(&tbi.literals.right.at(id), vlag));
    }

    if (!compatibleLiteralsForMerge(ulits, vlits)) {
        return false;
    }

    DEBUG_PRINTF("roses on %zu and %zu are mergeable\n", tbi.g[u].idx,
                 tbi.g[v].idx);
    return true;
}
/* We cannot merge an engine, if a trigger literal and a post literal overlap
* in such a way that engine status needs to be check at a point before the
* engine's current location.
*
* i.e., for a trigger literal u and a pos literal v,
* where delta is the earliest v can appear after t,
* we require that v_loc - v_lag >= u_loc
* ==> u_loc + delta - v_lag >= u_loc
* ==> delta >= v_lag
*
*/
static
bool checkPredDelay(const rose_literal_id &ul, const rose_literal_id &vl,
u32 vlag) {
DEBUG_PRINTF("%s %s (lag %u)\n", escapeString(ul.s).c_str(),
escapeString(vl.s).c_str(), vlag);
for (size_t i = 0; i < vlag; i++) {
if (stringsCanFinishAtSameSpot(ul.s, vl.s.begin(), vl.s.end() - i)) {
DEBUG_PRINTF("v can follow u at a (too close) distance of %zu\n", i);
return false;
}
}
DEBUG_PRINTF("OK\n");
return true;
}
/**
 * \brief Check that the literals on the predecessors of \a v1's vertices
 * cannot force an out-of-order engine inspection for any vertex in \a v2.
 */
static never_inline
bool checkPredDelays(const RoseBuildImpl &tbi, const deque<RoseVertex> &v1,
                     const deque<RoseVertex> &v2) {
    flat_set<RoseVertex> preds;
    for (auto v : v1) {
        insert(&preds, inv_adjacent_vertices(v, tbi.g));
    }

    flat_set<u32> pred_lits;

    /* No need to examine delays of a common pred - as it must already have
     * survived the delay checks.
     *
     * This is important when the pred is in the anchored table as
     * the literal is no longer available. */
    flat_set<RoseVertex> known_good_preds;
    for (auto v : v2) {
        insert(&known_good_preds, inv_adjacent_vertices(v, tbi.g));
    }

    for (auto u : preds) {
        // Fix: look up the vertex value itself, not the address of the loop
        // variable. Passing &u compiled (vertex descriptors are pointers)
        // but could never match, so common preds were never skipped.
        if (!contains(known_good_preds, u)) {
            insert(&pred_lits, tbi.g[u].literals);
        }
    }

    vector<const rose_literal_id *> pred_rose_lits;
    pred_rose_lits.reserve(pred_lits.size());
    for (const auto &p : pred_lits) {
        pred_rose_lits.push_back(&tbi.literals.right.at(p));
    }

    for (auto v : v2) {
        u32 vlag = tbi.g[v].left.lag;
        if (!vlag) {
            continue; // no lag, no ordering hazard
        }

        for (const u32 vlit : tbi.g[v].literals) {
            const rose_literal_id &vl = tbi.literals.right.at(vlit);
            assert(!vl.delay); // this should never have got this far?
            for (const auto &ul : pred_rose_lits) {
                assert(!ul->delay); // this should never have got this far?

                if (!checkPredDelay(*ul, vl, vlag)) {
                    return false;
                }
            }
        }
    }

    return true;
}
/* Determines whether the leftfixes of the two (pre-grouped) vertex sets can
 * be merged: same engine type, same roots for prefixes, compatible
 * literal/lag combinations and predecessor delays. */
static
bool mergeableRoseVertices(const RoseBuildImpl &tbi,
                           const deque<RoseVertex> &verts1,
                           const deque<RoseVertex> &verts2) {
    assert(!verts1.empty());
    assert(!verts2.empty());

    RoseVertex u_front = verts1.front();
    RoseVertex v_front = verts2.front();

    /* all vertices must have the same engine type: assume all verts in each
     * group are already of the same type */
    if (!hasSameEngineType(tbi.g[u_front], tbi.g[v_front])) {
        return false;
    }

    bool is_prefix = tbi.isRootSuccessor(u_front);

    /* We cannot merge prefixes/vertices if they are successors of different
     * root vertices: similarly, assume the grouped vertices are compatible */
    if (is_prefix) {
        assert(tbi.isRootSuccessor(v_front));
        set<RoseVertex> u_preds;
        set<RoseVertex> v_preds;
        insert(&u_preds, inv_adjacent_vertices(u_front, tbi.g));
        insert(&v_preds, inv_adjacent_vertices(v_front, tbi.g));

        if (u_preds != v_preds) {
            return false;
        }
    }

    vector<pair<const rose_literal_id *, u32>> ulits; /* lit + lag pairs */
    for (auto a : verts1) {
        // BUGFIX: in block mode each vertex must be safe to merge against a
        // representative of the OTHER group (v_front); checking against its
        // own group's front (u_front) was vacuous.
        if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, v_front, a)) {
            return false;
        }

        u32 ulag = tbi.g[a].left.lag;
        for (u32 id : tbi.g[a].literals) {
            ulits.push_back(make_pair(&tbi.literals.right.at(id), ulag));
        }
    }

    vector<pair<const rose_literal_id *, u32>> vlits;
    for (auto a : verts2) {
        if (!tbi.cc.streaming && !safeBlockModeMerge(tbi, u_front, a)) {
            return false;
        }

        u32 vlag = tbi.g[a].left.lag;
        for (u32 id : tbi.g[a].literals) {
            vlits.push_back(make_pair(&tbi.literals.right.at(id), vlag));
        }
    }

    if (!compatibleLiteralsForMerge(ulits, vlits)) {
        return false;
    }

    // Check preds are compatible as well.
    if (!checkPredDelays(tbi, verts1, verts2)
        || !checkPredDelays(tbi, verts2, verts1)) {
        return false;
    }

    DEBUG_PRINTF("vertex sets are mergeable\n");
    return true;
}
bool mergeableRoseVertices(const RoseBuildImpl &tbi, const set<RoseVertex> &v1,
const set<RoseVertex> &v2) {
const deque<RoseVertex> vv1(v1.begin(), v1.end());
const deque<RoseVertex> vv2(v2.begin(), v2.end());
return mergeableRoseVertices(tbi, vv1, vv2);
}
/** \brief Priority queue element for Rose merges. */
namespace {
struct RoseMergeCandidate {
    /** \brief Construct a candidate pairing of engines r1_in and r2_in.
     *
     * \param cpl_in common prefix length of the two engines.
     * \param tb tie-breaker, used to keep the queue order deterministic.
     */
    RoseMergeCandidate(const left_id &r1_in, const left_id &r2_in, u32 cpl_in,
                       u32 tb)
        : r1(r1_in), r2(r2_in), stopxor(0), cpl(cpl_in), states(0),
          tie_breaker(tb) {
        if (r1.graph() && r2.graph()) {
            const NGHolder &h1 = *r1.graph(), &h2 = *r2.graph();
            /* som_none as haigs don't merge and just a guiding heuristic */
            CharReach stop1 = findStopAlphabet(h1, SOM_NONE);
            CharReach stop2 = findStopAlphabet(h2, SOM_NONE);
            stopxor = (stop1 ^ stop2).count();
            // We use the number of vertices as an approximation of the state
            // count here, as this is just feeding a comparison.
            u32 vertex_count = num_vertices(h1) + num_vertices(h2);
            states = vertex_count - min(vertex_count, cpl);
        } else if (r1.castle() && r2.castle()) {
            // FIXME
        }
    }

    /** \brief Ordering for std::priority_queue (a max-heap): the preferred
     * candidate — smaller stop-alphabet difference, longer common prefix,
     * fewer estimated states — compares greatest so it is popped first. */
    bool operator<(const RoseMergeCandidate &a) const {
        if (stopxor != a.stopxor) {
            return stopxor > a.stopxor;
        }
        if (cpl != a.cpl) {
            return cpl < a.cpl;
        }
        if (states != a.states) {
            return states > a.states;
        }
        return tie_breaker < a.tie_breaker;
    }

    left_id r1;
    left_id r2;
    u32 stopxor; //!< size of the symmetric difference of the stop alphabets
    u32 cpl; //!< common prefix length
    u32 states; //!< vertex-count based estimate of the merged state count
    u32 tie_breaker; //!< determinism
};
} // namespace
/* Merges engine r1 into engine r2 (both must be of the same engine type) and
 * redirects r1's vertices at the merged engine. Returns false if the
 * underlying NFA/castle merge fails. */
static
bool mergeRosePair(RoseBuildImpl &tbi, left_id &r1, left_id &r2,
                   const deque<RoseVertex> &verts1,
                   const deque<RoseVertex> &verts2) {
    assert(!verts1.empty() && !verts2.empty());

    RoseGraph &g = tbi.g;

    if (r1.graph()) {
        assert(r2.graph());
        assert(r1.graph()->kind == r2.graph()->kind);
        if (!mergeNfaPair(*r1.graph(), *r2.graph(), nullptr, tbi.cc)) {
            DEBUG_PRINTF("nfa merge failed\n");
            return false;
        }

        // The graph in r1 has been merged into the graph in r2. Update r1's
        // vertices with the new graph ptr. Since the parent vertices are the
        // same, we know that tops will already have been distinct.
        shared_ptr<NGHolder> &h = g[verts2.front()].left.graph;
        for (RoseVertex v : verts1) {
            g[v].left.graph = h;
        }

        return true;
    } else if (r1.castle()) {
        assert(r2.castle());
        assert(tbi.cc.grey.allowCastle);

        map<u32, u32> top_map;
        if (!mergeCastle(*r2.castle(), *r1.castle(), top_map)) {
            DEBUG_PRINTF("castle merge failed\n");
            return false;
        }

        // The castle in r1 has been merged into the castle in r2, with tops
        // remapped as per top_map.
        const shared_ptr<CastleProto> &c = g[verts2.front()].left.castle;
        for (RoseVertex v : verts1) {
            g[v].left.castle = c;
            // Follow the top renumbering performed by mergeCastle on every
            // trigger edge of the moved vertices.
            for (const auto &e : in_edges_range(v, g)) {
                g[e].rose_top = top_map.at(g[e].rose_top);
            }
        }

        return true;
    }

    assert(0);
    return false;
}
/* Pops candidate pairs off the priority queue (best first) and greedily
 * performs the merges that remain viable. */
static
void processMergeQueue(RoseBuildImpl &tbi, RoseBouquet &roses,
                       priority_queue<RoseMergeCandidate> &pq) {
    ue2::unordered_set<left_id> dead;

    DEBUG_PRINTF("merge queue has %zu entries\n", pq.size());

    while (!pq.empty()) {
        DEBUG_PRINTF("pq pop h1=%p, h2=%p, cpl=%u, states=%u\n",
                     pq.top().r1.graph(), pq.top().r2.graph(), pq.top().cpl,
                     pq.top().states);
        left_id r1 = pq.top().r1;
        left_id r2 = pq.top().r2;
        pq.pop();

        // Skip pairs involving an engine already consumed by an earlier
        // merge.
        if (contains(dead, r1) || contains(dead, r2)) {
            continue;
        }

        if (r1.graph() && r2.graph()) {
            NGHolder *h1 = r1.graph();
            NGHolder *h2 = r2.graph();
            CharReach stop1 = findStopAlphabet(*h1, SOM_NONE);
            CharReach stop2 = findStopAlphabet(*h2, SOM_NONE);
            CharReach stopboth = stop1 & stop2;
            DEBUG_PRINTF("stop1=%zu, stop2=%zu, stopboth=%zu\n", stop1.count(),
                         stop2.count(), stopboth.count());
            // Heuristics: don't merge if a useful stop alphabet would be
            // destroyed or significantly reduced.
            if (stopboth.count() < 10
                && (stop1.count() > 10 || stop2.count() > 10)) {
                DEBUG_PRINTF("skip merge, would kill stop alphabet\n");
                continue;
            }
            size_t maxstop = max(stop1.count(), stop2.count());
            if (maxstop > 200 && stopboth.count() < 200) {
                DEBUG_PRINTF("skip merge, would reduce stop alphabet\n");
                continue;
            }
        }

        const deque<RoseVertex> &verts1 = roses.vertices(r1);
        const deque<RoseVertex> &verts2 = roses.vertices(r2);

        if (!mergeableRoseVertices(tbi, verts1, verts2)) {
            continue;
        }

        if (!mergeRosePair(tbi, r1, r2, verts1, verts2)) {
            continue;
        }

        // r1 was merged into r2: move its vertices over and retire r1.
        roses.insert(r2, verts1);
        roses.erase(r1);
        dead.insert(r1);
    }
}
static
bool nfaHasNarrowStart(const NGHolder &g) {
if (hasGreaterOutDegree(1, g.startDs, g)) {
return false; // unanchored
}
CharReach cr;
for (auto v : adjacent_vertices_range(g.start, g)) {
if (v == g.startDs) {
continue;
}
cr |= g[v].char_reach;
}
return cr.count() <= NARROW_START_MAX;
}
/* True if the graph's maximum match width is bounded. */
static
bool nfaHasFiniteMaxWidth(const NGHolder &g) {
    const auto max_width = findMaxWidth(g);
    return max_width.is_finite();
}
namespace {
/** \brief Grouping key for mergeLeftfixesVariableLag: only leftfixes sharing
 * the same parent set (and, for prefixes, the same narrow-start/max-width
 * properties) are considered for merging together. */
struct RoseMergeKey {
    RoseMergeKey(const RoseVertexSet &parents_in,
                 bool narrowStart_in, bool hasMaxWidth_in) :
        narrowStart(narrowStart_in),
        hasMaxWidth(hasMaxWidth_in),
        parents(parents_in) {}
    bool operator<(const RoseMergeKey &b) const {
        // Lexicographic ordering: ORDER_CHECK returns early on the first
        // field that differs.
        const RoseMergeKey &a = *this;
        ORDER_CHECK(narrowStart);
        ORDER_CHECK(hasMaxWidth);
        ORDER_CHECK(parents);
        return false;
    }

    // NOTE: these two bool discriminators are only used for prefixes, not
    // infixes.
    bool narrowStart;
    bool hasMaxWidth;
    RoseVertexSet parents;
};
} // namespace
/* Returns true if analyseRepeats reports this holder as suitable for "SDS
 * reforming" (rebuilding the start-dot-star structure to optimise a leading
 * repeat). Callers keep such engines out of merges to preserve that
 * optimisation. */
static
bool hasReformedStartDotStar(const NGHolder &h, const Grey &grey) {
    if (!proper_out_degree(h.startDs, h)) {
        return false;
    }
    assert(!is_triggered(h));

    // Run the analysis on a throwaway copy, as analyseRepeats may modify the
    // holder it is given.
    NGHolder h_temp;
    cloneHolder(h_temp, h);

    vector<BoundedRepeatData> repeats;
    bool suitable_for_sds_reforming = false;
    const map<u32, u32> fixed_depth_tops; /* not relevant for cfa check */
    const map<u32, vector<vector<CharReach>>> triggers; /* not for cfa check */
    const bool simple_model_selection = true; // FIRST is considered simple
    analyseRepeats(h_temp, nullptr, fixed_depth_tops, triggers, &repeats, true,
                   simple_model_selection, grey, &suitable_for_sds_reforming);

    return suitable_for_sds_reforming;
}
/* Length of the common prefix of the two engines; used only as a merge
 * priority heuristic. Returns 0 for mixed/unsupported engine types. */
static
u32 commonPrefixLength(left_id &r1, left_id &r2) {
    if (r1.graph() && r2.graph()) {
        auto &g1 = *r1.graph();
        auto &g2 = *r2.graph();
        auto ids1 = numberStates(g1);
        auto ids2 = numberStates(g2);
        return commonPrefixLength(g1, ids1, g2, ids2);
    }
    if (r1.castle() && r2.castle()) {
        return min(findMinWidth(*r1.castle()), findMinWidth(*r2.castle()));
    }
    return 0;
}
/**
* This pass attempts to merge prefix/infix engines which share a common set of
* parent vertices.
*
* Engines are greedily merged pairwise by this process based on a priority
* queue keyed off the common prefix length.
*
* Engines are not merged if the lags are not compatible or if it would damage
* the stop alphabet.
*
* Infixes:
* - LBR candidates are not considered.
*
* Prefixes:
* - transient prefixes are not considered.
* - with a max width or a narrow start are kept segregated by
* this phase and can only be merged with similar infixes.
* - in block mode, merges are only performed if literal sets are the same.
* - merges are not considered in cases where dot star start state will be
* reformed to optimise a leading repeat.
*/
void mergeLeftfixesVariableLag(RoseBuildImpl &tbi) {
    if (!tbi.cc.grey.mergeRose) {
        return;
    }

    map<RoseMergeKey, RoseBouquet> rosesByParent;
    RoseGraph &g = tbi.g;
    RoseVertexSet parents(g);

    DEBUG_PRINTF("-----\n");
    DEBUG_PRINTF("entry\n");
    DEBUG_PRINTF("-----\n");

    // Phase 1: bucket the eligible leftfixes by merge key (parent set plus,
    // for prefixes, narrow-start/max-width discriminators).
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        const bool is_prefix = tbi.isRootSuccessor(v);

        // Only non-transient for the moment.
        if (contains(tbi.transient, g[v].left)) {
            continue;
        }

        // No forced McClellan or Haig infix merges.
        if (g[v].left.dfa || (!is_prefix && g[v].left.haig)) {
            continue;
        }

        if (g[v].left.graph) {
            NGHolder &h = *g[v].left.graph;

            /* Ensure that kind on the graph is correct */
            assert(h.kind == (is_prefix ? NFA_PREFIX : NFA_INFIX));

            if (hasReformedStartDotStar(h, tbi.cc.grey)) {
                continue; // preserve the optimisation of the leading repeat
            }

            if (!is_prefix && isLargeLBR(h, tbi.cc.grey)) {
                continue;
            }
        }

        if (g[v].left.castle && !tbi.cc.grey.allowCastle) {
            DEBUG_PRINTF("castle merging disallowed by greybox\n");
            continue;
        }

        // We collapse the anchored root into the root vertex when calculating
        // parents, so that we can merge differently-anchored prefix roses
        // together. (Prompted by UE-2100)
        parents.clear();
        for (auto u : inv_adjacent_vertices_range(v, g)) {
            if (tbi.isAnyStart(u)) {
                parents.insert(tbi.root);
            } else {
                parents.insert(u);
            }
        }

        if (parents.empty()) {
            assert(0);
            continue;
        }

        // We want to distinguish prefixes (but not infixes) on whether they
        // have a narrow start or max width.
        bool narrowStart = false, hasMaxWidth = false;
        if (is_prefix && g[v].left.graph) {
            const NGHolder &h = *g[v].left.graph;
            narrowStart = nfaHasNarrowStart(h);
            hasMaxWidth = nfaHasFiniteMaxWidth(h);
        }

        RoseMergeKey key(parents, narrowStart, hasMaxWidth);
        rosesByParent[key].insert(g[v].left, v);
    }

    // Phase 2: within each bucket (chunked to bound the pairwise work),
    // queue all viable pairs by priority and merge greedily.
    for (auto &m : rosesByParent) {
        if (m.second.size() < 2) {
            continue;
        }

        deque<RoseBouquet> rose_groups;
        chunkBouquets(m.second, rose_groups, MERGE_GROUP_SIZE_MAX);
        m.second.clear();
        DEBUG_PRINTF("chunked roses into %zu groups\n", rose_groups.size());

        for (auto &roses : rose_groups) {
            // All pairs on the prio queue.
            u32 tie_breaker = 0;
            priority_queue<RoseMergeCandidate> pq;

            for (auto it = roses.begin(), ite = roses.end(); it != ite; ++it) {
                left_id r1 = *it;
                const deque<RoseVertex> &verts1 = roses.vertices(r1);

                for (auto jt = next(it); jt != ite; ++jt) {
                    left_id r2 = *jt;

                    // Roses must be of the same engine type to be mergeable.
                    if ((!r1.graph() != !r2.graph()) ||
                        (!r1.castle() != !r2.castle())) {
                        continue;
                    }

                    // Castles must have the same reach to be mergeable.
                    if (r1.castle()) {
                        if (r1.castle()->reach() != r2.castle()->reach()) {
                            continue;
                        }
                    }

                    const deque<RoseVertex> &verts2 = roses.vertices(r2);
                    if (!mergeableRoseVertices(tbi, verts1, verts2)) {
                        continue; // No point queueing unmergeable cases.
                    }

                    u32 cpl = commonPrefixLength(r1, r2);
                    pq.push(RoseMergeCandidate(r1, r2, cpl, tie_breaker++));
                }
            }

            processMergeQueue(tbi, roses, pq);
        }
    }

    DEBUG_PRINTF("-----\n");
    DEBUG_PRINTF("exit\n");
    DEBUG_PRINTF("-----\n");
}
namespace {
/**
* Key used to group sets of leftfixes for the dedupeLeftfixesVariableLag path.
*/
struct DedupeLeftKey {
DedupeLeftKey(const RoseBuildImpl &build, RoseVertex v)
: left_hash(hashLeftfix(build.g[v].left)) {
const auto &g = build.g;
for (const auto &e : in_edges_range(v, g)) {
preds.emplace(g[source(e, g)].idx, g[e].rose_top);
}
}
bool operator<(const DedupeLeftKey &b) const {
return tie(left_hash, preds) < tie(b.left_hash, b.preds);
}
private:
/** Quick hash of the leftfix itself. Must be identical for a given pair of
* graphs if is_equal would return true. */
size_t left_hash;
/** For each in-edge, the pair of (parent index, edge top). */
set<pair<size_t, u32>> preds;
};
} // namespace
/**
* This is a generalisation of \ref dedupeLeftfixes which relaxes two
* restrictions: multiple predecessor roles are allowed and the delay used by
* each vertex may not be the same for each vertex. Like \ref dedupeLeftfixes,
* the leftfixes' successor vertices are first grouped to reduce the number of
* potential candidates - the grouping in this case is by the set of
* predecessor roles with their associated top events. For the dedupe to be
* possible, it is required that:
*
* 1. the nfa graphs with respect to the relevant reports are identical
* 2. the nfa graphs are triggered by the same roles with same events (ensured
* by the initial grouping pass)
* 3. all the successor roles of either graph can inspect the combined leftfix
* without advancing the state of the leftfix past the point that another
* successor may want to inspect it; the overlap relationships between the
* involved literals are examined to ensure that this property holds.
*
 * Note: in block mode we restrict the dedupe of prefixes further as some of
 * the logic checks are shared with the mergeLeftfix functions.
*/
void dedupeLeftfixesVariableLag(RoseBuildImpl &tbi) {
    map<DedupeLeftKey, RoseBouquet> roseGrouping;

    DEBUG_PRINTF("entry\n");

    RoseGraph &g = tbi.g;
    // Group candidate leftfixes by (leftfix hash, triggering pred/top set) so
    // only plausibly-identical engines are compared pairwise.
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        const left_id leftfix(g[v].left);

        // Only non-transient for the moment.
        if (contains(tbi.transient, leftfix)) {
            continue;
        }

        if (leftfix.haig()) {
            /* TODO: allow merging of identical haigs */
            continue;
        }

        roseGrouping[DedupeLeftKey(tbi, v)].insert(leftfix, v);
    }

    for (RoseBouquet &roses : roseGrouping | map_values) {
        DEBUG_PRINTF("group of %zu roses\n", roses.size());

        if (roses.size() < 2) {
            continue;
        }

        const RoseComparator rosecmp(g);

        for (auto it = roses.begin(); it != roses.end(); ++it) {
            left_id r1 = *it;
            const deque<RoseVertex> &verts1 = roses.vertices(r1);

            for (auto jt = next(it); jt != roses.end(); ++jt) {
                left_id r2 = *jt;
                const deque<RoseVertex> &verts2 = roses.vertices(r2);

                // Engines must be structurally equal and the successors'
                // literal/lag combinations must be compatible.
                if (!rosecmp(verts1.front(), verts2.front())) {
                    continue;
                }

                if (!mergeableRoseVertices(tbi, verts1, verts2)) {
                    continue;
                }

                DEBUG_PRINTF("%p and %p are dupes\n", r1.graph(), r2.graph());

                // Replace h1 with h2.

                const LeftEngInfo &v2_left = g[verts2.front()].left;
                assert(v2_left.graph.get() == r2.graph());

                for (auto v : verts1) {
                    DEBUG_PRINTF("replacing report %u with %u on %zu\n",
                                 g[v].left.leftfix_report,
                                 v2_left.leftfix_report, g[v].idx);
                    // Each vertex keeps its own lag after adopting v2's
                    // engine info.
                    u32 orig_lag = g[v].left.lag;
                    g[v].left = v2_left;
                    g[v].left.lag = orig_lag;
                }
                roses.insert(r2, verts1);

                // no need to erase h1 from roses, that would invalidate `it'.
                break;
            }
        }
    }
}
/* Returns the smallest top value not already present in the given set. */
static
u32 findUnusedTop(const ue2::flat_set<u32> &tops) {
    u32 candidate = 0;
    while (contains(tops, candidate)) {
        ++candidate;
    }
    assert(candidate < NFA_MAX_TOP_MASKS);
    return candidate;
}
// Remap the top on each of h's start edges according to top_mapping. Every
// existing top value must be present in the mapping.
static
void replaceTops(NGHolder &h, const map<u32, u32> &top_mapping) {
    for (const auto &e : out_edges_range(h.start, h)) {
        NFAVertex v = target(e, h);
        if (v == h.startDs) {
            continue; // startDs self-edge carries no top
        }
        DEBUG_PRINTF("vertex %u has top %u\n", h[v].index, h[e].top);
        assert(contains(top_mapping, h[e].top));
        h[e].top = top_mapping.at(h[e].top);
    }
}
/* Renumbers the tops of h1 so that they do not collide with the tops of h2,
 * recording the renumbering in top_mapping (left empty if no change was
 * needed). Returns false if the combined top count would exceed the limit. */
static
bool setDistinctTops(NGHolder &h1, const NGHolder &h2,
                     map<u32, u32> &top_mapping) {
    ue2::flat_set<u32> tops1 = getTops(h1);
    ue2::flat_set<u32> tops2 = getTops(h2);

    DEBUG_PRINTF("before: h1 has %zu tops, h2 has %zu tops\n", tops1.size(),
                 tops2.size());

    // A merged engine cannot carry more than NFA_MAX_TOP_MASKS tops.
    if (tops1.size() + tops2.size() > NFA_MAX_TOP_MASKS) {
        DEBUG_PRINTF("too many tops!\n");
        return false;
    }

    // If our tops don't intersect, we're OK to merge with no changes.
    if (!has_intersection(tops1, tops2)) {
        DEBUG_PRINTF("tops don't intersect\n");
        return true;
    }

    // Otherwise, renumber every top in h1 so it is disjoint from h2's tops.
    top_mapping.clear();
    for (u32 old_top : tops1) {
        u32 new_top = findUnusedTop(tops2);
        DEBUG_PRINTF("replacing top %u with %u in h1\n", old_top, new_top);
        top_mapping.insert(make_pair(old_top, new_top));
        assert(!contains(tops2, new_top));
        tops2.insert(new_top); // reserve so the next pick stays distinct
    }

    replaceTops(h1, top_mapping);
    return true;
}
/* Makes h1's tops distinct from h2's and rewrites the rose_top property on
 * all in-edges of verts1 to follow the new numbering. */
bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                         const deque<RoseVertex> &verts1) {
    map<u32, u32> top_mapping;
    if (!setDistinctTops(h1, h2, top_mapping)) {
        return false;
    }

    if (top_mapping.empty()) {
        return true; // No remapping necessary.
    }

    for (auto v : verts1) {
        DEBUG_PRINTF("vertex %zu\n", g[v].idx);
        assert(!g[v].left.haig);
        assert(!g[v].left.dfa);
        for (const auto &e : in_edges_range(v, g)) {
            u32 old_top = g[e].rose_top;
            DEBUG_PRINTF("t=%u\n", old_top);
            assert(contains(top_mapping, old_top));
            g[e].rose_top = top_mapping[old_top];
            DEBUG_PRINTF("edge (%zu,%zu) went from top %u to %u\n",
                         g[source(e, g)].idx, g[target(e, g)].idx, old_top,
                         top_mapping[old_top]);
        }
    }

    return true;
}
/* Suffix analogue of setDistinctRoseTops: makes h1's tops distinct from h2's
 * and rewrites the suffix.top property of verts1 accordingly. */
static
bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                           const deque<RoseVertex> &verts1) {
    map<u32, u32> top_mapping;
    if (!setDistinctTops(h1, h2, top_mapping)) {
        return false;
    }

    if (top_mapping.empty()) {
        return true; // No remapping necessary.
    }

    for (auto v : verts1) {
        DEBUG_PRINTF("vertex %zu\n", g[v].idx);
        u32 old_top = g[v].suffix.top;
        assert(contains(top_mapping, old_top));
        g[v].suffix.top = top_mapping[old_top];
    }

    return true;
}
static
bool hasMaxTops(const NGHolder &h) {
return getTops(h).size() == NFA_MAX_TOP_MASKS;
}
/** \brief Estimate the number of accel states in the given graph when built as
 * an NFA.
 *
 * (The easiest way to estimate something like this is to actually build it:
 * the criteria for NFA acceleration are quite complicated and buried in
 * limex_compile.)
 *
 * Callers use the result as a budget against NFA_MAX_ACCEL_STATES when
 * deciding whether a merge would destroy acceleration.
 */
static
u32 estimatedAccelStates(const RoseBuildImpl &tbi, const NGHolder &h) {
    return countAccelStates(h, &tbi.rm, tbi.cc);
}
/* Greedily pairwise-merges the NFA infixes in the given bouquet, subject to
 * top count, acceleration and size limits. */
static
void mergeNfaLeftfixes(RoseBuildImpl &tbi, RoseBouquet &roses) {
    RoseGraph &g = tbi.g;
    DEBUG_PRINTF("%zu nfa rose merge candidates\n", roses.size());

    // We track the number of accelerable states for each graph in a map and
    // only recompute them when the graph is modified.
    ue2::unordered_map<left_id, u32> accel_count;
    for (const auto &rose : roses) {
        assert(rose.graph()->kind == NFA_INFIX);
        accel_count[rose] = estimatedAccelStates(tbi, *rose.graph());
    }

    for (auto it = roses.begin(); it != roses.end(); ++it) {
        left_id r1 = *it;
        const deque<RoseVertex> &verts1 = roses.vertices(r1);

        // Engines merged into r1 this pass; removed from the bouquet at the
        // end so iterators stay valid.
        deque<left_id> merged;
        for (auto jt = next(it); jt != roses.end(); ++jt) {
            left_id r2 = *jt;
            const deque<RoseVertex> &verts2 = roses.vertices(r2);
            DEBUG_PRINTF("consider merging rose %p (%zu verts) "
                         "with %p (%zu verts)\n",
                         r1.graph(), verts1.size(), r2.graph(), verts2.size());

            if (hasMaxTops(*r1.graph())) {
                DEBUG_PRINTF("h1 has hit max tops\n");
                break; // next h1
            }

            u32 accel1 = accel_count[r1];
            if (accel1 >= NFA_MAX_ACCEL_STATES) {
                DEBUG_PRINTF("h1 has hit max accel\n");
                break; // next h1
            }

            u32 accel2 = accel_count[r2];
            if (accel1 + accel2 > NFA_MAX_ACCEL_STATES) {
                DEBUG_PRINTF("not merging, might make unaccel (accel1=%u, "
                             "accel2=%u)\n",
                             accel1, accel2);
                continue; // next h2
            }

            // Attempt to merge h2 into h1.

            NGHolder victim;
            cloneHolder(victim, *r2.graph());

            // Store a copy of the in-edge properties in case we have to roll
            // back.
            map<RoseEdge, RoseEdgeProps> edge_props;
            for (auto v : verts2) {
                for (const auto &e : in_edges_range(v, g)) {
                    edge_props[e] = g[e];
                }
            }

            // Renumbers victim's tops and rewrites verts2's in-edges to use
            // the new top values.
            if (!setDistinctRoseTops(g, victim, *r1.graph(), verts2)) {
                DEBUG_PRINTF("can't set distinct tops\n");
                continue; // next h2
            }

            assert(victim.kind == r1.graph()->kind);
            assert(!generates_callbacks(*r1.graph()));

            if (!mergeNfaPair(victim, *r1.graph(), nullptr, tbi.cc)) {
                DEBUG_PRINTF("merge failed\n");
                // Roll back in-edge properties.
                for (const auto &m : edge_props) {
                    g[m.first] = m.second;
                }
                continue; // next h2
            }

            // Update h2's roses to point to h1 now
            shared_ptr<NGHolder> winner = g[verts1.front()].left.graph;
            for (auto v : verts2) {
                g[v].left.graph = winner;
            }

            roses.insert(r1, verts2);
            merged.push_back(r2);

            if (num_vertices(*winner) >= small_merge_max_vertices(tbi.cc)) {
                DEBUG_PRINTF("h1 now has %zu vertices, proceeding to next\n",
                             num_vertices(*winner));
                break; // next h1
            }

            // Update h1's accel count estimate.
            accel_count[r1] = estimatedAccelStates(tbi, *winner);
        }

        DEBUG_PRINTF("%zu roses merged\n", merged.size());
        roses.erase_all(merged.begin(), merged.end());
    }
}
/* Greedily pairwise-merges the castles in the given bouquet.
 * caller must have already ensured that candidates have the same reach */
static
void mergeCastleChunk(RoseBuildImpl &tbi, RoseBouquet &cands) {
    RoseGraph &g = tbi.g;
    DEBUG_PRINTF("%zu castle rose merge candidates\n", cands.size());

    // Castles merged into castle1 this pass; removed at the end of each
    // outer iteration so iterators stay valid.
    deque<left_id> merged;

    for (auto it = cands.begin(); it != cands.end(); ++it) {
        left_id r1 = *it;
        CastleProto &castle1 = *r1.castle();
        const deque<RoseVertex> &verts1 = cands.vertices(r1);
        merged.clear();

        for (auto jt = next(it); jt != cands.end(); ++jt) {
            left_id r2 = *jt;
            CastleProto &castle2 = *r2.castle();
            const deque<RoseVertex> &verts2 = cands.vertices(r2);

            if (castle1.repeats.size() == castle1.max_occupancy) {
                DEBUG_PRINTF("castle1 has hit max occupancy\n");
                break; // next castle1
            }

            assert(castle1.reach() == castle2.reach());

            if (!mergeableRoseVertices(tbi, verts1, verts2)) {
                DEBUG_PRINTF("not mergeable\n");
                continue; // next castle2
            }

            DEBUG_PRINTF("castle1=%p (size %zu), castle2=%p (size %zu)\n",
                         &castle1, castle1.repeats.size(), &castle2,
                         castle2.repeats.size());

            map<u32, u32> top_map;
            if (!mergeCastle(castle1, castle2, top_map)) {
                DEBUG_PRINTF("couldn't merge\n");
                continue; // next castle2
            }

            // Update castle2's roses to point to castle1 now.
            shared_ptr<CastleProto> winner = g[verts1.front()].left.castle;
            for (auto v : verts2) {
                g[v].left.castle = winner;
                // Follow the top renumbering performed by mergeCastle.
                for (const auto &e : in_edges_range(v, g)) {
                    g[e].rose_top = top_map.at(g[e].rose_top);
                }
            }

            cands.insert(r1, verts2);
            merged.push_back(r2);
        }

        DEBUG_PRINTF("%zu roses merged\n", merged.size());
        cands.erase_all(merged.begin(), merged.end());
    }
}
/**
* This pass attempts to merge prefix/infix engines with a small number of
 * vertices together into larger engines. The engines must not have a
 * reformed start dot star (due to a leading repeat) nor an infix LBR. Engines
* that have compatible lag are greedily grouped such that they remain
* accelerable and only have a small number of states. Note: if a role has an
* infix with multiple trigger vertices, the role will be left unchanged by this
* pass and will remain using an unmerged graph.
*/
void mergeSmallLeftfixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeRose || !tbi.cc.grey.roseMultiTopRoses) {
        return;
    }

    RoseGraph &g = tbi.g;

    // Gather the set of small NFA infixes that are candidates for merging.
    RoseBouquet candidates;
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        // Handle single-parent infixes only.
        if (tbi.isRootSuccessor(v)) {
            continue;
        }

        left_id leftfix(g[v].left);

        // Only non-transient for the moment.
        if (contains(tbi.transient, leftfix)) {
            continue;
        }

        // No DFAs or Haigs right now.
        if (leftfix.dfa() || leftfix.haig()) {
            continue;
        }

        // Castles are handled by a different pass.
        if (leftfix.castle()) {
            continue;
        }

        assert(leftfix.graph());
        NGHolder &h = *leftfix.graph();

        /* Ensure that kind on the graph is correct */
        assert(h.kind == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX));

        if (hasReformedStartDotStar(h, tbi.cc.grey)) {
            /* We would lose optimisations of the leading repeat by merging. */
            continue;
        }

        // Don't merge cases that will become LBRs or haigs.
        if (isLargeLBR(h, tbi.cc.grey)) {
            continue;
        }

        // Small roses only.
        if (num_vertices(h) > small_rose_threshold(tbi.cc)) {
            continue;
        }

        candidates.insert(leftfix, v);
    }

    // Split the candidates into bounded-size chunks and merge each chunk.
    deque<RoseBouquet> chunks;
    chunkBouquets(candidates, chunks, MERGE_GROUP_SIZE_MAX);
    candidates.clear();
    DEBUG_PRINTF("chunked nfa roses into %zu groups\n", chunks.size());

    for (auto &chunk : chunks) {
        mergeNfaLeftfixes(tbi, chunk);
    }
}
void mergeCastleLeftfixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeRose || !tbi.cc.grey.roseMultiTopRoses ||
        !tbi.cc.grey.allowCastle) {
        return;
    }

    RoseGraph &g = tbi.g;

    // Bucket castle infixes by reach: only castles with identical reach can
    // be merged.
    map<CharReach, RoseBouquet> by_reach;
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        // Handle single-parent infixes only.
        if (tbi.isRootSuccessor(v)) {
            continue;
        }

        const left_id leftfix(g[v].left);

        // Only non-transient for the moment.
        if (contains(tbi.transient, leftfix)) {
            continue;
        }

        if (!leftfix.castle()) {
            continue;
        }

        by_reach[leftfix.castle()->reach()].insert(leftfix, v);
    }

    for (auto &m : by_reach) {
        DEBUG_PRINTF("%zu castles for reach: %s\n", m.second.size(),
                     describeClass(m.first).c_str());
        RoseBouquet &candidates = m.second;

        deque<RoseBouquet> chunks;
        chunkBouquets(candidates, chunks, MERGE_CASTLE_GROUP_SIZE_MAX);
        candidates.clear();

        for (auto &chunk : chunks) {
            mergeCastleChunk(tbi, chunk);
        }
    }
}
/* Greedily pairwise-merges the suffixes in the given bouquet; the accel
 * limits are only enforced when acyclic is false. */
static
void mergeSuffixes(RoseBuildImpl &tbi, SuffixBouquet &suffixes,
                   const bool acyclic) {
    RoseGraph &g = tbi.g;

    DEBUG_PRINTF("group has %zu suffixes\n", suffixes.size());

    // If this isn't an acyclic case, we track the number of accelerable states
    // for each graph in a map and only recompute them when the graph is
    // modified.
    ue2::unordered_map<suffix_id, u32> accel_count;
    if (!acyclic) {
        for (const auto &suffix : suffixes) {
            assert(suffix.graph() && suffix.graph()->kind == NFA_SUFFIX);
            accel_count[suffix] = estimatedAccelStates(tbi, *suffix.graph());
        }
    }

    for (auto it = suffixes.begin(); it != suffixes.end(); ++it) {
        suffix_id s1 = *it;
        const deque<RoseVertex> &verts1 = suffixes.vertices(s1);
        assert(s1.graph() && s1.graph()->kind == NFA_SUFFIX);

        // Suffixes merged into s1 this pass; removed at the end so that
        // iterators stay valid.
        deque<suffix_id> merged;
        for (auto jt = next(it); jt != suffixes.end(); ++jt) {
            suffix_id s2 = *jt;
            const deque<RoseVertex> &verts2 = suffixes.vertices(s2);
            assert(s2.graph() && s2.graph()->kind == NFA_SUFFIX);

            if (hasMaxTops(*s1.graph())) {
                DEBUG_PRINTF("h1 has hit max tops\n");
                break; // next h1
            }

            if (!acyclic) {
                u32 accel1 = accel_count[s1];
                if (accel1 >= NFA_MAX_ACCEL_STATES) {
                    DEBUG_PRINTF("h1 has hit max accel\n");
                    break; // next h1
                }

                u32 accel2 = accel_count[s2];
                if (accel1 + accel2 > NFA_MAX_ACCEL_STATES) {
                    DEBUG_PRINTF("not merging, might make unaccel (accel1=%u, "
                                 "accel2=%u)\n",
                                 accel1, accel2);
                    continue; // next h2
                }
            }

            // Attempt to merge h2 into h1.

            NGHolder victim;
            cloneHolder(victim, *s2.graph());

            // Store a copy of the suffix tops in case we have to roll back.
            map<RoseVertex, u32> old_tops;
            for (auto v : verts2) {
                old_tops[v] = g[v].suffix.top;
            }

            // Renumbers victim's tops and rewrites verts2's suffix.top values.
            if (!setDistinctSuffixTops(g, victim, *s1.graph(), verts2)) {
                DEBUG_PRINTF("can't set distinct tops\n");
                continue; // next h2
            }

            if (!mergeNfaPair(victim, *s1.graph(), &tbi.rm, tbi.cc)) {
                DEBUG_PRINTF("merge failed\n");
                // Roll back in-edge properties.
                for (const auto &m : old_tops) {
                    g[m.first].suffix.top = m.second;
                }
                continue; // next h2
            }

            // Update h2's roses to point to h1 now
            shared_ptr<NGHolder> winner = g[verts1.front()].suffix.graph;
            for (auto v : verts2) {
                g[v].suffix.graph = winner;
            }
            suffixes.insert(s1, verts2);
            merged.push_back(s2);

            if (num_vertices(*s1.graph()) >= small_merge_max_vertices(tbi.cc)) {
                DEBUG_PRINTF("h1 now has %zu vertices, proceeding to next\n",
                             num_vertices(*s1.graph()));
                break; // next h1
            }

            if (!acyclic) {
                // Update h1's accel count estimate.
                accel_count[s1] = estimatedAccelStates(tbi, *s1.graph());
            }
        }

        DEBUG_PRINTF("%zu suffixes merged\n", merged.size());
        suffixes.erase_all(merged.begin(), merged.end());
    }
}
/**
* This merge pass combines suffixes from unrelated roles into a single
* suffix with multiple top events in order to distinguish the triggers
* from differing roles. mergeAcyclicSuffixes only considers acyclic suffixes
* while mergeSmallSuffixes only considers small suffixes. The merges will
* group roles with suffixes in the graph into clusters of at most
* \ref MERGE_GROUP_SIZE_MAX. Each cluster is processed by iterating over the
* suffixes and attempting to pairwise merge it with another member. Merges
* will fail if the result is not implementable, requires too many distinct top
* events, or if it losses the ability to be accelerated. The merge will modify
* the existing suffix graph of the one member (g1), the other member updates
* it graph to refer to g1 instead of its previous graph (g2) and use the new
* tops created. Other roles may have been sharing g1 - these are unaffected by
* the change as the existing top events are left untouched. Other roles using
* g2 are also unaffected as g2 will continue to exist until while it has any
* roles triggering it.
*
* Note: suffixes destined for the LBR are not considered for these merges as
* the LBR can only handle a single repeat and this type of repeat is ideally
* handled outside of an NFA or DFA.
*/
void mergeAcyclicSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeSuffixes) {
        return;
    }

    SuffixBouquet suffixes;
    RoseGraph &g = tbi.g;

    // Collect acyclic, non-LBR suffix graphs that are not in the EOD table.
    for (auto v : vertices_range(g)) {
        shared_ptr<NGHolder> h = g[v].suffix.graph;
        if (!h || tbi.isInETable(v)) {
            continue;
        }

        assert(!g[v].suffix.haig);

        if (!isAcyclic(*h)) {
            continue;
        }

        if (isLargeLBR(*h, tbi.cc.grey)) {
            DEBUG_PRINTF("not considering LBR suffix for merge\n");
            continue;
        }

        suffixes.insert(g[v].suffix, v);
    }

    // Process the candidates in bounded-size chunks.
    deque<SuffixBouquet> groups;
    chunkBouquets(suffixes, groups, MERGE_GROUP_SIZE_MAX);
    DEBUG_PRINTF("chunked %zu suffixes into %zu groups\n", suffixes.size(),
                 groups.size());
    suffixes.clear();

    for (auto &group : groups) {
        mergeSuffixes(tbi, group, true);
    }
}
/**
* This merge pass combines suffixes from unrelated roles into a single
* suffix with multiple top events in order to distinguish the triggers
* from differing roles. mergeAcyclicSuffixes only considers acyclic suffixes
* while mergeSmallSuffixes only considers small suffixes. The merges will
* group roles with suffixes in the graph into clusters of at most
* \ref MERGE_GROUP_SIZE_MAX. Each cluster is processed by iterating over the
* suffixes and attempting to pairwise merge it with another member. Merges
* will fail if the result is not implementable, requires too many distinct top
 * events, or if it loses the ability to be accelerated. The merge will modify
 * the existing suffix graph of the one member (g1); the other member updates
 * its graph to refer to g1 instead of its previous graph (g2) and uses the new
 * tops created. Other roles may have been sharing g1 - these are unaffected by
 * the change as the existing top events are left untouched. Other roles using
 * g2 are also unaffected as g2 will continue to exist while it has any
* roles triggering it.
*
* Note: suffixes destined for the LBR are not considered for these merges as
* the LBR can only handle a single repeat and this type of repeat is ideally
* handled outside of an NFA or DFA.
*/
void mergeSmallSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeSuffixes) {
        return;
    }

    RoseGraph &g = tbi.g;
    SuffixBouquet suffixes;

    // Collect small cyclic, non-LBR suffix graphs outside the EOD table.
    for (auto v : vertices_range(g)) {
        shared_ptr<NGHolder> h = g[v].suffix.graph;
        if (!h || tbi.isInETable(v)) {
            continue;
        }

        assert(!g[v].suffix.haig);

        // Leave acyclics out for the moment.
        if (isAcyclic(*h)) {
            continue;
        }

        // Small-ish suffixes only.
        if (num_vertices(*h) > 32) {
            continue;
        }

        if (isLargeLBR(*h, tbi.cc.grey)) {
            DEBUG_PRINTF("not considering LBR suffix for merge\n");
            continue;
        }

        suffixes.insert(g[v].suffix, v);
    }

    // Process the candidates in bounded-size chunks.
    deque<SuffixBouquet> groups;
    chunkBouquets(suffixes, groups, MERGE_GROUP_SIZE_MAX);
    DEBUG_PRINTF("chunked %zu suffixes into %zu groups\n", suffixes.size(),
                 groups.size());
    suffixes.clear();

    for (auto &group : groups) {
        mergeSuffixes(tbi, group, false);
    }
}
/* Erase-remove all dead outfixes from the vector. */
static
void removeDeadOutfixes(vector<OutfixInfo> &outfixes) {
    auto dead_pred = [](const OutfixInfo &o) { return o.is_dead(); };
    outfixes.erase(remove_if(outfixes.begin(), outfixes.end(), dead_pred),
                   outfixes.end());
}
static
void mergeOutfixInfo(OutfixInfo &winner, const OutfixInfo &victim) {
    assert(!winner.is_dead());

    // Widen the winner's width/offset bounds to cover both engines.
    winner.minWidth = min(winner.minWidth, victim.minWidth);
    winner.maxWidth = max(winner.maxWidth, victim.maxWidth);
    winner.maxBAWidth = max(winner.maxBAWidth, victim.maxBAWidth);
    winner.maxOffset = max(winner.maxOffset, victim.maxOffset);

    mergeReverseAccelerationInfo(winner.rev_info, victim.rev_info);

    // This outfix can be ignored in small block mode only if both inputs
    // could be; the dedupe layer at runtime will protect us from extra
    // matches if only one was in the small block matcher.
    winner.in_sbmatcher &= victim.in_sbmatcher;

    // We should never have merged outfixes that differ in these properties.
    assert(winner.chained == victim.chained);
}
static
map<NGHolder *, NGHolder *> chunkedNfaMerge(RoseBuildImpl &build,
                                            const vector<NGHolder *> &nfas) {
    // Maps each merged-away holder to the holder it was merged into.
    map<NGHolder *, NGHolder *> merged;

    vector<NGHolder *> batch;
    const size_t num = nfas.size();
    for (size_t i = 0; i < num; i++) {
        assert(nfas[i]->kind == NFA_OUTFIX);
        batch.push_back(nfas[i]);
        // Merge a batch when it fills up or we run out of NFAs.
        if (batch.size() == MERGE_GROUP_SIZE_MAX || i + 1 == num) {
            mergeNfaCluster(batch, &build.rm, merged, build.cc);
            batch.clear();
        }
    }

    return merged;
}
static
void mergeOutfixNfas(RoseBuildImpl &tbi, vector<NGHolder *> &nfas) {
    DEBUG_PRINTF("merging %zu nfas\n", nfas.size());
    if (nfas.size() < 2) {
        return;
    }

    vector<OutfixInfo> &outfixes = tbi.outfixes;

    // Key is index into the outfix array, as iterators etc may be invalidated
    // by element addition.
    map<NGHolder *, size_t> nfa_mapping;
    for (size_t i = 0; i < outfixes.size(); i++) {
        if (outfixes[i].holder) {
            nfa_mapping[outfixes[i].holder.get()] = i;
        }
    }

    map<NGHolder *, NGHolder *> merged = chunkedNfaMerge(tbi, nfas);
    if (merged.empty()) {
        return;
    }

    DEBUG_PRINTF("%zu nfas merged\n", merged.size());

    // Update the outfix info for merged holders. Use at() rather than
    // operator[]: a key absent from nfa_mapping would otherwise be
    // default-inserted as index zero, silently corrupting an unrelated
    // outfix instead of failing loudly.
    for (const auto &m : merged) {
        OutfixInfo &victim = outfixes.at(nfa_mapping.at(m.first));
        OutfixInfo &winner = outfixes.at(nfa_mapping.at(m.second));
        mergeOutfixInfo(winner, victim);
        victim.clear();
    }

    removeDeadOutfixes(outfixes);
}
namespace {
/** \brief Merge functor for McClellan (raw_dfa) outfixes. */
struct MergeMcClellan {
    MergeMcClellan(const ReportManager &rm_arg, const Grey &grey_arg)
        : rm(rm_arg), grey(grey_arg) {}

    /** \brief Attempt to merge two DFAs; returns nullptr on failure. */
    unique_ptr<raw_dfa> operator()(const raw_dfa *a, const raw_dfa *b) const {
        assert(a && b);
        return mergeTwoDfas(a, b, DFA_MERGE_MAX_STATES, &rm, grey);
    }

    /** \brief Install a merged DFA into the winning outfix. */
    static void transfer(OutfixInfo &outfix, unique_ptr<raw_dfa> d) {
        outfix.rdfa = move(d);
    }

private:
    const ReportManager &rm;
    const Grey &grey;
};

/** \brief Merge functor for Haig (raw_som_dfa) outfixes. */
struct MergeHaig {
    explicit MergeHaig(u32 limit_in) : limit(limit_in) {}

    /** \brief Attempt to merge two Haigs; returns nullptr on failure. */
    unique_ptr<raw_som_dfa> operator()(const raw_som_dfa *a,
                                       const raw_som_dfa *b) const {
        assert(a && b);
        return attemptToMergeHaig({a, b}, limit);
    }

    /** \brief Install a merged Haig into the winning outfix. */
    static void transfer(OutfixInfo &outfix, unique_ptr<raw_som_dfa> d) {
        outfix.haig = move(d);
    }

private:
    const u32 limit; //!< state limit for merged result.
};
} // namespace
/**
* Generic pairwise merge algorithm that can be used for either McClellan
* (RawDfa=raw_dfa) or Haig (RawDfa=raw_som_dfa). Delegates the actual merge
* operation to a merge functor, which allows the caller to set some policy
* (state limits, etc).
*
* This is currently astonishingly simple and just considers every pair of
* DFAs, slow and steady. We may wish to actually apply a merge ordering
* strategy in the future.
*/
/**
 * \param dfas Group of DFAs to merge; entries merged away are set to nullptr
 * in place.
 * \param dfa_mapping Maps each live DFA to the index of its owning outfix in
 * \a outfixes (indices, not iterators, as the outfix vector may reallocate).
 * Updated as merges occur.
 * \param outfixes All outfix info; merge victims are cleared.
 * \param merge_func Functor that attempts to merge two DFAs and can transfer
 * a merged result into an outfix.
 */
template<class RawDfa, class MergeFunctor>
static
void pairwiseDfaMerge(vector<RawDfa *> &dfas,
                      ue2::unordered_map<RawDfa *, size_t> &dfa_mapping,
                      vector<OutfixInfo> &outfixes,
                      MergeFunctor merge_func) {
    DEBUG_PRINTF("merging group of size %zu\n", dfas.size());
    for (auto it = dfas.begin(), ite = dfas.end(); it != ite; ++it) {
        if (!*it) {
            continue; // Already merged away on an earlier iteration.
        }
        for (auto jt = next(it); jt != ite; ++jt) {
            if (!*jt) {
                continue; // Already merged away on an earlier iteration.
            }
            DEBUG_PRINTF("try merge %p and %p\n", *it, *jt);
            unique_ptr<RawDfa> rdfa = merge_func(*it, *jt);
            if (!rdfa) {
                continue; // Merge failed.
            }
            DEBUG_PRINTF("merge succeeded, built %p\n", rdfa.get());
            // The merged DFA takes over the outfix slot owned by *it; the
            // outfix owning *jt becomes the victim.
            OutfixInfo &winner = outfixes.at(dfa_mapping[*it]);
            OutfixInfo &victim = outfixes.at(dfa_mapping[*jt]);
            assert(!winner.is_dead() && !victim.is_dead());
            // Rehome the mapping under the merged DFA's address before
            // ownership is transferred into the winning outfix.
            RawDfa *dfa_ptr = rdfa.get();
            dfa_mapping[dfa_ptr] = dfa_mapping[*it];
            dfa_mapping.erase(*it);
            merge_func.transfer(winner, move(rdfa));
            mergeOutfixInfo(winner, victim);
            victim.clear();
            *jt = nullptr; // to be deleted.
            // Continue merging later DFAs against the freshly-merged result.
            *it = dfa_ptr;
        }
    }
}
template<class RawDfa, class MergeFunctor>
static
void chunkedDfaMerge(vector<RawDfa *> &dfas,
                     ue2::unordered_map<RawDfa *, size_t> &dfa_mapping,
                     vector<OutfixInfo> &outfixes,
                     MergeFunctor merge_func) {
    DEBUG_PRINTF("begin merge of %zu dfas\n", dfas.size());

    vector<RawDfa *> survivors; // DFAs remaining after all chunk merges.
    vector<RawDfa *> batch;     // current chunk, up to DFA_CHUNK_SIZE_MAX.

    for (auto it = begin(dfas), ite = end(dfas); it != ite; ++it) {
        batch.push_back(*it);
        if (batch.size() >= DFA_CHUNK_SIZE_MAX || next(it) == ite) {
            // Merge within the chunk; merged-away entries become null.
            pairwiseDfaMerge(batch, dfa_mapping, outfixes, merge_func);
            survivors.insert(end(survivors), begin(batch), end(batch));
            batch.clear();
        }
    }

    // Strip the null (merged) entries and hand the remainder back to the
    // caller for subsequent use.
    survivors.erase(remove(begin(survivors), end(survivors), nullptr),
                    end(survivors));
    dfas.swap(survivors);
    DEBUG_PRINTF("after merge there are %zu dfas\n", dfas.size());
}
static
void mergeOutfixDfas(RoseBuildImpl &tbi, vector<raw_dfa *> &dfas) {
    // Fixed debug message: this function merges DFAs, not NFAs.
    DEBUG_PRINTF("merging %zu dfas\n", dfas.size());
    if (dfas.size() < 2) {
        return;
    }

    vector<OutfixInfo> &outfixes = tbi.outfixes;

    /* key is index into outfix array as iterators, etc may be invalidated by
     * element addition. */
    ue2::unordered_map<raw_dfa *, size_t> dfa_mapping;
    for (size_t i = 0; i < outfixes.size(); i++) {
        if (outfixes[i].rdfa) {
            dfa_mapping[outfixes[i].rdfa.get()] = i;
        }
    }

    chunkedDfaMerge(dfas, dfa_mapping, outfixes,
                    MergeMcClellan(tbi.rm, tbi.cc.grey));
    removeDeadOutfixes(outfixes);
}
/**
 * Attempts to determinise NFA outfixes into McClellan DFAs and merge them
 * with each other (and with any pre-existing DFA outfixes). Only does work
 * when at least two engines could be combined this way.
 */
static
void mergeOutfixCombo(RoseBuildImpl &tbi, const ReportManager &rm,
                      const Grey &grey) {
    if (!grey.roseMcClellanOutfix) {
        return;
    }
    DEBUG_PRINTF("merge combo\n");
    bool seen_dfa = false;
    u32 nfa_count = 0;
    // Survey the engine types currently in use.
    for (const auto &outfix : tbi.outfixes) {
        if (outfix.holder) {
            DEBUG_PRINTF("nfa\n");
            nfa_count++;
        } else if (outfix.rdfa) {
            DEBUG_PRINTF("dfa\n");
            seen_dfa = true;
        }
    }
    DEBUG_PRINTF("nfa %u dfas present %d\n", nfa_count,
                 (int)seen_dfa);
    // Need at least two NFAs, or one NFA plus a DFA, for a combo merge to be
    // possible.
    if (!nfa_count || (nfa_count == 1 && !seen_dfa)) {
        DEBUG_PRINTF("no combo merges possible\n");
        return;
    }
    /* key is index into outfix array as iterators, etc may be invalidated by
     * element addition. */
    size_t new_dfas = 0;
    ue2::unordered_map<raw_dfa *, size_t> dfa_mapping;
    vector<raw_dfa *> dfas;
    for (auto it = tbi.outfixes.begin(); it != tbi.outfixes.end(); ++it) {
        assert(!it->is_dead());
        assert(!it->chained);
        if (it->rdfa) {
            // Existing DFA outfix: add to the merge set as-is.
            dfas.push_back(it->rdfa.get());
            dfa_mapping[it->rdfa.get()] = it - tbi.outfixes.begin();
            continue;
        }
        if (!it->holder) {
            continue;
        }
        NGHolder *h = it->holder.get();
        assert(h->kind == NFA_OUTFIX);
        // Attempt to determinise the NFA; may fail (returns nullptr).
        auto rdfa = buildMcClellan(*h, &rm, grey);
        if (rdfa) {
            // Transform this outfix into a DFA and add it to the merge set.
            // Note: clear() drops the NFA holder before the new DFA is
            // installed.
            dfa_mapping[rdfa.get()] = it - tbi.outfixes.begin();
            dfas.push_back(rdfa.get());
            it->clear();
            it->rdfa = move(rdfa);
            new_dfas++;
        }
    }
    DEBUG_PRINTF("constructed %zu new dfas\n", new_dfas);
    if (!new_dfas) {
        /* assumes normal dfas have already been fully merged */
        return;
    }
    chunkedDfaMerge(dfas, dfa_mapping, tbi.outfixes,
                    MergeMcClellan(tbi.rm, tbi.cc.grey));
    removeDeadOutfixes(tbi.outfixes);
}
static
void mergeOutfixHaigs(RoseBuildImpl &tbi, vector<raw_som_dfa *> &dfas,
                      u32 limit) {
    if (dfas.size() < 2) {
        return;
    }

    vector<OutfixInfo> &outfixes = tbi.outfixes;

    // Map each Haig back to the index of its owning outfix; indices survive
    // reallocation of the outfix vector where iterators would not.
    ue2::unordered_map<raw_som_dfa *, size_t> dfa_mapping;
    for (size_t i = 0; i < outfixes.size(); i++) {
        if (outfixes[i].haig) {
            dfa_mapping.emplace(outfixes[i].haig.get(), i);
        }
    }

    chunkedDfaMerge(dfas, dfa_mapping, outfixes, MergeHaig(limit));
    removeDeadOutfixes(outfixes);
}
/**
* This pass attempts to merge outfix engines together. At this point in time,
* the engine type (NFA, DFA, Haig) has already been decided for each outfix
* and outfixes can only merged with others of their same type. NFAs are merged
* in a priority order based on common prefix length. The other types are
* merged blindly. Engines are merged to the extent that they can still be
* implemented efficiently.
*/
void mergeOutfixes(RoseBuildImpl &tbi) {
    if (!tbi.cc.grey.mergeOutfixes) {
        return;
    }

    // Bucket the outfixes by the engine type that has already been chosen
    // for them; each bucket is merged separately.
    vector<NGHolder *> nfas;
    vector<raw_dfa *> dfas;
    vector<raw_som_dfa *> som_dfas;

    for (const auto &outfix : tbi.outfixes) {
        assert(!outfix.chained);
        if (outfix.rdfa) {
            dfas.push_back(outfix.rdfa.get());
        } else if (outfix.holder) {
            nfas.push_back(outfix.holder.get());
        } else if (outfix.haig) {
            som_dfas.push_back(outfix.haig.get());
        }
    }

    DEBUG_PRINTF("merging %zu dfas, %zu nfas\n",
                 dfas.size(), nfas.size());

    mergeOutfixNfas(tbi, nfas);
    mergeOutfixDfas(tbi, dfas);
    // Haigs get two passes with increasing state limits.
    mergeOutfixHaigs(tbi, som_dfas, 255);
    mergeOutfixHaigs(tbi, som_dfas, 8192);
    mergeOutfixCombo(tbi, tbi.rm, tbi.cc.grey);
}
/**
 * Determines the squash distance to use for a puffette's MPV trigger.
 *
 * Collects the final byte of every literal in role \a tv: if none of those
 * bytes fall within the repeat's reach \a cr, the triggering byte always
 * interrupts the repeat and the full \a min_width may be squashed. Returns 0
 * (no squashing) if any literal is delayed, lives outside the floating or
 * EOD-anchored tables, or ends in a byte within \a cr.
 */
static
u32 allowedSquashDistance(const CharReach &cr, u32 min_width,
                          const RoseBuildImpl &tbi,
                          RoseVertex tv) {
    CharReach accept_cr;
    DEBUG_PRINTF("hello |cr|=%zu\n", cr.count());
    const RoseGraph &g = tbi.g;
    /* TODO: inspect further back in the pattern */
    for (u32 lit_id : g[tv].literals) {
        const rose_literal_id &lit = tbi.literals.right.at(lit_id);
        if (lit.delay) {
            return 0; /* TODO: better */
        }
        if (lit.table != ROSE_FLOATING && lit.table != ROSE_EOD_ANCHORED) {
            return 0;
        }
        assert(!lit.s.empty());
        // Union of the last byte of each literal that triggers this role.
        accept_cr |= *lit.s.rbegin();
    }
    DEBUG_PRINTF("|accept_cr|=%zu\n", accept_cr.count());
    if ((accept_cr & cr).any()) {
        DEBUG_PRINTF("no squash\n");
        return 0; /* the accept byte doesn't always kill the puffette. TODO:
                   * maybe if we look further back we could find something that
                   * would kill the puffette... */
    }
    DEBUG_PRINTF("allowed to squash %u\n", min_width);
    return min_width;
}
/**
 * Replaces "puffable" suffix graphs (pure repeats over a single reach) with
 * puffettes chained onto the MPV: the suffix engine is removed from the role
 * and replaced with a report that fires an MPV trigger event.
 */
void mergePuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");
    if (!tbi.cc.grey.mergeSuffixes) {
        return;
    }
    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        shared_ptr<NGHolder> h = g[v].suffix.graph;
        if (!h) {
            continue;
        }
        assert(!g[v].suffix.haig);
        assert(!g[v].eod_accept);
        assert(onlyOneTop(*h)); /* we should not have merged yet */
        // A role with equal min/max offset always triggers at a fixed depth.
        bool fixed_depth = g[v].min_offset == g[v].max_offset;
        if (!isPuffable(*h, fixed_depth, tbi.rm, tbi.cc.grey)) {
            continue;
        }
        // A puffable graph must be a single pure repeat with one report.
        PureRepeat repeat;
        if (!isPureRepeat(*h, repeat)) {
            assert(0);
            continue;
        }
        if (repeat.bounds.min == depth(0)) {
            assert(0); // No vacuous puffs allowed.
            continue;
        }
        assert(repeat.bounds.min.is_finite() &&
               repeat.bounds.max.is_reachable());
        assert(repeat.bounds.max == repeat.bounds.min ||
               repeat.bounds.max.is_infinite());
        const bool unbounded = repeat.bounds.max.is_infinite();
        const set<ReportID> reports = all_reports(*h);
        assert(reports.size() == 1);
        ReportID report = *reports.begin();
        DEBUG_PRINTF("got puffette candidate %u:%s\n", report,
                     repeat.bounds.str().c_str());
        // Add the repeat as a new tail on the MPV chain; we get back the
        // queue and top event used to trigger this puffette.
        raw_puff rp(repeat.bounds.min, unbounded, report, repeat.reach);
        u32 queue;
        u32 event;
        tbi.addChainTail(rp, &queue, &event);
        u32 squashDistance =
            allowedSquashDistance(repeat.reach, repeat.bounds.min, tbi, v);
        Report ir = makeMpvTrigger(event, squashDistance);
        ReportID id = tbi.rm.getInternalId(ir);
        DEBUG_PRINTF("puffette event q%u t%u\n", queue, event);
        // The role now fires the MPV trigger report instead of running a
        // suffix engine.
        g[v].suffix.reset();
        g[v].reports.insert(id);
    }
}
static
void updateCastleSuffix(RoseGraph &g, const shared_ptr<CastleProto> &m,
                        u32 top, const vector<RoseVertex> &verts) {
    DEBUG_PRINTF("merged in as top %u, updating %zu vertices\n", top,
                 verts.size());

    // Re-point every affected vertex's suffix at the merged castle prototype,
    // using the top event it was assigned during the merge.
    for (auto v : verts) {
        assert(g[v].suffix.castle);
        g[v].suffix.castle = m;
        g[v].suffix.top = top;
    }
}
/**
 * Merges a list of singleton castle suffixes together, packing up to
 * CastleProto::max_occupancy repeats into each merged prototype.
 * \a castle_map gives the rose vertices using each original castle so they
 * can be re-pointed at the merged prototype and top.
 */
static
void mergeCastleSuffixes(RoseBuildImpl &tbi,
                         vector<shared_ptr<CastleProto> > &castles,
            map<shared_ptr<CastleProto>, vector<RoseVertex> > &castle_map) {
    if (castles.size() <= 1) {
        return;
    }
    RoseGraph &g = tbi.g;
    const size_t max_size = CastleProto::max_occupancy;
    // 'm' is the current merge target; later castles are folded into it
    // until it fills up.
    shared_ptr<CastleProto> m = castles.front();
    assert(m->repeats.size() == 1); // Not yet merged.
    // Cache repeats we've already merged, mapped to (prototype, top). That
    // way, we can ensure that we don't construct more than one completely
    // identical repeat.
    typedef map<PureRepeat, pair<shared_ptr<CastleProto>, u32> > RepeatCache;
    RepeatCache cache;
    {
        // Initial entry in cache.
        const u32 top = m->repeats.begin()->first;
        const PureRepeat &pr = m->repeats.begin()->second;
        cache[pr] = make_pair(m, top);
    }
    for (size_t i = 1; i < castles.size(); i++) {
        shared_ptr<CastleProto> c = castles[i];
        assert(c->repeats.size() == 1); // Not yet merged.
        const PureRepeat &pr = c->repeats.begin()->second;
        RepeatCache::const_iterator it = cache.find(pr);
        if (it != cache.end()) {
            // An identical repeat was already merged: just re-point this
            // castle's vertices at the cached (prototype, top) pair.
            DEBUG_PRINTF("reusing cached merge, top=%u, proto=%p\n",
                         it->second.second, it->second.first.get());
            updateCastleSuffix(g, it->second.first, it->second.second,
                               castle_map[c]);
            continue;
        }
        if (m->repeats.size() == max_size) {
            // No room left to merge into 'm'. This one becomes the new 'm'.
            DEBUG_PRINTF("next mergee\n");
            m = c;
            u32 top = m->repeats.begin()->first;
            cache[pr] = make_pair(m, top);
        } else {
            u32 top = m->add(pr);
            updateCastleSuffix(g, m, top, castle_map[c]);
            DEBUG_PRINTF("added to %p, top %u\n", m.get(), top);
            cache[pr] = make_pair(m, top);
        }
    }
}
void mergeCastleSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!(tbi.cc.grey.allowCastle && tbi.cc.grey.mergeSuffixes)) {
        return;
    }

    RoseGraph &g = tbi.g;

    // Collect the vertices using each castle suffix, and group the distinct
    // castle prototypes by reach; merging is attempted within each reach
    // group.
    map<shared_ptr<CastleProto>, vector<RoseVertex>> castles;
    map<CharReach, vector<shared_ptr<CastleProto>>> by_reach;

    for (auto v : vertices_range(g)) {
        const shared_ptr<CastleProto> &c = g[v].suffix.castle;
        if (!c) {
            continue;
        }
        if (c->repeats.size() != 1) {
            // This code assumes it's the only place merging is being done.
            assert(0);
            continue;
        }
        if (!contains(castles, c)) {
            by_reach[c->reach()].push_back(c);
        }
        castles[c].push_back(v);
    }

    for (auto &m : by_reach) {
        DEBUG_PRINTF("reach %s, %zu elements\n", describeClass(m.first).c_str(),
                     m.second.size());
        mergeCastleSuffixes(tbi, m.second, castles);
    }
}
} // namespace ue2