vectorscan/src/nfa/castlecompile.cpp

/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* \file
* \brief Castle: multi-tenant repeat engine, compiler code.
*/
#include "castlecompile.h"
#include "castle_internal.h"
#include "limex_limits.h"
#include "nfa_internal.h"
#include "repeatcompile.h"
#include "shufticompile.h"
#include "trufflecompile.h"
#include "vermicellicompile.h"
#include "nfagraph/ng_dump.h"
#include "nfagraph/ng_equivalence.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_redundancy.h"
#include "nfagraph/ng_util.h"
#include "util/alloc.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/multibit_build.h"
#include "util/report_manager.h"
#include "util/verify_types.h"
#include "grey.h"
#include <stack>
#include <cassert>
#include <boost/graph/adjacency_list.hpp>
#include <boost/range/adaptor/map.hpp>
using namespace std;
using boost::adaptors::map_keys;
using boost::adaptors::map_values;
namespace ue2 {
#define CLIQUE_GRAPH_MAX_SIZE 1000
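// Convert a (reachable) depth to the u32 encoding used by the repeat code:
// infinite depths map to REPEAT_INF, finite depths pass through unchanged.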
static
u32 depth_to_u32(const depth &d) {
assert(d.is_reachable());
if (d.is_infinite()) {
return REPEAT_INF;
}
u32 d_val = d;
assert(d_val < REPEAT_INF);
return d_val;
}
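// Select the character-scanning scheme used at runtime for this castle's
// reach: dot for the full alphabet, (negated) vermicelli for single-character
// classes, vermicelli16 when SVE2 is available and the class (or its
// negation) has at most 16 characters, then shufti, with truffle as the
// fallback.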
static
void writeCastleScanEngine(const CharReach &cr, Castle *c) {
if (cr.all()) {
c->type = CASTLE_DOT;
return;
}
if (cr.count() == 1) {
c->type = CASTLE_NVERM;
c->u.verm.c = cr.find_first();
return;
}
const CharReach negated(~cr);
if (negated.count() == 1) {
c->type = CASTLE_VERM;
c->u.verm.c = negated.find_first();
return;
}
#ifdef HAVE_SVE2
if (cr.count() <= 16) {
c->type = CASTLE_NVERM16;
vermicelli16Build(cr, (u8 *)&c->u.verm16.mask);
return;
}
if (negated.count() <= 16) {
c->type = CASTLE_VERM16;
vermicelli16Build(negated, (u8 *)&c->u.verm16.mask);
return;
}
#endif // HAVE_SVE2
if (shuftiBuildMasks(negated, (u8 *)&c->u.shuf.mask_lo,
(u8 *)&c->u.shuf.mask_hi) != -1) {
c->type = CASTLE_SHUFTI;
return;
}
c->type = CASTLE_TRUFFLE;
truffleBuildMasks(negated, (u8 *)&c->u.truffle.mask1,
(u8 *)&c->u.truffle.mask2);
}
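// Determine whether trigger literal 'b' can end within 'dist' characters of
// the end of literal 'a', given the possible overlaps between the two.
// Returns true if it cannot, i.e. every consistent placement of 'b' ends
// more than 'dist' characters past the end of 'a'.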
static
bool literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b,
const size_t dist) {
for (size_t i = 0; i < b.size(); i++) {
if (i > dist) {
return true;
}
size_t overlap_len = b.size() - i;
if (overlap_len <= a.size()) {
if (matches(a.end() - overlap_len, a.end(), b.begin(),
b.end() - i)) {
return false;
}
} else {
assert(overlap_len > a.size());
if (matches(a.begin(), a.end(), b.end() - i - a.size(),
b.end() - i)) {
return false;
}
}
}
return b.size() > dist;
}
struct CliqueVertexProps {
CliqueVertexProps() {}
explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {}
u32 stateId = ~0U;
};
typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS,
CliqueVertexProps> CliqueGraph;
typedef CliqueGraph::vertex_descriptor CliqueVertex;
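// Collect the state ids of vertices adjacent to 'cv' that also belong to the
// given group.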
static
void getNeighborInfo(const CliqueGraph &g, vector<u32> &neighbor,
const CliqueVertex &cv, const set<u32> &group) {
u32 id = g[cv].stateId;
// find neighbors for cv
for (const auto &v : adjacent_vertices_range(cv, g)) {
if (g[v].stateId != id && contains(group, g[v].stateId)) {
neighbor.emplace_back(g[v].stateId);
DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId);
}
}
}
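// Greedily grow a clique: starting from the full vertex set, repeatedly pick
// a vertex, add its state id to 'clique', and restrict the candidate set to
// that vertex's neighbours.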
static
void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique) {
stack<vector<u32>> gStack;
// Create mapping between vertex and id
map<u32, CliqueVertex> vertexMap;
vector<u32> init;
for (const auto &v : vertices_range(cg)) {
vertexMap[cg[v].stateId] = v;
init.emplace_back(cg[v].stateId);
}
gStack.push(init);
// Get the vertex to start from
CliqueGraph::vertex_iterator vi, ve;
tie(vi, ve) = vertices(cg);
while (!gStack.empty()) {
vector<u32> g = gStack.top();
gStack.pop();
// Choose a vertex from the graph
u32 id = g[0];
const CliqueVertex &n = vertexMap.at(id);
clique.emplace_back(id);
// Corresponding vertex in the original graph
vector<u32> neighbor;
set<u32> subgraphId(g.begin(), g.end());
getNeighborInfo(cg, neighbor, n, subgraphId);
// Get graph consisting of neighbors for left branch
if (!neighbor.empty()) {
gStack.push(neighbor);
}
}
}
template<typename Graph>
bool graph_empty(const Graph &g) {
typename Graph::vertex_iterator vi, ve;
tie(vi, ve) = vertices(g);
return vi == ve;
}
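// Repeatedly extract greedy cliques from the graph, removing each clique's
// vertices as it is found, and return the largest clique obtained.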
static
vector<u32> removeClique(CliqueGraph &cg) {
vector<vector<u32>> cliquesVec(1);
DEBUG_PRINTF("graph size:%zu\n", num_vertices(cg));
findCliqueGroup(cg, cliquesVec[0]);
while (!graph_empty(cg)) {
const vector<u32> &c = cliquesVec.back();
vector<CliqueVertex> dead;
for (const auto &v : vertices_range(cg)) {
if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) {
dead.emplace_back(v);
}
}
for (const auto &v : dead) {
clear_vertex(v, cg);
remove_vertex(v, cg);
}
if (graph_empty(cg)) {
break;
}
vector<u32> clique;
findCliqueGroup(cg, clique);
cliquesVec.emplace_back(clique);
}
// keep the largest clique found
size_t max = 0;
size_t id = 0;
for (size_t j = 0; j < cliquesVec.size(); ++j) {
if (cliquesVec[j].size() > max) {
max = cliquesVec[j].size();
id = j;
}
}
DEBUG_PRINTF("clique size:%zu\n", cliquesVec[id].size());
return cliquesVec[id];
}
// If the location of any reset character in one literal is after the
// positions where it overlaps with the other literals, then the literals
// are mutually exclusive.
static
bool findExclusivePair(const size_t id1, const size_t id2,
const size_t lower,
const vector<vector<size_t>> &min_reset_dist,
const vector<vector<vector<CharReach>>> &triggers) {
const auto &triggers1 = triggers[id1];
const auto &triggers2 = triggers[id2];
for (size_t i = 0; i < triggers1.size(); ++i) {
for (size_t j = 0; j < triggers2.size(); ++j) {
if (!literalOverlap(triggers1[i], triggers2[j],
min_reset_dist[id2 - lower][j]) ||
!literalOverlap(triggers2[j], triggers1[i],
min_reset_dist[id1 - lower][i])) {
return false;
}
}
}
return true;
}
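// Partition the repeats into groups of mutually exclusive repeats. Repeats
// are processed in chunks of at most CLIQUE_GRAPH_MAX_SIZE: within each
// chunk an edge is added between every exclusive pair and the largest clique
// is kept as a group. If every repeat ends up in a group, the castle is
// PURE_EXCLUSIVE and the active multibit stream state is dropped.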
static
vector<vector<u32>> checkExclusion(u32 &streamStateSize,
const CharReach &cr,
const vector<vector<vector<CharReach>>> &triggers,
enum ExclusiveType &exclusive,
const size_t numRepeats) {
vector<vector<u32>> groups;
size_t trigSize = triggers.size();
DEBUG_PRINTF("trigSize %zu\n", trigSize);
size_t lower = 0;
size_t total = 0;
while (lower < trigSize) {
vector<CliqueVertex> vertices;
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
vector<vector<size_t>> min_reset_dist;
size_t upper = min(lower + CLIQUE_GRAPH_MAX_SIZE, trigSize);
// get min reset distance for each repeat
for (size_t i = lower; i < upper; i++) {
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
vertices.emplace_back(v);
const vector<size_t> &tmp_dist =
minResetDistToEnd(triggers[i], cr);
min_reset_dist.emplace_back(tmp_dist);
}
// find exclusive pair for each repeat
for (size_t i = lower; i < upper; i++) {
CliqueVertex s = vertices[i - lower];
for (size_t j = i + 1; j < upper; j++) {
if (findExclusivePair(i, j, lower, min_reset_dist,
triggers)) {
CliqueVertex d = vertices[j - lower];
add_edge(s, d, *cg);
}
}
}
// find the largest exclusive group
auto clique = removeClique(*cg);
size_t cliqueSize = clique.size();
if (cliqueSize > 1) {
groups.emplace_back(clique);
exclusive = EXCLUSIVE;
total += cliqueSize;
}
lower += CLIQUE_GRAPH_MAX_SIZE;
}
DEBUG_PRINTF("clique size %zu, num of repeats %zu\n",
total, numRepeats);
if (total == numRepeats) {
exclusive = PURE_EXCLUSIVE;
streamStateSize = 0;
}
return groups;
}
namespace {
struct ExclusiveInfo {
/** Mapping between top and exclusive group id */
map<u32, u32> groupId;
/** Number of exclusive groups */
u32 numGroups = 0;
};
}
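// Fill in the SubCastle and RepeatInfo entries for each repeat: choose a
// repeat model, record bounds and packing information, and lay out scratch
// and stream state. Repeats in the same exclusive group share a single
// scratch/stream slot sized for the largest member of the group.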
static
void buildSubcastles(const CastleProto &proto, vector<SubCastle> &subs,
vector<RepeatInfo> &infos, vector<u64a> &patchSize,
const vector<pair<depth, bool>> &repeatInfoPair,
u32 &scratchStateSize, u32 &streamStateSize,
u32 &tableSize, vector<u64a> &tables, u32 &sparseRepeats,
const ExclusiveInfo &exclusiveInfo,
vector<u32> &may_stale, const ReportManager &rm) {
const bool remap_reports = has_managed_reports(proto.kind);
u32 i = 0;
const auto &groupId = exclusiveInfo.groupId;
const auto &numGroups = exclusiveInfo.numGroups;
vector<u32> maxStreamSize(numGroups, 0);
for (auto it = proto.repeats.begin(), ite = proto.repeats.end();
it != ite; ++it, ++i) {
const PureRepeat &pr = it->second;
depth min_period = repeatInfoPair[i].first;
bool is_reset = repeatInfoPair[i].second;
enum RepeatType rtype = chooseRepeatType(pr.bounds.min, pr.bounds.max,
min_period, is_reset, true);
RepeatStateInfo rsi(rtype, pr.bounds.min, pr.bounds.max, min_period);
DEBUG_PRINTF("sub %u: selected %s model for %s repeat\n", i,
repeatTypeName(rtype), pr.bounds.str().c_str());
SubCastle &sub = subs[i];
RepeatInfo &info = infos[i];
info.packedCtrlSize = rsi.packedCtrlSize;
u32 subStreamStateSize = verify_u32(rsi.packedCtrlSize + rsi.stateSize);
// Handle stream/scratch space alloc for exclusive case differently.
if (contains(groupId, i)) {
u32 id = groupId.at(i);
maxStreamSize[id] = max(maxStreamSize[id], subStreamStateSize);
// SubCastle full/stream state offsets are written in for the group
// below.
} else {
sub.fullStateOffset = scratchStateSize;
sub.streamStateOffset = streamStateSize;
scratchStateSize += verify_u32(sizeof(RepeatControl));
streamStateSize += subStreamStateSize;
}
if (pr.bounds.max.is_finite()) {
may_stale.emplace_back(i);
}
info.type = verify_u8(rtype);
info.repeatMin = depth_to_u32(pr.bounds.min);
info.repeatMax = depth_to_u32(pr.bounds.max);
info.stateSize = rsi.stateSize;
info.horizon = rsi.horizon;
info.minPeriod = min_period.is_finite() ? (u32)min_period : ~0U;
assert(rsi.packedFieldSizes.size()
<= ARRAY_LENGTH(info.packedFieldSizes));
copy(rsi.packedFieldSizes.begin(), rsi.packedFieldSizes.end(),
info.packedFieldSizes);
info.patchCount = rsi.patchCount;
info.patchSize = rsi.patchSize;
info.encodingSize = rsi.encodingSize;
info.patchesOffset = rsi.patchesOffset;
assert(pr.reports.size() == 1);
ReportID id = *pr.reports.begin();
sub.report = remap_reports ? rm.getProgramOffset(id) : id;
if (rtype == REPEAT_SPARSE_OPTIMAL_P) {
for (u32 j = 0; j < rsi.patchSize; j++) {
tables.emplace_back(rsi.table[j]);
}
sparseRepeats++;
patchSize[i] = rsi.patchSize;
tableSize += rsi.patchSize;
}
}
vector<u32> scratchOffset(numGroups, 0);
vector<u32> streamOffset(numGroups, 0);
for (const auto &j : groupId) {
u32 top = j.first;
u32 id = j.second;
SubCastle &sub = subs[top];
if (!scratchOffset[id]) {
sub.fullStateOffset = scratchStateSize;
sub.streamStateOffset = streamStateSize;
scratchOffset[id] = scratchStateSize;
streamOffset[id] = streamStateSize;
scratchStateSize += verify_u32(sizeof(RepeatControl));
streamStateSize += maxStreamSize[id];
} else {
sub.fullStateOffset = scratchOffset[id];
sub.streamStateOffset = streamOffset[id];
}
}
}
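// Build the Castle NFA bytecode for the given proto: determine exclusive
// groups, build the subcastles, then lay out the Castle header, SubCastle
// array, RepeatInfo blocks, any REPEAT_SPARSE_OPTIMAL_P tables and the stale
// sparse iterator in one contiguous allocation.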
bytecode_ptr<NFA>
buildCastle(const CastleProto &proto,
const map<u32, vector<vector<CharReach>>> &triggers,
const CompileContext &cc, const ReportManager &rm) {
assert(cc.grey.allowCastle);
const size_t numRepeats = proto.repeats.size();
assert(numRepeats > 0 && numRepeats <= proto.max_occupancy);
const CharReach &cr = proto.reach();
DEBUG_PRINTF("reach %s, %zu repeats\n", describeClass(cr).c_str(),
numRepeats);
vector<SubCastle> subs(numRepeats);
memset(&subs[0], 0, sizeof(SubCastle) * numRepeats);
vector<RepeatInfo> infos(numRepeats);
memset(&infos[0], 0, sizeof(RepeatInfo) * numRepeats);
vector<u64a> patchSize(numRepeats);
memset(&patchSize[0], 0, sizeof(u64a) * numRepeats);
vector<u64a> tables;
// We start with enough stream state to store the active bitfield.
u32 streamStateSize = mmbit_size(numRepeats);
// We have a copy of the stream state in scratch for castleMatchLoop.
u32 scratchStateSize = ROUNDUP_N(streamStateSize, alignof(RepeatControl));
depth minWidth(depth::infinity());
depth maxWidth(0);
u32 i = 0;
ExclusiveInfo exclusiveInfo;
vector<vector<vector<CharReach>>> candidateTriggers;
vector<u32> candidateRepeats;
vector<pair<depth, bool>> repeatInfoPair;
for (auto it = proto.repeats.begin(), ite = proto.repeats.end();
it != ite; ++it, ++i) {
const u32 top = it->first;
const PureRepeat &pr = it->second;
assert(pr.reach == cr);
assert(pr.reports.size() == 1);
if (top != i) {
// Tops have not been remapped?
assert(0);
throw std::logic_error("Tops not remapped");
}
minWidth = min(minWidth, pr.bounds.min);
maxWidth = max(maxWidth, pr.bounds.max);
bool is_reset = false;
depth min_period = depth::infinity();
// If we've got a top in the castle without any trigger information, it
// possibly means that we've got a repeat that we can't trigger. We do
// need to cope with it though.
if (contains(triggers, top)) {
min_period = depth(minPeriod(triggers.at(top), cr, &is_reset));
}
if (min_period > pr.bounds.max) {
DEBUG_PRINTF("trigger is longer than repeat; only need one offset\n");
is_reset = true;
}
repeatInfoPair.emplace_back(make_pair(min_period, is_reset));
candidateTriggers.emplace_back(triggers.at(top));
candidateRepeats.emplace_back(i);
}
// Case 1: exclusive repeats
enum ExclusiveType exclusive = NOT_EXCLUSIVE;
u32 activeIdxSize = 0;
u32 groupIterOffset = 0;
if (cc.grey.castleExclusive) {
auto cliqueGroups =
checkExclusion(streamStateSize, cr, candidateTriggers,
exclusive, numRepeats);
for (const auto &group : cliqueGroups) {
// mutually exclusive repeat group found,
// update state sizes
activeIdxSize = calcPackedBytes(numRepeats + 1);
streamStateSize += activeIdxSize;
// replace with top values
for (const auto &val : group) {
const u32 top = candidateRepeats[val];
exclusiveInfo.groupId[top] = exclusiveInfo.numGroups;
}
exclusiveInfo.numGroups++;
}
if (exclusive) {
groupIterOffset = streamStateSize;
streamStateSize += mmbit_size(exclusiveInfo.numGroups);
}
DEBUG_PRINTF("num of groups:%u\n", exclusiveInfo.numGroups);
}
candidateRepeats.clear();
DEBUG_PRINTF("reach %s exclusive %u\n", describeClass(cr).c_str(),
exclusive);
u32 tableSize = 0;
u32 sparseRepeats = 0;
vector<u32> may_stale; /* sub castles that may go stale */
buildSubcastles(proto, subs, infos, patchSize, repeatInfoPair,
scratchStateSize, streamStateSize, tableSize,
tables, sparseRepeats, exclusiveInfo, may_stale, rm);
DEBUG_PRINTF("%zu subcastles may go stale\n", may_stale.size());
vector<mmbit_sparse_iter> stale_iter;
if (!may_stale.empty()) {
stale_iter = mmbBuildSparseIterator(may_stale, numRepeats);
}
size_t total_size =
sizeof(NFA) + // initial NFA structure
sizeof(Castle) + // Castle structure
sizeof(SubCastle) * subs.size() + // SubCastles themselves
sizeof(RepeatInfo) * subs.size() + // RepeatInfo structure
sizeof(u64a) * tableSize + // table size for
// REPEAT_SPARSE_OPTIMAL_P
sizeof(u64a) * sparseRepeats; // paddings for
// REPEAT_SPARSE_OPTIMAL_P tables
total_size = ROUNDUP_N(total_size, alignof(mmbit_sparse_iter));
total_size += byte_length(stale_iter); // stale sparse iter
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
nfa->type = verify_u8(CASTLE_NFA);
nfa->length = verify_u32(total_size);
nfa->nPositions = verify_u32(subs.size());
nfa->streamStateSize = streamStateSize;
nfa->scratchStateSize = scratchStateSize;
nfa->minWidth = verify_u32(minWidth);
nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0;
char * const base_ptr = (char *)nfa.get() + sizeof(NFA);
char *ptr = base_ptr;
Castle *c = (Castle *)ptr;
c->numRepeats = verify_u32(subs.size());
c->numGroups = exclusiveInfo.numGroups;
c->exclusive = verify_s8(exclusive);
c->activeIdxSize = verify_u8(activeIdxSize);
c->activeOffset = verify_u32(c->numGroups * activeIdxSize);
c->groupIterOffset = groupIterOffset;
writeCastleScanEngine(cr, c);
ptr += sizeof(Castle);
SubCastle *subCastles = ((SubCastle *)(ROUNDUP_PTR(ptr, alignof(u32))));
copy(subs.begin(), subs.end(), subCastles);
u32 length = 0;
u32 tableIdx = 0;
for (i = 0; i < numRepeats; i++) {
u32 offset = sizeof(SubCastle) * (numRepeats - i) + length;
SubCastle *sub = &subCastles[i];
sub->repeatInfoOffset = offset;
ptr = (char *)sub + offset;
memcpy(ptr, &infos[i], sizeof(RepeatInfo));
if (patchSize[i]) {
RepeatInfo *info = (RepeatInfo *)ptr;
u64a *table = ((u64a *)(ROUNDUP_PTR(((char *)(info) +
sizeof(*info)), alignof(u64a))));
copy(tables.begin() + tableIdx,
tables.begin() + tableIdx + patchSize[i], table);
u32 diff = (char *)table - (char *)info +
sizeof(u64a) * patchSize[i];
info->length = diff;
length += diff;
tableIdx += patchSize[i];
} else {
length += sizeof(RepeatInfo);
}
// set exclusive group info
if (contains(exclusiveInfo.groupId, i)) {
sub->exclusiveId = exclusiveInfo.groupId[i];
} else {
sub->exclusiveId = numRepeats;
}
}
ptr = base_ptr + total_size - sizeof(NFA) - byte_length(stale_iter);
assert(ptr + byte_length(stale_iter) == base_ptr + total_size - sizeof(NFA));
if (!stale_iter.empty()) {
c->staleIterOffset = verify_u32(ptr - base_ptr);
copy_bytes(ptr, stale_iter);
ptr += byte_length(stale_iter);
}
return nfa;
}
set<ReportID> all_reports(const CastleProto &proto) {
set<ReportID> reports;
for (const ReportID &report : proto.report_map | map_keys) {
reports.insert(report);
}
return reports;
}
depth findMinWidth(const CastleProto &proto) {
depth min_width(depth::infinity());
for (const PureRepeat &pr : proto.repeats | map_values) {
min_width = min(min_width, pr.bounds.min);
}
return min_width;
}
depth findMaxWidth(const CastleProto &proto) {
depth max_width(0);
for (const PureRepeat &pr : proto.repeats | map_values) {
max_width = max(max_width, pr.bounds.max);
}
return max_width;
}
depth findMinWidth(const CastleProto &proto, u32 top) {
if (!contains(proto.repeats, top)) {
assert(0); // should not happen
return depth::infinity();
}
return proto.repeats.at(top).bounds.min;
}
depth findMaxWidth(const CastleProto &proto, u32 top) {
if (!contains(proto.repeats, top)) {
assert(0); // should not happen
return depth(0);
}
return proto.repeats.at(top).bounds.max;
}
CastleProto::CastleProto(nfa_kind k, const PureRepeat &pr) : kind(k) {
assert(pr.reach.any());
assert(pr.reports.size() == 1);
u32 top = 0;
repeats.emplace(top, pr);
for (const auto &report : pr.reports) {
report_map[report].insert(top);
}
}
const CharReach &CastleProto::reach() const {
assert(!repeats.empty());
return repeats.begin()->second.reach;
}
u32 CastleProto::add(const PureRepeat &pr) {
assert(repeats.size() < max_occupancy);
assert(pr.reach == reach());
assert(pr.reports.size() == 1);
u32 top = next_top++;
DEBUG_PRINTF("selected unused top %u\n", top);
assert(!contains(repeats, top));
repeats.emplace(top, pr);
for (const auto &report : pr.reports) {
report_map[report].insert(top);
}
return top;
}
void CastleProto::erase(u32 top) {
DEBUG_PRINTF("erase top %u\n", top);
assert(contains(repeats, top));
repeats.erase(top);
for (auto &m : report_map) {
m.second.erase(top);
}
}
u32 CastleProto::merge(const PureRepeat &pr) {
assert(repeats.size() <= max_occupancy);
assert(pr.reach == reach());
assert(pr.reports.size() == 1);
// First, see if this repeat is already in this castle.
for (const auto &m : repeats) {
if (m.second == pr) {
DEBUG_PRINTF("repeat already present, with top %u\n", m.first);
return m.first;
}
}
if (repeats.size() == max_occupancy) {
DEBUG_PRINTF("this castle is full\n");
return max_occupancy;
}
return add(pr);
}
bool mergeCastle(CastleProto &c1, const CastleProto &c2,
map<u32, u32> &top_map) {
assert(&c1 != &c2);
assert(c1.kind == c2.kind);
DEBUG_PRINTF("c1 has %zu repeats, c2 has %zu repeats\n", c1.repeats.size(),
c2.repeats.size());
if (c1.reach() != c2.reach()) {
DEBUG_PRINTF("different reach!\n");
return false;
}
if (c1.repeats.size() + c2.repeats.size() > c1.max_occupancy) {
DEBUG_PRINTF("too many repeats to merge\n");
return false;
}
top_map.clear();
for (const auto &m : c2.repeats) {
const u32 top = m.first;
const PureRepeat &pr = m.second;
DEBUG_PRINTF("top %u\n", top);
u32 new_top = c1.merge(pr);
top_map[top] = new_top;
DEBUG_PRINTF("adding repeat: map %u->%u\n", top, new_top);
}
assert(c1.repeats.size() <= c1.max_occupancy);
return true;
}
void remapCastleTops(CastleProto &proto, map<u32, u32> &top_map) {
map<u32, PureRepeat> out;
top_map.clear();
for (const auto &m : proto.repeats) {
const u32 top = m.first;
const PureRepeat &pr = m.second;
u32 new_top = out.size();
out.emplace(new_top, pr);
top_map[top] = new_top;
}
proto.repeats.swap(out);
// Remap report map.
proto.report_map.clear();
for (const auto &m : proto.repeats) {
const u32 top = m.first;
const PureRepeat &pr = m.second;
for (const auto &report : pr.reports) {
proto.report_map[report].insert(top);
}
}
assert(proto.repeats.size() <= proto.max_occupancy);
}
namespace {
struct HasReport {
explicit HasReport(ReportID r) : report(r) {}
bool operator()(const pair<u32, PureRepeat> &a) const {
return contains(a.second.reports, report);
}
private:
ReportID report;
};
}
bool is_equal(const CastleProto &c1, ReportID report1, const CastleProto &c2,
ReportID report2) {
assert(!c1.repeats.empty());
assert(!c2.repeats.empty());
assert(c1.kind == c2.kind);
if (c1.reach() != c2.reach()) {
DEBUG_PRINTF("different reach\n");
return false;
}
map<u32, PureRepeat>::const_iterator it = c1.repeats.begin(),
ite = c1.repeats.end(),
jt = c2.repeats.begin(),
jte = c2.repeats.end();
for (;; ++it, ++jt) {
it = find_if(it, ite, HasReport(report1));
jt = find_if(jt, jte, HasReport(report2));
if (it == ite && jt == jte) {
DEBUG_PRINTF("success, cases are equivalent!\n");
return true;
}
if (it == ite || jt == jte) {
DEBUG_PRINTF("no match for one repeat\n");
break;
}
if (it->first != jt->first) {
DEBUG_PRINTF("different tops\n");
break;
}
const PureRepeat &r1 = it->second;
const PureRepeat &r2 = jt->second;
assert(r1.reach == c1.reach());
assert(r2.reach == c1.reach());
if (r1.bounds != r2.bounds) {
DEBUG_PRINTF("different bounds\n");
break;
}
}
return false;
}
bool is_equal(const CastleProto &c1, const CastleProto &c2) {
assert(!c1.repeats.empty());
assert(!c2.repeats.empty());
assert(c1.kind == c2.kind);
if (c1.reach() != c2.reach()) {
DEBUG_PRINTF("different reach\n");
return false;
}
return c1.repeats == c2.repeats;
}
bool requiresDedupe(const CastleProto &proto,
const flat_set<ReportID> &reports) {
for (const auto &report : reports) {
auto it = proto.report_map.find(report);
if (it == end(proto.report_map)) {
continue;
}
if (it->second.size() > 1) {
DEBUG_PRINTF("castle proto %p has dupe report %u\n", &proto,
report);
return true;
}
}
return false;
}
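// Add the repeat for the given top to the holder: a chain of mandatory
// vertices up to the min bound, followed by optional vertices up to the max
// bound (or a self-loop for an infinite bound), with reports attached to the
// accepting vertices.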
static
void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
DEBUG_PRINTF("top %u -> repeat %s\n", top, pr.bounds.str().c_str());
NFAVertex u = g.start;
// Mandatory repeats to min bound.
u32 min_bound = pr.bounds.min; // always finite
if (min_bound == 0) { // Vacuous case, we can only do this once.
assert(!edge(g.start, g.accept, g).second);
NFAEdge e = add_edge(g.start, g.accept, g);
g[e].tops.insert(top);
g[u].reports.insert(pr.reports.begin(), pr.reports.end());
min_bound = 1;
}
for (u32 i = 0; i < min_bound; i++) {
NFAVertex v = add_vertex(g);
g[v].char_reach = pr.reach;
NFAEdge e = add_edge(u, v, g);
if (u == g.start) {
g[e].tops.insert(top);
}
u = v;
}
NFAVertex head = u;
// Optional repeats to max bound.
if (pr.bounds.max.is_finite()) {
assert(pr.bounds.max > depth(0));
const u32 max_bound = pr.bounds.max;
for (u32 i = 0; i < max_bound - min_bound; i++) {
NFAVertex v = add_vertex(g);
g[v].char_reach = pr.reach;
if (head != u) {
add_edge(head, v, g);
}
NFAEdge e = add_edge(u, v, g);
if (u == g.start) {
g[e].tops.insert(top);
}
u = v;
}
} else {
assert(pr.bounds.max.is_infinite());
add_edge(u, u, g);
}
// Connect to accept.
add_edge(u, g.accept, g);
g[u].reports.insert(pr.reports.begin(), pr.reports.end());
if (u != head) {
add_edge(head, g.accept, g);
g[head].reports.insert(pr.reports.begin(), pr.reports.end());
}
}
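// True if any repeat in the proto has a zero minimum bound.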
static
bool hasZeroMinBound(const CastleProto &proto) {
const depth zero(0);
for (const PureRepeat &pr : proto.repeats | map_values) {
if (pr.bounds.min == zero) {
return true;
}
}
return false;
}
unique_ptr<NGHolder> makeHolder(const CastleProto &proto,
const CompileContext &cc) {
assert(!proto.repeats.empty());
// Vacuous edges are only doable in the NGHolder if we are a single-top
// Castle.
if (hasZeroMinBound(proto)) {
if (proto.repeats.size() != 1 || proto.repeats.begin()->first != 0) {
DEBUG_PRINTF("can't build multi-top vacuous holder\n");
return nullptr;
}
}
auto g = std::make_unique<NGHolder>(proto.kind);
for (const auto &m : proto.repeats) {
addToHolder(*g, m.first, m.second);
}
//dumpGraph("castle_holder.dot", *g);
// Sanity checks.
assert(allMatchStatesHaveReports(*g));
assert(!has_parallel_edge(*g));
reduceGraphEquivalences(*g, cc);
removeRedundancy(*g, SOM_NONE);
return g;
}
} // namespace ue2