/* * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#include "config.h"

#include "ng_violet.h"

#include "grey.h"
#include "ng_depth.h"
#include "ng_dominators.h"
#include "ng_dump.h"
#include "ng_equivalence.h"
#include "ng_holder.h"
#include "ng_is_equal.h"
#include "ng_literal_analysis.h"
#include "ng_limex.h"
#include "ng_mcclellan.h"
#include "ng_netflow.h"
#include "ng_prune.h"
#include "ng_redundancy.h"
#include "ng_region.h"
#include "ng_reports.h"
#include "ng_split.h"
#include "ng_util.h"
#include "ng_width.h"
#include "nfa/rdfa.h"
#include "rose/rose_build.h"
#include "rose/rose_build_util.h"
#include "rose/rose_in_dump.h"
#include "rose/rose_in_graph.h"
#include "rose/rose_in_util.h"
#include "util/compare.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/order_check.h"
#include "util/target_info.h"
#include "util/ue2string.h"

/* NOTE(review): the five #include directives below carry no header name in
 * this copy of the file; their targets must be restored before this builds.
 * Template argument lists (on vector, set, unique_ptr, ...) also appear to be
 * missing throughout - TODO confirm against the upstream file. */
#include
#include
#include
#include
#include

#define STAGE_DEBUG_PRINTF DEBUG_PRINTF

using namespace std;
using boost::adaptors::map_values;

namespace ue2 {

/* createsAnchoredLHS() is conservative as the depths take into account
 * back edges that come from beyond the split point and would be missing after
 * the graph is split. */
/* Returns true only if, for every vertex v in vv, each predecessor of v other
 * than v itself has a maxDistFromStartOfData() value strictly below the
 * limit. The limit is max_depth capped at grey.maxAnchoredRegion. depths is
 * indexed by the predecessor's g[u].index. */
static
bool createsAnchoredLHS(const NGHolder &g, const vector &vv,
                        const vector &depths,
                        const Grey &grey, depth max_depth = depth::infinity()) {
    max_depth = min(max_depth, depth(grey.maxAnchoredRegion));

    for (auto v : vv) {
        /* avoid issues of self loops blowing out depths:
         * look at preds, add 1 */
        for (auto u : inv_adjacent_vertices_range(v, g)) {
            if (u == v) {
                continue; /* ignore the self loop itself */
            }

            u32 idx = g[u].index;
            assert(idx < depths.size());
            if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) {
                return false;
            }
        }
    }
    return true;
}

/* createsTransientLHS() is conservative as the depths take into account
 * back edges that come from beyond the split point and would be missing after
 * the graph is split.
*/ static bool createsTransientLHS(const NGHolder &g, const vector &vv, const vector &depths, const Grey &grey) { const depth max_depth(grey.maxHistoryAvailable); for (auto v : vv) { /* avoid issues of self loops blowing out depths: * look at preds, add 1 */ for (auto u : inv_adjacent_vertices_range(v, g)) { if (u == v) { continue; } u32 idx = g[u].index; assert(idx < depths.size()); if (maxDistFromInit(depths.at(idx)) >= max_depth) { return false; } } } return true; } static double calcSplitRatio(const NGHolder &g, const vector &vv) { flat_set not_reachable; find_unreachable(g, vv, ¬_reachable); double rv = (double)not_reachable.size() / num_vertices(g); rv = rv > 0.5 ? 1 - rv : rv; return rv; } static size_t shorter_than(const set &s, size_t limit) { return count_if(s.begin(), s.end(), [&](const ue2_literal &a) { return a.length() < limit; }); } static u32 min_len(const set &s) { u32 rv = ~0U; for (const auto &lit : s) { rv = min(rv, (u32)lit.length()); } return rv; } static u32 min_period(const set &s) { u32 rv = ~0U; for (const auto &lit : s) { rv = min(rv, (u32)minStringPeriod(lit)); } DEBUG_PRINTF("min period %u\n", rv); return rv; } namespace { /** * Information on a cut: vertices and literals. */ struct VertLitInfo { VertLitInfo() {} VertLitInfo(NFAVertex v, const set &litlit, bool c_anch, bool c_tran = false) : vv(vector(1, v)), lit(litlit), creates_anchored(c_anch), creates_transient(c_tran) {} VertLitInfo(const vector &vv_in, const set &lit_in, bool c_anch) : vv(vv_in), lit(lit_in), creates_anchored(c_anch) {} vector vv; set lit; bool creates_anchored = false; bool creates_transient = false; double split_ratio = 0; }; #define LAST_CHANCE_STRONG_LEN 1 /** * \brief Comparator class for comparing different literal cuts. 
*/
/* operator()(a, b) returns true when b ranks above a. Criteria, in order:
 *  1. if seeking_anchored and the creates_anchored flags differ, the cut that
 *     creates an anchored LHS ranks higher;
 *  2. if seeking_transient and the creates_transient flags differ, the cut
 *     that creates a transient LHS ranks higher;
 *  3. if last_chance and both cuts' shortest literal is longer than
 *     LAST_CHANCE_STRONG_LEN, the larger split_ratio ranks higher;
 *  4. otherwise the lower scoreSet() value ranks higher;
 *  5. ties are broken on the index of the first cut vertex (lower index ranks
 *     higher). */
class LitComparator {
public:
    LitComparator(const NGHolder &g_in, bool sa, bool st, bool lc)
        : g(g_in), seeking_anchored(sa), seeking_transient(st),
          last_chance(lc) {}
    bool operator()(const unique_ptr &a,
                    const unique_ptr &b) const {
        assert(a && b);

        if (seeking_anchored) {
            if (a->creates_anchored != b->creates_anchored) {
                return a->creates_anchored < b->creates_anchored;
            }
        }

        if (seeking_transient) {
            if (a->creates_transient != b->creates_transient) {
                return a->creates_transient < b->creates_transient;
            }
        }

        if (last_chance
            && min_len(a->lit) > LAST_CHANCE_STRONG_LEN
            && min_len(b->lit) > LAST_CHANCE_STRONG_LEN) {
            DEBUG_PRINTF("using split ratio %g , %g\n", a->split_ratio,
                         b->split_ratio);
            return a->split_ratio < b->split_ratio;
        }

        u64a score_a = scoreSet(a->lit);
        u64a score_b = scoreSet(b->lit);

        if (score_a != score_b) {
            return score_a > score_b;
        }

        /* vertices should only be in one candidate cut */
        assert(a->vv == b->vv || a->vv.front() != b->vv.front());
        return g[a->vv.front()].index > g[b->vv.front()].index;
    }

private:
    const NGHolder &g; /**< graph on which cuts are found */

    bool seeking_anchored;
    bool seeking_transient;
    bool last_chance;
};
}

#define MIN_ANCHORED_LEN 2
#define MIN_ANCHORED_DESPERATE_LEN 1

/* anchored here means that the cut creates a 'usefully' anchored LHS */
/* Decides whether the literal set s (with its precomputed score) is good
 * enough to cut on. The minimum literal length is MIN_ANCHORED_LEN for
 * anchored cuts (MIN_ANCHORED_DESPERATE_LEN when also last_chance) and
 * min_allowed_floating_len otherwise. last_chance implies desperation, which
 * enables the two relaxed acceptance rules further down. Returns false if the
 * score is >= NO_LITERAL_AT_EDGE_SCORE or s is empty. */
static
bool validateRoseLiteralSetQuality(const set &s, u64a score,
                                   bool anchored,
                                   u32 min_allowed_floating_len,
                                   bool desperation, bool last_chance) {
    u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN
                                   : min_allowed_floating_len;
    if (anchored && last_chance) {
        min_allowed_len = MIN_ANCHORED_DESPERATE_LEN;
    }
    if (last_chance) {
        desperation = true;
    }

    DEBUG_PRINTF("validating%s set, min allowed len %u\n",
                 anchored ? " anchored" : "", min_allowed_len);

    assert(none_of(begin(s), end(s), bad_mixed_sensitivity));

    if (score >= NO_LITERAL_AT_EDGE_SCORE) {
        DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size());
        return false;
    }

    assert(!s.empty());
    if (s.empty()) { /* release-build guard for the assert above */
        DEBUG_PRINTF("candidate is too bad/something went wrong\n");
        return false;
    }

    u32 s_min_len = min_len(s);
    u32 s_min_period = min_period(s);
    size_t short_count = shorter_than(s, 5);

    DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u"
                 " short_count=%zu desp=%d\n",
                 dumpString(*s.begin()).c_str(), score, s.size(), s_min_len,
                 s_min_period, short_count, (int)desperation);

    bool ok = true;

    /* baseline rule: few literals, long enough, not a single repeated char
     * (period <= 1) unless length-1 literals are explicitly allowed */
    if (s.size() > 10 /* magic number is magic */
        || s_min_len < min_allowed_len
        || (s_min_period <= 1 && min_allowed_len != 1)) {
        DEBUG_PRINTF("candidate may be bad\n");
        ok = false;
    }

    /* first relaxed rule, only when desperate */
    if (!ok && desperation
        && s.size() <= 20 /* more magic numbers are magical */
        && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10))
        && s_min_period > 1) {
        DEBUG_PRINTF("candidate is ok\n");
        ok = true;
    }

    /* second relaxed rule, only when desperate */
    if (!ok && desperation
        && s.size() <= 50 /* more magic numbers are magical */
        && s_min_len > 10
        && s_min_period > 1) {
        DEBUG_PRINTF("candidate is ok\n");
        ok = true;
    }

    if (!ok) {
        DEBUG_PRINTF("candidate is too shitty\n");
        return false;
    }

    return true;
}

/* Debug helper: prints every literal in s via DEBUG_PRINTF. */
static UNUSED
void dumpRoseLiteralSet(const set &s) {
    for (UNUSED const auto &lit : s) {
        DEBUG_PRINTF(" lit: %s\n", dumpString(lit).c_str());
    }
}

/* Builds one single-vertex candidate cut for each vertex in a_dom whose
 * literal set passes validateRoseLiteralSetQuality(), then drops candidates
 * that a direct predecessor scores at least as well as, and appends the
 * survivors to *lits. depths must be non-null when seeking_anchored is set. */
static
void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored,
                           const vector *depths,
                           const set &a_dom,
                           vector> *lits,
                           u32 min_allowed_len, bool desperation,
                           bool last_chance, const CompileContext &cc) {
    assert(depths || !seeking_anchored);

    map scores;    /* per-vertex literal set score */
    map> lit_info; /* per-vertex candidate */
    set s;

    for (auto v : a_dom) {
        s = getLiteralSet(g, v, true); /* RHS will take responsibility for any
                                          revisits to the target vertex */

        if (s.empty()) {
            DEBUG_PRINTF("candidate is too shitty\n");
            continue;
        }

        DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
dumpRoseLiteralSet(s);
        u64a score = sanitizeAndCompressAndScore(s);

        bool anchored = false;
        if (seeking_anchored) {
            anchored = createsAnchoredLHS(g, {v}, *depths, cc.grey);
        }

        if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
                                           desperation, last_chance)) {
            continue;
        }

        DEBUG_PRINTF("candidate is a candidate\n");
        scores[v] = score;
        lit_info[v] = make_unique(v, s, anchored);
    }

    /* try to filter out cases where appending some characters produces worse
     * literals. Only bother to look back one byte, TODO make better */
    for (auto u : a_dom) {
        /* note: scores[u] default-inserts a zero entry for unscored u */
        if (out_degree(u, g) != 1 || !scores[u]) {
            continue;
        }
        NFAVertex v = *adjacent_vertices(u, g).first;
        if (contains(scores, v) && scores[v] >= scores[u]) {
            DEBUG_PRINTF("killing off v as score %llu >= %llu\n",
                         scores[v], scores[u]);
            lit_info.erase(v);
        }
    }

    lits->reserve(lit_info.size());
    for (auto &m : lit_info) {
        lits->push_back(move(m.second));
    }
    DEBUG_PRINTF("%zu candidate literal sets\n", lits->size());
}

/* Builds candidate cuts from region exits rather than from single vertices.
 * Exit vertices are grouped by region; a region is only considered when it is
 * mandatory, none of its exits is in bad, all of its exits are in *allowed
 * (when allowed is non-null), and all exits agree on their edges to
 * accept/acceptEod and on their reports. Accepted candidates are appended to
 * *lits. depths must be non-null when seeking_anchored is set. */
static
void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored,
                           const vector *depths,
                           const set &bad,
                           const set *allowed,
                           vector> *lits,
                           u32 min_allowed_len, bool desperation,
                           bool last_chance, const CompileContext &cc) {
    /* This allows us to get more places to split the graph as we are not
       limited to points where there is a single vertex to split at. */

    assert(depths || !seeking_anchored);

    /* TODO: operate over 'proto-regions' which ignore back edges */
    auto regions = assignRegions(g);

    set mand, optional;  /* region ids, classified on first entry vertex */
    map > exits;         /* region id -> exit vertices */

    for (auto v : vertices_range(g)) {
        u32 region = regions[v];
        if (is_any_start(v, g) || region == 0) {
            continue;
        }

        if (is_any_accept(v, g)) {
            continue;
        }

        if (!generates_callbacks(g) && is_match_vertex(v, g)) {
            /* we cannot leave a completely vacuous infix */
            continue;
        }

        if (isRegionExit(g, v, regions)) {
            exits[region].push_back(v);
        }

        if (isRegionEntry(g, v, regions)) {
            // Determine whether this region is mandatory or optional.
We only // need to do this check for the first entry vertex we encounter // for this region. if (!contains(mand, region) && !contains(optional, region)) { if (isOptionalRegion(g, v, regions)) { optional.insert(region); } else { mand.insert(region); } } } } for (const auto &m : exits) { if (false) { next_cand: continue; } const u32 region = m.first; const vector &vv = m.second; assert(!vv.empty()); if (!contains(mand, region)) { continue; } for (auto v : vv) { /* if an exit is in bad, the region is already handled well * by getSimpleRoseLiterals or is otherwise bad */ if (contains(bad, v)) { goto next_cand; } /* if we are only allowed to consider some vertices, v must be in the list; */ if (allowed && !contains(*allowed, v)) { goto next_cand; } } /* the final region may not have a neat exit. validate that all exits * have an edge to each accept or none do */ bool edge_to_a = edge(vv[0], g.accept, g).second; bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second; const auto &reports = g[vv[0]].reports; for (auto v : vv) { if (edge_to_a != edge(v, g.accept, g).second) { goto next_cand; } if (edge_to_aeod != edge(v, g.acceptEod, g).second) { goto next_cand; } if (g[v].reports != reports) { goto next_cand; } } DEBUG_PRINTF("inspecting region %u\n", region); set s; for (auto v : vv) { DEBUG_PRINTF(" exit vertex: %zu\n", g[v].index); /* Note: RHS can not be depended on to take all subsequent revisits * to this vertex */ set ss = getLiteralSet(g, v, false); if (ss.empty()) { DEBUG_PRINTF("candidate is too shitty\n"); goto next_cand; } insert(&s, ss); } assert(!s.empty()); DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size()); dumpRoseLiteralSet(s); u64a score = sanitizeAndCompressAndScore(s); DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size()); dumpRoseLiteralSet(s); bool anchored = false; if (seeking_anchored) { anchored = createsAnchoredLHS(g, vv, *depths, cc.grey); } if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, desperation, 
last_chance)) {
            goto next_cand;
        }

        DEBUG_PRINTF("candidate is a candidate\n");
        lits->push_back(make_unique(vv, s, anchored));
    }
}

/* Copies into *out those vertices of cand_raw that look like useful pivots:
 *  - vertices whose reach has more than 40 characters are dropped;
 *  - vertices whose reach has more than 2 characters are always kept;
 *  - otherwise u is dropped when it has a sole destination vertex v, u is v's
 *    only predecessor and v's reach is a single (possibly caseless) char. */
static
void filterCandPivots(const NGHolder &g, const set &cand_raw,
                      set *out) {
    for (auto u : cand_raw) {
        const CharReach &u_cr = g[u].char_reach;
        if (u_cr.count() > 40) {
            continue; /* too wide to be plausible */
        }

        if (u_cr.count() > 2) {
            /* include u as a candidate as successor may have backed away from
             * expanding through it */
            out->insert(u);
            continue;
        }

        NFAVertex v = getSoleDestVertex(g, u);
        if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) {
            const CharReach &v_cr = g[v].char_reach;
            if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
                continue; /* v will always generate better literals */
            }
        }

        out->insert(u);
    }
}

/* cand_raw is the candidate set before filtering points which are clearly
 * a bad idea. */
/* Collects the non-special predecessors of accept and acceptEod, follows the
 * findDominators() chain upwards from each until a null or special vertex is
 * reached, and keeps the leading part of those chains that is common to all
 * of them. That common part is inserted into *cand_raw, and the result of
 * filterCandPivots() on *cand_raw is inserted into *cand. */
static
void getCandidatePivots(const NGHolder &g, set *cand,
                        set *cand_raw) {
    auto dominators = findDominators(g);

    set accepts;

    for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
        if (is_special(v, g)) {
            continue;
        }
        accepts.insert(v);
    }
    for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
        if (is_special(v, g)) {
            continue;
        }
        accepts.insert(v);
    }

    assert(!accepts.empty());

    /* dominator chain of the first accept predecessor, outermost first */
    vector dom_trace;
    auto ait = accepts.begin();
    assert(ait != accepts.end());
    NFAVertex curr = *ait;
    while (curr && !is_special(curr, g)) {
        dom_trace.push_back(curr);
        curr = dominators[curr];
    }
    reverse(dom_trace.begin(), dom_trace.end());
    /* trim dom_trace to the prefix shared with every other accept pred */
    for (++ait; ait != accepts.end(); ++ait) {
        curr = *ait;
        vector dom_trace2;
        while (curr && !is_special(curr, g)) {
            dom_trace2.push_back(curr);
            curr = dominators[curr];
        }
        reverse(dom_trace2.begin(), dom_trace2.end());
        auto dti = dom_trace.begin(), dtie = dom_trace.end();
        auto dtj = dom_trace2.begin(), dtje = dom_trace2.end();
        while (dti != dtie && dtj != dtje && *dti == *dtj) {
            ++dti;
            ++dtj;
        }
        dom_trace.erase(dti, dtie);
    }

    cand_raw->insert(dom_trace.begin(), dom_trace.end());

    filterCandPivots(g, *cand_raw, cand);
}

static
/* Finds the best literal cut of g, or returns nullptr if no candidate passes
 * validation.
 *  - depths: required when for_prefix is set;
 *  - for_prefix: also rank cuts on whether they create an anchored or
 *    transient LHS; desperation is enabled when cc.streaming is also set;
 *  - min_len: minimum literal length handed to the candidate generators;
 *  - allowed_cand: if non-null, pivots are restricted to this set;
 *  - disallowed_cand: if non-null, removed from the simple pivot set and
 *    added to the set handed to getRegionRoseLiterals() as 'bad';
 *  - last_chance: relaxes validation and ranks strong cuts by split ratio. */
unique_ptr findBestSplit(const NGHolder &g,
                         const vector *depths,
                         bool for_prefix, u32 min_len,
                         const set *allowed_cand,
                         const set *disallowed_cand,
                         bool last_chance,
                         const CompileContext &cc) {
    assert(!for_prefix || depths);

    /* look for a single simple split point */
    set cand;
    set cand_raw;

    getCandidatePivots(g, &cand, &cand_raw);

    if (allowed_cand) {
        set cand2;
        set cand2_raw;
        set_intersection(allowed_cand->begin(), allowed_cand->end(),
                         cand.begin(), cand.end(),
                         inserter(cand2, cand2.begin()));

        set_intersection(allowed_cand->begin(), allowed_cand->end(),
                         cand_raw.begin(), cand_raw.end(),
                         inserter(cand2_raw, cand2_raw.begin()));

        cand = std::move(cand2);
        cand_raw = std::move(cand2_raw);
    }
    if (disallowed_cand) {
        DEBUG_PRINTF("%zu disallowed candidates\n", disallowed_cand->size());
        DEBUG_PRINTF("|old cand| = %zu\n", cand.size());
        erase_all(&cand, *disallowed_cand);
        /* cand_raw is used as the 'bad' set by getRegionRoseLiterals() */
        insert(&cand_raw, *disallowed_cand);
    }

    if (!generates_callbacks(g)) {
        /* not output exposed so must leave some RHS */
        for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) {
            cand.erase(v);
            cand_raw.erase(v);
        }

        for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) {
            cand.erase(v);
            cand_raw.erase(v);
        }
    }

    DEBUG_PRINTF("|cand| = %zu\n", cand.size());

    bool seeking_anchored = for_prefix;
    bool seeking_transient = for_prefix;

    bool desperation = for_prefix && cc.streaming;

    vector> lits; /**< sorted list of potential cuts */

    getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len,
                          desperation, last_chance, cc);
    getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand,
                          &lits, min_len, desperation, last_chance, cc);

    if (lits.empty()) {
        DEBUG_PRINTF("no literals found\n");
        return nullptr;
    }

    if (seeking_transient) {
        for (auto &a : lits) {
            a->creates_transient
                = createsTransientLHS(g, a->vv, *depths, cc.grey);
        }
    }

    if (last_chance) {
        for (auto &a : lits) {
            a->split_ratio = calcSplitRatio(g, a->vv);
        }
    }

    auto cmp = LitComparator(g, seeking_anchored, seeking_transient,
last_chance);

    /* linear scan for the highest-ranked candidate: cmp(best, x) is true
     * when x ranks above best */
    unique_ptr best = move(lits.back());
    lits.pop_back();
    while (!lits.empty()) {
        if (cmp(best, lits.back())) {
            best = move(lits.back());
        }
        lits.pop_back();
    }

    DEBUG_PRINTF("best is '%s' %zu a%d t%d\n",
                 dumpString(*best->lit.begin()).c_str(),
                 g[best->vv.front()].index,
                 depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0,
                 depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0);

    return best;
}

/* Adds to bad every edge of h that lies on a path the successor literal succ
 * can match when walked backwards from h.accept. For each vertex reached, a
 * bitset of edge indices (h[e].index) covered on the way is maintained. When
 * overhang_ok is set, paths that reach h.start before succ is exhausted are
 * poisoned immediately. */
static
void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ,
                         bool overhang_ok, flat_set &bad) {
    DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n",
                 num_vertices(h), succ.length());

    using EdgeSet = boost::dynamic_bitset<>;

    const size_t edge_count = num_edges(h);
    EdgeSet bad_edges(edge_count);

    /* vertex -> edges on matching paths from that vertex to accept */
    unordered_map curr;
    for (const auto &e : in_edges_range(h.accept, h)) {
        auto &path_set = curr[source(e, h)];
        if (path_set.empty()) {
            path_set.resize(edge_count);
        }
        path_set.set(h[e].index);
    }

    unordered_map next;
    /* consume succ from its last character to its first */
    for (auto it = succ.rbegin(); it != succ.rend(); ++it) {
        for (const auto &path : curr) {
            NFAVertex u = path.first;
            const auto &path_set = path.second;
            if (u == h.start && overhang_ok) {
                DEBUG_PRINTF("poisoning early %zu [overhang]\n",
                             path_set.count());
                bad_edges |= path_set;
                continue;
            }
            if (overlaps(h[u].char_reach, *it)) {
                /* u matches this character: extend the path to u's preds */
                for (const auto &e : in_edges_range(u, h)) {
                    auto &new_path_set = next[source(e, h)];
                    if (new_path_set.empty()) {
                        new_path_set.resize(edge_count);
                    }
                    new_path_set |= path_set;
                    new_path_set.set(h[e].index);
                }
            }
        }
        DEBUG_PRINTF("succ char matches at %zu paths\n", next.size());
        assert(overhang_ok || !curr.empty());
        swap(curr, next);
        next.clear();
    }

    assert(overhang_ok || !curr.empty());
    for (const auto &path : curr) {
        bad_edges |= path.second;
        DEBUG_PRINTF("poisoning %zu vertices\n", path.second.count());
    }

    for (const auto &e : edges_range(h)) {
        if (bad_edges.test(h[e].index)) {
            bad.insert(e);
        }
    }
}

/* Adds to bad the in-edges of every vertex at which a cut would create
 * neither an anchored nor a transient LHS. */
static
void poisonForGoodPrefix(const NGHolder &h,
                         const vector &depths,
                         flat_set &bad, const Grey &grey) {
for (const auto &v : vertices_range(h)) {
        if (!createsAnchoredLHS(h, {v}, depths, grey)
            && !createsTransientLHS(h, {v}, depths, grey)) {
            insert(&bad, in_edges_range(v, h));
        }
    }
}

/* True for the two accept vertex types, RIV_ACCEPT and RIV_ACCEPT_EOD. */
static UNUSED
bool is_any_accept_type(RoseInVertexType t) {
    return t == RIV_ACCEPT || t == RIV_ACCEPT_EOD;
}

/* Computes the set of edges of h that must not be cut. For every edge in ee
 * whose target in vg is a literal vertex, the edges covered by that literal
 * are poisoned via poisonFromSuccessor(); overhang is allowed when the edge's
 * source is itself a literal vertex. When for_prefix is set, edges that
 * poisonForGoodPrefix() rejects are added too, and depths must be non-null
 * because it is dereferenced. */
static
flat_set poisonEdges(const NGHolder &h,
                     const vector *depths,
                     const RoseInGraph &vg, const vector &ee,
                     bool for_prefix, const Grey &grey) {
    DEBUG_PRINTF("poisoning edges %zu successor edges\n", ee.size());

    /* poison edges covered by successor literal */

    /* (literal, overhang_ok) pairs, de-duplicated */
    set > succs;
    for (const RoseInEdge &ve : ee) {
        if (vg[target(ve, vg)].type != RIV_LITERAL) {
            /* nothing to poison in suffixes/outfixes */
            assert(generates_callbacks(h));
            assert(is_any_accept_type(vg[target(ve, vg)].type));
            continue;
        }
        succs.insert({vg[target(ve, vg)].s,
                      vg[source(ve, vg)].type == RIV_LITERAL});

    }

    DEBUG_PRINTF("poisoning edges %zu successor literals\n", succs.size());

    flat_set bad;
    for (const auto &p : succs) {
        poisonFromSuccessor(h, p.first, p.second, bad);
    }

    /* poison edges which don't significantly improve a prefix */
    if (for_prefix) {
        poisonForGoodPrefix(h, *depths, bad, grey);
    }

    return bad;
}

/* Vertex form of poisonEdges() (non-prefix mode): returns the targets of all
 * poisoned edges. */
static
set poisonVertices(const NGHolder &h, const RoseInGraph &vg,
                   const vector &ee, const Grey &grey) {
    flat_set bad_edges = poisonEdges(h, nullptr, vg, ee, false, grey);
    set bad_vertices;
    for (const NFAEdge &e : bad_edges) {
        bad_vertices.insert(target(e, h));
        DEBUG_PRINTF("bad: %zu->%zu\n", h[source(e, h)].index,
                     h[target(e, h)].index);
    }

    return bad_vertices;
}

/* Best cut for an outfix/infix/suffix graph: poisoned vertices are
 * disallowed, the minimum literal length is cc.grey.minRoseLiteralLength and
 * last_chance is off. */
static
unique_ptr findBestNormalSplit(const NGHolder &g,
                               const RoseInGraph &vg,
                               const vector &ee,
                               const CompileContext &cc) {
    assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
    set bad_vertices = poisonVertices(g, vg, ee, cc.grey);

    return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
                         nullptr, &bad_vertices, false, cc);
}

/* Same as findBestNormalSplit() but with last_chance enabled. */
static
unique_ptr findBestLastChanceSplit(const NGHolder &g,
                                   const RoseInGraph &vg,
                                   const vector
&ee,
                                   const CompileContext &cc) {
    assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
    set bad_vertices = poisonVertices(g, vg, ee, cc.grey);

    return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
                         nullptr, &bad_vertices, true, cc);
}

/* Looks for a cut by walking a simple chain from the start of g rather than
 * relying on depth information. The graph counts as anchored when startDs
 * has no proper out-edges. The walk covers at most cc.grey.maxHistoryAvailable
 * vertices (further limited to cc.grey.maxAnchoredRegion when anchored),
 * follows only vertices with a single successor, and tracks the longest run
 * of single-character / caseless-character vertices. Returns nullptr if the
 * start does not have the expected shape or the best run is shorter than
 * cc.grey.minRoseLiteralLength; otherwise returns a cut at the last vertex of
 * the best run, flagged as creating a transient LHS. */
static
unique_ptr findSimplePrefixSplit(const NGHolder &g,
                                 const CompileContext &cc) {
    DEBUG_PRINTF("looking for simple prefix split\n");
    bool anchored = !proper_out_degree(g.startDs, g);
    NFAVertex u = anchored ? g.start : g.startDs;

    if (out_degree(u, g) != 2) { /* startDs + succ */
        return nullptr;
    }

    /* the one successor of u that is not startDs */
    NFAVertex v = NGHolder::null_vertex();
    for (NFAVertex t : adjacent_vertices_range(u, g)) {
        if (t != g.startDs) {
            assert(!v);
            v = t;
        }
    }
    assert(v);

    if (!anchored) {
        if (out_degree(g.start, g) > 2) {
            return nullptr;
        }
        if (out_degree(g.start, g) == 2 && !edge(g.start, v, g).second) {
            return nullptr;
        }
    }

    NFAVertex best_v = NGHolder::null_vertex();
    ue2_literal best_lit;

    u32 limit = cc.grey.maxHistoryAvailable;
    if (anchored) {
        LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion);
    }

    ue2_literal curr_lit;
    for (u32 i = 0; i < limit; i++) {
        const auto &v_cr = g[v].char_reach;
        if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
            curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar());
        } else {
            curr_lit.clear(); /* a wide vertex ends the current run */
        }

        if (curr_lit.length() > best_lit.length()) {
            best_lit = curr_lit;
            best_v = v;
        }

        if (out_degree(v, g) != 1) {
            break;
        }
        v = *adjacent_vertices(v, g).first;
    }

    if (best_lit.length() < cc.grey.minRoseLiteralLength) {
        return nullptr;
    }

    set best_lit_set({best_lit});
    if (bad_mixed_sensitivity(best_lit)) {
        sanitizeAndCompressAndScore(best_lit_set);
    }

    return ue2::make_unique(best_v, best_lit_set, anchored, true);
}

/* Best cut for a prefix/outfix graph. Runs findBestSplit() in prefix mode;
 * if that finds nothing, or a cut that creates neither an anchored nor a
 * transient LHS, the result of findSimplePrefixSplit() is preferred when it
 * is non-null. */
static
unique_ptr findBestPrefixSplit(const NGHolder &g,
                               const vector &depths,
                               const RoseInGraph &vg,
                               const vector &ee,
                               bool last_chance,
                               const CompileContext &cc) {
    assert(g.kind == NFA_PREFIX || g.kind == NFA_OUTFIX);
    set bad_vertices = poisonVertices(g, vg, ee, cc.grey);
    auto rv = findBestSplit(g,
&depths, true, cc.grey.minRoseLiteralLength,
                            nullptr, &bad_vertices, last_chance, cc);

    /* large back edges may prevent us identifying anchored or transient cases
     * properly - use a simple walk instead */
    if (!rv || !(rv->creates_transient || rv->creates_anchored)) {
        auto rv2 = findSimplePrefixSplit(g, cc);
        if (rv2) {
            return rv2;
        }
    }

    return rv;
}

/* Looks for a cut restricted to the predecessors of vertices that have full
 * character reach and a self loop (excluding those vertices themselves and
 * g.start). Returns nullptr if there is no such predecessor; otherwise defers
 * to findBestSplit() with a minimum length of
 * cc.grey.violetEarlyCleanLiteralLen. */
static
unique_ptr findBestCleanSplit(const NGHolder &g,
                              const CompileContext &cc) {
    assert(g.kind != NFA_PREFIX);
    set cleanSplits;
    for (NFAVertex v : vertices_range(g)) {
        if (!g[v].char_reach.all() || !edge(v, v, g).second) {
            continue;
        }
        insert(&cleanSplits, inv_adjacent_vertices(v, g));
        cleanSplits.erase(v);
    }
    cleanSplits.erase(g.start);
    if (cleanSplits.empty()) {
        return nullptr;
    }
    return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen,
                         &cleanSplits, nullptr, false, cc);
}

/* Returns true if lit, read backwards from g.accept, can be matched along
 * some path of g. When the walk reaches g.start before lit is exhausted the
 * result is true if overhang_ok is set; otherwise that path is ignored. */
static
bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) {
    set curr, next; /* frontier for this and the previous character */
    curr.insert(g.accept);

    for (auto it = lit.rbegin(); it != lit.rend(); ++it) {
        next.clear();

        for (auto v : curr) {
            for (auto u : inv_adjacent_vertices_range(v, g)) {
                if (u == g.start) {
                    if (overhang_ok) {
                        DEBUG_PRINTF("bail\n");
                        return true;
                    } else {
                        continue; /* it is not possible for a lhs literal to
                                   * overhang the start */
                    }
                }

                const CharReach &cr = g[u].char_reach;
                if (!overlaps(*it, cr)) {
                    continue;
                }

                next.insert(u);
            }
        }

        curr.swap(next);
    }

    return !curr.empty();
}

/* Splits base_graph at the cut described by split and rewires the edges ee of
 * vg through new literal vertices: source -> (lhs) -> literal -> (rhs) ->
 * original targets. Returns false, leaving vg untouched, when base_graph is
 * triggered and the split does not shrink the problem; returns true
 * otherwise. */
static
bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg,
                   const vector &ee, const VertLitInfo &split) {
    const vector &splitters = split.vv;
    assert(!splitters.empty());

    shared_ptr lhs = make_shared();
    shared_ptr rhs = make_shared();

    unordered_map lhs_map;
    unordered_map rhs_map;

    splitGraph(base_graph, splitters, lhs.get(), &lhs_map,
               rhs.get(), &rhs_map);
    DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n",
                 to_string(base_graph.kind).c_str(), num_vertices(base_graph),
                 to_string(lhs->kind).c_str(), num_vertices(*lhs),
                 to_string(rhs->kind).c_str(),
num_vertices(*rhs)); bool suffix = generates_callbacks(base_graph); if (is_triggered(base_graph)) { /* if we are already guarded, check if the split reduces the size of * the problem before continuing with the split */ if (num_vertices(*lhs) >= num_vertices(base_graph) && !(suffix && isVacuous(*rhs))) { DEBUG_PRINTF("split's lhs is no smaller\n"); return false; } if (num_vertices(*rhs) >= num_vertices(base_graph)) { DEBUG_PRINTF("split's rhs is no smaller\n"); return false; } } bool do_accept = false; bool do_accept_eod = false; assert(rhs); if (isVacuous(*rhs) && suffix) { if (edge(rhs->start, rhs->accept, *rhs).second) { DEBUG_PRINTF("rhs has a cliche\n"); do_accept = true; remove_edge(rhs->start, rhs->accept, *rhs); } if (edge(rhs->start, rhs->acceptEod, *rhs).second) { DEBUG_PRINTF("rhs has an eod cliche\n"); do_accept_eod = true; remove_edge(rhs->start, rhs->acceptEod, *rhs); } renumber_edges(*rhs); } /* check if we still have a useful graph left over */ bool do_norm = out_degree(rhs->start, *rhs) != 1; set splitter_reports; for (auto v : splitters) { insert(&splitter_reports, base_graph[v].reports); } /* find the targets of each source vertex; note the use of vectors to * preserve deterministic ordering */ vector sources; map> images; for (const RoseInEdge &e : ee) { RoseInVertex src = source(e, vg); RoseInVertex dest = target(e, vg); if (!contains(images, src)) { sources.push_back(src); } images[src].push_back(dest); remove_edge(e, vg); } map, vector> verts_by_image; for (const auto &u : sources) { const auto &image = images[u]; if (contains(verts_by_image, image)) { for (RoseInVertex v : verts_by_image[image]) { add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); } continue; } for (const auto &lit : split.lit) { assert(!bad_mixed_sensitivity(lit)); /* don't allow overhang in can_match() as literals should * correspond to the edge graph being split; overhanging the graph * would indicate a false path.*/ if (!can_match(*lhs, lit, false)) { DEBUG_PRINTF("'%s' did 
not match lhs\n", escapeString(lit).c_str()); continue; } DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str()); auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); /* work out delay later */ if (do_accept) { DEBUG_PRINTF("rhs has a cliche\n"); auto tt = add_vertex(RoseInVertexProps::makeAccept( splitter_reports), vg); add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); } if (do_accept_eod) { DEBUG_PRINTF("rhs has an eod cliche\n"); auto tt = add_vertex(RoseInVertexProps::makeAcceptEod( splitter_reports), vg); add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); } if (do_norm) { assert(out_degree(rhs->start, *rhs) > 1); for (RoseInVertex dest : image) { add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg); } } verts_by_image[image].push_back(v); } } assert(hasCorrectlyNumberedVertices(*rhs)); assert(hasCorrectlyNumberedEdges(*rhs)); assert(isCorrectlyTopped(*rhs)); assert(hasCorrectlyNumberedVertices(*lhs)); assert(hasCorrectlyNumberedEdges(*lhs)); assert(isCorrectlyTopped(*lhs)); return true; } #define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */ #define MAX_LEN_2_LITERALS_PER_CUT 3 static bool checkValidNetflowLits(NGHolder &h, const vector &scores, const map> &cut_lits, u32 min_allowed_length) { DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(), min_allowed_length); if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) { return false; } u32 len_2_count = 0; for (const auto &cut : cut_lits) { if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) { DEBUG_PRINTF("cut uses a forbidden edge\n"); return false; } if (min_len(cut.second) < min_allowed_length) { DEBUG_PRINTF("cut uses a bad literal\n"); return false; } for (const auto &lit : cut.second) { if (lit.length() == 2) { len_2_count++; } } } if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) { return false; } return true; } static void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, const vector &to_cut, const vector &cut, const map> &cut_lits) { 
DEBUG_PRINTF("splitting %s (%zu vertices)\n", to_string(h.kind).c_str(),
                 num_vertices(h));

    /* create literal vertices and connect preds */
    unordered_set done_sources;
    /* source vertex -> (new literal vertex, pivot in h) in creation order */
    map>> verts_by_source;
    for (const RoseInEdge &ve : to_cut) {
        assert(&h == &*vg[ve].graph);
        RoseInVertex src = source(ve, vg);
        if (!done_sources.insert(src).second) {
            continue; /* already processed */
        }

        /* iterate over cut for determinism */
        for (const auto &e : cut) {
            NFAVertex prev_v = source(e, h);
            NFAVertex pivot = target(e, h);

            DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index);
            unordered_map temp_map;
            shared_ptr new_lhs = make_shared();
            splitLHS(h, pivot, new_lhs.get(), &temp_map);

            /* want to cut off paths to pivot from things other than the pivot -
             * makes a more svelte graphy */
            clear_in_edges(temp_map[pivot], *new_lhs);
            /* re-add only the cut edge itself */
            NFAEdge pivot_edge = add_edge(temp_map[prev_v], temp_map[pivot],
                                          *new_lhs);
            if (is_triggered(h) && prev_v == h.start) {
                (*new_lhs)[pivot_edge].tops.insert(DEFAULT_TOP);
            }

            pruneUseless(*new_lhs, false);
            renumber_vertices(*new_lhs);
            renumber_edges(*new_lhs);

            DEBUG_PRINTF(" into lhs %s (%zu vertices)\n",
                         to_string(new_lhs->kind).c_str(),
                         num_vertices(*new_lhs));

            assert(hasCorrectlyNumberedVertices(*new_lhs));
            assert(hasCorrectlyNumberedEdges(*new_lhs));
            assert(isCorrectlyTopped(*new_lhs));

            const set &lits = cut_lits.at(e);
            for (const auto &lit : lits) {
                /* overhang is tolerated only for triggered graphs */
                if (!can_match(*new_lhs, lit, is_triggered(h))) {
                    continue;
                }

                RoseInVertex v
                    = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);

                /* if this is a prefix/infix an edge directly to accept should
                 * represent a false path as we have poisoned vertices covered
                 * by the literals.
*/ if (generates_callbacks(h)) { if (edge(pivot, h.accept, h).second) { DEBUG_PRINTF("adding acceptEod\n"); /* literal has a direct connection to accept */ const flat_set &reports = h[pivot].reports; auto tt = add_vertex( RoseInVertexProps::makeAccept(reports), vg); add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); } if (edge(pivot, h.acceptEod, h).second) { assert(generates_callbacks(h)); DEBUG_PRINTF("adding acceptEod\n"); /* literal has a direct connection to accept */ const flat_set &reports = h[pivot].reports; auto tt = add_vertex( RoseInVertexProps::makeAcceptEod(reports), vg); add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); } } add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg); verts_by_source[src].push_back({v, pivot}); } } } /* wire the literal vertices up to successors */ map, shared_ptr > done_rhs; for (const RoseInEdge &ve : to_cut) { RoseInVertex src = source(ve, vg); RoseInVertex dest = target(ve, vg); /* iterate over cut for determinism */ for (const auto &elem : verts_by_source[src]) { NFAVertex pivot = elem.second; RoseInVertex v = elem.first; vector adj; insert(&adj, adj.end(), adjacent_vertices(pivot, h)); /* we can ignore presence of accept, accepteod in adj as it is best effort */ if (!contains(done_rhs, adj)) { unordered_map temp_map; shared_ptr new_rhs = make_shared(); splitRHS(h, adj, new_rhs.get(), &temp_map); remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); renumber_edges(*new_rhs); DEBUG_PRINTF(" into rhs %s (%zu vertices)\n", to_string(new_rhs->kind).c_str(), num_vertices(*new_rhs)); done_rhs.emplace(adj, new_rhs); assert(isCorrectlyTopped(*new_rhs)); } assert(done_rhs[adj].get()); shared_ptr new_rhs = done_rhs[adj]; assert(hasCorrectlyNumberedVertices(*new_rhs)); assert(hasCorrectlyNumberedEdges(*new_rhs)); assert(isCorrectlyTopped(*new_rhs)); if (vg[dest].type == RIV_LITERAL && !can_match(*new_rhs, vg[dest].s, true)) { continue; } if (out_degree(new_rhs->start, *new_rhs) != 1) 
{ add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg); } } remove_edge(ve, vg); } }

/*
 * Tries to split the graph h (the graph carried by the RoseIn edges ee) along
 * a cut chosen by findMinCut() over per-edge scores.
 *
 * min_allowed_length is first raised to at least
 * grey.minRoseNetflowLiteralLength. depths may only be null when for_prefix
 * is false (asserted below).
 *
 * Returns false without doing any work if h has more than
 * grey.maxRoseNetflowEdges edges, and false if checkValidNetflowLits()
 * rejects the literals found on the cut. Otherwise hands the cut to
 * splitEdgesByCut() and returns true.
 */
static
bool doNetflowCut(NGHolder &h, const vector *depths, RoseInGraph &vg,
                  const vector &ee, bool for_prefix, const Grey &grey,
                  u32 min_allowed_length = 0U) {
    ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength);

    DEBUG_PRINTF("doing netflow cut\n");
    /* TODO: we should really get literals/scores from the full graph as this
     * allows us to overlap with previous cuts. */
    assert(!ee.empty());
    /* every edge in ee is expected to carry h; only the first is checked */
    assert(&h == &*vg[ee.front()].graph);
    assert(!for_prefix || depths);

    if (num_edges(h) > grey.maxRoseNetflowEdges) {
        /* We have a limit on this because scoring edges and running netflow
         * gets very slow for big graphs. */
        DEBUG_PRINTF("too many edges, skipping netflow cut\n");
        return false;
    }

    assert(hasCorrectlyNumberedVertices(h));
    assert(hasCorrectlyNumberedEdges(h));

    auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey);

    /* Step 1: Get scores for all edges */
    vector scores = scoreEdges(h, known_bad); /* scores by edge_index */

    /* Step 2: Find cutset based on scores */
    vector cut = findMinCut(h, scores);

    /* Step 3: Get literals corresponding to cut edges */
    map> cut_lits;
    for (const auto &e : cut) {
        set lits = getLiteralSet(h, e);
        sanitizeAndCompressAndScore(lits);

        cut_lits[e] = lits;
    }

    /* Bail out if the literals are under length or if the cut involves a
     * forbidden edge. */
    if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) {
        return false;
    }
    DEBUG_PRINTF("splitting\n");

    /* Step 4: Split graph based on cuts */
    splitEdgesByCut(h, vg, ee, cut, cut_lits);

    return true;
}

/*
 * Rewrites a leading "dot" vertex hanging off start so that the work is done
 * by startDs instead. Does nothing (returns false) if startDs already has
 * proper out-edges.
 *
 * Two shapes are recognised among the successors of start whose reach is
 * every character:
 *  - the vertex has exactly the same successor set as start (startDs
 *    excluded): startDs is wired to each non-startDs successor of start, the
 *    vertex is removed and the vertices are renumbered;
 *  - start has a single non-startDs successor and the vertex has a self
 *    loop: startDs is wired to the vertex and the self loop is removed.
 *
 * Returns true if the graph was modified.
 */
static
bool deanchorIfNeeded(NGHolder &g) {
    DEBUG_PRINTF("hi\n");
    if (proper_out_degree(g.startDs, g)) {
        return false;
    }

    /* look for a non-special dot with a loop following start */
    set succ_g;
    insert(&succ_g, adjacent_vertices(g.start, g));
    succ_g.erase(g.startDs);

    for (auto v : adjacent_vertices_range(g.start, g)) {
        DEBUG_PRINTF("inspecting cand %zu || = %zu\n", g[v].index,
                     g[v].char_reach.count());

        if (v == g.startDs || !g[v].char_reach.all()) {
            continue;
        }

        set succ_v;
        insert(&succ_v, adjacent_vertices(v, g));

        if (succ_v == succ_g) {
            DEBUG_PRINTF("found ^.*\n");
            for (auto succ : adjacent_vertices_range(g.start, g)) {
                if (succ == g.startDs) {
                    continue;
                }
                add_edge(g.startDs, succ, g);
            }
            /* v is now redundant; returning straight away means the range
             * being iterated is not touched again after the removal */
            clear_vertex(v, g);
            remove_vertex(v, g);
            renumber_vertices(g);
            return true;
        }

        if (succ_g.size() == 1 && hasSelfLoop(v, g)) {
            DEBUG_PRINTF("found ^.+\n");
            add_edge(g.startDs, v, g);
            remove_edge(v, v, g);
            return true;
        }
    }

    return false;
}

/*
 * Builds the initial RoseInGraph for h: a start vertex and an accept vertex
 * with an empty report set, joined by a single edge carrying a clone of h.
 * The start vertex is created as anchored if the clone is anchored or if
 * deanchorIfNeeded() rewrote it.
 */
static
RoseInGraph populateTrivialGraph(const NGHolder &h) {
    RoseInGraph g;
    shared_ptr root_g = cloneHolder(h);
    bool orig_anch = isAnchored(*root_g);
    orig_anch |= deanchorIfNeeded(*root_g);

    DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch);

    auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g);
    auto accept = add_vertex(RoseInVertexProps::makeAccept(set()), g);

    add_edge(start, accept, RoseInEdgeProps(root_g, 0), g);

    return g;
}

/*
 * Expects the trivial two-vertex, one-edge graph (asserted) and tries to
 * split its single graph: first with findBestNormalSplit(), then, if
 * last_chance is set, with a prefix split, and finally with a netflow cut.
 */
static
void avoidOutfixes(RoseInGraph &vg, bool last_chance,
                   const CompileContext &cc) {
    STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n");
    assert(num_vertices(vg) == 2);
    assert(num_edges(vg) == 1);

    RoseInEdge e = *edges(vg).first;

    NGHolder &h = *vg[e].graph;
    assert(isCorrectlyTopped(h));

    renumber_vertices(h);
    renumber_edges(h);

    unique_ptr split = findBestNormalSplit(h, vg, {e}, cc);

    if (split && splitRoseEdge(h, vg, {e}, *split)) {
        DEBUG_PRINTF("split on simple literal\n");
        return;
    }

    if (last_chance) {
        /* look for a prefix split as it allows us to accept very weak anchored
         * literals.
*/
        auto depths = calcDepths(h);

        split = findBestPrefixSplit(h, depths, vg, {e}, last_chance, cc);

        if (split && splitRoseEdge(h, vg, {e}, *split)) {
            DEBUG_PRINTF("split on simple literal\n");
            return;
        }
    }

    /* final attempt; the result is not inspected */
    doNetflowCut(h, nullptr, vg, {e}, false, cc.grey);
}

/*
 * Drops the graph from every RIV_START -> RIV_LITERAL edge whose graph is
 * both floating (isFloating()) and nothing more than the successor literal
 * itself (literalIsWholeGraph()).
 */
static
void removeRedundantPrefixes(RoseInGraph &g) {
    STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n");

    for (const RoseInEdge &e : edges_range(g)) {
        RoseInVertex s = source(e, g);
        RoseInVertex t = target(e, g);

        if (g[s].type != RIV_START || g[t].type != RIV_LITERAL) {
            continue;
        }

        if (!g[e].graph) {
            continue;
        }

        assert(!g[t].delay);
        const ue2_literal &lit = g[t].s;

        if (!literalIsWholeGraph(*g[e].graph, lit)) {
            DEBUG_PRINTF("not whole graph\n");
            continue;
        }

        if (!isFloating(*g[e].graph)) {
            DEBUG_PRINTF("not floating\n");
            continue;
        }
        g[e].graph.reset();
    }
}

/*
 * Upper bound passed to removeTrailingLiteralStates() for prefixes:
 * MO_INVALID_IDX when not compiling for streaming mode, otherwise
 * cc.grey.maxHistoryAvailable.
 */
static
u32 maxDelay(const CompileContext &cc) {
    if (!cc.streaming) {
        return MO_INVALID_IDX;
    }
    return cc.grey.maxHistoryAvailable;
}

/*
 * For each prefix edge (source is RIV_START or RIV_ANCHORED_START, target is
 * RIV_LITERAL) that carries a graph and has no graph_lag yet, works on a
 * clone of the graph and strips the trailing states that match the
 * successor literal.
 */
static
void removeRedundantLiteralsFromPrefixes(RoseInGraph &g,
                                         const CompileContext &cc) {
    STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n");

    /* edges to be rehomed once the iteration over the edges has finished */
    vector to_anchor;
    for (const RoseInEdge &e : edges_range(g)) {
        RoseInVertex s = source(e, g);
        RoseInVertex t = target(e, g);

        if (g[s].type != RIV_START && g[s].type != RIV_ANCHORED_START) {
            continue;
        }

        if (g[t].type != RIV_LITERAL) {
            continue;
        }

        if (!g[e].graph) {
            continue;
        }

        if (g[e].graph_lag) {
            /* already removed redundant parts of literals */
            continue;
        }

        assert(!g[t].delay);
        const ue2_literal &lit = g[t].s;

        DEBUG_PRINTF("removing states for literal: %s\n",
                     dumpString(lit).c_str());

        /* trim a private copy; the edge's graph is only replaced later on */
        unique_ptr h = cloneHolder(*g[e].graph);
        const u32 max_delay = maxDelay(cc);

        u32 delay = removeTrailingLiteralStates(*h, lit, max_delay,
                                                false /* can't overhang start */);

        DEBUG_PRINTF("got delay %u (max allowed %u)\n", delay, max_delay);

        if (edge(h->startDs, h->accept, *h).second) {
            /* we should have delay == lit.length(), but in really complex
             * cases we may fail to identify that we can remove the whole
* graph. Regardless, the fact that sds is wired to accept means the * graph serves no purpose. */ DEBUG_PRINTF("whole graph\n"); g[e].graph.reset(); continue; } if (delay == lit.length() && edge(h->start, h->accept, *h).second && num_vertices(*h) == N_SPECIALS) { to_anchor.push_back(e); continue; } /* if we got here we should still have an interesting graph */ assert(delay == max_delay || num_vertices(*h) > N_SPECIALS); if (delay && delay != MO_INVALID_IDX) { DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get()); g[e].graph = move(h); g[e].graph_lag = delay; } } if (!to_anchor.empty()) { RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g); for (RoseInEdge e : to_anchor) { DEBUG_PRINTF("rehoming to anchor\n"); RoseInVertex v = target(e, g); add_edge(anch, v, g); remove_edge(e, g); } } } static bool isStarCliche(const NGHolder &g) { DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g)); bool nonspecials_seen = false; for (auto v : vertices_range(g)) { if (is_special(v, g)) { continue; } if (nonspecials_seen) { return false; } nonspecials_seen = true; if (!g[v].char_reach.all()) { return false; } if (!hasSelfLoop(v, g)) { return false; } if (!edge(v, g.accept, g).second) { return false; } } if (!nonspecials_seen) { return false; } if (!edge(g.start, g.accept, g).second) { return false; } return true; } static void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, const vector &ee, const CompileContext &cc) { /* TODO: This could be better by not creating a separate graph for each * successor literal. This would require using distinct report ids and also * taking into account overlap of successor literals. 
*/ set preds; set succs; for (const RoseInEdge &e : ee) { RoseInVertex u = source(e, ig); assert(ig[u].type == RIV_LITERAL); assert(!ig[u].delay); preds.insert(ig[u].s); RoseInVertex v = target(e, ig); assert(ig[v].type == RIV_LITERAL); assert(!ig[v].delay); succs.insert(ig[v].s); if (ig[e].graph_lag) { /* already removed redundant parts of literals */ return; } } map, u32> > graphs; /* + delay */ for (const ue2_literal &right : succs) { size_t max_overlap = 0; for (const ue2_literal &left : preds) { size_t overlap = maxOverlap(left, right, 0); ENSURE_AT_LEAST(&max_overlap, overlap); } u32 max_allowed_delay = right.length() - max_overlap; if (cc.streaming) { LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable); } if (!max_allowed_delay) { continue; } shared_ptr h_new = cloneHolder(h); u32 delay = removeTrailingLiteralStates(*h_new, right, max_allowed_delay); if (delay == MO_INVALID_IDX) { /* successor literal could not match infix -> ignore false path */ assert(0); continue; } assert(isCorrectlyTopped(*h_new)); graphs[right] = make_pair(h_new, delay); } for (const RoseInEdge &e : ee) { RoseInVertex v = target(e, ig); const ue2_literal &succ = ig[v].s; if (!contains(graphs, succ)) { continue; } ig[e].graph = graphs[succ].first; ig[e].graph_lag = graphs[succ].second; if (isStarCliche(*ig[e].graph)) { DEBUG_PRINTF("is a X star!\n"); ig[e].graph.reset(); ig[e].graph_lag = 0; } } } static void removeRedundantLiteralsFromInfixes(RoseInGraph &g, const CompileContext &cc) { vector seen_order; map> infixes; for (const RoseInEdge &e : edges_range(g)) { RoseInVertex s = source(e, g); RoseInVertex t = target(e, g); if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) { continue; } if (!g[e].graph) { continue; } assert(!g[t].delay); NGHolder *h = g[e].graph.get(); if (!contains(infixes, h)) { seen_order.push_back(h); } infixes[h].push_back(e); } for (NGHolder *h : seen_order) { removeRedundantLiteralsFromInfix(*h, g, infixes[h], cc); } } static void 
removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) {
    /* prefixes first, then literal-to-literal infixes */
    removeRedundantLiteralsFromPrefixes(g, cc);
    removeRedundantLiteralsFromInfixes(g, cc);
}

/*
 * Returns the first vertex found whose type is RIV_START or
 * RIV_ANCHORED_START. Asserts (and returns the null vertex) if there is none.
 */
static
RoseInVertex getStart(RoseInGraph &vg) {
    for (RoseInVertex v : vertices_range(vg)) {
        if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) {
            return v;
        }
    }
    assert(0);
    return RoseInGraph::null_vertex();
}

/**
 * Finds the initial accept vertex created to which suffix/outfixes are
 * attached. It is identified as the first RIV_ACCEPT vertex with an empty
 * report set; asserts (and returns the null vertex) if there is none.
 */
static
RoseInVertex getPrimaryAccept(RoseInGraph &vg) {
    for (RoseInVertex v : vertices_range(vg)) {
        if (vg[v].type == RIV_ACCEPT && vg[v].reports.empty()) {
            return v;
        }
    }
    assert(0);
    return RoseInGraph::null_vertex();
}

/*
 * True if max_depth is within the transient limit: at most
 * ROSE_BLOCK_TRANSIENT_MAX_WIDTH when not streaming, at most
 * cc.grey.maxHistoryAvailable + 1 when streaming.
 */
static
bool willBeTransient(const depth &max_depth, const CompileContext &cc) {
    if (!cc.streaming) {
        return max_depth <= depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH);
    } else {
        return max_depth <= depth(cc.grey.maxHistoryAvailable + 1);
    }
}

/* True if max_depth does not exceed grey.maxAnchoredRegion. */
static
bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) {
    return max_depth <= depth(grey.maxAnchoredRegion);
}

/*
 * Builds an NFA_INFIX graph that is a straight chain of count dot vertices
 * hanging off start. The last vertex of the chain carries report 0 and is
 * wired to accept. count must be non-zero (asserted).
 */
static
unique_ptr make_chain(u32 count) {
    assert(count);

    auto rv = make_unique(NFA_INFIX);

    NGHolder &h = *rv;

    /* u is always the current end of the chain */
    NFAVertex u = h.start;
    for (u32 i = 0; i < count; i++) {
        NFAVertex v = add_vertex(h);
        h[v].char_reach = CharReach::dot();
        add_edge(u, v, h);
        u = v;
    }
    h[u].reports.insert(0);
    add_edge(u, h.accept, h);

    setTops(h);

    return rv;
}

/* Length of the literal tail left on the original literal vertex when
 * makeTransientFromLongLiteral() splits a long literal. */
#define SHORT_TRIGGER_LEN 16

static
bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg,
                                  const vector &ee,
                                  const CompileContext &cc) {
    /* check max width and literal lengths to see if possible */
    size_t min_lit = (size_t)~0ULL;
    for (const RoseInEdge &e : ee) {
        RoseInVertex v = target(e, vg);
        LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length());
    }

    /* the shortest successor literal must be longer than the tail we keep,
     * and small enough for delta below to fit in a u32 */
    if (min_lit <= SHORT_TRIGGER_LEN || min_lit >= UINT_MAX) {
        return false;
    }

    depth max_width = findMaxWidth(h);

    /* number of trailing literal characters to move out of the prefix */
    u32 delta = min_lit - SHORT_TRIGGER_LEN;

    if (!willBeTransient(max_width - depth(delta), cc)
        && !willBeAnchoredTable(max_width - depth(delta), cc.grey)) {
return false; } DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n", min_lit); DEBUG_PRINTF("delta = %u\n", delta); /* try split */ map > graphs; for (const RoseInEdge &e : ee) { RoseInVertex v = target(e, vg); shared_ptr h_new = cloneHolder(h); u32 delay = removeTrailingLiteralStates(*h_new, vg[v].s, delta); DEBUG_PRINTF("delay %u\n", delay); if (delay != delta) { DEBUG_PRINTF("unable to trim literal\n"); return false; } if (in_degree(v, vg) != 1) { DEBUG_PRINTF("complicated\n"); return false; } DEBUG_PRINTF("new mw = %u\n", (u32)findMaxWidth(*h_new)); assert(willBeTransient(findMaxWidth(*h_new), cc) || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey)); assert(isCorrectlyTopped(*h_new)); graphs[v] = h_new; } /* add .{repeats} from prefixes to long literals */ for (const RoseInEdge &e : ee) { RoseInVertex s = source(e, vg); RoseInVertex t = target(e, vg); remove_edge(e, vg); const ue2_literal &orig_lit = vg[t].s; ue2_literal lit(orig_lit.begin(), orig_lit.end() - delta); ue2_literal lit2(orig_lit.end() - delta, orig_lit.end()); assert(lit.length() + delta == orig_lit.length()); vg[t].s = lit2; RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); add_edge(s, v, RoseInEdgeProps(graphs[t], 0), vg); add_edge(v, t, RoseInEdgeProps(make_chain(delta), 0), vg); } DEBUG_PRINTF("success\n"); /* TODO: alter split point to avoid pathological splits */ return true; } static void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, u32 delay, const vector &preds) { assert(delay <= lit.length()); assert(isCorrectlyTopped(g)); DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay); NFAVertex prev = g.accept; auto it = lit.rbegin(); while (delay--) { NFAVertex curr = add_vertex(g); assert(it != lit.rend()); g[curr].char_reach = *it; add_edge(curr, prev, g); ++it; prev = curr; } for (auto v : preds) { NFAEdge e = add_edge_if_not_present(v, prev, g); if (v == g.start && is_triggered(g)) { g[e].tops.insert(DEFAULT_TOP); } } // 
Every predecessor of accept must have a report. set_report(g, 0); renumber_vertices(g); renumber_edges(g); assert(allMatchStatesHaveReports(g)); assert(isCorrectlyTopped(g)); } static void restoreTrailingLiteralStates(NGHolder &g, const vector> &lits) { vector preds; insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); clear_in_edges(g.accept, g); for (auto v : preds) { g[v].reports.clear(); /* clear report from old accepts */ } for (const auto &p : lits) { const ue2_literal &lit = p.first; u32 delay = p.second; restoreTrailingLiteralStates(g, lit, delay, preds); } } static bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, const CompileContext &cc) { DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h, num_vertices(h)); assert(isCorrectlyTopped(h)); renumber_vertices(h); renumber_edges(h); auto depths = calcDepths(h); /* If the reason the prefix is not transient is due to a very long literal * following, we can make it transient by restricting ourselves to using * just the head of the literal. 
*/ if (makeTransientFromLongLiteral(h, vg, ee, cc)) { return true; } auto split = findBestPrefixSplit(h, depths, vg, ee, false, cc); if (split && (split->creates_transient || split->creates_anchored) && splitRoseEdge(h, vg, ee, *split)) { DEBUG_PRINTF("split on simple literal\n"); return true; } /* large back edges may prevent us identifing anchored or transient cases * properly - use a simple walk instead */ if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) { return true; } if (split && splitRoseEdge(h, vg, ee, *split)) { /* use the simple split even though it doesn't create a transient * prefix */ DEBUG_PRINTF("split on simple literal\n"); return true; } /* look for netflow cuts which don't produce good prefixes */ if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) { return true; } if (ee.size() > 1) { DEBUG_PRINTF("split the prefix apart based on succ literals\n"); unordered_map, vector >, NGHolderHasher, NGHolderEqual> trimmed; for (const auto &e : ee) { shared_ptr hh = cloneHolder(h); auto succ_lit = vg[target(e, vg)].s; assert(isCorrectlyTopped(*hh)); u32 delay = removeTrailingLiteralStates(*hh, succ_lit, succ_lit.length(), false /* can't overhang start */); if (!delay) { DEBUG_PRINTF("could not remove any literal, skip over\n"); continue; } assert(isCorrectlyTopped(*hh)); trimmed[hh].emplace_back(e, delay); } if (trimmed.size() == 1) { return false; } /* shift the contents to a vector so we can modify the graphs without * violating the map's invariants. 
*/ vector, vector > > > trimmed_vec(trimmed.begin(), trimmed.end()); trimmed.clear(); for (auto &elem : trimmed_vec) { shared_ptr &hp = elem.first; vector> succ_lits; for (const auto &edge_delay : elem.second) { const RoseInEdge &e = edge_delay.first; u32 delay = edge_delay.second; auto lit = vg[target(e, vg)].s; vg[e].graph = hp; assert(delay <= lit.length()); succ_lits.emplace_back(lit, delay); } restoreTrailingLiteralStates(*hp, succ_lits); } return true; } return false; } #define MAX_FIND_BETTER_PREFIX_GEN 4 #define MAX_FIND_BETTER_PREFIX_COUNT 100 static void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n"); RoseInVertex start = getStart(vg); bool changed; u32 gen = 0; do { DEBUG_PRINTF("gen %u\n", gen); changed = false; vector seen_order; map > prefixes; /* find prefixes */ for (const RoseInEdge &e : out_edges_range(start, vg)) { /* outfixes shouldn't have made it this far */ assert(vg[target(e, vg)].type == RIV_LITERAL); if (vg[e].graph) { NGHolder *h = vg[e].graph.get(); if (!contains(prefixes, h)) { seen_order.push_back(h); } prefixes[h].push_back(e); } } if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) { break; } /* look for bad prefixes and try to split */ for (NGHolder *h : seen_order) { depth max_width = findMaxWidth(*h); if (willBeTransient(max_width, cc) || willBeAnchoredTable(max_width, cc.grey)) { continue; } changed = improvePrefix(*h, vg, prefixes[h], cc); } } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN); } #define STRONG_LITERAL_LENGTH 20 #define MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10 static bool extractStrongLiteral(NGHolder &h, RoseInGraph &vg, const vector &ee, const CompileContext &cc) { DEBUG_PRINTF("looking for string literal\n"); unique_ptr split = findBestNormalSplit(h, vg, ee, cc); if (split && min_len(split->lit) >= STRONG_LITERAL_LENGTH) { DEBUG_PRINTF("splitting simple literal\n"); return splitRoseEdge(h, vg, ee, *split); } return false; } static void 
extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) {
    /* Repeatedly applies extractStrongLiteral() to every graph found on an
     * edge leaving a literal vertex, until a full pass makes no change.
     * Disabled unless cc.grey.violetExtractStrongLiterals is set. */
    if (!cc.grey.violetExtractStrongLiterals) {
        return;
    }

    STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n");

    /* graphs for which extractStrongLiteral() has already failed; they are
     * not retried on later passes */
    set stuck;

    bool changed;
    do {
        changed = false;

        /* graphs in first-seen order, so that processing is deterministic */
        vector seen_order;
        map > edges_by_graph;
        for (const RoseInEdge &ve : edges_range(vg)) {
            if (vg[source(ve, vg)].type != RIV_LITERAL) {
                continue;
            }

            if (vg[ve].graph) {
                if (!contains(edges_by_graph, vg[ve].graph.get())) {
                    seen_order.push_back(vg[ve].graph.get());
                }
                edges_by_graph[vg[ve].graph.get()].push_back(ve);
            }
        }

        if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) {
            DEBUG_PRINTF("too many graphs, stopping\n");
            return;
        }

        for (NGHolder *g : seen_order) {
            if (contains(stuck, g)) {
                DEBUG_PRINTF("already known to be bad\n");
                continue;
            }
            bool rv = extractStrongLiteral(*g, vg, edges_by_graph[g], cc);
            if (rv) {
                changed = true;
            } else {
                stuck.insert(g);
            }
        }
    } while (changed);
}

/* Literals guarding an infix that are shorter than this are treated as weak
 * by improveWeakInfixes(). */
#define INFIX_STRONG_GUARD_LEN 8
/* Minimum literal length (as measured by min_len()) for improveInfix() to
 * use a simple split. */
#define INFIX_MIN_SPLIT_LITERAL_LEN 12

/*
 * Tries to split h (carried by edges ee): first on the best normal split if
 * its shortest literal is at least INFIX_MIN_SPLIT_LITERAL_LEN long, then
 * with a netflow cut using a minimum allowed literal length of 8. Returns
 * true if either succeeded.
 */
static
bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector &ee,
                  const CompileContext &cc) {
    unique_ptr split = findBestNormalSplit(h, vg, ee, cc);

    if (split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN
        && splitRoseEdge(h, vg, ee, *split)) {
        DEBUG_PRINTF("splitting simple literal\n");
        return true;
    }

    DEBUG_PRINTF("trying for a netflow cut\n");
    /* look for netflow cuts which don't produce good prefixes */
    bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8);

    DEBUG_PRINTF("did netfow cut? = %d\n", (int)rv);

    return rv;
}

/**
 * Infixes which are weakly guarded can, in effect, act like prefixes as they
 * will often be live. We should try to split these infixes further if they
 * contain strong literals so that we are at least running smaller weak infixes
 * which can hopefully be accelerated/miracled.
*/ static void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) { if (!cc.grey.violetAvoidWeakInfixes) { return; } STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n"); RoseInVertex start = getStart(vg); set weak; vector ordered_weak; for (RoseInVertex vv : adjacent_vertices_range(start, vg)) { /* outfixes shouldn't have made it this far */ assert(vg[vv].type == RIV_LITERAL); if (vg[vv].s.length() >= INFIX_STRONG_GUARD_LEN) { continue; } for (const RoseInEdge &e : out_edges_range(vv, vg)) { if (vg[target(e, vg)].type != RIV_LITERAL || !vg[e].graph) { continue; } NGHolder *h = vg[e].graph.get(); DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[vv].s).c_str(), h); if (!contains(weak, h)) { weak.insert(h); ordered_weak.push_back(h); } } } map > weak_edges; for (const RoseInEdge &ve : edges_range(vg)) { if (contains(weak, vg[ve].graph.get())) { weak_edges[vg[ve].graph.get()].push_back(ve); } } for (NGHolder *h : ordered_weak) { improveInfix(*h, vg, weak_edges[h], cc); } } static void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg, const vector &ee, const VertLitInfo &split, bool eod, const flat_set &reports) { const vector &splitters = split.vv; assert(!splitters.empty()); shared_ptr lhs = make_shared(); unordered_map v_map; cloneHolder(*lhs, base_graph, &v_map); lhs->kind = NFA_INFIX; clear_in_edges(lhs->accept, *lhs); clear_in_edges(lhs->acceptEod, *lhs); add_edge(lhs->accept, lhs->acceptEod, *lhs); clearReports(*lhs); for (NFAVertex v : splitters) { NFAEdge e = add_edge(v_map[v], lhs->accept, *lhs); if (v == base_graph.start) { (*lhs)[e].tops.insert(DEFAULT_TOP); } (*lhs)[v_map[v]].reports.insert(0); } pruneUseless(*lhs); assert(isCorrectlyTopped(*lhs)); /* create literal vertices and connect preds */ for (const auto &lit : split.lit) { if (!can_match(*lhs, lit, is_triggered(*lhs))) { continue; } DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str()); RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg); RoseInVertex tt; if 
(eod) { DEBUG_PRINTF("doing eod\n"); tt = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg); } else { DEBUG_PRINTF("doing non-eod\n"); tt = add_vertex(RoseInVertexProps::makeAccept(reports), vg); } add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg); for (const RoseInEdge &e : ee) { RoseInVertex u = source(e, vg); assert(!edge(u, v, vg).second); add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg); } } } #define MIN_SUFFIX_LEN 6 static bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg, const vector &suffix_edges, const CompileContext &cc) { DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n", &h, suffix_edges.size()); /* * We would, in general, rather not have output exposed engines because * once they are triggered, they must be run while infixes only have to run * if the successor literal is seen. Matches from output exposed engines * also have to be placed in a priority queue and interleaved with matches * from other sources. * * Note: * - if the LHS is extremely unlikely we may be better off leaving * a suffix unguarded. * * - limited width suffixes may be less bad as they won't be continuously * active, we may want to have (a) stronger controls on if we want to pick * a trailing literal in these cases and/or (b) look also for literals * near accept as well as right on accept * * TODO: improve heuristics, splitting logic. 
*/ /* we may do multiple splits corresponding to different report behaviour */ set seen; map >, VertLitInfo> by_reports; /* eod, rep */ for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) { set ss = getLiteralSet(h, v, false); if (ss.empty()) { DEBUG_PRINTF("candidate is too shitty\n"); return false; } VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)]; insert(&vli.lit, ss); vli.vv.push_back(v); seen.insert(v); } seen.insert(h.accept); for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) { if (contains(seen, v)) { continue; } set ss = getLiteralSet(h, v, false); if (ss.empty()) { DEBUG_PRINTF("candidate is too shitty\n"); return false; } VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)]; insert(&vli.lit, ss); vli.vv.push_back(v); } assert(!by_reports.empty()); /* TODO: how strong a min len do we want here ? */ u32 min_len = cc.grey.minRoseLiteralLength; ENSURE_AT_LEAST(&min_len, MIN_SUFFIX_LEN); for (auto &vli : by_reports | map_values) { u64a score = sanitizeAndCompressAndScore(vli.lit); if (vli.lit.empty() || !validateRoseLiteralSetQuality(vli.lit, score, false, min_len, false, false)) { return false; } } for (const auto &info : by_reports) { DEBUG_PRINTF("splitting on simple literals\n"); splitEdgesForSuffix(h, vg, suffix_edges, info.second, info.first.first /* eod */, info.first.second /* reports */); } for (const RoseInEdge &e : suffix_edges) { remove_edge(e, vg); } return true; } static void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) { if (!cc.grey.violetAvoidSuffixes) { return; } STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n"); RoseInVertex accept = getPrimaryAccept(vg); map > suffixes; vector ordered_suffixes; /* find suffixes */ for (const RoseInEdge &e : in_edges_range(accept, vg)) { /* outfixes shouldn't have made it this far */ assert(vg[source(e, vg)].type == RIV_LITERAL); assert(vg[e].graph); /* non suffix paths should be wired to other accepts */ const NGHolder *h = vg[e].graph.get(); if 
(!contains(suffixes, h)) { ordered_suffixes.push_back(h); } suffixes[h].push_back(e); } /* look at suffixes and try to split */ for (const NGHolder *h : ordered_suffixes) { replaceSuffixWithInfix(*h, vg, suffixes[h], cc); } } static bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) { if (out_degree(h.start, h) != 3) { return false; } NFAVertex v = NGHolder::null_vertex(); NFAVertex ds = NGHolder::null_vertex(); for (NFAVertex a : adjacent_vertices_range(h.start, h)) { if (a == h.startDs) { continue; } if (h[a].char_reach.all()) { ds = a; if (out_degree(ds, h) != 2 || !edge(ds, ds, h).second) { return false; } } else { v = a; } } if (!v || !ds || !edge(ds, v, h).second) { return false; } if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { return false; } ue2_literal lit; lit.push_back(h[v].char_reach.find_first(), h[v].char_reach.isCaselessChar()); while (out_degree(v, h) == 1) { NFAVertex vv = *adjacent_vertices(v, h).first; if (h[vv].char_reach.count() != 1 && !h[vv].char_reach.isCaselessChar()) { break; } v = vv; lit.push_back(h[v].char_reach.find_first(), h[v].char_reach.isCaselessChar()); } if (is_match_vertex(v, h) && h.kind != NFA_SUFFIX) { /* we have rediscovered the post-infix literal */ return false; } if (bad_mixed_sensitivity(lit)) { make_nocase(&lit); } DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str()); out->vv = {v}; out->lit = {lit}; return true; } static bool lookForDoubleCut(const NGHolder &h, const vector &ee, RoseInGraph &vg, const Grey &grey) { VertLitInfo info; if (!leadingDotStartLiteral(h, &info) || min_len(info.lit) < grey.violetDoubleCutLiteralLen) { return false; } DEBUG_PRINTF("performing split\n"); return splitRoseEdge(h, vg, ee, {info}); } static void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) { if (!cc.grey.violetDoubleCut) { return; } map > right_edges; vector ordered_graphs; for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph && vg[source(ve, vg)].type 
== RIV_LITERAL) { const NGHolder *h = vg[ve].graph.get(); if (!contains(right_edges, h)) { ordered_graphs.push_back(h); } right_edges[h].push_back(ve); } } for (const NGHolder *h : ordered_graphs) { lookForDoubleCut(*h, right_edges[h], vg, cc.grey); } } static pair findLiteralBefore(const NGHolder &h, NFAVertex v) { ue2_literal lit; if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) { return {v, std::move(lit) }; } lit.push_back(h[v].char_reach.find_first(), h[v].char_reach.isCaselessChar()); while (in_degree(v, h) == 1) { NFAVertex vv = *inv_adjacent_vertices(v, h).first; if (h[vv].char_reach.count() != 1 && !h[vv].char_reach.isCaselessChar()) { break; } lit.push_back(h[vv].char_reach.find_first(), h[vv].char_reach.isCaselessChar()); v = vv; } return {v, std::move(lit) }; } static bool lookForDotStarPred(NFAVertex v, const NGHolder &h, NFAVertex *u, NFAVertex *ds) { *u = NGHolder::null_vertex(); *ds = NGHolder::null_vertex(); for (NFAVertex a : inv_adjacent_vertices_range(v, h)) { if (h[a].char_reach.all()) { if (!edge(a, a, h).second) { return false; } if (*ds) { return false; } *ds = a; } else { if (*u) { return false; } *u = a; } } if (!*u || !*ds) { return false; } return true; } static bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) { /* Note: there is no delay yet - so the final literal is the already * discovered successor literal - we are in fact interested in the literal * before it. 
*/
    if (in_degree(h.accept, h) != 1) {
        return false;
    }

    if (in_degree(h.acceptEod, h) != 1) {
        assert(0);
        return false;
    }

    /* walk back over the literal that leads into accept; v ends up as the
     * earliest vertex of that literal */
    NFAVertex v
        = findLiteralBefore(h, *inv_adjacent_vertices(h.accept, h).first).first;

    NFAVertex u;
    NFAVertex ds;

    /* that literal must be preceded by exactly one self-looping dot vertex
     * (ds) and one other vertex (u) */
    if (!lookForDotStarPred(v, h, &u, &ds)) {
        return false;
    }

    /* u is the last vertex of the literal we are actually interested in */
    v = u;
    auto rv = findLiteralBefore(h, v);

    /* NOTE(review): this checks the predecessors of v (the last vertex of the
     * earlier literal), not of rv.first (its first vertex) - confirm that
     * this is the intended vertex. */
    if (!lookForDotStarPred(v, h, &u, &ds)) {
        return false;
    }

    /* findLiteralBefore() collects characters walking backwards */
    ue2_literal lit = reverse_literal(rv.second);
    DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str());

    if (bad_mixed_sensitivity(lit)) {
        make_nocase(&lit);
    }

    out->vv = {v};
    out->lit = {lit};

    return true;
}

/*
 * Splits h (carried by edges ee) on the literal reported by
 * trailingDotStarLiteral(), provided its length is at least
 * grey.violetDoubleCutLiteralLen. Returns the result of splitRoseEdge(), or
 * false if no suitable literal was found.
 */
static
bool lookForTrailingLiteralDotStar(const NGHolder &h,
                                   const vector &ee,
                                   RoseInGraph &vg, const Grey &grey) {
    VertLitInfo info;
    if (!trailingDotStarLiteral(h, &info)
        || min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
        return false;
    }
    DEBUG_PRINTF("performing split\n");
    return splitRoseEdge(h, vg, ee, info);
}

/* In streaming mode, active engines have to be caught up at stream boundaries
 * and have to be stored in stream state, so we prefer to decompose patterns
 * in to literals with no state between them if possible.
*/ static void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) { if (!cc.grey.violetLiteralChains) { return; } bool changed; do { changed = false; map > right_edges; vector ordered_graphs; for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { const NGHolder *h = vg[ve].graph.get(); if (!contains(right_edges, h)) { ordered_graphs.push_back(h); } right_edges[h].push_back(ve); } } for (const NGHolder *h : ordered_graphs) { const vector &ee = right_edges[h]; bool rv = lookForDoubleCut(*h, ee, vg, cc.grey); if (!rv && h->kind != NFA_SUFFIX) { rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey); } changed |= rv; } } while (changed); } static bool lookForCleanSplit(const NGHolder &h, const vector &ee, RoseInGraph &vg, const CompileContext &cc) { unique_ptr split = findBestCleanSplit(h, cc); if (split) { return splitRoseEdge(h, vg, {ee}, *split); } return false; } #define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4 static void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) { u32 gen = 0; vector prev = {getStart(vg)}; while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) { /* collect vertices in edge order for determinism */ vector curr; set curr_seen; for (RoseInVertex u : prev) { for (auto v : adjacent_vertices_range(u, vg)) { if (curr_seen.insert(v).second) { curr.push_back(v); } } } map> rightfixes; vector ordered_graphs; for (RoseInVertex v : curr) { for (const RoseInEdge &e : out_edges_range(v, vg)) { if (vg[e].graph) { NGHolder *h = vg[e].graph.get(); if (!contains(rightfixes, h)) { ordered_graphs.push_back(h); } rightfixes[h].push_back(e); } } } for (const NGHolder *h : ordered_graphs) { lookForCleanSplit(*h, rightfixes[h], vg, cc); } prev = curr; gen++; } } static void rehomeEodSuffixes(RoseInGraph &vg) { // Find edges to accept with EOD-anchored graphs that we can move over to // acceptEod. 
vector acc_edges; for (const auto &e : edges_range(vg)) { if (vg[target(e, vg)].type != RIV_ACCEPT) { continue; } if (vg[e].haig || !vg[e].graph) { continue; } const NGHolder &h = *vg[e].graph; if (in_degree(h.accept, h)) { DEBUG_PRINTF("graph isn't eod anchored\n"); continue; } acc_edges.push_back(e); } for (const RoseInEdge &e : acc_edges) { // Move this edge from accept to acceptEod RoseInVertex w = add_vertex(RoseInVertexProps::makeAcceptEod(), vg); add_edge(source(e, vg), w, vg[e], vg); remove_edge(e, vg); } /* old accept vertices will be tidied up by final pruneUseless() call */ } static bool tryForEarlyDfa(const NGHolder &h, const CompileContext &cc) { switch (h.kind) { case NFA_OUTFIX: /* 'prefix' of eod */ case NFA_PREFIX: return cc.grey.earlyMcClellanPrefix; case NFA_INFIX: return cc.grey.earlyMcClellanInfix; case NFA_SUFFIX: return cc.grey.earlyMcClellanSuffix; default: DEBUG_PRINTF("kind %u\n", (u32)h.kind); assert(0); return false; } } static vector> getDfaTriggers(RoseInGraph &vg, const vector &edges, bool *single_trigger) { vector> triggers; u32 min_offset = ~0U; u32 max_offset = 0; for (const auto &e : edges) { RoseInVertex s = source(e, vg); if (vg[s].type == RIV_LITERAL) { triggers.push_back(as_cr_seq(vg[s].s)); } ENSURE_AT_LEAST(&max_offset, vg[s].max_offset); LIMIT_TO_AT_MOST(&min_offset, vg[s].min_offset); } *single_trigger = min_offset == max_offset; DEBUG_PRINTF("trigger offset (%u, %u)\n", min_offset, max_offset); return triggers; } static bool doEarlyDfa(RoseBuild &rose, RoseInGraph &vg, NGHolder &h, const vector &edges, bool final_chance, const ReportManager &rm, const CompileContext &cc) { DEBUG_PRINTF("trying for dfa\n"); bool single_trigger; for (const auto &e : edges) { if (vg[target(e, vg)].type == RIV_ACCEPT_EOD) { /* TODO: support eod prefixes */ return false; } } auto triggers = getDfaTriggers(vg, edges, &single_trigger); /* TODO: literal delay things */ if (!generates_callbacks(h)) { set_report(h, rose.getNewNfaReport()); } 
shared_ptr dfa = buildMcClellan(h, &rm, single_trigger, triggers, cc.grey, final_chance); if (!dfa) { return false; } DEBUG_PRINTF("dfa ok\n"); for (const auto &e : edges) { vg[e].dfa = dfa; } return true; } #define MAX_EDGES_FOR_IMPLEMENTABILITY 50 static bool splitForImplementability(RoseInGraph &vg, NGHolder &h, const vector &edges, const CompileContext &cc) { vector> succ_lits; DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n", to_string(h.kind).c_str(), num_vertices(h), edges.size()); if (edges.size() > MAX_EDGES_FOR_IMPLEMENTABILITY) { return false; } if (!generates_callbacks(h)) { for (const auto &e : edges) { const auto &lit = vg[target(e, vg)].s; u32 delay = vg[e].graph_lag; vg[e].graph_lag = 0; assert(delay <= lit.length()); succ_lits.emplace_back(lit, delay); } restoreTrailingLiteralStates(h, succ_lits); } unique_ptr split; bool last_chance = true; if (h.kind == NFA_PREFIX) { auto depths = calcDepths(h); split = findBestPrefixSplit(h, depths, vg, edges, last_chance, cc); } else { split = findBestLastChanceSplit(h, vg, edges, cc); } if (split && splitRoseEdge(h, vg, edges, *split)) { DEBUG_PRINTF("split on simple literal\n"); return true; } DEBUG_PRINTF("trying to netflow\n"); bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey); DEBUG_PRINTF("done\n"); return rv; } #define MAX_IMPLEMENTABLE_SPLITS 50 bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, bool final_chance, const ReportManager &rm, const CompileContext &cc) { DEBUG_PRINTF("checking for impl %d\n", final_chance); bool changed = false; bool need_to_recalc = false; u32 added_count = 0; unordered_set good; /* known to be implementable */ do { changed = false; DEBUG_PRINTF("added %u\n", added_count); map > edges_by_graph; vector graphs; for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph) { NGHolder *h = vg[ve].graph.get(); if (!contains(edges_by_graph, h)) { graphs.push_back(h); } edges_by_graph[h].push_back(ve); } } for (NGHolder 
*h : graphs) { if (contains(good, h)) { continue; } reduceGraphEquivalences(*h, cc); if (isImplementableNFA(*h, &rm, cc)) { good.insert(h); continue; } if (tryForEarlyDfa(*h, cc) && doEarlyDfa(rose, vg, *h, edges_by_graph[h], final_chance, rm, cc)) { good.insert(h); continue; } DEBUG_PRINTF("eek\n"); if (!allow_changes) { return false; } if (splitForImplementability(vg, *h, edges_by_graph[h], cc)) { added_count++; if (added_count > MAX_IMPLEMENTABLE_SPLITS) { DEBUG_PRINTF("added_count hit limit\n"); return false; } changed = true; good.insert(h); continue; } return false; } assert(added_count <= MAX_IMPLEMENTABLE_SPLITS); if (changed) { removeRedundantLiterals(vg, cc); pruneUseless(vg); need_to_recalc = true; } } while (changed); if (need_to_recalc) { renumber_vertices(vg); calcVertexOffsets(vg); } DEBUG_PRINTF("ok!\n"); return true; } static RoseInGraph doInitialVioletTransform(const NGHolder &h, bool last_chance, const CompileContext &cc) { assert(!can_never_match(h)); RoseInGraph vg = populateTrivialGraph(h); if (!cc.grey.allowViolet) { return vg; } /* Avoid running the Violet analysis at all on graphs with no vertices with * small reach, since we will not be able to extract any literals. */ if (!hasNarrowReachVertex(h)) { DEBUG_PRINTF("fail, no vertices with small reach\n"); return vg; } DEBUG_PRINTF("hello world\n"); /* Step 1: avoid outfixes as we always have to run them. 
*/ avoidOutfixes(vg, last_chance, cc); if (num_vertices(vg) <= 2) { return vg; /* unable to transform pattern */ } removeRedundantPrefixes(vg); dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot"); /* Step 2: avoid non-transient prefixes (esp in streaming mode) */ findBetterPrefixes(vg, cc); dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot"); extractStrongLiterals(vg, cc); dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot"); improveWeakInfixes(vg, cc); dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot"); /* Step 3: avoid output exposed engines if there is a strong trailing literal) */ avoidSuffixes(vg, cc); /* Step 4: look for infixes/suffixes with leading .*literals * This can reduce the amount of work a heavily picked literal has to do and * reduce the amount of state used as .* is handled internally to rose. */ lookForDoubleCut(vg, cc); if (cc.streaming) { lookForCleanEarlySplits(vg, cc); decomposeLiteralChains(vg, cc); } rehomeEodSuffixes(vg); removeRedundantLiterals(vg, cc); pruneUseless(vg); dumpPreRoseGraph(vg, cc.grey); renumber_vertices(vg); calcVertexOffsets(vg); return vg; } bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, bool last_chance, const ReportManager &rm, const CompileContext &cc) { auto vg = doInitialVioletTransform(h, last_chance, cc); if (num_vertices(vg) <= 2) { return false; } /* Step 5: avoid unimplementable, or overly large engines if possible */ if (!ensureImplementable(rose, vg, last_chance, last_chance, rm, cc)) { return false; } dumpPreRoseGraph(vg, cc.grey, "post_ensure_rose.dot"); /* Step 6: send to rose */ bool rv = rose.addRose(vg, prefilter); DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail"); return rv; } bool checkViolet(const ReportManager &rm, const NGHolder &h, bool prefilter, const CompileContext &cc) { auto vg = doInitialVioletTransform(h, true, cc); if (num_vertices(vg) <= 2) { return false; } bool rv = roseCheckRose(vg, prefilter, rm, cc); DEBUG_PRINTF("violet: %s\n", rv ? 
"success" : "fail"); return rv; } }