/* * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "rose_build_add_internal.h" #include "rose_build_impl.h" #include "ue2common.h" #include "grey.h" #include "rose_build_anchored.h" #include "rose_in_util.h" #include "hwlm/hwlm_literal.h" #include "nfa/goughcompile.h" #include "nfa/nfa_api_queue.h" #include "nfagraph/ng_depth.h" #include "nfagraph/ng_dump.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_limex.h" #include "nfagraph/ng_mcclellan.h" #include "nfagraph/ng_prefilter.h" #include "nfagraph/ng_prune.h" #include "nfagraph/ng_region.h" #include "nfagraph/ng_repeat.h" #include "nfagraph/ng_reports.h" #include "nfagraph/ng_util.h" #include "nfagraph/ng_width.h" #include "util/charreach.h" #include "util/charreach_util.h" #include "util/compare.h" #include "util/compile_context.h" #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" #include "util/insertion_ordered.h" #include "util/noncopyable.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/ue2string.h" #include "util/verify_types.h" #include #include #include #include #include #include using namespace std; namespace ue2 { /** * \brief Data used by most of the construction code in this file. */ struct RoseBuildData : noncopyable { RoseBuildData(const RoseInGraph &ig_in, bool som_in) : ig(ig_in), som(som_in) {} /** Input rose graph. */ const RoseInGraph &ig; /** Edges we've transformed (in \ref transformAnchoredLiteralOverlap) which * require ANCH history to prevent overlap. */ unordered_set anch_history_edges; /** True if we're tracking Start of Match. */ bool som; }; static ReportID findReportId(const NGHolder &g) { /* prefix/infix always have an edge to accept and only 1 reportid initially */ assert(in_degree(g.accept, g)); const auto &rep = g[*inv_adjacent_vertices(g.accept, g).first].reports; assert(!rep.empty()); return *rep.begin(); } static RoseVertex createVertex(RoseBuildImpl *build, u32 literalId, u32 min_offset, u32 max_offset) { RoseGraph &g = build->g; // add to tree RoseVertex v = add_vertex(g); g[v].min_offset = min_offset; g[v].max_offset = max_offset; DEBUG_PRINTF("insert vertex %zu into literal %u's vertex set\n", g[v].index, literalId); g[v].literals.insert(literalId); build->literal_info[literalId].vertices.insert(v); return v; } RoseVertex createVertex(RoseBuildImpl *build, const RoseVertex parent, u32 minBound, u32 maxBound, u32 literalId, size_t literalLength, const flat_set &reports) { assert(parent != RoseGraph::null_vertex()); RoseGraph &g = build->g; // add to tree (offsets set latter) RoseVertex v = createVertex(build, literalId, 0U, 0U); /* fill in report information */ g[v].reports.insert(reports.begin(), reports.end()); RoseEdge e = add_edge(parent, v, g); DEBUG_PRINTF("adding edge (%u, %u) to parent\n", minBound, maxBound); g[e].minBound = minBound; g[e].maxBound = maxBound; g[e].rose_top = 0; u32 min_offset = add_rose_depth(g[parent].min_offset, minBound); u32 max_offset = add_rose_depth(g[parent].max_offset, maxBound); /* take literal length into account for offsets */ const u32 lit_len = verify_u32(literalLength); min_offset = add_rose_depth(min_offset, lit_len); max_offset = add_rose_depth(max_offset, lit_len); g[v].min_offset = min_offset; g[v].max_offset = max_offset; return v; } static RoseVertex createAnchoredVertex(RoseBuildImpl *build, u32 literalId, u32 min_offset, u32 max_offset) { RoseGraph &g = build->g; RoseVertex v = createVertex(build, literalId, min_offset, max_offset); DEBUG_PRINTF("created anchored vertex %zu with lit id %u\n", g[v].index, literalId); RoseEdge e = add_edge(build->anchored_root, v, g); g[e].minBound = min_offset; g[e].maxBound = max_offset; return v; } static RoseVertex duplicate(RoseBuildImpl *build, RoseVertex v) { RoseGraph &g = build->g; RoseVertex w = add_vertex(g[v], g); DEBUG_PRINTF("added vertex %zu\n", g[w].index); for (auto lit_id : g[w].literals) { build->literal_info[lit_id].vertices.insert(w); } for (const auto &e : in_edges_range(v, g)) { RoseVertex s = source(e, g); add_edge(s, w, g[e], g); DEBUG_PRINTF("added edge (%zu,%zu)\n", g[s].index, g[w].index); } return w; } namespace { struct created_key { explicit created_key(const RoseInEdgeProps &trep) : prefix(trep.graph.get()), lag(trep.graph_lag) { } bool operator<(const created_key &b) const { const created_key &a = *this; ORDER_CHECK(prefix); ORDER_CHECK(lag); return false; } NGHolder *prefix; u32 lag; }; } static bool isPureAnchored(const NGHolder &h) { return !proper_out_degree(h.startDs, h); } static RoseRoleHistory selectHistory(const RoseBuildImpl &tbi, const RoseBuildData &bd, const RoseInEdge &rose_edge, const RoseEdge &e) { const RoseGraph &g = tbi.g; const RoseVertex u = source(e, g), v = target(e, g); const bool fixed_offset_src = g[u].fixedOffset(); const bool has_bounds = g[e].minBound || (g[e].maxBound != ROSE_BOUND_INF); /*DEBUG_PRINTF("edge %zu->%zu, bounds=[%u,%u], fixed_u=%d, prefix=%d\n", g[u].index, g[v].index, g[e].minBound, g[e].maxBound, (int)g[u].fixedOffset(), (int)g[v].left);*/ if (g[v].left) { // Roles with prefix engines have their history handled by that prefix. assert(!contains(bd.anch_history_edges, rose_edge)); return ROSE_ROLE_HISTORY_NONE; } if (contains(bd.anch_history_edges, rose_edge)) { DEBUG_PRINTF("needs anch history\n"); return ROSE_ROLE_HISTORY_ANCH; } if (fixed_offset_src && has_bounds) { DEBUG_PRINTF("needs anch history\n"); return ROSE_ROLE_HISTORY_ANCH; } return ROSE_ROLE_HISTORY_NONE; } static bool hasSuccessorLiterals(RoseInVertex iv, const RoseInGraph &ig) { // cppcheck-suppress useStlAlgorithm for (auto v : adjacent_vertices_range(iv, ig)) { if (ig[v].type != RIV_ACCEPT) { return true; } } return false; } static void createVertices(RoseBuildImpl *tbi, map > &vertex_map, const vector > &parents, RoseInVertex iv, u32 min_offset, u32 max_offset, u32 literalId, u32 delay, const RoseBuildData &bd) { RoseGraph &g = tbi->g; DEBUG_PRINTF("vertex has %zu parents\n", parents.size()); map created; for (const auto &pv : parents) { RoseVertex w; const RoseInEdgeProps &edge_props = bd.ig[pv.second]; shared_ptr prefix_graph = edge_props.graph; u32 prefix_lag = edge_props.graph_lag; created_key key(edge_props); if (!contains(created, key)) { assert(prefix_graph || !edge_props.haig); w = createVertex(tbi, literalId, min_offset, max_offset); created[key] = w; if (prefix_graph) { g[w].left.graph = prefix_graph; if (edge_props.dfa) { g[w].left.dfa = edge_props.dfa; } g[w].left.haig = edge_props.haig; g[w].left.lag = prefix_lag; // The graph already has its report id allocated - find it. g[w].left.leftfix_report = findReportId(*prefix_graph); if (g[w].left.dfa || g[w].left.haig) { assert(prefix_graph); g[w].left.dfa_min_width = findMinWidth(*prefix_graph); g[w].left.dfa_max_width = findMaxWidth(*prefix_graph); } } if (bd.som && !g[w].left.haig) { /* no prefix - som based on literal start */ assert(!prefix_graph); g[w].som_adjust = tbi->literals.at(literalId).elength(); DEBUG_PRINTF("set som_adjust to %u\n", g[w].som_adjust); } DEBUG_PRINTF(" adding new vertex index=%zu\n", tbi->g[w].index); vertex_map[iv].emplace_back(w); } else { w = created[key]; } RoseVertex p = pv.first; RoseEdge e = add_edge(p, w, g); DEBUG_PRINTF("adding edge (%u,%u) to parent\n", edge_props.minBound, edge_props.maxBound); g[e].minBound = edge_props.minBound; if (p != tbi->root && g[w].left.graph && (!tbi->isAnyStart(p) || isPureAnchored(*g[w].left.graph))) { depth mw = findMaxWidth(*g[w].left.graph); if (mw.is_infinite()) { g[e].maxBound = ROSE_BOUND_INF; } else { DEBUG_PRINTF("setting max to %s + %u\n", mw.str().c_str(), prefix_lag); g[e].maxBound = prefix_lag + mw; } } else { g[e].maxBound = edge_props.maxBound; } g[e].rose_top = 0; g[e].history = selectHistory(*tbi, bd, pv.second, e); } if (delay && hasSuccessorLiterals(iv, bd.ig)) { // Add an undelayed "ghost" vertex for this literal. u32 ghostId = tbi->literal_info[literalId].undelayed_id; DEBUG_PRINTF("creating delay ghost vertex, id=%u\n", ghostId); assert(ghostId != literalId); assert(tbi->literals.at(ghostId).delay == 0); // Adjust offsets, removing delay. u32 ghost_min = min_offset, ghost_max = max_offset; assert(ghost_min < ROSE_BOUND_INF && ghost_min >= delay); ghost_min -= delay; ghost_max -= ghost_max == ROSE_BOUND_INF ? 0 : delay; RoseVertex g_v = createVertex(tbi, ghostId, ghost_min, ghost_max); for (const auto &pv : parents) { const RoseInEdgeProps &edge_props = bd.ig[pv.second]; RoseEdge e = add_edge(pv.first, g_v, tbi->g); g[e].minBound = edge_props.minBound; g[e].maxBound = edge_props.maxBound; g[e].history = selectHistory(*tbi, bd, pv.second, e); DEBUG_PRINTF("parent edge has bounds [%u,%u]\n", edge_props.minBound, edge_props.maxBound); } for (auto &m : created) { tbi->ghost[m.second] = g_v; } } } /* ensure the holder does not accept any paths which do not end with lit */ static void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { DEBUG_PRINTF("strip '%s'\n", dumpString(lit).c_str()); set curr, next; curr.insert(g.accept); curr.insert(g.acceptEod); for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { next.clear(); for (auto curr_v : curr) { DEBUG_PRINTF("handling %zu\n", g[curr_v].index); vector next_cand; insert(&next_cand, next_cand.end(), inv_adjacent_vertices(curr_v, g)); clear_in_edges(curr_v, g); if (curr_v == g.acceptEod) { add_edge(g.accept, g.acceptEod, g); } for (auto v : next_cand) { assert(v != g.startDs); if (v == g.start || v == g.startDs || v == g.accept) { continue; } const CharReach &cr = g[v].char_reach; if (!overlaps(*it, cr)) { DEBUG_PRINTF("false edge %zu\n", g[v].index); continue; } NFAVertex v2 = clone_vertex(g, v); clone_in_edges(g, v, v2); add_edge(v2, curr_v, g); g[v2].char_reach &= *it; DEBUG_PRINTF("next <- %zu\n", g[v2].index); next.insert(v2); } } curr.swap(next); } pruneUseless(g); clearReports(g); assert(in_degree(g.accept, g) || in_degree(g.acceptEod, g) > 1); assert(allMatchStatesHaveReports(g)); DEBUG_PRINTF("graph has %zu vertices left\n", num_vertices(g)); } static RoseVertex tryForAnchoredVertex(RoseBuildImpl *tbi, const RoseInVertexProps &iv_info, const RoseInEdgeProps &ep) { if (ep.graph_lag && ep.graph_lag != iv_info.s.length()) { DEBUG_PRINTF("bad lag %u != %zu\n", ep.graph_lag, iv_info.s.length()); return RoseGraph::null_vertex(); /* TODO: better */ } const depth anchored_max_depth(tbi->cc.grey.maxAnchoredRegion); depth min_width(0), max_width(0); if (ep.graph.get()) { const depth graph_lag(ep.graph_lag); max_width = findMaxWidth(*ep.graph) + graph_lag; min_width = findMinWidth(*ep.graph) + graph_lag; if (proper_out_degree(ep.graph->startDs, *ep.graph)) { max_width = depth::infinity(); } } DEBUG_PRINTF("mw = %s; lag = %u\n", max_width.str().c_str(), ep.graph_lag); NGHolder h; if (ep.graph.get() && max_width <= anchored_max_depth) { cloneHolder(h, *ep.graph); /* add literal/dots */ if (ep.graph_lag) { assert(ep.graph_lag == iv_info.s.length()); appendLiteral(h, iv_info.s); } else { removeFalsePaths(h, iv_info.s); } } else if (!ep.graph.get() && ep.maxBound < ROSE_BOUND_INF && iv_info.s.length() + ep.maxBound <= tbi->cc.grey.maxAnchoredRegion) { if (ep.maxBound || ep.minBound) { /* TODO: handle, however these cases are not generated currently by ng_violet */ return RoseGraph::null_vertex(); } max_width = depth(ep.maxBound + iv_info.s.length()); min_width = depth(ep.minBound + iv_info.s.length()); add_edge(h.start, h.accept, h); appendLiteral(h, iv_info.s); } else { return RoseGraph::null_vertex(); } u32 anchored_exit_id = tbi->getNewLiteralId(); u32 remap_id = 0; DEBUG_PRINTF(" trying to add dfa stuff\n"); int rv = addToAnchoredMatcher(*tbi, h, anchored_exit_id, &remap_id); if (rv == ANCHORED_FAIL) { return RoseGraph::null_vertex(); } else if (rv == ANCHORED_REMAP) { anchored_exit_id = remap_id; } else { assert(rv == ANCHORED_SUCCESS); } // Store the literal itself in a side structure so that we can use it for // overlap calculations later. This may be obsolete when the old Rose // construction path (and its history selection code) goes away. rose_literal_id lit(iv_info.s, ROSE_ANCHORED, 0); tbi->anchoredLitSuffix.insert(make_pair(anchored_exit_id, lit)); assert(min_width <= anchored_max_depth); assert(max_width <= anchored_max_depth); assert(min_width <= max_width); /* Note: bounds are end-to-end as anchored lits are considered * to have 0 length. */ RoseVertex v = createAnchoredVertex(tbi, anchored_exit_id, min_width, max_width); return v; } static u32 findRoseAnchorFloatingOverlap(const RoseInEdgeProps &ep, const RoseInVertexProps &succ_vp) { /* we need to ensure there is enough history to find the successor literal * when we enable its group. */ if (!ep.graph.get()) { return 0; /* non overlapping */ } depth graph_min_width = findMinWidth(*ep.graph); u32 min_width = ep.graph_lag + graph_min_width; u32 s_len = succ_vp.s.length(); if (s_len <= min_width) { return 0; /* no overlap */ } u32 overlap = s_len - min_width; DEBUG_PRINTF("found overlap of %u\n", overlap); return overlap; } static void findRoseLiteralMask(const NGHolder &h, const u32 lag, vector &msk, vector &cmp) { if (lag >= HWLM_MASKLEN) { msk.clear(); cmp.clear(); return; } assert(in_degree(h.acceptEod, h) == 1); // no eod reports // Start with the set of reporter vertices for this rose. set curr, next; insert(&curr, inv_adjacent_vertices(h.accept, h)); assert(!curr.empty()); msk.assign(HWLM_MASKLEN, 0); cmp.assign(HWLM_MASKLEN, 0); size_t i = HWLM_MASKLEN - lag - 1; do { if (curr.empty() || contains(curr, h.start) || contains(curr, h.startDs)) { DEBUG_PRINTF("end of the road\n"); break; } next.clear(); CharReach cr; for (auto v : curr) { DEBUG_PRINTF("vertex %zu, reach %s\n", h[v].index, describeClass(h[v].char_reach).c_str()); cr |= h[v].char_reach; insert(&next, inv_adjacent_vertices(v, h)); } make_and_cmp_mask(cr, &msk[i], &cmp[i]); DEBUG_PRINTF("%zu: reach=%s, msk=%u, cmp=%u\n", i, describeClass(cr).c_str(), msk.at(i), cmp.at(i)); curr.swap(next); } while (i-- > 0); } static void doRoseLiteralVertex(RoseBuildImpl *tbi, bool use_eod_table, map > &vertex_map, const vector > &parents, RoseInVertex iv, const RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; const RoseInVertexProps &iv_info = ig[iv]; assert(iv_info.type == RIV_LITERAL); assert(!parents.empty()); /* start vertices should not be here */ // ng_violet should have ensured that mixed-sensitivity literals are no // longer than the benefits max width. assert(iv_info.s.length() <= MAX_MASK2_WIDTH || !mixed_sensitivity(iv_info.s)); // Rose graph construction process should have given us a min_offset. assert(iv_info.min_offset > 0); if (use_eod_table) { goto floating; } DEBUG_PRINTF("rose find vertex\n"); if (parents.size() == 1) { const RoseVertex u = parents.front().first; const RoseInEdgeProps &ep = ig[parents.front().second]; if (!tbi->isAnyStart(u)) { goto floating; } if (!ep.graph && ep.maxBound == ROSE_BOUND_INF) { goto floating; } if (ep.graph && !isAnchored(*ep.graph)) { goto floating; } DEBUG_PRINTF("cand for anchored maxBound %u, %p (%d)\n", ep.maxBound, ep.graph.get(), ep.graph ? (int)isAnchored(*ep.graph) : 3); /* need to check if putting iv into the anchored table would create * any bad_overlap relationships with its successor literals */ for (const auto &e : out_edges_range(iv, ig)) { RoseInVertex t = target(e, ig); u32 overlap = findRoseAnchorFloatingOverlap(ig[e], ig[t]); DEBUG_PRINTF("found overlap of %u\n", overlap); if (overlap > tbi->cc.grey.maxHistoryAvailable + 1) { goto floating; } } RoseVertex v = tryForAnchoredVertex(tbi, iv_info, ep); if (v != RoseGraph::null_vertex()) { DEBUG_PRINTF("add anchored literal vertex\n"); vertex_map[iv].emplace_back(v); return; } } floating: vector msk, cmp; if (tbi->cc.grey.roseHamsterMasks && in_degree(iv, ig) == 1) { RoseInEdge e = *in_edges(iv, ig).first; if (ig[e].graph) { findRoseLiteralMask(*ig[e].graph, ig[e].graph_lag, msk, cmp); } } u32 delay = iv_info.delay; rose_literal_table table = use_eod_table ? ROSE_EOD_ANCHORED : ROSE_FLOATING; u32 literalId = tbi->getLiteralId(iv_info.s, msk, cmp, delay, table); DEBUG_PRINTF("literal=%u (len=%zu, delay=%u, offsets=[%u,%u] '%s')\n", literalId, iv_info.s.length(), delay, iv_info.min_offset, iv_info.max_offset, dumpString(iv_info.s).c_str()); createVertices(tbi, vertex_map, parents, iv, iv_info.min_offset, iv_info.max_offset, literalId, delay, bd); } static unique_ptr makeRoseEodPrefix(const NGHolder &h, RoseBuildImpl &build, map, ReportID> &remap) { assert(generates_callbacks(h)); assert(!in_degree(h.accept, h)); auto gg = cloneHolder(h); NGHolder &g = *gg; g.kind = is_triggered(h) ? NFA_INFIX : NFA_PREFIX; // Move acceptEod edges over to accept. vector dead; for (const auto &e : in_edges_range(g.acceptEod, g)) { NFAVertex u = source(e, g); if (u == g.accept) { continue; } add_edge_if_not_present(u, g.accept, g); dead.emplace_back(e); if (!contains(remap, g[u].reports)) { remap[g[u].reports] = build.getNewNfaReport(); } g[u].reports = { remap[g[u].reports] }; } remove_edges(dead, g); return gg; } static u32 getEodEventID(RoseBuildImpl &build) { // Allocate the EOD event if it hasn't been already. if (build.eod_event_literal_id == MO_INVALID_IDX) { build.eod_event_literal_id = build.getLiteralId({}, 0, ROSE_EVENT); } return build.eod_event_literal_id; } static void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, const NGHolder &h) { assert(!build.isInETable(u)); RoseGraph &g = build.g; map, ReportID> report_remap; shared_ptr eod_leftfix = makeRoseEodPrefix(h, build, report_remap); u32 eod_event = getEodEventID(build); for (const auto &report_mapping : report_remap) { RoseVertex v = add_vertex(g); g[v].literals.insert(eod_event); build.literal_info[eod_event].vertices.insert(v); g[v].left.graph = eod_leftfix; g[v].left.leftfix_report = report_mapping.second; g[v].left.lag = 0; RoseEdge e1 = add_edge(u, v, g); g[e1].minBound = 0; g[e1].maxBound = ROSE_BOUND_INF; g[v].min_offset = add_rose_depth(g[u].min_offset, findMinWidth(*g[v].left.graph)); g[v].max_offset = ROSE_BOUND_INF; depth max_width = findMaxWidth(*g[v].left.graph); if (u != build.root && max_width.is_finite() && (!build.isAnyStart(u) || isPureAnchored(*g[v].left.graph))) { g[e1].maxBound = max_width; g[v].max_offset = add_rose_depth(g[u].max_offset, max_width); } g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix RoseVertex w = add_vertex(g); g[w].eod_accept = true; g[w].reports = report_mapping.first; g[w].min_offset = g[v].min_offset; g[w].max_offset = g[v].max_offset; RoseEdge e = add_edge(v, w, g); g[e].minBound = 0; g[e].maxBound = 0; /* No need to set history as the event is only delivered at the last * byte anyway - no need to invalidate stale entries. */ g[e].history = ROSE_ROLE_HISTORY_NONE; DEBUG_PRINTF("accept eod vertex (index=%zu)\n", g[w].index); } } static void doRoseAcceptVertex(RoseBuildImpl *tbi, const vector > &parents, RoseInVertex iv, const RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; assert(ig[iv].type == RIV_ACCEPT || ig[iv].type == RIV_ACCEPT_EOD); RoseGraph &g = tbi->g; for (const auto &pv : parents) { RoseVertex u = pv.first; const RoseInEdgeProps &edge_props = bd.ig[pv.second]; /* We need to duplicate the parent vertices if: * * 1) It already has a suffix, etc as we are going to add the specified * suffix, etc to the parents and we do not want to overwrite the * existing information. * * 2) We are making the an EOD accept and the vertex already has other * out-edges - The LAST_BYTE history used for EOD accepts is * incompatible with normal successors. As accepts are processed last we * do not need to worry about other normal successors being added later. */ if (g[u].suffix || !g[u].reports.empty() || (ig[iv].type == RIV_ACCEPT_EOD && out_degree(u, g) && !edge_props.graph) || (!isLeafNode(u, g) && !tbi->isAnyStart(u))) { DEBUG_PRINTF("duplicating for parent %zu\n", g[u].index); assert(!tbi->isAnyStart(u)); u = duplicate(tbi, u); g[u].suffix.reset(); g[u].eod_accept = false; } assert(!g[u].suffix); if (ig[iv].type == RIV_ACCEPT) { assert(!tbi->isAnyStart(u)); if (edge_props.dfa) { DEBUG_PRINTF("adding early dfa suffix to i%zu\n", g[u].index); g[u].suffix.rdfa = edge_props.dfa; g[u].suffix.dfa_min_width = findMinWidth(*edge_props.graph); g[u].suffix.dfa_max_width = findMaxWidth(*edge_props.graph); } else if (edge_props.graph) { DEBUG_PRINTF("adding suffix to i%zu\n", g[u].index); g[u].suffix.graph = edge_props.graph; assert(g[u].suffix.graph->kind == NFA_SUFFIX); /* TODO: set dfa_(min|max)_width */ } else if (edge_props.haig) { DEBUG_PRINTF("adding suffaig to i%zu\n", g[u].index); g[u].suffix.haig = edge_props.haig; } else { DEBUG_PRINTF("adding boring accept to i%zu\n", g[u].index); assert(!g[u].eod_accept); g[u].reports = ig[iv].reports; } } else { assert(ig[iv].type == RIV_ACCEPT_EOD); assert(!edge_props.haig); if (!edge_props.graph) { RoseVertex w = add_vertex(g); g[w].eod_accept = true; g[w].reports = ig[iv].reports; g[w].min_offset = g[u].min_offset; g[w].max_offset = g[u].max_offset; RoseEdge e = add_edge(u, w, g); g[e].minBound = 0; g[e].maxBound = 0; g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; DEBUG_PRINTF("accept eod vertex (index=%zu)\n", g[w].index); continue; } const NGHolder &h = *edge_props.graph; assert(!in_degree(h.accept, h)); assert(generates_callbacks(h)); if (tbi->isInETable(u)) { assert(h.kind == NFA_SUFFIX); assert(!tbi->isAnyStart(u)); /* etable can't/shouldn't use eod event */ DEBUG_PRINTF("adding suffix to i%zu\n", g[u].index); g[u].suffix.graph = edge_props.graph; continue; } makeEodEventLeftfix(*tbi, u, h); } } } static bool suitableForEod(const RoseInGraph &ig, vector topo, u32 *max_len, const CompileContext &cc) { map max_depth_from_eod; *max_len = 0; reverse(topo.begin(), topo.end()); /* we want to start at accept end */ for (auto v : topo) { u32 v_depth = 0; if (ig[v].type == RIV_ACCEPT) { DEBUG_PRINTF("[ACCEPT]\n"); // cppcheck-suppress useStlAlgorithm for (const auto &e : in_edges_range(v, ig)) { if (!ig[e].graph || !can_only_match_at_eod(*ig[e].graph)) { DEBUG_PRINTF("floating accept\n"); return false; } } } switch (ig[v].type) { case RIV_LITERAL: DEBUG_PRINTF("[LITERAL]\n"); break; case RIV_START: DEBUG_PRINTF("[START]\n"); break; case RIV_ANCHORED_START: DEBUG_PRINTF("[ANCHOR]\n"); break; case RIV_ACCEPT: break; case RIV_ACCEPT_EOD: DEBUG_PRINTF("[EOD]\n"); break; default: assert(0); DEBUG_PRINTF("????\n"); return false; } for (const auto &e : out_edges_range(v, ig)) { RoseInVertex t = target(e, ig); assert(contains(max_depth_from_eod, t)); u64a max_width; if (ig[v].type == RIV_START || ig[v].type == RIV_ANCHORED_START) { /* start itself doesn't need to be in history buffer * just need to make sure all succ literals are ok */ if (ig[t].type == RIV_LITERAL) { max_width = ig[t].s.length(); } else { max_width = 0; } if (ig[e].graph) { depth graph_max_width = findMaxWidth(*ig[e].graph); DEBUG_PRINTF("graph max width %s, lag %u\n", graph_max_width.str().c_str(), ig[e].graph_lag); if (!graph_max_width.is_finite()) { DEBUG_PRINTF("fail due to graph with inf max width\n"); return false; } max_width += graph_max_width; } } else if (ig[e].haig) { DEBUG_PRINTF("fail due to haig\n"); return false; } else if (ig[e].graph) { depth graph_max_width = findMaxWidth(*ig[e].graph); DEBUG_PRINTF("graph max width %s, lag %u\n", graph_max_width.str().c_str(), ig[e].graph_lag); if (!graph_max_width.is_finite()) { DEBUG_PRINTF("fail due to graph with inf max width\n"); return false; } max_width = ig[e].graph_lag + graph_max_width; } else { max_width = ig[e].maxBound; if (ig[t].type == RIV_LITERAL) { max_width += ig[t].s.length(); } } max_width += max_depth_from_eod[t]; if (max_width > ROSE_BOUND_INF) { max_width = ROSE_BOUND_INF; } DEBUG_PRINTF("max_width=%llu\n", max_width); ENSURE_AT_LEAST(&v_depth, (u32)max_width); } if (v_depth == ROSE_BOUND_INF || v_depth > cc.grey.maxHistoryAvailable) { DEBUG_PRINTF("not suitable for eod table %u\n", v_depth); return false; } max_depth_from_eod[v] = v_depth; ENSURE_AT_LEAST(max_len, v_depth); } DEBUG_PRINTF("to the eod table and beyond\n"); return true; } static void shift_accepts_to_end(const RoseInGraph &ig, vector &topo_order) { stable_partition(begin(topo_order), end(topo_order), [&](RoseInVertex v){ return !is_any_accept(v, ig); }); } static void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; /* add the pattern in to the main rose graph */ DEBUG_PRINTF("%srose pop\n", bd.som ? "som " : ""); /* Note: an input vertex may need to create several rose vertices. This is * primarily because a RoseVertex can only have 1 one leftfix */ map > vertex_map; vector v_order = topo_order(ig); shift_accepts_to_end(ig, v_order); u32 eod_space_required; bool use_eod_table = suitableForEod(ig, v_order, &eod_space_required, tbi->cc); if (use_eod_table) { ENSURE_AT_LEAST(&tbi->ematcher_region_size, eod_space_required); } assert(ig[v_order.front()].type == RIV_START || ig[v_order.front()].type == RIV_ANCHORED_START); for (RoseInVertex iv : v_order) { DEBUG_PRINTF("vertex %zu\n", ig[iv].index); if (ig[iv].type == RIV_START) { DEBUG_PRINTF("is root\n"); vertex_map[iv].emplace_back(tbi->root); continue; } else if (ig[iv].type == RIV_ANCHORED_START) { DEBUG_PRINTF("is anchored root\n"); vertex_map[iv].emplace_back(tbi->anchored_root); continue; } vector > parents; for (const auto &e : in_edges_range(iv, ig)) { RoseInVertex u = source(e, ig); assert(contains(vertex_map, u)); const vector &images = vertex_map[u]; // We should have no dupes. assert(set(images.begin(), images.end()).size() == images.size()); for (auto v_image : images) { // v_image should NOT already be in our parents list. assert(find_if(parents.begin(), parents.end(), [&v_image](const pair &p) { return p.first == v_image; }) == parents.end()); parents.emplace_back(v_image, e); if (tbi->isAnchored(v_image)) { assert(!use_eod_table); u32 overlap = findRoseAnchorFloatingOverlap(ig[e], ig[iv]); assert(overlap <= tbi->cc.grey.maxHistoryAvailable + 1); ENSURE_AT_LEAST(&tbi->max_rose_anchored_floating_overlap, overlap); } } } if (ig[iv].type == RIV_LITERAL) { DEBUG_PRINTF("LITERAL '%s'\n", dumpString(ig[iv].s).c_str()); assert(!isLeafNode(iv, ig)); doRoseLiteralVertex(tbi, use_eod_table, vertex_map, parents, iv, bd); } else { if (ig[iv].type == RIV_ACCEPT) { DEBUG_PRINTF("ACCEPT\n"); } else { assert(ig[iv].type == RIV_ACCEPT_EOD); DEBUG_PRINTF("ACCEPT_EOD\n"); } assert(isLeafNode(iv, ig)); /* accepts are final */ doRoseAcceptVertex(tbi, parents, iv, bd); } } DEBUG_PRINTF("done\n"); } template static bool empty(const GraphT &g) { typename GraphT::vertex_iterator vi, ve; tie(vi, ve) = vertices(g); return vi == ve; } static bool canImplementGraph(NGHolder &h, bool prefilter, const ReportManager &rm, const CompileContext &cc) { if (isImplementableNFA(h, &rm, cc)) { return true; } if (prefilter && cc.grey.prefilterReductions) { // If we're prefiltering, we can have another go with a reduced graph. UNUSED size_t numBefore = num_vertices(h); prefilterReductions(h, cc); UNUSED size_t numAfter = num_vertices(h); DEBUG_PRINTF("reduced from %zu to %zu vertices\n", numBefore, numAfter); if (isImplementableNFA(h, &rm, cc)) { return true; } } DEBUG_PRINTF("unable to build engine\n"); return false; } static bool predsAreDelaySensitive(const RoseInGraph &ig, RoseInVertex v) { assert(in_degree(v, ig)); // cppcheck-suppress useStlAlgorithm for (const auto &e : in_edges_range(v, ig)) { if (ig[e].graph || ig[e].haig) { DEBUG_PRINTF("edge graph\n"); return true; } if (ig[e].minBound || ig[e].maxBound != ROSE_BOUND_INF) { DEBUG_PRINTF("edge bounds\n"); return true; } RoseInVertex u = source(e, ig); if (ig[u].type == RIV_START) { continue; } if (ig[u].type != RIV_LITERAL) { DEBUG_PRINTF("unsafe pred vertex\n"); return true; } if (ig[u].delay) { DEBUG_PRINTF("pred has delay\n"); return true; } } return false; } static u32 maxAvailableDelay(const ue2_literal &pred_key, const ue2_literal &lit_key) { /* overly conservative if only part of the string is nocase */ string pred = pred_key.get_string(); string lit = lit_key.get_string(); if (pred_key.any_nocase() || lit_key.any_nocase()) { upperString(pred); upperString(lit); } string::size_type last = pred.rfind(lit); if (last == string::npos) { return MAX_DELAY; } u32 raw = pred.size() - last - 1; return MIN(raw, MAX_DELAY); } static u32 findMaxSafeDelay(const RoseInGraph &ig, RoseInVertex u, RoseInVertex v) { // First, check the overlap constraints on (u,v). size_t max_delay; if (ig[v].type == RIV_LITERAL) { DEBUG_PRINTF("lit->lit edge: '%s' -> '%s'\n", escapeString(ig[u].s).c_str(), escapeString(ig[v].s).c_str()); max_delay = maxAvailableDelay(ig[u].s, ig[v].s); } else if (ig[v].type == RIV_ACCEPT) { DEBUG_PRINTF("lit->accept edge: '%s' -> ACCEPT\n", escapeString(ig[u].s).c_str()); max_delay = MAX_DELAY; } else { assert(0); return 0; } DEBUG_PRINTF("max safe delay for this edge: %zu\n", max_delay); // Now consider the predecessors of u. for (const auto &e : in_edges_range(u, ig)) { RoseInVertex w = source(e, ig); if (ig[w].type == RIV_START) { continue; } assert(ig[w].type == RIV_LITERAL); assert(ig[w].delay == 0); DEBUG_PRINTF("pred lit->lit edge: '%s' -> '%s'\n", escapeString(ig[w].s).c_str(), escapeString(ig[u].s).c_str()); // We cannot delay the literal on u so much that a predecessor literal // could occur in the delayed region. For example, consider // 'barman.*foobar': if we allow 'foobar' to be delayed by 3, then // 'barman' could occur in the input string and race with 'foobar', as // in 'foobarman'. const size_t pred_len = ig[w].s.length(); size_t overlap = maxOverlap(ig[u].s, ig[w].s, 0); DEBUG_PRINTF("pred_len=%zu, overlap=%zu\n", pred_len, overlap); assert(overlap <= pred_len); size_t max_lit_delay = pred_len - min(overlap + 1, pred_len); DEBUG_PRINTF("overlap=%zu -> max_lit_delay=%zu\n", overlap, max_lit_delay); max_delay = min(max_delay, max_lit_delay); } DEBUG_PRINTF("max_delay=%zu\n", max_delay); assert(max_delay <= MAX_DELAY); return max_delay; } static bool transformInfixToDelay(const RoseInGraph &ig, const RoseInEdge &e, const CompileContext &cc, u32 *delay_out) { const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable : ROSE_BOUND_INF; const RoseInVertex u = source(e, ig), v = target(e, ig); const u32 graph_lag = ig[e].graph_lag; // Clone a copy of the graph, as we need to be able to roll back this // operation. NGHolder h; cloneHolder(h, *ig[e].graph); DEBUG_PRINTF("target literal: %s\n", dumpString(ig[v].s).c_str()); DEBUG_PRINTF("graph with %zu vertices and graph_lag %u\n", num_vertices(h), graph_lag); assert(graph_lag <= ig[v].s.length()); if (graph_lag < ig[v].s.length()) { size_t len = ig[v].s.length() - graph_lag; ue2_literal lit(ig[v].s.substr(0, len)); DEBUG_PRINTF("lit2=%s\n", dumpString(lit).c_str()); u32 delay2 = removeTrailingLiteralStates(h, lit, max_history); if (delay2 == MO_INVALID_IDX) { DEBUG_PRINTF("couldn't remove trailing literal\n"); return false; } if (delay2 != len) { DEBUG_PRINTF("couldn't remove entire trailing literal\n"); return false; } } PureRepeat repeat; if (!isPureRepeat(h, repeat)) { DEBUG_PRINTF("graph is not repeat\n"); return false; } DEBUG_PRINTF("graph is %s repeat\n", repeat.bounds.str().c_str()); if (!repeat.bounds.max.is_infinite()) { DEBUG_PRINTF("not inf\n"); return false; } if (!repeat.reach.all()) { DEBUG_PRINTF("non-dot reach\n"); return false; } u32 delay = ig[v].s.length() + repeat.bounds.min; if (delay > MAX_DELAY) { DEBUG_PRINTF("delay %u > MAX_DELAY\n", delay); return false; } if (delay + ig[u].s.length() - 1 > max_history) { DEBUG_PRINTF("delay too large for history\n"); return false; } *delay_out = delay; return true; } static void transformLiteralDelay(RoseInGraph &ig, const CompileContext &cc) { if (!cc.grey.roseTransformDelay) { return; } for (auto u : vertices_range(ig)) { if (ig[u].type != RIV_LITERAL) { continue; } if (out_degree(u, ig) != 1) { continue; } RoseInEdge e = *out_edges(u, ig).first; RoseInVertex v = target(e, ig); if (ig[v].type != RIV_LITERAL) { continue; } if (ig[e].haig) { continue; } if (!ig[e].graph) { continue; } if (predsAreDelaySensitive(ig, u)) { DEBUG_PRINTF("preds are delay sensitive\n"); continue; } u32 max_delay = findMaxSafeDelay(ig, u, v); DEBUG_PRINTF("lit->lit edge with graph: '%s' -> '%s'\n", escapeString(ig[u].s).c_str(), escapeString(ig[v].s).c_str()); u32 delay = 0; if (!transformInfixToDelay(ig, e, cc, &delay)) { continue; } if (delay > max_delay) { DEBUG_PRINTF("delay=%u > max_delay=%u\n", delay, max_delay); continue; } DEBUG_PRINTF("setting lit delay to %u and deleting graph\n", delay); ig[u].delay = delay; ig[u].min_offset = add_rose_depth(ig[u].min_offset, delay); ig[u].max_offset = add_rose_depth(ig[u].max_offset, delay); ig[e].graph_lag = 0; ig[e].graph.reset(); ig[e].minBound = 0; ig[e].maxBound = ROSE_BOUND_INF; } } static bool transformInfixToAnchBounds(const RoseInGraph &ig, const RoseInEdge &e, const CompileContext &cc, DepthMinMax *bounds) { const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable : ROSE_BOUND_INF; const RoseInVertex v = target(e, ig); const u32 graph_lag = ig[e].graph_lag; // Clone a copy of the graph, as we need to be able to roll back this // operation. NGHolder h; cloneHolder(h, *ig[e].graph); DEBUG_PRINTF("graph with %zu vertices and graph_lag %u\n", num_vertices(h), graph_lag); assert(graph_lag <= ig[v].s.length()); if (graph_lag < ig[v].s.length()) { size_t len = ig[v].s.length() - graph_lag; ue2_literal lit(ig[v].s.substr(0, len)); DEBUG_PRINTF("lit2=%s\n", dumpString(lit).c_str()); u32 delay2 = removeTrailingLiteralStates(h, lit, max_history); if (delay2 == MO_INVALID_IDX) { DEBUG_PRINTF("couldn't remove trailing literal\n"); return false; } if (delay2 != len) { DEBUG_PRINTF("couldn't remove entire trailing literal\n"); return false; } } PureRepeat repeat; if (!isPureRepeat(h, repeat)) { DEBUG_PRINTF("graph is not repeat\n"); return false; } DEBUG_PRINTF("graph is %s repeat\n", repeat.bounds.str().c_str()); if (!repeat.bounds.max.is_infinite()) { DEBUG_PRINTF("not inf\n"); return false; } if (!repeat.reach.all()) { DEBUG_PRINTF("non-dot reach\n"); return false; } *bounds = repeat.bounds; return true; } static void transformAnchoredLiteralOverlap(RoseInGraph &ig, RoseBuildData &bd, const CompileContext &cc) { if (!cc.grey.roseTransformDelay) { return; } for (const auto &e : edges_range(ig)) { const RoseInVertex u = source(e, ig); const RoseInVertex v = target(e, ig); if (ig[u].type != RIV_LITERAL || ig[v].type != RIV_LITERAL) { continue; } if (ig[e].haig || !ig[e].graph) { continue; } if (ig[u].min_offset != ig[u].max_offset) { DEBUG_PRINTF("u not fixed depth\n"); continue; } DEBUG_PRINTF("anch_lit->lit edge with graph: '%s' -> '%s'\n", escapeString(ig[u].s).c_str(), escapeString(ig[v].s).c_str()); DepthMinMax bounds; if (!transformInfixToAnchBounds(ig, e, cc, &bounds)) { continue; } DEBUG_PRINTF("setting bounds to %s and deleting graph\n", bounds.str().c_str()); ig[e].graph_lag = 0; ig[e].graph.reset(); ig[e].minBound = bounds.min; ig[e].maxBound = bounds.max.is_finite() ? (u32)bounds.max : ROSE_BOUND_INF; bd.anch_history_edges.insert(e); } } /** * \brief Transform small trailing dot repeat suffixes into delay on the last * literal. * * For example, the case /hatstand.*teakettle./s can just delay 'teakettle' +1 * rather than having a suffix to handle the dot. * * This transformation looks for literal->accept edges and transforms them if * appropriate. It doesn't handle complex cases where the literal has more than * one successor. */ static void transformSuffixDelay(RoseInGraph &ig, const CompileContext &cc) { if (!cc.grey.roseTransformDelay) { return; } const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable : ROSE_BOUND_INF; set modified_accepts; // may be dead after transform for (auto u : vertices_range(ig)) { if (ig[u].type != RIV_LITERAL) { continue; } if (out_degree(u, ig) != 1) { continue; } RoseInEdge e = *out_edges(u, ig).first; RoseInVertex v = target(e, ig); if (ig[v].type != RIV_ACCEPT) { continue; } if (ig[e].haig) { continue; } if (!ig[e].graph) { continue; } if (predsAreDelaySensitive(ig, u)) { DEBUG_PRINTF("preds are delay sensitive\n"); continue; } DEBUG_PRINTF("lit->accept edge with graph: lit='%s'\n", escapeString(ig[u].s).c_str()); const NGHolder &h = *ig[e].graph; const set reports = all_reports(h); if (reports.size() != 1) { DEBUG_PRINTF("too many reports\n"); continue; } PureRepeat repeat; if (!isPureRepeat(h, repeat)) { DEBUG_PRINTF("suffix graph is not repeat\n"); continue; } DEBUG_PRINTF("suffix graph is %s repeat\n", repeat.bounds.str().c_str()); if (!repeat.reach.all()) { DEBUG_PRINTF("non-dot reach\n"); continue; } if (repeat.bounds.min != repeat.bounds.max || repeat.bounds.min > depth(MAX_DELAY)) { DEBUG_PRINTF("repeat is variable or too large\n"); continue; } u32 max_delay = findMaxSafeDelay(ig, u, v); u32 delay = repeat.bounds.min; if (delay > max_delay) { DEBUG_PRINTF("delay=%u > max_delay=%u\n", delay, max_delay); continue; } if (delay + ig[u].s.length() - 1 > max_history) { DEBUG_PRINTF("delay too large for history\n"); continue; } DEBUG_PRINTF("setting lit delay to %u and removing suffix\n", delay); ig[u].delay = delay; ig[u].min_offset = add_rose_depth(ig[u].min_offset, delay); ig[u].max_offset = add_rose_depth(ig[u].max_offset, delay); // Construct a new accept vertex for this report and remove edge e. // (This allows us to cope if v has more than one in-edge). RoseInVertex v2 = add_vertex(RoseInVertexProps::makeAccept(reports), ig); add_edge(u, v2, ig); remove_edge(e, ig); modified_accepts.insert(v); } DEBUG_PRINTF("%zu modified accepts\n", modified_accepts.size()); for (auto v : modified_accepts) { if (in_degree(v, ig) == 0) { DEBUG_PRINTF("removing accept vertex with no preds\n"); remove_vertex(v, ig); } } } #ifndef NDEBUG static bool validateKinds(const RoseInGraph &g) { for (const auto &e : edges_range(g)) { if (g[e].graph && g[e].graph->kind != whatRoseIsThis(g, e)) { return false; } } return true; } #endif bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter) { DEBUG_PRINTF("trying to rose\n"); assert(validateKinds(ig)); assert(hasCorrectlyNumberedVertices(ig)); if (::ue2::empty(ig)) { assert(0); return false; } const unique_ptr in_ptr = cloneRoseGraph(ig); RoseInGraph &in = *in_ptr; RoseBuildData bd(in, false); transformLiteralDelay(in, cc); transformAnchoredLiteralOverlap(in, bd, cc); transformSuffixDelay(in, cc); renumber_vertices(in); assert(validateKinds(in)); insertion_ordered_map> graphs; for (const auto &e : edges_range(in)) { if (!in[e].graph) { assert(!in[e].dfa); assert(!in[e].haig); continue; // no graph } if (in[e].haig || in[e].dfa) { /* Early DFAs/Haigs are always implementable (we've already built * the raw DFA). */ continue; } NGHolder *h = in[e].graph.get(); assert(isCorrectlyTopped(*h)); graphs[h].emplace_back(e); } vector graph_edges; for (const auto &m : graphs) { NGHolder *h = m.first; if (!canImplementGraph(*h, prefilter, rm, cc)) { return false; } insert(&graph_edges, graph_edges.end(), m.second); } /* we are now past the point of no return. We can start making irreversible changes to the rose graph, etc */ for (const auto &e : graph_edges) { assert(in[e].graph); assert(!in[e].haig); NGHolder &h = *in[e].graph; DEBUG_PRINTF("handling %p\n", &h); assert(allMatchStatesHaveReports(h)); if (!generates_callbacks(whatRoseIsThis(in, e)) && in[target(e, in)].type != RIV_ACCEPT_EOD) { set_report(h, getNewNfaReport()); } } populateRoseGraph(this, bd); return true; } bool RoseBuildImpl::addSombeRose(const RoseInGraph &ig) { DEBUG_PRINTF("rose is trying to consume a sombe\n"); assert(validateKinds(ig)); if (::ue2::empty(ig)) { assert(0); return false; } RoseBuildData bd(ig, true); for (const auto &e : edges_range(ig)) { if (!ig[e].graph) { continue; // no graph } DEBUG_PRINTF("handling %p\n", ig[e].graph.get()); assert(allMatchStatesHaveReports(*ig[e].graph)); assert(ig[e].haig); } populateRoseGraph(this, bd); return true; } bool roseCheckRose(const RoseInGraph &ig, bool prefilter, const ReportManager &rm, const CompileContext &cc) { assert(validateKinds(ig)); if (::ue2::empty(ig)) { assert(0); return false; } vector graphs; for (const auto &e : edges_range(ig)) { if (!ig[e].graph) { continue; // no graph } if (ig[e].haig) { // Haigs are always implementable (we've already built the raw DFA). continue; } graphs.emplace_back(ig[e].graph.get()); } // cppcheck-suppress useStlAlgorithm for (const auto &g : graphs) { if (!canImplementGraph(*g, prefilter, rm, cc)) { return false; } } return true; } void RoseBuildImpl::add(bool anchored, bool eod, const ue2_literal &lit, const flat_set &reports) { assert(!reports.empty()); if (cc.grey.floodAsPuffette && !anchored && !eod && is_flood(lit) && lit.length() > 3) { DEBUG_PRINTF("adding as puffette\n"); const CharReach &cr = *lit.begin(); for (const auto &report : reports) { addOutfix(raw_puff(lit.length(), true, report, cr, true)); } return; } RoseInGraph ig; RoseInVertex start = add_vertex(RoseInVertexProps::makeStart(anchored), ig); RoseInVertex accept = add_vertex( eod ? RoseInVertexProps::makeAcceptEod(set()) : RoseInVertexProps::makeAccept(set()), ig); RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); add_edge(start, v, RoseInEdgeProps(0U, anchored ? 0U : ROSE_BOUND_INF), ig); add_edge(v, accept, RoseInEdgeProps(0U, 0U), ig); calcVertexOffsets(ig); ig[accept].reports.insert(reports.begin(), reports.end()); addRose(ig, false); } static u32 findMaxBAWidth(const NGHolder &h) { // Must be bi-anchored: no out-edges from startDs (other than its // self-loop), no in-edges to accept. if (out_degree(h.startDs, h) > 1 || in_degree(h.accept, h)) { return ROSE_BOUND_INF; } depth d = findMaxWidth(h); assert(d.is_reachable()); if (!d.is_finite()) { return ROSE_BOUND_INF; } return d; } static void populateOutfixInfo(OutfixInfo &outfix, const NGHolder &h, const RoseBuildImpl &tbi) { outfix.maxBAWidth = findMaxBAWidth(h); outfix.minWidth = findMinWidth(h); outfix.maxWidth = findMaxWidth(h); outfix.maxOffset = findMaxOffset(h, tbi.rm); populateReverseAccelerationInfo(outfix.rev_info, h); } static bool addEodOutfix(RoseBuildImpl &build, const NGHolder &h) { map, ReportID> report_remap; shared_ptr eod_leftfix = makeRoseEodPrefix(h, build, report_remap); bool nfa_ok = isImplementableNFA(h, &build.rm, build.cc); /* TODO: check if early dfa is possible */ if (!nfa_ok) { DEBUG_PRINTF("could not build as NFA\n"); return false; } u32 eod_event = getEodEventID(build); auto &g = build.g; for (const auto &report_mapping : report_remap) { RoseVertex v = add_vertex(g); g[v].literals.insert(eod_event); build.literal_info[eod_event].vertices.insert(v); g[v].left.graph = eod_leftfix; g[v].left.leftfix_report = report_mapping.second; g[v].left.lag = 0; RoseEdge e1 = add_edge(build.anchored_root, v, g); g[e1].minBound = 0; g[e1].maxBound = ROSE_BOUND_INF; g[v].min_offset = findMinWidth(*eod_leftfix); g[v].max_offset = ROSE_BOUND_INF; depth max_width = findMaxWidth(*g[v].left.graph); if (max_width.is_finite() && isPureAnchored(*eod_leftfix)) { g[e1].maxBound = max_width; g[v].max_offset = max_width; } g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix RoseVertex w = add_vertex(g); g[w].eod_accept = true; g[w].reports = report_mapping.first; g[w].min_offset = g[v].min_offset; g[w].max_offset = g[v].max_offset; RoseEdge e = add_edge(v, w, g); g[e].minBound = 0; g[e].maxBound = 0; g[e].history = ROSE_ROLE_HISTORY_NONE; DEBUG_PRINTF("accept eod vertex (index=%zu)\n", g[w].index); } return true; } bool RoseBuildImpl::addOutfix(const NGHolder &h) { DEBUG_PRINTF("%zu vertices, %zu edges\n", num_vertices(h), num_edges(h)); /* TODO: handle more than one report */ if (!in_degree(h.accept, h) && all_reports(h).size() == 1 && addEodOutfix(*this, h)) { return true; } const u32 nfa_states = isImplementableNFA(h, &rm, cc); if (nfa_states) { DEBUG_PRINTF("implementable as an NFA in %u states\n", nfa_states); } else { DEBUG_PRINTF("not implementable as an NFA\n"); } bool dfa_cand = !nfa_states || nfa_states > 128 /* slow model */ || can_exhaust(h, rm); /* can be pruned */ unique_ptr rdfa; if (!nfa_states || cc.grey.roseMcClellanOutfix == 2 || (cc.grey.roseMcClellanOutfix == 1 && dfa_cand)) { rdfa = buildMcClellan(h, &rm, cc.grey); } if (!nfa_states && !rdfa) { DEBUG_PRINTF("could not build as either an NFA or a DFA\n"); return false; } if (rdfa) { outfixes.emplace_back(OutfixInfo(std::move(rdfa))); } else { outfixes.emplace_back(OutfixInfo(cloneHolder(h))); } populateOutfixInfo(outfixes.back(), h, *this); return true; } bool RoseBuildImpl::addOutfix(const NGHolder &h, const raw_som_dfa &haig) { DEBUG_PRINTF("haig with %zu states\n", haig.states.size()); outfixes.emplace_back(OutfixInfo(std::make_unique(haig))); populateOutfixInfo(outfixes.back(), h, *this); return true; /* failure is not yet an option */ } bool RoseBuildImpl::addOutfix(const raw_puff &rp) { if (!mpv_outfix) { mpv_outfix = std::make_unique(MpvProto()); } auto *mpv = mpv_outfix->mpv(); assert(mpv); mpv->puffettes.emplace_back(rp); mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not ba */ mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); mpv_outfix->maxWidth = rp.unbounded ? depth::infinity() : max(mpv_outfix->maxWidth, depth(rp.repeats)); if (mpv_outfix->maxOffset == ROSE_BOUND_INF || rp.unbounded) { mpv_outfix->maxOffset = ROSE_BOUND_INF; } else { mpv_outfix->maxOffset = MAX(mpv_outfix->maxOffset, rp.repeats); } return true; /* failure is not yet an option */ } bool RoseBuildImpl::addChainTail(const raw_puff &rp, u32 *queue_out, u32 *event_out) { if (!mpv_outfix) { mpv_outfix = std::make_unique(MpvProto()); } auto *mpv = mpv_outfix->mpv(); assert(mpv); mpv->triggered_puffettes.emplace_back(rp); mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not ba */ mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); mpv_outfix->maxWidth = rp.unbounded ? depth::infinity() : max(mpv_outfix->maxWidth, depth(rp.repeats)); mpv_outfix->maxOffset = ROSE_BOUND_INF; /* TODO: we could get information from * the caller */ *queue_out = mpv_outfix->get_queue(qif); *event_out = MQE_TOP_FIRST + mpv->triggered_puffettes.size() - 1; return true; /* failure is not yet an option */ } static bool prepAcceptForAddAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &w, NFAVertex u, const vector &vertexDepths, map &depthMap, map> &reportMap, map &allocated_reports, flat_set &added_lit_ids) { const depth max_anchored_depth(tbi.cc.grey.maxAnchoredRegion); const size_t index = w[u].index; assert(index < vertexDepths.size()); const DepthMinMax &d = vertexDepths.at(index); for (const auto &int_report : w[u].reports) { assert(int_report != MO_INVALID_IDX); u32 lit_id; if (!contains(allocated_reports, int_report)) { lit_id = tbi.getNewLiteralId(); added_lit_ids.insert(lit_id); allocated_reports[int_report] = lit_id; } else { lit_id = allocated_reports[int_report]; } reportMap[u].insert(lit_id); if (!contains(depthMap, lit_id)) { depthMap[lit_id] = d; } else { depthMap[lit_id] = unionDepthMinMax(depthMap[lit_id], d); } if (depthMap[lit_id].max > max_anchored_depth) { DEBUG_PRINTF("depth=%s exceeds maxAnchoredRegion=%u\n", depthMap[lit_id].max.str().c_str(), tbi.cc.grey.maxAnchoredRegion); return false; } } return true; } // Failure path for addAnchoredAcyclic: removes the literal IDs that have been // added to support anchored NFAs. Assumes that they are a contiguous range at // the end of the RoseBuildImpl::literal_info vector. static void removeAddedLiterals(RoseBuildImpl &tbi, const flat_set &lit_ids) { if (lit_ids.empty()) { return; } DEBUG_PRINTF("remove last %zu literals\n", lit_ids.size()); // lit_ids should be a contiguous range. assert(lit_ids.size() == *lit_ids.rbegin() - *lit_ids.begin() + 1); assert(*lit_ids.rbegin() == tbi.literals.size() - 1); assert(all_of_in(lit_ids, [&](u32 lit_id) { return lit_id < tbi.literal_info.size() && tbi.literals.at(lit_id).table == ROSE_ANCHORED && tbi.literal_info[lit_id].vertices.empty(); })); tbi.literals.erase_back(lit_ids.size()); assert(tbi.literals.size() == *lit_ids.begin()); // lit_ids should be at the end of tbi.literal_info. assert(tbi.literal_info.size() == *lit_ids.rbegin() + 1); tbi.literal_info.resize(*lit_ids.begin()); // remove all ids in lit_ids } bool RoseBuildImpl::addAnchoredAcyclic(const NGHolder &h) { auto vertexDepths = calcDepthsFrom(h, h.start); map > reportMap; /* NFAVertex -> literal ids */ map depthMap; /* literal id -> min/max depth */ map allocated_reports; /* report -> literal id */ flat_set added_lit_ids; /* literal ids added for this NFA */ for (auto v : inv_adjacent_vertices_range(h.accept, h)) { // cppcheck-suppress useStlAlgorithm if (!prepAcceptForAddAnchoredNFA(*this, h, v, vertexDepths, depthMap, reportMap, allocated_reports, added_lit_ids)) { removeAddedLiterals(*this, added_lit_ids); return false; } } map allocated_reports_eod; /* report -> literal id */ for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { if (v == h.accept) { continue; } if (!prepAcceptForAddAnchoredNFA(*this, h, v, vertexDepths, depthMap, reportMap, allocated_reports_eod, added_lit_ids)) { removeAddedLiterals(*this, added_lit_ids); return false; } } assert(!reportMap.empty()); int rv = addAnchoredNFA(*this, h, reportMap); if (rv != ANCHORED_FAIL) { assert(rv != ANCHORED_REMAP); DEBUG_PRINTF("added anchored nfa\n"); /* add edges to the rose graph to bubble the match up */ for (const auto &m : allocated_reports) { const ReportID &report = m.first; const u32 &lit_id = m.second; assert(depthMap[lit_id].max.is_finite()); u32 minBound = depthMap[lit_id].min; u32 maxBound = depthMap[lit_id].max; RoseVertex v = createAnchoredVertex(this, lit_id, minBound, maxBound); g[v].reports.insert(report); } for (const auto &m : allocated_reports_eod) { const ReportID &report = m.first; const u32 &lit_id = m.second; assert(depthMap[lit_id].max.is_finite()); u32 minBound = depthMap[lit_id].min; u32 maxBound = depthMap[lit_id].max; RoseVertex v = createAnchoredVertex(this, lit_id, minBound, maxBound); RoseVertex eod = add_vertex(g); g[eod].eod_accept = true; g[eod].reports.insert(report); g[eod].min_offset = g[v].min_offset; g[eod].max_offset = g[v].max_offset; add_edge(v, eod, g); } return true; } else { DEBUG_PRINTF("failed to add anchored nfa\n"); removeAddedLiterals(*this, added_lit_ids); return false; } } } // namespace ue2