/* * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "rose_build_add_internal.h" #include "rose_build_impl.h" #include "ue2common.h" #include "grey.h" #include "rose_build_anchored.h" #include "rose_in_util.h" #include "hwlm/hwlm_literal.h" #include "nfa/goughcompile.h" #include "nfa/nfa_api_queue.h" #include "nfagraph/ng_depth.h" #include "nfagraph/ng_dump.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_limex.h" #include "nfagraph/ng_mcclellan.h" #include "nfagraph/ng_prefilter.h" #include "nfagraph/ng_prune.h" #include "nfagraph/ng_region.h" #include "nfagraph/ng_repeat.h" #include "nfagraph/ng_reports.h" #include "nfagraph/ng_rose.h" #include "nfagraph/ng_util.h" #include "nfagraph/ng_width.h" #include "util/charreach.h" #include "util/charreach_util.h" #include "util/compare.h" #include "util/compile_context.h" #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" #include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/ue2string.h" #include "util/verify_types.h" #include #include #include #include #include #include #include using namespace std; namespace ue2 { /** * \brief Data used by most of the construction code in this file. */ struct RoseBuildData : boost::noncopyable { RoseBuildData(const RoseInGraph &ig_in, bool som_in) : ig(ig_in), som(som_in) {} /** Input rose graph. */ const RoseInGraph &ig; /** Mapping from engine graph to constructed DFA for pre-build DFAs. */ ue2::unordered_map > early_dfas; /** Edges we've transformed (in \ref transformAnchoredLiteralOverlap) which * require ANCH history to prevent overlap. */ ue2::unordered_set anch_history_edges; /** True if we're tracking Start of Match. */ bool som; }; static ReportID findReportId(const NGHolder &g) { /* prefix/infix always have an edge to accept and only 1 reportid initially */ assert(in_degree(g.accept, g)); const auto &rep = g[*inv_adjacent_vertices(g.accept, g).first].reports; assert(!rep.empty()); return *rep.begin(); } static RoseVertex createVertex(RoseBuildImpl *build, u32 literalId, u32 min_offset, u32 max_offset) { RoseGraph &g = build->g; // add to tree RoseVertex v = add_vertex(g); g[v].idx = build->vertexIndex++; g[v].min_offset = min_offset; g[v].max_offset = max_offset; DEBUG_PRINTF("insert vertex %zu into literal %u's vertex set\n", g[v].idx, literalId); g[v].literals.insert(literalId); build->literal_info[literalId].vertices.insert(v); return v; } RoseVertex createVertex(RoseBuildImpl *build, const RoseVertex parent, u32 minBound, u32 maxBound, u32 literalId, size_t literalLength, const ue2::flat_set &reports) { assert(parent != RoseGraph::null_vertex()); RoseGraph &g = build->g; // add to tree (offsets set latter) RoseVertex v = createVertex(build, literalId, 0U, 0U); /* fill in report information */ g[v].reports.insert(reports.begin(), reports.end()); RoseEdge e; bool added; tie(e, added) = add_edge(parent, v, g); assert(added); DEBUG_PRINTF("adding edge (%u, %u) to parent\n", minBound, maxBound); g[e].minBound = minBound; g[e].maxBound = maxBound; g[e].rose_top = 0; u32 min_offset = add_rose_depth(g[parent].min_offset, minBound); u32 max_offset = add_rose_depth(g[parent].max_offset, maxBound); /* take literal length into account for offsets */ const u32 lit_len = verify_u32(literalLength); min_offset = add_rose_depth(min_offset, lit_len); max_offset = add_rose_depth(max_offset, lit_len); g[v].min_offset = min_offset; g[v].max_offset = max_offset; return v; } static RoseVertex createAnchoredVertex(RoseBuildImpl *build, u32 literalId, u32 min_offset, u32 max_offset) { RoseGraph &g = build->g; RoseVertex v = createVertex(build, literalId, min_offset, max_offset); DEBUG_PRINTF("created anchored vertex %zu with lit id %u\n", g[v].idx, literalId); RoseEdge e = add_edge(build->anchored_root, v, g).first; g[e].minBound = min_offset; g[e].maxBound = max_offset; return v; } static RoseVertex duplicate(RoseBuildImpl *build, RoseVertex v) { RoseGraph &g = build->g; RoseVertex w = add_vertex(g[v], g); g[w].idx = build->vertexIndex++; DEBUG_PRINTF("added vertex %zu\n", g[w].idx); for (auto lit_id : g[w].literals) { build->literal_info[lit_id].vertices.insert(w); } for (const auto &e : in_edges_range(v, g)) { RoseVertex s = source(e, g); add_edge(s, w, g[e], g); DEBUG_PRINTF("added edge (%zu,%zu)\n", g[s].idx, g[w].idx); } return w; } namespace { struct created_key { explicit created_key(const RoseInEdgeProps &trep) : prefix(trep.graph.get()), lag(trep.graph_lag) { } bool operator<(const created_key &b) const { const created_key &a = *this; ORDER_CHECK(prefix); ORDER_CHECK(lag); return false; } NGHolder *prefix; u32 lag; }; } static bool isPureAnchored(const NGHolder &h) { return !proper_out_degree(h.startDs, h); } static RoseRoleHistory selectHistory(const RoseBuildImpl &tbi, const RoseBuildData &bd, const RoseInEdge &rose_edge, const RoseEdge &e) { const RoseGraph &g = tbi.g; const RoseVertex u = source(e, g), v = target(e, g); const bool fixed_offset_src = g[u].fixedOffset(); const bool has_bounds = g[e].minBound || (g[e].maxBound != ROSE_BOUND_INF); DEBUG_PRINTF("edge %zu->%zu, bounds=[%u,%u], fixed_u=%d, prefix=%d\n", g[u].idx, g[v].idx, g[e].minBound, g[e].maxBound, (int)g[u].fixedOffset(), (int)g[v].left); if (g[v].left) { // Roles with prefix engines have their history handled by that prefix. assert(!contains(bd.anch_history_edges, rose_edge)); return ROSE_ROLE_HISTORY_NONE; } if (contains(bd.anch_history_edges, rose_edge)) { DEBUG_PRINTF("needs anch history\n"); return ROSE_ROLE_HISTORY_ANCH; } if (fixed_offset_src && has_bounds) { DEBUG_PRINTF("needs anch history\n"); return ROSE_ROLE_HISTORY_ANCH; } return ROSE_ROLE_HISTORY_NONE; } static bool hasSuccessorLiterals(RoseInVertex iv, const RoseInGraph &ig) { for (auto v : adjacent_vertices_range(iv, ig)) { if (ig[v].type != RIV_ACCEPT) { return true; } } return false; } static void createVertices(RoseBuildImpl *tbi, map > &vertex_map, const vector > &parents, RoseInVertex iv, u32 min_offset, u32 max_offset, u32 literalId, u32 delay, const RoseBuildData &bd) { RoseGraph &g = tbi->g; DEBUG_PRINTF("vertex has %zu parents\n", parents.size()); map created; for (const auto &pv : parents) { RoseVertex w; const RoseInEdgeProps &edge_props = bd.ig[pv.second]; shared_ptr prefix_graph = edge_props.graph; u32 prefix_lag = edge_props.graph_lag; created_key key(edge_props); if (!contains(created, key)) { assert(prefix_graph || !edge_props.haig); w = createVertex(tbi, literalId, min_offset, max_offset); created[key] = w; if (prefix_graph) { g[w].left.graph = prefix_graph; if (contains(bd.early_dfas, prefix_graph.get())) { g[w].left.dfa = bd.early_dfas.at(prefix_graph.get()); } g[w].left.haig = edge_props.haig; g[w].left.lag = prefix_lag; // The graph already has its report id allocated - find it. g[w].left.leftfix_report = findReportId(*prefix_graph); if (g[w].left.dfa || g[w].left.haig) { assert(prefix_graph); g[w].left.dfa_min_width = findMinWidth(*prefix_graph); g[w].left.dfa_max_width = findMaxWidth(*prefix_graph); } } if (bd.som && !g[w].left.haig) { /* no prefix - som based on literal start */ assert(!prefix_graph); g[w].som_adjust = tbi->literals.right.at(literalId).elength(); DEBUG_PRINTF("set som_adjust to %u\n", g[w].som_adjust); } DEBUG_PRINTF(" adding new vertex idx=%zu\n", tbi->g[w].idx); vertex_map[iv].push_back(w); } else { w = created[key]; } RoseVertex p = pv.first; RoseEdge e; bool added; tie(e, added) = add_edge(p, w, g); assert(added); DEBUG_PRINTF("adding edge (%u,%u) to parent\n", edge_props.minBound, edge_props.maxBound); g[e].minBound = edge_props.minBound; if (p != tbi->root && g[w].left.graph && (!tbi->isAnyStart(p) || isPureAnchored(*g[w].left.graph))) { depth mw = findMaxWidth(*g[w].left.graph); if (mw.is_infinite()) { g[e].maxBound = ROSE_BOUND_INF; } else { DEBUG_PRINTF("setting max to %s + %u\n", mw.str().c_str(), prefix_lag); g[e].maxBound = prefix_lag + mw; } } else { g[e].maxBound = edge_props.maxBound; } g[e].rose_top = 0; g[e].history = selectHistory(*tbi, bd, pv.second, e); } if (delay && hasSuccessorLiterals(iv, bd.ig)) { // Add an undelayed "ghost" vertex for this literal. u32 ghostId = tbi->literal_info[literalId].undelayed_id; DEBUG_PRINTF("creating delay ghost vertex, id=%u\n", ghostId); assert(ghostId != literalId); assert(tbi->literals.right.at(ghostId).delay == 0); // Adjust offsets, removing delay. u32 ghost_min = min_offset, ghost_max = max_offset; assert(ghost_min < ROSE_BOUND_INF && ghost_min >= delay); ghost_min -= delay; ghost_max -= ghost_max == ROSE_BOUND_INF ? 0 : delay; RoseVertex g_v = createVertex(tbi, ghostId, ghost_min, ghost_max); for (const auto &pv : parents) { const RoseInEdgeProps &edge_props = bd.ig[pv.second]; RoseEdge e = add_edge(pv.first, g_v, tbi->g).first; g[e].minBound = edge_props.minBound; g[e].maxBound = edge_props.maxBound; g[e].history = selectHistory(*tbi, bd, pv.second, e); DEBUG_PRINTF("parent edge has bounds [%u,%u]\n", edge_props.minBound, edge_props.maxBound); } for (auto &m : created) { tbi->ghost[m.second] = g_v; } } } /* ensure the holder does not accept any paths which do not end with lit */ static void removeFalsePaths(NGHolder &g, const ue2_literal &lit) { DEBUG_PRINTF("strip '%s'\n", dumpString(lit).c_str()); set curr, next; curr.insert(g.accept); curr.insert(g.acceptEod); for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) { next.clear(); for (auto curr_v : curr) { DEBUG_PRINTF("handling %u\n", g[curr_v].index); vector next_cand; insert(&next_cand, next_cand.end(), inv_adjacent_vertices(curr_v, g)); clear_in_edges(curr_v, g); if (curr_v == g.acceptEod) { add_edge(g.accept, g.acceptEod, g); } for (auto v : next_cand) { assert(v != g.startDs); if (v == g.start || v == g.startDs || v == g.accept) { continue; } const CharReach &cr = g[v].char_reach; if (!overlaps(*it, cr)) { DEBUG_PRINTF("false edge %u\n", g[v].index); continue; } NFAVertex v2 = clone_vertex(g, v); clone_in_edges(g, v, v2); add_edge(v2, curr_v, g); g[v2].char_reach &= *it; DEBUG_PRINTF("next <- %u\n", g[v2].index); next.insert(v2); } } curr.swap(next); } pruneUseless(g); clearReports(g); assert(in_degree(g.accept, g) || in_degree(g.acceptEod, g) > 1); assert(allMatchStatesHaveReports(g)); DEBUG_PRINTF("graph has %zu vertices left\n", num_vertices(g)); } static RoseVertex tryForAnchoredVertex(RoseBuildImpl *tbi, const RoseInVertexProps &iv_info, const RoseInEdgeProps &ep) { if (ep.graph_lag && ep.graph_lag != iv_info.s.length()) { DEBUG_PRINTF("bad lag %u != %zu\n", ep.graph_lag, iv_info.s.length()); return RoseGraph::null_vertex(); /* TODO: better */ } const depth anchored_max_depth(tbi->cc.grey.maxAnchoredRegion); depth min_width(0), max_width(0); if (ep.graph.get()) { const depth graph_lag(ep.graph_lag); max_width = findMaxWidth(*ep.graph) + graph_lag; min_width = findMinWidth(*ep.graph) + graph_lag; if (proper_out_degree(ep.graph->startDs, *ep.graph)) { max_width = depth::infinity(); } } DEBUG_PRINTF("mw = %s; lag = %u\n", max_width.str().c_str(), ep.graph_lag); NGHolder h; if (ep.graph.get() && max_width <= anchored_max_depth) { cloneHolder(h, *ep.graph); /* add literal/dots */ if (ep.graph_lag) { assert(ep.graph_lag == iv_info.s.length()); appendLiteral(h, iv_info.s); } else { removeFalsePaths(h, iv_info.s); } } else if (!ep.graph.get() && ep.maxBound < ROSE_BOUND_INF && iv_info.s.length() + ep.maxBound <= tbi->cc.grey.maxAnchoredRegion) { if (ep.maxBound || ep.minBound) { /* TODO: handle, however these cases are not generated currently by ng_rose */ return RoseGraph::null_vertex(); } max_width = depth(ep.maxBound + iv_info.s.length()); min_width = depth(ep.minBound + iv_info.s.length()); add_edge(h.start, h.accept, h); appendLiteral(h, iv_info.s); } else { return RoseGraph::null_vertex(); } u32 anchored_exit_id = tbi->getNewLiteralId(); u32 remap_id = 0; DEBUG_PRINTF(" trying to add dfa stuff\n"); int rv = addToAnchoredMatcher(*tbi, h, anchored_exit_id, &remap_id); if (rv == ANCHORED_FAIL) { return RoseGraph::null_vertex(); } else if (rv == ANCHORED_REMAP) { anchored_exit_id = remap_id; } else { assert(rv == ANCHORED_SUCCESS); } // Store the literal itself in a side structure so that we can use it for // overlap calculations later. This may be obsolete when the old Rose // construction path (and its history selection code) goes away. rose_literal_id lit(iv_info.s, ROSE_ANCHORED, 0); tbi->anchoredLitSuffix.insert(make_pair(anchored_exit_id, lit)); assert(min_width <= anchored_max_depth); assert(max_width <= anchored_max_depth); assert(min_width <= max_width); /* Note: bounds are end-to-end as anchored lits are considered * to have 0 length. */ RoseVertex v = createAnchoredVertex(tbi, anchored_exit_id, min_width, max_width); return v; } static u32 findRoseAnchorFloatingOverlap(const RoseInEdgeProps &ep, const RoseInVertexProps &succ_vp) { /* we need to ensure there is enough history to find the successor literal * when we enable its group. */ if (!ep.graph.get()) { return 0; /* non overlapping */ } depth graph_min_width = findMinWidth(*ep.graph); u32 min_width = ep.graph_lag + graph_min_width; u32 s_len = succ_vp.s.length(); if (s_len <= min_width) { return 0; /* no overlap */ } u32 overlap = s_len - min_width; DEBUG_PRINTF("found overlap of %u\n", overlap); return overlap; } static void findRoseLiteralMask(const NGHolder &h, const u32 lag, vector &msk, vector &cmp) { if (lag >= HWLM_MASKLEN) { msk.clear(); cmp.clear(); return; } assert(in_degree(h.acceptEod, h) == 1); // no eod reports // Start with the set of reporter vertices for this rose. set curr, next; insert(&curr, inv_adjacent_vertices(h.accept, h)); assert(!curr.empty()); msk.assign(HWLM_MASKLEN, 0); cmp.assign(HWLM_MASKLEN, 0); size_t i = HWLM_MASKLEN - lag - 1; do { if (curr.empty() || contains(curr, h.start) || contains(curr, h.startDs)) { DEBUG_PRINTF("end of the road\n"); break; } next.clear(); CharReach cr; for (auto v : curr) { DEBUG_PRINTF("vertex %u, reach %s\n", h[v].index, describeClass(h[v].char_reach).c_str()); cr |= h[v].char_reach; insert(&next, inv_adjacent_vertices(v, h)); } make_and_cmp_mask(cr, &msk[i], &cmp[i]); DEBUG_PRINTF("%zu: reach=%s, msk=%u, cmp=%u\n", i, describeClass(cr).c_str(), msk.at(i), cmp.at(i)); curr.swap(next); } while (i-- > 0); } static void doRoseLiteralVertex(RoseBuildImpl *tbi, bool use_eod_table, map > &vertex_map, const vector > &parents, RoseInVertex iv, const RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; const RoseInVertexProps &iv_info = ig[iv]; assert(iv_info.type == RIV_LITERAL); assert(!parents.empty()); /* start vertices should not be here */ // ng_rose should have ensured that mixed-sensitivity literals are no // longer than the benefits max width. assert(iv_info.s.length() <= MAX_MASK2_WIDTH || !mixed_sensitivity(iv_info.s)); // Rose graph construction process should have given us a min_offset. assert(iv_info.min_offset > 0); if (use_eod_table) { goto floating; } DEBUG_PRINTF("rose find vertex\n"); if (parents.size() == 1) { const RoseVertex u = parents.front().first; const RoseInEdgeProps &ep = ig[parents.front().second]; if (!tbi->isAnyStart(u)) { goto floating; } if (!ep.graph && ep.maxBound == ROSE_BOUND_INF) { goto floating; } if (ep.graph && !isAnchored(*ep.graph)) { goto floating; } DEBUG_PRINTF("cand for anchored maxBound %u, %p (%d)\n", ep.maxBound, ep.graph.get(), ep.graph ? (int)isAnchored(*ep.graph) : 3); /* need to check if putting iv into the anchored table would create * any bad_overlap relationships with its successor literals */ for (const auto &e : out_edges_range(iv, ig)) { RoseInVertex t = target(e, ig); u32 overlap = findRoseAnchorFloatingOverlap(ig[e], ig[t]); DEBUG_PRINTF("found overlap of %u\n", overlap); if (overlap > tbi->cc.grey.maxHistoryAvailable + 1) { goto floating; } } RoseVertex v = tryForAnchoredVertex(tbi, iv_info, ep); if (v != RoseGraph::null_vertex()) { DEBUG_PRINTF("add anchored literal vertex\n"); vertex_map[iv].push_back(v); return; } } floating: vector msk, cmp; if (tbi->cc.grey.roseHamsterMasks && in_degree(iv, ig) == 1) { RoseInEdge e = *in_edges(iv, ig).first; if (ig[e].graph) { findRoseLiteralMask(*ig[e].graph, ig[e].graph_lag, msk, cmp); } } u32 delay = iv_info.delay; rose_literal_table table = use_eod_table ? ROSE_EOD_ANCHORED : ROSE_FLOATING; u32 literalId = tbi->getLiteralId(iv_info.s, msk, cmp, delay, table); DEBUG_PRINTF("literal=%u (len=%zu, delay=%u, offsets=[%u,%u] '%s')\n", literalId, iv_info.s.length(), delay, iv_info.min_offset, iv_info.max_offset, dumpString(iv_info.s).c_str()); createVertices(tbi, vertex_map, parents, iv, iv_info.min_offset, iv_info.max_offset, literalId, delay, bd); } static unique_ptr makeRoseEodPrefix(const NGHolder &h, RoseBuildImpl &build, map, ReportID> &remap) { assert(generates_callbacks(h)); assert(!in_degree(h.accept, h)); auto gg = cloneHolder(h); NGHolder &g = *gg; g.kind = is_triggered(h) ? NFA_INFIX : NFA_PREFIX; // Move acceptEod edges over to accept. vector dead; for (const auto &e : in_edges_range(g.acceptEod, g)) { NFAVertex u = source(e, g); if (u == g.accept) { continue; } add_edge_if_not_present(u, g.accept, g); dead.push_back(e); if (!contains(remap, g[u].reports)) { remap[g[u].reports] = build.getNewNfaReport(); } g[u].reports = { remap[g[u].reports] }; } remove_edges(dead, g); return gg; } static u32 getEodEventID(RoseBuildImpl &build) { // Allocate the EOD event if it hasn't been already. if (build.eod_event_literal_id == MO_INVALID_IDX) { build.eod_event_literal_id = build.getLiteralId({}, 0, ROSE_EVENT); } return build.eod_event_literal_id; } static void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, const NGHolder &h) { assert(!build.isInETable(u)); RoseGraph &g = build.g; map, ReportID> report_remap; shared_ptr eod_leftfix = makeRoseEodPrefix(h, build, report_remap); u32 eod_event = getEodEventID(build); for (const auto &report_mapping : report_remap) { RoseVertex v = add_vertex(g); g[v].idx = build.vertexIndex++; g[v].literals.insert(eod_event); build.literal_info[eod_event].vertices.insert(v); g[v].left.graph = eod_leftfix; g[v].left.leftfix_report = report_mapping.second; g[v].left.lag = 0; RoseEdge e1 = add_edge(u, v, g).first; g[e1].minBound = 0; g[e1].maxBound = ROSE_BOUND_INF; g[v].min_offset = add_rose_depth(g[u].min_offset, findMinWidth(*g[v].left.graph)); g[v].max_offset = ROSE_BOUND_INF; depth max_width = findMaxWidth(*g[v].left.graph); if (u != build.root && max_width.is_finite() && (!build.isAnyStart(u) || isPureAnchored(*g[v].left.graph))) { g[e1].maxBound = max_width; g[v].max_offset = add_rose_depth(g[u].max_offset, max_width); } g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix RoseVertex w = add_vertex(g); g[w].idx = build.vertexIndex++; g[w].eod_accept = true; g[w].reports = report_mapping.first; g[w].min_offset = g[v].min_offset; g[w].max_offset = g[v].max_offset; RoseEdge e = add_edge(v, w, g).first; g[e].minBound = 0; g[e].maxBound = 0; g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); } } static void doRoseAcceptVertex(RoseBuildImpl *tbi, const vector > &parents, RoseInVertex iv, const RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; assert(ig[iv].type == RIV_ACCEPT || ig[iv].type == RIV_ACCEPT_EOD); RoseGraph &g = tbi->g; for (const auto &pv : parents) { RoseVertex u = pv.first; const RoseInEdgeProps &edge_props = bd.ig[pv.second]; /* We need to duplicate the parent vertices if: * * 1) It already has a suffix, etc as we are going to add the specified * suffix, etc to the parents and we do not want to overwrite the * existing information. * * 2) We are making the an EOD accept and the vertex already has other * out-edges - The LAST_BYTE history used for EOD accepts is * incompatible with normal successors. As accepts are processed last we * do not need to worry about other normal successors being added later. */ if (g[u].suffix || !g[u].reports.empty() || (ig[iv].type == RIV_ACCEPT_EOD && out_degree(u, g) && !edge_props.graph) || (!isLeafNode(u, g) && !tbi->isAnyStart(u))) { DEBUG_PRINTF("duplicating for parent %zu\n", g[u].idx); assert(!tbi->isAnyStart(u)); u = duplicate(tbi, u); g[u].suffix.reset(); g[u].eod_accept = false; } assert(!g[u].suffix); if (ig[iv].type == RIV_ACCEPT) { assert(!tbi->isAnyStart(u)); if (contains(bd.early_dfas, edge_props.graph.get())) { DEBUG_PRINTF("adding early dfa suffix to i%zu\n", g[u].idx); g[u].suffix.rdfa = bd.early_dfas.at(edge_props.graph.get()); g[u].suffix.dfa_min_width = findMinWidth(*edge_props.graph); g[u].suffix.dfa_max_width = findMaxWidth(*edge_props.graph); } else if (edge_props.graph) { DEBUG_PRINTF("adding suffix to i%zu\n", g[u].idx); g[u].suffix.graph = edge_props.graph; assert(g[u].suffix.graph->kind == NFA_SUFFIX); /* TODO: set dfa_(min|max)_width */ } else if (edge_props.haig) { DEBUG_PRINTF("adding suffaig to i%zu\n", g[u].idx); g[u].suffix.haig = edge_props.haig; } else { DEBUG_PRINTF("adding boring accept to i%zu\n", g[u].idx); assert(!g[u].eod_accept); g[u].reports = ig[iv].reports; } } else { assert(ig[iv].type == RIV_ACCEPT_EOD); assert(!edge_props.haig); if (!edge_props.graph) { RoseVertex w = add_vertex(g); g[w].idx = tbi->vertexIndex++; g[w].eod_accept = true; g[w].reports = ig[iv].reports; g[w].min_offset = g[u].min_offset; g[w].max_offset = g[u].max_offset; RoseEdge e = add_edge(u, w, g).first; g[e].minBound = 0; g[e].maxBound = 0; g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; DEBUG_PRINTF("accept eod vertex (idx=%zu)\n", g[w].idx); continue; } const NGHolder &h = *edge_props.graph; assert(!in_degree(h.accept, h)); assert(generates_callbacks(h)); if (tbi->isInETable(u)) { assert(h.kind == NFA_SUFFIX); assert(!tbi->isAnyStart(u)); /* etable can't/shouldn't use eod event */ DEBUG_PRINTF("adding suffix to i%zu\n", g[u].idx); g[u].suffix.graph = edge_props.graph; continue; } makeEodEventLeftfix(*tbi, u, h); } } } static bool suitableForEod(const RoseInGraph &ig, vector topo, u32 *max_len, const CompileContext &cc) { map max_depth_from_eod; *max_len = 0; reverse(topo.begin(), topo.end()); /* we want to start at accept end */ for (auto v : topo) { u32 v_depth = 0; if (ig[v].type == RIV_ACCEPT) { DEBUG_PRINTF("[ACCEPT]\n"); for (const auto &e : in_edges_range(v, ig)) { if (!ig[e].graph || !can_only_match_at_eod(*ig[e].graph)) { DEBUG_PRINTF("floating accept\n"); return false; } } } switch (ig[v].type) { case RIV_LITERAL: DEBUG_PRINTF("[LITERAL]\n"); break; case RIV_START: DEBUG_PRINTF("[START]\n"); break; case RIV_ANCHORED_START: DEBUG_PRINTF("[ANCHOR]\n"); break; case RIV_ACCEPT: break; case RIV_ACCEPT_EOD: DEBUG_PRINTF("[EOD]\n"); break; default: assert(0); DEBUG_PRINTF("????\n"); return false; } for (const auto &e : out_edges_range(v, ig)) { RoseInVertex t = target(e, ig); assert(contains(max_depth_from_eod, t)); u64a max_width; if (ig[v].type == RIV_START || ig[v].type == RIV_ANCHORED_START) { /* start itself doesn't need to be in history buffer * just need to make sure all succ literals are ok */ if (ig[t].type == RIV_LITERAL) { max_width = ig[t].s.length(); } else { max_width = 0; } if (ig[e].graph) { depth graph_max_width = findMaxWidth(*ig[e].graph); DEBUG_PRINTF("graph max width %s, lag %u\n", graph_max_width.str().c_str(), ig[e].graph_lag); if (!graph_max_width.is_finite()) { DEBUG_PRINTF("fail due to graph with inf max width\n"); return false; } max_width += graph_max_width; } } else if (ig[e].haig) { DEBUG_PRINTF("fail due to haig\n"); return false; } else if (ig[e].graph) { depth graph_max_width = findMaxWidth(*ig[e].graph); DEBUG_PRINTF("graph max width %s, lag %u\n", graph_max_width.str().c_str(), ig[e].graph_lag); if (!graph_max_width.is_finite()) { DEBUG_PRINTF("fail due to graph with inf max width\n"); return false; } max_width = ig[e].graph_lag + graph_max_width; } else { max_width = ig[e].maxBound; if (ig[t].type == RIV_LITERAL) { max_width += ig[t].s.length(); } } max_width += max_depth_from_eod[t]; if (max_width > ROSE_BOUND_INF) { max_width = ROSE_BOUND_INF; } DEBUG_PRINTF("max_width=%llu\n", max_width); ENSURE_AT_LEAST(&v_depth, (u32)max_width); } if (v_depth == ROSE_BOUND_INF || v_depth > cc.grey.maxHistoryAvailable) { DEBUG_PRINTF("not suitable for eod table %u\n", v_depth); return false; } max_depth_from_eod[v] = v_depth; ENSURE_AT_LEAST(max_len, v_depth); } DEBUG_PRINTF("to the eod table and beyond\n"); return true; } static void shift_accepts_to_end(const RoseInGraph &ig, vector &topo_order) { stable_partition(begin(topo_order), end(topo_order), [&](RoseInVertex v){ return !is_any_accept(v, ig); }); } static void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { const RoseInGraph &ig = bd.ig; /* add the pattern in to the main rose graph */ DEBUG_PRINTF("%srose pop\n", bd.som ? "som " : ""); /* Note: an input vertex may need to create several rose vertices. This is * primarily because a RoseVertex can only have 1 one leftfix */ map > vertex_map; vector v_order = topo_order(ig); shift_accepts_to_end(ig, v_order); u32 eod_space_required; bool use_eod_table = suitableForEod(ig, v_order, &eod_space_required, tbi->cc); if (use_eod_table) { ENSURE_AT_LEAST(&tbi->ematcher_region_size, eod_space_required); } assert(ig[v_order.front()].type == RIV_START || ig[v_order.front()].type == RIV_ANCHORED_START); for (RoseInVertex iv : v_order) { DEBUG_PRINTF("vertex %p\n", iv); if (ig[iv].type == RIV_START) { DEBUG_PRINTF("is root\n"); vertex_map[iv].push_back(tbi->root); continue; } else if (ig[iv].type == RIV_ANCHORED_START) { DEBUG_PRINTF("is anchored root\n"); vertex_map[iv].push_back(tbi->anchored_root); continue; } vector > parents; for (const auto &e : in_edges_range(iv, ig)) { RoseInVertex u = source(e, ig); assert(contains(vertex_map, u)); const vector &images = vertex_map[u]; // We should have no dupes. assert(set(images.begin(), images.end()).size() == images.size()); for (auto v_image : images) { // v_image should NOT already be in our parents list. assert(find_if(parents.begin(), parents.end(), [&v_image](const pair &p) { return p.first == v_image; }) == parents.end()); parents.emplace_back(v_image, e); if (tbi->isAnchored(v_image)) { assert(!use_eod_table); u32 overlap = findRoseAnchorFloatingOverlap(ig[e], ig[iv]); assert(overlap <= tbi->cc.grey.maxHistoryAvailable + 1); ENSURE_AT_LEAST(&tbi->max_rose_anchored_floating_overlap, overlap); } } } if (ig[iv].type == RIV_LITERAL) { DEBUG_PRINTF("LITERAL '%s'\n", dumpString(ig[iv].s).c_str()); assert(!isLeafNode(iv, ig)); doRoseLiteralVertex(tbi, use_eod_table, vertex_map, parents, iv, bd); } else { if (ig[iv].type == RIV_ACCEPT) { DEBUG_PRINTF("ACCEPT\n"); } else { assert(ig[iv].type == RIV_ACCEPT_EOD); DEBUG_PRINTF("ACCEPT_EOD\n"); } assert(isLeafNode(iv, ig)); /* accepts are final */ doRoseAcceptVertex(tbi, parents, iv, bd); } } DEBUG_PRINTF("done\n"); } template static bool empty(const GraphT &g) { typename GraphT::vertex_iterator vi, ve; tie(vi, ve) = vertices(g); return vi == ve; } /* We only try to implement as a dfa if a non-nullptr as_dfa is provided to return * the raw dfa to. */ static bool canImplementGraph(RoseBuildImpl *tbi, const RoseInGraph &in, NGHolder &h, const vector &edges, bool prefilter, const ReportManager &rm, const CompileContext &cc, bool finalChance, unique_ptr *as_dfa) { assert(!edges.empty()); assert(&*in[edges[0]].graph == &h); assert(h.kind == whatRoseIsThis(in, edges[0])); if (isImplementableNFA(h, &rm, cc)) { return true; } if (prefilter && cc.grey.prefilterReductions) { // If we're prefiltering, we can have another go with a reduced graph. UNUSED size_t numBefore = num_vertices(h); prefilterReductions(h, cc); UNUSED size_t numAfter = num_vertices(h); DEBUG_PRINTF("reduced from %zu to %zu vertices\n", numBefore, numAfter); if (isImplementableNFA(h, &rm, cc)) { return true; } } if (as_dfa) { switch (h.kind) { case NFA_OUTFIX: /* 'prefix' of eod */ case NFA_PREFIX: if (!cc.grey.earlyMcClellanPrefix) { return false; } break; case NFA_INFIX: if (!cc.grey.earlyMcClellanInfix) { return false; } break; case NFA_SUFFIX: if (!cc.grey.earlyMcClellanSuffix) { return false; } break; case NFA_EAGER_PREFIX: case NFA_REV_PREFIX: case NFA_OUTFIX_RAW: DEBUG_PRINTF("kind %u\n", (u32)h.kind); assert(0); } assert(!*as_dfa); assert(tbi); vector > triggers; u32 min_offset = ~0U; u32 max_offset = 0; for (const auto &e : edges) { RoseInVertex s = source(e, in); RoseInVertex t = target(e, in); if (in[s].type == RIV_LITERAL) { triggers.push_back(as_cr_seq(in[s].s)); } if (in[t].type == RIV_ACCEPT_EOD) { /* TODO: support eod prefixes */ return false; } ENSURE_AT_LEAST(&max_offset, in[s].max_offset); LIMIT_TO_AT_MOST(&min_offset, in[s].min_offset); } if (!generates_callbacks(h)) { setReportId(h, tbi->getNewNfaReport()); } bool single_trigger = min_offset == max_offset; DEBUG_PRINTF("trying for mcclellan (%u, %u)\n", min_offset, max_offset); *as_dfa = buildMcClellan(h, &rm, single_trigger, triggers, cc.grey, finalChance); if (*as_dfa) { return true; } } DEBUG_PRINTF("unable to build engine\n"); return false; } static bool predsAreDelaySensitive(const RoseInGraph &ig, RoseInVertex v) { assert(in_degree(v, ig)); for (const auto &e : in_edges_range(v, ig)) { if (ig[e].graph || ig[e].haig) { DEBUG_PRINTF("edge graph\n"); return true; } if (ig[e].minBound || ig[e].maxBound != ROSE_BOUND_INF) { DEBUG_PRINTF("edge bounds\n"); return true; } RoseInVertex u = source(e, ig); if (ig[u].type == RIV_START) { continue; } if (ig[u].type != RIV_LITERAL) { DEBUG_PRINTF("unsafe pred vertex\n"); return true; } if (ig[u].delay) { DEBUG_PRINTF("pred has delay\n"); return true; } } return false; } static u32 maxAvailableDelay(const ue2_literal &pred_key, const ue2_literal &lit_key) { /* overly conservative if only part of the string is nocase */ string pred = pred_key.get_string(); string lit = lit_key.get_string(); if (pred_key.any_nocase() || lit_key.any_nocase()) { upperString(pred); upperString(lit); } string::size_type last = pred.rfind(lit); if (last == string::npos) { return MAX_DELAY; } u32 raw = pred.size() - last - 1; return MIN(raw, MAX_DELAY); } static u32 findMaxSafeDelay(const RoseInGraph &ig, RoseInVertex u, RoseInVertex v) { // First, check the overlap constraints on (u,v). size_t max_delay; if (ig[v].type == RIV_LITERAL) { DEBUG_PRINTF("lit->lit edge: '%s' -> '%s'\n", escapeString(ig[u].s).c_str(), escapeString(ig[v].s).c_str()); max_delay = maxAvailableDelay(ig[u].s, ig[v].s); } else if (ig[v].type == RIV_ACCEPT) { DEBUG_PRINTF("lit->accept edge: '%s' -> ACCEPT\n", escapeString(ig[u].s).c_str()); max_delay = MAX_DELAY; } else { assert(0); return 0; } DEBUG_PRINTF("max safe delay for this edge: %zu\n", max_delay); // Now consider the predecessors of u. for (const auto &e : in_edges_range(u, ig)) { RoseInVertex w = source(e, ig); if (ig[w].type == RIV_START) { continue; } assert(ig[w].type == RIV_LITERAL); assert(ig[w].delay == 0); DEBUG_PRINTF("pred lit->lit edge: '%s' -> '%s'\n", escapeString(ig[w].s).c_str(), escapeString(ig[u].s).c_str()); // We cannot delay the literal on u so much that a predecessor literal // could occur in the delayed region. For example, consider // 'barman.*foobar': if we allow 'foobar' to be delayed by 3, then // 'barman' could occur in the input string and race with 'foobar', as // in 'foobarman'. const size_t pred_len = ig[w].s.length(); size_t overlap = maxOverlap(ig[u].s, ig[w].s, 0); DEBUG_PRINTF("pred_len=%zu, overlap=%zu\n", pred_len, overlap); assert(overlap <= pred_len); size_t max_lit_delay = pred_len - min(overlap + 1, pred_len); DEBUG_PRINTF("overlap=%zu -> max_lit_delay=%zu\n", overlap, max_lit_delay); max_delay = min(max_delay, max_lit_delay); } DEBUG_PRINTF("max_delay=%zu\n", max_delay); assert(max_delay <= MAX_DELAY); return max_delay; } static bool transformInfixToDelay(const RoseInGraph &ig, const RoseInEdge &e, const CompileContext &cc, u32 *delay_out) { const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable : ROSE_BOUND_INF; const RoseInVertex u = source(e, ig), v = target(e, ig); const u32 graph_lag = ig[e].graph_lag; // Clone a copy of the graph, as we need to be able to roll back this // operation. NGHolder h; cloneHolder(h, *ig[e].graph); DEBUG_PRINTF("target literal: %s\n", dumpString(ig[v].s).c_str()); DEBUG_PRINTF("graph with %zu vertices and graph_lag %u\n", num_vertices(h), graph_lag); assert(graph_lag <= ig[v].s.length()); if (graph_lag < ig[v].s.length()) { size_t len = ig[v].s.length() - graph_lag; ue2_literal lit(ig[v].s.substr(0, len)); DEBUG_PRINTF("lit2=%s\n", dumpString(lit).c_str()); u32 delay2 = removeTrailingLiteralStates(h, lit, max_history); if (delay2 == MO_INVALID_IDX) { DEBUG_PRINTF("couldn't remove trailing literal\n"); return false; } if (delay2 != len) { DEBUG_PRINTF("couldn't remove entire trailing literal\n"); return false; } } PureRepeat repeat; if (!isPureRepeat(h, repeat)) { DEBUG_PRINTF("graph is not repeat\n"); return false; } DEBUG_PRINTF("graph is %s repeat\n", repeat.bounds.str().c_str()); if (!repeat.bounds.max.is_infinite()) { DEBUG_PRINTF("not inf\n"); return false; } if (!repeat.reach.all()) { DEBUG_PRINTF("non-dot reach\n"); return false; } u32 delay = ig[v].s.length() + repeat.bounds.min; if (delay > MAX_DELAY) { DEBUG_PRINTF("delay %u > MAX_DELAY\n", delay); return false; } if (delay + ig[u].s.length() - 1 > max_history) { DEBUG_PRINTF("delay too large for history\n"); return false; } *delay_out = delay; return true; } static void transformLiteralDelay(RoseInGraph &ig, const CompileContext &cc) { if (!cc.grey.roseTransformDelay) { return; } for (auto u : vertices_range(ig)) { if (ig[u].type != RIV_LITERAL) { continue; } if (out_degree(u, ig) != 1) { continue; } RoseInEdge e = *out_edges(u, ig).first; RoseInVertex v = target(e, ig); if (ig[v].type != RIV_LITERAL) { continue; } if (ig[e].haig) { continue; } if (!ig[e].graph) { continue; } if (predsAreDelaySensitive(ig, u)) { DEBUG_PRINTF("preds are delay sensitive\n"); continue; } u32 max_delay = findMaxSafeDelay(ig, u, v); DEBUG_PRINTF("lit->lit edge with graph: '%s' -> '%s'\n", escapeString(ig[u].s).c_str(), escapeString(ig[v].s).c_str()); u32 delay = 0; if (!transformInfixToDelay(ig, e, cc, &delay)) { continue; } if (delay > max_delay) { DEBUG_PRINTF("delay=%u > max_delay=%u\n", delay, max_delay); continue; } DEBUG_PRINTF("setting lit delay to %u and deleting graph\n", delay); ig[u].delay = delay; ig[u].min_offset = add_rose_depth(ig[u].min_offset, delay); ig[u].max_offset = add_rose_depth(ig[u].max_offset, delay); ig[e].graph_lag = 0; ig[e].graph.reset(); ig[e].minBound = 0; ig[e].maxBound = ROSE_BOUND_INF; } } static bool transformInfixToAnchBounds(const RoseInGraph &ig, const RoseInEdge &e, const CompileContext &cc, DepthMinMax *bounds) { const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable : ROSE_BOUND_INF; const RoseInVertex v = target(e, ig); const u32 graph_lag = ig[e].graph_lag; // Clone a copy of the graph, as we need to be able to roll back this // operation. NGHolder h; cloneHolder(h, *ig[e].graph); DEBUG_PRINTF("graph with %zu vertices and graph_lag %u\n", num_vertices(h), graph_lag); assert(graph_lag <= ig[v].s.length()); if (graph_lag < ig[v].s.length()) { size_t len = ig[v].s.length() - graph_lag; ue2_literal lit(ig[v].s.substr(0, len)); DEBUG_PRINTF("lit2=%s\n", dumpString(lit).c_str()); u32 delay2 = removeTrailingLiteralStates(h, lit, max_history); if (delay2 == MO_INVALID_IDX) { DEBUG_PRINTF("couldn't remove trailing literal\n"); return false; } if (delay2 != len) { DEBUG_PRINTF("couldn't remove entire trailing literal\n"); return false; } } PureRepeat repeat; if (!isPureRepeat(h, repeat)) { DEBUG_PRINTF("graph is not repeat\n"); return false; } DEBUG_PRINTF("graph is %s repeat\n", repeat.bounds.str().c_str()); if (!repeat.bounds.max.is_infinite()) { DEBUG_PRINTF("not inf\n"); return false; } if (!repeat.reach.all()) { DEBUG_PRINTF("non-dot reach\n"); return false; } *bounds = repeat.bounds; return true; } static void transformAnchoredLiteralOverlap(RoseInGraph &ig, RoseBuildData &bd, const CompileContext &cc) { if (!cc.grey.roseTransformDelay) { return; } for (const auto &e : edges_range(ig)) { const RoseInVertex u = source(e, ig); const RoseInVertex v = target(e, ig); if (ig[u].type != RIV_LITERAL || ig[v].type != RIV_LITERAL) { continue; } if (ig[e].haig || !ig[e].graph) { continue; } if (ig[u].min_offset != ig[u].max_offset) { DEBUG_PRINTF("u not fixed depth\n"); continue; } DEBUG_PRINTF("anch_lit->lit edge with graph: '%s' -> '%s'\n", escapeString(ig[u].s).c_str(), escapeString(ig[v].s).c_str()); DepthMinMax bounds; if (!transformInfixToAnchBounds(ig, e, cc, &bounds)) { continue; } DEBUG_PRINTF("setting bounds to %s and deleting graph\n", bounds.str().c_str()); ig[e].graph_lag = 0; ig[e].graph.reset(); ig[e].minBound = bounds.min; ig[e].maxBound = bounds.max.is_finite() ? (u32)bounds.max : ROSE_BOUND_INF; bd.anch_history_edges.insert(e); } } /** * \brief Transform small trailing dot repeat suffixes into delay on the last * literal. * * For example, the case /hatstand.*teakettle./s can just delay 'teakettle' +1 * rather than having a suffix to handle the dot. * * This transformation looks for literal->accept edges and transforms them if * appropriate. It doesn't handle complex cases where the literal has more than * one successor. */ static void transformSuffixDelay(RoseInGraph &ig, const CompileContext &cc) { if (!cc.grey.roseTransformDelay) { return; } const u32 max_history = cc.streaming ? cc.grey.maxHistoryAvailable : ROSE_BOUND_INF; set modified_accepts; // may be dead after transform for (auto u : vertices_range(ig)) { if (ig[u].type != RIV_LITERAL) { continue; } if (out_degree(u, ig) != 1) { continue; } RoseInEdge e = *out_edges(u, ig).first; RoseInVertex v = target(e, ig); if (ig[v].type != RIV_ACCEPT) { continue; } if (ig[e].haig) { continue; } if (!ig[e].graph) { continue; } if (predsAreDelaySensitive(ig, u)) { DEBUG_PRINTF("preds are delay sensitive\n"); continue; } DEBUG_PRINTF("lit->accept edge with graph: lit='%s'\n", escapeString(ig[u].s).c_str()); const NGHolder &h = *ig[e].graph; const set reports = all_reports(h); if (reports.size() != 1) { DEBUG_PRINTF("too many reports\n"); continue; } PureRepeat repeat; if (!isPureRepeat(h, repeat)) { DEBUG_PRINTF("suffix graph is not repeat\n"); continue; } DEBUG_PRINTF("suffix graph is %s repeat\n", repeat.bounds.str().c_str()); if (!repeat.reach.all()) { DEBUG_PRINTF("non-dot reach\n"); continue; } if (repeat.bounds.min != repeat.bounds.max || repeat.bounds.min > depth(MAX_DELAY)) { DEBUG_PRINTF("repeat is variable or too large\n"); continue; } u32 max_delay = findMaxSafeDelay(ig, u, v); u32 delay = repeat.bounds.min; if (delay > max_delay) { DEBUG_PRINTF("delay=%u > max_delay=%u\n", delay, max_delay); continue; } if (delay + ig[u].s.length() - 1 > max_history) { DEBUG_PRINTF("delay too large for history\n"); continue; } DEBUG_PRINTF("setting lit delay to %u and removing suffix\n", delay); ig[u].delay = delay; ig[u].min_offset = add_rose_depth(ig[u].min_offset, delay); ig[u].max_offset = add_rose_depth(ig[u].max_offset, delay); // Construct a new accept vertex for this report and remove edge e. // (This allows us to cope if v has more than one in-edge). RoseInVertex v2 = add_vertex(RoseInVertexProps::makeAccept(reports), ig); add_edge(u, v2, ig); remove_edge(e, ig); modified_accepts.insert(v); } DEBUG_PRINTF("%zu modified accepts\n", modified_accepts.size()); for (auto v : modified_accepts) { if (in_degree(v, ig) == 0) { DEBUG_PRINTF("removing accept vertex with no preds\n"); remove_vertex(v, ig); } } } #ifndef NDEBUG static bool validateKinds(const RoseInGraph &g) { for (const auto &e : edges_range(g)) { if (g[e].graph && g[e].graph->kind != whatRoseIsThis(g, e)) { return false; } } return true; } #endif bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, bool finalChance) { DEBUG_PRINTF("trying to rose\n"); assert(validateKinds(ig)); if (::ue2::empty(ig)) { assert(0); return false; } const unique_ptr in_ptr = cloneRoseGraph(ig); RoseInGraph &in = *in_ptr; RoseBuildData bd(in, false); transformLiteralDelay(in, cc); transformAnchoredLiteralOverlap(in, bd, cc); transformSuffixDelay(in, cc); assert(validateKinds(ig)); map > graphs; vector ordered_graphs; // Stored in first-encounter order. for (const auto &e : edges_range(in)) { if (!in[e].graph) { continue; // no graph } if (in[e].haig) { // Haigs are always implementable (we've already built the raw DFA). continue; } NGHolder *h = in[e].graph.get(); if (!contains(graphs, h)) { ordered_graphs.push_back(h); } graphs[h].push_back(e); } assert(ordered_graphs.size() == graphs.size()); vector graph_edges; for (auto h : ordered_graphs) { const vector &h_edges = graphs.at(h); unique_ptr as_dfa; /* allow finalChance as fallback is basically an outfix at this point */ if (!canImplementGraph(this, in, *h, h_edges, prefilter, rm, cc, finalChance, &as_dfa)) { return false; } if (as_dfa) { bd.early_dfas[h] = move(as_dfa); } insert(&graph_edges, graph_edges.end(), h_edges); } /* we are now past the point of no return. We can start making irreversible changes to the rose graph, etc */ for (const auto &e : graph_edges) { assert(in[e].graph); assert(!in[e].haig); NGHolder &h = *in[e].graph; DEBUG_PRINTF("handling %p\n", &h); assert(allMatchStatesHaveReports(h)); if (!generates_callbacks(whatRoseIsThis(in, e)) && !contains(bd.early_dfas, &h) && in[target(e, in)].type != RIV_ACCEPT_EOD) { setReportId(h, getNewNfaReport()); } } populateRoseGraph(this, bd); return true; } bool RoseBuildImpl::addSombeRose(const RoseInGraph &ig) { DEBUG_PRINTF("rose is trying to consume a sombe\n"); assert(validateKinds(ig)); if (::ue2::empty(ig)) { assert(0); return false; } RoseBuildData bd(ig, true); for (const auto &e : edges_range(ig)) { if (!ig[e].graph) { continue; // no graph } DEBUG_PRINTF("handling %p\n", ig[e].graph.get()); assert(allMatchStatesHaveReports(*ig[e].graph)); assert(ig[e].haig); } populateRoseGraph(this, bd); return true; } bool roseCheckRose(const RoseInGraph &ig, bool prefilter, const ReportManager &rm, const CompileContext &cc) { assert(validateKinds(ig)); if (::ue2::empty(ig)) { assert(0); return false; } map> graphs; for (const auto &e : edges_range(ig)) { if (!ig[e].graph) { continue; // no graph } if (ig[e].haig) { // Haigs are always implementable (we've already built the raw DFA). continue; } graphs[ig[e].graph.get()].push_back(e); } for (const auto &m : graphs) { if (!canImplementGraph(nullptr, ig, *m.first, m.second, prefilter, rm, cc, false, nullptr)) { return false; } } return true; } void RoseBuildImpl::add(bool anchored, bool eod, const ue2_literal &lit, const ue2::flat_set &reports) { assert(!reports.empty()); if (cc.grey.floodAsPuffette && !anchored && !eod && is_flood(lit) && lit.length() > 3) { DEBUG_PRINTF("adding as puffette\n"); const CharReach &cr = *lit.begin(); for (const auto &report : reports) { addOutfix(raw_puff(lit.length(), true, report, cr, true)); } return; } RoseInGraph ig; RoseInVertex start = add_vertex(RoseInVertexProps::makeStart(anchored), ig); RoseInVertex accept = add_vertex( eod ? RoseInVertexProps::makeAcceptEod(set()) : RoseInVertexProps::makeAccept(set()), ig); RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); add_edge(start, v, RoseInEdgeProps(0U, anchored ? 0U : ROSE_BOUND_INF), ig); add_edge(v, accept, RoseInEdgeProps(0U, 0U), ig); calcVertexOffsets(ig); ig[accept].reports.insert(reports.begin(), reports.end()); addRose(ig, false); } static u32 findMaxBAWidth(const NGHolder &h) { // Must be bi-anchored: no out-edges from startDs (other than its // self-loop), no in-edges to accept. if (hasGreaterOutDegree(1, h.startDs, h) || hasGreaterInDegree(0, h.accept, h)) { return ROSE_BOUND_INF; } depth d = findMaxWidth(h); assert(d.is_reachable()); if (!d.is_finite()) { return ROSE_BOUND_INF; } return d; } static void populateOutfixInfo(OutfixInfo &outfix, const NGHolder &h, const RoseBuildImpl &tbi) { outfix.maxBAWidth = findMaxBAWidth(h); outfix.minWidth = findMinWidth(h); outfix.maxWidth = findMaxWidth(h); outfix.maxOffset = findMaxOffset(h, tbi.rm); populateReverseAccelerationInfo(outfix.rev_info, h); } bool RoseBuildImpl::addOutfix(const NGHolder &h) { DEBUG_PRINTF("%zu vertices, %zu edges\n", num_vertices(h), num_edges(h)); const u32 nfa_states = isImplementableNFA(h, &rm, cc); if (nfa_states) { DEBUG_PRINTF("implementable as an NFA in %u states\n", nfa_states); } else { DEBUG_PRINTF("not implementable as an NFA\n"); } bool dfa_cand = !nfa_states || nfa_states > 128 /* slow model */ || can_exhaust(h, rm); /* can be pruned */ unique_ptr rdfa; if (!nfa_states || cc.grey.roseMcClellanOutfix == 2 || (cc.grey.roseMcClellanOutfix == 1 && dfa_cand)) { rdfa = buildMcClellan(h, &rm, cc.grey); } if (!nfa_states && !rdfa) { DEBUG_PRINTF("could not build as either an NFA or a DFA\n"); return false; } if (rdfa) { outfixes.push_back(OutfixInfo(move(rdfa))); } else { outfixes.push_back(OutfixInfo(cloneHolder(h))); } populateOutfixInfo(outfixes.back(), h, *this); return true; } bool RoseBuildImpl::addOutfix(const NGHolder &h, const raw_som_dfa &haig) { DEBUG_PRINTF("haig with %zu states\n", haig.states.size()); outfixes.push_back(OutfixInfo(ue2::make_unique(haig))); populateOutfixInfo(outfixes.back(), h, *this); return true; /* failure is not yet an option */ } bool RoseBuildImpl::addOutfix(const raw_puff &rp) { if (!mpv_outfix) { mpv_outfix = make_unique(MpvProto()); } auto *mpv = mpv_outfix->mpv(); assert(mpv); mpv->puffettes.push_back(rp); mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not ba */ mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); mpv_outfix->maxWidth = rp.unbounded ? depth::infinity() : max(mpv_outfix->maxWidth, depth(rp.repeats)); if (mpv_outfix->maxOffset == ROSE_BOUND_INF || rp.unbounded) { mpv_outfix->maxOffset = ROSE_BOUND_INF; } else { mpv_outfix->maxOffset = MAX(mpv_outfix->maxOffset, rp.repeats); } return true; /* failure is not yet an option */ } bool RoseBuildImpl::addChainTail(const raw_puff &rp, u32 *queue_out, u32 *event_out) { if (!mpv_outfix) { mpv_outfix = make_unique(MpvProto()); } auto *mpv = mpv_outfix->mpv(); assert(mpv); mpv->triggered_puffettes.push_back(rp); mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not ba */ mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); mpv_outfix->maxWidth = rp.unbounded ? depth::infinity() : max(mpv_outfix->maxWidth, depth(rp.repeats)); mpv_outfix->maxOffset = ROSE_BOUND_INF; /* TODO: we could get information from * the caller */ *queue_out = mpv_outfix->get_queue(qif); *event_out = MQE_TOP_FIRST + mpv->triggered_puffettes.size() - 1; return true; /* failure is not yet an option */ } static bool prepAcceptForAddAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &w, u32 max_adj, NFAVertex u, const vector &vertexDepths, map &depthMap, map > &reportMap, map &allocated_reports, flat_set &added_lit_ids) { const depth max_anchored_depth(tbi.cc.grey.maxAnchoredRegion); const u32 idx = w[u].index; assert(idx < vertexDepths.size()); const DepthMinMax &d = vertexDepths.at(idx); for (const auto &int_report : w[u].reports) { assert(int_report != MO_INVALID_IDX); u32 lit_id; if (!contains(allocated_reports, int_report)) { lit_id = tbi.getNewLiteralId(); added_lit_ids.insert(lit_id); allocated_reports[int_report] = lit_id; } else { lit_id = allocated_reports[int_report]; } reportMap[u].insert(lit_id); if (!contains(depthMap, lit_id)) { depthMap[lit_id] = d; } else { depthMap[lit_id] = unionDepthMinMax(depthMap[lit_id], d); } if (depthMap[lit_id].max + depth(max_adj) > max_anchored_depth) { DEBUG_PRINTF("depth=%s exceeds maxAnchoredRegion=%u\n", (depthMap[lit_id].max + depth(max_adj)).str().c_str(), tbi.cc.grey.maxAnchoredRegion); return false; } } return true; } // Failure path for addAnchoredAcyclic: removes the literal IDs that have been // added to support anchored NFAs. Assumes that they are a contiguous range at // the end of the RoseBuildImpl::literal_info vector. static void removeAddedLiterals(RoseBuildImpl &tbi, const flat_set &lit_ids) { if (lit_ids.empty()) { return; } // lit_ids should be a contiguous range. assert(lit_ids.size() == *lit_ids.rbegin() - *lit_ids.begin() + 1); for (const u32 &lit_id : lit_ids) { assert(lit_id < tbi.literal_info.size()); assert(tbi.literals.right.at(lit_id).table == ROSE_ANCHORED); assert(tbi.literal_info[lit_id].vertices.empty()); tbi.literals.right.erase(lit_id); } // lit_ids should be at the end of tbi.literal_info. assert(tbi.literal_info.size() == *lit_ids.rbegin() + 1); tbi.literal_info.resize(*lit_ids.begin()); // remove all ids in lit_ids } bool RoseBuildImpl::addAnchoredAcyclic(const NGHolder &h) { vector vertexDepths; calcDepthsFrom(h, h.start, vertexDepths); map > reportMap; /* NFAVertex -> literal ids */ map depthMap; /* literal id -> min/max depth */ map allocated_reports; /* report -> literal id */ flat_set added_lit_ids; /* literal ids added for this NFA */ for (auto v : inv_adjacent_vertices_range(h.accept, h)) { if (!prepAcceptForAddAnchoredNFA(*this, h, 0, v, vertexDepths, depthMap, reportMap, allocated_reports, added_lit_ids)) { removeAddedLiterals(*this, added_lit_ids); return false; } } map allocated_reports_eod; /* report -> literal id */ for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { if (v == h.accept) { continue; } if (!prepAcceptForAddAnchoredNFA(*this, h, 0, v, vertexDepths, depthMap, reportMap, allocated_reports_eod, added_lit_ids)) { removeAddedLiterals(*this, added_lit_ids); return false; } } assert(!reportMap.empty()); int rv = addAnchoredNFA(*this, h, reportMap); if (rv != ANCHORED_FAIL) { assert(rv != ANCHORED_REMAP); DEBUG_PRINTF("added anchored nfa\n"); /* add edges to the rose graph to bubble the match up */ for (const auto &m : allocated_reports) { const ReportID &report = m.first; const u32 &lit_id = m.second; assert(depthMap[lit_id].max.is_finite()); u32 minBound = depthMap[lit_id].min; u32 maxBound = depthMap[lit_id].max; RoseVertex v = createAnchoredVertex(this, lit_id, minBound, maxBound); g[v].reports.insert(report); } for (const auto &m : allocated_reports_eod) { const ReportID &report = m.first; const u32 &lit_id = m.second; assert(depthMap[lit_id].max.is_finite()); u32 minBound = depthMap[lit_id].min; u32 maxBound = depthMap[lit_id].max; RoseVertex v = createAnchoredVertex(this, lit_id, minBound, maxBound); RoseVertex eod = add_vertex(g); g[eod].idx = vertexIndex++; g[eod].eod_accept = true; g[eod].reports.insert(report); g[eod].min_offset = g[v].min_offset; g[eod].max_offset = g[v].max_offset; add_edge(v, eod, g); } return true; } else { DEBUG_PRINTF("failed to add anchored nfa\n"); removeAddedLiterals(*this, added_lit_ids); return false; } } } // namespace ue2