/*
 * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "rose_build_impl.h"

#include "grey.h"
#include "hs_internal.h"
#include "rose_build_anchored.h"
#include "rose_build_castle.h"
#include "rose_build_convert.h"
#include "rose_build_dump.h"
#include "rose_build_groups.h"
#include "rose_build_matchers.h"
#include "rose_build_merge.h"
#include "rose_build_role_aliasing.h"
#include "rose_build_util.h"
#include "ue2common.h"
#include "nfa/nfa_internal.h"
#include "nfa/rdfa.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_dump.h"
#include "nfagraph/ng_execute.h"
#include "nfagraph/ng_is_equal.h"
#include "nfagraph/ng_limex.h"
#include "nfagraph/ng_mcclellan.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_reports.h"
#include "nfagraph/ng_stop.h"
#include "nfagraph/ng_util.h"
#include "nfagraph/ng_width.h"
#include "util/bitutils.h"
#include "util/charreach.h"
#include "util/charreach_util.h"
#include "util/compare.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include "util/ue2string.h"
#include "util/verify_types.h"

#include <algorithm>
#include <functional>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>

#include <boost/range/adaptor/map.hpp>

using namespace std;
using boost::adaptors::map_values;

namespace ue2 {

#define ANCHORED_REHOME_MIN_FLOATING 800
#define ANCHORED_REHOME_MIN_FLOATING_SHORT 50
#define ANCHORED_REHOME_ALLOW_SHORT 20
#define ANCHORED_REHOME_DEEP 25
#define ANCHORED_REHOME_SHORT_LEN 3

#ifdef DEBUG
static UNUSED
void printLitInfo(const rose_literal_info &li, u32 id) {
    DEBUG_PRINTF("lit_info %u\n", id);
    DEBUG_PRINTF(" parent %u%s", li.undelayed_id,
                 li.delayed_ids.empty() ? "" : ", children:");
    for (u32 d_id : li.delayed_ids) {
        printf(" %u", d_id);
    }
    printf("\n");
    DEBUG_PRINTF(" group %llu %s\n", li.group_mask,
                 li.squash_group ? "s" : "");
}
#endif
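
/* Two literals of the same type may share a bytecode ("final") id when they
 * sit on exactly the same set of vertices, trigger no delayed literals,
 * require no benefits confirmation and have identical group-squashing
 * behaviour. As an illustrative (not source-derived) example: two case
 * variants produced by exploding a mixed-sensitivity literal onto the same
 * single role could be reported under one final id. */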
"s":""); } #endif static void allocateFinalIdToSet(const RoseGraph &g, const set &lits, deque *literal_info, map > *final_id_to_literal, u32 *next_final_id) { /* We can allocate the same final id to multiple literals of the same type * if they share the same vertex set and trigger the same delayed literal * ids and squash the same roles and have the same group squashing * behaviour. Benefits literals cannot be merged. */ for (u32 int_id : lits) { rose_literal_info &curr_info = (*literal_info)[int_id]; const auto &verts = curr_info.vertices; if (!verts.empty() && !curr_info.requires_benefits && curr_info.delayed_ids.empty()) { vector cand; insert(&cand, cand.end(), g[*verts.begin()].literals); for (auto v : verts) { vector temp; set_intersection(cand.begin(), cand.end(), g[v].literals.begin(), g[v].literals.end(), inserter(temp, temp.end())); cand.swap(temp); } for (u32 cand_id : cand) { if (cand_id >= int_id) { break; } const rose_literal_info &cand_info = (*literal_info)[cand_id]; if (cand_info.requires_benefits) { continue; } if (!cand_info.delayed_ids.empty()) { /* TODO: allow cases where delayed ids are equivalent. * This is awkward currently as the have not had their * final ids allocated yet */ continue; } if (lits.find(cand_id) == lits.end() || cand_info.vertices.size() != verts.size() || cand_info.squash_group != curr_info.squash_group) { continue; } /* if we are squashing groups we need to check if they are the * same group */ if (cand_info.squash_group && cand_info.group_mask != curr_info.group_mask) { continue; } u32 final_id = cand_info.final_id; assert(final_id != MO_INVALID_IDX); assert(curr_info.final_id == MO_INVALID_IDX); curr_info.final_id = final_id; (*final_id_to_literal)[final_id].insert(int_id); goto next_lit; } } /* oh well, have to give it a fresh one, hang the expense */ DEBUG_PRINTF("allocating final id %u to %u\n", *next_final_id, int_id); assert(curr_info.final_id == MO_INVALID_IDX); curr_info.final_id = *next_final_id; (*final_id_to_literal)[*next_final_id].insert(int_id); (*next_final_id)++; next_lit:; } } static bool isUsedLiteral(const RoseBuildImpl &build, u32 lit_id) { assert(lit_id < build.literal_info.size()); const auto &info = build.literal_info[lit_id]; if (!info.vertices.empty()) { return true; } for (const u32 &delayed_id : info.delayed_ids) { assert(delayed_id < build.literal_info.size()); const rose_literal_info &delayed_info = build.literal_info[delayed_id]; if (!delayed_info.vertices.empty()) { return true; } } DEBUG_PRINTF("literal %u has no refs\n", lit_id); return false; } /** \brief Allocate final literal IDs for all literals. * * These are the literal ids used in the bytecode. */ static void allocateFinalLiteralId(RoseBuildImpl &tbi) { RoseGraph &g = tbi.g; set anch; set norm; set delay; /* undelayed ids come first */ assert(tbi.final_id_to_literal.empty()); u32 next_final_id = 0; for (u32 i = 0; i < tbi.literal_info.size(); i++) { assert(!tbi.hasFinalId(i)); if (!isUsedLiteral(tbi, i)) { /* what is this literal good for? absolutely nothing */ continue; } // The special EOD event literal has its own program and does not need // a real literal ID. 

/** \brief Allocate final literal IDs for all literals.
 *
 * These are the literal ids used in the bytecode.
 */
static
void allocateFinalLiteralId(RoseBuildImpl &tbi) {
    RoseGraph &g = tbi.g;

    set<u32> anch;
    set<u32> norm;
    set<u32> delay;

    /* undelayed ids come first */
    assert(tbi.final_id_to_literal.empty());
    u32 next_final_id = 0;
    for (u32 i = 0; i < tbi.literal_info.size(); i++) {
        assert(!tbi.hasFinalId(i));

        if (!isUsedLiteral(tbi, i)) {
            /* what is this literal good for? absolutely nothing */
            continue;
        }

        // The special EOD event literal has its own program and does not need
        // a real literal ID.
        if (i == tbi.eod_event_literal_id) {
            assert(tbi.eod_event_literal_id != MO_INVALID_IDX);
            continue;
        }

        if (tbi.isDelayed(i)) {
            assert(!tbi.literal_info[i].requires_benefits);
            delay.insert(i);
        } else if (tbi.literals.right.at(i).table == ROSE_ANCHORED) {
            anch.insert(i);
        } else {
            norm.insert(i);
        }
    }

    /* normal lits */
    allocateFinalIdToSet(g, norm, &tbi.literal_info, &tbi.final_id_to_literal,
                         &next_final_id);

    /* next anchored stuff */
    tbi.anchored_base_id = next_final_id;
    allocateFinalIdToSet(g, anch, &tbi.literal_info, &tbi.final_id_to_literal,
                         &next_final_id);

    /* delayed ids come last */
    tbi.delay_base_id = next_final_id;
    allocateFinalIdToSet(g, delay, &tbi.literal_info, &tbi.final_id_to_literal,
                         &next_final_id);
}

#define MAX_EXPLOSION_NC 3
static
bool limited_explosion(const ue2_literal &s) {
    u32 nc_count = 0;

    for (const auto &e : s) {
        if (e.nocase) {
            nc_count++;
        }
    }

    return nc_count <= MAX_EXPLOSION_NC;
}

void RoseBuildImpl::handleMixedSensitivity(void) {
    for (const auto &e : literals.right) {
        u32 id = e.first;
        const rose_literal_id &lit = e.second;

        if (lit.delay) {
            continue; /* delay id's are virtual-ish */
        }

        if (lit.table == ROSE_ANCHORED || lit.table == ROSE_EVENT) {
            continue; /* wrong table */
        }

        if (!mixed_sensitivity(lit.s)) {
            continue;
        }

        if (limited_explosion(lit.s)) {
            DEBUG_PRINTF("need to explode existing string '%s'\n",
                         dumpString(lit.s).c_str());
            literal_info[id].requires_explode = true;
        } else {
            literal_info[id].requires_benefits = true;
        }
    }
}

// Returns the length of the longest prefix of s that is (a) also a suffix of
// s and (b) not s itself.
static
size_t maxPeriod(const ue2_literal &s) {
    /* overly conservative if only part of the string is nocase */
    if (s.empty()) {
        return 0;
    }

    const size_t len = s.length();
    const char *begin = s.c_str(), *end = begin + len;
    size_t i;
    for (i = len - 1; i != 0; i--) {
        if (!cmp(begin, end - i, i, s.any_nocase())) {
            break;
        }
    }

    return i;
}

bool RoseBuildImpl::isPseudoStar(const RoseEdge &e) const {
    return !g[e].minBound && isPseudoStarOrFirstOnly(e);
}

bool RoseBuildImpl::isPseudoStarOrFirstOnly(const RoseEdge &e) const {
    RoseVertex u = source(e, g);
    RoseVertex v = target(e, g);

    if (g[e].maxBound != ROSE_BOUND_INF) {
        return false;
    }

    if (isAnyStart(u)) {
        return true;
    }

    if (isAnchored(u)) {
        /* anchored table runs out of order */
        return false;
    }

    if (hasDelayedLiteral(u)) {
        return false;
    }

    if (g[v].left) {
        return false;
    }

    if (g[v].eod_accept) {
        return true;
    }

    assert(!g[v].literals.empty());
    if (maxLiteralOverlap(u, v)) {
        return false;
    }

    return true;
}

bool RoseBuildImpl::hasOnlyPseudoStarInEdges(RoseVertex v) const {
    for (const auto &e : in_edges_range(v, g)) {
        if (!isPseudoStar(e)) {
            return false;
        }
    }

    return true;
}

void RoseBuildImpl::renumberVertices() {
    vertexIndex = 0;
    DEBUG_PRINTF("renumbering vertices\n");
    for (auto v : vertices_range(g)) {
        g[v].idx = vertexIndex++;
    }
}

static
size_t trailerDueToSelf(const rose_literal_id &lit) {
    size_t trailer = lit.s.length() - maxPeriod(lit.s);
    if (trailer > 255) {
        return 255;
    }
    if (!trailer) {
        return 1;
    }
    return trailer;
}
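
/* Worked example (illustrative): for the literal "abcab", the longest proper
 * prefix that is also a suffix is "ab", so maxPeriod() returns 2 and
 * trailerDueToSelf() returns 5 - 2 = 3: a fresh match must supply at least
 * three new trailing bytes beyond any self-overlapping earlier match. */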

static
RoseRoleHistory findHistoryScheme(const RoseBuildImpl &tbi, const RoseEdge &e) {
    const RoseGraph &g = tbi.g;
    const RoseVertex u = source(e, g); /* pred role */
    const RoseVertex v = target(e, g); /* current role */

    DEBUG_PRINTF("find history for [%zu,%zu]\n", g[u].idx, g[v].idx);
    DEBUG_PRINTF("u has min_offset=%u, max_offset=%u\n", g[u].min_offset,
                 g[u].max_offset);

    if (g[v].left) {
        if (!tbi.isAnyStart(u)) {
            /* infix nfa will track history, treat as pseudo .*. Note: rose
             * lits may overlap so rose history track would be wrong anyway */
            DEBUG_PRINTF("skipping history as prefix\n");
            return ROSE_ROLE_HISTORY_NONE;
        }
        if (g[e].minBound || g[e].maxBound != ROSE_BOUND_INF) {
            DEBUG_PRINTF("rose prefix with external bounds\n");
            return ROSE_ROLE_HISTORY_ANCH;
        } else {
            return ROSE_ROLE_HISTORY_NONE;
        }
    }

    // Handle EOD cases.
    if (g[v].eod_accept) {
        const u32 minBound = g[e].minBound, maxBound = g[e].maxBound;
        DEBUG_PRINTF("EOD edge with bounds [%u,%u]\n", minBound, maxBound);

        // Trivial case: we don't need history for {0,inf} bounds
        if (minBound == 0 && maxBound == ROSE_BOUND_INF) {
            return ROSE_ROLE_HISTORY_NONE;
        }

        // Event literals store no history.
        if (tbi.hasLiteralInTable(u, ROSE_EVENT)) {
            return ROSE_ROLE_HISTORY_NONE;
        }

        // Trivial case: fixed offset from anchor
        if (g[u].fixedOffset()) {
            return ROSE_ROLE_HISTORY_ANCH;
        }

        // If the bounds are {0,0}, this role can only match precisely at EOD.
        if (minBound == 0 && maxBound == 0) {
            return ROSE_ROLE_HISTORY_LAST_BYTE;
        }

        // XXX: No other history schemes should be possible any longer.
        assert(0);
    }

    // Non-EOD cases.

    DEBUG_PRINTF("examining edge [%zu,%zu] with bounds {%u,%u}\n",
                 g[u].idx, g[v].idx, g[e].minBound, g[e].maxBound);

    if (tbi.isAnchored(v)) {
        // Matches for literals in the anchored table will always arrive at
        // the right offsets, so there's no need for history-based
        // confirmation.
        DEBUG_PRINTF("v in anchored table, no need for history\n");
        assert(u == tbi.anchored_root);
        return ROSE_ROLE_HISTORY_NONE;
    }

    if (g[u].fixedOffset()) {
        DEBUG_PRINTF("fixed offset -> anch\n");
        return ROSE_ROLE_HISTORY_ANCH;
    }

    return ROSE_ROLE_HISTORY_NONE;
}

static
void assignHistories(RoseBuildImpl &tbi) {
    for (const auto &e : edges_range(tbi.g)) {
        if (tbi.g[e].history == ROSE_ROLE_HISTORY_INVALID) {
            tbi.g[e].history = findHistoryScheme(tbi, e);
        }
    }
}
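
/* A direct report is a literal whose match unconditionally fires its reports:
 * no leftfix, no bound check beyond (0,inf) from root, no suffix, and no
 * out-edges, so the role program can be skipped entirely. A literal attached
 * to several qualifying roles becomes a multi-direct report (MDR) that fires
 * each role's reports from a list. */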

bool RoseBuildImpl::isDirectReport(u32 id) const {
    assert(id < literal_info.size());

    // Literal info properties.

    const rose_literal_info &info = literal_info[id];
    if (info.vertices.empty()) {
        return false;
    }

    if (!info.delayed_ids.empty() /* dr's don't set groups */
        || info.requires_benefits) { /* dr's don't require confirm */
        return false;
    }

    if (isDelayed(id)) { /* can't handle delayed dr atm as we require delay
                          * ids to be dense */
        return false;
    }

    // Role properties.

    // Note that a literal can have multiple roles and still be a direct
    // report; it'll become a multi-direct report ("MDR") that fires each
    // role's reports from a list.

    for (auto v : info.vertices) {
        assert(contains(g[v].literals, id));

        if (g[v].reports.empty() ||
            g[v].eod_accept || // no accept EOD
            !g[v].isBoring() ||
            !isLeafNode(v, g) || // Must have no out-edges
            in_degree(v, g) != 1) { // Role must have exactly one in-edge
            return false;
        }

        // Use the program to handle cases that aren't external reports.
        for (const ReportID &id : g[v].reports) {
            if (!isExternalReport(rm.getReport(id))) {
                return false;
            }
        }

        if (literals.right.at(id).table == ROSE_ANCHORED) {
            /* in-edges are irrelevant for anchored region. */
            continue;
        }

        /* The in-edge must be an (0, inf) edge from root. */
        assert(in_degree(v, g) != 0);
        RoseEdge e = *(in_edges(v, g).first);
        if (source(e, g) != root || g[e].minBound != 0 ||
            g[e].maxBound != ROSE_BOUND_INF) {
            return false;
        }

        // Note: we allow ekeys; they will result in unused roles being built
        // as direct reporting will be used when actually matching in Rose.
        /* TODO: prevent roles being created */
    }

    DEBUG_PRINTF("literal %u ('%s') is a %s report\n", id,
                 dumpString(literals.right.at(id).s).c_str(),
                 info.vertices.size() > 1 ? "multi-direct" : "direct");
    return true;
}

static
bool checkEodStealFloating(const RoseBuildImpl &tbi,
                           const vector<u32> &eodLiteralsForFloating,
                           u32 numFloatingLiterals,
                           size_t shortestFloatingLen) {
    if (eodLiteralsForFloating.empty()) {
        DEBUG_PRINTF("no eod literals\n");
        return true;
    }

    if (!numFloatingLiterals) {
        DEBUG_PRINTF("no floating table\n");
        return false;
    }

    if (tbi.hasNoFloatingRoots()) {
        DEBUG_PRINTF("skipping as floating table is conditional\n");
        /* TODO: investigate putting stuff in atable */
        return false;
    }

    DEBUG_PRINTF("%zu are eod literals, %u floating; floating len=%zu\n",
                 eodLiteralsForFloating.size(), numFloatingLiterals,
                 shortestFloatingLen);
    u32 new_floating_lits = 0;

    for (u32 eod_id : eodLiteralsForFloating) {
        const rose_literal_id &lit = tbi.literals.right.at(eod_id);
        DEBUG_PRINTF("checking '%s'\n", dumpString(lit.s).c_str());

        if (tbi.hasLiteral(lit.s, ROSE_FLOATING)) {
            DEBUG_PRINTF("skip; there is already a floating version\n");
            continue;
        }

        // Don't want to make the shortest floating literal shorter/worse.
        if (trailerDueToSelf(lit) < 4 || lit.s.length() < shortestFloatingLen) {
            DEBUG_PRINTF("len=%zu, selfOverlap=%zu\n", lit.s.length(),
                         trailerDueToSelf(lit));
            DEBUG_PRINTF("would shorten, bailing\n");
            return false;
        }

        new_floating_lits++;
    }

    DEBUG_PRINTF("..would require %u new floating literals\n",
                 new_floating_lits);

    // Magic number thresholds: we only want to get rid of our EOD table if it
    // would make no real difference to the FDR.
    if (numFloatingLiterals / 8 < new_floating_lits
        && (new_floating_lits > 3 || numFloatingLiterals <= 2)) {
        DEBUG_PRINTF("leaving eod table alone.\n");
        return false;
    }

    return true;
}
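
/* In concrete terms (illustrative numbers): with a floating table of 80
 * literals, up to 80/8 = 10 new literals may be promoted; up to 3 new
 * literals are always acceptable unless the floating table is tiny (2 or
 * fewer literals). */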

static
void promoteEodToFloating(RoseBuildImpl &tbi, const vector<u32> &eodLiterals) {
    DEBUG_PRINTF("promoting eod literals to floating table\n");

    for (u32 eod_id : eodLiterals) {
        const rose_literal_id &lit = tbi.literals.right.at(eod_id);
        u32 floating_id = tbi.getLiteralId(lit.s, lit.msk, lit.cmp, lit.delay,
                                           ROSE_FLOATING);

        auto &float_verts = tbi.literal_info[floating_id].vertices;
        auto &eod_verts = tbi.literal_info[eod_id].vertices;

        insert(&float_verts, eod_verts);
        eod_verts.clear();

        DEBUG_PRINTF("eod_lit=%u -> float_lit=%u\n", eod_id, floating_id);

        for (auto v : float_verts) {
            tbi.g[v].literals.erase(eod_id);
            tbi.g[v].literals.insert(floating_id);
        }

        tbi.literal_info[floating_id].requires_explode
            = tbi.literal_info[eod_id].requires_explode;
        tbi.literal_info[floating_id].requires_benefits
            = tbi.literal_info[eod_id].requires_benefits;
    }
}

static
bool promoteEodToAnchored(RoseBuildImpl &tbi, const vector<u32> &eodLiterals) {
    DEBUG_PRINTF("promoting eod literals to anchored table\n");
    bool rv = true;

    for (u32 eod_id : eodLiterals) {
        const rose_literal_id &lit = tbi.literals.right.at(eod_id);

        NGHolder h;
        add_edge(h.start, h.accept, h);
        appendLiteral(h, lit.s); /* we only accept cases which are anchored
                                  * hard up against start */

        u32 a_id = tbi.getNewLiteralId();
        u32 remap_id = 0;
        DEBUG_PRINTF(" trying to add dfa stuff\n");
        int anch_ok = addToAnchoredMatcher(tbi, h, a_id, &remap_id);

        if (anch_ok == ANCHORED_FAIL) {
            DEBUG_PRINTF("failed to promote to anchored need to keep "
                         "etable\n");
            rv = false;
            continue;
        } else if (anch_ok == ANCHORED_REMAP) {
            DEBUG_PRINTF("remapped\n");
            a_id = remap_id;
        } else {
            assert(anch_ok == ANCHORED_SUCCESS);
        }

        // Store the literal itself in a side structure so that we can use it
        // for overlap calculations later. This may be obsolete when the old
        // Rose construction path (and its history selection code) goes away.
        tbi.anchoredLitSuffix.insert(make_pair(a_id, lit));

        auto &a_verts = tbi.literal_info[a_id].vertices;
        auto &eod_verts = tbi.literal_info[eod_id].vertices;

        for (auto v : eod_verts) {
            for (const auto &e : in_edges_range(v, tbi.g)) {
                assert(tbi.g[e].maxBound != ROSE_BOUND_INF);
                tbi.g[e].minBound += lit.s.length();
                tbi.g[e].maxBound += lit.s.length();
            }
        }

        insert(&a_verts, eod_verts);
        eod_verts.clear();

        for (auto v : a_verts) {
            tbi.g[v].literals.erase(eod_id);
            tbi.g[v].literals.insert(a_id);
        }
    }

    return rv;
}
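
/* A literal may move to the anchored table only if it is a plain, undelayed,
 * unmasked literal whose every vertex sits at a single fixed offset equal to
 * the literal's length (i.e. hard up against the start of the buffer), is a
 * root successor carrying no other literals, and leads only to EOD accepts. */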

static
bool suitableForAnchored(const RoseBuildImpl &tbi, const rose_literal_id &l_id,
                         const rose_literal_info &lit) {
    const RoseGraph &g = tbi.g;

    bool seen = false;
    u32 min_offset = 0;
    u32 max_offset = 0;

    if (!lit.delayed_ids.empty() || l_id.delay) {
        DEBUG_PRINTF("delay\n");
        return false;
    }

    if (!l_id.msk.empty()) {
        DEBUG_PRINTF("msk\n");
        return false;
    }

    for (auto v : lit.vertices) {
        if (!seen) {
            min_offset = g[v].min_offset;
            max_offset = g[v].max_offset;
            seen = true;

            if (max_offset > tbi.cc.grey.maxAnchoredRegion) {
                DEBUG_PRINTF("too deep %u\n", max_offset);
                return false;
            }
        }

        if (max_offset != g[v].max_offset || min_offset != g[v].min_offset) {
            DEBUG_PRINTF(":(\n");
            return false;
        }

        if (!g[v].isBoring()) {
            DEBUG_PRINTF(":(\n");
            return false;
        }

        if (g[v].literals.size() != 1) {
            DEBUG_PRINTF("shared\n");
            return false;
        }

        if (tbi.isNonRootSuccessor(v)) {
            DEBUG_PRINTF("non root\n");
            return false;
        }

        if (max_offset != l_id.s.length() || min_offset != l_id.s.length()) {
            DEBUG_PRINTF("|%zu| (%u,%u):(\n", l_id.s.length(), min_offset,
                         max_offset);
            /* TODO: handle cases with small bounds */
            return false;
        }

        for (auto w : adjacent_vertices_range(v, g)) {
            if (!g[w].eod_accept) {
                DEBUG_PRINTF("non eod accept literal\n");
                return false;
            }
        }
    }

    return true;
}

// If we've got a small number of long, innocuous EOD literals and a large
// floating table, we consider promoting those EOD literals to the floating
// table to avoid having to run both. See UE-2069, consider deleting this and
// replacing with an elegant reverse DFA.
/* We do not want to do this if we would otherwise avoid running the floating
 * table altogether. */
static
void stealEodVertices(RoseBuildImpl &tbi) {
    u32 numFloatingLiterals = 0;
    u32 numAnchoredLiterals = 0;
    size_t shortestFloatingLen = SIZE_MAX;
    vector<u32> eodLiteralsForFloating;
    vector<u32> eodLiteralsForAnchored;
    DEBUG_PRINTF("hi\n");

    for (u32 i = 0; i < tbi.literal_info.size(); i++) {
        const auto &info = tbi.literal_info[i];
        if (info.vertices.empty()) {
            continue; // skip unused literals
        }

        const rose_literal_id &lit = tbi.literals.right.at(i);
        if (lit.table == ROSE_EOD_ANCHORED) {
            if (suitableForAnchored(tbi, lit, info)) {
                eodLiteralsForAnchored.push_back(i);
            } else {
                eodLiteralsForFloating.push_back(i);
            }
        } else if (lit.table == ROSE_FLOATING) {
            numFloatingLiterals++;
            shortestFloatingLen = min(shortestFloatingLen, lit.s.length());
        } else if (lit.table == ROSE_ANCHORED) {
            numAnchoredLiterals++;
        }
    }

    /* given a choice of having either an eod table or an anchored table, we
     * always favour having an anchored table */

    if (!checkEodStealFloating(tbi, eodLiteralsForFloating,
                               numFloatingLiterals, shortestFloatingLen)) {
        DEBUG_PRINTF("removing etable weakens ftable\n");
        return;
    }

    promoteEodToFloating(tbi, eodLiteralsForFloating);

    if (!promoteEodToAnchored(tbi, eodLiteralsForAnchored)) {
        DEBUG_PRINTF("still need ematcher\n");
        return;
    }

    // We're no longer using the EOD matcher.
    tbi.ematcher_region_size = 0;
}

bool RoseBuildImpl::isDelayed(u32 id) const {
    return literal_info.at(id).undelayed_id != id;
}

bool RoseBuildImpl::hasFinalId(u32 id) const {
    return literal_info.at(id).final_id != MO_INVALID_IDX;
}

bool RoseBuildImpl::hasDelayedLiteral(RoseVertex v) const {
    for (u32 lit_id : g[v].literals) {
        if (literals.right.at(lit_id).delay) {
            return true;
        }
    }

    return false;
}

bool RoseBuildImpl::hasDelayPred(RoseVertex v) const {
    for (auto u : inv_adjacent_vertices_range(v, g)) {
        if (hasDelayedLiteral(u)) {
            return true;
        }
    }

    return false;
}

bool RoseBuildImpl::hasAnchoredTablePred(RoseVertex v) const {
    for (auto u : inv_adjacent_vertices_range(v, g)) {
        if (isAnchored(u)) {
            return true;
        }
    }

    return false;
}

void RoseBuildImpl::findTransientLeftfixes(void) {
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        /* infixes can never (or at least not yet) be transient */
        if (isNonRootSuccessor(v)) {
            continue;
        }

        const left_id &left(g[v].left);

        if (::ue2::isAnchored(left) && !isInETable(v)) {
            /* etable prefixes currently MUST be transient as we do not know
             * where we can safely catch them up to (yet). */
            DEBUG_PRINTF("anchored roses in rocky soil are not fleeting\n");
            continue;
        }

        const depth max_width = findMaxWidth(left);
        if (!max_width.is_finite()) {
            DEBUG_PRINTF("inf max width\n");
            continue;
        }

        u32 his = g[v].left.lag + max_width;

        // If this vertex has an event literal, we need to add one to cope
        // with it.
        if (hasLiteralInTable(v, ROSE_EVENT)) {
            his++;
        }

        /* +1 as trigger must appear in main buffer and no byte is needed to
         * decompress the state */
        if (his <= cc.grey.maxHistoryAvailable + 1) {
            transient.insert(left);
            DEBUG_PRINTF("a transient leftfix has been spotted his=%u\n", his);
        }
    }
}

/** Find all the different roses and their associated literals. */
static
map<left_id, vector<RoseVertex>> findLeftSucc(RoseBuildImpl &tbi) {
    map<left_id, vector<RoseVertex>> leftfixes;
    for (auto v : vertices_range(tbi.g)) {
        if (tbi.g[v].left) {
            const LeftEngInfo &lei = tbi.g[v].left;
            leftfixes[lei].push_back(v);
        }
    }
    return leftfixes;
}

static
bool triggerKillsRoseGraph(const RoseBuildImpl &tbi, const left_id &left,
                           const set<ue2_literal> &all_lits,
                           const RoseEdge &e) {
    assert(left.graph());
    const NGHolder &h = *left.graph();

    ue2::flat_set<NFAVertex> all_states;
    insert(&all_states, vertices(h));
    assert(out_degree(h.startDs, h) == 1); /* triggered don't use sds */

    DEBUG_PRINTF("removing sds\n");
    all_states.erase(h.startDs);

    ue2::flat_set<NFAVertex> states;

    /* check each pred literal to see if they all kill previous graph
     * state */
    for (u32 lit_id : tbi.g[source(e, tbi.g)].literals) {
        const rose_literal_id &pred_lit = tbi.literals.right.at(lit_id);
        const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s);

        DEBUG_PRINTF("running graph %zu\n", states.size());
        states = execute_graph(h, s, all_states, true);
        DEBUG_PRINTF("ran, %zu states on\n", states.size());

        if (!states.empty()) {
            return false;
        }
    }

    return true;
}

static
bool triggerKillsRose(const RoseBuildImpl &tbi, const left_id &left,
                      const set<ue2_literal> &all_lits, const RoseEdge &e) {
    if (left.haig()) {
        /* TODO: To allow this for som-based engines we would also need to
         * ensure as well that no other triggers can occur at the same
         * location with a different som. */
        return false;
    }

    if (left.graph()) {
        return triggerKillsRoseGraph(tbi, left, all_lits, e);
    }

    if (left.castle()) {
        return triggerKillsRoseCastle(tbi, left, all_lits, e);
    }

    return false;
}
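
/* The check above is a simulation: we push the non-overlapping tail of each
 * predecessor literal through the infix graph starting from the set of all
 * states. If no NFA state survives any trigger, the arrival of that trigger
 * is guaranteed to kill whatever the infix was doing, so the top may safely
 * cancel the previous state. */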

static
void inspectRoseTops(RoseBuildImpl &tbi) {
    /* Sometimes the arrival of a top for a rose infix can ensure that the
     * nfa would be dead at that time. In the case of multiple trigger
     * literals we can only base our decision on that portion of literal
     * after any overlapping literals */

    map<left_id, vector<RoseVertex>> roses =
        findLeftSucc(tbi); /* rose -> succ verts */

    for (const auto &r : roses) {
        const left_id &left = r.first;
        const vector<RoseVertex> &succs = r.second;
        assert(!succs.empty());
        if (tbi.isRootSuccessor(*succs.begin())) {
            /* a prefix is never an infix */
            continue;
        }

        set<u32> tops_seen;
        set<RoseEdge> rose_edges;
        set<u32> pred_lit_ids;

        for (auto v : succs) {
            for (const auto &e : in_edges_range(v, tbi.g)) {
                RoseVertex u = source(e, tbi.g);
                tops_seen.insert(tbi.g[e].rose_top);
                insert(&pred_lit_ids, tbi.g[u].literals);
                rose_edges.insert(e);
            }
        }

        set<ue2_literal> all_lits;

        if (tops_seen.size() > 1) {
            goto next_rose; /* slightly tricky to deal with overlap case */
        }

        for (u32 lit_id : pred_lit_ids) {
            const rose_literal_id &p_lit = tbi.literals.right.at(lit_id);
            if (p_lit.delay || p_lit.table == ROSE_ANCHORED) {
                goto next_rose;
            }
            all_lits.insert(p_lit.s);
            DEBUG_PRINTF("trigger: '%s'\n", dumpString(p_lit.s).c_str());
        }

        DEBUG_PRINTF("rose has %zu trigger literals, %zu edges\n",
                     all_lits.size(), rose_edges.size());

        for (const auto &e : rose_edges) {
            if (triggerKillsRose(tbi, left, all_lits, e)) {
                DEBUG_PRINTF("top will override previous rose state\n");
                tbi.g[e].rose_cancel_prev_top = true;
            }
        }
    next_rose:;
    }
}
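
/* Group squashing: each literal belongs to one or more of the (at most)
 * ROSE_GROUPS_MAX literal groups, and a group can be switched off once no
 * literal in it can contribute to a match. A cleared bit in a rose nfa's
 * squash mask means that group is turned off when the nfa dies. */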

static
void buildRoseSquashMasks(RoseBuildImpl &tbi) {
    /* Rose nfa squash masks are applied to the groups when the nfa can no
     * longer match */

    map<left_id, vector<RoseVertex>> roses =
        findLeftSucc(tbi); /* rose -> succ verts */

    /* a rose nfa can squash a group if all literals in that group are
     * successors of the nfa; literals with multiple vertices or delayed
     * children are unsquashable */
    for (const auto &e : roses) {
        const left_id &left = e.first;
        const vector<RoseVertex> &succs = e.second;

        set<u32> lit_ids;
        bool anchored_pred = false;
        for (auto v : succs) {
            lit_ids.insert(tbi.g[v].literals.begin(), tbi.g[v].literals.end());
            for (auto u : inv_adjacent_vertices_range(v, tbi.g)) {
                anchored_pred |= tbi.isAnchored(u);
            }
        }

        /* Due to the anchored table not being able to set groups again,
         * we cannot use a rose nfa for group squashing if it is being
         * triggered from the anchored table and can match more than once. */

        if (anchored_pred) { /* infix with pred in anchored table */
            u32 min_off = ~0U;
            u32 max_off = 0U;
            for (auto v : succs) {
                for (auto u : inv_adjacent_vertices_range(v, tbi.g)) {
                    min_off = min(min_off, tbi.g[u].min_offset);
                    max_off = max(max_off, tbi.g[u].max_offset);
                }
            }
            if (min_off != max_off) {
                /* leave all groups alone */
                tbi.rose_squash_masks[left] = ~0ULL;
                continue;
            }
        }

        rose_group unsquashable = 0;

        for (u32 lit_id : lit_ids) {
            const rose_literal_info &info = tbi.literal_info[lit_id];
            if (info.vertices.size() > 1 || !info.delayed_ids.empty()) {
                unsquashable |= info.group_mask;
            }
        }

        rose_group squash_mask = ~0ULL; /* leave all groups alone */

        for (u32 i = 0; i < ROSE_GROUPS_MAX; i++) {
            if (is_subset_of(tbi.group_to_literal[i], lit_ids)) {
                squash_mask &= ~(1ULL << i);
            }
        }
        squash_mask |= unsquashable;
        tbi.rose_squash_masks[left] = squash_mask;
    }
}

static
void countFloatingLiterals(const RoseBuildImpl &tbi, u32 *total_count,
                           u32 *short_count) {
    *total_count = 0;
    *short_count = 0;
    for (const rose_literal_id &lit : tbi.literals.right | map_values) {
        if (lit.delay) {
            continue; /* delay id's are virtual-ish */
        }

        if (lit.table != ROSE_FLOATING) {
            continue; /* wrong table */
        }

        ++*total_count;
        if (lit.s.length() <= ANCHORED_REHOME_SHORT_LEN) {
            ++*short_count;
        }
    }
}

static
void rehomeAnchoredLiteral(RoseBuildImpl &tbi, const simple_anchored_info &sai,
                           const set<u32> &lit_ids) {
    /* TODO: verify that vertices only have a single literal at the moment */

    DEBUG_PRINTF("rehoming ^.{%u,%u}%s\n", sai.min_bound, sai.max_bound,
                 dumpString(sai.literal).c_str());

    /* Get a floating literal corresponding to the anchored literal */
    u32 new_literal_id = tbi.getLiteralId(sai.literal, 0, ROSE_FLOATING);
    rose_literal_info &new_lit_info = tbi.literal_info[new_literal_id];
    DEBUG_PRINTF("floating literal id -> %u\n", new_literal_id);

    for (u32 lit_id : lit_ids) {
        rose_literal_info &old_lit_info = tbi.literal_info[lit_id];
        assert(old_lit_info.delayed_ids.empty());

        for (auto v : old_lit_info.vertices) {
            /* Transfer vertex over to new literal id */
            assert(tbi.g[v].literals.size() == 1);
            tbi.g[v].literals.clear();
            tbi.g[v].literals.insert(new_literal_id);
            new_lit_info.vertices.insert(v);

            /* ensure bounds on the vertex's in-edge are correct */
            assert(in_degree(v, tbi.g) == 1);
            const RoseEdge &e = *in_edges(v, tbi.g).first;
            assert(tbi.g[e].minBound == sai.min_bound + sai.literal.length());
            assert(tbi.g[e].maxBound == sai.max_bound + sai.literal.length());
            tbi.g[e].minBound = sai.min_bound;
            tbi.g[e].maxBound = sai.max_bound;
        }

        /* mark the old literal as empty */
        old_lit_info.vertices.clear();
    }
}
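
/* Rehoming only kicks in for heavy floating tables: at least
 * ANCHORED_REHOME_MIN_FLOATING (800) floating literals, or
 * ANCHORED_REHOME_MIN_FLOATING_SHORT (50) short ones. A simple anchored
 * literal ^.{min,max}lit is then moved when max >= ANCHORED_REHOME_DEEP (25)
 * and the literal is at least ANCHORED_REHOME_SHORT_LEN + 1 bytes long (one
 * byte less when the table already holds ANCHORED_REHOME_ALLOW_SHORT (20)
 * or more short literals). */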

static
void rehomeAnchoredLiterals(RoseBuildImpl &tbi) {
    /* if we have many literals in the floating table, we want to push
     * literals which are anchored but deep into the floating table as they
     * are unlikely to reduce the performance of the floating table. */

    u32 total_count;
    u32 short_count;
    countFloatingLiterals(tbi, &total_count, &short_count);

    DEBUG_PRINTF("considering rehoming options\n");

    if (total_count < ANCHORED_REHOME_MIN_FLOATING
        && short_count < ANCHORED_REHOME_MIN_FLOATING_SHORT) {
        DEBUG_PRINTF("not a heavy case %u %u\n", total_count, short_count);
        return;
    }

    u32 min_rehome_len = ANCHORED_REHOME_SHORT_LEN + 1;
    if (short_count >= ANCHORED_REHOME_ALLOW_SHORT) {
        min_rehome_len--;
    }

    for (map<simple_anchored_info, set<u32>>::iterator it
             = tbi.anchored_simple.begin();
         it != tbi.anchored_simple.end();) {
        if (it->first.max_bound < ANCHORED_REHOME_DEEP
            || it->first.literal.length() < min_rehome_len) {
            ++it;
            continue;
        }

        rehomeAnchoredLiteral(tbi, it->first, it->second);
        tbi.anchored_simple.erase(it++);
    }
}

/** \brief Maximum number of single-byte literals to add to the small block
 * table. */
static const size_t MAX_1BYTE_SMALL_BLOCK_LITERALS = 20;
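
/* The small block table is a separate literal matcher used only for scans
 * that fit entirely within ROSE_SMALL_BLOCK_LEN bytes. Simple anchored
 * literals of the form ^.{min,max}lit can be added to it with their bounds
 * folded either into the Report structure (the direct-report case) or into
 * an edge from the anchored root. */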

static
void addSmallBlockLiteral(RoseBuildImpl &tbi, const simple_anchored_info &sai,
                          const set<u32> &lit_ids) {
    DEBUG_PRINTF("anchored ^.{%u,%u}%s\n", sai.min_bound, sai.max_bound,
                 dumpString(sai.literal).c_str());

    u32 lit_id = tbi.getLiteralId(sai.literal, 0, ROSE_ANCHORED_SMALL_BLOCK);
    rose_literal_info &lit_info = tbi.literal_info[lit_id];
    DEBUG_PRINTF("anchored small block literal id -> %u\n", lit_id);

    RoseGraph &g = tbi.g;
    const RoseVertex anchored_root = tbi.anchored_root;

    for (u32 old_id : lit_ids) {
        assert(old_id < tbi.literal_info.size());
        const rose_literal_info &li = tbi.literal_info[old_id];

        // For compile determinism, operate over literal vertices in index
        // order.
        vector<RoseVertex> lit_verts(begin(li.vertices), end(li.vertices));
        sort(begin(lit_verts), end(lit_verts), VertexIndexComp(g));

        for (auto lit_v : lit_verts) {
            // Clone vertex with the new literal ID.
            RoseVertex v = add_vertex(g[lit_v], g);
            g[v].idx = tbi.vertexIndex++;
            g[v].literals.clear();
            g[v].literals.insert(lit_id);
            g[v].min_offset = sai.min_bound + sai.literal.length();
            g[v].max_offset = sai.max_bound + sai.literal.length();
            lit_info.vertices.insert(v);

            assert(!g[v].reports.empty());

            bool doDirectReports = true;
            for (ReportID report_id : g[v].reports) {
                const Report &old_rep = tbi.rm.getReport(report_id);
                if (!isExternalReport(old_rep) || old_rep.hasBounds()) {
                    doDirectReports = false;
                    break;
                }
            }

            if (doDirectReports) {
                flat_set<ReportID> dr_reports;
                for (ReportID report_id : g[v].reports) {
                    // These new literal roles can be made direct reports,
                    // with their bounds handled by the bounds on their Report
                    // structures.
                    Report rep(tbi.rm.getReport(report_id)); // copy
                    assert(!rep.hasBounds());
                    rep.minOffset = sai.literal.length() + sai.min_bound;
                    rep.maxOffset = sai.literal.length() + sai.max_bound;
                    dr_reports.insert(tbi.rm.getInternalId(rep));
                }
                g[v].reports = dr_reports;

                RoseEdge e = add_edge(tbi.root, v, g).first;
                g[e].minBound = 0; // handled by internal_report
                g[e].maxBound = ROSE_BOUND_INF; // handled by internal_report
            } else {
                // If we have a complex internal report, these must become
                // anchored literals with their own roles.
                RoseEdge e = add_edge(anchored_root, v, g).first;
                g[e].minBound = sai.min_bound;
                g[e].maxBound = sai.max_bound;
            }
        }
    }
}

static
void addSmallBlockLiteral(RoseBuildImpl &tbi, const ue2_literal &lit,
                          const flat_set<ReportID> &reports) {
    DEBUG_PRINTF("lit %s, reports: %s\n", dumpString(lit).c_str(),
                 as_string_list(reports).c_str());
    assert(!reports.empty());

    u32 lit_id = tbi.getLiteralId(lit, 0, ROSE_ANCHORED_SMALL_BLOCK);
    assert(lit_id < tbi.literal_info.size());
    rose_literal_info &lit_info = tbi.literal_info[lit_id];

    RoseGraph &g = tbi.g;

    RoseVertex v = add_vertex(g);
    g[v].idx = tbi.vertexIndex++;
    g[v].literals.insert(lit_id);
    g[v].reports = reports;

    RoseEdge e = add_edge(tbi.root, v, g).first;
    g[e].minBound = 0;
    g[e].maxBound = ROSE_BOUND_INF;
    g[v].min_offset = 1;
    g[v].max_offset = ROSE_BOUND_INF;
    lit_info.vertices.insert(v);
}

static
bool stateIsSEPLiteral(const dstate_id_t &s, const symbol_t &sym,
                       const raw_dfa &rdfa) {
    const dstate &ds = rdfa.states[s];
    if (!ds.reports_eod.empty() || ds.reports.empty()) {
        DEBUG_PRINTF("badly formed reports\n");
        return false;
    }

    DEBUG_PRINTF("examine state %u reached by sym %u\n", s, sym);

    for (symbol_t i = 0; i < rdfa.getImplAlphaSize(); i++) {
        const auto &s_next = ds.next[i];
        DEBUG_PRINTF("state %u -> %u on sym %u\n", s, s_next, i);
        if (s_next == DEAD_STATE) {
            continue; // dead, probably pruned
        } else if (s_next == s && i == sym) {
            continue; // self loop on same symbol
        } else if (s_next == rdfa.start_floating) {
            continue; // return to floating start
        }

        // We don't handle any other transitions.
        DEBUG_PRINTF("not single-byte\n");
        return false;
    }

    return true;
}

static
bool extractSEPLiterals(const raw_dfa &rdfa,
                        map<ue2_literal, flat_set<ReportID>> &lits_out) {
    if (rdfa.start_floating == DEAD_STATE) {
        DEBUG_PRINTF("not floating?\n");
        return false;
    }
    if (rdfa.start_anchored != rdfa.start_floating) {
        DEBUG_PRINTF("not all floating?\n");
        return false;
    }

    map<flat_set<ReportID>, vector<symbol_t>> lits; // reports -> symbols

    const dstate &start = rdfa.states[rdfa.start_floating];
    const symbol_t alpha_size = rdfa.getImplAlphaSize();
    for (symbol_t i = 0; i < alpha_size; i++) {
        auto next = start.next[i];
        if (next == DEAD_STATE || next == rdfa.start_floating) {
            continue;
        }

        if (!stateIsSEPLiteral(next, i, rdfa)) {
            return false;
        }
        lits[rdfa.states[next].reports].push_back(i);
    }

    // Map from symbols back to character reachability.
    vector<CharReach> reach(alpha_size);
    for (u32 i = 0; i < N_CHARS; i++) {
        assert(rdfa.alpha_remap[i] < alpha_size);
        reach[rdfa.alpha_remap[i]].set(i);
    }

    for (const auto &m : lits) {
        const auto &reports = m.first;
        const auto &symbols = m.second;

        CharReach cr;
        for (const auto &sym : symbols) {
            cr |= reach[sym];
        }

        for (size_t i = cr.find_first(); i != cr.npos; i = cr.find_next(i)) {
            if (myisupper(i) && cr.test(mytolower(i))) {
                // ignore upper half of a nocase pair
                continue;
            }

            bool nocase = myislower(i) && cr.test(mytoupper(i));
            insert(&lits_out[ue2_literal((char)i, nocase)], reports);
        }
    }

    return true;
}
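
/* SEP extraction: an outfix that is exactly one byte wide and whose reports
 * are all simple-exhaustible behaves like a set of single-byte (possibly
 * caseless) literals, so it can be folded into the small block table instead
 * of running as a separate DFA engine. */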

static
bool extractSEPLiterals(const OutfixInfo &outfix, const ReportManager &rm,
                        map<ue2_literal, flat_set<ReportID>> &lits_out) {
    if (outfix.minWidth != depth(1) || outfix.maxWidth != depth(1)) {
        DEBUG_PRINTF("outfix must be fixed width of one\n");
        return false;
    }

    for (const auto &report_id : all_reports(outfix)) {
        const auto &report = rm.getReport(report_id);
        if (!isSimpleExhaustible(report)) {
            DEBUG_PRINTF("report id %u not simple exhaustible\n", report_id);
            return false;
        }
    }

    // SEP cases should always become DFAs, so that's the only extract code we
    // have implemented here.
    if (outfix.rdfa()) {
        return extractSEPLiterals(*outfix.rdfa(), lits_out);
    }

    DEBUG_PRINTF("cannot extract literals from outfix type\n");
    return false;
}

static
void addAnchoredSmallBlockLiterals(RoseBuildImpl &tbi) {
    if (tbi.cc.streaming) {
        DEBUG_PRINTF("not block mode\n");
        return;
    }
    if (!tbi.anchored_nfas.empty()) {
        DEBUG_PRINTF("anchored table is not purely literal\n");
        return;
    }

    // At the moment, we only use the small-block matcher if all our anchored
    // literals are direct reports (i.e. leaf nodes in the Rose graph).
    for (const set<u32> &lits : tbi.anchored_simple | map_values) {
        for (u32 lit_id : lits) {
            if (!tbi.isDirectReport(lit_id)) {
                DEBUG_PRINTF("not all anchored lits are direct reports\n");
                return;
            }
        }
    }

    vector<pair<simple_anchored_info, set<u32>>> anchored_lits;
    vector<OutfixInfo *> sep_outfixes;
    size_t oneByteLiterals = 0;

    for (const auto &e : tbi.anchored_simple) {
        const simple_anchored_info &sai = e.first;
        const set<u32> &lit_ids = e.second;

        if (sai.literal.length() + sai.min_bound > ROSE_SMALL_BLOCK_LEN) {
            DEBUG_PRINTF("skipping literal '%s' with min bound %u that cannot "
                         "match inside small block width\n",
                         dumpString(sai.literal).c_str(), sai.min_bound);
        }

        anchored_lits.push_back(make_pair(sai, lit_ids));
        if (sai.literal.length() == 1) {
            oneByteLiterals++;
        }
    }

    // Capture SEP outfixes as well, adding them as literals to the small
    // block table.
    map<ue2_literal, flat_set<ReportID>> sep_literals;
    for (OutfixInfo &oi : tbi.outfixes) {
        if (extractSEPLiterals(oi, tbi.rm, sep_literals)) {
            sep_outfixes.push_back(&oi);
        }
    }

    oneByteLiterals += sep_literals.size();
    DEBUG_PRINTF("%zu one-byte literals\n", oneByteLiterals);
    if (oneByteLiterals > MAX_1BYTE_SMALL_BLOCK_LITERALS) {
        DEBUG_PRINTF("too many one-byte literals, not building small block "
                     "table!\n");
        return;
    }

    for (const auto &e : tbi.anchored_simple) {
        const simple_anchored_info &sai = e.first;
        const set<u32> &lit_ids = e.second;
        addSmallBlockLiteral(tbi, sai, lit_ids);
    }

    for (const auto &m : sep_literals) {
        addSmallBlockLiteral(tbi, m.first, m.second);
    }

    for (OutfixInfo *oi : sep_outfixes) {
        assert(oi);
        oi->in_sbmatcher = true;
    }
}
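
/* The helpers below are pure sanity checks, referenced only from assertions
 * in buildRose and compiled out of release (NDEBUG) builds. */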

#ifndef NDEBUG
static
bool historiesAreValid(const RoseGraph &g) {
    for (const auto &e : edges_range(g)) {
        if (g[e].history == ROSE_ROLE_HISTORY_INVALID) {
            DEBUG_PRINTF("edge [%zu,%zu] has invalid history\n",
                         g[source(e, g)].idx, g[target(e, g)].idx);
            return false;
        }
    }

    return true;
}

/**
 * Assertion: Returns true if we have a reference hanging around to a vertex
 * that no longer exists in the graph.
 */
static
bool danglingVertexRef(RoseBuildImpl &tbi) {
    RoseGraph::vertex_iterator vi, ve;
    tie(vi, ve) = vertices(tbi.g);
    const ue2::unordered_set<RoseVertex> valid_vertices(vi, ve);

    if (!contains(valid_vertices, tbi.anchored_root)) {
        DEBUG_PRINTF("anchored root vertex %p not in graph\n",
                     tbi.anchored_root);
        return true;
    }

    for (const auto &e : tbi.ghost) {
        if (!contains(valid_vertices, e.first)) {
            DEBUG_PRINTF("ghost key vertex %p not in graph\n", e.first);
            return true;
        }
        if (!contains(valid_vertices, e.second)) {
            DEBUG_PRINTF("ghost value vertex %p not in graph\n", e.second);
            return true;
        }
    }

    return false;
}

static
bool roleOffsetsAreValid(const RoseGraph &g) {
    for (auto v : vertices_range(g)) {
        if (g[v].min_offset >= ROSE_BOUND_INF) {
            DEBUG_PRINTF("invalid min_offset for role %zu\n", g[v].idx);
            return false;
        }
        if (g[v].min_offset > g[v].max_offset) {
            DEBUG_PRINTF("min_offset > max_offset for %zu\n", g[v].idx);
            return false;
        }
    }
    return true;
}

static UNUSED
bool hasOrphanedTops(const RoseBuildImpl &tbi) {
    const RoseGraph &g = tbi.g;

    ue2::unordered_map<left_id, set<u32>> roses;
    ue2::unordered_map<suffix_id, set<u32>> suffixes;

    for (auto v : vertices_range(g)) {
        if (g[v].left) {
            set<u32> &tops = roses[g[v].left];
            if (tbi.isRootSuccessor(v)) {
                // Prefix, has only one top.
                tops.insert(0);
            } else {
                // Tops for infixes come from the in-edges.
                for (const auto &e : in_edges_range(v, g)) {
                    tops.insert(g[e].rose_top);
                }
            }
        }
        if (g[v].suffix) {
            suffixes[g[v].suffix].insert(g[v].suffix.top);
        }
    }

    for (const auto &e : roses) {
        if (all_tops(e.first) != e.second) {
            DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n",
                         as_string_list(all_tops(e.first)).c_str(),
                         as_string_list(e.second).c_str());
            return true;
        }
    }

    for (const auto &e : suffixes) {
        if (all_tops(e.first) != e.second) {
            DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n",
                         as_string_list(all_tops(e.first)).c_str(),
                         as_string_list(e.second).c_str());
            return true;
        }
    }

    return false;
}
#endif // NDEBUG
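
/* buildRose is the top-level driver for this file: it runs graph conversion
 * and reduction passes, merges engines, assigns groups, history schemes and
 * final literal ids, and finally hands off to buildFinalEngine to emit the
 * Rose bytecode. */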

aligned_unique_ptr<RoseEngine> RoseBuildImpl::buildRose(u32 minWidth) {
    dumpRoseGraph(*this, nullptr, "rose_early.dot");

    // Early check for Rose implementability.
    assert(canImplementGraphs(*this));

    // Sanity check vertex role offsets.
    assert(roleOffsetsAreValid(g));

    convertPrefixToBounds(*this);

    // Turn flood-prone suffixes into suffix NFAs.
    convertFloodProneSuffixes(*this);

    // Turn repeats into Castle prototypes.
    makeCastles(*this);

    rehomeAnchoredLiterals(*this);

    // If we've got a very small number of EOD-anchored literals, consider
    // moving them into the floating table so that we only have one literal
    // matcher to run. Note that this needs to happen before
    // addAnchoredSmallBlockLiterals as it may create anchored literals.
    assert(roleOffsetsAreValid(g));
    stealEodVertices(*this);

    addAnchoredSmallBlockLiterals(*this);

    // Merge duplicate leaf nodes
    dedupeSuffixes(*this);
    if (cc.grey.roseGraphReduction) {
        mergeDupeLeaves(*this);
        uncalcLeaves(*this);
    }

    assert(roleOffsetsAreValid(g));

    handleMixedSensitivity();
    assignHistories(*this);
    convertAnchPrefixToBounds(*this);

    // Do some final graph reduction.
    dedupeLeftfixes(*this);
    aliasRoles(*this, false); // Don't merge leftfixes.
    dedupeLeftfixes(*this);

    convertBadLeaves(*this);
    uncalcLeaves(*this);

    /* note the leftfixes which do not need to keep state across stream
     * boundaries */
    findTransientLeftfixes();

    dedupeLeftfixesVariableLag(*this);
    mergeLeftfixesVariableLag(*this);
    mergeSmallLeftfixes(*this);
    mergeCastleLeftfixes(*this);

    // Do a rose-merging aliasing pass.
    aliasRoles(*this, true);

    // Merging of suffixes _below_ role aliasing, as otherwise we'd have to
    // teach role aliasing about suffix tops.
    mergeCastleSuffixes(*this);
    mergePuffixes(*this);
    mergeAcyclicSuffixes(*this);
    mergeSmallSuffixes(*this);

    // Convert Castles that would be better off as NFAs back to NGHolder
    // infixes/suffixes.
    if (unmakeCastles(*this)) {
        // We may be able to save some stream state by merging the newly
        // "unmade" Castles.
        mergeSmallSuffixes(*this);
        mergeSmallLeftfixes(*this);
    }

    // Do a rose-merging aliasing pass.
    aliasRoles(*this, true);

    // Run a merge pass over the outfixes as well.
    mergeOutfixes(*this);

    assert(!danglingVertexRef(*this));

    findMoreLiteralMasks(*this);

    assignGroupsToLiterals(*this);
    assignGroupsToRoles(*this);
    findGroupSquashers(*this);

    /* final prep work */
    remapCastleTops(*this);
    allocateFinalLiteralId(*this);
    inspectRoseTops(*this);
    buildRoseSquashMasks(*this);

    rm.assignDkeys(this);

    /* transfer mpv outfix to main queue */
    if (mpv_outfix) {
        outfixes.push_back(move(*mpv_outfix));
        mpv_outfix = nullptr;
    }

    assert(canImplementGraphs(*this));
    assert(!hasOrphanedTops(*this));
    assert(roleOffsetsAreValid(g));
    assert(historiesAreValid(g));

    dumpRoseGraph(*this, nullptr, "rose_pre_norm.dot");

    return buildFinalEngine(minWidth);
}

} // namespace ue2