/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "rose_build_impl.h"

#include "hwlm/hwlm_literal.h"
#include "nfa/castlecompile.h"
#include "nfa/goughcompile.h"
#include "nfa/mcclellancompile_util.h"
#include "nfa/nfa_api.h"
#include "nfa/rdfa.h"
#include "nfa/tamaramacompile.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_limex.h"
#include "nfagraph/ng_reports.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_util.h"
#include "nfagraph/ng_width.h"
#include "smallwrite/smallwrite_build.h"
#include "util/alloc.h"
#include "util/boundary_reports.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2string.h"
#include "util/verify_types.h"
#include "ue2common.h"
#include "grey.h"

// NOTE(review): the reviewed copy of this file had two bare "#include"
// directives here with their <...> targets missing. They are restored to the
// Boost headers this file appears to need (hash_combine below); please confirm
// against the canonical source.
#include <boost/functional/hash/hash_fwd.hpp>
#include <boost/graph/breadth_first_search.hpp>

using namespace std;
using boost::hash_combine;

namespace ue2 {

// just to get it out of the header
RoseBuild::~RoseBuild() { }

/**
 * \brief Construct the builder and create the two start vertices (root and
 * anchored_root) in the Rose graph, both pinned to offset 0.
 */
RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, SomSlotManager &ssm_in,
                             SmallWriteBuild &smwr_in,
                             const CompileContext &cc_in,
                             const BoundaryReports &boundary_in)
    : cc(cc_in), root(add_vertex(g)), anchored_root(add_vertex(g)),
      hasSom(false), group_end(0), ematcher_region_size(0),
      eod_event_literal_id(MO_INVALID_IDX),
      max_rose_anchored_floating_overlap(0), rm(rm_in), ssm(ssm_in),
      smwr(smwr_in), boundary(boundary_in), next_nfa_report(0) {
    // add root vertices to graph
    g[root].min_offset = 0;
    g[root].max_offset = 0;

    g[anchored_root].min_offset = 0;
    g[anchored_root].max_offset = 0;
}

RoseBuildImpl::~RoseBuildImpl() {
    // empty
}

/** \brief True if this vertex has neither a suffix nor a leftfix engine. */
bool RoseVertexProps::isBoring(void) const {
    return !suffix && !left;
}

/** \brief True if min and max offsets are equal and the bound is finite. */
bool RoseVertexProps::fixedOffset(void) const {
    assert(min_offset <= max_offset); /* ensure offsets calculated */
    return max_offset == min_offset && max_offset != ROSE_BOUND_INF;
}

/** \brief True if at least one predecessor of \a v is a start vertex. */
bool RoseBuildImpl::isRootSuccessor(const RoseVertex &v) const {
    for (auto u : inv_adjacent_vertices_range(v, g)) {
        if (isAnyStart(u)) {
            return true;
        }
    }
    return false;
}

/** \brief True if at least one predecessor of \a v is not a start vertex. */
bool RoseBuildImpl::isNonRootSuccessor(const RoseVertex &v) const {
    for (auto u : inv_adjacent_vertices_range(v, g)) {
        if (!isAnyStart(u)) {
            return true;
        }
    }
    return false;
}

/** \brief True if any out-edge of \a v has ROSE_ROLE_HISTORY_ANCH history. */
bool hasAnchHistorySucc(const RoseGraph &g, RoseVertex v) {
    for (const auto &e : out_edges_range(v, g)) {
        if (g[e].history == ROSE_ROLE_HISTORY_ANCH) {
            return true;
        }
    }

    return false;
}

/**
 * \brief True if any out-edge of \a v has ROSE_ROLE_HISTORY_LAST_BYTE
 * history.
 */
bool hasLastByteHistorySucc(const RoseGraph &g, RoseVertex v) {
    for (const auto &e : out_edges_range(v, g)) {
        if (g[e].history == ROSE_ROLE_HISTORY_LAST_BYTE) {
            return true;
        }
    }

    return false;
}

/**
 * \brief True if the literals of vertex \a v live in literal table \a table.
 *
 * Vertices without literals are never considered to be in any table.
 */
static
bool isInTable(const RoseBuildImpl &tbi, RoseVertex v,
               rose_literal_table table) {
    const auto &lit_ids = tbi.g[v].literals;
    if (lit_ids.empty()) {
        return false; // special role with no literals
    }

    // All literals for a given vertex will be in the same table, so we need
    // only inspect the first one.
    const auto lit_table = tbi.literals.right.at(*lit_ids.begin()).table;

#ifndef NDEBUG
    // Verify that all literals for this vertex are in the same table.
    for (auto lit_id : lit_ids) {
        assert(tbi.literals.right.at(lit_id).table == lit_table);
    }
#endif

    return lit_table == table;
}

bool RoseBuildImpl::isAnchored(RoseVertex v) const {
    return isInTable(*this, v, ROSE_ANCHORED);
}

bool RoseBuildImpl::isFloating(RoseVertex v) const {
    return isInTable(*this, v, ROSE_FLOATING);
}

bool RoseBuildImpl::isInETable(RoseVertex v) const {
    return isInTable(*this, v, ROSE_EOD_ANCHORED);
}

bool RoseBuildImpl::hasLiteralInTable(RoseVertex v,
                                      enum rose_literal_table t) const {
    return isInTable(*this, v, t);
}

/* Indicates that the floating table (if it exists) will be only run
   conditionally based on matches from the anchored table. */
bool RoseBuildImpl::hasNoFloatingRoots() const {
    for (auto v : adjacent_vertices_range(root, g)) {
        if (isFloating(v)) {
            DEBUG_PRINTF("direct floating root %zu\n", g[v].index);
            return false;
        }
    }

    /* need to check if the anchored_root has any literals which are too deep */
    for (auto v : adjacent_vertices_range(anchored_root, g)) {
        if (isFloating(v)) {
            DEBUG_PRINTF("indirect floating root %zu\n", g[v].index);
            return false;
        }
    }

    return true;
}

/**
 * \brief Largest elength() over the literals of \a v. The vertex must have
 * at least one literal.
 */
size_t RoseBuildImpl::maxLiteralLen(RoseVertex v) const {
    const auto &lit_ids = g[v].literals;
    assert(!lit_ids.empty());

    size_t maxlen = 0;

    for (const auto &lit_id : lit_ids) {
        maxlen = max(maxlen, literals.right.at(lit_id).elength());
    }

    return maxlen;
}

/**
 * \brief Smallest elength() over the literals of \a v. The vertex must have
 * at least one literal.
 */
size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const {
    const auto &lit_ids = g[v].literals;
    assert(!lit_ids.empty());

    size_t minlen = ROSE_BOUND_INF;

    for (const auto &lit_id : lit_ids) {
        minlen = min(minlen, literals.right.at(lit_id).elength());
    }

    return minlen;
}

// RoseBuild factory
// NOTE(review): the template arguments on unique_ptr/make_unique were missing
// in the reviewed copy; restored from the constructor arguments and the
// "RoseBuild factory" comment -- confirm against the header declaration.
unique_ptr<RoseBuild> makeRoseBuilder(ReportManager &rm,
                                      SomSlotManager &ssm,
                                      SmallWriteBuild &smwr,
                                      const CompileContext &cc,
                                      const BoundaryReports &boundary) {
    return ue2::make_unique<RoseBuildImpl>(rm, ssm, smwr, cc, boundary);
}

/** \brief Size recorded in the engine header; \a t must not be null. */
size_t roseSize(const RoseEngine *t) {
    assert(t);
    return t->size;
}

/**
 * \brief True if the engine's runtime implementation is the pure-literal
 * one; \a t must not be null.
 */
bool roseIsPureLiteral(const RoseEngine *t) {
    assert(t); // same precondition as roseSize(): t is dereferenced below
    return t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL;
}

// Returns non-zero max overlap len if a suffix of the literal 'a' overlaps
// with a prefix of the literal 'b' or 'a' can be contained in 'b'.
size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay) { /* overly conservative if only part of the string is nocase */ bool nocase = a.any_nocase() || b.any_nocase(); DEBUG_PRINTF("max overlap %s %s+%u %d\n", dumpString(a).c_str(), dumpString(b).c_str(), b_delay, (int)nocase); size_t a_len = a.length(); size_t b_len = b.length(); const char *a_end = a.c_str() + a_len; const char *b_end = b.c_str() + b_len; if (b_delay >= a_len) { return b_len + b_delay; } else if (b_delay) { /* a can be a substring of b which overlaps some of the end dots * OR b can be a substring near the end of a */ /* ignore overlap due to the final trailing dot as delayed literals * are delivered before undelayed */ for (u32 j = b_delay - 1; j > 0; j--) { if (b_len + j >= a_len) { if (!cmp(a.c_str(), b_end + j - a_len, a_len - j, nocase)) { return b_len + j; } } else { if (!cmp(a_end - j - b_len, b.c_str(), b_len, nocase)) { return b_len + j; } } } } return maxStringOverlap(a.get_string(), b.get_string(), nocase); } // Returns non-zero max overlap len if a suffix of the literal ID 'a' overlaps // with a prefix of the literal ID 'b' or 'a' can be contained in 'b'. size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b) { assert(!a.delay); return maxOverlap(a.s, b.s, b.delay); } static const rose_literal_id &getOverlapLiteral(const RoseBuildImpl &tbi, u32 literal_id) { map::const_iterator it = tbi.anchoredLitSuffix.find(literal_id); if (it != tbi.anchoredLitSuffix.end()) { return it->second; } return tbi.literals.right.at(literal_id); } ue2_literal findNonOverlappingTail(const set &lits, const ue2_literal &s) { size_t max_overlap = 0; for (const auto &lit : lits) { size_t overlap = lit != s ? 
maxStringOverlap(lit, s) : maxStringSelfOverlap(s); max_overlap = max(max_overlap, overlap); } /* find the tail that doesn't overlap */ ue2_literal tail = s.substr(max_overlap); DEBUG_PRINTF("%zu overlap, tail: '%s'\n", max_overlap, dumpString(tail).c_str()); return tail; } size_t RoseBuildImpl::maxLiteralOverlap(RoseVertex u, RoseVertex v) const { size_t overlap = 0; for (auto u_lit_id : g[u].literals) { const rose_literal_id &ul = getOverlapLiteral(*this, u_lit_id); for (auto v_lit_id : g[v].literals) { const rose_literal_id &vl = getOverlapLiteral(*this, v_lit_id); overlap = max(overlap, maxOverlap(ul, vl)); } } return overlap; } void RoseBuildImpl::removeVertices(const vector &dead) { for (auto v : dead) { assert(!isAnyStart(v)); DEBUG_PRINTF("removing vertex %zu\n", g[v].index); for (auto lit_id : g[v].literals) { literal_info[lit_id].vertices.erase(v); } clear_vertex(v, g); remove_vertex(v, g); } renumber_vertices(g); } // Find the maximum bound on the edges to this vertex's successors ignoring // those via infixes. u32 RoseBuildImpl::calcSuccMaxBound(RoseVertex u) const { u32 maxBound = 0; for (const auto &e : out_edges_range(u, g)) { RoseVertex v = target(e, g); if (g[v].left) { continue; } u32 thisBound = g[e].maxBound; if (thisBound == ROSE_BOUND_INF) { return ROSE_BOUND_INF; } if (!g[v].eod_accept) { // Add the length of the longest of our literals. 
thisBound += maxLiteralLen(v); } maxBound = max(maxBound, thisBound); } assert(maxBound <= ROSE_BOUND_INF); return maxBound; } u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay, rose_literal_table table) { DEBUG_PRINTF("getting id for %s\n", dumpString(s).c_str()); assert(table != ROSE_ANCHORED); rose_literal_id key(s, table, delay); u32 numLiterals = verify_u32(literals.left.size()); RoseLiteralMap::iterator it; bool inserted; tie(it, inserted) = literals.insert(RoseLiteralMap::value_type(key, numLiterals)); u32 id = it->right; if (inserted) { literal_info.push_back(rose_literal_info()); assert(literal_info.size() == id + 1); if (delay) { u32 undelayed_id = getLiteralId(s, 0, table); literal_info[id].undelayed_id = undelayed_id; literal_info[undelayed_id].delayed_ids.insert(id); } else { literal_info[id].undelayed_id = id; } } return id; } // Function that operates on a msk/cmp pair and a literal, as used in // hwlmLiteral, and zeroes msk elements that don't add any power to the // literal. void normaliseLiteralMask(const ue2_literal &s_in, vector &msk, vector &cmp) { assert(msk.size() == cmp.size()); assert(msk.size() <= HWLM_MASKLEN); if (msk.empty()) { return; } // Work over a caseless copy if the string contains nocase chars. This will // ensure that we treat masks designed to handle mixed-sensitivity literals // correctly: these will be matched by the literal matcher in caseless // mode, with the mask used to narrow the matches. ue2_literal s(s_in); if (s.any_nocase()) { make_nocase(&s); } ue2_literal::const_reverse_iterator it = s.rbegin(), ite = s.rend(); size_t i = msk.size(); while (i-- != 0 && it != ite) { const CharReach &cr = *it; for (size_t c = cr.find_first(); c != CharReach::npos; c = cr.find_next(c)) { if (((u8)c & msk[i]) != cmp[i]) { goto skip; } } // If we didn't jump out of the loop to skip, then this mask position // doesn't further narrow the set of acceptable literals from those // accepted by s. So we can zero this element. 
msk[i] = 0; cmp[i] = 0; skip: ++it; } // Wipe out prefix zeroes. while (!msk.empty() && msk[0] == 0) { msk.erase(msk.begin()); cmp.erase(cmp.begin()); } } rose_literal_id::rose_literal_id(const ue2_literal &s_in, const vector &msk_in, const vector &cmp_in, rose_literal_table table_in, u32 delay_in) : s(s_in), msk(msk_in), cmp(cmp_in), table(table_in), delay(delay_in), distinctiveness(0) { assert(msk.size() == cmp.size()); assert(msk.size() <= HWLM_MASKLEN); assert(delay <= MAX_DELAY); normaliseLiteralMask(s, msk, cmp); } u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector &msk, const vector &cmp, u32 delay, rose_literal_table table) { DEBUG_PRINTF("getting id for %s\n", dumpString(s).c_str()); assert(table != ROSE_ANCHORED); rose_literal_id key(s, msk, cmp, table, delay); u32 numLiterals = verify_u32(literals.left.size()); /* ue2_literals are always uppercased if nocase and must have an * alpha char */ RoseLiteralMap::iterator it; bool inserted; tie(it, inserted) = literals.insert( RoseLiteralMap::value_type(key, numLiterals)); u32 id = it->right; if (inserted) { literal_info.push_back(rose_literal_info()); assert(literal_info.size() == id + 1); if (delay) { u32 undelayed_id = getLiteralId(s, msk, cmp, 0, table); literal_info[id].undelayed_id = undelayed_id; literal_info[undelayed_id].delayed_ids.insert(id); } else { literal_info[id].undelayed_id = id; } } return id; } bool RoseBuildImpl::hasLiteral(const ue2_literal &s, rose_literal_table table) const { DEBUG_PRINTF("looking if %s exists\n", dumpString(s).c_str()); assert(table != ROSE_ANCHORED); for (RoseLiteralMap::left_map::const_iterator it = literals.left.lower_bound(rose_literal_id(s, table, 0)); it != literals.left.end(); ++it) { if (it->first.table != table || it->first.s != s) { break; } const rose_literal_info &info = literal_info[it->second]; if (!info.vertices.empty()) { return true; } } DEBUG_PRINTF("(used) literal not found\n"); return false; } u32 RoseBuildImpl::getNewLiteralId() { 
rose_literal_id key(ue2_literal(), ROSE_ANCHORED, 0); u32 numLiterals = verify_u32(literals.left.size()); key.distinctiveness = numLiterals; RoseLiteralMap::iterator it; bool inserted; tie(it, inserted) = literals.insert(RoseLiteralMap::value_type(key, numLiterals)); u32 id = it->right; assert(inserted); literal_info.push_back(rose_literal_info()); assert(literal_info.size() == id + 1); literal_info[id].undelayed_id = id; return id; } static bool requiresDedupe(const NGHolder &h, const ue2::flat_set &reports, const Grey &grey) { /* TODO: tighten */ NFAVertex seen_vert = NGHolder::null_vertex(); for (auto v : inv_adjacent_vertices_range(h.accept, h)) { if (has_intersection(h[v].reports, reports)) { if (seen_vert != NGHolder::null_vertex()) { return true; } seen_vert = v; } } for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { if (has_intersection(h[v].reports, reports)) { if (seen_vert != NGHolder::null_vertex()) { return true; } seen_vert = v; } } if (seen_vert) { /* if the reporting vertex is part of of a terminal repeat, the * construction process may reform the graph splitting it into two * vertices (pos, cyclic) and hence require dedupe */ vector repeats; findRepeats(h, grey.minExtBoundedRepeatSize, &repeats); for (const auto &repeat : repeats) { if (find(repeat.vertices.begin(), repeat.vertices.end(), seen_vert) != repeat.vertices.end()) { return true; } } } return false; } class RoseDedupeAuxImpl : public RoseDedupeAux { public: explicit RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in); bool requiresDedupeSupport( const ue2::flat_set &reports) const override; private: bool hasSafeMultiReports(const ue2::flat_set &reports) const; const RoseBuildImpl &tbi; map> vert_map; //!< ordinary literals map> sb_vert_map; //!< small block literals map> suffix_map; map> outfix_map; map> puff_map; unordered_set live_reports; //!< all live internal reports. 
}; unique_ptr RoseBuildImpl::generateDedupeAux() const { return ue2::make_unique(*this); } RoseDedupeAux::~RoseDedupeAux() { } RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in) : tbi(tbi_in) { const RoseGraph &g = tbi.g; set suffixes; for (auto v : vertices_range(g)) { insert(&live_reports, g[v].reports); // Literals in the small block table are "shadow" copies of literals in // the other tables that do not run in the same runtime invocation. // Dedupe key assignment will be taken care of by the real literals. if (tbi.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK)) { for (const auto &report_id : g[v].reports) { sb_vert_map[report_id].insert(v); } } else { for (const auto &report_id : g[v].reports) { vert_map[report_id].insert(v); } } // Several vertices may share a suffix, so we collect the set of // suffixes first to avoid repeating work. if (g[v].suffix) { suffixes.insert(g[v].suffix); } } for (const auto &suffix : suffixes) { for (const auto &report_id : all_reports(suffix)) { suffix_map[report_id].insert(suffix); live_reports.insert(report_id); } } for (const auto &outfix : tbi.outfixes) { for (const auto &report_id : all_reports(outfix)) { outfix_map[report_id].insert(&outfix); live_reports.insert(report_id); } } if (tbi.mpv_outfix) { auto *mpv = tbi.mpv_outfix->mpv(); for (const auto &puff : mpv->puffettes) { puff_map[puff.report].insert(&puff); live_reports.insert(puff.report); } for (const auto &puff : mpv->triggered_puffettes) { puff_map[puff.report].insert(&puff); live_reports.insert(puff.report); } } // Collect live reports from boundary reports. 
insert(&live_reports, tbi.boundary.report_at_0); insert(&live_reports, tbi.boundary.report_at_0_eod); insert(&live_reports, tbi.boundary.report_at_eod); DEBUG_PRINTF("%zu of %zu reports are live\n", live_reports.size(), tbi.rm.numReports()); } static vector makePath(const rose_literal_id &lit) { vector path(begin(lit.s), end(lit.s)); for (u32 i = 0; i < lit.delay; i++) { path.push_back(CharReach::dot()); } return path; } /** * \brief True if one of the given literals overlaps with the suffix of * another, meaning that they could arrive at the same offset. */ static bool literalsCouldRace(const rose_literal_id &lit1, const rose_literal_id &lit2) { DEBUG_PRINTF("compare %s (delay %u) and %s (delay %u)\n", dumpString(lit1.s).c_str(), lit1.delay, dumpString(lit2.s).c_str(), lit2.delay); // Add dots on the end of each literal for delay. const auto v1 = makePath(lit1); const auto v2 = makePath(lit2); // See if the smaller path is a suffix of the larger path. const auto *smaller = v1.size() < v2.size() ? &v1 : &v2; const auto *bigger = v1.size() < v2.size() ? &v2 : &v1; auto r = mismatch(smaller->rbegin(), smaller->rend(), bigger->rbegin(), overlaps); return r.first == smaller->rend(); } bool RoseDedupeAuxImpl::hasSafeMultiReports( const flat_set &reports) const { if (reports.size() <= 1) { return true; } /* We have more than one ReportID corresponding to the external ID that is * presented to the user. These may differ in offset adjustment, bounds * checks, etc. */ /* TODO: work out if these differences will actually cause problems */ /* One common case where we know we don't have a problem is if there are * precisely two reports, one for the main Rose path and one for the * "small block matcher" path. 
*/ if (reports.size() == 2) { ReportID id1 = *reports.begin(); ReportID id2 = *reports.rbegin(); bool has_verts_1 = contains(vert_map, id1); bool has_verts_2 = contains(vert_map, id2); bool has_sb_verts_1 = contains(sb_vert_map, id1); bool has_sb_verts_2 = contains(sb_vert_map, id2); if (has_verts_1 != has_verts_2 && has_sb_verts_1 != has_sb_verts_2) { DEBUG_PRINTF("two reports, one full and one small block: ok\n"); return true; } } DEBUG_PRINTF("more than one report\n"); return false; } bool RoseDedupeAuxImpl::requiresDedupeSupport( const flat_set &reports_in) const { /* TODO: this could be expanded to check for offset or character constraints */ // We don't want to consider dead reports (tracked by ReportManager but no // longer used) for the purposes of assigning dupe keys. flat_set reports; for (auto id : reports_in) { if (contains(live_reports, id)) { reports.insert(id); } } DEBUG_PRINTF("live reports: %s\n", as_string_list(reports).c_str()); const RoseGraph &g = tbi.g; bool has_suffix = false; bool has_outfix = false; if (!hasSafeMultiReports(reports)) { DEBUG_PRINTF("multiple reports not safe\n"); return true; } set roles; set suffixes; set outfixes; set puffettes; for (ReportID r : reports) { if (contains(vert_map, r)) { insert(&roles, vert_map.at(r)); } if (contains(suffix_map, r)) { insert(&suffixes, suffix_map.at(r)); } if (contains(outfix_map, r)) { insert(&outfixes, outfix_map.at(r)); } if (contains(puff_map, r)) { insert(&puffettes, puff_map.at(r)); } } /* roles */ map lits; // Literal ID -> count of occurrences. const bool has_role = !roles.empty(); for (auto v : roles) { for (const auto &lit : g[v].literals) { lits[lit]++; } if (g[v].eod_accept) { // Literals plugged into this EOD accept must be taken into account // as well. 
for (auto u : inv_adjacent_vertices_range(v, g)) { for (const auto &lit : g[u].literals) { lits[lit]++; } } } } /* literals */ for (const auto &m : lits) { if (m.second > 1) { DEBUG_PRINTF("lit %u used by >1 reporting roles\n", m.first); return true; } } for (auto it = begin(lits); it != end(lits); ++it) { const auto &lit1 = tbi.literals.right.at(it->first); for (auto jt = next(it); jt != end(lits); ++jt) { const auto &lit2 = tbi.literals.right.at(jt->first); if (literalsCouldRace(lit1, lit2)) { DEBUG_PRINTF("literals could race\n"); return true; } } } /* suffixes */ for (const auto &suffix : suffixes) { if (has_suffix || has_role) { return true; /* scope for badness */ } has_suffix = true; /* some lesser suffix engines (nfas, haig, castle) can raise multiple * matches for a report id at the same offset if there are multiple * report states live. */ if (suffix.haig()) { return true; } if (suffix.graph() && requiresDedupe(*suffix.graph(), reports, tbi.cc.grey)) { return true; } if (suffix.castle() && requiresDedupe(*suffix.castle(), reports)) { return true; } } /* outfixes */ for (const auto &outfix_ptr : outfixes) { assert(outfix_ptr); const OutfixInfo &out = *outfix_ptr; if (has_outfix || has_role || has_suffix) { return true; } has_outfix = true; if (out.haig()) { return true; /* haig may report matches with different SOM at the same offset */ } if (out.holder() && requiresDedupe(*out.holder(), reports, tbi.cc.grey)) { return true; } } /* mpv */ for (UNUSED const auto &puff : puffettes) { if (has_outfix || has_role || has_suffix) { return true; } has_outfix = true; } /* boundary */ if (has_intersection(tbi.boundary.report_at_eod, reports)) { if (has_outfix || has_role || has_suffix) { return true; } } return false; } bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) { ORDER_CHECK(minBound); ORDER_CHECK(maxBound); ORDER_CHECK(history); return false; } #ifndef NDEBUG bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) { const RoseGraph &g = 
build.g; assert(g[v].left); set graph_tops; if (!build.isRootSuccessor(v)) { for (const auto &e : in_edges_range(v, g)) { graph_tops.insert(g[e].rose_top); } } return is_subset_of(graph_tops, all_tops(g[v].left)); } #endif u32 OutfixInfo::get_queue(QueueIndexFactory &qif) { if (queue == ~0U) { queue = qif.get_queue(); } return queue; } namespace { class OutfixAllReports : public boost::static_visitor> { public: set operator()(const boost::blank &) const { return set(); } template set operator()(const unique_ptr &x) const { return all_reports(*x); } set operator()(const MpvProto &mpv) const { set reports; for (const auto &puff : mpv.puffettes) { reports.insert(puff.report); } for (const auto &puff : mpv.triggered_puffettes) { reports.insert(puff.report); } return reports; } }; } set all_reports(const OutfixInfo &outfix) { auto reports = boost::apply_visitor(OutfixAllReports(), outfix.proto); assert(!reports.empty()); return reports; } bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const { return top == b.top && graph == b.graph && castle == b.castle && rdfa == b.rdfa && haig == b.haig && tamarama == b.tamarama; } bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const { const RoseSuffixInfo &a = *this; ORDER_CHECK(top); ORDER_CHECK(graph); ORDER_CHECK(castle); ORDER_CHECK(haig); ORDER_CHECK(rdfa); ORDER_CHECK(tamarama); assert(a.dfa_min_width == b.dfa_min_width); assert(a.dfa_max_width == b.dfa_max_width); return false; } void RoseSuffixInfo::reset(void) { top = 0; graph.reset(); castle.reset(); rdfa.reset(); haig.reset(); tamarama.reset(); dfa_min_width = 0; dfa_max_width = depth::infinity(); } std::set all_reports(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.tamarama()) { return all_reports(*s.tamarama()); } else if (s.graph()) { return all_reports(*s.graph()); } else if (s.castle()) { return all_reports(*s.castle()); } else if (s.dfa()) { return all_reports(*s.dfa()); } else { return all_reports(*s.haig()); } 
} depth findMinWidth(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { return findMinWidth(*s.graph()); } else if (s.castle()) { return findMinWidth(*s.castle()); } else { return s.dfa_min_width; } } depth findMinWidth(const suffix_id &s, u32 top) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { return findMinWidth(*s.graph(), top); } else if (s.castle()) { return findMinWidth(*s.castle(), top); } else { return s.dfa_min_width; } } depth findMaxWidth(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { return findMaxWidth(*s.graph()); } else if (s.castle()) { return findMaxWidth(*s.castle()); } else { return s.dfa_max_width; } } depth findMaxWidth(const suffix_id &s, u32 top) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { return findMaxWidth(*s.graph(), top); } else if (s.castle()) { return findMaxWidth(*s.castle(), top); } else { return s.dfa_max_width; } } bool has_eod_accepts(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { /* ignore accept -> eod edge */ return in_degree(s.graph()->acceptEod, *s.graph()) > 1; } else if (s.castle()) { return false; } else if (s.dfa()) { return has_eod_accepts(*s.dfa()); } else { return has_eod_accepts(*s.haig()); } } bool has_non_eod_accepts(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { return in_degree(s.graph()->accept, *s.graph()); } else if (s.castle()) { return true; } else if (s.dfa()) { return has_non_eod_accepts(*s.dfa()); } else { return has_non_eod_accepts(*s.haig()); } } set all_tops(const suffix_id &s) { assert(s.graph() || s.castle() || s.haig() || s.dfa()); if (s.graph()) { flat_set tops = getTops(*s.graph()); assert(!tops.empty()); return {tops.begin(), tops.end()}; } if (s.castle()) { return assoc_keys(s.castle()->repeats); } // Other types of suffix are not multi-top. 
return {0}; } size_t suffix_id::hash() const { size_t val = 0; hash_combine(val, g); hash_combine(val, c); hash_combine(val, d); hash_combine(val, h); return val; } size_t hash_value(const suffix_id &s) { return s.hash(); } bool isAnchored(const left_id &r) { assert(r.graph() || r.castle() || r.haig() || r.dfa()); if (r.graph()) { return isAnchored(*r.graph()); } if (r.dfa()) { return r.dfa()->start_anchored == DEAD_STATE; } if (r.haig()) { return r.haig()->start_anchored == DEAD_STATE; } // All other types are explicitly anchored. return true; } depth findMinWidth(const left_id &r) { assert(r.graph() || r.castle() || r.haig() || r.dfa()); if (r.graph()) { return findMinWidth(*r.graph()); } else if (r.castle()) { return findMinWidth(*r.castle()); } else { return r.dfa_min_width; } } depth findMaxWidth(const left_id &r) { assert(r.graph() || r.castle() || r.haig() || r.dfa()); if (r.graph()) { return findMaxWidth(*r.graph()); } else if (r.castle()) { return findMaxWidth(*r.castle()); } else { return r.dfa_max_width; } } set all_tops(const left_id &r) { assert(r.graph() || r.castle() || r.haig() || r.dfa()); if (r.graph()) { flat_set tops = getTops(*r.graph()); return {tops.begin(), tops.end()}; } if (r.castle()) { return assoc_keys(r.castle()->repeats); } // Other types of rose are not multi-top. 
return {0}; } u32 num_tops(const left_id &r) { return all_tops(r).size(); } size_t left_id::hash() const { size_t val = 0; hash_combine(val, g); hash_combine(val, c); hash_combine(val, d); hash_combine(val, h); return val; } size_t hash_value(const left_id &r) { return r.hash(); } u64a findMaxOffset(const set &reports, const ReportManager &rm) { assert(!reports.empty()); u64a maxOffset = 0; for (const auto &report_id : reports) { const Report &ir = rm.getReport(report_id); if (ir.hasBounds()) { maxOffset = max(maxOffset, ir.maxOffset); } else { return MAX_OFFSET; } } return maxOffset; } void LeftEngInfo::reset(void) { graph.reset(); castle.reset(); dfa.reset(); haig.reset(); tamarama.reset(); lag = 0; leftfix_report = MO_INVALID_IDX; dfa_min_width = 0; dfa_max_width = depth::infinity(); } LeftEngInfo::operator bool() const { assert((int)!!castle + (int)!!dfa + (int)!!haig <= 1); assert(!castle || !graph); assert(!dfa || graph); /* dfas always have the graph as well */ assert(!haig || graph); return graph || castle || dfa || haig; } u32 roseQuality(const RoseEngine *t) { /* Rose is low quality if the atable is a Mcclellan 16 or has multiple DFAs */ const anchored_matcher_info *atable = getALiteralMatcher(t); if (atable) { if (atable->next_offset) { DEBUG_PRINTF("multiple atable engines\n"); return 0; } const NFA *nfa = (const NFA *)((const char *)atable + sizeof(*atable)); if (!isSmallDfaType(nfa->type)) { DEBUG_PRINTF("m16 atable engine\n"); return 0; } } /* if we always run multiple engines then we are slow */ u32 always_run = 0; if (atable) { always_run++; } if (t->eagerIterOffset) { /* eager prefixes are always run */ always_run++; } const HWLM *ftable = getFLiteralMatcher(t); if (ftable) { /* TODO: ignore conditional ftables, or ftables beyond smwr region */ always_run++; } if (t->ematcherOffset) { always_run++; } /* ignore mpv outfixes as they are v good, mpv outfixes are before begin */ if (t->outfixBeginQueue != t->outfixEndQueue) { /* TODO: ignore outfixes 
> smwr region */ always_run++; } bool eod_prefix = false; const LeftNfaInfo *left = getLeftTable(t); for (u32 i = 0; i < t->activeLeftCount; i++) { if (left->eod_check) { eod_prefix = true; break; } } if (eod_prefix) { always_run++; DEBUG_PRINTF("eod prefixes are slow"); return 0; } if (always_run > 1) { DEBUG_PRINTF("we always run %u engines\n", always_run); return 0; } return 1; } #ifndef NDEBUG /** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose * graph are implementable. */ bool canImplementGraphs(const RoseBuildImpl &tbi) { const RoseGraph &g = tbi.g; // First, check the Rose leftfixes. for (auto v : vertices_range(g)) { DEBUG_PRINTF("leftfix: check vertex %zu\n", g[v].index); if (g[v].left.castle) { DEBUG_PRINTF("castle ok\n"); continue; } if (g[v].left.dfa) { DEBUG_PRINTF("dfa ok\n"); continue; } if (g[v].left.haig) { DEBUG_PRINTF("haig ok\n"); continue; } if (g[v].left.graph) { assert(g[v].left.graph->kind == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX)); if (!isImplementableNFA(*g[v].left.graph, nullptr, tbi.cc)) { DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n", g[v].index, num_vertices(*g[v].left.graph)); return false; } } } // Suffix graphs. 
for (auto v : vertices_range(g)) { DEBUG_PRINTF("suffix: check vertex %zu\n", g[v].index); const RoseSuffixInfo &suffix = g[v].suffix; if (suffix.castle) { DEBUG_PRINTF("castle suffix ok\n"); continue; } if (suffix.rdfa) { DEBUG_PRINTF("dfa suffix ok\n"); continue; } if (suffix.haig) { DEBUG_PRINTF("haig suffix ok\n"); continue; } if (suffix.graph) { assert(suffix.graph->kind == NFA_SUFFIX); if (!isImplementableNFA(*suffix.graph, &tbi.rm, tbi.cc)) { DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n", g[v].index, num_vertices(*suffix.graph)); return false; } } } return true; } bool hasOrphanedTops(const RoseBuildImpl &build) { const RoseGraph &g = build.g; ue2::unordered_map > roses; ue2::unordered_map > suffixes; for (auto v : vertices_range(g)) { if (g[v].left) { set &tops = roses[g[v].left]; if (!build.isRootSuccessor(v)) { // Tops for infixes come from the in-edges. for (const auto &e : in_edges_range(v, g)) { tops.insert(g[e].rose_top); } } } if (g[v].suffix) { suffixes[g[v].suffix].insert(g[v].suffix.top); } } for (const auto &e : roses) { if (all_tops(e.first) != e.second) { DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n", as_string_list(all_tops(e.first)).c_str(), as_string_list(e.second).c_str()); return true; } } for (const auto &e : suffixes) { if (all_tops(e.first) != e.second) { DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n", as_string_list(all_tops(e.first)).c_str(), as_string_list(e.second).c_str()); return true; } } return false; } #endif // NDEBUG } // namespace ue2