vectorscan/src/rose/rose_build_misc.cpp
2024-05-13 13:04:57 +03:00

1044 lines
30 KiB
C++

/*
* Copyright (c) 2015-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "rose_build_misc.h"
#include "rose_build_impl.h"
#include "rose_build_resources.h"
#include "hwlm/hwlm_literal.h"
#include "nfa/castlecompile.h"
#include "nfa/goughcompile.h"
#include "nfa/mcclellancompile_util.h"
#include "nfa/nfa_api.h"
#include "nfa/rdfa.h"
#include "nfa/tamaramacompile.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_limex.h"
#include "nfagraph/ng_reports.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_util.h"
#include "nfagraph/ng_width.h"
#include "smallwrite/smallwrite_build.h"
#include "util/alloc.h"
#include "util/boundary_reports.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2string.h"
#include "util/verify_types.h"
#include "ue2common.h"
#include "grey.h"
#include <boost/graph/breadth_first_search.hpp>
using namespace std;
namespace ue2 {
// just to get it out of the header
RoseBuild::~RoseBuild() { }
RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in,
SomSlotManager &ssm_in,
SmallWriteBuild &smwr_in,
const CompileContext &cc_in,
const BoundaryReports &boundary_in)
: cc(cc_in),
root(add_vertex(g)),
anchored_root(add_vertex(g)),
hasSom(false),
group_end(0),
ematcher_region_size(0),
eod_event_literal_id(MO_INVALID_IDX),
max_rose_anchored_floating_overlap(0),
rm(rm_in),
ssm(ssm_in),
smwr(smwr_in),
boundary(boundary_in),
next_nfa_report(0) {
// add root vertices to graph
g[root].min_offset = 0;
g[root].max_offset = 0;
g[anchored_root].min_offset = 0;
g[anchored_root].max_offset = 0;
}
RoseBuildImpl::~RoseBuildImpl() {
// empty
}
bool RoseVertexProps::isBoring(void) const {
return !suffix && !left;
}
bool RoseVertexProps::fixedOffset(void) const {
assert(min_offset <= max_offset); /* ensure offsets calculated */
return max_offset == min_offset && max_offset != ROSE_BOUND_INF;
}
bool RoseBuildImpl::isRootSuccessor(const RoseVertex &v) const {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (isAnyStart(u)) {
return true;
}
}
return false;
}
bool RoseBuildImpl::isNonRootSuccessor(const RoseVertex &v) const {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (!isAnyStart(u)) {
return true;
}
}
return false;
}
bool hasAnchHistorySucc(const RoseGraph &g, RoseVertex v) {
for (const auto &e : out_edges_range(v, g)) {
if (g[e].history == ROSE_ROLE_HISTORY_ANCH) {
return true;
}
}
return false;
}
bool hasLastByteHistorySucc(const RoseGraph &g, RoseVertex v) {
for (const auto &e : out_edges_range(v, g)) {
if (g[e].history == ROSE_ROLE_HISTORY_LAST_BYTE) {
return true;
}
}
return false;
}
static
bool isInTable(const RoseBuildImpl &tbi, RoseVertex v,
rose_literal_table table) {
const auto &lit_ids = tbi.g[v].literals;
if (lit_ids.empty()) {
return false; // special role with no literals
}
// All literals for a given vertex will be in the same table, so we need
// only inspect the first one.
const auto lit_table = tbi.literals.at(*lit_ids.begin()).table;
// Verify that all literals for this vertex are in the same table.
assert(all_of_in(lit_ids, [&](u32 lit_id) {
return tbi.literals.at(lit_id).table == lit_table;
}));
return lit_table == table;
}
bool RoseBuildImpl::isAnchored(RoseVertex v) const {
return isInTable(*this, v, ROSE_ANCHORED);
}
bool RoseBuildImpl::isFloating(RoseVertex v) const {
return isInTable(*this, v, ROSE_FLOATING);
}
bool RoseBuildImpl::isInETable(RoseVertex v) const {
return isInTable(*this, v, ROSE_EOD_ANCHORED);
}
bool RoseBuildImpl::hasLiteralInTable(RoseVertex v,
enum rose_literal_table t) const {
return isInTable(*this, v, t);
}
/* Indicates that the floating table (if it exists) will be only run
conditionally based on matches from the anchored table. */
bool RoseBuildImpl::hasNoFloatingRoots() const {
for (auto v : adjacent_vertices_range(root, g)) {
if (isFloating(v)) {
DEBUG_PRINTF("direct floating root %zu\n", g[v].index);
return false;
}
}
/* need to check if the anchored_root has any literals which are too deep */
for (auto v : adjacent_vertices_range(anchored_root, g)) {
if (isFloating(v)) {
DEBUG_PRINTF("indirect floating root %zu\n", g[v].index);
return false;
}
}
return true;
}
size_t RoseBuildImpl::maxLiteralLen(RoseVertex v) const {
const auto &lit_ids = g[v].literals;
assert(!lit_ids.empty());
size_t maxlen = 0;
for (const auto &lit_id : lit_ids) {
maxlen = max(maxlen, literals.at(lit_id).elength());
}
return maxlen;
}
size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const {
const auto &lit_ids = g[v].literals;
assert(!lit_ids.empty());
size_t minlen = ROSE_BOUND_INF;
for (const auto &lit_id : lit_ids) {
minlen = min(minlen, literals.at(lit_id).elength());
}
return minlen;
}
// RoseBuild factory
unique_ptr<RoseBuild> makeRoseBuilder(ReportManager &rm,
SomSlotManager &ssm,
SmallWriteBuild &smwr,
const CompileContext &cc,
const BoundaryReports &boundary) {
return std::make_unique<RoseBuildImpl>(rm, ssm, smwr, cc, boundary);
}
bool roseIsPureLiteral(const RoseEngine *t) {
return t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL;
}
// Returns non-zero max overlap len if a suffix of the literal 'a' overlaps
// with a prefix of the literal 'b' or 'a' can be contained in 'b'.
size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay) {
/* overly conservative if only part of the string is nocase */
bool nocase = a.any_nocase() || b.any_nocase();
DEBUG_PRINTF("max overlap %s %s+%u %d\n", dumpString(a).c_str(),
dumpString(b).c_str(), b_delay, (int)nocase);
size_t a_len = a.length();
size_t b_len = b.length();
const char *a_end = a.c_str() + a_len;
const char *b_end = b.c_str() + b_len;
if (b_delay >= a_len) {
return b_len + b_delay;
} else if (b_delay) {
/* a can be a substring of b which overlaps some of the end dots
* OR b can be a substring near the end of a */
/* ignore overlap due to the final trailing dot as delayed literals
* are delivered before undelayed */
for (u32 j = b_delay - 1; j > 0; j--) {
if (b_len + j >= a_len) {
if (!cmp(a.c_str(), b_end + j - a_len, a_len - j, nocase)) {
return b_len + j;
}
} else {
if (!cmp(a_end - j - b_len, b.c_str(), b_len, nocase)) {
return b_len + j;
}
}
}
}
return maxStringOverlap(a.get_string(), b.get_string(), nocase);
}
// Returns non-zero max overlap len if a suffix of the literal ID 'a' overlaps
// with a prefix of the literal ID 'b' or 'a' can be contained in 'b'.
size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b) {
assert(!a.delay);
return maxOverlap(a.s, b.s, b.delay);
}
static
const rose_literal_id &getOverlapLiteral(const RoseBuildImpl &tbi,
u32 literal_id) {
auto it = tbi.anchoredLitSuffix.find(literal_id);
if (it != tbi.anchoredLitSuffix.end()) {
return it->second;
}
return tbi.literals.at(literal_id);
}
ue2_literal findNonOverlappingTail(const set<ue2_literal> &lits,
const ue2_literal &s) {
size_t max_overlap = 0;
for (const auto &lit : lits) {
size_t overlap = lit != s ? maxStringOverlap(lit, s)
: maxStringSelfOverlap(s);
max_overlap = max(max_overlap, overlap);
}
/* find the tail that doesn't overlap */
ue2_literal tail = s.substr(max_overlap);
DEBUG_PRINTF("%zu overlap, tail: '%s'\n", max_overlap,
dumpString(tail).c_str());
return tail;
}
size_t RoseBuildImpl::maxLiteralOverlap(RoseVertex u, RoseVertex v) const {
size_t overlap = 0;
for (auto u_lit_id : g[u].literals) {
const rose_literal_id &ul = getOverlapLiteral(*this, u_lit_id);
for (auto v_lit_id : g[v].literals) {
const rose_literal_id &vl = getOverlapLiteral(*this, v_lit_id);
overlap = max(overlap, maxOverlap(ul, vl));
}
}
return overlap;
}
void RoseBuildImpl::removeVertices(const vector<RoseVertex> &dead) {
for (auto v : dead) {
assert(!isAnyStart(v));
DEBUG_PRINTF("removing vertex %zu\n", g[v].index);
for (auto lit_id : g[v].literals) {
literal_info[lit_id].vertices.erase(v);
}
clear_vertex(v, g);
remove_vertex(v, g);
}
renumber_vertices(g);
}
// Find the maximum bound on the edges to this vertex's successors ignoring
// those via infixes.
u32 RoseBuildImpl::calcSuccMaxBound(RoseVertex u) const {
u32 maxBound = 0;
for (const auto &e : out_edges_range(u, g)) {
RoseVertex v = target(e, g);
if (g[v].left) {
continue;
}
u32 thisBound = g[e].maxBound;
if (thisBound == ROSE_BOUND_INF) {
return ROSE_BOUND_INF;
}
if (!g[v].eod_accept) {
// Add the length of the longest of our literals.
thisBound += maxLiteralLen(v);
}
maxBound = max(maxBound, thisBound);
}
assert(maxBound <= ROSE_BOUND_INF);
return maxBound;
}
u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay,
rose_literal_table table) {
DEBUG_PRINTF("getting id for %s in table %d\n", dumpString(s).c_str(),
table);
assert(table != ROSE_ANCHORED);
rose_literal_id key(s, table, delay);
auto m = literals.insert(key);
u32 id = m.first;
bool inserted = m.second;
if (inserted) {
literal_info.emplace_back(rose_literal_info());
assert(literal_info.size() == id + 1);
if (delay) {
u32 undelayed_id = getLiteralId(s, 0, table);
literal_info[id].undelayed_id = undelayed_id;
literal_info[undelayed_id].delayed_ids.insert(id);
} else {
literal_info[id].undelayed_id = id;
}
}
return id;
}
// Function that operates on a msk/cmp pair and a literal, as used in
// hwlmLiteral, and zeroes msk elements that don't add any power to the
// literal.
void normaliseLiteralMask(const ue2_literal &s_in, vector<u8> &msk,
vector<u8> &cmp) {
assert(msk.size() == cmp.size());
assert(msk.size() <= HWLM_MASKLEN);
if (msk.empty()) {
return;
}
// Work over a caseless copy if the string contains nocase chars. This will
// ensure that we treat masks designed to handle mixed-sensitivity literals
// correctly: these will be matched by the literal matcher in caseless
// mode, with the mask used to narrow the matches.
ue2_literal s(s_in);
if (s.any_nocase()) {
make_nocase(&s);
}
ue2_literal::const_reverse_iterator it = s.rbegin(), ite = s.rend();
size_t i = msk.size();
while (i-- != 0 && it != ite) {
const CharReach &cr = *it;
for (size_t c = cr.find_first(); c != CharReach::npos;
c = cr.find_next(c)) {
if (((u8)c & msk[i]) != cmp[i]) {
goto skip;
}
}
// If we didn't jump out of the loop to skip, then this mask position
// doesn't further narrow the set of acceptable literals from those
// accepted by s. So we can zero this element.
msk[i] = 0;
cmp[i] = 0;
skip:
++it;
}
// Wipe out prefix zeroes.
while (!msk.empty() && msk[0] == 0) {
msk.erase(msk.begin());
cmp.erase(cmp.begin());
}
}
rose_literal_id::rose_literal_id(const ue2_literal &s_in,
const vector<u8> &msk_in, const vector<u8> &cmp_in,
rose_literal_table table_in, u32 delay_in)
: s(s_in), msk(msk_in), cmp(cmp_in), table(table_in),
delay(delay_in), distinctiveness(0) {
assert(msk.size() == cmp.size());
assert(msk.size() <= HWLM_MASKLEN);
assert(delay <= MAX_DELAY);
normaliseLiteralMask(s, msk, cmp);
}
u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector<u8> &msk,
const vector<u8> &cmp, u32 delay,
rose_literal_table table) {
DEBUG_PRINTF("getting id for %s in table %d\n", dumpString(s).c_str(),
table);
assert(table != ROSE_ANCHORED);
rose_literal_id key(s, msk, cmp, table, delay);
/* ue2_literals are always uppercased if nocase and must have an
* alpha char */
auto m = literals.insert(key);
u32 id = m.first;
bool inserted = m.second;
if (inserted) {
literal_info.emplace_back(rose_literal_info());
assert(literal_info.size() == id + 1);
if (delay) {
u32 undelayed_id = getLiteralId(s, msk, cmp, 0, table);
literal_info[id].undelayed_id = undelayed_id;
literal_info[undelayed_id].delayed_ids.insert(id);
} else {
literal_info[id].undelayed_id = id;
}
}
return id;
}
u32 RoseBuildImpl::getNewLiteralId() {
rose_literal_id key(ue2_literal(), ROSE_ANCHORED, 0);
u32 numLiterals = verify_u32(literals.size());
key.distinctiveness = numLiterals;
auto m = literals.insert(key);
assert(m.second);
u32 id = m.first;
literal_info.emplace_back(rose_literal_info());
assert(literal_info.size() == id + 1);
literal_info[id].undelayed_id = id;
return id;
}
bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) {
ORDER_CHECK(minBound);
ORDER_CHECK(maxBound);
ORDER_CHECK(history);
return false;
}
#ifndef NDEBUG
bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) {
const RoseGraph &g = build.g;
assert(left_id(g[v].left));
set<u32> graph_tops;
if (!build.isRootSuccessor(v)) {
for (const auto &e : in_edges_range(v, g)) {
graph_tops.insert(g[e].rose_top);
}
}
return is_subset_of(graph_tops, all_tops(left_id(g[v].left)));
}
#endif
u32 OutfixInfo::get_queue(QueueIndexFactory &qif) {
if (queue == ~0U) {
queue = qif.get_queue();
}
return queue;
}
namespace {
class OutfixAllReports : public boost::static_visitor<set<ReportID>> {
public:
set<ReportID> operator()(const boost::blank &) const {
return set<ReportID>();
}
template<class T>
set<ReportID> operator()(const unique_ptr<T> &x) const {
return all_reports(*x);
}
set<ReportID> operator()(const MpvProto &mpv) const {
set<ReportID> reports;
for (const auto &puff : mpv.puffettes) {
reports.insert(puff.report);
}
for (const auto &puff : mpv.triggered_puffettes) {
reports.insert(puff.report);
}
return reports;
}
};
}
set<ReportID> all_reports(const OutfixInfo &outfix) {
auto reports = boost::apply_visitor(OutfixAllReports(), outfix.proto);
assert(!reports.empty());
return reports;
}
bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const {
return top == b.top && graph == b.graph && castle == b.castle &&
rdfa == b.rdfa && haig == b.haig && tamarama == b.tamarama;
}
bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const {
const RoseSuffixInfo &a = *this;
ORDER_CHECK(top);
ORDER_CHECK(graph);
ORDER_CHECK(castle);
ORDER_CHECK(haig);
ORDER_CHECK(rdfa);
ORDER_CHECK(tamarama);
assert(a.dfa_min_width == b.dfa_min_width);
assert(a.dfa_max_width == b.dfa_max_width);
return false;
}
size_t RoseSuffixInfo::hash() const {
return hash_all(top, graph, castle, rdfa, haig, tamarama);
}
void RoseSuffixInfo::reset(void) {
top = 0;
graph.reset();
castle.reset();
rdfa.reset();
haig.reset();
tamarama.reset();
dfa_min_width = depth(0);
dfa_max_width = depth::infinity();
}
std::set<ReportID> all_reports(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.tamarama()) {
return all_reports(*s.tamarama());
} else if (s.graph()) {
return all_reports(*s.graph());
} else if (s.castle()) {
return all_reports(*s.castle());
} else if (s.dfa()) {
return all_reports(*s.dfa());
} else {
return all_reports(*s.haig());
}
}
depth findMinWidth(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
return findMinWidth(*s.graph());
} else if (s.castle()) {
return findMinWidth(*s.castle());
} else {
return s.dfa_min_width;
}
}
depth findMinWidth(const suffix_id &s, u32 top) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
return findMinWidth(*s.graph(), top);
} else if (s.castle()) {
return findMinWidth(*s.castle(), top);
} else {
return s.dfa_min_width;
}
}
depth findMaxWidth(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
return findMaxWidth(*s.graph());
} else if (s.castle()) {
return findMaxWidth(*s.castle());
} else {
return s.dfa_max_width;
}
}
depth findMaxWidth(const suffix_id &s, u32 top) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
return findMaxWidth(*s.graph(), top);
} else if (s.castle()) {
return findMaxWidth(*s.castle(), top);
} else {
return s.dfa_max_width;
}
}
bool has_eod_accepts(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
/* ignore accept -> eod edge */
return in_degree(s.graph()->acceptEod, *s.graph()) > 1;
} else if (s.castle()) {
return false;
} else if (s.dfa()) {
return has_eod_accepts(*s.dfa());
} else {
return has_eod_accepts(*s.haig());
}
}
bool has_non_eod_accepts(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
return in_degree(s.graph()->accept, *s.graph());
} else if (s.castle()) {
return true;
} else if (s.dfa()) {
return has_non_eod_accepts(*s.dfa());
} else {
return has_non_eod_accepts(*s.haig());
}
}
set<u32> all_tops(const suffix_id &s) {
assert(s.graph() || s.castle() || s.haig() || s.dfa());
if (s.graph()) {
flat_set<u32> tops = getTops(*s.graph());
assert(!tops.empty());
return {tops.begin(), tops.end()};
}
if (s.castle()) {
return assoc_keys(s.castle()->repeats);
}
// Other types of suffix are not multi-top.
return {0};
}
size_t suffix_id::hash() const {
return hash_all(g, c, d, h, t);
}
bool isAnchored(const left_id &r) {
assert(r.graph() || r.castle() || r.haig() || r.dfa());
if (r.graph()) {
return isAnchored(*r.graph());
}
if (r.dfa()) {
return r.dfa()->start_anchored == DEAD_STATE;
}
if (r.haig()) {
return r.haig()->start_anchored == DEAD_STATE;
}
// All other types are explicitly anchored.
return true;
}
depth findMinWidth(const left_id &r) {
assert(r.graph() || r.castle() || r.haig() || r.dfa());
if (r.graph()) {
return findMinWidth(*r.graph());
} else if (r.castle()) {
return findMinWidth(*r.castle());
} else {
return r.dfa_min_width;
}
}
depth findMaxWidth(const left_id &r) {
assert(r.graph() || r.castle() || r.haig() || r.dfa());
if (r.graph()) {
return findMaxWidth(*r.graph());
} else if (r.castle()) {
return findMaxWidth(*r.castle());
} else {
return r.dfa_max_width;
}
}
set<u32> all_tops(const left_id &r) {
assert(r.graph() || r.castle() || r.haig() || r.dfa());
if (r.graph()) {
flat_set<u32> tops = getTops(*r.graph());
return {tops.begin(), tops.end()};
}
if (r.castle()) {
return assoc_keys(r.castle()->repeats);
}
// Other types of rose are not multi-top.
return {0};
}
set<u32> all_reports(const left_id &left) {
assert(left.graph() || left.castle() || left.haig() || left.dfa());
if (left.graph()) {
return all_reports(*left.graph());
} else if (left.castle()) {
return all_reports(*left.castle());
} else if (left.dfa()) {
return all_reports(*left.dfa());
} else {
return all_reports(*left.haig());
}
}
u32 num_tops(const left_id &r) {
return all_tops(r).size();
}
size_t left_id::hash() const {
return hash_all(g, c, d, h);
}
u64a findMaxOffset(const set<ReportID> &reports, const ReportManager &rm) {
assert(!reports.empty());
u64a maxOffset = 0;
for (const auto &report_id : reports) {
const Report &ir = rm.getReport(report_id);
if (ir.hasBounds()) {
maxOffset = max(maxOffset, ir.maxOffset);
} else {
return MAX_OFFSET;
}
}
return maxOffset;
}
size_t LeftEngInfo::hash() const {
return hash_all(graph, castle, dfa, haig, tamarama, lag, leftfix_report);
}
void LeftEngInfo::reset(void) {
graph.reset();
castle.reset();
dfa.reset();
haig.reset();
tamarama.reset();
lag = 0;
leftfix_report = MO_INVALID_IDX;
dfa_min_width = depth(0);
dfa_max_width = depth::infinity();
}
LeftEngInfo::operator bool() const {
assert((int)!!castle + (int)!!dfa + (int)!!haig <= 1);
assert(!castle || !graph);
assert(!dfa || graph); /* dfas always have the graph as well */
assert(!haig || graph);
return graph || castle || dfa || haig;
}
u32 roseQuality(const RoseResources &res, const RoseEngine *t) {
/* Rose is low quality if the atable is a Mcclellan 16 or has multiple DFAs
*/
if (res.has_anchored) {
if (res.has_anchored_multiple) {
DEBUG_PRINTF("multiple atable engines\n");
return 0;
}
if (res.has_anchored_large) {
DEBUG_PRINTF("m16 atable engine\n");
return 0;
}
}
/* if we always run multiple engines then we are slow */
u32 always_run = 0;
if (res.has_anchored) {
always_run++;
}
if (t->eagerIterOffset) {
/* eager prefixes are always run */
always_run++;
}
if (res.has_floating) {
/* TODO: ignore conditional ftables, or ftables beyond smwr region */
always_run++;
}
if (t->ematcherOffset) {
always_run++;
}
/* ignore mpv outfixes as they are v good, mpv outfixes are before begin */
if (t->outfixBeginQueue != t->outfixEndQueue) {
/* TODO: ignore outfixes > smwr region */
always_run++;
}
bool eod_prefix = false;
const LeftNfaInfo *left = getLeftTable(t);
for (u32 i = 0; i < t->activeLeftCount; i++) {
if (left->eod_check) {
eod_prefix = true;
break;
}
}
if (eod_prefix) {
always_run++;
DEBUG_PRINTF("eod prefixes are slow");
return 0;
}
if (always_run > 1) {
DEBUG_PRINTF("we always run %u engines\n", always_run);
return 0;
}
return 1;
}
u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) {
const auto &lit_vertices = build.literal_info.at(lit_id).vertices;
assert(!lit_vertices.empty());
u32 min_offset = UINT32_MAX;
for (const auto &v : lit_vertices) {
min_offset = min(min_offset, build.g[v].min_offset);
}
return min_offset;
}
u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) {
const auto &lit_vertices = build.literal_info.at(lit_id).vertices;
assert(!lit_vertices.empty());
u32 max_offset = 0;
for (const auto &v : lit_vertices) {
max_offset = max(max_offset, build.g[v].max_offset);
}
return max_offset;
}
bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) {
const auto &g = build.g;
const auto v = target(e, g);
if (!build.g[v].eod_accept) {
return false;
}
// If there's a graph between us and EOD, we shouldn't be eager.
if (build.g[v].left) {
return false;
}
// Must be exactly at EOD.
if (g[e].minBound != 0 || g[e].maxBound != 0) {
return false;
}
// In streaming mode, we can only eagerly report EOD for literals in the
// EOD-anchored table, as that's the only time we actually know where EOD
// is. In block mode, we always have this information.
const auto u = source(e, g);
if (build.cc.streaming && !build.isInETable(u)) {
return false;
}
return true;
}
#ifndef NDEBUG
/** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose
* graph are implementable. */
bool canImplementGraphs(const RoseBuildImpl &tbi) {
const RoseGraph &g = tbi.g;
// First, check the Rose leftfixes.
for (auto v : vertices_range(g)) {
DEBUG_PRINTF("leftfix: check vertex %zu\n", g[v].index);
if (g[v].left.castle) {
DEBUG_PRINTF("castle ok\n");
continue;
}
if (g[v].left.dfa) {
DEBUG_PRINTF("dfa ok\n");
continue;
}
if (g[v].left.haig) {
DEBUG_PRINTF("haig ok\n");
continue;
}
if (g[v].left.graph) {
assert(g[v].left.graph->kind
== (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX));
if (!isImplementableNFA(*g[v].left.graph, nullptr, tbi.cc)) {
DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n",
g[v].index, num_vertices(*g[v].left.graph));
return false;
}
}
}
// Suffix graphs.
for (auto v : vertices_range(g)) {
DEBUG_PRINTF("suffix: check vertex %zu\n", g[v].index);
const RoseSuffixInfo &suffix = g[v].suffix;
if (suffix.castle) {
DEBUG_PRINTF("castle suffix ok\n");
continue;
}
if (suffix.rdfa) {
DEBUG_PRINTF("dfa suffix ok\n");
continue;
}
if (suffix.haig) {
DEBUG_PRINTF("haig suffix ok\n");
continue;
}
if (suffix.graph) {
assert(suffix.graph->kind == NFA_SUFFIX);
if (!isImplementableNFA(*suffix.graph, &tbi.rm, tbi.cc)) {
DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n",
g[v].index, num_vertices(*suffix.graph));
return false;
}
}
}
return true;
}
/**
* \brief True if there is an engine with a top that is not triggered by a
* vertex in the Rose graph. This is a consistency check used in assertions.
*/
bool hasOrphanedTops(const RoseBuildImpl &build) {
const RoseGraph &g = build.g;
unordered_map<left_id, set<u32>> leftfixes;
unordered_map<suffix_id, set<u32>> suffixes;
for (auto v : vertices_range(g)) {
if (g[v].left) {
if (!build.isRootSuccessor(v)) {
// Tops for infixes come from the in-edges.
set<u32> &tops = leftfixes[left_id(g[v].left)];
for (const auto &e : in_edges_range(v, g)) {
tops.insert(g[e].rose_top);
}
}
}
if (g[v].suffix) {
suffixes[suffix_id(g[v].suffix)].insert(g[v].suffix.top);
}
}
for (const auto &e : leftfixes) {
if (all_tops(e.first) != e.second) {
DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n",
as_string_list(all_tops(e.first)).c_str(),
as_string_list(e.second).c_str());
return true;
}
}
for (const auto &e : suffixes) {
if (all_tops(e.first) != e.second) {
DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n",
as_string_list(all_tops(e.first)).c_str(),
as_string_list(e.second).c_str());
return true;
}
}
return false;
}
#endif // NDEBUG
} // namespace ue2