vectorscan/src/rose/rose_build_misc.cpp
2017-04-26 15:18:13 +10:00

1375 lines
40 KiB
C++

/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "rose_build_impl.h"
#include "hwlm/hwlm_literal.h"
#include "nfa/castlecompile.h"
#include "nfa/goughcompile.h"
#include "nfa/mcclellancompile_util.h"
#include "nfa/nfa_api.h"
#include "nfa/rdfa.h"
#include "nfa/tamaramacompile.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_limex.h"
#include "nfagraph/ng_reports.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_util.h"
#include "nfagraph/ng_width.h"
#include "smallwrite/smallwrite_build.h"
#include "util/alloc.h"
#include "util/boundary_reports.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2string.h"
#include "util/verify_types.h"
#include "ue2common.h"
#include "grey.h"
#include <boost/functional/hash/hash_fwd.hpp>
#include <boost/graph/breadth_first_search.hpp>
using namespace std;
using boost::hash_combine;
namespace ue2 {
// Defined out of line purely so the destructor body stays out of the header.
RoseBuild::~RoseBuild() = default;
/**
 * Construct a Rose builder bound to the given managers and compile context.
 *
 * The two start vertices (root and anchored_root) are added to the graph from
 * the member initialiser list and then pinned to offset zero in the body.
 * NOTE(review): calling add_vertex(g) in the initialiser list presumably
 * relies on `g` being declared before `root`/`anchored_root` in the class --
 * confirm against the header.
 */
RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in,
                             SomSlotManager &ssm_in,
                             SmallWriteBuild &smwr_in,
                             const CompileContext &cc_in,
                             const BoundaryReports &boundary_in)
    : cc(cc_in),
      root(add_vertex(g)),
      anchored_root(add_vertex(g)),
      hasSom(false),
      group_end(0),
      ematcher_region_size(0),
      eod_event_literal_id(MO_INVALID_IDX),
      max_rose_anchored_floating_overlap(0),
      rm(rm_in),
      ssm(ssm_in),
      smwr(smwr_in),
      boundary(boundary_in),
      next_nfa_report(0) {
    // Both start vertices have a fixed offset range of [0, 0].
    for (const RoseVertex start : {root, anchored_root}) {
        auto &props = g[start];
        props.min_offset = 0;
        props.max_offset = 0;
    }
}
// Nothing to do explicitly; members clean up after themselves.
RoseBuildImpl::~RoseBuildImpl() = default;
// True when neither `suffix` nor `left` is set on this vertex.
bool RoseVertexProps::isBoring(void) const {
    const bool has_engine = suffix || left;
    return !has_engine;
}
// True when the vertex has a single, finite offset (min == max and the bound
// is not ROSE_BOUND_INF).
bool RoseVertexProps::fixedOffset(void) const {
    assert(min_offset <= max_offset); /* ensure offsets calculated */
    if (max_offset == ROSE_BOUND_INF) {
        return false;
    }
    return min_offset == max_offset;
}
// True if at least one predecessor of v is a start vertex (per isAnyStart).
bool RoseBuildImpl::isRootSuccessor(const RoseVertex &v) const {
    bool has_start_pred = false;
    for (auto pred : inv_adjacent_vertices_range(v, g)) {
        if (isAnyStart(pred)) {
            has_start_pred = true;
            break;
        }
    }
    return has_start_pred;
}
// True if at least one predecessor of v is NOT a start vertex (per
// isAnyStart).
bool RoseBuildImpl::isNonRootSuccessor(const RoseVertex &v) const {
    bool has_other_pred = false;
    for (auto pred : inv_adjacent_vertices_range(v, g)) {
        if (!isAnyStart(pred)) {
            has_other_pred = true;
            break;
        }
    }
    return has_other_pred;
}
// True if any out-edge of v carries ROSE_ROLE_HISTORY_ANCH history.
bool hasAnchHistorySucc(const RoseGraph &g, RoseVertex v) {
    bool found = false;
    for (const auto &edge : out_edges_range(v, g)) {
        if (g[edge].history != ROSE_ROLE_HISTORY_ANCH) {
            continue;
        }
        found = true;
        break;
    }
    return found;
}
// True if any out-edge of v carries ROSE_ROLE_HISTORY_LAST_BYTE history.
bool hasLastByteHistorySucc(const RoseGraph &g, RoseVertex v) {
    bool found = false;
    for (const auto &edge : out_edges_range(v, g)) {
        if (g[edge].history != ROSE_ROLE_HISTORY_LAST_BYTE) {
            continue;
        }
        found = true;
        break;
    }
    return found;
}
// Returns true if the literals attached to vertex v live in the given table.
// A vertex with no literals is never considered to be in any table.
static
bool isInTable(const RoseBuildImpl &tbi, RoseVertex v,
               rose_literal_table table) {
    const auto &ids = tbi.g[v].literals;
    if (ids.empty()) {
        return false; // special role with no literals
    }

    // Every literal on a vertex shares one table, so the first literal is
    // representative.
    const auto first_table = tbi.literals.right.at(*ids.begin()).table;

#ifndef NDEBUG
    // Debug builds check the "same table" invariant for every literal.
    for (auto id : ids) {
        assert(tbi.literals.right.at(id).table == first_table);
    }
#endif

    const bool matches = (first_table == table);
    return matches;
}
// True if v's literals are in the ROSE_ANCHORED table.
bool RoseBuildImpl::isAnchored(RoseVertex v) const {
    return hasLiteralInTable(v, ROSE_ANCHORED);
}
// True if v's literals are in the ROSE_FLOATING table.
bool RoseBuildImpl::isFloating(RoseVertex v) const {
    return hasLiteralInTable(v, ROSE_FLOATING);
}
// True if v's literals are in the ROSE_EOD_ANCHORED table.
bool RoseBuildImpl::isInETable(RoseVertex v) const {
    return hasLiteralInTable(v, ROSE_EOD_ANCHORED);
}
// Member-function wrapper around the file-local isInTable() helper.
bool RoseBuildImpl::hasLiteralInTable(RoseVertex v,
                                      enum rose_literal_table t) const {
    const RoseBuildImpl &self = *this;
    return isInTable(self, v, t);
}
/* Indicates that the floating table (if it exists) will be only run
   conditionally based on matches from the anchored table.

   Returns false as soon as a floating vertex is found directly under either
   root or anchored_root. */
bool RoseBuildImpl::hasNoFloatingRoots() const {
    for (auto succ : adjacent_vertices_range(root, g)) {
        if (!isFloating(succ)) {
            continue;
        }
        DEBUG_PRINTF("direct floating root %zu\n", g[succ].index);
        return false;
    }

    /* need to check if the anchored_root has any literals which are too deep */
    for (auto succ : adjacent_vertices_range(anchored_root, g)) {
        if (!isFloating(succ)) {
            continue;
        }
        DEBUG_PRINTF("indirect floating root %zu\n", g[succ].index);
        return false;
    }

    return true;
}
// Largest elength() over all literals attached to v. v must have at least one
// literal.
size_t RoseBuildImpl::maxLiteralLen(RoseVertex v) const {
    const auto &ids = g[v].literals;
    assert(!ids.empty());

    size_t longest = 0;
    for (const auto &id : ids) {
        const size_t len = literals.right.at(id).elength();
        if (len > longest) {
            longest = len;
        }
    }
    return longest;
}
// Smallest elength() over all literals attached to v (starting from
// ROSE_BOUND_INF). v must have at least one literal.
size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const {
    const auto &ids = g[v].literals;
    assert(!ids.empty());

    size_t shortest = ROSE_BOUND_INF;
    for (const auto &id : ids) {
        const size_t len = literals.right.at(id).elength();
        if (len < shortest) {
            shortest = len;
        }
    }
    return shortest;
}
// RoseBuild factory: builds a RoseBuildImpl and hands it back through the
// RoseBuild interface.
unique_ptr<RoseBuild> makeRoseBuilder(ReportManager &rm,
                                      SomSlotManager &ssm,
                                      SmallWriteBuild &smwr,
                                      const CompileContext &cc,
                                      const BoundaryReports &boundary) {
    unique_ptr<RoseBuild> builder =
        ue2::make_unique<RoseBuildImpl>(rm, ssm, smwr, cc, boundary);
    return builder;
}
// Returns the `size` field recorded in the engine header. t must be non-null.
size_t roseSize(const RoseEngine *t) {
    assert(t);
    const size_t engine_size = t->size;
    return engine_size;
}
// True if the engine's runtime implementation is ROSE_RUNTIME_PURE_LITERAL.
bool roseIsPureLiteral(const RoseEngine *t) {
    const bool pure = (t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL);
    return pure;
}
// Returns non-zero max overlap len if a suffix of the literal 'a' overlaps
// with a prefix of the literal 'b' or 'a' can be contained in 'b'.
//
// a        literal whose suffix is matched against 'b'.
// b        literal whose prefix is matched against 'a'.
// b_delay  delay applied to 'b'; the comments below treat it as that many
//          trailing "dots" following 'b'.
//
// Returns b_len + (number of dots) for the first delayed overlap found,
// otherwise the result of maxStringOverlap() on the two undelayed strings.
size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay) {
/* overly conservative if only part of the string is nocase */
bool nocase = a.any_nocase() || b.any_nocase();
DEBUG_PRINTF("max overlap %s %s+%u %d\n", dumpString(a).c_str(),
dumpString(b).c_str(), b_delay, (int)nocase);
size_t a_len = a.length();
size_t b_len = b.length();
const char *a_end = a.c_str() + a_len;
const char *b_end = b.c_str() + b_len;
// The delay is at least as long as 'a': report the full delayed length of
// 'b' without comparing any characters.
if (b_delay >= a_len) {
return b_len + b_delay;
} else if (b_delay) {
/* a can be a substring of b which overlaps some of the end dots
* OR b can be a substring near the end of a */
/* ignore overlap due to the final trailing dot as delayed literals
* are delivered before undelayed */
// Try j trailing dots, from b_delay - 1 down to 1, returning on the first
// position at which the non-dot characters compare equal (cmp() == 0).
for (u32 j = b_delay - 1; j > 0; j--) {
if (b_len + j >= a_len) {
// 'b' plus j dots is at least as long as 'a': compare the first
// a_len - j characters of 'a' against the tail of 'b'.
if (!cmp(a.c_str(), b_end + j - a_len, a_len - j, nocase)) {
return b_len + j;
}
} else {
// 'b' plus j dots is shorter than 'a': compare all of 'b' against the
// b_len characters of 'a' that end j characters before a's end.
if (!cmp(a_end - j - b_len, b.c_str(), b_len, nocase)) {
return b_len + j;
}
}
}
}
// No delayed overlap found (or no delay): plain string overlap.
return maxStringOverlap(a.get_string(), b.get_string(), nocase);
}
// Returns non-zero max overlap len if a suffix of the literal ID 'a' overlaps
// with a prefix of the literal ID 'b' or 'a' can be contained in 'b'.
size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b) {
assert(!a.delay);
return maxOverlap(a.s, b.s, b.delay);
}
// Returns the literal to use for overlap calculations: the entry in
// anchoredLitSuffix if one exists for this id, otherwise the literal itself.
static
const rose_literal_id &getOverlapLiteral(const RoseBuildImpl &tbi,
                                         u32 literal_id) {
    const auto &suffixes = tbi.anchoredLitSuffix;
    const auto found = suffixes.find(literal_id);
    if (found == suffixes.end()) {
        return tbi.literals.right.at(literal_id);
    }
    return found->second;
}
// Returns the tail of 's' that remains after dropping the longest overlap
// between any literal in 'lits' and 's'. When a literal equals 's' itself,
// its self-overlap is used instead.
ue2_literal findNonOverlappingTail(const set<ue2_literal> &lits,
                                   const ue2_literal &s) {
    size_t max_overlap = 0;

    for (const auto &lit : lits) {
        size_t overlap;
        if (lit != s) {
            overlap = maxStringOverlap(lit, s);
        } else {
            overlap = maxStringSelfOverlap(s);
        }
        if (overlap > max_overlap) {
            max_overlap = overlap;
        }
    }

    /* find the tail that doesn't overlap */
    ue2_literal tail = s.substr(max_overlap);
    DEBUG_PRINTF("%zu overlap, tail: '%s'\n", max_overlap,
                 dumpString(tail).c_str());
    return tail;
}
// Largest maxOverlap() over every (literal of u, literal of v) pair, using the
// overlap literal (see getOverlapLiteral) for each id. Zero if either vertex
// has no literals.
size_t RoseBuildImpl::maxLiteralOverlap(RoseVertex u, RoseVertex v) const {
    const auto &u_lits = g[u].literals;
    const auto &v_lits = g[v].literals;

    size_t best = 0;
    for (auto u_id : u_lits) {
        const rose_literal_id &ul = getOverlapLiteral(*this, u_id);
        for (auto v_id : v_lits) {
            const rose_literal_id &vl = getOverlapLiteral(*this, v_id);
            const size_t current = maxOverlap(ul, vl);
            if (current > best) {
                best = current;
            }
        }
    }
    return best;
}
// Removes each vertex in 'dead' from the graph, first detaching it from the
// literal_info entries of its literals, then renumbers the remaining vertices.
// Start vertices must not be passed in.
void RoseBuildImpl::removeVertices(const vector<RoseVertex> &dead) {
    for (auto victim : dead) {
        assert(!isAnyStart(victim));
        DEBUG_PRINTF("removing vertex %zu\n", g[victim].index);

        const auto &lit_ids = g[victim].literals;
        for (auto lit_id : lit_ids) {
            literal_info[lit_id].vertices.erase(victim);
        }

        clear_vertex(victim, g);
        remove_vertex(victim, g);
    }

    renumber_vertices(g);
}
// Find the maximum bound on the edges to this vertex's successors ignoring
// those via infixes. Returns ROSE_BOUND_INF as soon as an unbounded edge is
// seen.
u32 RoseBuildImpl::calcSuccMaxBound(RoseVertex u) const {
    u32 best = 0;

    for (const auto &e : out_edges_range(u, g)) {
        const RoseVertex succ = target(e, g);
        if (g[succ].left) {
            continue; // successor is reached via a leftfix: skip
        }

        u32 bound = g[e].maxBound;
        if (bound == ROSE_BOUND_INF) {
            return ROSE_BOUND_INF;
        }

        if (!g[succ].eod_accept) {
            // Add the length of the longest of our literals.
            bound += maxLiteralLen(succ);
        }

        if (bound > best) {
            best = bound;
        }
    }

    assert(best <= ROSE_BOUND_INF);
    return best;
}
// Returns the id for the (s, table, delay) literal, registering it (and a
// matching literal_info entry) if it is not yet known. A newly registered
// delayed literal is linked to its undelayed counterpart, which is created on
// demand. Not for use with the ROSE_ANCHORED table.
u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay,
                                rose_literal_table table) {
    DEBUG_PRINTF("getting id for %s\n", dumpString(s).c_str());
    assert(table != ROSE_ANCHORED);

    rose_literal_id key(s, table, delay);
    const u32 next_id = verify_u32(literals.left.size());

    const auto ins =
        literals.insert(RoseLiteralMap::value_type(key, next_id));
    const u32 id = ins.first->right;
    if (!ins.second) {
        return id; // already known
    }

    literal_info.push_back(rose_literal_info());
    assert(literal_info.size() == id + 1);

    if (!delay) {
        literal_info[id].undelayed_id = id;
        return id;
    }

    // The recursive call may grow literal_info, so entries are addressed by
    // index only after it returns.
    u32 undelayed_id = getLiteralId(s, 0, table);
    literal_info[id].undelayed_id = undelayed_id;
    literal_info[undelayed_id].delayed_ids.insert(id);
    return id;
}
// Function that operates on a msk/cmp pair and a literal, as used in
// hwlmLiteral, and zeroes msk elements that don't add any power to the
// literal. Leading zero mask bytes are then stripped from both vectors.
void normaliseLiteralMask(const ue2_literal &s_in, vector<u8> &msk,
                          vector<u8> &cmp) {
    assert(msk.size() == cmp.size());
    assert(msk.size() <= HWLM_MASKLEN);

    if (msk.empty()) {
        return;
    }

    // Work over a caseless copy if the string contains nocase chars. This will
    // ensure that we treat masks designed to handle mixed-sensitivity literals
    // correctly: these will be matched by the literal matcher in caseless
    // mode, with the mask used to narrow the matches.
    ue2_literal s(s_in);
    if (s.any_nocase()) {
        make_nocase(&s);
    }

    // Walk the mask and the literal together, from their last elements
    // backwards, until either runs out.
    ue2_literal::const_reverse_iterator it = s.rbegin(), ite = s.rend();
    for (size_t i = msk.size(); i-- != 0 && it != ite; ++it) {
        const CharReach &cr = *it;

        // Does the mask reject any character this literal position accepts?
        bool narrows = false;
        for (size_t c = cr.find_first(); c != CharReach::npos;
             c = cr.find_next(c)) {
            if (((u8)c & msk[i]) != cmp[i]) {
                narrows = true;
                break;
            }
        }
        if (narrows) {
            continue;
        }

        // This mask position doesn't further narrow the set of acceptable
        // literals from those accepted by s, so we can zero this element.
        msk[i] = 0;
        cmp[i] = 0;
    }

    // Wipe out prefix zeroes.
    while (!msk.empty() && msk[0] == 0) {
        msk.erase(msk.begin());
        cmp.erase(cmp.begin());
    }
}
// Builds a masked literal id. The msk/cmp pair is copied in and then
// normalised against the literal via normaliseLiteralMask().
rose_literal_id::rose_literal_id(const ue2_literal &s_in,
                                 const vector<u8> &msk_in,
                                 const vector<u8> &cmp_in,
                                 rose_literal_table table_in, u32 delay_in)
    : s(s_in), msk(msk_in), cmp(cmp_in), table(table_in),
      delay(delay_in), distinctiveness(0) {
    assert(delay <= MAX_DELAY);
    assert(msk.size() <= HWLM_MASKLEN);
    assert(msk.size() == cmp.size());

    normaliseLiteralMask(s, msk, cmp);
}
// Masked variant of getLiteralId(): returns the id for the
// (s, msk, cmp, table, delay) literal, registering it (and a matching
// literal_info entry) if it is not yet known. A newly registered delayed
// literal is linked to its undelayed counterpart, which is created on demand.
// Not for use with the ROSE_ANCHORED table.
u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector<u8> &msk,
                                const vector<u8> &cmp, u32 delay,
                                rose_literal_table table) {
    DEBUG_PRINTF("getting id for %s\n", dumpString(s).c_str());
    assert(table != ROSE_ANCHORED);

    rose_literal_id key(s, msk, cmp, table, delay);
    const u32 next_id = verify_u32(literals.left.size());

    /* ue2_literals are always uppercased if nocase and must have an
     * alpha char */

    const auto ins =
        literals.insert(RoseLiteralMap::value_type(key, next_id));
    const u32 id = ins.first->right;
    if (!ins.second) {
        return id; // already known
    }

    literal_info.push_back(rose_literal_info());
    assert(literal_info.size() == id + 1);

    if (!delay) {
        literal_info[id].undelayed_id = id;
        return id;
    }

    // The recursive call may grow literal_info, so entries are addressed by
    // index only after it returns.
    u32 undelayed_id = getLiteralId(s, msk, cmp, 0, table);
    literal_info[id].undelayed_id = undelayed_id;
    literal_info[undelayed_id].delayed_ids.insert(id);
    return id;
}
bool RoseBuildImpl::hasLiteral(const ue2_literal &s,
rose_literal_table table) const {
DEBUG_PRINTF("looking if %s exists\n", dumpString(s).c_str());
assert(table != ROSE_ANCHORED);
for (RoseLiteralMap::left_map::const_iterator it
= literals.left.lower_bound(rose_literal_id(s, table, 0));
it != literals.left.end(); ++it) {
if (it->first.table != table || it->first.s != s) {
break;
}
const rose_literal_info &info = literal_info[it->second];
if (!info.vertices.empty()) {
return true;
}
}
DEBUG_PRINTF("(used) literal not found\n");
return false;
}
// Registers a fresh, empty-string literal in the ROSE_ANCHORED table and
// returns its id. The `distinctiveness` field is set to the current literal
// count so that each such key is unique; insertion is asserted to succeed.
u32 RoseBuildImpl::getNewLiteralId() {
    const u32 next_id = verify_u32(literals.left.size());

    rose_literal_id key(ue2_literal(), ROSE_ANCHORED, 0);
    key.distinctiveness = next_id;

    const auto ins =
        literals.insert(RoseLiteralMap::value_type(key, next_id));
    const u32 id = ins.first->right;
    assert(ins.second);

    literal_info.push_back(rose_literal_info());
    assert(literal_info.size() == id + 1);
    literal_info[id].undelayed_id = id;

    return id;
}
// Returns true if the graph 'h' could raise one of 'reports' more than once:
// either more than one vertex feeding accept/acceptEod carries one of the
// reports, or the single reporting vertex belongs to a repeat found by
// findRepeats().
static
bool requiresDedupe(const NGHolder &h, const ue2::flat_set<ReportID> &reports,
                    const Grey &grey) {
    /* TODO: tighten */
    NFAVertex seen_vert = NGHolder::null_vertex();

    // Scan the predecessors of accept first, then of acceptEod; a second
    // reporting vertex anywhere means dedupe is needed.
    for (const NFAVertex sink : {h.accept, h.acceptEod}) {
        for (auto v : inv_adjacent_vertices_range(sink, h)) {
            if (!has_intersection(h[v].reports, reports)) {
                continue;
            }
            if (seen_vert != NGHolder::null_vertex()) {
                return true;
            }
            seen_vert = v;
        }
    }

    if (seen_vert) {
        /* if the reporting vertex is part of a terminal repeat, the
         * construction process may reform the graph splitting it into two
         * vertices (pos, cyclic) and hence require dedupe */
        vector<GraphRepeatInfo> repeats;
        findRepeats(h, grey.minExtBoundedRepeatSize, &repeats);
        for (const auto &repeat : repeats) {
            const auto &rv = repeat.vertices;
            if (find(rv.begin(), rv.end(), seen_vert) != rv.end()) {
                return true;
            }
        }
    }

    return false;
}
// Concrete RoseDedupeAux: at construction it indexes, per ReportID, which
// parts of the Rose build (vertices, suffixes, outfixes, puffettes) can raise
// that report, and answers requiresDedupeSupport() queries from those indices.
class RoseDedupeAuxImpl : public RoseDedupeAux {
public:
// Builds all of the per-report indices below from tbi_in.
explicit RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in);
// Returns true if the given set of reports needs dedupe support.
bool requiresDedupeSupport(
const ue2::flat_set<ReportID> &reports) const override;
private:
// True if having several ReportIDs in 'reports' is known to be harmless.
bool hasSafeMultiReports(const ue2::flat_set<ReportID> &reports) const;
// The build being inspected; held by reference.
const RoseBuildImpl &tbi;
map<ReportID, set<RoseVertex>> vert_map; //!< ordinary literals
map<ReportID, set<RoseVertex>> sb_vert_map; //!< small block literals
// Report -> suffixes whose all_reports() contains it.
map<ReportID, set<suffix_id>> suffix_map;
// Report -> outfixes whose all_reports() contains it.
map<ReportID, set<const OutfixInfo *>> outfix_map;
// Report -> puffettes (plain and triggered) of the MPV outfix raising it.
map<ReportID, set<const raw_puff *>> puff_map;
unordered_set<ReportID> live_reports; //!< all live internal reports.
};
// Creates a dedupe helper that indexes the current state of this build.
unique_ptr<RoseDedupeAux> RoseBuildImpl::generateDedupeAux() const {
    unique_ptr<RoseDedupeAux> aux =
        ue2::make_unique<RoseDedupeAuxImpl>(*this);
    return aux;
}
// Out-of-line definition of the interface destructor.
RoseDedupeAux::~RoseDedupeAux() = default;
// Indexes every report source in the build: Rose vertices (split into
// ordinary and small-block maps), suffixes, outfixes, MPV puffettes and the
// boundary reports. Every report seen is also recorded in live_reports.
RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in)
    : tbi(tbi_in) {
    const RoseGraph &g = tbi.g;

    set<suffix_id> suffixes;

    for (auto v : vertices_range(g)) {
        const auto &v_reports = g[v].reports;
        insert(&live_reports, v_reports);

        // Literals in the small block table are "shadow" copies of literals in
        // the other tables that do not run in the same runtime invocation.
        // Dedupe key assignment will be taken care of by the real literals.
        auto &role_map =
            tbi.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK) ? sb_vert_map
                                                                : vert_map;
        for (const auto &report_id : v_reports) {
            role_map[report_id].insert(v);
        }

        // Several vertices may share a suffix, so we collect the set of
        // suffixes first to avoid repeating work.
        if (g[v].suffix) {
            suffixes.insert(g[v].suffix);
        }
    }

    for (const auto &suffix : suffixes) {
        for (const auto &report_id : all_reports(suffix)) {
            suffix_map[report_id].insert(suffix);
            live_reports.insert(report_id);
        }
    }

    for (const auto &outfix : tbi.outfixes) {
        for (const auto &report_id : all_reports(outfix)) {
            outfix_map[report_id].insert(&outfix);
            live_reports.insert(report_id);
        }
    }

    if (tbi.mpv_outfix) {
        auto *mpv = tbi.mpv_outfix->mpv();
        for (const auto &p : mpv->puffettes) {
            puff_map[p.report].insert(&p);
            live_reports.insert(p.report);
        }
        for (const auto &p : mpv->triggered_puffettes) {
            puff_map[p.report].insert(&p);
            live_reports.insert(p.report);
        }
    }

    // Collect live reports from boundary reports.
    const auto &bounds = tbi.boundary;
    insert(&live_reports, bounds.report_at_0);
    insert(&live_reports, bounds.report_at_0_eod);
    insert(&live_reports, bounds.report_at_eod);

    DEBUG_PRINTF("%zu of %zu reports are live\n", live_reports.size(),
                 tbi.rm.numReports());
}
// Expands a literal into one CharReach per position, followed by one "dot"
// per unit of delay.
static
vector<CharReach> makePath(const rose_literal_id &lit) {
    vector<CharReach> path(begin(lit.s), end(lit.s));
    path.insert(path.end(), lit.delay, CharReach::dot());
    return path;
}
/**
 * \brief True if one of the given literals overlaps with the suffix of
 * another, meaning that they could arrive at the same offset.
 *
 * Each literal is expanded with makePath() (so delay becomes trailing dots)
 * and the shorter path is compared, back to front, against the longer one.
 */
static
bool literalsCouldRace(const rose_literal_id &lit1,
                       const rose_literal_id &lit2) {
    DEBUG_PRINTF("compare %s (delay %u) and %s (delay %u)\n",
                 dumpString(lit1.s).c_str(), lit1.delay,
                 dumpString(lit2.s).c_str(), lit2.delay);

    // Add dots on the end of each literal for delay.
    const auto v1 = makePath(lit1);
    const auto v2 = makePath(lit2);

    // See if the smaller path is a suffix of the larger path.
    const bool v1_shorter = v1.size() < v2.size();
    const auto &smaller = v1_shorter ? v1 : v2;
    const auto &bigger = v1_shorter ? v2 : v1;

    const auto diverge = mismatch(smaller.rbegin(), smaller.rend(),
                                  bigger.rbegin(), overlaps);
    return diverge.first == smaller.rend();
}
bool RoseDedupeAuxImpl::hasSafeMultiReports(
const flat_set<ReportID> &reports) const {
if (reports.size() <= 1) {
return true;
}
/* We have more than one ReportID corresponding to the external ID that is
* presented to the user. These may differ in offset adjustment, bounds
* checks, etc. */
/* TODO: work out if these differences will actually cause problems */
/* One common case where we know we don't have a problem is if there are
* precisely two reports, one for the main Rose path and one for the
* "small block matcher" path. */
if (reports.size() == 2) {
ReportID id1 = *reports.begin();
ReportID id2 = *reports.rbegin();
bool has_verts_1 = contains(vert_map, id1);
bool has_verts_2 = contains(vert_map, id2);
bool has_sb_verts_1 = contains(sb_vert_map, id1);
bool has_sb_verts_2 = contains(sb_vert_map, id2);
if (has_verts_1 != has_verts_2 && has_sb_verts_1 != has_sb_verts_2) {
DEBUG_PRINTF("two reports, one full and one small block: ok\n");
return true;
}
}
DEBUG_PRINTF("more than one report\n");
return false;
}
// Returns true if the given set of reports needs dedupe support, i.e. if any
// of the checks below finds a way for one of the (live) reports to be raised
// from more than one place. Returns false only when every check passes.
bool RoseDedupeAuxImpl::requiresDedupeSupport(
const flat_set<ReportID> &reports_in) const {
/* TODO: this could be expanded to check for offset or character
constraints */
// We don't want to consider dead reports (tracked by ReportManager but no
// longer used) for the purposes of assigning dupe keys.
flat_set<ReportID> reports;
for (auto id : reports_in) {
if (contains(live_reports, id)) {
reports.insert(id);
}
}
DEBUG_PRINTF("live reports: %s\n", as_string_list(reports).c_str());
const RoseGraph &g = tbi.g;
// Track which kinds of report source have been seen so far; the suffix,
// outfix, mpv and boundary checks below bail out when a second source shows
// up.
bool has_suffix = false;
bool has_outfix = false;
if (!hasSafeMultiReports(reports)) {
DEBUG_PRINTF("multiple reports not safe\n");
return true;
}
// Gather every role, suffix, outfix and puffette indexed under any of the
// live reports.
set<RoseVertex> roles;
set<suffix_id> suffixes;
set<const OutfixInfo *> outfixes;
set<const raw_puff *> puffettes;
for (ReportID r : reports) {
if (contains(vert_map, r)) {
insert(&roles, vert_map.at(r));
}
if (contains(suffix_map, r)) {
insert(&suffixes, suffix_map.at(r));
}
if (contains(outfix_map, r)) {
insert(&outfixes, outfix_map.at(r));
}
if (contains(puff_map, r)) {
insert(&puffettes, puff_map.at(r));
}
}
/* roles */
map<u32, u32> lits; // Literal ID -> count of occurrences.
const bool has_role = !roles.empty();
for (auto v : roles) {
for (const auto &lit : g[v].literals) {
lits[lit]++;
}
if (g[v].eod_accept) {
// Literals plugged into this EOD accept must be taken into account
// as well.
for (auto u : inv_adjacent_vertices_range(v, g)) {
for (const auto &lit : g[u].literals) {
lits[lit]++;
}
}
}
}
/* literals */
for (const auto &m : lits) {
if (m.second > 1) {
DEBUG_PRINTF("lit %u used by >1 reporting roles\n", m.first);
return true;
}
}
// Pairwise check of all distinct literals collected above.
for (auto it = begin(lits); it != end(lits); ++it) {
const auto &lit1 = tbi.literals.right.at(it->first);
for (auto jt = next(it); jt != end(lits); ++jt) {
const auto &lit2 = tbi.literals.right.at(jt->first);
if (literalsCouldRace(lit1, lit2)) {
DEBUG_PRINTF("literals could race\n");
return true;
}
}
}
/* suffixes */
for (const auto &suffix : suffixes) {
// A second suffix, or a suffix alongside any role, needs dedupe.
if (has_suffix || has_role) {
return true; /* scope for badness */
}
has_suffix = true;
/* some lesser suffix engines (nfas, haig, castle) can raise multiple
* matches for a report id at the same offset if there are multiple
* report states live. */
if (suffix.haig()) {
return true;
}
if (suffix.graph() &&
requiresDedupe(*suffix.graph(), reports, tbi.cc.grey)) {
return true;
}
if (suffix.castle() && requiresDedupe(*suffix.castle(), reports)) {
return true;
}
}
/* outfixes */
for (const auto &outfix_ptr : outfixes) {
assert(outfix_ptr);
const OutfixInfo &out = *outfix_ptr;
// A second outfix, or an outfix alongside a role or suffix, needs dedupe.
if (has_outfix || has_role || has_suffix) {
return true;
}
has_outfix = true;
if (out.haig()) {
return true; /* haig may report matches with different SOM at the
same offset */
}
if (out.holder() &&
requiresDedupe(*out.holder(), reports, tbi.cc.grey)) {
return true;
}
}
/* mpv */
// Puffettes share the has_outfix flag, so a second puffette (or a puffette
// next to any other source) also needs dedupe.
for (UNUSED const auto &puff : puffettes) {
if (has_outfix || has_role || has_suffix) {
return true;
}
has_outfix = true;
}
/* boundary */
// Only report_at_eod is consulted here.
if (has_intersection(tbi.boundary.report_at_eod, reports)) {
if (has_outfix || has_role || has_suffix) {
return true;
}
}
return false;
}
// Ordering for RoseEdgeProps over minBound, maxBound and history, in that
// order; edges equal on all three compare as not-less.
// NOTE(review): this relies on ORDER_CHECK (util/order_check.h) comparing the
// named field of the locals `a` and `b` and returning early when they differ
// -- confirm against the macro definition.
bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) {
ORDER_CHECK(minBound);
ORDER_CHECK(maxBound);
ORDER_CHECK(history);
return false;
}
#ifndef NDEBUG
// Debug-only check: true if every rose_top on v's in-edges is one of the tops
// of v's leftfix. Vertices that are root successors contribute no tops, so
// the check passes trivially for them.
bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) {
    const RoseGraph &g = build.g;
    assert(g[v].left);

    set<u32> graph_tops;
    const bool from_root = build.isRootSuccessor(v);
    if (!from_root) {
        for (const auto &edge : in_edges_range(v, g)) {
            graph_tops.insert(g[edge].rose_top);
        }
    }

    return is_subset_of(graph_tops, all_tops(g[v].left));
}
#endif
// Returns this outfix's queue index, obtaining one from the factory the first
// time it is requested (~0U marks "not yet assigned").
u32 OutfixInfo::get_queue(QueueIndexFactory &qif) {
    if (queue != ~0U) {
        return queue;
    }
    queue = qif.get_queue();
    return queue;
}
namespace {

// Variant visitor that collects the set of reports raised by whichever
// prototype an OutfixInfo currently holds.
class OutfixAllReports : public boost::static_visitor<set<ReportID>> {
public:
    // Empty variant: no reports.
    set<ReportID> operator()(const boost::blank &) const {
        set<ReportID> none;
        return none;
    }

    // Pointer-held prototypes defer to the all_reports() overload for T.
    template<class T>
    set<ReportID> operator()(const unique_ptr<T> &x) const {
        return all_reports(*x);
    }

    // MPV prototypes: reports of both plain and triggered puffettes.
    set<ReportID> operator()(const MpvProto &mpv) const {
        set<ReportID> collected;
        for (const auto &p : mpv.puffettes) {
            collected.insert(p.report);
        }
        for (const auto &p : mpv.triggered_puffettes) {
            collected.insert(p.report);
        }
        return collected;
    }
};

}
// Returns every report the outfix's prototype can raise; asserted non-empty.
set<ReportID> all_reports(const OutfixInfo &outfix) {
    OutfixAllReports visitor;
    auto reports = boost::apply_visitor(visitor, outfix.proto);
    assert(!reports.empty());
    return reports;
}
// Two suffix infos are equal when their top and all five engine members
// compare equal.
bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const {
    if (!(top == b.top)) {
        return false;
    }
    if (!(graph == b.graph)) {
        return false;
    }
    if (!(castle == b.castle)) {
        return false;
    }
    if (!(rdfa == b.rdfa)) {
        return false;
    }
    if (!(haig == b.haig)) {
        return false;
    }
    return tamarama == b.tamarama;
}
// Ordering for RoseSuffixInfo over top, graph, castle, haig, rdfa and
// tamarama, in that order; infos equal on all of them compare as not-less.
// NOTE(review): this relies on ORDER_CHECK (util/order_check.h) comparing the
// named field of the locals `a` and `b` and returning early when they differ
// -- confirm against the macro definition.
bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const {
// ORDER_CHECK refers to `a`, so alias *this under that name.
const RoseSuffixInfo &a = *this;
ORDER_CHECK(top);
ORDER_CHECK(graph);
ORDER_CHECK(castle);
ORDER_CHECK(haig);
ORDER_CHECK(rdfa);
ORDER_CHECK(tamarama);
// Reaching here means all fields above matched; the DFA widths are then
// expected to agree too and are not part of the ordering.
assert(a.dfa_min_width == b.dfa_min_width);
assert(a.dfa_max_width == b.dfa_max_width);
return false;
}
// Clears the suffix: top goes back to 0, every engine member is reset, and
// the DFA width bounds return to [0, infinity].
void RoseSuffixInfo::reset(void) {
top = 0;
graph.reset();
castle.reset();
rdfa.reset();
haig.reset();
tamarama.reset();
dfa_min_width = 0;
dfa_max_width = depth::infinity();
}
// Returns the reports of the suffix's engine, checked in the order tamarama,
// graph, castle, dfa, and finally haig.
// NOTE(review): the assert does not mention tamarama even though it is tested
// first below -- presumably a tamarama suffix always has another engine set
// as well; confirm.
std::set<ReportID> all_reports(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.tamarama()) {
        return all_reports(*s.tamarama());
    }
    if (s.graph()) {
        return all_reports(*s.graph());
    }
    if (s.castle()) {
        return all_reports(*s.castle());
    }
    if (s.dfa()) {
        return all_reports(*s.dfa());
    }
    return all_reports(*s.haig());
}
// Minimum width of the suffix: computed from the graph or castle if present,
// otherwise the stored dfa_min_width.
depth findMinWidth(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.graph()) {
        return findMinWidth(*s.graph());
    }
    if (s.castle()) {
        return findMinWidth(*s.castle());
    }
    return s.dfa_min_width;
}
// Minimum width of the suffix for the given top: computed from the graph or
// castle if present, otherwise the stored dfa_min_width (top is not used in
// that case).
depth findMinWidth(const suffix_id &s, u32 top) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.graph()) {
        return findMinWidth(*s.graph(), top);
    }
    if (s.castle()) {
        return findMinWidth(*s.castle(), top);
    }
    return s.dfa_min_width;
}
// Maximum width of the suffix: computed from the graph or castle if present,
// otherwise the stored dfa_max_width.
depth findMaxWidth(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.graph()) {
        return findMaxWidth(*s.graph());
    }
    if (s.castle()) {
        return findMaxWidth(*s.castle());
    }
    return s.dfa_max_width;
}
// Maximum width of the suffix for the given top: computed from the graph or
// castle if present, otherwise the stored dfa_max_width (top is not used in
// that case).
depth findMaxWidth(const suffix_id &s, u32 top) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.graph()) {
        return findMaxWidth(*s.graph(), top);
    }
    if (s.castle()) {
        return findMaxWidth(*s.castle(), top);
    }
    return s.dfa_max_width;
}
// True if the suffix can accept at end of data. For graphs this means
// acceptEod has more than one in-edge; castles always answer false.
bool has_eod_accepts(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.graph()) {
        /* ignore accept -> eod edge */
        return in_degree(s.graph()->acceptEod, *s.graph()) > 1;
    }
    if (s.castle()) {
        return false;
    }
    if (s.dfa()) {
        return has_eod_accepts(*s.dfa());
    }
    return has_eod_accepts(*s.haig());
}
// True if the suffix can accept somewhere other than end of data. For graphs
// this means accept has at least one in-edge; castles always answer true.
bool has_non_eod_accepts(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.graph()) {
        return in_degree(s.graph()->accept, *s.graph()) != 0;
    }
    if (s.castle()) {
        return true;
    }
    if (s.dfa()) {
        return has_non_eod_accepts(*s.dfa());
    }
    return has_non_eod_accepts(*s.haig());
}
// Returns the set of tops of the suffix: the graph's tops (asserted
// non-empty), the castle's repeat keys, or just {0} for other engine types.
set<u32> all_tops(const suffix_id &s) {
    assert(s.graph() || s.castle() || s.haig() || s.dfa());

    if (s.graph()) {
        flat_set<u32> graph_tops = getTops(*s.graph());
        assert(!graph_tops.empty());
        return set<u32>(graph_tops.begin(), graph_tops.end());
    }

    if (s.castle()) {
        return assoc_keys(s.castle()->repeats);
    }

    // Other types of suffix are not multi-top.
    set<u32> single;
    single.insert(0);
    return single;
}
// Combines the g, c, d and h members into a single hash value.
size_t suffix_id::hash() const {
    size_t seed = 0;
    hash_combine(seed, g);
    hash_combine(seed, c);
    hash_combine(seed, d);
    hash_combine(seed, h);
    return seed;
}
// Free-function hash hook; forwards to suffix_id::hash().
size_t hash_value(const suffix_id &s) {
    const size_t h = s.hash();
    return h;
}
// Anchoring test for a leftfix. Graphs defer to isAnchored(NGHolder); DFAs
// and Haigs answer true when start_anchored == DEAD_STATE; everything else is
// treated as anchored.
bool isAnchored(const left_id &r) {
    assert(r.graph() || r.castle() || r.haig() || r.dfa());

    if (r.graph()) {
        return isAnchored(*r.graph());
    }

    if (r.dfa()) {
        const bool dfa_result = (r.dfa()->start_anchored == DEAD_STATE);
        return dfa_result;
    }

    if (r.haig()) {
        const bool haig_result = (r.haig()->start_anchored == DEAD_STATE);
        return haig_result;
    }

    // All other types are explicitly anchored.
    return true;
}
// Minimum width of the leftfix: computed from the graph or castle if present,
// otherwise the stored dfa_min_width.
depth findMinWidth(const left_id &r) {
    assert(r.graph() || r.castle() || r.haig() || r.dfa());

    if (r.graph()) {
        return findMinWidth(*r.graph());
    }
    if (r.castle()) {
        return findMinWidth(*r.castle());
    }
    return r.dfa_min_width;
}
// Maximum width of the leftfix: computed from the graph or castle if present,
// otherwise the stored dfa_max_width.
depth findMaxWidth(const left_id &r) {
    assert(r.graph() || r.castle() || r.haig() || r.dfa());

    if (r.graph()) {
        return findMaxWidth(*r.graph());
    }
    if (r.castle()) {
        return findMaxWidth(*r.castle());
    }
    return r.dfa_max_width;
}
// Returns the set of tops of the leftfix: the graph's tops, the castle's
// repeat keys, or just {0} for other engine types.
set<u32> all_tops(const left_id &r) {
    assert(r.graph() || r.castle() || r.haig() || r.dfa());

    if (r.graph()) {
        flat_set<u32> graph_tops = getTops(*r.graph());
        return set<u32>(graph_tops.begin(), graph_tops.end());
    }

    if (r.castle()) {
        return assoc_keys(r.castle()->repeats);
    }

    // Other types of rose are not multi-top.
    set<u32> single;
    single.insert(0);
    return single;
}
// Number of distinct tops of the leftfix (size of all_tops(r)).
u32 num_tops(const left_id &r) {
    const set<u32> tops = all_tops(r);
    return tops.size();
}
// Combines the g, c, d and h members into a single hash value.
size_t left_id::hash() const {
    size_t seed = 0;
    hash_combine(seed, g);
    hash_combine(seed, c);
    hash_combine(seed, d);
    hash_combine(seed, h);
    return seed;
}
// Free-function hash hook; forwards to left_id::hash().
size_t hash_value(const left_id &r) {
    const size_t h = r.hash();
    return h;
}
// Returns the largest maxOffset over the given reports, or MAX_OFFSET as soon
// as a report without bounds is encountered. 'reports' must be non-empty.
u64a findMaxOffset(const set<ReportID> &reports, const ReportManager &rm) {
    assert(!reports.empty());

    u64a furthest = 0;
    for (const auto &id : reports) {
        const Report &ir = rm.getReport(id);
        if (!ir.hasBounds()) {
            return MAX_OFFSET;
        }
        if (ir.maxOffset > furthest) {
            furthest = ir.maxOffset;
        }
    }
    return furthest;
}
// Clears the leftfix: every engine member is reset, lag goes back to 0, the
// leftfix report becomes MO_INVALID_IDX, and the DFA width bounds return to
// [0, infinity].
void LeftEngInfo::reset(void) {
graph.reset();
castle.reset();
dfa.reset();
haig.reset();
tamarama.reset();
lag = 0;
leftfix_report = MO_INVALID_IDX;
dfa_min_width = 0;
dfa_max_width = depth::infinity();
}
// True if any of graph, castle, dfa or haig is set. Debug builds also check
// the expected combinations of those members.
LeftEngInfo::operator bool() const {
    assert((int)!!castle + (int)!!dfa + (int)!!haig <= 1);
    assert(!castle || !graph);
    assert(!dfa || graph); /* dfas always have the graph as well */
    assert(!haig || graph);

    if (graph) {
        return true;
    }
    if (castle) {
        return true;
    }
    if (dfa) {
        return true;
    }
    if (haig) {
        return true;
    }
    return false;
}
/**
 * Coarse quality estimate for a compiled Rose engine.
 *
 * Returns 0 ("low quality") if any of the following holds, and 1 otherwise:
 *  - the anchored table has more than one engine, or its single engine is not
 *    a small DFA type;
 *  - any of the first activeLeftCount leftfix table entries has eod_check set;
 *  - more than one always-run component is present (anchored table, eager
 *    prefixes, floating table, EOD matcher, non-MPV outfixes).
 */
u32 roseQuality(const RoseEngine *t) {
    /* Rose is low quality if the atable is a Mcclellan 16 or has multiple DFAs
     */
    const anchored_matcher_info *atable = getALiteralMatcher(t);
    if (atable) {
        if (atable->next_offset) {
            DEBUG_PRINTF("multiple atable engines\n");
            return 0;
        }
        // The NFA is read from immediately after the anchored_matcher_info
        // header.
        const NFA *nfa = (const NFA *)((const char *)atable + sizeof(*atable));

        if (!isSmallDfaType(nfa->type)) {
            DEBUG_PRINTF("m16 atable engine\n");
            return 0;
        }
    }

    /* if we always run multiple engines then we are slow */
    u32 always_run = 0;

    if (atable) {
        always_run++;
    }

    if (t->eagerIterOffset) {
        /* eager prefixes are always run */
        always_run++;
    }

    const HWLM *ftable = getFLiteralMatcher(t);
    if (ftable) {
        /* TODO: ignore conditional ftables, or ftables beyond smwr region */
        always_run++;
    }

    if (t->ematcherOffset) {
        always_run++;
    }

    /* ignore mpv outfixes as they are v good, mpv outfixes are before begin */
    if (t->outfixBeginQueue != t->outfixEndQueue) {
        /* TODO: ignore outfixes > smwr region */
        always_run++;
    }

    // Inspect every active leftfix entry. The previous code tested
    // left->eod_check on each iteration, i.e. only ever entry 0.
    const LeftNfaInfo *left = getLeftTable(t);
    for (u32 i = 0; i < t->activeLeftCount; i++) {
        if (left[i].eod_check) {
            DEBUG_PRINTF("eod prefixes are slow\n");
            return 0;
        }
    }

    if (always_run > 1) {
        DEBUG_PRINTF("we always run %u engines\n", always_run);
        return 0;
    }

    return 1;
}
#ifndef NDEBUG
/** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose
 * graph are implementable.
 *
 * Castle, DFA and Haig engines are accepted without further checks; only NFA
 * graphs are passed to isImplementableNFA(). */
bool canImplementGraphs(const RoseBuildImpl &tbi) {
    const RoseGraph &g = tbi.g;

    // First, check the Rose leftfixes.
    for (auto v : vertices_range(g)) {
        DEBUG_PRINTF("leftfix: check vertex %zu\n", g[v].index);

        const auto &left = g[v].left;
        if (left.castle) {
            DEBUG_PRINTF("castle ok\n");
            continue;
        }
        if (left.dfa) {
            DEBUG_PRINTF("dfa ok\n");
            continue;
        }
        if (left.haig) {
            DEBUG_PRINTF("haig ok\n");
            continue;
        }
        if (!left.graph) {
            continue;
        }

        assert(left.graph->kind
               == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX));
        if (!isImplementableNFA(*left.graph, nullptr, tbi.cc)) {
            DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n",
                         g[v].index, num_vertices(*left.graph));
            return false;
        }
    }

    // Suffix graphs.
    for (auto v : vertices_range(g)) {
        DEBUG_PRINTF("suffix: check vertex %zu\n", g[v].index);

        const RoseSuffixInfo &suffix = g[v].suffix;
        if (suffix.castle) {
            DEBUG_PRINTF("castle suffix ok\n");
            continue;
        }
        if (suffix.rdfa) {
            DEBUG_PRINTF("dfa suffix ok\n");
            continue;
        }
        if (suffix.haig) {
            DEBUG_PRINTF("haig suffix ok\n");
            continue;
        }
        if (!suffix.graph) {
            continue;
        }

        assert(suffix.graph->kind == NFA_SUFFIX);
        if (!isImplementableNFA(*suffix.graph, &tbi.rm, tbi.cc)) {
            DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n",
                         g[v].index, num_vertices(*suffix.graph));
            return false;
        }
    }

    return true;
}
bool hasOrphanedTops(const RoseBuildImpl &build) {
const RoseGraph &g = build.g;
ue2::unordered_map<left_id, set<u32> > roses;
ue2::unordered_map<suffix_id, set<u32> > suffixes;
for (auto v : vertices_range(g)) {
if (g[v].left) {
set<u32> &tops = roses[g[v].left];
if (!build.isRootSuccessor(v)) {
// Tops for infixes come from the in-edges.
for (const auto &e : in_edges_range(v, g)) {
tops.insert(g[e].rose_top);
}
}
}
if (g[v].suffix) {
suffixes[g[v].suffix].insert(g[v].suffix.top);
}
}
for (const auto &e : roses) {
if (all_tops(e.first) != e.second) {
DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n",
as_string_list(all_tops(e.first)).c_str(),
as_string_list(e.second).c_str());
return true;
}
}
for (const auto &e : suffixes) {
if (all_tops(e.first) != e.second) {
DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n",
as_string_list(all_tops(e.first)).c_str(),
as_string_list(e.second).c_str());
return true;
}
}
return false;
}
#endif // NDEBUG
} // namespace ue2