vectorscan/src/nfagraph/ng_violet.cpp

3085 lines
94 KiB
C++

/*
* Copyright (c) 2016-2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ng_violet.h"
#include "grey.h"
#include "ng_depth.h"
#include "ng_dominators.h"
#include "ng_dump.h"
#include "ng_equivalence.h"
#include "ng_holder.h"
#include "ng_is_equal.h"
#include "ng_literal_analysis.h"
#include "ng_limex.h"
#include "ng_mcclellan.h"
#include "ng_netflow.h"
#include "ng_prune.h"
#include "ng_redundancy.h"
#include "ng_region.h"
#include "ng_reports.h"
#include "ng_split.h"
#include "ng_util.h"
#include "ng_width.h"
#include "nfa/rdfa.h"
#include "rose/rose_build.h"
#include "rose/rose_build_util.h"
#include "rose/rose_in_dump.h"
#include "rose/rose_in_graph.h"
#include "rose/rose_in_util.h"
#include "util/compare.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/flat_containers.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/graph_small_color_map.h"
#include "util/insertion_ordered.h"
#include "util/order_check.h"
#include "util/target_info.h"
#include "util/ue2string.h"
#include <set>
#include <utility>
#include <vector>
#include <memory>
#include <boost/dynamic_bitset.hpp>
#include <boost/range/adaptor/map.hpp>
#define STAGE_DEBUG_PRINTF DEBUG_PRINTF
using namespace std;
using boost::adaptors::map_values;
namespace ue2 {
/* createsAnchoredLHS() is conservative as the depths take into account
* back edges that come from beyond the split point and would be missing after
* the graph is split. */
static
bool createsAnchoredLHS(const NGHolder &g, const vector<NFAVertex> &vv,
const vector<NFAVertexDepth> &depths,
const Grey &grey, depth max_depth = depth::infinity()) {
max_depth = min(max_depth, depth(grey.maxAnchoredRegion));
for (auto v : vv) {
/* avoid issues of self loops blowing out depths:
* look at preds, add 1 */
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == v) {
continue;
}
u32 idx = g[u].index;
assert(idx < depths.size());
if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) {
return false;
}
}
}
return true;
}
/* createsTransientLHS() is conservative as the depths take into account
* back edges that come from beyond the split point and would be missing after
* the graph is split. */
static
bool createsTransientLHS(const NGHolder &g, const vector<NFAVertex> &vv,
const vector<NFAVertexDepth> &depths,
const Grey &grey) {
const depth max_depth(grey.maxHistoryAvailable);
for (auto v : vv) {
/* avoid issues of self loops blowing out depths:
* look at preds, add 1 */
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == v) {
continue;
}
u32 idx = g[u].index;
assert(idx < depths.size());
if (maxDistFromInit(depths.at(idx)) >= max_depth) {
return false;
}
}
}
return true;
}
/**
* Counts the number of vertices that are reachable from the set of sources
* given.
*/
static
size_t count_reachable(const NGHolder &g, const vector<NFAVertex> &sources,
small_color_map<decltype(get(vertex_index, g))> &color_map) {
auto null_visitor = boost::make_dfs_visitor(boost::null_visitor());
color_map.fill(small_color::white);
for (auto v : sources) {
boost::depth_first_visit(g, v, null_visitor, color_map);
}
return color_map.count(small_color::black);
}
static
size_t shorter_than(const set<ue2_literal> &s, size_t limit) {
return count_if(s.begin(), s.end(),
[&](const ue2_literal &a) { return a.length() < limit; });
}
static
u32 min_len(const set<ue2_literal> &s) {
u32 rv = ~0U;
for (const auto &lit : s) {
rv = min(rv, (u32)lit.length());
}
return rv;
}
static
u32 min_period(const set<ue2_literal> &s) {
u32 rv = ~0U;
for (const auto &lit : s) {
rv = min(rv, (u32)minStringPeriod(lit));
}
DEBUG_PRINTF("min period %u\n", rv);
return rv;
}
namespace {
/**
* Information on a cut: vertices and literals.
*/
struct VertLitInfo {
VertLitInfo() {}
VertLitInfo(NFAVertex v, const set<ue2_literal> &litlit, bool c_anch,
bool c_tran = false)
: vv(vector<NFAVertex>(1, v)), lit(litlit), creates_anchored(c_anch),
creates_transient(c_tran) {}
VertLitInfo(const vector<NFAVertex> &vv_in, const set<ue2_literal> &lit_in,
bool c_anch)
: vv(vv_in), lit(lit_in), creates_anchored(c_anch) {}
vector<NFAVertex> vv;
set<ue2_literal> lit;
bool creates_anchored = false;
bool creates_transient = false;
double split_ratio = 0;
};
#define LAST_CHANCE_STRONG_LEN 1
/**
* \brief Comparator class for comparing different literal cuts.
*/
class LitComparator {
public:
LitComparator(const NGHolder &g_in, bool sa, bool st, bool lc)
: g(g_in), seeking_anchored(sa), seeking_transient(st),
last_chance(lc) {}
bool operator()(const unique_ptr<VertLitInfo> &a,
const unique_ptr<VertLitInfo> &b) const {
assert(a && b);
if (seeking_anchored) {
if (a->creates_anchored != b->creates_anchored) {
return a->creates_anchored < b->creates_anchored;
}
}
if (seeking_transient) {
if (a->creates_transient != b->creates_transient) {
return a->creates_transient < b->creates_transient;
}
}
if (last_chance
&& min_len(a->lit) > LAST_CHANCE_STRONG_LEN
&& min_len(b->lit) > LAST_CHANCE_STRONG_LEN) {
DEBUG_PRINTF("using split ratio %g , %g\n", a->split_ratio,
b->split_ratio);
return a->split_ratio < b->split_ratio;
}
u64a score_a = scoreSet(a->lit);
u64a score_b = scoreSet(b->lit);
if (score_a != score_b) {
return score_a > score_b;
}
/* vertices should only be in one candidate cut */
assert(a->vv == b->vv || a->vv.front() != b->vv.front());
return g[a->vv.front()].index > g[b->vv.front()].index;
}
private:
const NGHolder &g; /**< graph on which cuts are found */
bool seeking_anchored;
bool seeking_transient;
bool last_chance;
};
}
#define MIN_ANCHORED_LEN 2
#define MIN_ANCHORED_DESPERATE_LEN 1
/* anchored here means that the cut creates a 'usefully' anchored LHS */
static
bool validateRoseLiteralSetQuality(const set<ue2_literal> &s, u64a score,
bool anchored, u32 min_allowed_floating_len,
bool desperation, bool last_chance) {
u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN
: min_allowed_floating_len;
if (anchored && last_chance) {
min_allowed_len = MIN_ANCHORED_DESPERATE_LEN;
}
if (last_chance) {
desperation = true;
}
DEBUG_PRINTF("validating%s set, min allowed len %u\n",
anchored ? " anchored" : "", min_allowed_len);
assert(none_of(begin(s), end(s), bad_mixed_sensitivity));
if (score >= NO_LITERAL_AT_EDGE_SCORE) {
DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size());
return false;
}
assert(!s.empty());
if (s.empty()) {
DEBUG_PRINTF("candidate is too bad/something went wrong\n");
return false;
}
u32 s_min_len = min_len(s);
u32 s_min_period = min_period(s);
size_t short_count = shorter_than(s, 5);
DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u"
" short_count=%zu desp=%d\n",
dumpString(*s.begin()).c_str(), score, s.size(), s_min_len,
s_min_period, short_count, (int)desperation);
bool ok = true;
if (s.size() > 10 /* magic number is magic */
|| s_min_len < min_allowed_len
|| (s_min_period <= 1 && min_allowed_len != 1)) {
DEBUG_PRINTF("candidate may be bad\n");
ok = false;
}
if (!ok && desperation
&& s.size() <= 20 /* more magic numbers are magical */
&& (s_min_len > 5 || (s_min_len > 2 && short_count <= 10))
&& s_min_period > 1) {
DEBUG_PRINTF("candidate is ok\n");
ok = true;
}
if (!ok && desperation
&& s.size() <= 50 /* more magic numbers are magical */
&& s_min_len > 10
&& s_min_period > 1) {
DEBUG_PRINTF("candidate is ok\n");
ok = true;
}
if (!ok) {
DEBUG_PRINTF("candidate is too shitty\n");
return false;
}
return true;
}
static UNUSED
void dumpRoseLiteralSet(const set<ue2_literal> &s) {
for (UNUSED const auto &lit : s) {
DEBUG_PRINTF(" lit: %s\n", dumpString(lit).c_str());
}
}
static
void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored,
const vector<NFAVertexDepth> *depths,
const set<NFAVertex> &a_dom,
vector<unique_ptr<VertLitInfo>> *lits,
u32 min_allowed_len, bool desperation,
bool last_chance, const CompileContext &cc) {
assert(depths || !seeking_anchored);
map<NFAVertex, u64a> scores;
map<NFAVertex, unique_ptr<VertLitInfo>> lit_info;
for (auto v : a_dom) {
set<ue2_literal> s = getLiteralSet(g, v, true); /* RHS will take responsibility for any
revisits to the target vertex */
if (s.empty()) {
DEBUG_PRINTF("candidate is too shitty\n");
continue;
}
DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
dumpRoseLiteralSet(s);
u64a score = sanitizeAndCompressAndScore(s);
bool anchored = false;
if (seeking_anchored) {
anchored = createsAnchoredLHS(g, {v}, *depths, cc.grey);
}
if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
desperation, last_chance)) {
continue;
}
DEBUG_PRINTF("candidate is a candidate\n");
scores[v] = score;
lit_info[v] = std::make_unique<VertLitInfo>(v, s, anchored);
}
/* try to filter out cases where appending some characters produces worse
* literals. Only bother to look back one byte, TODO make better */
for (auto u : a_dom) {
if (out_degree(u, g) != 1 || !scores[u]) {
continue;
}
NFAVertex v = *adjacent_vertices(u, g).first;
if (contains(scores, v) && scores[v] >= scores[u]) {
DEBUG_PRINTF("killing off v as score %llu >= %llu\n",
scores[v], scores[u]);
lit_info.erase(v);
}
}
lits->reserve(lit_info.size());
for (auto &m : lit_info) {
lits->emplace_back(std::move(m.second));
}
DEBUG_PRINTF("%zu candidate literal sets\n", lits->size());
}
static
void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored,
const vector<NFAVertexDepth> *depths,
const set<NFAVertex> &bad,
const set<NFAVertex> *allowed,
vector<unique_ptr<VertLitInfo>> *lits,
u32 min_allowed_len, bool desperation,
bool last_chance, const CompileContext &cc) {
/* This allows us to get more places to split the graph as we are not
limited to points where there is a single vertex to split at. */
assert(depths || !seeking_anchored);
/* TODO: operate over 'proto-regions' which ignore back edges */
auto regions = assignRegions(g);
set<u32> mand, optional;
map<u32, vector<NFAVertex> > exits;
for (auto v : vertices_range(g)) {
u32 region = regions[v];
if (is_any_start(v, g) || region == 0) {
continue;
}
if (is_any_accept(v, g)) {
continue;
}
if (!generates_callbacks(g) && is_match_vertex(v, g)) {
/* we cannot leave a completely vacuous infix */
continue;
}
if (isRegionExit(g, v, regions)) {
exits[region].emplace_back(v);
}
if (isRegionEntry(g, v, regions)) {
// Determine whether this region is mandatory or optional. We only
// need to do this check for the first entry vertex we encounter
// for this region.
if (!contains(mand, region) && !contains(optional, region)) {
if (isOptionalRegion(g, v, regions)) {
optional.insert(region);
} else {
mand.insert(region);
}
}
}
}
for (const auto &m : exits) {
if (false) {
next_cand:
continue;
}
const u32 region = m.first;
const vector<NFAVertex> &vv = m.second;
assert(!vv.empty());
if (!contains(mand, region)) {
continue;
}
for (auto v : vv) {
/* if an exit is in bad, the region is already handled well
* by getSimpleRoseLiterals or is otherwise bad */
if (contains(bad, v)) {
goto next_cand;
}
/* if we are only allowed to consider some vertices, v must be in
the list; */
if (allowed && !contains(*allowed, v)) {
goto next_cand;
}
}
/* the final region may not have a neat exit. validate that all exits
* have an edge to each accept or none do */
bool edge_to_a = edge(vv[0], g.accept, g).second;
bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second;
const auto &reports = g[vv[0]].reports;
for (auto v : vv) {
if (edge_to_a != edge(v, g.accept, g).second) {
goto next_cand;
}
if (edge_to_aeod != edge(v, g.acceptEod, g).second) {
goto next_cand;
}
if (g[v].reports != reports) {
goto next_cand;
}
}
DEBUG_PRINTF("inspecting region %u\n", region);
set<ue2_literal> s;
for (auto v : vv) {
DEBUG_PRINTF(" exit vertex: %zu\n", g[v].index);
/* Note: RHS can not be depended on to take all subsequent revisits
* to this vertex */
set<ue2_literal> ss = getLiteralSet(g, v, false);
if (ss.empty()) {
DEBUG_PRINTF("candidate is too shitty\n");
goto next_cand;
}
insert(&s, ss);
}
assert(!s.empty());
DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
dumpRoseLiteralSet(s);
u64a score = sanitizeAndCompressAndScore(s);
DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size());
dumpRoseLiteralSet(s);
bool anchored = false;
if (seeking_anchored) {
anchored = createsAnchoredLHS(g, vv, *depths, cc.grey);
}
if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
desperation, last_chance)) {
goto next_cand;
}
DEBUG_PRINTF("candidate is a candidate\n");
lits->emplace_back(std::make_unique<VertLitInfo>(vv, s, anchored));
}
}
static
void filterCandPivots(const NGHolder &g, const set<NFAVertex> &cand_raw,
set<NFAVertex> *out) {
for (auto u : cand_raw) {
const CharReach &u_cr = g[u].char_reach;
if (u_cr.count() > 40) {
continue; /* too wide to be plausible */
}
if (u_cr.count() > 2) {
/* include u as a candidate as successor may have backed away from
* expanding through it */
out->insert(u);
continue;
}
NFAVertex v = getSoleDestVertex(g, u);
if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) {
const CharReach &v_cr = g[v].char_reach;
if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
continue; /* v will always generate better literals */
}
}
out->insert(u);
}
}
/* cand_raw is the candidate set before filtering points which are clearly
* a bad idea. */
static
void getCandidatePivots(const NGHolder &g, set<NFAVertex> *cand,
set<NFAVertex> *cand_raw) {
auto dominators = findDominators(g);
set<NFAVertex> accepts;
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
if (is_special(v, g)) {
continue;
}
accepts.insert(v);
}
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
if (is_special(v, g)) {
continue;
}
accepts.insert(v);
}
assert(!accepts.empty());
vector<NFAVertex> dom_trace;
auto ait = accepts.begin();
assert(ait != accepts.end());
NFAVertex curr = *ait;
while (curr && !is_special(curr, g)) {
dom_trace.emplace_back(curr);
curr = dominators[curr];
}
reverse(dom_trace.begin(), dom_trace.end());
for (++ait; ait != accepts.end(); ++ait) {
curr = *ait;
vector<NFAVertex> dom_trace2;
while (curr && !is_special(curr, g)) {
dom_trace2.emplace_back(curr);
curr = dominators[curr];
}
reverse(dom_trace2.begin(), dom_trace2.end());
auto dti = dom_trace.begin(), dtie = dom_trace.end();
auto dtj = dom_trace2.begin(), dtje = dom_trace2.end();
while (dti != dtie && dtj != dtje && *dti == *dtj) {
++dti;
++dtj;
}
dom_trace.erase(dti, dtie);
}
cand_raw->insert(dom_trace.begin(), dom_trace.end());
filterCandPivots(g, *cand_raw, cand);
}
static
unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g,
const vector<NFAVertexDepth> *depths,
bool for_prefix, u32 min_len,
const set<NFAVertex> *allowed_cand,
const set<NFAVertex> *disallowed_cand,
bool last_chance,
const CompileContext &cc) {
assert(!for_prefix || depths);
/* look for a single simple split point */
set<NFAVertex> cand;
set<NFAVertex> cand_raw;
getCandidatePivots(g, &cand, &cand_raw);
if (allowed_cand) {
set<NFAVertex> cand2;
set<NFAVertex> cand2_raw;
set_intersection(allowed_cand->begin(), allowed_cand->end(),
cand.begin(), cand.end(),
inserter(cand2, cand2.begin()));
set_intersection(allowed_cand->begin(), allowed_cand->end(),
cand_raw.begin(), cand_raw.end(),
inserter(cand2_raw, cand2_raw.begin()));
cand = std::move(cand2);
cand_raw = std::move(cand2_raw);
}
if (disallowed_cand) {
DEBUG_PRINTF("%zu disallowed candidates\n", disallowed_cand->size());
DEBUG_PRINTF("|old cand| = %zu\n", cand.size());
erase_all(&cand, *disallowed_cand);
insert(&cand_raw, *disallowed_cand);
}
if (!generates_callbacks(g)) {
/* not output exposed so must leave some RHS */
for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) {
cand.erase(v);
cand_raw.erase(v);
}
for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) {
cand.erase(v);
cand_raw.erase(v);
}
}
DEBUG_PRINTF("|cand| = %zu\n", cand.size());
bool seeking_anchored = for_prefix;
bool seeking_transient = for_prefix;
bool desperation = for_prefix && cc.streaming;
vector<unique_ptr<VertLitInfo>> lits; /**< sorted list of potential cuts */
getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len,
desperation, last_chance, cc);
getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand,
&lits, min_len, desperation, last_chance, cc);
if (lits.empty()) {
DEBUG_PRINTF("no literals found\n");
return nullptr;
}
if (seeking_transient) {
for (const auto &a : lits) {
a->creates_transient
= createsTransientLHS(g, a->vv, *depths, cc.grey);
}
}
if (last_chance) {
const size_t num_verts = num_vertices(g);
auto color_map = make_small_color_map(g);
for (const auto &a : lits) {
size_t num_reachable = count_reachable(g, a->vv, color_map);
double ratio = (double)num_reachable / (double)num_verts;
a->split_ratio = ratio > 0.5 ? 1 - ratio : ratio;
}
}
auto cmp = LitComparator(g, seeking_anchored, seeking_transient,
last_chance);
unique_ptr<VertLitInfo> best = std::move(lits.back());
lits.pop_back();
while (!lits.empty()) {
if (cmp(best, lits.back())) {
best = std::move(lits.back());
}
lits.pop_back();
}
DEBUG_PRINTF("best is '%s' %zu a%d t%d\n",
dumpString(*best->lit.begin()).c_str(),
g[best->vv.front()].index,
depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0,
depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0);
return best;
}
static
void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ,
bool overhang_ok, flat_set<NFAEdge> &bad) {
DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n",
num_vertices(h), succ.length());
using EdgeSet = boost::dynamic_bitset<>;
const size_t edge_count = num_edges(h);
EdgeSet bad_edges(edge_count);
unordered_map<NFAVertex, EdgeSet> curr;
for (const auto &e : in_edges_range(h.accept, h)) {
auto &path_set = curr[source(e, h)];
if (path_set.empty()) {
path_set.resize(edge_count);
}
path_set.set(h[e].index);
}
unordered_map<NFAVertex, EdgeSet> next;
for (auto it = succ.rbegin(); it != succ.rend(); ++it) {
for (const auto &path : curr) {
NFAVertex u = path.first;
const auto &path_set = path.second;
if (u == h.start && overhang_ok) {
DEBUG_PRINTF("poisoning early %zu [overhang]\n",
path_set.count());
bad_edges |= path_set;
continue;
}
if (overlaps(h[u].char_reach, *it)) {
for (const auto &e : in_edges_range(u, h)) {
auto &new_path_set = next[source(e, h)];
if (new_path_set.empty()) {
new_path_set.resize(edge_count);
}
new_path_set |= path_set;
new_path_set.set(h[e].index);
}
}
}
DEBUG_PRINTF("succ char matches at %zu paths\n", next.size());
assert(overhang_ok || !curr.empty());
swap(curr, next);
next.clear();
}
assert(overhang_ok || !curr.empty());
for (const auto &path : curr) {
bad_edges |= path.second;
DEBUG_PRINTF("poisoning %zu vertices\n", path.second.count());
}
for (const auto &e : edges_range(h)) {
if (bad_edges.test(h[e].index)) {
bad.insert(e);
}
}
}
static
void poisonForGoodPrefix(const NGHolder &h,
const vector<NFAVertexDepth> &depths,
flat_set<NFAEdge> &bad, const Grey &grey) {
for (const auto &v : vertices_range(h)) {
if (!createsAnchoredLHS(h, {v}, depths, grey)
&& !createsTransientLHS(h, {v}, depths, grey)) {
insert(&bad, in_edges_range(v, h));
}
}
}
static UNUSED
bool is_any_accept_type(RoseInVertexType t) {
return t == RIV_ACCEPT || t == RIV_ACCEPT_EOD;
}
static
flat_set<NFAEdge> poisonEdges(const NGHolder &h,
const vector<NFAVertexDepth> *depths,
const RoseInGraph &vg, const vector<RoseInEdge> &ee,
bool for_prefix, const Grey &grey) {
DEBUG_PRINTF("poisoning edges %zu successor edges\n", ee.size());
/* poison edges covered by successor literal */
set<pair<ue2_literal, bool> > succs;
for (const RoseInEdge &ve : ee) {
if (vg[target(ve, vg)].type != RIV_LITERAL) {
/* nothing to poison in suffixes/outfixes */
assert(generates_callbacks(h));
assert(is_any_accept_type(vg[target(ve, vg)].type));
continue;
}
succs.insert({vg[target(ve, vg)].s,
vg[source(ve, vg)].type == RIV_LITERAL});
}
DEBUG_PRINTF("poisoning edges %zu successor literals\n", succs.size());
flat_set<NFAEdge> bad;
for (const auto &p : succs) {
poisonFromSuccessor(h, p.first, p.second, bad);
}
/* poison edges which don't significantly improve a prefix */
if (for_prefix) {
poisonForGoodPrefix(h, *depths, bad, grey);
}
return bad;
}
static
set<NFAVertex> poisonVertices(const NGHolder &h, const RoseInGraph &vg,
const vector<RoseInEdge> &ee, const Grey &grey) {
flat_set<NFAEdge> bad_edges = poisonEdges(h, nullptr, vg, ee, false, grey);
set<NFAVertex> bad_vertices;
for (const NFAEdge &e : bad_edges) {
bad_vertices.insert(target(e, h));
DEBUG_PRINTF("bad: %zu->%zu\n", h[source(e, h)].index,
h[target(e, h)].index);
}
return bad_vertices;
}
static
unique_ptr<VertLitInfo> findBestNormalSplit(const NGHolder &g,
const RoseInGraph &vg,
const vector<RoseInEdge> &ee,
const CompileContext &cc) {
assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
nullptr, &bad_vertices, false, cc);
}
static
unique_ptr<VertLitInfo> findBestLastChanceSplit(const NGHolder &g,
const RoseInGraph &vg,
const vector<RoseInEdge> &ee,
const CompileContext &cc) {
assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
nullptr, &bad_vertices, true, cc);
}
static
unique_ptr<VertLitInfo> findSimplePrefixSplit(const NGHolder &g,
const CompileContext &cc) {
DEBUG_PRINTF("looking for simple prefix split\n");
bool anchored = !proper_out_degree(g.startDs, g);
NFAVertex u = anchored ? g.start : g.startDs;
if (out_degree(u, g) != 2) { /* startDs + succ */
return nullptr;
}
NFAVertex v = NGHolder::null_vertex();
for (NFAVertex t : adjacent_vertices_range(u, g)) {
if (t != g.startDs) {
assert(!v);
v = t;
}
}
assert(v);
if (!anchored) {
if (out_degree(g.start, g) > 2) {
return nullptr;
}
if (out_degree(g.start, g) == 2 && !edge(g.start, v, g).second) {
return nullptr;
}
}
NFAVertex best_v = NGHolder::null_vertex();
ue2_literal best_lit;
u32 limit = cc.grey.maxHistoryAvailable;
if (anchored) {
LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion);
}
ue2_literal curr_lit;
for (u32 i = 0; i < limit; i++) {
const auto &v_cr = g[v].char_reach;
if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar());
} else {
curr_lit.clear();
}
if (curr_lit.length() > best_lit.length()) {
best_lit = curr_lit;
best_v = v;
}
if (out_degree(v, g) != 1) {
break;
}
v = *adjacent_vertices(v, g).first;
}
if (best_lit.length() < cc.grey.minRoseLiteralLength) {
return nullptr;
}
set<ue2_literal> best_lit_set({best_lit});
if (bad_mixed_sensitivity(best_lit)) {
sanitizeAndCompressAndScore(best_lit_set);
}
return std::make_unique<VertLitInfo>(best_v, best_lit_set, anchored, true);
}
static
unique_ptr<VertLitInfo> findBestPrefixSplit(const NGHolder &g,
const vector<NFAVertexDepth> &depths,
const RoseInGraph &vg,
const vector<RoseInEdge> &ee,
bool last_chance,
const CompileContext &cc) {
assert(g.kind == NFA_PREFIX || g.kind == NFA_OUTFIX);
set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength,
nullptr, &bad_vertices, last_chance, cc);
/* large back edges may prevent us identifying anchored or transient cases
* properly - use a simple walk instead */
if (!rv || !(rv->creates_transient || rv->creates_anchored)) {
auto rv2 = findSimplePrefixSplit(g, cc);
if (rv2) {
return rv2;
}
}
return rv;
}
static
unique_ptr<VertLitInfo> findBestCleanSplit(const NGHolder &g,
const CompileContext &cc) {
assert(g.kind != NFA_PREFIX);
set<NFAVertex> cleanSplits;
for (NFAVertex v : vertices_range(g)) {
if (!g[v].char_reach.all() || !edge(v, v, g).second) {
continue;
}
insert(&cleanSplits, inv_adjacent_vertices(v, g));
cleanSplits.erase(v);
}
cleanSplits.erase(g.start);
if (cleanSplits.empty()) {
return nullptr;
}
return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen,
&cleanSplits, nullptr, false, cc);
}
static
bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) {
set<NFAVertex> curr, next;
curr.insert(g.accept);
for (auto it = lit.rbegin(); it != lit.rend(); ++it) {
next.clear();
for (auto v : curr) {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == g.start) {
if (overhang_ok) {
DEBUG_PRINTF("bail\n");
return true;
} else {
continue; /* it is not possible for a lhs literal to
* overhang the start */
}
}
const CharReach &cr = g[u].char_reach;
if (!overlaps(*it, cr)) {
continue;
}
next.insert(u);
}
}
curr.swap(next);
}
return !curr.empty();
}
static
bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg,
const vector<RoseInEdge> &ee, const VertLitInfo &split) {
const vector<NFAVertex> &splitters = split.vv;
assert(!splitters.empty());
shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
shared_ptr<NGHolder> rhs = make_shared<NGHolder>();
if (!lhs || !rhs) {
assert(0);
throw std::bad_alloc();
}
unordered_map<NFAVertex, NFAVertex> lhs_map;
unordered_map<NFAVertex, NFAVertex> rhs_map;
splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map);
DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n",
to_string(base_graph.kind).c_str(), num_vertices(base_graph),
to_string(lhs->kind).c_str(), num_vertices(*lhs),
to_string(rhs->kind).c_str(), num_vertices(*rhs));
bool suffix = generates_callbacks(base_graph);
if (is_triggered(base_graph)) {
/* if we are already guarded, check if the split reduces the size of
* the problem before continuing with the split */
if (num_vertices(*lhs) >= num_vertices(base_graph)
&& !(suffix && isVacuous(*rhs))) {
DEBUG_PRINTF("split's lhs is no smaller\n");
return false;
}
if (num_vertices(*rhs) >= num_vertices(base_graph)) {
DEBUG_PRINTF("split's rhs is no smaller\n");
return false;
}
}
bool do_accept = false;
bool do_accept_eod = false;
assert(rhs);
if (isVacuous(*rhs) && suffix) {
if (edge(rhs->start, rhs->accept, *rhs).second) {
DEBUG_PRINTF("rhs has a cliche\n");
do_accept = true;
remove_edge(rhs->start, rhs->accept, *rhs);
}
if (edge(rhs->start, rhs->acceptEod, *rhs).second) {
DEBUG_PRINTF("rhs has an eod cliche\n");
do_accept_eod = true;
remove_edge(rhs->start, rhs->acceptEod, *rhs);
}
renumber_edges(*rhs);
}
/* check if we still have a useful graph left over */
bool do_norm = out_degree(rhs->start, *rhs) != 1;
set<ReportID> splitter_reports;
for (auto v : splitters) {
insert(&splitter_reports, base_graph[v].reports);
}
/* find the targets of each source vertex; insertion_ordered_map used to
* preserve deterministic ordering */
insertion_ordered_map<RoseInVertex, vector<RoseInVertex>> images;
for (const RoseInEdge &e : ee) {
RoseInVertex src = source(e, vg);
RoseInVertex dest = target(e, vg);
images[src].emplace_back(dest);
remove_edge(e, vg);
}
map<vector<RoseInVertex>, vector<RoseInVertex>> verts_by_image;
for (const auto &m : images) {
const auto &u = m.first;
const auto &image = m.second;
if (contains(verts_by_image, image)) {
for (RoseInVertex v : verts_by_image[image]) {
add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
}
continue;
}
for (const auto &lit : split.lit) {
assert(!bad_mixed_sensitivity(lit));
/* don't allow overhang in can_match() as literals should
* correspond to the edge graph being split; overhanging the graph
* would indicate a false path.*/
if (!can_match(*lhs, lit, false)) {
DEBUG_PRINTF("'%s' did not match lhs\n",
escapeString(lit).c_str());
continue;
}
DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
/* work out delay later */
if (do_accept) {
DEBUG_PRINTF("rhs has a cliche\n");
auto tt = add_vertex(RoseInVertexProps::makeAccept(
splitter_reports), vg);
add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
}
if (do_accept_eod) {
DEBUG_PRINTF("rhs has an eod cliche\n");
auto tt = add_vertex(RoseInVertexProps::makeAcceptEod(
splitter_reports), vg);
add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
}
if (do_norm) {
assert(out_degree(rhs->start, *rhs) > 1);
for (RoseInVertex dest : image) {
add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg);
}
}
verts_by_image[image].emplace_back(v);
}
}
assert(hasCorrectlyNumberedVertices(*rhs));
assert(hasCorrectlyNumberedEdges(*rhs));
assert(isCorrectlyTopped(*rhs));
assert(hasCorrectlyNumberedVertices(*lhs));
assert(hasCorrectlyNumberedEdges(*lhs));
assert(isCorrectlyTopped(*lhs));
return true;
}
#define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */
#define MAX_LEN_2_LITERALS_PER_CUT 3
static
bool checkValidNetflowLits(const NGHolder &h, const vector<u64a> &scores,
const map<NFAEdge, set<ue2_literal>> &cut_lits,
u32 min_allowed_length) {
DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(),
min_allowed_length);
if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) {
return false;
}
u32 len_2_count = 0;
for (const auto &cut : cut_lits) {
if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) {
DEBUG_PRINTF("cut uses a forbidden edge\n");
return false;
}
if (min_len(cut.second) < min_allowed_length) {
DEBUG_PRINTF("cut uses a bad literal\n");
return false;
}
for (const auto &lit : cut.second) {
if (lit.length() == 2) {
len_2_count++;
}
}
}
if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) {
return false;
}
return true;
}
static
void splitEdgesByCut(const NGHolder &h, RoseInGraph &vg,
const vector<RoseInEdge> &to_cut,
const vector<NFAEdge> &cut,
const map<NFAEdge, set<ue2_literal>> &cut_lits) {
DEBUG_PRINTF("splitting %s (%zu vertices)\n", to_string(h.kind).c_str(),
num_vertices(h));
/* create literal vertices and connect preds */
unordered_set<RoseInVertex> done_sources;
map<RoseInVertex, vector<pair<RoseInVertex, NFAVertex>>> verts_by_source;
for (const RoseInEdge &ve : to_cut) {
assert(&h == &*vg[ve].graph);
RoseInVertex src = source(ve, vg);
if (!done_sources.insert(src).second) {
continue; /* already processed */
}
/* iterate over cut for determinism */
for (const auto &e : cut) {
NFAVertex prev_v = source(e, h);
NFAVertex pivot = target(e, h);
DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index);
unordered_map<NFAVertex, NFAVertex> temp_map;
shared_ptr<NGHolder> new_lhs = make_shared<NGHolder>();
if (!new_lhs) {
assert(0);
throw std::bad_alloc();
}
splitLHS(h, pivot, new_lhs.get(), &temp_map);
/* want to cut off paths to pivot from things other than the pivot -
* makes a more svelte graphy */
clear_in_edges(temp_map[pivot], *new_lhs);
NFAEdge pivot_edge = add_edge(temp_map[prev_v], temp_map[pivot],
*new_lhs);
if (is_triggered(h) && prev_v == h.start) {
(*new_lhs)[pivot_edge].tops.insert(DEFAULT_TOP);
}
pruneUseless(*new_lhs, false);
renumber_vertices(*new_lhs);
renumber_edges(*new_lhs);
DEBUG_PRINTF(" into lhs %s (%zu vertices)\n",
to_string(new_lhs->kind).c_str(),
num_vertices(*new_lhs));
assert(hasCorrectlyNumberedVertices(*new_lhs));
assert(hasCorrectlyNumberedEdges(*new_lhs));
assert(isCorrectlyTopped(*new_lhs));
const set<ue2_literal> &lits = cut_lits.at(e);
for (const auto &lit : lits) {
if (!can_match(*new_lhs, lit, is_triggered(h))) {
continue;
}
RoseInVertex v
= add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
/* if this is a prefix/infix an edge directly to accept should
* represent a false path as we have poisoned vertices covered
* by the literals. */
if (generates_callbacks(h)) {
if (edge(pivot, h.accept, h).second) {
DEBUG_PRINTF("adding acceptEod\n");
/* literal has a direct connection to accept */
const flat_set<ReportID> &reports = h[pivot].reports;
auto tt = add_vertex(
RoseInVertexProps::makeAccept(reports), vg);
add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
}
if (edge(pivot, h.acceptEod, h).second) {
assert(generates_callbacks(h));
DEBUG_PRINTF("adding acceptEod\n");
/* literal has a direct connection to accept */
const flat_set<ReportID> &reports = h[pivot].reports;
auto tt = add_vertex(
RoseInVertexProps::makeAcceptEod(reports), vg);
add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
}
}
add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg);
verts_by_source[src].push_back({v, pivot});
}
}
}
/* wire the literal vertices up to successors */
map<vector<NFAVertex>, shared_ptr<NGHolder> > done_rhs;
for (const RoseInEdge &ve : to_cut) {
RoseInVertex src = source(ve, vg);
RoseInVertex dest = target(ve, vg);
/* iterate over cut for determinism */
for (const auto &elem : verts_by_source[src]) {
NFAVertex pivot = elem.second;
RoseInVertex v = elem.first;
vector<NFAVertex> adj;
insert(&adj, adj.end(), adjacent_vertices(pivot, h));
/* we can ignore presence of accept, accepteod in adj as it is best
effort */
if (!contains(done_rhs, adj)) {
unordered_map<NFAVertex, NFAVertex> temp_map;
shared_ptr<NGHolder> new_rhs = make_shared<NGHolder>();
if (!new_rhs) {
assert(0);
throw std::bad_alloc();
}
splitRHS(h, adj, new_rhs.get(), &temp_map);
remove_edge(new_rhs->start, new_rhs->accept, *new_rhs);
remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs);
renumber_edges(*new_rhs);
DEBUG_PRINTF(" into rhs %s (%zu vertices)\n",
to_string(new_rhs->kind).c_str(),
num_vertices(*new_rhs));
done_rhs.emplace(adj, new_rhs);
assert(isCorrectlyTopped(*new_rhs));
}
assert(done_rhs[adj].get());
shared_ptr<NGHolder> new_rhs = done_rhs[adj];
assert(hasCorrectlyNumberedVertices(*new_rhs));
assert(hasCorrectlyNumberedEdges(*new_rhs));
assert(isCorrectlyTopped(*new_rhs));
if (vg[dest].type == RIV_LITERAL
&& !can_match(*new_rhs, vg[dest].s, true)) {
continue;
}
if (out_degree(new_rhs->start, *new_rhs) != 1) {
add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg);
}
}
remove_edge(ve, vg);
}
}
static
bool doNetflowCut(NGHolder &h,
const vector<NFAVertexDepth> *depths,
RoseInGraph &vg,
const vector<RoseInEdge> &ee, bool for_prefix,
const Grey &grey, u32 min_allowed_length = 0U) {
ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength);
DEBUG_PRINTF("doing netflow cut\n");
/* TODO: we should really get literals/scores from the full graph as this
* allows us to overlap with previous cuts. */
assert(!ee.empty());
assert(&h == &*vg[ee.front()].graph);
assert(!for_prefix || depths);
if (num_edges(h) > grey.maxRoseNetflowEdges) {
/* We have a limit on this because scoring edges and running netflow
* gets very slow for big graphs. */
DEBUG_PRINTF("too many edges, skipping netflow cut\n");
return false;
}
assert(hasCorrectlyNumberedVertices(h));
assert(hasCorrectlyNumberedEdges(h));
auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey);
/* Step 1: Get scores for all edges */
vector<u64a> scores = scoreEdges(h, known_bad); /* scores by edge_index */
/* Step 2: Find cutset based on scores */
vector<NFAEdge> cut = findMinCut(h, scores);
/* Step 3: Get literals corresponding to cut edges */
map<NFAEdge, set<ue2_literal>> cut_lits;
for (const auto &e : cut) {
set<ue2_literal> lits = getLiteralSet(h, e);
sanitizeAndCompressAndScore(lits);
cut_lits[e] = lits;
}
/* if literals are underlength bail or if it involves a forbidden edge*/
if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) {
return false;
}
DEBUG_PRINTF("splitting\n");
/* Step 4: Split graph based on cuts */
splitEdgesByCut(h, vg, ee, cut, cut_lits);
return true;
}
static
bool deanchorIfNeeded(NGHolder &g) {
DEBUG_PRINTF("hi\n");
if (proper_out_degree(g.startDs, g)) {
return false;
}
/* look for a non-special dot with a loop following start */
set<NFAVertex> succ_g;
insert(&succ_g, adjacent_vertices(g.start, g));
succ_g.erase(g.startDs);
for (auto v : adjacent_vertices_range(g.start, g)) {
DEBUG_PRINTF("inspecting cand %zu || = %zu\n", g[v].index,
g[v].char_reach.count());
if (v == g.startDs || !g[v].char_reach.all()) {
continue;
}
set<NFAVertex> succ_v;
insert(&succ_v, adjacent_vertices(v, g));
if (succ_v == succ_g) {
DEBUG_PRINTF("found ^.*\n");
for (auto succ : adjacent_vertices_range(g.start, g)) {
if (succ == g.startDs) {
continue;
}
add_edge(g.startDs, succ, g);
}
clear_vertex(v, g);
remove_vertex(v, g);
renumber_vertices(g);
return true;
}
if (succ_g.size() == 1 && hasSelfLoop(v, g)) {
DEBUG_PRINTF("found ^.+\n");
add_edge(g.startDs, v, g);
remove_edge(v, v, g);
return true;
}
}
return false;
}
static
RoseInGraph populateTrivialGraph(const NGHolder &h) {
RoseInGraph g;
shared_ptr<NGHolder> root_g = cloneHolder(h);
bool orig_anch = isAnchored(*root_g);
orig_anch |= deanchorIfNeeded(*root_g);
DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch);
auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g);
auto accept = add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), g);
add_edge(start, accept, RoseInEdgeProps(root_g, 0), g);
return g;
}
static
void avoidOutfixes(RoseInGraph &vg, bool last_chance,
const CompileContext &cc) {
STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n");
assert(num_vertices(vg) == 2);
assert(num_edges(vg) == 1);
RoseInEdge e = *edges(vg).first;
NGHolder &h = *vg[e].graph;
assert(isCorrectlyTopped(h));
renumber_vertices(h);
renumber_edges(h);
unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, {e}, cc);
if (split && splitRoseEdge(h, vg, {e}, *split)) {
DEBUG_PRINTF("split on simple literal\n");
return;
}
if (last_chance) {
/* look for a prefix split as it allows us to accept very weak anchored
* literals. */
auto depths = calcDepths(h);
split = findBestPrefixSplit(h, depths, vg, {e}, last_chance, cc);
if (split && splitRoseEdge(h, vg, {e}, *split)) {
DEBUG_PRINTF("split on simple literal\n");
return;
}
}
doNetflowCut(h, nullptr, vg, {e}, false, cc.grey);
}
static
void removeRedundantPrefixes(RoseInGraph &g) {
STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n");
for (const RoseInEdge &e : edges_range(g)) {
RoseInVertex s = source(e, g);
RoseInVertex t = target(e, g);
if (g[s].type != RIV_START || g[t].type != RIV_LITERAL) {
continue;
}
if (!g[e].graph) {
continue;
}
assert(!g[t].delay);
const ue2_literal &lit = g[t].s;
if (!literalIsWholeGraph(*g[e].graph, lit)) {
DEBUG_PRINTF("not whole graph\n");
continue;
}
if (!isFloating(*g[e].graph)) {
DEBUG_PRINTF("not floating\n");
continue;
}
g[e].graph.reset();
}
}
static
u32 maxDelay(const CompileContext &cc) {
if (!cc.streaming) {
return MO_INVALID_IDX;
}
return cc.grey.maxHistoryAvailable;
}
static
void removeRedundantLiteralsFromPrefixes(RoseInGraph &g,
const CompileContext &cc) {
STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n");
vector<RoseInEdge> to_anchor;
for (const RoseInEdge &e : edges_range(g)) {
RoseInVertex s = source(e, g);
RoseInVertex t = target(e, g);
if (g[s].type != RIV_START && g[s].type != RIV_ANCHORED_START) {
continue;
}
if (g[t].type != RIV_LITERAL) {
continue;
}
if (!g[e].graph) {
continue;
}
if (g[e].graph_lag) {
/* already removed redundant parts of literals */
continue;
}
if (g[e].dfa) {
/* if we removed any more states, we would need to rebuild the
* the dfa which can be time consuming. */
continue;
}
assert(!g[t].delay);
const ue2_literal &lit = g[t].s;
DEBUG_PRINTF("removing states for literal: %s\n",
dumpString(lit).c_str());
unique_ptr<NGHolder> h = cloneHolder(*g[e].graph);
const u32 max_delay = maxDelay(cc);
u32 delay = removeTrailingLiteralStates(*h, lit, max_delay,
false /* can't overhang start */);
DEBUG_PRINTF("got delay %u (max allowed %u)\n", delay, max_delay);
if (edge(h->startDs, h->accept, *h).second) {
/* we should have delay == lit.length(), but in really complex
* cases we may fail to identify that we can remove the whole
* graph. Regardless, the fact that sds is wired to accept means the
* graph serves no purpose. */
DEBUG_PRINTF("whole graph\n");
g[e].graph.reset();
continue;
}
if (delay == lit.length() && edge(h->start, h->accept, *h).second
&& num_vertices(*h) == N_SPECIALS) {
to_anchor.emplace_back(e);
continue;
}
/* if we got here we should still have an interesting graph */
assert(delay == max_delay || num_vertices(*h) > N_SPECIALS);
if (delay && delay != MO_INVALID_IDX) {
DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get());
g[e].graph = std::move(h);
g[e].graph_lag = delay;
}
}
if (!to_anchor.empty()) {
RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g);
for (RoseInEdge e : to_anchor) {
DEBUG_PRINTF("rehoming to anchor\n");
RoseInVertex v = target(e, g);
add_edge(anch, v, g);
remove_edge(e, g);
}
}
}
static
bool isStarCliche(const NGHolder &g) {
DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g));
bool nonspecials_seen = false;
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
if (nonspecials_seen) {
return false;
}
nonspecials_seen = true;
if (!g[v].char_reach.all()) {
return false;
}
if (!hasSelfLoop(v, g)) {
return false;
}
if (!edge(v, g.accept, g).second) {
return false;
}
}
if (!nonspecials_seen) {
return false;
}
if (!edge(g.start, g.accept, g).second) {
return false;
}
return true;
}
static
void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig,
const vector<RoseInEdge> &ee,
const CompileContext &cc) {
/* TODO: This could be better by not creating a separate graph for each
* successor literal. This would require using distinct report ids and also
* taking into account overlap of successor literals. */
set<ue2_literal> preds;
set<ue2_literal> succs;
for (const RoseInEdge &e : ee) {
RoseInVertex u = source(e, ig);
assert(ig[u].type == RIV_LITERAL);
assert(!ig[u].delay);
preds.insert(ig[u].s);
RoseInVertex v = target(e, ig);
assert(ig[v].type == RIV_LITERAL);
assert(!ig[v].delay);
succs.insert(ig[v].s);
if (ig[e].graph_lag) {
/* already removed redundant parts of literals */
return;
}
assert(!ig[e].dfa);
}
map<ue2_literal, pair<shared_ptr<NGHolder>, u32> > graphs; /* + delay */
for (const ue2_literal &right : succs) {
size_t max_overlap = 0;
for (const ue2_literal &left : preds) {
size_t overlap = maxOverlap(left, right, 0);
ENSURE_AT_LEAST(&max_overlap, overlap);
}
u32 max_allowed_delay = right.length() - max_overlap;
if (cc.streaming) {
LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable);
}
if (!max_allowed_delay) {
continue;
}
shared_ptr<NGHolder> h_new = cloneHolder(h);
u32 delay = removeTrailingLiteralStates(*h_new, right,
max_allowed_delay);
if (delay == MO_INVALID_IDX) {
/* successor literal could not match infix -> ignore false path */
assert(0);
continue;
}
if (!delay) {
/* unable to trim graph --> no point swapping to new holder */
continue;
}
assert(isCorrectlyTopped(*h_new));
graphs[right] = make_pair(h_new, delay);
}
for (const RoseInEdge &e : ee) {
RoseInVertex v = target(e, ig);
const ue2_literal &succ = ig[v].s;
if (!contains(graphs, succ)) {
continue;
}
ig[e].graph = graphs[succ].first;
ig[e].graph_lag = graphs[succ].second;
if (isStarCliche(*ig[e].graph)) {
DEBUG_PRINTF("is a X star!\n");
ig[e].graph.reset();
ig[e].graph_lag = 0;
}
}
}
static
void removeRedundantLiteralsFromInfixes(RoseInGraph &g,
const CompileContext &cc) {
insertion_ordered_map<NGHolder *, vector<RoseInEdge>> infixes;
for (const RoseInEdge &e : edges_range(g)) {
RoseInVertex s = source(e, g);
RoseInVertex t = target(e, g);
if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) {
continue;
}
if (!g[e].graph) {
continue;
}
assert(!g[t].delay);
if (g[e].dfa) {
/* if we removed any more states, we would need to rebuild the
* the dfa which can be time consuming. */
continue;
}
NGHolder *h = g[e].graph.get();
infixes[h].emplace_back(e);
}
for (const auto &m : infixes) {
NGHolder *h = m.first;
const auto &edges = m.second;
removeRedundantLiteralsFromInfix(*h, g, edges, cc);
}
}
static
void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) {
removeRedundantLiteralsFromPrefixes(g, cc);
removeRedundantLiteralsFromInfixes(g, cc);
}
static
RoseInVertex getStart(const RoseInGraph &vg) {
for (RoseInVertex v : vertices_range(vg)) {
if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) {
return v;
}
}
assert(0);
return RoseInGraph::null_vertex();
}
/**
* Finds the initial accept vertex created to which suffix/outfixes are
* attached.
*/
static
RoseInVertex getPrimaryAccept(RoseInGraph &vg) {
for (RoseInVertex v : vertices_range(vg)) {
if (vg[v].type == RIV_ACCEPT && vg[v].reports.empty()) {
return v;
}
}
assert(0);
return RoseInGraph::null_vertex();
}
static
bool willBeTransient(const depth &max_depth, const CompileContext &cc) {
if (!cc.streaming) {
return max_depth <= depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH);
} else {
return max_depth <= depth(cc.grey.maxHistoryAvailable + 1);
}
}
static
bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) {
return max_depth <= depth(grey.maxAnchoredRegion);
}
static
unique_ptr<NGHolder> make_chain(u32 count) {
assert(count);
auto rv = std::make_unique<NGHolder>(NFA_INFIX);
NGHolder &h = *rv;
NFAVertex u = h.start;
for (u32 i = 0; i < count; i++) {
NFAVertex v = add_vertex(h);
h[v].char_reach = CharReach::dot();
add_edge(u, v, h);
u = v;
}
h[u].reports.insert(0);
add_edge(u, h.accept, h);
setTops(h);
return rv;
}
#define SHORT_TRIGGER_LEN 16
static
bool makeTransientFromLongLiteral(const NGHolder &h, RoseInGraph &vg,
const vector<RoseInEdge> &ee,
const CompileContext &cc) {
/* check max width and literal lengths to see if possible */
size_t min_lit = (size_t)~0ULL;
for (const RoseInEdge &e : ee) {
RoseInVertex v = target(e, vg);
LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length());
}
if (min_lit <= SHORT_TRIGGER_LEN || min_lit >= UINT_MAX) {
return false;
}
depth max_width = findMaxWidth(h);
u32 delta = min_lit - SHORT_TRIGGER_LEN;
if (!willBeTransient(max_width - depth(delta), cc)
&& !willBeAnchoredTable(max_width - depth(delta), cc.grey)) {
return false;
}
DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n", min_lit);
DEBUG_PRINTF("delta = %u\n", delta);
/* try split */
map<RoseInVertex, shared_ptr<NGHolder> > graphs;
for (const RoseInEdge &e : ee) {
RoseInVertex v = target(e, vg);
shared_ptr<NGHolder> h_new = cloneHolder(h);
u32 delay = removeTrailingLiteralStates(*h_new, vg[v].s, delta);
DEBUG_PRINTF("delay %u\n", delay);
if (delay != delta) {
DEBUG_PRINTF("unable to trim literal\n");
return false;
}
if (in_degree(v, vg) != 1) {
DEBUG_PRINTF("complicated\n");
return false;
}
DEBUG_PRINTF("new mw = %u\n", (u32)findMaxWidth(*h_new));
assert(willBeTransient(findMaxWidth(*h_new), cc)
|| willBeAnchoredTable(findMaxWidth(*h_new), cc.grey));
assert(isCorrectlyTopped(*h_new));
graphs[v] = h_new;
}
/* add .{repeats} from prefixes to long literals */
for (const RoseInEdge &e : ee) {
RoseInVertex s = source(e, vg);
RoseInVertex t = target(e, vg);
remove_edge(e, vg);
const ue2_literal &orig_lit = vg[t].s;
ue2_literal lit(orig_lit.begin(), orig_lit.end() - delta);
ue2_literal lit2(orig_lit.end() - delta, orig_lit.end());
assert(lit.length() + delta == orig_lit.length());
vg[t].s = lit2;
RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
add_edge(s, v, RoseInEdgeProps(graphs[t], 0), vg);
add_edge(v, t, RoseInEdgeProps(make_chain(delta), 0), vg);
}
DEBUG_PRINTF("success\n");
/* TODO: alter split point to avoid pathological splits */
return true;
}
static
void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit,
u32 delay, const vector<NFAVertex> &preds) {
assert(delay <= lit.length());
assert(isCorrectlyTopped(g));
DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay);
NFAVertex prev = g.accept;
auto it = lit.rbegin();
while (delay--) {
NFAVertex curr = add_vertex(g);
assert(it != lit.rend());
g[curr].char_reach = *it;
add_edge(curr, prev, g);
++it;
prev = curr;
}
for (auto v : preds) {
NFAEdge e = add_edge_if_not_present(v, prev, g);
if (v == g.start && is_triggered(g)) {
g[e].tops.insert(DEFAULT_TOP);
}
}
// Every predecessor of accept must have a report.
set_report(g, 0);
renumber_vertices(g);
renumber_edges(g);
assert(allMatchStatesHaveReports(g));
assert(isCorrectlyTopped(g));
}
static
void restoreTrailingLiteralStates(NGHolder &g,
const vector<pair<ue2_literal, u32>> &lits) {
vector<NFAVertex> preds;
insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g));
clear_in_edges(g.accept, g);
for (auto v : preds) {
g[v].reports.clear(); /* clear report from old accepts */
}
for (const auto &p : lits) {
const ue2_literal &lit = p.first;
u32 delay = p.second;
restoreTrailingLiteralStates(g, lit, delay, preds);
}
}
static
bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
const CompileContext &cc) {
DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h,
num_vertices(h));
assert(isCorrectlyTopped(h));
renumber_vertices(h);
renumber_edges(h);
auto depths = calcDepths(h);
/* If the reason the prefix is not transient is due to a very long literal
* following, we can make it transient by restricting ourselves to using
* just the head of the literal. */
if (makeTransientFromLongLiteral(h, vg, ee, cc)) {
return true;
}
auto split = findBestPrefixSplit(h, depths, vg, ee, false, cc);
if (split && (split->creates_transient || split->creates_anchored)
&& splitRoseEdge(h, vg, ee, *split)) {
DEBUG_PRINTF("split on simple literal\n");
return true;
}
/* large back edges may prevent us identifing anchored or transient cases
* properly - use a simple walk instead */
if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) {
return true;
}
if (split && splitRoseEdge(h, vg, ee, *split)) {
/* use the simple split even though it doesn't create a transient
* prefix */
DEBUG_PRINTF("split on simple literal\n");
return true;
}
/* look for netflow cuts which don't produce good prefixes */
if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) {
return true;
}
if (ee.size() > 1) {
DEBUG_PRINTF("split the prefix apart based on succ literals\n");
unordered_map<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> >,
NGHolderHasher, NGHolderEqual> trimmed;
for (const auto &e : ee) {
shared_ptr<NGHolder> hh = cloneHolder(h);
auto succ_lit = vg[target(e, vg)].s;
assert(isCorrectlyTopped(*hh));
u32 delay = removeTrailingLiteralStates(*hh, succ_lit,
succ_lit.length(),
false /* can't overhang start */);
if (!delay) {
DEBUG_PRINTF("could not remove any literal, skip over\n");
continue;
}
assert(isCorrectlyTopped(*hh));
trimmed[hh].emplace_back(e, delay);
}
if (trimmed.size() == 1) {
return false;
}
/* shift the contents to a vector so we can modify the graphs without
* violating the map's invariants. */
vector<pair<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> > > >
trimmed_vec(trimmed.begin(), trimmed.end());
trimmed.clear();
for (auto &elem : trimmed_vec) {
shared_ptr<NGHolder> &hp = elem.first;
vector<pair<ue2_literal, u32>> succ_lits;
for (const auto &edge_delay : elem.second) {
const RoseInEdge &e = edge_delay.first;
u32 delay = edge_delay.second;
auto lit = vg[target(e, vg)].s;
vg[e].graph = hp;
assert(delay <= lit.length());
succ_lits.emplace_back(lit, delay);
}
restoreTrailingLiteralStates(*hp, succ_lits);
}
return true;
}
return false;
}
#define MAX_FIND_BETTER_PREFIX_GEN 4
#define MAX_FIND_BETTER_PREFIX_COUNT 100
static
void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) {
STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n");
RoseInVertex start = getStart(vg);
insertion_ordered_map<NGHolder *, vector<RoseInEdge>> prefixes;
bool changed;
u32 gen = 0;
do {
DEBUG_PRINTF("gen %u\n", gen);
changed = false;
prefixes.clear();
/* find prefixes */
for (const RoseInEdge &e : out_edges_range(start, vg)) {
/* outfixes shouldn't have made it this far */
assert(vg[target(e, vg)].type == RIV_LITERAL);
if (vg[e].graph) {
NGHolder *h = vg[e].graph.get();
prefixes[h].emplace_back(e);
}
}
if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) {
break;
}
/* look for bad prefixes and try to split */
for (const auto &m : prefixes) {
NGHolder *h = m.first;
const auto &edges = m.second;
depth max_width = findMaxWidth(*h);
if (willBeTransient(max_width, cc)
|| willBeAnchoredTable(max_width, cc.grey)) {
continue;
}
changed = improvePrefix(*h, vg, edges, cc);
}
} while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN);
}
#define STRONG_LITERAL_LENGTH 20
#define MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10
static
bool extractStrongLiteral(const NGHolder &h, RoseInGraph &vg,
const vector<RoseInEdge> &ee,
const CompileContext &cc) {
DEBUG_PRINTF("looking for string literal\n");
unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
if (split && min_len(split->lit) >= STRONG_LITERAL_LENGTH) {
DEBUG_PRINTF("splitting simple literal\n");
return splitRoseEdge(h, vg, ee, *split);
}
return false;
}
static
void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) {
if (!cc.grey.violetExtractStrongLiterals) {
return;
}
STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n");
unordered_set<NGHolder *> stuck;
insertion_ordered_map<NGHolder *, vector<RoseInEdge>> edges_by_graph;
bool changed;
do {
changed = false;
edges_by_graph.clear();
for (const RoseInEdge &ve : edges_range(vg)) {
if (vg[source(ve, vg)].type != RIV_LITERAL) {
continue;
}
if (vg[ve].graph) {
NGHolder *h = vg[ve].graph.get();
edges_by_graph[h].emplace_back(ve);
}
}
if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) {
DEBUG_PRINTF("too many graphs, stopping\n");
return;
}
for (const auto &m : edges_by_graph) {
NGHolder *g = m.first;
const auto &edges = m.second;
if (contains(stuck, g)) {
DEBUG_PRINTF("already known to be bad\n");
continue;
}
bool rv = extractStrongLiteral(*g, vg, edges, cc);
if (rv) {
changed = true;
} else {
stuck.insert(g);
}
}
} while (changed);
}
#define INFIX_STRONG_GUARD_LEN 8
#define INFIX_MIN_SPLIT_LITERAL_LEN 12
static
bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
const CompileContext &cc) {
unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
if (split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN
&& splitRoseEdge(h, vg, ee, *split)) {
DEBUG_PRINTF("splitting simple literal\n");
return true;
}
DEBUG_PRINTF("trying for a netflow cut\n");
/* look for netflow cuts which don't produce good prefixes */
bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8);
DEBUG_PRINTF("did netfow cut? = %d\n", (int)rv);
return rv;
}
/**
* Infixes which are weakly guarded can, in effect, act like prefixes as they
* will often be live. We should try to split these infixes further if they
* contain strong literals so that we are at least running smaller weak infixes
* which can hopeful be accelerated/miracled.
*/
static
void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) {
if (!cc.grey.violetAvoidWeakInfixes) {
return;
}
STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n");
RoseInVertex start = getStart(vg);
unordered_set<NGHolder *> weak;
for (RoseInVertex vv : adjacent_vertices_range(start, vg)) {
/* outfixes shouldn't have made it this far */
assert(vg[vv].type == RIV_LITERAL);
if (vg[vv].s.length() >= INFIX_STRONG_GUARD_LEN) {
continue;
}
for (const RoseInEdge &e : out_edges_range(vv, vg)) {
if (vg[target(e, vg)].type != RIV_LITERAL || !vg[e].graph) {
continue;
}
NGHolder *h = vg[e].graph.get();
DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[vv].s).c_str(), h);
weak.insert(h);
}
}
insertion_ordered_map<NGHolder *, vector<RoseInEdge>> weak_edges;
for (const RoseInEdge &ve : edges_range(vg)) {
NGHolder *h = vg[ve].graph.get();
if (contains(weak, h)) {
weak_edges[h].emplace_back(ve);
}
}
for (const auto &m : weak_edges) {
NGHolder *h = m.first;
const auto &edges = m.second;
improveInfix(*h, vg, edges, cc);
}
}
static
void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg,
const vector<RoseInEdge> &ee, const VertLitInfo &split,
bool eod, const flat_set<ReportID> &reports) {
const vector<NFAVertex> &splitters = split.vv;
assert(!splitters.empty());
shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
if (!lhs) {
assert(0);
throw bad_alloc();
}
unordered_map<NFAVertex, NFAVertex> v_map;
cloneHolder(*lhs, base_graph, &v_map);
lhs->kind = NFA_INFIX;
clear_in_edges(lhs->accept, *lhs);
clear_in_edges(lhs->acceptEod, *lhs);
add_edge(lhs->accept, lhs->acceptEod, *lhs);
clearReports(*lhs);
for (NFAVertex v : splitters) {
NFAEdge e = add_edge(v_map[v], lhs->accept, *lhs);
if (v == base_graph.start) {
(*lhs)[e].tops.insert(DEFAULT_TOP);
}
(*lhs)[v_map[v]].reports.insert(0);
}
pruneUseless(*lhs);
assert(isCorrectlyTopped(*lhs));
/* create literal vertices and connect preds */
for (const auto &lit : split.lit) {
if (!can_match(*lhs, lit, is_triggered(*lhs))) {
continue;
}
DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
RoseInVertex tt;
if (eod) {
DEBUG_PRINTF("doing eod\n");
tt = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg);
} else {
DEBUG_PRINTF("doing non-eod\n");
tt = add_vertex(RoseInVertexProps::makeAccept(reports), vg);
}
add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
for (const RoseInEdge &e : ee) {
RoseInVertex u = source(e, vg);
assert(!edge(u, v, vg).second);
add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
}
}
}
#define MIN_SUFFIX_LEN 6
static
bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg,
const vector<RoseInEdge> &suffix_edges,
const CompileContext &cc) {
DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n", &h,
suffix_edges.size());
/*
* We would, in general, rather not have output exposed engines because
* once they are triggered, they must be run while infixes only have to run
* if the successor literal is seen. Matches from output exposed engines
* also have to be placed in a priority queue and interleaved with matches
* from other sources.
*
* Note:
* - if the LHS is extremely unlikely we may be better off leaving
* a suffix unguarded.
*
* - limited width suffixes may be less bad as they won't be continuously
* active, we may want to have (a) stronger controls on if we want to pick
* a trailing literal in these cases and/or (b) look also for literals
* near accept as well as right on accept
*
* TODO: improve heuristics, splitting logic.
*/
/* we may do multiple splits corresponding to different report behaviour */
set<NFAVertex> seen;
map<pair<bool, flat_set<ReportID> >, VertLitInfo> by_reports; /* eod, rep */
for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) {
set<ue2_literal> ss = getLiteralSet(h, v, false);
if (ss.empty()) {
DEBUG_PRINTF("candidate is too shitty\n");
return false;
}
VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)];
insert(&vli.lit, ss);
vli.vv.emplace_back(v);
seen.insert(v);
}
seen.insert(h.accept);
for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) {
if (contains(seen, v)) {
continue;
}
set<ue2_literal> ss = getLiteralSet(h, v, false);
if (ss.empty()) {
DEBUG_PRINTF("candidate is too shitty\n");
return false;
}
VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)];
insert(&vli.lit, ss);
vli.vv.emplace_back(v);
}
assert(!by_reports.empty());
/* TODO: how strong a min len do we want here ? */
u32 min_len = cc.grey.minRoseLiteralLength;
ENSURE_AT_LEAST(&min_len, MIN_SUFFIX_LEN);
for (auto &vli : by_reports | map_values) {
u64a score = sanitizeAndCompressAndScore(vli.lit);
if (vli.lit.empty()
|| !validateRoseLiteralSetQuality(vli.lit, score, false, min_len,
false, false)) {
return false;
}
}
for (const auto &info : by_reports) {
DEBUG_PRINTF("splitting on simple literals\n");
splitEdgesForSuffix(h, vg, suffix_edges, info.second,
info.first.first /* eod */,
info.first.second /* reports */);
}
for (const RoseInEdge &e : suffix_edges) {
remove_edge(e, vg);
}
return true;
}
static
void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) {
if (!cc.grey.violetAvoidSuffixes) {
return;
}
STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n");
RoseInVertex accept = getPrimaryAccept(vg);
insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> suffixes;
/* find suffixes */
for (const RoseInEdge &e : in_edges_range(accept, vg)) {
/* outfixes shouldn't have made it this far */
assert(vg[source(e, vg)].type == RIV_LITERAL);
assert(vg[e].graph); /* non suffix paths should be wired to other
accepts */
const NGHolder *h = vg[e].graph.get();
suffixes[h].emplace_back(e);
}
/* look at suffixes and try to split */
for (const auto &m : suffixes) {
const NGHolder *h = m.first;
const auto &edges = m.second;
replaceSuffixWithInfix(*h, vg, edges, cc);
}
}
static
bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) {
if (out_degree(h.start, h) != 3) {
return false;
}
NFAVertex v = NGHolder::null_vertex();
NFAVertex ds = NGHolder::null_vertex();
for (NFAVertex a : adjacent_vertices_range(h.start, h)) {
if (a == h.startDs) {
continue;
}
if (h[a].char_reach.all()) {
ds = a;
if (out_degree(ds, h) != 2 || !edge(ds, ds, h).second) {
return false;
}
} else {
v = a;
}
}
if (!v || !ds || !edge(ds, v, h).second) {
return false;
}
if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) {
return false;
}
ue2_literal lit;
lit.push_back(h[v].char_reach.find_first(),
h[v].char_reach.isCaselessChar());
while (out_degree(v, h) == 1) {
NFAVertex vv = *adjacent_vertices(v, h).first;
if (h[vv].char_reach.count() != 1
&& !h[vv].char_reach.isCaselessChar()) {
break;
}
v = vv;
lit.push_back(h[v].char_reach.find_first(),
h[v].char_reach.isCaselessChar());
}
if (is_match_vertex(v, h) && h.kind != NFA_SUFFIX) {
/* we have rediscovered the post-infix literal */
return false;
}
if (bad_mixed_sensitivity(lit)) {
make_nocase(&lit);
}
DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str());
out->vv = {v};
out->lit = {lit};
return true;
}
static
bool lookForDoubleCut(const NGHolder &h, const vector<RoseInEdge> &ee,
RoseInGraph &vg, const Grey &grey) {
VertLitInfo info;
if (!leadingDotStartLiteral(h, &info)
|| min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
return false;
}
DEBUG_PRINTF("performing split\n");
return splitRoseEdge(h, vg, ee, {info});
}
static
void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) {
if (!cc.grey.violetDoubleCut) {
return;
}
insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges;
for (const RoseInEdge &ve : edges_range(vg)) {
if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) {
const NGHolder *h = vg[ve].graph.get();
right_edges[h].emplace_back(ve);
}
}
for (const auto &m : right_edges) {
const NGHolder *h = m.first;
const auto &edges = m.second;
lookForDoubleCut(*h, edges, vg, cc.grey);
}
}
static
pair<NFAVertex, ue2_literal> findLiteralBefore(const NGHolder &h, NFAVertex v) {
ue2_literal lit;
if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) {
return {v, std::move(lit) };
}
lit.push_back(h[v].char_reach.find_first(),
h[v].char_reach.isCaselessChar());
while (in_degree(v, h) == 1) {
NFAVertex vv = *inv_adjacent_vertices(v, h).first;
if (h[vv].char_reach.count() != 1
&& !h[vv].char_reach.isCaselessChar()) {
break;
}
lit.push_back(h[vv].char_reach.find_first(),
h[vv].char_reach.isCaselessChar());
v = vv;
}
return {v, std::move(lit) };
}
static
bool lookForDotStarPred(NFAVertex v, const NGHolder &h,
NFAVertex *u, NFAVertex *ds) {
*u = NGHolder::null_vertex();
*ds = NGHolder::null_vertex();
for (NFAVertex a : inv_adjacent_vertices_range(v, h)) {
if (h[a].char_reach.all()) {
if (!edge(a, a, h).second) {
return false;
}
if (*ds) {
return false;
}
*ds = a;
} else {
if (*u) {
return false;
}
*u = a;
}
}
if (!*u || !*ds) {
return false;
}
return true;
}
static
bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) {
/* Note: there is no delay yet - so the final literal is the already
* discovered successor literal - we are in fact interested in the literal
* before it. */
if (in_degree(h.accept, h) != 1) {
return false;
}
if (in_degree(h.acceptEod, h) != 1) {
assert(0);
return false;
}
NFAVertex v
= findLiteralBefore(h, *inv_adjacent_vertices(h.accept, h).first).first;
NFAVertex u;
NFAVertex ds;
if (!lookForDotStarPred(v, h, &u, &ds)) {
return false;
}
v = u;
auto rv = findLiteralBefore(h, v);
if (!lookForDotStarPred(v, h, &u, &ds)) {
return false;
}
ue2_literal lit = reverse_literal(rv.second);
DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str());
if (bad_mixed_sensitivity(lit)) {
make_nocase(&lit);
}
out->vv = {v};
out->lit = {lit};
return true;
}
static
bool lookForTrailingLiteralDotStar(const NGHolder &h,
const vector<RoseInEdge> &ee,
RoseInGraph &vg, const Grey &grey) {
VertLitInfo info;
if (!trailingDotStarLiteral(h, &info)
|| min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
return false;
}
DEBUG_PRINTF("performing split\n");
return splitRoseEdge(h, vg, ee, info);
}
/* In streaming mode, active engines have to be caught up at stream boundaries
* and have to be stored in stream state, so we prefer to decompose patterns
* in to literals with no state between them if possible. */
static
void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) {
if (!cc.grey.violetLiteralChains) {
return;
}
insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges;
bool changed;
do {
changed = false;
right_edges.clear();
for (const RoseInEdge &ve : edges_range(vg)) {
if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) {
const NGHolder *h = vg[ve].graph.get();
right_edges[h].emplace_back(ve);
}
}
for (const auto &m : right_edges) {
const NGHolder *h = m.first;
const vector<RoseInEdge> &ee = m.second;
bool rv = lookForDoubleCut(*h, ee, vg, cc.grey);
if (!rv && h->kind != NFA_SUFFIX) {
rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey);
}
changed |= rv;
}
} while (changed);
}
static
bool lookForCleanSplit(const NGHolder &h, const vector<RoseInEdge> &ee,
RoseInGraph &vg, const CompileContext &cc) {
unique_ptr<VertLitInfo> split = findBestCleanSplit(h, cc);
if (split) {
return splitRoseEdge(h, vg, {ee}, *split);
}
return false;
}
#define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4
static
void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) {
u32 gen = 0;
insertion_ordered_set<RoseInVertex> prev({getStart(vg)});
insertion_ordered_set<RoseInVertex> curr;
while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) {
curr.clear();
for (RoseInVertex u : prev) {
for (auto v : adjacent_vertices_range(u, vg)) {
curr.insert(v);
}
}
insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> rightfixes;
for (RoseInVertex v : curr) {
for (const RoseInEdge &e : out_edges_range(v, vg)) {
if (vg[e].graph) {
NGHolder *h = vg[e].graph.get();
rightfixes[h].emplace_back(e);
}
}
}
for (const auto &m : rightfixes) {
const NGHolder *h = m.first;
const auto &edges = m.second;
lookForCleanSplit(*h, edges, vg, cc);
}
prev = std::move(curr);
gen++;
}
}
static
void rehomeEodSuffixes(RoseInGraph &vg) {
// Find edges to accept with EOD-anchored graphs that we can move over to
// acceptEod.
vector<RoseInEdge> acc_edges;
for (const auto &e : edges_range(vg)) {
if (vg[target(e, vg)].type != RIV_ACCEPT) {
continue;
}
if (vg[e].haig || !vg[e].graph) {
continue;
}
const NGHolder &h = *vg[e].graph;
if (in_degree(h.accept, h)) {
DEBUG_PRINTF("graph isn't eod anchored\n");
continue;
}
acc_edges.emplace_back(e);
}
for (const RoseInEdge &e : acc_edges) {
// Move this edge from accept to acceptEod
RoseInVertex w = add_vertex(RoseInVertexProps::makeAcceptEod(), vg);
add_edge(source(e, vg), w, vg[e], vg);
remove_edge(e, vg);
}
/* old accept vertices will be tidied up by final pruneUseless() call */
}
static
bool tryForEarlyDfa(const NGHolder &h, const CompileContext &cc) {
switch (h.kind) {
case NFA_OUTFIX: /* 'prefix' of eod */
case NFA_PREFIX:
return cc.grey.earlyMcClellanPrefix;
case NFA_INFIX:
return cc.grey.earlyMcClellanInfix;
case NFA_SUFFIX:
return cc.grey.earlyMcClellanSuffix;
default:
DEBUG_PRINTF("kind %u\n", (u32)h.kind);
assert(0);
return false;
}
}
static
vector<vector<CharReach>> getDfaTriggers(const RoseInGraph &vg,
const vector<RoseInEdge> &edges,
bool *single_trigger) {
vector<vector<CharReach>> triggers;
u32 min_offset = ~0U;
u32 max_offset = 0;
for (const auto &e : edges) {
RoseInVertex s = source(e, vg);
if (vg[s].type == RIV_LITERAL) {
triggers.emplace_back(as_cr_seq(vg[s].s));
}
ENSURE_AT_LEAST(&max_offset, vg[s].max_offset);
LIMIT_TO_AT_MOST(&min_offset, vg[s].min_offset);
}
*single_trigger = min_offset == max_offset;
DEBUG_PRINTF("trigger offset (%u, %u)\n", min_offset, max_offset);
return triggers;
}
static
bool doEarlyDfa(RoseBuild &rose, RoseInGraph &vg, NGHolder &h,
const vector<RoseInEdge> &edges, bool final_chance,
const ReportManager &rm, const CompileContext &cc) {
DEBUG_PRINTF("trying for dfa\n");
bool single_trigger;
for (const auto &e : edges) {
if (vg[target(e, vg)].type == RIV_ACCEPT_EOD) {
/* TODO: support eod prefixes */
return false;
}
}
auto triggers = getDfaTriggers(vg, edges, &single_trigger);
/* TODO: literal delay things */
if (!generates_callbacks(h)) {
set_report(h, rose.getNewNfaReport());
}
shared_ptr<raw_dfa> dfa = buildMcClellan(h, &rm, single_trigger, triggers,
cc.grey, final_chance);
if (!dfa) {
return false;
}
DEBUG_PRINTF("dfa ok\n");
for (const auto &e : edges) {
vg[e].dfa = dfa;
}
return true;
}
#define MAX_EDGES_FOR_IMPLEMENTABILITY 50
static
bool splitForImplementability(RoseInGraph &vg, NGHolder &h,
const vector<RoseInEdge> &edges,
const CompileContext &cc) {
DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n",
to_string(h.kind).c_str(), num_vertices(h), edges.size());
if (edges.size() > MAX_EDGES_FOR_IMPLEMENTABILITY) {
return false;
}
if (!generates_callbacks(h)) {
vector<pair<ue2_literal, u32>> succ_lits;
for (const auto &e : edges) {
const auto &lit = vg[target(e, vg)].s;
u32 delay = vg[e].graph_lag;
vg[e].graph_lag = 0;
assert(delay <= lit.length());
succ_lits.emplace_back(lit, delay);
}
restoreTrailingLiteralStates(h, succ_lits);
}
unique_ptr<VertLitInfo> split;
if (h.kind == NFA_PREFIX) {
bool last_chance = true;
auto depths = calcDepths(h);
split = findBestPrefixSplit(h, depths, vg, edges, last_chance, cc);
} else {
split = findBestLastChanceSplit(h, vg, edges, cc);
}
if (split && splitRoseEdge(h, vg, edges, *split)) {
DEBUG_PRINTF("split on simple literal\n");
return true;
}
DEBUG_PRINTF("trying to netflow\n");
bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey);
DEBUG_PRINTF("done\n");
return rv;
}
#define MAX_IMPLEMENTABLE_SPLITS 50
bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes,
bool final_chance, const ReportManager &rm,
const CompileContext &cc) {
DEBUG_PRINTF("checking for impl %d\n", final_chance);
bool changed = false;
bool need_to_recalc = false;
u32 added_count = 0;
unordered_set<shared_ptr<NGHolder>> good; /* known to be implementable */
do {
changed = false;
DEBUG_PRINTF("added %u\n", added_count);
insertion_ordered_map<shared_ptr<NGHolder>,
vector<RoseInEdge>> edges_by_graph;
for (const RoseInEdge &ve : edges_range(vg)) {
if (vg[ve].graph && !vg[ve].dfa) {
const auto &h = vg[ve].graph;
edges_by_graph[h].emplace_back(ve);
}
}
for (auto &m : edges_by_graph) {
auto &h = m.first;
if (contains(good, h)) {
continue;
}
reduceGraphEquivalences(*h, cc);
if (isImplementableNFA(*h, &rm, cc)) {
good.insert(h);
continue;
}
const auto &edges = m.second;
if (tryForEarlyDfa(*h, cc) &&
doEarlyDfa(rose, vg, *h, edges, final_chance, rm, cc)) {
continue;
}
DEBUG_PRINTF("eek\n");
if (!allow_changes) {
return false;
}
if (splitForImplementability(vg, *h, edges, cc)) {
added_count++;
if (added_count > MAX_IMPLEMENTABLE_SPLITS) {
DEBUG_PRINTF("added_count hit limit\n");
return false;
}
changed = true;
continue;
}
return false;
}
assert(added_count <= MAX_IMPLEMENTABLE_SPLITS);
if (changed) {
removeRedundantLiterals(vg, cc);
pruneUseless(vg);
need_to_recalc = true;
}
} while (changed);
if (need_to_recalc) {
renumber_vertices(vg);
calcVertexOffsets(vg);
}
DEBUG_PRINTF("ok!\n");
return true;
}
static
RoseInGraph doInitialVioletTransform(const NGHolder &h, bool last_chance,
const CompileContext &cc) {
assert(!can_never_match(h));
RoseInGraph vg = populateTrivialGraph(h);
if (!cc.grey.allowViolet) {
return vg;
}
/* Avoid running the Violet analysis at all on graphs with no vertices with
* small reach, since we will not be able to extract any literals. */
if (!hasNarrowReachVertex(h)) {
DEBUG_PRINTF("fail, no vertices with small reach\n");
return vg;
}
DEBUG_PRINTF("hello world\n");
/* Step 1: avoid outfixes as we always have to run them. */
avoidOutfixes(vg, last_chance, cc);
if (num_vertices(vg) <= 2) {
return vg; /* unable to transform pattern */
}
removeRedundantPrefixes(vg);
dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot");
/* Step 2: avoid non-transient prefixes (esp in streaming mode) */
findBetterPrefixes(vg, cc);
dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot");
extractStrongLiterals(vg, cc);
dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot");
improveWeakInfixes(vg, cc);
dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot");
/* Step 3: avoid output exposed engines if there is a strong trailing
literal) */
avoidSuffixes(vg, cc);
/* Step 4: look for infixes/suffixes with leading .*literals
* This can reduce the amount of work a heavily picked literal has to do and
* reduce the amount of state used as .* is handled internally to rose. */
lookForDoubleCut(vg, cc);
if (cc.streaming) {
lookForCleanEarlySplits(vg, cc);
decomposeLiteralChains(vg, cc);
}
rehomeEodSuffixes(vg);
removeRedundantLiterals(vg, cc);
pruneUseless(vg);
dumpPreRoseGraph(vg, cc.grey);
renumber_vertices(vg);
calcVertexOffsets(vg);
return vg;
}
bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter,
bool last_chance, const ReportManager &rm,
const CompileContext &cc) {
auto vg = doInitialVioletTransform(h, last_chance, cc);
if (num_vertices(vg) <= 2) {
return false;
}
/* Step 5: avoid unimplementable, or overly large engines if possible */
if (!ensureImplementable(rose, vg, last_chance, last_chance, rm, cc)) {
return false;
}
dumpPreRoseGraph(vg, cc.grey, "post_ensure_rose.dot");
/* Step 6: send to rose */
bool rv = rose.addRose(vg, prefilter);
DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail");
return rv;
}
bool checkViolet(const ReportManager &rm, const NGHolder &h, bool prefilter,
const CompileContext &cc) {
auto vg = doInitialVioletTransform(h, true, cc);
if (num_vertices(vg) <= 2) {
return false;
}
bool rv = roseCheckRose(vg, prefilter, rm, cc);
DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail");
return rv;
}
}