vectorscan/src/nfagraph/ng_misc_opt.cpp
2015-10-20 09:13:35 +11:00

550 lines
17 KiB
C++

/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Miscellaneous optimisations.
*
* We sometimes see patterns of the form:
*
* /^.*<[^<]*foobaz/s
*
* This is bad for Rose as the escapes from the cyclic state are the same as
* the trigger. However, we can transform this into:
*
* /^.*<.*foobaz/s
*
* ... as the first dot star can eat all but the last '<'.
*
* Slightly more formally:
*
* Given a cyclic state v with character reachability v_cr and proper preds
* {p1 .. pn} with character reachability {p1_cr .. pn_cr}.
*
* let v_cr' = union(intersection(p1_cr .. pn_cr), v_cr)
*
* v_cr can be replaced with v_cr' without changing the behaviour of the system
* if:
*
* for any given proper pred pi: if pi is set in the nfa then after consuming
* any symbol in v_cr', pi will still be set in the nfa and every successor of
* v is a successor of pi.
*
* The easiest way for this condition to be satisfied is for every pred of
* each proper pred pi to have an edge to a pred of pi whose character
* reachability contains v_cr'. There are, however, other ways to establish
* that the condition holds.
*
* Note: a similar transformation can be applied in reverse, details left as an
* exercise for the interested reader. */
#include "ng_misc_opt.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "ue2common.h"
#include <map>
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
/** Collects candidate vertices for forward widening.
 *
 * Walks \p ordering back to front and appends to \p cand every vertex that is
 * not special, has a self-loop, does not already have full reachability, and
 * whose predecessors (the vertex itself included, via its self-loop) all have
 * exactly the same successor set as the vertex. */
static
void findCandidates(NGHolder &g, const vector<NFAVertex> &ordering,
                    vector<NFAVertex> *cand) {
    for (auto rit = ordering.rbegin(); rit != ordering.rend(); ++rit) {
        NFAVertex v = *rit;

        // Only cyclic, non-special vertices with room left in their reach
        // can benefit from being widened.
        const bool eligible = !is_special(v, g)
                              && hasSelfLoop(v, g)
                              && !g[v].char_reach.all();
        if (!eligible) {
            continue;
        }

        set<NFAVertex> own_succs;
        succ(g, v, &own_succs);

        // Every predecessor of `v' must lead to exactly the same vertices
        // as `v' does.
        bool preds_agree = true;
        for (auto u : inv_adjacent_vertices_range(v, g)) {
            set<NFAVertex> pred_succs;
            succ(g, u, &pred_succs);
            if (own_succs != pred_succs) {
                preds_agree = false;
                break;
            }
        }
        if (!preds_agree) {
            continue;
        }

        DEBUG_PRINTF("vertex %u is a candidate\n", g[v].index);
        cand->push_back(v);
    }
}
/** Collects candidate vertices for reverse widening.
 *
 * Walks \p ordering front to back and appends to \p cand every vertex that is
 * not special, has a self-loop, does not already have full reachability, and
 * whose successors (the vertex itself included, via its self-loop) all have
 * exactly the same predecessor set as the vertex. */
static
void findCandidates_rev(NGHolder &g, const vector<NFAVertex> &ordering,
                        vector<NFAVertex> *cand) {
    for (NFAVertex v : ordering) {
        // Only cyclic, non-special vertices with room left in their reach
        // can benefit from being widened.
        const bool eligible = !is_special(v, g)
                              && hasSelfLoop(v, g)
                              && !g[v].char_reach.all();
        if (!eligible) {
            continue;
        }

        set<NFAVertex> own_preds;
        pred(g, v, &own_preds);

        // For `v' to be a candidate, its successors must all have the same
        // predecessor set as `v'.
        bool succs_agree = true;
        for (auto u : adjacent_vertices_range(v, g)) {
            set<NFAVertex> succ_preds;
            pred(g, u, &succ_preds);
            if (own_preds != succ_preds) {
                succs_agree = false;
                break;
            }
        }
        if (!succs_agree) {
            continue;
        }

        DEBUG_PRINTF("vertex %u is a candidate\n", g[v].index);
        cand->push_back(v);
    }
}
/** Sets \p add to the intersection of the reachability of every predecessor
 * of \p v other than \p v itself. If there is no such predecessor, \p add is
 * left with every character set. */
static
void predCRIntersection(const NGHolder &g, NFAVertex v, CharReach &add) {
    add.setall();
    for (auto p : inv_adjacent_vertices_range(v, g)) {
        if (p == v) {
            continue; // the self-loop must not constrain the result
        }
        add &= g[p].char_reach;
    }
}
/** Sets \p add to the intersection of the reachability of every successor of
 * \p v other than \p v itself. If there is no such successor, \p add is left
 * with every character set. */
static
void succCRIntersection(const NGHolder &g, NFAVertex v, CharReach &add) {
    add.setall();
    for (auto s : adjacent_vertices_range(v, g)) {
        if (s == v) {
            continue; // the self-loop must not constrain the result
        }
        add &= g[s].char_reach;
    }
}
/** The sustain set is used to show that once vertex p is on it stays on given
* the alphabet new_cr. Every vertex pp in the sustain set has the following
* properties:
* -# an edge to p
* -# enough edges to vertices in the sustain set to ensure that a vertex in
* the sustain set will be on after consuming a character. */
static
set<NFAVertex> findSustainSet(const NGHolder &g, NFAVertex p,
bool ignore_starts, const CharReach &new_cr) {
set<NFAVertex> cand;
pred(g, p, &cand);
if (ignore_starts) {
cand.erase(g.startDs);
}
/* remove elements from cand until the sustain set property holds */
bool changed;
do {
DEBUG_PRINTF("|cand| %zu\n", cand.size());
changed = false;
set<NFAVertex>::const_iterator it = cand.begin();
while (it != cand.end()) {
NFAVertex u = *it;
++it;
CharReach sus_cr;
for (auto v : adjacent_vertices_range(u, g)) {
if (contains(cand, v)) {
sus_cr |= g[v].char_reach;
}
}
if (!new_cr.isSubsetOf(sus_cr)) {
cand.erase(u);
changed = true;
}
}
} while (changed);
/* Note: it may be possible to find a (larger) sustain set for a smaller
* new_cr */
return cand;
}
/** Finds the reverse version of the sustain set.. whatever that means. */
static
set<NFAVertex> findSustainSet_rev(const NGHolder &g, NFAVertex p,
const CharReach &new_cr) {
set<NFAVertex> cand;
succ(g, p, &cand);
/* remove elements from cand until the sustain set property holds */
bool changed;
do {
changed = false;
set<NFAVertex>::const_iterator it = cand.begin();
while (it != cand.end()) {
NFAVertex u = *it;
++it;
CharReach sus_cr;
for (auto v : inv_adjacent_vertices_range(u, g)) {
if (contains(cand, v)) {
sus_cr |= g[v].char_reach;
}
}
if (!new_cr.isSubsetOf(sus_cr)) {
cand.erase(u);
changed = true;
}
}
} while (changed);
/* Note: it may be possible to find a (larger) sustain set for a smaller
* new_cr */
return cand;
}
/** Tries to widen the reachability of cyclic vertex \p v to the union of its
 * current reachability and the intersection of its proper predecessors'
 * reachability.
 *
 * The widening is only applied if, for every proper predecessor p of v, each
 * predecessor pp of p has edges into p's sustain set whose combined
 * reachability covers the reachability of p.
 *
 * \return true if the reachability of \p v was changed. */
static
bool enlargeCyclicVertex(NGHolder &g, som_type som, NFAVertex v) {
    DEBUG_PRINTF("considering vertex %u\n", g[v].index);
    const CharReach &current = g[v].char_reach;

    CharReach widened;
    predCRIntersection(g, v, widened);
    widened |= current;

    if (widened == current) {
        DEBUG_PRINTF("no benefit\n");
        return false;
    }

    DEBUG_PRINTF("cr of width %zu up for grabs\n",
                 widened.count() - current.count());

    /* if we are tracking som, entries into a state from sds are
     * significant. */
    bool ignore_sds = som;

    for (auto p : inv_adjacent_vertices_range(v, g)) {
        if (p == v) {
            continue;
        }
        DEBUG_PRINTF("looking at pred %u\n", g[p].index);

        const set<NFAVertex> sustain
            = findSustainSet(g, p, ignore_sds, widened);
        DEBUG_PRINTF("sustain set is %zu\n", sustain.size());
        if (sustain.empty()) {
            DEBUG_PRINTF("yawn\n");
        }

        for (auto pp : inv_adjacent_vertices_range(p, g)) {
            /* whenever pp sets p, a member of the sustain set must be set
             * as well. Note: p's cr may not be a subset of the widened cr */
            CharReach sustain_cr;
            for (auto pv : adjacent_vertices_range(pp, g)) {
                if (contains(sustain, pv)) {
                    sustain_cr |= g[pv].char_reach;
                }
            }

            if (g[p].char_reach.isSubsetOf(sustain_cr)) {
                continue;
            }
            DEBUG_PRINTF("unable to establish that preds are forced on\n");
            return false;
        }
    }

    /* every pred passed the check: the cr can be increased */
    g[v].char_reach = widened;
    DEBUG_PRINTF("vertex %u was widened\n", g[v].index);
    return true;
}
/** Reverse-direction widening: tries to widen the reachability of cyclic
 * vertex \p v to the union of its current reachability and the intersection
 * of its proper successors' reachability.
 *
 * The widening is only applied if, for every proper successor p of v, each
 * successor pp of p has predecessors inside p's reverse sustain set whose
 * combined reachability covers the reachability of p.
 *
 * \return true if the reachability of \p v was changed. */
static
bool enlargeCyclicVertex_rev(NGHolder &g, NFAVertex v) {
    DEBUG_PRINTF("considering vertex %u\n", g[v].index);
    const CharReach &current = g[v].char_reach;

    CharReach widened;
    succCRIntersection(g, v, widened);
    widened |= current;

    if (widened == current) {
        DEBUG_PRINTF("no benefit\n");
        return false;
    }

    DEBUG_PRINTF("cr of width %zu up for grabs\n",
                 widened.count() - current.count());

    for (auto p : adjacent_vertices_range(v, g)) {
        if (p == v) {
            continue;
        }
        DEBUG_PRINTF("looking at succ %u\n", g[p].index);

        const set<NFAVertex> sustain = findSustainSet_rev(g, p, widened);
        DEBUG_PRINTF("sustain set is %zu\n", sustain.size());
        if (sustain.empty()) {
            DEBUG_PRINTF("yawn\n");
        }

        for (auto pp : adjacent_vertices_range(p, g)) {
            /* mirror of the forward check: the sustain-set members feeding
             * pp must between them cover p's cr */
            CharReach sustain_cr;
            for (auto pv : inv_adjacent_vertices_range(pp, g)) {
                if (contains(sustain, pv)) {
                    sustain_cr |= g[pv].char_reach;
                }
            }

            if (g[p].char_reach.isSubsetOf(sustain_cr)) {
                continue;
            }
            DEBUG_PRINTF("unable to establish that succs are thingy\n");
            return false;
        }
    }

    /* every succ passed the check: the cr can be increased */
    g[v].char_reach = widened;
    DEBUG_PRINTF("vertex %u was widened\n", g[v].index);
    return true;
}
/** Runs the forward widening over every candidate found in \p g.
 *
 * Every candidate is attempted, regardless of earlier results.
 *
 * \return true if at least one vertex had its reachability widened. */
static
bool enlargeCyclicCR(NGHolder &g, som_type som,
                     const vector<NFAVertex> &ordering) {
    DEBUG_PRINTF("hello\n");

    vector<NFAVertex> candidates;
    findCandidates(g, ordering, &candidates);

    bool widened_any = false;
    for (auto v : candidates) {
        if (enlargeCyclicVertex(g, som, v)) {
            widened_any = true;
        }
    }
    return widened_any;
}
/** Runs the reverse widening over every candidate found in \p g.
 *
 * Every candidate is attempted, regardless of earlier results.
 *
 * \return true if at least one vertex had its reachability widened. */
static
bool enlargeCyclicCR_rev(NGHolder &g, const vector<NFAVertex> &ordering) {
    DEBUG_PRINTF("olleh\n");

    vector<NFAVertex> candidates;
    findCandidates_rev(g, ordering, &candidates);

    bool widened_any = false;
    for (auto v : candidates) {
        if (enlargeCyclicVertex_rev(g, v)) {
            widened_any = true;
        }
    }
    return widened_any;
}
/** Widens the reachability of cyclic vertices in \p g where this does not
 * change the behaviour of the graph, first in the forward direction and then
 * in the reverse direction.
 *
 * Both passes always run, even if the first one already changed the graph.
 *
 * \param g graph to improve in place.
 * \param som passed to the forward pass; see enlargeCyclicVertex().
 * \return true if either pass widened at least one vertex. */
bool improveGraph(NGHolder &g, som_type som) {
    /* use a topo ordering so that we can get chains of cyclic states
     * done in one sweep */
    const vector<NFAVertex> ordering = getTopoOrdering(g);

    /* Both passes mutate g, so run them as separate statements: the operands
     * of a bitwise '|' have no guaranteed evaluation order, which would leave
     * it up to the compiler which pass sees the graph first. */
    const bool widened_fwd = enlargeCyclicCR(g, som, ordering);
    const bool widened_rev = enlargeCyclicCR_rev(g, ordering);

    return widened_fwd || widened_rev;
}
/** Finds a smaller reachability for vertex \p v by applying the reverse of
 * the enlargeCyclicCR transformation.
 *
 * The reduction only applies to one specific shape: v has exactly one proper
 * predecessor p; p has no self-loop and is fed either by a single vertex or
 * by exactly start and startDs; that feeder (pp) has a self-loop, is not
 * recorded in \p br_cyclic as a bounded repeat, and has a reachability
 * covering both v and p; every successor of v is a successor of p; and, if v
 * leads to an accept, the reports of v are a subset of the reports of p.
 *
 * \return the reachability of v with p's reachability removed when the shape
 * matches, otherwise the unmodified reachability of v. */
CharReach reduced_cr(NFAVertex v, const NGHolder &g,
                     const map<NFAVertex, BoundedRepeatSummary> &br_cyclic) {
    DEBUG_PRINTF("find minimal cr for %u\n", g[v].index);

    CharReach v_cr = g[v].char_reach;
    if (proper_in_degree(v, g) != 1) {
        return v_cr;
    }

    NFAVertex p = getSoleSourceVertex(g, v);
    assert(p);

    /* require p to be fed by one vertex OR (start + startDS) */
    const size_t p_in_degree = in_degree(p, g);
    if (hasSelfLoop(p, g)) {
        return v_cr; /* not cliche */
    }

    NFAVertex pp;
    if (p_in_degree == 1) {
        pp = getSoleSourceVertex(g, p);
    } else if (p_in_degree == 2
               && edge(g.start, p, g).second
               && edge(g.startDs, p, g).second) {
        pp = g.startDs;
    } else {
        return v_cr; /* not cliche */
    }
    assert(pp);

    /* require pp to be cyclic and its cr to be a superset of p and v */
    if (!hasSelfLoop(pp, g)) {
        return v_cr; /* not cliche */
    }

    auto br_it = br_cyclic.find(pp);
    if (br_it != br_cyclic.end() && !br_it->second.unbounded()) {
        return v_cr; /* fake cyclic */
    }

    const CharReach &p_cr = g[p].char_reach;
    const CharReach &pp_cr = g[pp].char_reach;
    const bool pp_covers_both = v_cr.isSubsetOf(pp_cr)
                                && p_cr.isSubsetOf(pp_cr);
    if (!pp_covers_both) {
        return v_cr; /* not cliche */
    }

    DEBUG_PRINTF("confirming [x]* prop\n");

    /* we require all of v succs to be succ of p */
    set<NFAVertex> v_succ;
    set<NFAVertex> p_succ;
    insert(&v_succ, adjacent_vertices(v, g));
    insert(&p_succ, adjacent_vertices(p, g));
    if (!is_subset_of(v_succ, p_succ)) {
        DEBUG_PRINTF("fail\n");
        return v_cr; /* not cliche */
    }

    /* if v can raise reports, p must be able to raise all of them too */
    const bool v_accepts = contains(v_succ, g.accept)
                           || contains(v_succ, g.acceptEod);
    if (v_accepts && !is_subset_of(g[v].reports, g[p].reports)) {
        DEBUG_PRINTF("fail - reports not subset\n");
        return v_cr; /* not cliche */
    }

    DEBUG_PRINTF("woot success\n");
    v_cr &= ~p_cr;
    return v_cr;
}
/** Computes the reduced reachability of every vertex in \p g.
 *
 * \return a vector with one entry per vertex, where the entry at position
 * g[v].index holds the result of the single-vertex reduced_cr() for v. */
vector<CharReach> reduced_cr(const NGHolder &g,
                    const map<NFAVertex, BoundedRepeatSummary> &br_cyclic) {
    /* results are stored by vertex index, so the indices must be valid
     * positions in the table */
    assert(hasCorrectlyNumberedVertices(g));

    vector<CharReach> per_vertex(num_vertices(g));
    for (auto v : vertices_range(g)) {
        per_vertex[g[v].index] = reduced_cr(v, g, br_cyclic);
    }
    return per_vertex;
}
/** Returns true if \p v has an out-edge to a special vertex other than
 * \p v itself. */
static
bool anyOutSpecial(NFAVertex v, const NGHolder &g) {
    for (auto w : adjacent_vertices_range(v, g)) {
        if (w == v) {
            continue; // a self-loop never counts
        }
        if (is_special(w, g)) {
            return true;
        }
    }
    return false;
}
/** Folds cyclic dot vertices hanging off start into startDs.
 *
 * For each non-special successor v of start that has full reachability and a
 * self-loop (and no edge to a special vertex), every successor t of v that
 * also has an edge from start gets an edge from startDs, and the edge v->t is
 * scheduled for removal. If that covers all of v's proper out-edges, v itself
 * is removed instead. Useless vertices are pruned afterwards.
 *
 * Nothing is done if startDs has more than one out-edge.
 *
 * \return true if any edge or vertex was removed. */
bool mergeCyclicDotStars(NGHolder &g) {
    // avoid graphs where startDs is not a free spirit
    if (out_degree(g.startDs, g) > 1) {
        return false;
    }

    set<NFAVertex> verticesToRemove;
    set<NFAEdge> edgesToRemove;

    // look for cyclic dots among the successors of start
    for (auto v : adjacent_vertices_range(g.start, g)) {
        if (is_special(v, g)) {
            continue;
        }

        // only a cyclic dot is of interest
        const CharReach &cr = g[v].char_reach;
        if (!cr.all() || !edge(v, v, g).second) {
            continue;
        }

        // prevent insane graphs
        if (anyOutSpecial(v, g)) {
            continue;
        }

        // out-edges of v made redundant by an equivalent edge from startDs;
        // whether v itself goes is decided once all of them are known
        vector<NFAEdge> deadEdges;
        for (const auto &e : out_edges_range(v, g)) {
            NFAVertex t = target(e, g);

            // the self-loop and targets not reachable from start are left
            // alone
            if (t == v || !edge(g.start, t, g).second) {
                continue;
            }

            // startDs takes over the job of feeding t
            add_edge_if_not_present(g.startDs, t, g);
            deadEdges.push_back(e);
        }

        // if every proper out-edge is dead the whole vertex can go;
        // otherwise only the dead edges are removed
        if (deadEdges.size() == proper_out_degree(v, g)) {
            verticesToRemove.insert(v);
        } else {
            edgesToRemove.insert(deadEdges.begin(), deadEdges.end());
        }
    }

    if (verticesToRemove.empty() && edgesToRemove.empty()) {
        return false;
    }

    DEBUG_PRINTF("removing %zu edges and %zu vertices\n", edgesToRemove.size(),
                 verticesToRemove.size());
    remove_edges(edgesToRemove, g);
    remove_vertices(verticesToRemove, g);

    /* some predecessors to the cyclic vertices may no longer be useful (no out
     * edges), so we can remove them */
    pruneUseless(g);
    return true;
}
} // namespace ue2