vectorscan/src/nfagraph/ng_squash.cpp

696 lines
23 KiB
C++

/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph state squashing analysis.
*
* The basic idea behind the state squashing is that when we are in a cyclic
* state v there are certain other states which are completely irrelevant. This
* is used primarily by the determinisation process to produce smaller DFAs by
* not tracking irrelevant states. It's also used by the LimEx NFA model.
*
* Working out which states we can ignore mainly uses the post-dominator
* analysis.
*
* ### Dot Squash Masks:
*
* The following vertices are added to the squash mask:
* - (1) Any vertex post-dominated by the cyclic dot state
* - (2) Any other vertex post-dominated by the cyclic dot state's successors
* - (3) Any vertex post-dominated by a predecessor of the cyclic dot state -
* provided the predecessor's successors are a subset of the cyclic state's
* successors [For (3), the term successor also includes report information]
*
* (2) and (3) allow us to get squash masks from .* as well as .+
*
* The squash masks are not optimal, especially in the case where there are
* alternations on both sides - for example in:
*
* /foo(bar|baz).*(abc|xyz)/s
*
* 'foo' is irrelevant once the dot star is hit, but it has no post-dominators
* so isn't picked up ('bar' and 'baz' are picked up by (2)). We may be able to
* do a more complete analysis based on cutting the graph and seeing which
* vertices are unreachable but the current approach is quick and probably
* adequate.
*
*
* ### Non-Dot Squash Masks:
*
* As for dot states. However, if anything in a pdom tree falls outside the
* character range of the cyclic state the whole pdom tree is ignored. Also when
* considering the predecessor's pdom tree it is necessary to verify that the
* predecessor's character reachability falls within that of the cyclic state.
*
* We could do better in this case by not throwing away the whole pdom tree -
* however the bits which we can keep are not clear from the pdom tree of the
* cyclic state - it probably can be based on the dom or pdom tree of the bad
* vertex.
*
* An example of us doing badly is:
*
* /HTTP.*Referer[^\n]*google/s
*
* as '[^\n]*' doesn't get a squash mask at all due to .* but we should be able
* to squash 'Referer'.
*
* ### Extension:
*
* If a state leads solely to a squashable state (or its immediate successors)
* with the same reachability we can make this state a squash state of any of
* the original state's squashees which we postdominate. Could probably tighten
* this up but it would require thought. May not need to keep the original
* squasher around but that would also require thought.
*
* ### SOM Notes:
*
* If (left) start of match is required, it is illegal to squash any state which
* may result in an early start of match reaching the squashing state.
*/
#include "config.h"
#include "ng_squash.h"
#include "ng_dominators.h"
#include "ng_dump.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_region.h"
#include "ng_restructuring.h"
#include "ng_som_util.h"
#include "ng_util.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
#include "ue2common.h"
#include <deque>
#include <map>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/reverse_graph.hpp>
using namespace std;
namespace ue2 {
/** Post-dominator tree: maps a vertex to the set of vertices that
 * buildPDomTree() records directly beneath it. */
using PostDomTree =
    ue2::unordered_map<NFAVertex, ue2::unordered_set<NFAVertex>>;
static
void buildPDomTree(const NGHolder &g, PostDomTree &tree) {
ue2::unordered_map<NFAVertex, NFAVertex> postdominators =
findPostDominators(g);
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
NFAVertex pdom = postdominators[v];
if (pdom) {
DEBUG_PRINTF("vertex %u -> %u\n", g[pdom].index,
g[v].index);
tree[pdom].insert(v);
}
}
}
/**
 * Builds the squash mask for \p v from the subtree of \p tree rooted at v,
 * given the character reachability \p cr of the squashing state.
 *
 * While being built, a set bit in \p mask means "squashed"; the mask is
 * flipped just before returning so that callers see set bits as survivors.
 * The result is conservative for non-dot reaches: if any vertex in the subtree
 * has reachability outside \p cr, nothing at all is squashed.
 *
 * When \p som is set, a vertex is only squashed if it cannot be holding an
 * earlier start of match than v (see the checks below).
 */
static
void buildSquashMask(NFAStateSet &mask, const NGHolder &g, NFAVertex v,
                     const CharReach &cr, const NFAStateSet &init,
                     const vector<NFAVertex> &vByIndex, const PostDomTree &tree,
                     som_type som, const vector<DepthMinMax> &som_depths,
                     const ue2::unordered_map<NFAVertex, u32> &region_map,
                     smgb_cache &cache) {
    DEBUG_PRINTF("build base squash mask for vertex %u)\n",
                 g[v].index);

    const u32 v_index = g[v].index;

    /* Stack of pdom-tree descendants of v that still need to be examined. */
    vector<NFAVertex> pending;
    const auto root = tree.find(v);
    if (root != tree.end()) {
        pending.insert(pending.end(), root->second.begin(),
                       root->second.end());
    }

    while (!pending.empty()) {
        const NFAVertex u = pending.back();
        pending.pop_back();

        const CharReach &u_reach = g[u].char_reach;
        if ((u_reach & ~cr).any()) {
            /* Bail: u accepts characters that the squasher does not. Setting
             * every bit and returning before the final flip yields a mask in
             * which every state survives.
             *
             * TODO: this could be better. We would still need to record any
             * paths leading to u, so every vertex that can reach u would have
             * to be kept out of the squash mask - and such a vertex may sit in
             * the pdom tree of something else still on the stack. */
            mask.set();
            return;
        }

        const u32 u_index = g[u].index;

        bool squashable = true;
        if (som) {
            /* u may only be squashed by v if it cannot hold an earlier start
             * of match offset, i.e. maxSomDist(u) <= minSomDist(v). */
            const depth &max_som_dist_u = som_depths[u_index].max;
            const depth &min_som_dist_v = som_depths[v_index].min;

            /* An infinite max distance makes the comparison uninformative;
             * in that case fall back on ordering information. */
            bool ordering_proves_safe = false;
            if (max_som_dist_u.is_infinite()) {
                ordering_proves_safe =
                    mustBeSetBefore(u, v, g, cache)
                    && !somMayGoBackwards(u, g, region_map, cache);
                if (ordering_proves_safe) {
                    DEBUG_PRINTF("u %u v %u\n", u_index, v_index);
                }
            }

            if (!ordering_proves_safe && max_som_dist_u > min_som_dist_v) {
                /* u may be storing an earlier SOM, so leave it alone */
                squashable = false;
            }
        }

        if (squashable) {
            mask.set(u_index);
            DEBUG_PRINTF("pdom'ed %u\n", u_index);
        }

        /* Descend into u's children whether or not u itself was squashed. */
        const auto kids = tree.find(u);
        if (kids != tree.end()) {
            pending.insert(pending.end(), kids->second.begin(),
                           kids->second.end());
        }
    }

    if (cr.all()) {
        /* Init states are not in the pdom tree. An init state is treated as
         * post-dominated if each of its successors is itself, v, or already
         * in the mask. */
        /* Note: init states will always result in a later som */
        for (size_t i = init.find_first(); i != init.npos;
             i = init.find_next(i)) {
            const NFAVertex iv = vByIndex[i];

            bool dominated = true;
            for (auto w : adjacent_vertices_range(iv, g)) {
                /* Yes vacuous patterns do exist */
                if (w == g.accept || w == g.acceptEod) {
                    DEBUG_PRINTF("skipping %zu due to vacuous accept\n", i);
                    dominated = false;
                    break;
                }

                const u32 vert_id = g[w].index;
                if (w != iv && w != v && !mask.test(vert_id)) {
                    DEBUG_PRINTF("skipping %zu due to %u\n", i, vert_id);
                    dominated = false;
                    break;
                }
            }

            if (dominated) {
                DEBUG_PRINTF("pdom'ed %zu\n", i);
                mask.set(i);
            }
        }
    }

    /* Switch from "squashed" sense to "survivor" sense. */
    mask.flip();
}
/** Sets in \p succ the index bit of every non-special successor of \p v. */
static
void buildSucc(NFAStateSet &succ, const NGHolder &g, NFAVertex v) {
    for (auto next : adjacent_vertices_range(v, g)) {
        if (is_special(next, g)) {
            continue;
        }
        succ.set(g[next].index);
    }
}
/** Sets in \p pred the index bit of every non-special predecessor of \p v. */
static
void buildPred(NFAStateSet &pred, const NGHolder &g, NFAVertex v) {
    for (auto prev : inv_adjacent_vertices_range(v, g)) {
        if (is_special(prev, g)) {
            continue;
        }
        pred.set(g[prev].index);
    }
}
static
void findDerivedSquashers(const NGHolder &g, const vector<NFAVertex> &vByIndex,
const PostDomTree &pdom_tree, const NFAStateSet &init,
map<NFAVertex, NFAStateSet> *squash, som_type som,
const vector<DepthMinMax> &som_depths,
const ue2::unordered_map<NFAVertex, u32> &region_map,
smgb_cache &cache) {
deque<NFAVertex> remaining;
for (const auto &m : *squash) {
remaining.push_back(m.first);
}
while (!remaining.empty()) {
NFAVertex v = remaining.back();
remaining.pop_back();
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (is_special(u, g)) {
continue;
}
if (g[v].char_reach != g[u].char_reach) {
continue;
}
if (out_degree(u, g) != 1) {
continue;
}
NFAStateSet u_squash(init.size());
u32 u_index = g[u].index;
buildSquashMask(u_squash, g, u, g[u].char_reach, init, vByIndex,
pdom_tree, som, som_depths, region_map, cache);
u_squash.set(u_index); /* never clear ourselves */
if ((~u_squash).any()) { // i.e. some bits unset in mask
DEBUG_PRINTF("%u is an upstream squasher of %u\n", u_index,
g[v].index);
(*squash)[u] = u_squash;
remaining.push_back(u);
}
}
}
}
/* If there are redundant states in the graph, it may be possible for two sibling
* .* states to try to squash each other -- which should be prevented
*
* Note: this situation should only happen if ng_equivalence has not been run.
*/
static
void clearMutualSquashers(const NGHolder &g, const vector<NFAVertex> &vByIndex,
map<NFAVertex, NFAStateSet> &squash) {
for (auto it = squash.begin(); it != squash.end();) {
NFAVertex a = it->first;
u32 a_index = g[a].index;
NFAStateSet a_squash = ~it->second; /* default is mask of survivors */
for (NFAStateSet::size_type b_index = a_squash.find_first();
b_index != a_squash.npos; b_index = a_squash.find_next(b_index)) {
assert(b_index != a_index);
NFAVertex b = vByIndex[b_index];
if (!contains(squash, b)) {
continue;
}
if (!squash[b].test(a_index)) {
/* b and a squash each other, prevent this */
DEBUG_PRINTF("removing mutual squash %u %zu\n",
a_index, b_index);
squash[b].set(a_index);
it->second.set(b_index);
}
}
if (it->second.all()) {
DEBUG_PRINTF("%u is no longer an effictive squash state\n", a_index);
it = squash.erase(it);
} else {
++it;
}
}
}
/**
 * Computes the squash masks for graph g.
 *
 * Returns a map from squasher vertex to a mask holding one bit per vertex
 * index (num_vertices(g) bits). A cleared bit marks a state that the squasher
 * switches off; a set bit marks a survivor. Only vertices whose mask has at
 * least one cleared bit are present in the result.
 *
 * Primary squashers are vertices with a self-loop that are not init states
 * (an init state being a vertex for which is_any_start() holds or which has no
 * in-edges). Each primary squasher's mask is built from its own pdom tree and
 * then tightened using eligible successors and predecessors. Afterwards
 * findDerivedSquashers() adds upstream squashers and clearMutualSquashers()
 * undoes pairs of states that squash each other.
 *
 * If som is set, region and SOM-distance information is computed up front and
 * used to reject squashes involving states that may hold an earlier start of
 * match.
 */
map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
map<NFAVertex, NFAStateSet> squash;
// Number of bits to use for all our masks. If we're a triggered graph,
// tops have already been assigned, so we don't have to account for them.
const u32 numStates = num_vertices(g);
// Build post-dominator tree.
PostDomTree pdom_tree;
buildPDomTree(g, pdom_tree);
// Build list of vertices by state ID and a set of init states.
vector<NFAVertex> vByIndex(numStates, NFAGraph::null_vertex());
NFAStateSet initStates(numStates);
smgb_cache cache(g);
// Mappings used for SOM mode calculations, otherwise left empty.
unordered_map<NFAVertex, u32> region_map;
vector<DepthMinMax> som_depths;
if (som) {
region_map = assignRegions(g);
som_depths = getDistancesFromSOM(g);
}
// Populate vByIndex and mark the init states.
for (auto v : vertices_range(g)) {
const u32 vert_id = g[v].index;
DEBUG_PRINTF("vertex %u/%u\n", vert_id, numStates);
assert(vert_id < numStates);
vByIndex[vert_id] = v;
if (is_any_start(v, g) || !in_degree(v, g)) {
initStates.set(vert_id);
}
}
// Consider every vertex, in index order, as a potential primary squasher.
for (u32 i = 0; i < numStates; i++) {
NFAVertex v = vByIndex[i];
assert(v != NFAGraph::null_vertex());
const CharReach &cr = g[v].char_reach;
/* only non-init cyclics can be squashers */
if (!hasSelfLoop(v, g) || initStates.test(i)) {
continue;
}
DEBUG_PRINTF("state %u is cyclic\n", i);
/* mask starts as the squash mask from v's own pdom tree; succ and pred hold
 * the index bits of v's non-special successors and predecessors */
NFAStateSet mask(numStates), succ(numStates), pred(numStates);
buildSquashMask(mask, g, v, cr, initStates, vByIndex, pdom_tree, som,
som_depths, region_map, cache);
buildSucc(succ, g, v);
buildPred(pred, g, v);
const auto &reports = g[v].reports;
/* A successor with exactly the same non-special predecessor set as v
 * contributes the squash mask of its own pdom tree, built against v's
 * reach. Masks hold survivors, so and-ing them accumulates squashed states. */
for (size_t j = succ.find_first(); j != succ.npos;
j = succ.find_next(j)) {
NFAVertex vj = vByIndex[j];
NFAStateSet pred2(numStates);
buildPred(pred2, g, vj);
if (pred2 == pred) {
DEBUG_PRINTF("adding the sm from %zu to %u's sm\n", j, i);
NFAStateSet tmp(numStates);
buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
som, som_depths, region_map, cache);
mask &= tmp;
}
}
/* A predecessor can contribute its pdom tree (and itself) provided it passes
 * each of the checks below. */
for (size_t j = pred.find_first(); j != pred.npos;
j = pred.find_next(j)) {
NFAVertex vj = vByIndex[j];
NFAStateSet succ2(numStates);
buildSucc(succ2, g, vj);
/* we can use j as a basis for squashing if its succs are a subset
* of ours */
if ((succ2 & ~succ).any()) {
continue;
}
if (som) {
/* We cannot use j to add to the squash mask of v if it may
* have an earlier start of match offset. ie for us j as a
* basis for the squash mask of v we require:
* maxSomDist(j) <= minSomDist(v)
*/
/* ** TODO ** */
const depth &max_som_dist_j =
som_depths[g[vj].index].max;
const depth &min_som_dist_v =
som_depths[g[v].index].min;
/* unlike buildSquashMask(), an infinite max distance is always rejected */
if (max_som_dist_j > min_som_dist_v ||
max_som_dist_j.is_infinite()) {
/* j can't be used as it may be storing an earlier SOM */
continue;
}
}
const CharReach &crv = g[vj].char_reach;
/* we also require that j's report information be a subset of ours
*/
/* every special vertex that j has an edge to must also be a direct successor
 * of v, otherwise j is skipped */
bool seen_special = false;
for (auto w : adjacent_vertices_range(vj, g)) {
if (is_special(w, g)) {
if (!edge(v, w, g).second) {
goto next_j;
}
seen_special = true;
}
}
// FIXME: should be subset check?
/* as written, the report sets must be equal, not merely a subset */
if (seen_special && g[vj].reports != reports) {
continue;
}
/* ok we can use j */
/* ... but only if j's reach lies within v's reach */
if ((crv & ~cr).none()) {
NFAStateSet tmp(numStates);
buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
som, som_depths, region_map, cache);
mask &= tmp;
/* j itself is squashed too */
mask.reset(j);
}
next_j:;
}
mask.set(i); /* never clear ourselves */
if ((~mask).any()) { // i.e. some bits unset in mask
DEBUG_PRINTF("%u squashes %zu other states\n", i, (~mask).count());
squash.emplace(v, mask);
}
}
/* Add upstream squashers derived from those found above, then remove any
 * pairs of states that would squash each other. */
findDerivedSquashers(g, vByIndex, pdom_tree, initStates, &squash, som,
som_depths, region_map, cache);
clearMutualSquashers(g, vByIndex, squash);
return squash;
}
#define MIN_PURE_ACYCLIC_SQUASH 10 /** magic number */
/** Some squash states are clearly not advantageous in the NFA, as they do
* incur the cost of an exception:
* -# acyclic states
* -# squash only a few acyclic states
*/
void filterSquashers(const NGHolder &g,
map<NFAVertex, NFAStateSet> &squash) {
DEBUG_PRINTF("filtering\n");
map<u32, NFAVertex> rev; /* vertex_index -> vertex */
for (auto v : vertices_range(g)) {
rev[g[v].index] = v;
}
for (auto v : vertices_range(g)) {
if (!contains(squash, v)) {
continue;
}
DEBUG_PRINTF("looking at squash set for vertex %u\n",
g[v].index);
if (!hasSelfLoop(v, g)) {
DEBUG_PRINTF("acyclic\n");
squash.erase(v);
continue;
}
NFAStateSet squashed = squash[v];
squashed.flip(); /* default sense for mask of survivors */
for (NFAStateSet::size_type sq = squashed.find_first();
sq != squashed.npos; sq = squashed.find_next(sq)) {
NFAVertex u = rev[sq];
if (hasSelfLoop(u, g)) {
DEBUG_PRINTF("squashing a cyclic (%zu) is always good\n", sq);
goto next_vertex;
}
}
if (squashed.count() < MIN_PURE_ACYCLIC_SQUASH) {
DEBUG_PRINTF("squash set too small\n");
squash.erase(v);
continue;
}
next_vertex:;
DEBUG_PRINTF("squash set ok\n");
}
}
/**
 * Adds to \p verts every predecessor of \p accept (other than g.accept
 * itself) whose reports are all external-callback reports with a valid
 * exhaustion key and no bounds.
 */
static
void getHighlanderReporters(const NGHolder &g, const NFAVertex accept,
                            const ReportManager &rm,
                            set<NFAVertex> &verts) {
    for (auto v : inv_adjacent_vertices_range(accept, g)) {
        if (v == g.accept) {
            continue;
        }

        const auto &reports = g[v].reports;
        if (reports.empty()) {
            assert(0);
            continue;
        }

        // Must be _all_ highlander callback reports. Bounded reports are
        // handled outside the NFA and probably shouldn't be pre-empted, so
        // they disqualify the vertex as well.
        bool all_highlander = true;
        for (auto report : reports) {
            const Report &ir = rm.getReport(report);
            if (ir.ekey == INVALID_EKEY || ir.type != EXTERNAL_CALLBACK
                || ir.hasBounds()) {
                all_highlander = false;
                break;
            }
        }

        if (all_highlander) {
            verts.insert(v);
        }
    }
}
/**
 * Removes from \p g every in-edge of accept and acceptEod whose source vertex
 * has a non-empty report set that is a subset of the reports of \p v.
 * Asserts that v has reports and that at least one edge is removed.
 */
static
void removeEdgesToAccept(NGHolder &g, NFAVertex v) {
    const auto &reports = g[v].reports;
    assert(!reports.empty());

    set<NFAEdge> dead;

    // Collects the qualifying in-edges of one accept vertex.
    const auto mark_covered_in_edges = [&](NFAVertex sink) {
        for (const auto &e : in_edges_range(sink, g)) {
            NFAVertex u = source(e, g);
            const auto &r = g[u].reports;
            if (r.empty() || !is_subset_of(r, reports)) {
                continue;
            }
            DEBUG_PRINTF("vertex %u\n", g[u].index);
            dead.insert(e);
        }
    };

    mark_covered_in_edges(g.accept);
    mark_covered_in_edges(g.acceptEod);

    assert(!dead.empty());
    remove_edges(dead, g);
}
/**
 * Returns the vertices of \p g that are not visited by a depth-first search
 * over the reversed graph starting at acceptEod.
 */
static
vector<NFAVertex> findUnreachable(const NGHolder &g) {
    const boost::reverse_graph<NFAGraph, const NFAGraph &> revg(g.g);

    // The colour map doubles as the visited set: the search only creates
    // entries for vertices it reaches.
    ue2::unordered_map<NFAVertex, boost::default_color_type> visited;
    visited.reserve(num_vertices(g));

    depth_first_visit(revg, g.acceptEod,
                      make_dfs_visitor(boost::null_visitor()),
                      make_assoc_property_map(visited));

    vector<NFAVertex> not_visited;
    for (auto v : vertices_range(revg)) {
        if (contains(visited, v)) {
            continue;
        }
        not_visited.push_back(v);
    }
    return not_visited;
}
/** Builds squash masks for states that can be switched off by highlander
 * (single match) reporters.
 *
 * Returns a map from reporter vertex to a mask with one bit per vertex index,
 * in which cleared bits mark the states that the reporter squashes. Reporters
 * that would squash nothing are not included. */
map<NFAVertex, NFAStateSet>
findHighlanderSquashers(const NGHolder &g, const ReportManager &rm) {
    map<NFAVertex, NFAStateSet> squash;

    set<NFAVertex> reporters;
    getHighlanderReporters(g, g.accept, rm, reporters);
    getHighlanderReporters(g, g.acceptEod, rm, reporters);
    if (reporters.empty()) {
        DEBUG_PRINTF("no highlander reports\n");
        return squash;
    }

    const u32 numStates = num_vertices(g);

    for (auto v : reporters) {
        DEBUG_PRINTF("vertex %u with %zu reports\n", g[v].index,
                     g[v].reports.size());

        // Find the set of vertices that lead only to v or to other reporters
        // with a subset of v's reports: clone the graph, cut the relevant
        // edges to accept and see which vertices become unreachable.
        ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
        NGHolder h;
        cloneHolder(h, g, &orig_to_copy);
        removeEdgesToAccept(h, orig_to_copy[v]);

        const vector<NFAVertex> unreach = findUnreachable(h);
        DEBUG_PRINTF("can squash %zu vertices\n", unreach.size());
        if (unreach.empty()) {
            continue;
        }

        // Start from an all-survivors mask the first time v is seen.
        auto slot = squash.find(v);
        if (slot == squash.end()) {
            NFAStateSet all_survive(numStates);
            all_survive.set();
            slot = squash.emplace(v, all_survive).first;
        }
        NFAStateSet &mask = slot->second;

        // NOTE(review): indices are read from the clone h; this presumably
        // relies on cloneHolder() preserving vertex indices - confirm.
        for (auto uv : unreach) {
            DEBUG_PRINTF("squashes index %u\n", h[uv].index);
            mask.reset(h[uv].index);
        }
    }

    return squash;
}
} // namespace ue2