Initial commit of Hyperscan

Matthew Barr
2015-10-20 09:13:35 +11:00
commit 904e436f11
610 changed files with 213627 additions and 0 deletions

599
src/nfagraph/ng.cpp Normal file

@@ -0,0 +1,599 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NG, NGHolder, NGWrapper and graph handling.
*/
#include "grey.h"
#include "ng.h"
#include "ng_anchored_acyclic.h"
#include "ng_anchored_dots.h"
#include "ng_asserts.h"
#include "ng_calc_components.h"
#include "ng_cyclic_redundancy.h"
#include "ng_dump.h"
#include "ng_edge_redundancy.h"
#include "ng_equivalence.h"
#include "ng_extparam.h"
#include "ng_fixed_width.h"
#include "ng_haig.h"
#include "ng_literal_component.h"
#include "ng_literal_decorated.h"
#include "ng_misc_opt.h"
#include "ng_puff.h"
#include "ng_prefilter.h"
#include "ng_prune.h"
#include "ng_redundancy.h"
#include "ng_region.h"
#include "ng_region_redundancy.h"
#include "ng_reports.h"
#include "ng_rose.h"
#include "ng_sep.h"
#include "ng_small_literal_set.h"
#include "ng_som.h"
#include "ng_vacuous.h"
#include "ng_utf8.h"
#include "ng_util.h"
#include "ng_width.h"
#include "ue2common.h"
#include "nfa/goughcompile.h"
#include "smallwrite/smallwrite_build.h"
#include "rose/rose_build.h"
#include "util/compile_error.h"
#include "util/container.h"
#include "util/depth.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/ue2string.h"
using namespace std;
namespace ue2 {
NG::NG(const CompileContext &in_cc, unsigned in_somPrecision)
: maxSomRevHistoryAvailable(in_cc.grey.somMaxRevNfaLength),
minWidth(depth::infinity()),
rm(in_cc.grey),
ssm(in_somPrecision),
cc(in_cc),
rose(makeRoseBuilder(rm, ssm, cc, boundary)),
smwr(makeSmallWriteBuilder(rm, cc)) {
}
NG::~NG() {
// empty
}
/** \brief SOM handling code, called by \ref addComponent.
*
* \return true if the component was handled completely by something (e.g. a
* Haig outfix), false if SOM could be established but implementation via an
* engine will be required.
*
* \throw CompileError if SOM cannot be supported for the component.
*/
static
bool addComponentSom(NG &ng, NGHolder &g, const NGWrapper &w,
const som_type som, const u32 comp_id) {
DEBUG_PRINTF("doing som\n");
dumpComponent(g, "03_presom", w.expressionIndex, comp_id, ng.cc.grey);
assert(hasCorrectlyNumberedVertices(g));
// First, we try the "SOM chain" support in ng_som.cpp.
sombe_rv rv = doSom(ng, g, w, comp_id, som);
if (rv == SOMBE_HANDLED_INTERNAL) {
return false;
} else if (rv == SOMBE_HANDLED_ALL) {
return true;
}
assert(rv == SOMBE_FAIL);
/* Next, try the Haig-based SOM approaches. */
rv = doSomWithHaig(ng, g, w, comp_id, som);
if (rv == SOMBE_HANDLED_INTERNAL) {
return false;
} else if (rv == SOMBE_HANDLED_ALL) {
return true;
}
assert(rv == SOMBE_FAIL);
// If the previous approach could not support this pattern, we try treating
// it monolithically, as a Haig outfix.
vector<vector<CharReach> > triggers; /* empty for outfix */
assert(g.kind == NFA_OUTFIX);
dumpComponent(g, "haig", w.expressionIndex, comp_id, ng.cc.grey);
auto haig = attemptToBuildHaig(g, som, ng.ssm.somPrecision(), triggers,
ng.cc.grey);
if (haig) {
DEBUG_PRINTF("built haig outfix\n");
ng.rose->addOutfix(g, *haig);
return true;
}
/* Our various strategies for supporting SOM for this pattern have failed.
* Report a generic "pattern too large" error, as it is unclear what a more
* specific SOM error would mean. */
throw CompileError(w.expressionIndex, "Pattern is too large.");
assert(0); // unreachable
return false;
}
void reduceGraph(NGHolder &g, som_type som, bool utf8,
const CompileContext &cc) {
if (!cc.grey.performGraphSimplification) {
return;
}
// We run reduction passes until either the graph stops changing or we hit
// a (small) limit.
if (!som) {
mergeCyclicDotStars(g);
}
const unsigned MAX_PASSES = 3;
for (unsigned pass = 1; pass <= MAX_PASSES; pass++) {
bool changed = false;
DEBUG_PRINTF("reduce pass %u/%u\n", pass, MAX_PASSES);
changed |= removeEdgeRedundancy(g, som, cc);
changed |= reduceGraphEquivalences(g, cc);
changed |= removeRedundancy(g, som);
if (!changed) {
DEBUG_PRINTF("graph unchanged after pass %u, stopping\n", pass);
break;
}
}
if (utf8) {
utf8DotRestoration(g, som);
}
/* Minor non-redundancy improvements */
if (improveGraph(g, som)) {
/* may be some more edges to remove */
removeEdgeRedundancy(g, som, cc);
}
removeCyclicPathRedundancy(g);
removeCyclicDominated(g, som);
if (!som) {
mergeCyclicDotStars(g);
}
if (!som) {
removeSiblingsOfStartDotStar(g);
}
}
static
bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som,
const u32 comp_id) {
const CompileContext &cc = ng.cc;
DEBUG_PRINTF("expr=%u, comp=%u: %zu vertices, %zu edges\n",
w.expressionIndex, comp_id, num_vertices(g), num_edges(g));
dumpComponent(g, "01_begin", w.expressionIndex, comp_id, ng.cc.grey);
reduceGraph(g, som, w.utf8, cc);
dumpComponent(g, "02_reduced", w.expressionIndex, comp_id, ng.cc.grey);
// There may be redundant regions that we can remove
if (cc.grey.performGraphSimplification) {
removeRegionRedundancy(g, som);
}
// "Short Exhaustible Passthrough" patterns always become outfixes.
if (!som && isSEP(g, ng.rm, cc.grey)) {
DEBUG_PRINTF("graph is SEP\n");
if (ng.rose->addOutfix(g)) {
return true;
}
}
// Start Of Match handling.
if (som) {
if (addComponentSom(ng, g, w, som, comp_id)) {
return true;
}
}
if (splitOffAnchoredAcyclic(*ng.rose, g, cc)) {
return true;
}
if (handleSmallLiteralSets(*ng.rose, g, cc)
|| handleFixedWidth(*ng.rose, g, cc.grey)) {
return true;
}
if (handleDecoratedLiterals(*ng.rose, g, cc)) {
return true;
}
if (splitOffRose(*ng.rose, g, w.prefilter, cc)) {
return true;
}
if (splitOffPuffs(*ng.rose, ng.rm, g, w.prefilter, cc)) {
return true;
}
if (handleSmallLiteralSets(*ng.rose, g, cc)
|| handleFixedWidth(*ng.rose, g, cc.grey)) {
return true;
}
if (handleDecoratedLiterals(*ng.rose, g, cc)) {
return true;
}
if (splitOffRose(*ng.rose, g, w.prefilter, cc)) {
return true;
}
// A final pass at cyclic redundancy and Rose
// TODO: investigate - coverage results suggest that this never succeeds?
if (cc.grey.performGraphSimplification) {
if (removeCyclicPathRedundancy(g) ||
removeCyclicDominated(g, som)) {
if (handleFixedWidth(*ng.rose, g, cc.grey)) {
return true;
}
}
}
if (finalChanceRose(*ng.rose, g, w.prefilter, cc)) {
return true;
}
DEBUG_PRINTF("testing for outfix\n");
assert(allMatchStatesHaveReports(g));
if (ng.rose->addOutfix(g)) {
return true;
}
return false;
}
// Returns true if all components have been added.
static
bool processComponents(NG &ng, NGWrapper &w,
deque<unique_ptr<NGHolder>> &g_comp,
const som_type som) {
const u32 num_components = g_comp.size();
u32 failed = 0;
for (u32 i = 0; i < num_components; i++) {
if (!g_comp[i]) {
continue;
}
if (addComponent(ng, *g_comp[i], w, som, i)) {
g_comp[i].reset();
continue;
}
if (som) { /* bail immediately */
return false;
}
failed++;
}
if (!failed) {
DEBUG_PRINTF("all components claimed\n");
return true;
}
DEBUG_PRINTF("%u components still remain\n", failed);
return false;
}
bool NG::addGraph(NGWrapper &w) {
// remove reports that aren't on vertices connected to accept.
clearReports(w);
som_type som = w.som;
if (som && isVacuous(w)) {
throw CompileError(w.expressionIndex, "Start of match is not "
"currently supported for patterns which match an "
"empty buffer.");
}
dumpDotWrapper(w, "01_initial", cc.grey);
assert(allMatchStatesHaveReports(w));
/* ensure utf8 starts at cp boundary */
ensureCodePointStart(rm, w);
resolveAsserts(rm, w);
dumpDotWrapper(w, "02_post_assert_resolve", cc.grey);
assert(allMatchStatesHaveReports(w));
pruneUseless(w);
pruneEmptyVertices(w);
if (can_never_match(w)) {
throw CompileError(w.expressionIndex, "Pattern can never match.");
}
optimiseVirtualStarts(w); /* good for som */
handleExtendedParams(rm, w, cc);
if (w.min_length) {
// We have a minimum length constraint, which we currently use SOM to
// satisfy.
som = SOM_LEFT;
ssm.somPrecision(8);
}
if (som) {
rose->setSom();
}
// first, we can perform graph work that can be done on an individual
// expression basis.
if (w.utf8) {
relaxForbiddenUtf8(w);
}
if (w.highlander && !w.min_length && !w.min_offset) {
// In highlander mode: if we don't have constraints on our reports that
// may prevent us accepting our first match (i.e. extended params) we
// can prune the other out-edges of all vertices connected to accept.
pruneHighlanderAccepts(w, rm);
}
dumpDotWrapper(w, "02b_fairly_early", cc.grey);
// If we're a vacuous pattern, we can handle this early.
if (splitOffVacuous(boundary, rm, w)) {
DEBUG_PRINTF("split off vacuous\n");
}
// We might be done at this point: if we've run out of vertices, we can
// stop processing.
if (num_vertices(w) == N_SPECIALS) {
DEBUG_PRINTF("all vertices claimed by vacuous handling\n");
return true;
}
// Now that vacuous edges have been removed, update the min width exclusive
// of boundary reports.
minWidth = min(minWidth, findMinWidth(w));
// Add the pattern to the small write builder.
smwr->add(w);
if (!som) {
removeSiblingsOfStartDotStar(w);
}
dumpDotWrapper(w, "03_early", cc.grey);
// If we've got some literals that span the graph from start to accept, we
// can split them off into Rose from here.
if (!som) {
if (splitOffLiterals(*this, w)) {
DEBUG_PRINTF("some vertices claimed by literals\n");
}
}
// We might be done at this point: if we've run out of vertices, we can
// stop processing.
if (num_vertices(w) == N_SPECIALS) {
DEBUG_PRINTF("all vertices claimed before calc components\n");
return true;
}
// Split the graph into a set of connected components.
deque<unique_ptr<NGHolder>> g_comp = calcComponents(w);
assert(!g_comp.empty());
if (!som) {
for (u32 i = 0; i < g_comp.size(); i++) {
assert(g_comp[i]);
reformLeadingDots(*g_comp[i]);
}
recalcComponents(g_comp);
}
if (processComponents(*this, w, g_comp, som)) {
return true;
}
// If we're in prefiltering mode, we can run the prefilter reductions and
// have another shot at accepting the graph.
if (cc.grey.prefilterReductions && w.prefilter) {
for (u32 i = 0; i < g_comp.size(); i++) {
if (!g_comp[i]) {
continue;
}
prefilterReductions(*g_comp[i], cc);
}
if (processComponents(*this, w, g_comp, som)) {
return true;
}
}
// We must have components that could not be compiled.
for (u32 i = 0; i < g_comp.size(); i++) {
if (g_comp[i]) {
DEBUG_PRINTF("could not compile component %u with %zu vertices\n",
i, num_vertices(*g_comp[i]));
throw CompileError(w.expressionIndex, "Pattern is too large.");
}
}
assert(0); // should have thrown.
return false;
}
/** \brief Used from SOM mode to add an arbitrary NGHolder as an engine. */
bool NG::addHolder(NGHolder &w) {
DEBUG_PRINTF("adding holder of %zu states\n", num_vertices(w));
assert(allMatchStatesHaveReports(w));
assert(hasCorrectlyNumberedVertices(w));
/* We don't update the global minWidth here as we care about the min width
* of the whole pattern - not just a prefix of it. */
bool prefilter = false;
//dumpDotComp(comp, w, *this, 20, "prefix_init");
som_type som = SOM_NONE; /* the prefixes created by the SOM code do not
themselves track som */
bool utf8 = false; // handling done earlier
reduceGraph(w, som, utf8, cc);
// There may be redundant regions that we can remove
if (cc.grey.performGraphSimplification) {
removeRegionRedundancy(w, som);
}
// "Short Exhaustible Passthrough" patterns always become outfixes.
if (isSEP(w, rm, cc.grey)) {
DEBUG_PRINTF("graph is SEP\n");
if (rose->addOutfix(w)) {
return true;
}
}
if (splitOffAnchoredAcyclic(*rose, w, cc)) {
return true;
}
if (handleSmallLiteralSets(*rose, w, cc)
|| handleFixedWidth(*rose, w, cc.grey)) {
return true;
}
if (handleDecoratedLiterals(*rose, w, cc)) {
return true;
}
if (splitOffRose(*rose, w, prefilter, cc)) {
return true;
}
if (splitOffPuffs(*rose, rm, w, prefilter, cc)) {
return true;
}
if (splitOffRose(*rose, w, prefilter, cc)) {
return true;
}
if (finalChanceRose(*rose, w, prefilter, cc)) {
return true;
}
DEBUG_PRINTF("trying for outfix\n");
if (rose->addOutfix(w)) {
DEBUG_PRINTF("ok\n");
return true;
}
DEBUG_PRINTF("trying for outfix - failed\n");
DEBUG_PRINTF("nobody would take us\n");
return false;
}
bool NG::addLiteral(const ue2_literal &literal, u32 expr_index,
u32 external_report, bool highlander, som_type som) {
assert(!literal.empty());
if (!cc.grey.shortcutLiterals) {
return false;
}
// We can't natively handle arbitrary literals with mixed case sensitivity
// in Rose -- they require mechanisms like benefits masks, which have
// length limits etc. Better to let those go through full graph processing.
if (mixed_sensitivity(literal)) {
DEBUG_PRINTF("mixed sensitivity\n");
return false;
}
// Register external report and validate highlander constraints.
rm.registerExtReport(external_report,
external_report_info(highlander, expr_index));
ReportID id;
if (som) {
assert(!highlander); // not allowed, checked earlier.
Report r = makeSomRelativeCallback(external_report, 0, literal.length());
id = rm.getInternalId(r);
rose->setSom();
} else {
u32 ekey = highlander ? rm.getExhaustibleKey(external_report)
: INVALID_EKEY;
Report r = makeECallback(external_report, 0, ekey);
id = rm.getInternalId(r);
}
DEBUG_PRINTF("success: graph is literal '%s', report ID %u\n",
dumpString(literal).c_str(), id);
rose->add(false, false, literal, {id});
minWidth = min(minWidth, depth(literal.length()));
smwr->add(literal, id); /* inform small write handler about this literal */
return true;
}
NGWrapper::NGWrapper(unsigned int ei, bool highlander_in, bool utf8_in,
bool prefilter_in, som_type som_in, ReportID r,
u64a min_offset_in, u64a max_offset_in, u64a min_length_in)
: expressionIndex(ei), reportId(r), highlander(highlander_in),
utf8(utf8_in), prefilter(prefilter_in), som(som_in),
min_offset(min_offset_in), max_offset(max_offset_in),
min_length(min_length_in) {
// All special nodes/edges are added in NGHolder's constructor.
DEBUG_PRINTF("built %p: expr=%u report=%u%s%s%s%s "
"min_offset=%llu max_offset=%llu min_length=%llu\n",
this, expressionIndex, reportId,
highlander ? " highlander" : "",
utf8 ? " utf8" : "",
prefilter ? " prefilter" : "",
(som != SOM_NONE) ? " som" : "",
min_offset, max_offset, min_length);
}
NGWrapper::~NGWrapper() {}
} // namespace ue2
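
A minimal driver sketch for the NG front end above, not taken from this commit: it assumes a CompileContext and a populated NGWrapper per parsed expression are already available (graph construction itself lives in ng_builder.cpp), and compileAll plus the SOM precision value are illustrative placeholders.

#include "ng.h"

#include <memory>
#include <vector>

namespace ue2 {

// Illustrative driver, assuming each NGWrapper has been built and populated
// by the parser/builder stages.
static
bool compileAll(const CompileContext &cc,
                std::vector<std::unique_ptr<NGWrapper>> &exprs) {
    NG ng(cc, 8 /* SOM precision in bytes; value assumed for illustration */);
    for (const auto &w : exprs) {
        // addGraph() consumes the pattern, throwing CompileError if the
        // pattern cannot be supported at all.
        if (!ng.addGraph(*w)) {
            return false;
        }
    }
    // At this point ng.rose and ng.smwr hold the accumulated work for the
    // later bytecode-building stages (not shown).
    return true;
}

} // namespace ue2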

133
src/nfagraph/ng.h Normal file

@@ -0,0 +1,133 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NG, NGHolder, NGWrapper declarations.
*/
#ifndef NG_H
#define NG_H
#include "ng_holder.h"
#include "ue2common.h"
#include "parser/position.h"
#include "som/slot_manager.h"
#include "som/som.h"
#include "util/boundary_reports.h"
#include "util/compile_context.h"
#include "util/depth.h"
#include "util/graph.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include <deque>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#include <boost/core/noncopyable.hpp>
namespace ue2 {
struct CompileContext;
struct ue2_literal;
class NGWrapper : public NGHolder {
public:
NGWrapper(unsigned int expressionIndex, bool highlander, bool utf8,
bool prefilter, const som_type som, ReportID rid, u64a min_offset,
u64a max_offset, u64a min_length);
~NGWrapper();
/** index of the expression represented by this graph, used
* - down the track in error handling
* - identifying parts of an expression in highlander mode
*/
const unsigned int expressionIndex;
const ReportID reportId; /**< user-visible report id */
const bool highlander; /**< user-specified single match only */
const bool utf8; /**< UTF-8 mode */
const bool prefilter; /**< prefiltering mode */
const som_type som; /**< SOM type requested */
u64a min_offset; /**< extparam min_offset value */
u64a max_offset; /**< extparam max_offset value */
u64a min_length; /**< extparam min_length value */
};
class RoseBuild;
class SmallWriteBuild;
class NG : boost::noncopyable {
public:
NG(const CompileContext &in_cc, unsigned in_somPrecision);
~NG();
/** \brief Consumes a pattern, returns false or throws a CompileError
* exception if the graph cannot be consumed. */
bool addGraph(NGWrapper &w);
/** \brief Consumes a graph, cut-down version of addGraph for use by SOM
* processing. */
bool addHolder(NGHolder &h);
/** \brief Adds a literal to Rose, used by literal shortcut passes (instead of
* using \ref addGraph) */
bool addLiteral(const ue2_literal &lit, u32 expr_index, u32 external_report,
bool highlander, som_type som);
/** \brief Maximum history in bytes available for use by SOM reverse NFAs,
* a hack for pattern support (see UE-1903). This is always set to the max
* "lookbehind" length. */
const u32 maxSomRevHistoryAvailable;
/** \brief The length of the shortest corpus which can match a pattern
* contained in the NG (excluding the boundary reports used by vacuous
* patterns, which give an effective minWidth of zero). */
depth minWidth;
ReportManager rm;
SomSlotManager ssm;
BoundaryReports boundary;
const CompileContext cc;
const std::unique_ptr<RoseBuild> rose; //!< Rose builder.
const std::unique_ptr<SmallWriteBuild> smwr; //!< SmallWrite builder.
};
/** \brief Run graph reduction passes.
*
* Shared with the small write compiler.
*/
void reduceGraph(NGHolder &g, som_type som, bool utf8, const CompileContext &cc);
} // namespace ue2
#endif
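
To illustrate the literal shortcut declared above, a hedged sketch, not from this commit: a pattern that is a plain literal can skip graph processing entirely via NG::addLiteral(). The ue2_literal constructor used here is an assumption; see util/ue2string.h for the real interface.

#include "ng.h"
#include "util/ue2string.h"

#include <string>

namespace ue2 {

// Illustrative only: try the literal shortcut before falling back to full
// graph processing via addGraph().
static
bool tryLiteralShortcut(NG &ng, const std::string &pattern, u32 expr_index,
                        u32 external_report) {
    ue2_literal lit(pattern, false /* caseless flag; ctor shape assumed */);
    return ng.addLiteral(lit, expr_index, external_report,
                         false /* highlander */, SOM_NONE);
}

} // namespace ue2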


@@ -0,0 +1,67 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Anchored acyclic graph -> DFA analysis.
*/
#include "ng_anchored_acyclic.h"
#include "ng_holder.h"
#include "ng_reports.h"
#include "ng_util.h"
#include "ue2common.h"
#include "rose/rose_build.h"
#include "util/compile_context.h"
namespace ue2 {
bool splitOffAnchoredAcyclic(RoseBuild &rose, const NGHolder &h,
const CompileContext &cc) {
if (!cc.grey.allowAnchoredAcyclic) {
return false;
}
if (!isAnchored(h)) {
DEBUG_PRINTF("fail, not anchored\n");
return false;
}
if (!isAcyclic(h)) {
DEBUG_PRINTF("fail, not acyclic\n");
return false;
}
if (rose.addAnchoredAcyclic(h)) {
return true;
} else {
DEBUG_PRINTF("failed to add anchored nfa\n");
return false;
}
}
} // namespace ue2


@@ -0,0 +1,49 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Anchored acyclic graph -> DFA analysis.
*/
#ifndef NG_ANCHORED_ACYCLIC_H
#define NG_ANCHORED_ACYCLIC_H
namespace ue2 {
class NGHolder;
class RoseBuild;
struct CompileContext;
/** \brief Attempt to consume the entire pattern in graph \a h as an anchored
* acyclic DFA. Returns true if successful. */
bool splitOffAnchoredAcyclic(RoseBuild &rose, const NGHolder &h,
const CompileContext &cc);
} // namespace ue2
#endif // NG_ANCHORED_ACYCLIC_H


@@ -0,0 +1,654 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Analysis pass to reform leading dots.
*
* We have found that many regexes found in the wild use an anchored dot-repeat
* to represent an unanchored pattern, particularly if they have been used with
* a regex engine that assumes that a pattern is anchored. This pass reforms
* patterns that begin with sequences of dots into a more standard form.
*
* In addition, both anchored and unanchored patterns with dot repeats as
* prefixes will have these prefixes reformed into a canonical form, which some
* later analyses depend upon.
*/
#include "ng_anchored_dots.h"
#include "grey.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/depth.h"
#include "util/graph_range.h"
#include <algorithm>
#include <queue>
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
static
bool findStarts(const NGHolder &g, set<NFAVertex> &anchored,
set<NFAVertex> &unanchored) {
// Populate unanchored map
for (auto v : adjacent_vertices_range(g.startDs, g)) {
if (is_special(v, g)) {
continue;
}
unanchored.insert(v);
}
// Populate anchored map
for (auto v : adjacent_vertices_range(g.start, g)) {
if (is_special(v, g)) {
continue;
}
anchored.insert(v);
}
if (unanchored == anchored) {
anchored.clear();
} else if (!unanchored.empty() && !anchored.empty()) {
return false;
}
return !anchored.empty() || !unanchored.empty();
}
namespace {
class DotInfo {
public:
DotInfo(NFAVertex v, bool se, u32 idx)
: vertex(v), hasSelfLoop(se), index(idx) {}
bool operator<(const DotInfo &other) const {
if (hasSelfLoop != other.hasSelfLoop)
return hasSelfLoop < other.hasSelfLoop;
// tie break with vertex id: lowest ID wins
return index > other.index;
}
NFAVertex vertex;
bool hasSelfLoop;
u32 index;
};
}
// Returns null_vertex if none of the vertices in the given set are dots.
// We can only pick one dot vertex, so we prefer a dot-star if one exists,
// otherwise the dot with the lowest vertex ID.
static
NFAVertex findReformable(const NGHolder &g, const set<NFAVertex> &starts,
set<NFAVertex> &otherV) {
priority_queue<DotInfo> dotq;
for (auto v : starts) {
if (is_dot(v, g)) {
u32 idx = g[v].index;
dotq.push(DotInfo(v, hasSelfLoop(v, g), idx));
}
}
if (dotq.empty()) {
return NFAGraph::null_vertex();
}
const DotInfo &dot = dotq.top();
otherV = starts;
otherV.erase(dot.vertex);
DEBUG_PRINTF("selected dot vertex %u (%s)\n", dot.index,
dot.hasSelfLoop ? "has self-edge" : "no self-edge");
DEBUG_PRINTF("%zu other vertices\n", otherV.size());
return dot.vertex;
}
// Returns true if the given vertex is only preceded by start. If start is
// graph.startDs (i.e. unanchored), the given vertex can also be connected to
// graph.start. If selfLoopIsAcceptable is set, self-loops are ignored.
static
bool isStartNode(NFAVertex v, NFAVertex start, const NGHolder &g,
bool selfLoopIsAcceptable) {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (selfLoopIsAcceptable && u == v) {
continue;
} else if (u == start) {
continue;
} else if (start == g.startDs && u == g.start) {
continue;
} else {
return false;
}
}
return true;
}
// Note: this will only remove the anchored first dot in the chain -- any other
// removable nodes will be handled by the unanchored case below.
static
void reformAnchoredRepeatsComponent(NGHolder &g,
set<NFAVertex> &compAnchoredStarts,
set<NFAVertex> &compUnanchoredStarts,
set<NFAVertex> &dead, depth *startBegin,
depth *startEnd) {
// anchored cases can not have any unanchored starts
if (!compUnanchoredStarts.empty()) {
DEBUG_PRINTF("we have unanchored starts, skipping\n");
return;
}
NFAVertex dotV = NFAGraph::null_vertex();
set<NFAVertex> otherV;
dotV = findReformable(g, compAnchoredStarts, otherV);
if (dotV == NFAGraph::null_vertex()) {
DEBUG_PRINTF("no candidate reformable dot found.\n");
return;
}
NFAEdge loopEdge;
bool selfLoop = false;
bool bustOut = false;
for (const auto &e : out_edges_range(dotV, g)) {
NFAVertex t = target(e, g);
if (t == dotV) {
selfLoop = true;
loopEdge = e;
continue;
}
if (is_special(t, g)) {
bustOut = true;
break;
}
if (!otherV.empty() && otherV.find(t) == otherV.end()) {
bustOut = true;
break;
}
}
if (bustOut) {
DEBUG_PRINTF("busting out\n");
return;
}
if (!isStartNode(dotV, g.start, g, true)) {
DEBUG_PRINTF("fleeing: vertex %u has other preds\n", g[dotV].index);
return;
}
/* get bounds */
depth min;
depth max = 1;
if (selfLoop) {
// A self-loop indicates that this is a '.+' or '.*'
max = depth::infinity();
}
if (!otherV.empty()) {
/* We require that the successors of the dot node are the same as the
* other successors of the start vertex. TODO: remember why. */
if (selfLoop) {
if (otherV.size() != out_degree(dotV, g) - 1) {
return;
}
} else {
if (otherV.size() != out_degree(dotV, g)) {
return;
}
}
min = 0;
} else {
min = 1;
}
*startBegin = min;
*startEnd = max;
for (auto t : adjacent_vertices_range(dotV, g)) {
if (t != dotV) {
add_edge_if_not_present(g.startDs, t, g);
add_edge_if_not_present(g.start, t, g);
compUnanchoredStarts.insert(t);
}
}
for (auto v : otherV) {
remove_edge(g.start, v, g);
}
DEBUG_PRINTF("removing vertex %u\n", g[dotV].index);
clear_vertex(dotV, g);
dead.insert(dotV);
compAnchoredStarts.erase(dotV);
}
static
void reformUnanchoredRepeatsComponent(NGHolder &g,
set<NFAVertex> &compAnchoredStarts,
set<NFAVertex> &compUnanchoredStarts,
set<NFAVertex> &dead,
depth *startBegin, depth *startEnd) {
// unanchored cases can not have any anchored starts
if (!compAnchoredStarts.empty()) {
DEBUG_PRINTF("we have anchored starts, skipping\n");
return;
}
while (true) {
NFAVertex dotV = NFAGraph::null_vertex();
set<NFAVertex> otherV;
dotV = findReformable(g, compUnanchoredStarts, otherV);
if (dotV == NFAGraph::null_vertex()) {
DEBUG_PRINTF("no candidate reformable dot found.\n");
return;
}
NFAEdge loopEdge;
bool selfLoop = false;
bool bustOut = false;
for (const auto &e : out_edges_range(dotV, g)) {
NFAVertex t = target(e, g);
if (t == dotV) {
selfLoop = true;
loopEdge = e;
continue;
}
if (is_special(t, g)) {
bustOut = true;
break;
}
if (!otherV.empty() && otherV.find(t) == otherV.end()) {
bustOut = true;
break;
}
}
if (bustOut) {
DEBUG_PRINTF("busting out\n");
if (!selfLoop) {
return;
}
for (auto v : otherV) {
if (!edge(dotV, v, g).second) {
return;
}
}
// A self-loop indicates that this is a '.+' or '.*'
DEBUG_PRINTF("self-loop detected on %u\n", g[dotV].index);
*startEnd = depth::infinity();
remove_edge(dotV, dotV, g);
return;
}
if (!isStartNode(dotV, g.startDs, g, true)) {
DEBUG_PRINTF("fleeing: vertex %u has other preds\n", g[dotV].index);
return;
}
/* get bounds */
depth min = 1;
depth max = 1;
if (selfLoop) {
// A self-loop indicates that this is a '.+' or '.*'
DEBUG_PRINTF("self-loop detected\n");
max = depth::infinity();
}
if (!otherV.empty()) {
if (!selfLoop && otherV.size() != out_degree(dotV, g)) {
return;
}
if (selfLoop && otherV.size() != out_degree(dotV, g) - 1) {
return;
}
if (min > depth(1)) {
/* this is not a case we can handle */
DEBUG_PRINTF("min greater than one, skipping\n");
return;
}
min = 0;
}
*startBegin += min;
*startEnd += max;
for (auto v : otherV) {
remove_edge(g.start, v, g);
remove_edge(g.startDs, v, g);
}
compUnanchoredStarts.clear();
for (auto t : adjacent_vertices_range(dotV, g)) {
if (t != dotV) {
DEBUG_PRINTF("connecting sds -> %u\n", g[t].index);
add_edge(g.startDs, t, g);
add_edge(g.start, t, g);
compUnanchoredStarts.insert(t);
}
}
DEBUG_PRINTF("removing vertex %u\n", g[dotV].index);
dead.insert(dotV);
clear_vertex(dotV, g);
compUnanchoredStarts.erase(dotV);
}
}
// for t to be another optional dot, it must have only in-edges from v and from
// starts
static
bool isOptionalDot(NFAVertex t, NFAVertex v, const NGHolder &g) {
if (!is_dot(t, g)) {
return false;
}
bool found_v = false, found_start = false;
for (auto u : inv_adjacent_vertices_range(t, g)) {
if (u == v) {
found_v = true;
} else if (u == g.start || u == g.startDs) {
found_start = true;
} else {
return false;
}
}
return found_v && found_start;
}
static
bool gatherParticipants(const NGHolder &g,
NFAVertex start, NFAVertex initialDot,
set<NFAVertex> &dots, set<NFAVertex> &succ) {
// Walk the graph downwards from the initial dot; each dot will have:
// 1) a single optional dot successor, or
// 2) N successors (our terminating case)
dots.insert(initialDot);
NFAVertex v = initialDot;
while (out_degree(v, g) == 1) {
NFAVertex t = *(adjacent_vertices(v, g).first);
// for t to be another optional dot, it must have only in-edges from v
// and from starts
if (isOptionalDot(t, v, g)) {
// another dot; bail if we've seen it once already
if (dots.find(t) != dots.end()) {
DEBUG_PRINTF("cycle detected at vertex %u\n", g[t].index);
return false;
}
dots.insert(t);
v = t;
continue;
}
// otherwise, we found a terminating dot state
break;
}
// Our terminating states are the successors of v.
// All of these MUST have an edge from start as well.
for (auto w : adjacent_vertices_range(v, g)) {
succ.insert(w);
if (!edge(start, w, g).second) {
DEBUG_PRINTF("failing, vertex %u does not have edge from start\n",
g[w].index);
return false;
}
}
/* All the non-chained vertices connected to start must be in succ as well.
* TODO: remember why (and document). */
for (auto u : adjacent_vertices_range(start, g)) {
if (is_special(u, g)) {
continue;
}
if (!contains(dots, u) && !contains(succ, u)) {
return false;
}
}
return !succ.empty();
}
static
void collapseVariableDotRepeat(NGHolder &g, NFAVertex start,
set<NFAVertex> &dead, UNUSED depth *startBegin,
depth *startEnd) {
// Handle optional dot repeat prefixes, e.g.
// /^.{0,30}foo/s, /^.{0,5}foo/s, unanchored equivs
// Note that this code assumes that fixed repeats ('^.{5,20}') have been
// pruned already, down (in this case) to '^.{0,15}'.
// The first of our optional dots must be connected to start. The jump edge
// past it will be verified in gatherParticipants(). If start is
// graph.start, it should not be connected to startDs.
NFAVertex initialDot = NFAGraph::null_vertex();
for (auto v : adjacent_vertices_range(start, g)) {
if (is_special(v, g)) {
continue;
}
if (is_dot(v, g) && isStartNode(v, start, g, false)) {
if (initialDot) {
return;
}
initialDot = v;
DEBUG_PRINTF("initial dot vertex is %u\n", g[v].index);
}
}
if (!initialDot) {
return;
}
// Collect all the other optional dot vertices and the successor vertices
// by walking down the graph from initialDot
set<NFAVertex> dots, succ;
if (!gatherParticipants(g, start, initialDot, dots, succ)) {
DEBUG_PRINTF("gatherParticipants failed\n");
return;
}
DEBUG_PRINTF("optional dot repeat with %zu participants, "
"terminating in %zu non-dot nodes\n",
dots.size(), succ.size());
// Remove all the participants and set the start offset
dead.insert(dots.begin(), dots.end());
DEBUG_PRINTF("current offsets: %s-%s\n", startBegin->str().c_str(),
startEnd->str().c_str());
if (start == g.start && startEnd->is_infinite()) {
*startEnd = dots.size();
} else if (startEnd->is_finite()) {
*startEnd += dots.size();
}
assert(startEnd->is_reachable());
// For determinism, copy and sort our successor vertices.
deque<NFAVertex> s(succ.begin(), succ.end());
sort(s.begin(), s.end(), make_index_ordering(g));
// Connect our successor vertices to both start and startDs.
for (auto v : s) {
add_edge_if_not_present(g.start, v, g);
add_edge_if_not_present(g.startDs, v, g);
}
}
static
void deleteVertices(set<NFAVertex> &dead, NGHolder &g) {
if (!dead.empty()) {
DEBUG_PRINTF("pruning %zu vertices\n", dead.size());
remove_vertices(dead, g);
}
dead.clear();
}
static
void reformAnchoredRepeats(NGHolder &g, depth *startBegin, depth *startEnd) {
DEBUG_PRINTF("component\n");
set<NFAVertex> anchored, unanchored, dead;
if (!findStarts(g, anchored, unanchored)) {
DEBUG_PRINTF("no starts\n");
return;
}
reformAnchoredRepeatsComponent(g, anchored, unanchored, dead, startBegin,
startEnd);
deleteVertices(dead, g);
reformUnanchoredRepeatsComponent(g, anchored, unanchored, dead, startBegin,
startEnd);
deleteVertices(dead, g);
}
static
void collapseVariableRepeats(NGHolder &g, depth *startBegin, depth *startEnd) {
DEBUG_PRINTF("collapseVariableRepeats\n");
set<NFAVertex> dead;
collapseVariableDotRepeat(g, g.start, dead, startBegin, startEnd);
deleteVertices(dead, g);
collapseVariableDotRepeat(g, g.startDs, dead, startBegin, startEnd);
deleteVertices(dead, g);
}
static
void addDotsBetween(NGHolder &g, NFAVertex lhs, vector<NFAVertex> &rhs,
depth min_repeat, depth max_repeat) {
const bool unbounded = max_repeat.is_infinite();
if (unbounded) {
max_repeat = min_repeat;
}
assert(max_repeat.is_finite());
NFAVertex u = lhs;
if (!min_repeat && unbounded) {
NFAVertex v = add_vertex(g);
add_edge(u, v, g);
g[v].char_reach.setall();
for (auto w : rhs) {
add_edge(lhs, w, g);
}
}
for (u32 i = 0; i < min_repeat; i++) {
NFAVertex v = add_vertex(g);
add_edge(u, v, g);
g[v].char_reach.setall();
u = v;
}
NFAVertex split = u;
/* lhs now split point for optional */
for (u32 i = min_repeat; i < max_repeat; i++) {
NFAVertex v = add_vertex(g);
add_edge(u, v, g);
if (u != split) {
add_edge(split, v, g);
}
g[v].char_reach.setall();
u = v;
}
if (unbounded) {
add_edge(u, u, g);
}
for (auto w : rhs) {
add_edge(u, w, g);
if (split != u) {
add_edge(split, w, g);
}
}
}
static
void restoreLeadingDots(NGHolder &g, const depth &startBegin,
const depth &startEnd) {
if (startBegin == depth(0) && startEnd.is_infinite()) {
return;
}
DEBUG_PRINTF("ungobble (%s, %s)\n", startBegin.str().c_str(),
startEnd.str().c_str());
for (UNUSED auto v : adjacent_vertices_range(g.start, g)) {
assert(edge(g.startDs, v, g).second);
}
clear_out_edges(g.start, g);
add_edge(g.start, g.startDs, g);
const bool unbounded = startEnd.is_infinite();
NFAVertex root = unbounded ? g.startDs : g.start;
vector<NFAVertex> rhs;
insert(&rhs, rhs.end(), adjacent_vertices(g.startDs, g));
rhs.erase(remove(rhs.begin(), rhs.end(), g.startDs), rhs.end());
for (auto v : rhs) {
remove_edge(g.startDs, v, g);
}
addDotsBetween(g, root, rhs, startBegin, startEnd);
g.renumberVertices();
g.renumberEdges();
}
// Entry point.
void reformLeadingDots(NGHolder &g) {
depth startBegin(0);
depth startEnd = depth::infinity();
reformAnchoredRepeats(g, &startBegin, &startEnd);
collapseVariableRepeats(g, &startBegin, &startEnd);
restoreLeadingDots(g, startBegin, startEnd);
}
} // namespace ue2
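
A worked illustration of the pass above, as a hedged sketch rather than code from this commit: the graph is assumed to have been built by the parser for /^.*foo/s, and the comments describe the expected transformation around the single reformLeadingDots() call.

#include "ng_anchored_dots.h"
#include "ng_holder.h"

namespace ue2 {

// Illustrative only: effect of the pass on an anchored dot-star prefix.
static
void exampleReform(NGHolder &g /* graph for /^.*foo/s, built elsewhere */) {
    // Before: 'f' is reachable only from the anchored start, via a
    // self-looping dot vertex.
    reformLeadingDots(g);
    // After: reformAnchoredRepeatsComponent() has removed the dot and
    // recorded bounds of (0, infinity); since that is exactly what the
    // unanchored start (startDs) already expresses, restoreLeadingDots()
    // leaves 'f' hanging off startDs and the graph now behaves like the
    // unanchored /foo/.
}

} // namespace ue2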


@@ -0,0 +1,45 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Analysis pass to reform leading dots.
*/
#ifndef NG_ANCHORED_BOUNDED_REPEATS_H
#define NG_ANCHORED_BOUNDED_REPEATS_H
namespace ue2 {
class NGHolder;
/* should not be used if SoM is required */
void reformLeadingDots(NGHolder &g);
} // namespace ue2
#endif

559
src/nfagraph/ng_asserts.cpp Normal file

@@ -0,0 +1,559 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Resolve special assert vertices.
*
* The assert resolution algorithm proceeds by iterating over those edges with
* assertion flags, considering source and target vertices of each edge. If a
* vertex has a superset of the reachability demanded by the assertion on the
* edge, it is split into alternatives providing the word and non-word paths
* through that vertex.
*
* A great deal of the complexity in the resolveAsserts pass is devoted to
* handling these assertions when the UCP flag is specified (meaning \\w and \\W
* are implemented with Unicode properties, rather than their ASCII
* interpretation) and the prefiltering flag is also used. Complete,
* non-prefiltering UCP support is not available yet.
*/
#include "ng_asserts.h"
#include "ng.h"
#include "ng_prune.h"
#include "ng_redundancy.h"
#include "ng_util.h"
#include "parser/position.h" // for POS flags
#include "util/bitutils.h" // for findAndClearLSB_32
#include "util/boundary_reports.h"
#include "util/container.h"
#include "util/compile_context.h"
#include "util/compile_error.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
#include "util/unicode_def.h"
#include <queue>
using namespace std;
namespace ue2 {
/** \brief Hard limit on the maximum number of vertices we'll clone before we
* throw up our hands and report 'Pattern too large.' */
static const size_t MAX_CLONED_VERTICES = 2048;
/** \brief The definition of \\w, since we use it everywhere in here. */
static const CharReach CHARREACH_WORD(CharReach('a', 'z') |
CharReach('A', 'Z') | CharReach('0', '9') | CharReach('_'));
/** \brief \\W is the inverse of \\w */
static const CharReach CHARREACH_NONWORD(~CHARREACH_WORD);
/** \brief Prefiltering definition of \\w for UCP mode.
*
* Includes all high bytes so as to capture all non-ASCII characters. Strictly,
* depending on direction, only continuer or starter bytes are required, but as
* the input is well-formed this laxness will not cost us. */
static const CharReach CHARREACH_WORD_UCP_PRE(CHARREACH_WORD
| CharReach(128, 255));
/** \brief Prefiltering definition of \\W for UCP Mode.
*
* (non-word already includes high bytes) */
static const CharReach CHARREACH_NONWORD_UCP_PRE(CHARREACH_NONWORD);
/** \brief Find all the edges with assertion flags. */
static
vector<NFAEdge> getAsserts(const NGHolder &g) {
vector<NFAEdge> out;
for (const auto &e : edges_range(g)) {
if (g[e].assert_flags) {
out.push_back(e);
}
}
return out;
}
static
void addToSplit(const NGHolder &g, NFAVertex v, map<u32, NFAVertex> *to_split) {
DEBUG_PRINTF("%u needs splitting\n", g[v].index);
to_split->emplace(g[v].index, v);
}
/** \brief Find vertices that need to be split due to an assertion edge.
*
* A vertex needs to be split if it has an edge to/from it with an assert with
* a restriction on the relevant end. */
static
void findSplitters(const NGHolder &g, const vector<NFAEdge> &asserts,
map<u32, NFAVertex> *to_split,
map<u32, NFAVertex> *to_split_ucp) {
for (const auto &e : asserts) {
NFAVertex u = source(e, g);
NFAVertex v = target(e, g);
u32 flags = g[e].assert_flags;
assert(flags);
const CharReach &u_cr = g[u].char_reach;
const CharReach &v_cr = g[v].char_reach;
bool ucp_assert = flags & UCP_ASSERT_FLAGS;
bool normal_assert = flags & NON_UCP_ASSERT_FLAGS;
/* In reality, an expression can only be entirely ucp or not ucp */
assert(ucp_assert != normal_assert);
if (normal_assert) {
/* Assume any flag means we have to split if the vertex's reach is neither
* a subset of \w nor completely disjoint from it. We could be more nuanced
* if flags is a disjunction of multiple assertions. */
if (!u_cr.isSubsetOf(CHARREACH_WORD)
&& !u_cr.isSubsetOf(CHARREACH_NONWORD)
&& u != g.start) { /* start is always considered a nonword */
addToSplit(g, u, to_split);
}
if (!v_cr.isSubsetOf(CHARREACH_WORD)
&& !v_cr.isSubsetOf(CHARREACH_NONWORD)
&& v != g.accept /* accept requires special handling, done on a
* per-edge basis when resolving asserts */
&& v != g.acceptEod) { /* eod is always considered a nonword */
addToSplit(g, v, to_split);
}
}
if (ucp_assert) {
/* note: the ucp prefilter crs overlap - requires a bit more care */
if (u == g.start) { /* start never needs to be split,
* treat nonword */
} else if (flags & POS_FLAG_ASSERT_WORD_TO_ANY_UCP) {
if (!u_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
&& !u_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
addToSplit(g, u, to_split_ucp);
}
} else {
assert(flags & POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP);
if (!u_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
&& !u_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
addToSplit(g, u, to_split_ucp);
}
}
if (v == g.acceptEod /* eod is always considered a nonword */
|| v == g.accept) { /* accept requires special handling, done on
* a per-edge basis when resolving asserts */
} else if (flags & POS_FLAG_ASSERT_ANY_TO_WORD_UCP) {
if (!v_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
&& !v_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
addToSplit(g, v, to_split_ucp);
}
} else {
assert(flags & POS_FLAG_ASSERT_ANY_TO_NONWORD_UCP);
if (!v_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
&& !v_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
addToSplit(g, v, to_split_ucp);
}
}
}
}
}
static
void setReportId(ReportManager &rm, NGWrapper &g, NFAVertex v, s32 adj) {
// Don't try and set the report ID of a special vertex.
assert(!is_special(v, g));
// If there's a report set already, we're replacing it.
g[v].reports.clear();
Report ir = rm.getBasicInternalReport(g, adj);
g[v].reports.insert(rm.getInternalId(ir));
DEBUG_PRINTF("set report id for vertex %u, adj %d\n", g[v].index, adj);
}
static
NFAVertex makeClone(ReportManager &rm, NGWrapper &g, NFAVertex v,
const CharReach &cr_mask) {
NFAVertex clone = clone_vertex(g, v);
g[clone].char_reach &= cr_mask;
clone_out_edges(g, v, clone);
clone_in_edges(g, v, clone);
if (v == g.startDs) {
if (g.utf8) {
g[clone].char_reach &= ~UTF_START_CR;
}
DEBUG_PRINTF("marked as virt\n");
g[clone].assert_flags = POS_FLAG_VIRTUAL_START;
setReportId(rm, g, clone, 0);
}
return clone;
}
static
void splitVertex(ReportManager &rm, NGWrapper &g, NFAVertex v, bool ucp) {
assert(v != g.start);
assert(v != g.accept);
assert(v != g.acceptEod);
DEBUG_PRINTF("partitioning vertex %u ucp:%d\n", g[v].index, (int)ucp);
CharReach cr_word = ucp ? CHARREACH_WORD_UCP_PRE : CHARREACH_WORD;
CharReach cr_nonword = ucp ? CHARREACH_NONWORD_UCP_PRE : CHARREACH_NONWORD;
auto has_no_assert = [&g](const NFAEdge &e) { return !g[e].assert_flags; };
// Split v into word/nonword vertices with only asserting out-edges.
NFAVertex w_out = makeClone(rm, g, v, cr_word);
NFAVertex nw_out = makeClone(rm, g, v, cr_nonword);
remove_out_edge_if(w_out, has_no_assert, g);
remove_out_edge_if(nw_out, has_no_assert, g);
// Split v into word/nonword vertices with only asserting in-edges.
NFAVertex w_in = makeClone(rm, g, v, cr_word);
NFAVertex nw_in = makeClone(rm, g, v, cr_nonword);
remove_in_edge_if(w_in, has_no_assert, g);
remove_in_edge_if(nw_in, has_no_assert, g);
// Prune edges with asserts from original v.
auto has_assert = [&g](const NFAEdge &e) { return g[e].assert_flags; };
remove_in_edge_if(v, has_assert, g);
remove_out_edge_if(v, has_assert, g);
}
static
void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) {
for (const auto &e : edges_range(g)) {
u32 flags = g[e].assert_flags;
if (!flags) {
continue;
}
NFAVertex u = source(e, g);
NFAVertex v = target(e, g);
assert(u != g.startDs);
const CharReach &u_cr = g[u].char_reach;
const CharReach &v_cr = g[v].char_reach;
bool impassable = true;
bool ucp = flags & UCP_ASSERT_FLAGS;
DEBUG_PRINTF("resolving edge %u->%u (flags=0x%x, ucp=%d)\n", g[u].index,
g[v].index, flags, (int)ucp);
while (flags && impassable) {
u32 flag = 1U << findAndClearLSB_32(&flags);
switch (flag) {
case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
case POS_FLAG_ASSERT_NONWORD_TO_WORD:
if ((u_cr & CHARREACH_NONWORD).none() && u != g.start) {
continue;
}
break;
case POS_FLAG_ASSERT_WORD_TO_NONWORD:
case POS_FLAG_ASSERT_WORD_TO_WORD:
if ((u_cr & CHARREACH_WORD).none() || u == g.start) {
continue;
}
break;
case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
if ((u_cr & ~CHARREACH_NONWORD_UCP_PRE).any() && u != g.start) {
continue;
}
break;
case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
if ((u_cr & ~CHARREACH_WORD_UCP_PRE).any() || u == g.start) {
continue;
}
break;
default:
assert(0);
}
if (v == g.accept) {
/* accept special will need to be treated specially later */
impassable = false;
continue;
}
switch (flag) {
case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
case POS_FLAG_ASSERT_WORD_TO_NONWORD:
if ((v_cr & CHARREACH_NONWORD).none() && v != g.acceptEod) {
continue;
}
break;
case POS_FLAG_ASSERT_WORD_TO_WORD:
case POS_FLAG_ASSERT_NONWORD_TO_WORD:
if ((v_cr & CHARREACH_WORD).none() || v == g.acceptEod) {
continue;
}
break;
case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
if ((v_cr & ~CHARREACH_NONWORD_UCP_PRE).any()
&& v != g.acceptEod) {
continue;
}
break;
case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
if ((v_cr & ~CHARREACH_WORD_UCP_PRE).any()
|| v == g.acceptEod) {
continue;
}
break;
default:
assert(0);
}
impassable = false;
}
if (impassable) {
dead->insert(e);
} else if (v == g.accept && !ucp) {
bool u_w = (u_cr & CHARREACH_NONWORD).none() && u != g.start;
UNUSED bool u_nw = (u_cr & CHARREACH_WORD).none() || u == g.start;
assert(u_w != u_nw);
bool v_w = false;
bool v_nw = false;
flags = g[e].assert_flags;
if (u_w) {
v_w = flags & POS_FLAG_ASSERT_WORD_TO_WORD;
v_nw = flags & POS_FLAG_ASSERT_WORD_TO_NONWORD;
} else {
v_w = flags & POS_FLAG_ASSERT_NONWORD_TO_WORD;
v_nw = flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD;
}
assert(v_w || v_nw);
if (v_w && v_nw) {
/* edge is effectively unconditional */
g[e].assert_flags = 0;
} else if (v_w) {
/* need to add a word byte */
NFAVertex vv = add_vertex(g);
setReportId(rm, g, vv, -1);
g[vv].char_reach = CHARREACH_WORD;
add_edge(vv, g.accept, g);
g[e].assert_flags = 0;
add_edge(u, vv, g[e], g);
dead->insert(e);
} else {
/* need to add a non word byte or see eod */
NFAVertex vv = add_vertex(g);
setReportId(rm, g, vv, -1);
g[vv].char_reach = CHARREACH_NONWORD;
add_edge(vv, g.accept, g);
g[e].assert_flags = 0;
add_edge(u, vv, g[e], g);
if (!edge(u, g.acceptEod, g).second) {
add_edge(u, g.acceptEod, g[e], g);
} else {
/* there may already be a different edge from u to acceptEod;
* if so, we need to make it unconditional and keep it alive */
NFAEdge start_eod = edge(u, g.acceptEod, g).first;
g[start_eod].assert_flags = 0;
dead->erase(start_eod);
}
dead->insert(e);
}
} else if (v == g.accept && ucp) {
DEBUG_PRINTF("resolving ucp assert to accept\n");
assert(u_cr.any());
bool u_w = (u_cr & CHARREACH_WORD_UCP_PRE).any()
&& u != g.start;
bool u_nw = (u_cr & CHARREACH_NONWORD_UCP_PRE).any()
|| u == g.start;
assert(u_w || u_nw);
bool v_w = false;
bool v_nw = false;
flags = g[e].assert_flags;
if (u_w) {
v_w |= flags & POS_FLAG_ASSERT_WORD_TO_WORD_UCP;
v_nw |= flags & POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP;
}
if (u_nw) {
v_w |= flags & POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP;
v_nw |= flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP;
}
assert(v_w || v_nw);
if (v_w && v_nw) {
/* edge is effectively unconditional */
g[e].assert_flags = 0;
} else if (v_w) {
/* need to add a word byte */
NFAVertex vv = add_vertex(g);
setReportId(rm, g, vv, -1);
g[vv].char_reach = CHARREACH_WORD_UCP_PRE;
add_edge(vv, g.accept, g);
g[e].assert_flags = 0;
add_edge(u, vv, g[e], g);
dead->insert(e);
} else {
/* need to add a non word byte or see eod */
NFAVertex vv = add_vertex(g);
setReportId(rm, g, vv, -1);
g[vv].char_reach = CHARREACH_NONWORD_UCP_PRE;
add_edge(vv, g.accept, g);
g[e].assert_flags = 0;
add_edge(u, vv, g[e], g);
if (!edge(u, g.acceptEod, g).second) {
add_edge(u, g.acceptEod, g[e], g);
} else {
/* there may already be a different edge from u to acceptEod;
* if so, we need to make it unconditional and keep it alive */
NFAEdge start_eod = edge(u, g.acceptEod, g).first;
g[start_eod].assert_flags = 0;
dead->erase(start_eod);
}
dead->insert(e);
}
} else {
/* we can remove the asserts as we have partitioned the vertices
* into w/nw around the assert edges
*/
g[e].assert_flags = 0;
}
}
}
void resolveAsserts(ReportManager &rm, NGWrapper &g) {
vector<NFAEdge> asserts = getAsserts(g);
if (asserts.empty()) {
return;
}
map<u32, NFAVertex> to_split; /* by index, for determinism */
map<u32, NFAVertex> to_split_ucp; /* by index, for determinism */
findSplitters(g, asserts, &to_split, &to_split_ucp);
if (to_split.size() + to_split_ucp.size() > MAX_CLONED_VERTICES) {
throw CompileError(g.expressionIndex, "Pattern is too large.");
}
for (const auto &m : to_split) {
assert(!contains(to_split_ucp, m.first));
splitVertex(rm, g, m.second, false);
}
for (const auto &m : to_split_ucp) {
splitVertex(rm, g, m.second, true);
}
set<NFAEdge> dead;
resolveEdges(rm, g, &dead);
remove_edges(dead, g);
g.renumberVertices();
pruneUseless(g);
pruneEmptyVertices(g);
g.renumberVertices();
g.renumberEdges();
clearReports(g);
}
void ensureCodePointStart(ReportManager &rm, NGWrapper &g) {
/* In utf8 mode there is an implicit assertion that we start at codepoint
* boundaries. Assert resolution handles the badness coming from asserts.
* The only other source of trouble is startDs->accept connections.
*/
bool exists;
NFAEdge orig;
tie(orig, exists) = edge(g.startDs, g.accept, g);
if (g.utf8 && exists) {
DEBUG_PRINTF("rectifying %u\n", g.reportId);
Report ir = rm.getBasicInternalReport(g);
ReportID rep = rm.getInternalId(ir);
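/* Build a small acceptor for "one whole UTF-8 codepoint, then accept":
* v_a matches a single ASCII byte directly, while v_2/v_3/v_4 match the
* lead byte of a 2/3/4-byte sequence and chain through continuation-byte
* vertices (v_4c -> v_3c -> v_c) before reaching accept. The original
* startDs->accept edge is removed at the end.
*/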
NFAVertex v_a = add_vertex(g);
g[v_a].assert_flags = POS_FLAG_VIRTUAL_START;
g[v_a].char_reach = UTF_ASCII_CR;
add_edge(v_a, g.accept, g[orig], g);
NFAVertex v_2 = add_vertex(g);
g[v_2].assert_flags = POS_FLAG_VIRTUAL_START;
g[v_2].char_reach = CharReach(UTF_TWO_BYTE_MIN, UTF_TWO_BYTE_MAX);
NFAVertex v_3 = add_vertex(g);
g[v_3].assert_flags = POS_FLAG_VIRTUAL_START;
g[v_3].char_reach = CharReach(UTF_THREE_BYTE_MIN, UTF_THREE_BYTE_MAX);
NFAVertex v_4 = add_vertex(g);
g[v_4].assert_flags = POS_FLAG_VIRTUAL_START;
g[v_4].char_reach = CharReach(UTF_FOUR_BYTE_MIN, UTF_FOUR_BYTE_MAX);
NFAVertex v_c = add_vertex(g);
g[v_c].assert_flags = POS_FLAG_VIRTUAL_START;
g[v_c].char_reach = UTF_CONT_CR;
add_edge(v_c, g.accept, g[orig], g);
add_edge(v_2, v_c, g);
NFAVertex v_3c = add_vertex(g);
g[v_3c].assert_flags = POS_FLAG_VIRTUAL_START;
g[v_3c].char_reach = UTF_CONT_CR;
add_edge(v_3c, v_c, g);
add_edge(v_3, v_3c, g);
NFAVertex v_4c = add_vertex(g);
g[v_4c].assert_flags = POS_FLAG_VIRTUAL_START;
g[v_4c].char_reach = UTF_CONT_CR;
add_edge(v_4c, v_3c, g);
add_edge(v_4, v_4c, g);
g[v_a].reports.insert(rep);
g[v_c].reports.insert(rep);
add_edge(g.start, v_a, g);
add_edge(g.startDs, v_a, g);
add_edge(g.start, v_2, g);
add_edge(g.startDs, v_2, g);
add_edge(g.start, v_3, g);
add_edge(g.startDs, v_3, g);
add_edge(g.start, v_4, g);
add_edge(g.startDs, v_4, g);
remove_edge(orig, g);
g.renumberEdges();
}
}
} // namespace ue2

48
src/nfagraph/ng_asserts.h Normal file
View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Resolve special assert vertices.
*/
#ifndef NG_ASSERTS_H
#define NG_ASSERTS_H
namespace ue2 {
struct BoundaryReports;
class NGWrapper;
class ReportManager;
void resolveAsserts(ReportManager &rm, NGWrapper &g);
void ensureCodePointStart(ReportManager &rm, NGWrapper &g);
} // namespace ue2
#endif // NG_ASSERTS_H

278
src/nfagraph/ng_builder.cpp Normal file
View File

@@ -0,0 +1,278 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA Graph Builder: used by the Glushkov construction to build an
* NGWrapper from a parsed expression.
*/
#include "grey.h"
#include "ng.h"
#include "ng_builder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "compiler/compiler.h" // for ParsedExpression
#include "util/compile_error.h"
#include "util/make_unique.h"
#include <cassert>
using namespace std;
namespace ue2 {
namespace {
/** Concrete implementation of NFABuilder interface. */
class NFABuilderImpl : public NFABuilder {
public:
NFABuilderImpl(ReportManager &rm, const Grey &grey,
const ParsedExpression &expr);
~NFABuilderImpl() override;
Position makePositions(size_t nPositions) override;
Position getStart() const override;
Position getStartDotStar() const override;
Position getAccept() const override;
Position getAcceptEOD() const override;
bool isSpecialState(Position p) const override;
void setNodeReportID(Position position, int offsetAdjust) override;
void addCharReach(Position position, const CharReach &cr) override;
void setAssertFlag(Position position, u32 flag) override;
u32 getAssertFlag(Position position) override;
void addVertex(Position p) override;
void addEdge(Position start, Position end) override;
bool hasEdge(Position start, Position end) const override;
u32 numVertices() const override { return vertIdx; }
void cloneRegion(Position first, Position last,
unsigned posOffset) override;
unique_ptr<NGWrapper> getGraph() override;
private:
/** fetch a vertex given its Position ID. */
NFAVertex getVertex(Position pos) const;
/** \brief Internal convenience function to add an edge (u, v). */
pair<NFAEdge, bool> addEdge(NFAVertex u, NFAVertex v);
/** \brief We use the ReportManager to hand out new internal reports. */
ReportManager &rm;
/** \brief Greybox: used for resource limits. */
const Grey &grey;
/** \brief Underlying NGWrapper graph. */
unique_ptr<NGWrapper> graph;
/** \brief mapping from position to vertex. Use \ref getVertex for access. */
vector<NFAVertex> id2vertex;
/** \brief Index of next vertex. */
u32 vertIdx;
}; // class NFABuilderImpl
} // namespace
NFABuilderImpl::NFABuilderImpl(ReportManager &rm_in, const Grey &grey_in,
const ParsedExpression &expr)
: rm(rm_in), grey(grey_in),
graph(ue2::make_unique<NGWrapper>(
expr.index, expr.highlander, expr.utf8, expr.prefilter, expr.som,
expr.id, expr.min_offset, expr.max_offset, expr.min_length)),
vertIdx(N_SPECIALS) {
// Reserve space for a reasonably-sized NFA
id2vertex.reserve(64);
id2vertex.resize(N_SPECIALS);
id2vertex[NODE_START] = graph->start;
id2vertex[NODE_START_DOTSTAR] = graph->startDs;
id2vertex[NODE_ACCEPT] = graph->accept;
id2vertex[NODE_ACCEPT_EOD] = graph->acceptEod;
}
NFABuilderImpl::~NFABuilderImpl() {
// empty
}
NFAVertex NFABuilderImpl::getVertex(Position pos) const {
assert(id2vertex.size() >= pos);
const NFAVertex v = id2vertex[pos];
assert(v != NFAGraph::null_vertex());
assert(graph->g[v].index == pos);
return v;
}
void NFABuilderImpl::addVertex(Position pos) {
// Enforce resource limit.
if (pos > grey.limitGraphVertices) {
throw CompileError("Pattern too large.");
}
NFAVertex v = add_vertex(*graph);
if (id2vertex.size() <= pos) {
id2vertex.resize(pos + 1);
}
id2vertex[pos] = v;
graph->g[v].index = pos;
}
unique_ptr<NGWrapper> NFABuilderImpl::getGraph() {
DEBUG_PRINTF("built graph has %zu vertices and %zu edges\n",
num_vertices(*graph), num_edges(*graph));
if (num_edges(*graph) > grey.limitGraphEdges) {
throw CompileError("Pattern too large.");
}
if (num_vertices(*graph) > grey.limitGraphVertices) {
throw CompileError("Pattern too large.");
}
return move(graph);
}
void NFABuilderImpl::setNodeReportID(Position pos, int offsetAdjust) {
Report ir = rm.getBasicInternalReport(*graph, offsetAdjust);
DEBUG_PRINTF("setting report id on %u = (%u, %d, %u)\n",
pos, graph->reportId, offsetAdjust, ir.ekey);
NFAVertex v = getVertex(pos);
auto &reports = (*graph)[v].reports;
reports.clear();
reports.insert(rm.getInternalId(ir));
}
void NFABuilderImpl::addCharReach(Position pos, const CharReach &cr) {
NFAVertex v = getVertex(pos);
graph->g[v].char_reach |= cr;
}
void NFABuilderImpl::setAssertFlag(Position pos, u32 flag) {
NFAVertex v = getVertex(pos);
graph->g[v].assert_flags |= flag;
}
u32 NFABuilderImpl::getAssertFlag(Position pos) {
NFAVertex v = getVertex(pos);
return graph->g[v].assert_flags;
}
pair<NFAEdge, bool> NFABuilderImpl::addEdge(NFAVertex u, NFAVertex v) {
// assert that the edge doesn't already exist
assert(edge(u, v, graph->g).second == false);
pair<NFAEdge, bool> e = add_edge(u, v, *graph);
assert(e.second);
return e;
}
void NFABuilderImpl::addEdge(Position startPos, Position endPos) {
DEBUG_PRINTF("%u -> %u\n", startPos, endPos);
assert(startPos < vertIdx);
assert(endPos < vertIdx);
NFAVertex u = getVertex(startPos);
NFAVertex v = getVertex(endPos);
if ((u == graph->start || u == graph->startDs) && v == graph->startDs) {
/* standard special -> special edges already exist */
assert(edge(u, v, graph->g).second == true);
return;
}
assert(edge(u, v, graph->g).second == false);
addEdge(u, v);
}
bool NFABuilderImpl::hasEdge(Position startPos, Position endPos) const {
return edge(getVertex(startPos), getVertex(endPos), graph->g).second;
}
Position NFABuilderImpl::getStart() const {
return NODE_START;
}
Position NFABuilderImpl::getStartDotStar() const {
return NODE_START_DOTSTAR;
}
Position NFABuilderImpl::getAccept() const {
return NODE_ACCEPT;
}
Position NFABuilderImpl::getAcceptEOD() const {
return NODE_ACCEPT_EOD;
}
bool NFABuilderImpl::isSpecialState(Position p) const {
return (p == NODE_START || p == NODE_START_DOTSTAR ||
p == NODE_ACCEPT || p == NODE_ACCEPT_EOD);
}
Position NFABuilderImpl::makePositions(size_t nPositions) {
Position base = vertIdx;
for (size_t i = 0; i < nPositions; i++) {
addVertex(vertIdx++);
}
DEBUG_PRINTF("built %zu positions from base %u\n", nPositions, base);
return base;
}
void NFABuilderImpl::cloneRegion(Position first, Position last, unsigned posOffset) {
NFAGraph &g = graph->g;
assert(posOffset > 0);
// walk the nodes between first and last and copy their vertex properties
DEBUG_PRINTF("cloning nodes in [%u, %u], offset %u\n", first, last,
posOffset);
for (Position i = first; i <= last; ++i) {
NFAVertex orig = getVertex(i);
Position destIdx = i + posOffset;
assert(destIdx < vertIdx);
NFAVertex dest = getVertex(destIdx);
g[dest] = g[orig]; // all properties
g[dest].index = destIdx;
}
}
unique_ptr<NFABuilder> makeNFABuilder(ReportManager &rm, const CompileContext &cc,
const ParsedExpression &expr) {
return ue2::make_unique<NFABuilderImpl>(rm, cc.grey, expr);
}
NFABuilder::~NFABuilder() { }
} // namespace ue2

99
src/nfagraph/ng_builder.h Normal file
View File

@@ -0,0 +1,99 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA Graph Builder: used by the Glushkov construction to build an
* NGWrapper from a parsed expression.
*/
#ifndef NG_BUILDER_H
#define NG_BUILDER_H
#include "ue2common.h"
#include "parser/position.h"
#include <memory>
#include <boost/core/noncopyable.hpp>
namespace ue2 {
class CharReach;
class NGWrapper;
class ReportManager;
struct CompileContext;
class ParsedExpression;
/** \brief Abstract builder interface. Use \ref makeNFABuilder to construct
* one. Used by GlushkovBuildState. */
class NFABuilder : boost::noncopyable {
public:
virtual ~NFABuilder();
virtual Position makePositions(size_t nPositions) = 0;
virtual Position getStart() const = 0;
virtual Position getStartDotStar() const = 0;
virtual Position getAccept() const = 0;
virtual Position getAcceptEOD() const = 0;
virtual bool isSpecialState(Position p) const = 0;
virtual void setNodeReportID(Position position, int offsetAdjust) = 0;
virtual void addCharReach(Position position, const CharReach &cr) = 0;
/* OR in (and query) vertex assertion flags */
virtual void setAssertFlag(Position position, u32 flag) = 0;
virtual u32 getAssertFlag(Position position) = 0;
virtual void addVertex(Position p) = 0;
virtual void addEdge(Position start, Position end) = 0;
virtual bool hasEdge(Position start, Position end) const = 0;
virtual u32 numVertices() const = 0;
virtual void cloneRegion(Position first, Position last,
unsigned posOffset) = 0;
/**
* \brief Returns the built NGWrapper graph.
* Note that this builder cannot be used after this call.
*/
virtual std::unique_ptr<NGWrapper> getGraph() = 0;
};
/** Construct a usable NFABuilder. */
std::unique_ptr<NFABuilder> makeNFABuilder(ReportManager &rm,
const CompileContext &cc,
const ParsedExpression &expr);
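/* Illustrative usage sketch. It assumes a ReportManager rm, CompileContext cc
* and ParsedExpression expr are already available from the compiler
* front-end, and that CharReach can be constructed from a single character.
*
* \code
*     auto builder = makeNFABuilder(rm, cc, expr);
*     Position base = builder->makePositions(2); // two new positions
*     builder->addCharReach(base, CharReach('a'));
*     builder->addCharReach(base + 1, CharReach('b'));
*     builder->addEdge(builder->getStart(), base);
*     builder->addEdge(base, base + 1);
*     builder->addEdge(base + 1, builder->getAccept());
*     builder->setNodeReportID(base + 1, 0); // report with no offset adjust
*     std::unique_ptr<NGWrapper> g = builder->getGraph();
* \endcode
*/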
} // namespace ue2
#endif

422
src/nfagraph/ng_calc_components.cpp Normal file
View File

@@ -0,0 +1,422 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Splits an NFA graph into its connected components.
*
* This pass takes a NGHolder and splits its graph into a set of connected
* components, returning them as individual NGHolder graphs. For example, the
* graph for the regex /foo.*bar|[a-z]{7,13}|hatstand|teakettle$/ will be split
* into four NGHolders, representing these four components:
*
* - /foo.*bar/
* - /[a-z]{7,13}/
* - /hatstand/
* - /teakettle$/
*
* The pass operates by creating an undirected graph from the input graph, and
* then using the BGL's connected_components algorithm to do the work, cloning
* the identified components into their own graphs. A "shell" of vertices
* is identified and removed first from the head and tail of the graph, in
* order to handle cases where there is a common head/tail region.
*
* Trivial cases, such as an alternation of single vertices like /a|b|c|d|e|f/,
* are not split, as later optimisations will handle these cases efficiently.
*/
#include "ng_calc_components.h"
#include "ng_depth.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_undirected.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include <map>
#include <vector>
#include <boost/graph/connected_components.hpp>
using namespace std;
namespace ue2 {
static constexpr u32 MAX_HEAD_SHELL_DEPTH = 3;
static constexpr u32 MAX_TAIL_SHELL_DEPTH = 3;
/**
* \brief Returns true if the whole graph is just an alternation of character
* classes.
*/
bool isAlternationOfClasses(const NGHolder &g) {
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
// Vertex must have in edges from starts only.
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (!is_any_start(u, g)) {
return false;
}
}
// Vertex must have out edges to accepts only.
for (auto w : adjacent_vertices_range(v, g)) {
if (!is_any_accept(w, g)) {
return false;
}
}
}
DEBUG_PRINTF("alternation of single states, treating as one comp\n");
return true;
}
/**
* \brief Compute initial max distance to v from start (i.e. ignoring its own
* self-loop).
*/
static
depth max_dist_from_start(const NGHolder &g,
const vector<NFAVertexBidiDepth> &depths,
NFAVertex v) {
depth max_depth(0);
for (const auto u : inv_adjacent_vertices_range(v, g)) {
if (u == v) {
continue;
}
const auto &d = depths.at(g[u].index);
if (d.fromStart.max.is_reachable()) {
max_depth = max(max_depth, d.fromStart.max);
}
if (d.fromStartDotStar.max.is_reachable()) {
max_depth = max(max_depth, d.fromStartDotStar.max);
}
}
return max_depth + 1;
}
/**
* \brief Compute initial max distance from v to accept (i.e. ignoring its own
* self-loop).
*/
static
depth max_dist_to_accept(const NGHolder &g,
const vector<NFAVertexBidiDepth> &depths,
NFAVertex v) {
depth max_depth(0);
for (const auto w : adjacent_vertices_range(v, g)) {
if (w == v) {
continue;
}
const auto &d = depths.at(g[w].index);
if (d.toAccept.max.is_reachable()) {
max_depth = max(max_depth, d.toAccept.max);
}
if (d.toAcceptEod.max.is_reachable()) {
max_depth = max(max_depth, d.toAcceptEod.max);
}
}
return max_depth + 1;
}
static
flat_set<NFAVertex> findHeadShell(const NGHolder &g,
const vector<NFAVertexBidiDepth> &depths,
const depth &max_dist) {
flat_set<NFAVertex> shell;
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
if (max_dist_from_start(g, depths, v) <= max_dist) {
shell.insert(v);
}
}
for (UNUSED auto v : shell) {
DEBUG_PRINTF("shell: %u\n", g[v].index);
}
return shell;
}
static
flat_set<NFAVertex> findTailShell(const NGHolder &g,
const vector<NFAVertexBidiDepth> &depths,
const depth &max_dist) {
flat_set<NFAVertex> shell;
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
if (max_dist_to_accept(g, depths, v) <= max_dist) {
shell.insert(v);
}
}
for (UNUSED auto v : shell) {
DEBUG_PRINTF("shell: %u\n", g[v].index);
}
return shell;
}
static
vector<NFAEdge> findShellEdges(const NGHolder &g,
const flat_set<NFAVertex> &head_shell,
const flat_set<NFAVertex> &tail_shell) {
vector<NFAEdge> shell_edges;
for (const auto &e : edges_range(g)) {
auto u = source(e, g);
auto v = target(e, g);
if (v == g.startDs && is_any_start(u, g)) {
continue;
}
if (u == g.accept && v == g.acceptEod) {
continue;
}
if ((is_special(u, g) || contains(head_shell, u)) &&
(is_special(v, g) || contains(tail_shell, v))) {
DEBUG_PRINTF("edge (%u,%u) is a shell edge\n", g[u].index, g[v].index);
shell_edges.push_back(e);
}
}
return shell_edges;
}
static
void removeVertices(const flat_set<NFAVertex> &verts, NFAUndirectedGraph &ug,
ue2::unordered_map<NFAVertex, NFAUndirectedVertex> &old2new,
ue2::unordered_map<NFAVertex, NFAUndirectedVertex> &new2old) {
for (auto v : verts) {
assert(contains(old2new, v));
auto uv = old2new.at(v);
clear_vertex(uv, ug);
remove_vertex(uv, ug);
old2new.erase(v);
new2old.erase(uv);
}
}
static
void renumberVertices(NFAUndirectedGraph &ug) {
u32 vertexIndex = 0;
for (auto uv : vertices_range(ug)) {
put(boost::vertex_index, ug, uv, vertexIndex++);
}
}
/**
* Common code called by calc- and recalc- below. Splits the given holder into
* one or more connected components, adding them to the comps deque.
*/
static
void splitIntoComponents(const NGHolder &g, deque<unique_ptr<NGHolder>> &comps,
const depth &max_head_depth,
const depth &max_tail_depth, bool *shell_comp) {
DEBUG_PRINTF("graph has %zu vertices\n", num_vertices(g));
assert(shell_comp);
*shell_comp = false;
// Compute "shell" head and tail subgraphs.
vector<NFAVertexBidiDepth> depths;
calcDepths(g, depths);
auto head_shell = findHeadShell(g, depths, max_head_depth);
auto tail_shell = findTailShell(g, depths, max_tail_depth);
for (auto v : head_shell) {
tail_shell.erase(v);
}
if (head_shell.size() + tail_shell.size() + N_SPECIALS >= num_vertices(g)) {
DEBUG_PRINTF("all in shell component\n");
comps.push_back(cloneHolder(g));
*shell_comp = true;
return;
}
vector<NFAEdge> shell_edges = findShellEdges(g, head_shell, tail_shell);
DEBUG_PRINTF("%zu vertices in head, %zu in tail, %zu shell edges\n",
head_shell.size(), tail_shell.size(), shell_edges.size());
NFAUndirectedGraph ug;
ue2::unordered_map<NFAVertex, NFAUndirectedVertex> old2new;
ue2::unordered_map<u32, NFAVertex> newIdx2old;
createUnGraph(g.g, true, true, ug, old2new, newIdx2old);
// Construct reverse mapping.
ue2::unordered_map<NFAVertex, NFAUndirectedVertex> new2old;
for (const auto &m : old2new) {
new2old.emplace(m.second, m.first);
}
// Remove shells from undirected graph and renumber so we have dense
// vertex indices.
removeVertices(head_shell, ug, old2new, new2old);
removeVertices(tail_shell, ug, old2new, new2old);
renumberVertices(ug);
map<NFAUndirectedVertex, u32> split_components;
const u32 num = connected_components(
ug, boost::make_assoc_property_map(split_components));
assert(num > 0);
if (num == 1 && shell_edges.empty()) {
DEBUG_PRINTF("single component\n");
comps.push_back(cloneHolder(g));
return;
}
DEBUG_PRINTF("broke graph into %u components\n", num);
vector<deque<NFAVertex>> verts(num);
// Collect vertex lists per component.
for (const auto &m : split_components) {
NFAVertex uv = m.first;
u32 c = m.second;
assert(contains(new2old, uv));
NFAVertex v = new2old.at(uv);
verts[c].push_back(v);
DEBUG_PRINTF("vertex %u is in comp %u\n", g[v].index, c);
}
ue2::unordered_map<NFAVertex, NFAVertex> v_map; // temp map for fillHolder
for (auto &vv : verts) {
// Shells are in every component.
vv.insert(vv.end(), begin(head_shell), end(head_shell));
vv.insert(vv.end(), begin(tail_shell), end(tail_shell));
// Sort by vertex index for determinism.
sort(begin(vv), end(vv), VertexIndexOrdering<NGHolder>(g));
auto gc = ue2::make_unique<NGHolder>();
v_map.clear();
fillHolder(gc.get(), g, vv, &v_map);
// Remove shell edges, which will get their own component.
for (const auto &e : shell_edges) {
auto cu = v_map.at(source(e, g));
auto cv = v_map.at(target(e, g));
assert(edge(cu, cv, *gc).second);
remove_edge(cu, cv, *gc);
}
pruneUseless(*gc);
DEBUG_PRINTF("component %zu has %zu vertices\n", comps.size(),
num_vertices(*gc));
comps.push_back(move(gc));
}
// Another component to handle the direct shell-to-shell edges.
if (!shell_edges.empty()) {
deque<NFAVertex> vv;
vv.insert(vv.end(), begin(head_shell), end(head_shell));
vv.insert(vv.end(), begin(tail_shell), end(tail_shell));
// Sort by vertex index for determinism.
sort(begin(vv), end(vv), VertexIndexOrdering<NGHolder>(g));
auto gc = ue2::make_unique<NGHolder>();
v_map.clear();
fillHolder(gc.get(), g, vv, &v_map);
pruneUseless(*gc);
DEBUG_PRINTF("shell edge component %zu has %zu vertices\n",
comps.size(), num_vertices(*gc));
comps.push_back(move(gc));
*shell_comp = true;
}
// We should never produce empty component graphs.
assert(all_of(begin(comps), end(comps),
[](const unique_ptr<NGHolder> &g_comp) {
return num_vertices(*g_comp) > N_SPECIALS;
}));
}
deque<unique_ptr<NGHolder>> calcComponents(const NGHolder &g) {
deque<unique_ptr<NGHolder>> comps;
// For trivial cases, we needn't bother running the full
// connected_components algorithm.
if (isAlternationOfClasses(g)) {
comps.push_back(cloneHolder(g));
return comps;
}
bool shell_comp = false;
splitIntoComponents(g, comps, MAX_HEAD_SHELL_DEPTH, MAX_TAIL_SHELL_DEPTH,
&shell_comp);
if (shell_comp) {
DEBUG_PRINTF("re-running on shell comp\n");
assert(!comps.empty());
auto sc = move(comps.back());
comps.pop_back();
splitIntoComponents(*sc, comps, 0, 0, &shell_comp);
}
DEBUG_PRINTF("finished; split into %zu components\n", comps.size());
return comps;
}
void recalcComponents(deque<unique_ptr<NGHolder>> &comps) {
deque<unique_ptr<NGHolder>> out;
for (auto &gc : comps) {
if (!gc) {
continue; // graph has been consumed already.
}
if (isAlternationOfClasses(*gc)) {
out.push_back(move(gc));
continue;
}
auto gc_comps = calcComponents(*gc);
for (auto &elem : gc_comps) {
out.push_back(move(elem));
}
}
// Replace comps with our recalculated list.
comps.swap(out);
}
} // namespace ue2

51
src/nfagraph/ng_calc_components.h Normal file
View File

@@ -0,0 +1,51 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Splits an NFA graph into its connected components.
*/
#ifndef NG_CALC_COMPONENTS_H
#define NG_CALC_COMPONENTS_H
#include <deque>
#include <memory>
namespace ue2 {
class NGHolder;
bool isAlternationOfClasses(const NGHolder &g);
std::deque<std::unique_ptr<NGHolder>> calcComponents(const NGHolder &g);
void recalcComponents(std::deque<std::unique_ptr<NGHolder>> &comps);
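/* Illustrative usage sketch: assumes an already-built NGHolder g.
*
* \code
*     std::deque<std::unique_ptr<NGHolder>> comps = calcComponents(g);
*     for (const auto &comp : comps) {
*         // each comp is an independent NGHolder that can be processed
*         // (and later re-split via recalcComponents) on its own
*     }
* \endcode
*/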
} // namespace ue2
#endif

264
src/nfagraph/ng_cyclic_redundancy.cpp Normal file
View File

@@ -0,0 +1,264 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Cyclic Path Redundancy pass. Removes redundant vertices on paths
* leading to a cyclic repeat.
*
* This is a graph reduction pass intended to remove vertices that are
* redundant because they lead solely to a cyclic vertex with a superset of
* their character reachability. For example, in this pattern:
*
* /(abc|def|abcghi).*0123/s
*
* The vertices for 'ghi' can be removed due to the presence of the dot-star
* repeat.
*
* Algorithm:
*
* for each cyclic vertex V:
* for each proper predecessor U of V:
* let S be the set of successors of U that are successors of V
* (including V itself)
* for each successor W of U not in S:
* perform a DFS forward from W, stopping exploration when a vertex
* in S is encountered;
* if a vertex with reach not in reach(V) or an accept is encountered:
* fail and continue to the next W.
* else:
* remove (U, W)
*
* NOTE: the following code is templated not just for fun, but so that we can
* run this analysis both forward and in reverse over the graph.
*/
#include "ng_cyclic_redundancy.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/reverse_graph.hpp>
using namespace std;
using boost::reverse_graph;
namespace ue2 {
namespace {
// Terminator function for depth first traversal, tells us not to explore
// beyond vertices in set S.
template<class Vertex, class Graph>
class VertexInSet {
public:
explicit VertexInSet(const flat_set<Vertex> &s) : verts(s) {}
bool operator()(const Vertex &v, const Graph&) const {
return contains(verts, v);
}
private:
const flat_set<Vertex> &verts;
};
struct SearchFailed {};
// Visitor for depth first traversal: throws SearchFailed if we encounter a
// special vertex, a vertex with assert flags, or a vertex with bad reach.
class SearchVisitor : public boost::default_dfs_visitor {
public:
explicit SearchVisitor(const CharReach &r) : cr(r) {}
template<class Vertex, class Graph>
void discover_vertex(const Vertex &v, const Graph &g) const {
DEBUG_PRINTF("vertex %u\n", g[v].index);
if (is_special(v, g)) {
DEBUG_PRINTF("start or accept\n");
throw SearchFailed();
}
if (g[v].assert_flags) {
DEBUG_PRINTF("assert flags\n");
throw SearchFailed();
}
const CharReach &vcr = g[v].char_reach;
if (vcr != (vcr & cr)) {
DEBUG_PRINTF("bad reach\n");
throw SearchFailed();
}
}
private:
const CharReach &cr;
};
} // namespace
template<class Graph>
static
bool searchForward(const Graph &g, const CharReach &reach,
const flat_set<typename Graph::vertex_descriptor> &s,
typename Graph::vertex_descriptor w) {
map<NFAVertex, boost::default_color_type> colours;
try {
depth_first_visit(g, w, SearchVisitor(reach),
make_assoc_property_map(colours),
VertexInSet<typename Graph::vertex_descriptor, Graph>(s));
} catch (SearchFailed&) {
return false;
}
return true;
}
static
NFAEdge to_raw(const NFAEdge &e, const NFAGraph &, const NGHolder &) {
return e;
}
static
NFAEdge to_raw(const reverse_graph<NFAGraph, NFAGraph&>::edge_descriptor &e,
const reverse_graph<NFAGraph, NFAGraph&> &g,
const NGHolder &raw) {
/* clang doesn't seem to like edge_underlying */
NFAVertex t = source(e, g);
NFAVertex s = target(e, g);
assert(edge(s, t, raw).second);
return edge(s, t, raw).first;
}
/* returns true if any edges were removed */
template<class Graph>
static
bool removeCyclicPathRedundancy(Graph &g, typename Graph::vertex_descriptor v,
NGHolder &raw) {
bool did_stuff = false;
const CharReach &reach = g[v].char_reach;
typedef typename Graph::vertex_descriptor vertex_descriptor;
// precalc successors of v.
flat_set<vertex_descriptor> succ_v;
insert(&succ_v, adjacent_vertices(v, g));
flat_set<vertex_descriptor> s;
for (const auto &e : in_edges_range(v, g)) {
vertex_descriptor u = source(e, g);
if (u == v) {
continue;
}
if (is_any_accept(u, g)) {
continue;
}
DEBUG_PRINTF("- checking u %u\n", g[u].index);
// let s be intersection(succ(u), succ(v))
s.clear();
for (auto b : adjacent_vertices_range(u, g)) {
if (contains(succ_v, b)) {
s.insert(b);
}
}
for (const auto &e_u : make_vector_from(out_edges(u, g))) {
vertex_descriptor w = target(e_u, g);
if (is_special(w, g) || contains(s, w)) {
continue;
}
const CharReach &w_reach = g[w].char_reach;
if (!w_reach.isSubsetOf(reach)) {
continue;
}
DEBUG_PRINTF(" - checking w %u\n", g[w].index);
if (searchForward(g, reach, s, w)) {
DEBUG_PRINTF("removing edge (%u,%u)\n",
g[u].index, g[w].index);
/* we are currently iterating over the in-edges of v, so it
would be unwise to remove edges to v. However, */
assert(w != v); /* as v is in s */
remove_edge(to_raw(e_u, g, raw), raw);
did_stuff = true;
}
}
}
return did_stuff;
}
template<class Graph>
static
bool cyclicPathRedundancyPass(Graph &g, NGHolder &raw) {
bool did_stuff = false;
for (auto v : vertices_range(g)) {
if (is_special(v, g) || !edge(v, v, g).second) {
continue;
}
DEBUG_PRINTF("examining cyclic vertex %u\n", g[v].index);
did_stuff |= removeCyclicPathRedundancy(g, v, raw);
}
return did_stuff;
}
bool removeCyclicPathRedundancy(NGHolder &g) {
// Forward pass.
bool f_changed = cyclicPathRedundancyPass(g.g, g);
if (f_changed) {
DEBUG_PRINTF("edges removed by forward pass\n");
pruneUseless(g);
}
// Reverse pass.
DEBUG_PRINTF("REVERSE PASS\n");
typedef reverse_graph<NFAGraph, NFAGraph&> RevGraph;
RevGraph revg(g.g);
bool r_changed = cyclicPathRedundancyPass(revg, g);
if (r_changed) {
DEBUG_PRINTF("edges removed by reverse pass\n");
pruneUseless(g);
}
return f_changed || r_changed;
}
} // namespace ue2

45
src/nfagraph/ng_cyclic_redundancy.h Normal file
View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Cyclic Path Redundancy pass. Removes redundant vertices on paths
* leading to a cyclic repeat.
*/
#ifndef NG_CYCLIC_REDUNDANCY_H
#define NG_CYCLIC_REDUNDANCY_H
namespace ue2 {
class NGHolder;
bool removeCyclicPathRedundancy(NGHolder &g);
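/* Illustrative usage sketch: the return value reports whether any edges were
* removed, so callers can decide whether further reduction passes are worth
* re-running.
*
* \code
*     if (removeCyclicPathRedundancy(g)) {
*         // graph was simplified; other reduction passes may now apply
*     }
* \endcode
*/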
} // namespace ue2
#endif

383
src/nfagraph/ng_depth.cpp Normal file
View File

@@ -0,0 +1,383 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph vertex depth calculations.
*/
#include "ng_depth.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/graph_range.h"
#include <deque>
#include <vector>
#include <boost/graph/dag_shortest_paths.hpp>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/filtered_graph.hpp>
#include <boost/graph/reverse_graph.hpp>
#include <boost/graph/topological_sort.hpp>
#include <boost/graph/property_maps/constant_property_map.hpp>
using namespace std;
using boost::filtered_graph;
using boost::make_constant_property;
using boost::reverse_graph;
namespace ue2 {
namespace {
/** Distance value used to indicate that the vertex can't be reached. */
static const int DIST_UNREACHABLE = INT_MAX;
/**
* Distance value used to indicate that the distance to a vertex is infinite
* (for example, it's the max distance and there's a cycle in the path) or so
* large that we should consider it effectively infinite.
*/
static const int DIST_INFINITY = INT_MAX - 1;
//
// Filters
//
template <class GraphT>
struct NodeFilter {
typedef typename GraphT::edge_descriptor EdgeT;
NodeFilter() { }
NodeFilter(const vector<bool> *bad_in, const GraphT *g_in)
: bad(bad_in), g(g_in) { }
bool operator()(const EdgeT &e) const {
u32 src_idx = (*g)[source(e, *g)].index;
u32 tar_idx = (*g)[target(e, *g)].index;
if (tar_idx == NODE_START_DOTSTAR) {
return false;
}
return !(*bad)[src_idx] && !(*bad)[tar_idx];
}
const vector<bool> *bad;
const GraphT *g;
};
template <class GraphT>
struct StartFilter {
typedef typename GraphT::edge_descriptor EdgeT;
StartFilter() { }
explicit StartFilter(const GraphT *g_in) : g(g_in) { }
bool operator()(const EdgeT &e) const {
u32 src_idx = (*g)[source(e, *g)].index;
u32 tar_idx = (*g)[target(e, *g)].index;
// Remove our stylised edges from anchored start to startDs.
if (src_idx == NODE_START && tar_idx == NODE_START_DOTSTAR) {
return false;
}
// Also remove the equivalent in the reversed direction.
if (src_idx == NODE_ACCEPT_EOD && tar_idx == NODE_ACCEPT) {
return false;
}
return true;
}
const GraphT *g;
};
} // namespace
template<class GraphT>
static
void findLoopReachable(const GraphT &g, const NFAVertex srcVertex,
vector<bool> &deadNodes) {
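/* Mark as dead every vertex that lies on a cycle or is reachable from one
* (startDs itself is exempted): a DFS collects back edges, the graph filtered
* of those edges is topologically sorted, and deadness is then propagated
* forward from the sources of the back edges.
*/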
typedef typename GraphT::edge_descriptor EdgeT;
typedef set<EdgeT> EdgeSet;
EdgeSet deadEdges;
BackEdges<EdgeSet> be(deadEdges);
auto index_map = get(&NFAGraphVertexProps::index, g);
depth_first_search(g, visitor(be).root_vertex(srcVertex).vertex_index_map(
index_map));
AcyclicFilter<EdgeSet> af(&deadEdges);
filtered_graph<GraphT, AcyclicFilter<EdgeSet> > acyclic_g(g, af);
vector<NFAVertex> topoOrder; /* actually reverse topological order */
topoOrder.reserve(deadNodes.size());
topological_sort(acyclic_g, back_inserter(topoOrder),
vertex_index_map(index_map));
for (const auto &e : deadEdges) {
u32 srcIdx = g[source(e, g)].index;
if (srcIdx != NODE_START_DOTSTAR) {
deadNodes[srcIdx] = true;
}
}
for (auto it = topoOrder.rbegin(); it != topoOrder.rend(); ++it) {
NFAVertex v = *it;
for (const auto &e : in_edges_range(v, g)) {
if (deadNodes[g[source(e, g)].index]) {
deadNodes[g[v].index] = true;
break;
}
}
}
}
template <class GraphT>
static
void calcDepthFromSource(const NGHolder &graph, const GraphT &g,
typename GraphT::vertex_descriptor srcVertex,
const vector<bool> &deadNodes,
vector<int> &dMin, vector<int> &dMax) {
typedef typename GraphT::edge_descriptor EdgeT;
const size_t numVerts = num_vertices(graph);
NodeFilter<GraphT> nf(&deadNodes, &g);
StartFilter<GraphT> sf(&g);
/* minimum distance needs to run on a graph with startDs (the .* start)
* unreachable from start */
typedef filtered_graph<GraphT, StartFilter<GraphT> > StartFilteredGraph;
const StartFilteredGraph mindist_g(g, sf);
/* maximum distance needs to run on a graph without cycles & nodes
* reachable from cycles */
typedef filtered_graph<GraphT, NodeFilter<GraphT> > NodeFilteredGraph;
const NodeFilteredGraph maxdist_g(g, nf);
// Record distance of each vertex from source using one of the following
// algorithms.
/* note: filtered graphs have same num_{vertices,edges} as base */
dMin.assign(numVerts, DIST_UNREACHABLE);
dMax.assign(numVerts, DIST_UNREACHABLE);
dMin[mindist_g[srcVertex].index] = 0;
using boost::make_iterator_property_map;
auto min_index_map = get(&NFAGraphVertexProps::index, mindist_g);
breadth_first_search(mindist_g, srcVertex,
boost::vertex_index_map(min_index_map).
visitor(make_bfs_visitor(record_distances(
make_iterator_property_map(
dMin.begin(), min_index_map),
boost::on_tree_edge()))));
auto max_index_map = get(&NFAGraphVertexProps::index, maxdist_g);
dag_shortest_paths(maxdist_g, srcVertex,
boost::vertex_index_map(max_index_map).
distance_map(make_iterator_property_map(dMax.begin(),
max_index_map)).
weight_map(make_constant_property<EdgeT>(-1)));
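// Running dag_shortest_paths with a constant edge weight of -1 computes the
// longest path to each vertex on the acyclic filtered graph, as a negated
// distance; getDepths() flips the sign back when building the DepthMinMax.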
for (size_t i = 0; i < numVerts; i++) {
if (dMin[i] > DIST_UNREACHABLE) {
dMin[i] = DIST_UNREACHABLE;
}
DEBUG_PRINTF("%zu: dm %d %d\n", i, dMin[i], dMax[i]);
if (dMax[i] >= DIST_UNREACHABLE && dMin[i] < DIST_UNREACHABLE) {
dMax[i] = -DIST_INFINITY; /* max depths currently negative */
DEBUG_PRINTF("bumping max to %d\n", dMax[i]);
} else if (dMax[i] >= DIST_UNREACHABLE
|| dMax[i] < -DIST_UNREACHABLE) {
dMax[i] = -DIST_UNREACHABLE;
DEBUG_PRINTF("bumping max to %d\n", dMax[i]);
}
}
}
/**
* \brief Convert the integer distance we use in our shortest path calculations
* to a \ref depth value.
*/
static
depth depthFromDistance(int val) {
assert(val >= 0);
if (val >= DIST_UNREACHABLE) {
return depth::unreachable();
} else if (val == DIST_INFINITY) {
return depth::infinity();
}
return depth((u32)val);
}
static
DepthMinMax getDepths(u32 idx, const vector<int> &dMin,
const vector<int> &dMax) {
DepthMinMax d(depthFromDistance(dMin[idx]),
depthFromDistance(-1 * dMax[idx]));
DEBUG_PRINTF("idx=%u, depths=%s\n", idx, d.str().c_str());
assert(d.min <= d.max);
return d;
}
template<class Graph, class Output>
static
void calcAndStoreDepth(const NGHolder &h, const Graph &g,
const typename Graph::vertex_descriptor src,
const vector<bool> &deadNodes,
vector<int> &dMin /* util */,
vector<int> &dMax /* util */,
vector<Output> &depths,
DepthMinMax Output::*store) {
calcDepthFromSource(h, g, src, deadNodes, dMin, dMax);
for (auto v : vertices_range(g)) {
u32 idx = g[v].index;
assert(idx < depths.size());
Output &d = depths.at(idx);
d.*store = getDepths(idx, dMin, dMax);
}
}
void calcDepths(const NGHolder &g, std::vector<NFAVertexDepth> &depths) {
assert(hasCorrectlyNumberedVertices(g));
const size_t numVertices = num_vertices(g);
depths.clear();
depths.resize(numVertices);
vector<int> dMin;
vector<int> dMax;
/*
* create a filtered graph for max depth calculations: all nodes/edges
* reachable from a loop need to be removed
*/
vector<bool> deadNodes(numVertices);
findLoopReachable(g.g, g.start, deadNodes);
DEBUG_PRINTF("doing start\n");
calcAndStoreDepth(g, g.g, g.start, deadNodes, dMin, dMax,
depths, &NFAVertexDepth::fromStart);
DEBUG_PRINTF("doing startds\n");
calcAndStoreDepth(g, g.g, g.startDs, deadNodes, dMin, dMax,
depths, &NFAVertexDepth::fromStartDotStar);
}
void calcDepths(const NGHolder &g, std::vector<NFAVertexRevDepth> &depths) {
assert(hasCorrectlyNumberedVertices(g));
const size_t numVertices = num_vertices(g);
depths.clear();
depths.resize(numVertices);
vector<int> dMin;
vector<int> dMax;
/* reverse the graph before walking it */
typedef reverse_graph<NFAGraph, const NFAGraph&> RevNFAGraph;
const RevNFAGraph rg(g.g);
/*
* create a filtered graph for max depth calculations: all nodes/edges
* reachable from a loop need to be removed
*/
vector<bool> deadNodes(numVertices);
findLoopReachable(rg, g.acceptEod, deadNodes);
DEBUG_PRINTF("doing accept\n");
calcAndStoreDepth<RevNFAGraph, NFAVertexRevDepth>(
g, rg, g.accept, deadNodes, dMin, dMax, depths,
&NFAVertexRevDepth::toAccept);
DEBUG_PRINTF("doing accepteod\n");
deadNodes[NODE_ACCEPT] = true; // Hide accept->acceptEod edge.
calcAndStoreDepth<RevNFAGraph, NFAVertexRevDepth>(
g, rg, g.acceptEod, deadNodes, dMin, dMax, depths,
&NFAVertexRevDepth::toAcceptEod);
}
void calcDepths(const NGHolder &g, vector<NFAVertexBidiDepth> &depths) {
assert(hasCorrectlyNumberedVertices(g));
const size_t numVertices = num_vertices(g);
depths.clear();
depths.resize(numVertices);
vector<int> dMin;
vector<int> dMax;
/*
* create a filtered graph for max depth calculations: all nodes/edges
* reachable from a loop need to be removed
*/
vector<bool> deadNodes(numVertices);
findLoopReachable(g.g, g.start, deadNodes);
DEBUG_PRINTF("doing start\n");
calcAndStoreDepth<NFAGraph, NFAVertexBidiDepth>(
g, g.g, g.start, deadNodes, dMin, dMax, depths,
&NFAVertexBidiDepth::fromStart);
DEBUG_PRINTF("doing startds\n");
calcAndStoreDepth<NFAGraph, NFAVertexBidiDepth>(
g, g.g, g.startDs, deadNodes, dMin, dMax, depths,
&NFAVertexBidiDepth::fromStartDotStar);
/* Now go backwards */
typedef reverse_graph<NFAGraph, const NFAGraph&> RevNFAGraph;
const RevNFAGraph rg(g.g);
deadNodes.assign(numVertices, false);
findLoopReachable(rg, g.acceptEod, deadNodes);
DEBUG_PRINTF("doing accept\n");
calcAndStoreDepth<RevNFAGraph, NFAVertexBidiDepth>(
g, rg, g.accept, deadNodes, dMin, dMax, depths,
&NFAVertexBidiDepth::toAccept);
DEBUG_PRINTF("doing accepteod\n");
deadNodes[NODE_ACCEPT] = true; // Hide accept->acceptEod edge.
calcAndStoreDepth<RevNFAGraph, NFAVertexBidiDepth>(
g, rg, g.acceptEod, deadNodes, dMin, dMax, depths,
&NFAVertexBidiDepth::toAcceptEod);
}
void calcDepthsFrom(const NGHolder &g, const NFAVertex src,
vector<DepthMinMax> &depths) {
assert(hasCorrectlyNumberedVertices(g));
const size_t numVertices = num_vertices(g);
vector<bool> deadNodes(numVertices);
findLoopReachable(g.g, g.start, deadNodes);
vector<int> dMin, dMax;
calcDepthFromSource(g, g.g, src, deadNodes, dMin, dMax);
depths.clear();
depths.resize(numVertices);
for (auto v : vertices_range(g)) {
u32 idx = g[v].index;
depths.at(idx) = getDepths(idx, dMin, dMax);
}
}
} // namespace ue2

95
src/nfagraph/ng_depth.h Normal file
View File

@@ -0,0 +1,95 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph vertex depth calculations.
*/
#ifndef STRUCTURAL_ANALYSIS_H
#define STRUCTURAL_ANALYSIS_H
#include "nfagraph/ng_holder.h"
#include "ue2common.h"
#include "util/depth.h"
#include <vector>
namespace ue2 {
class NGHolder;
/**
* \brief Encapsulates min/max depths relative to the start and startDs
* vertices.
*/
struct NFAVertexDepth {
DepthMinMax fromStart;
DepthMinMax fromStartDotStar;
};
/**
* \brief Encapsulates min/max depths relative to the accept and acceptEod
* vertices.
*/
struct NFAVertexRevDepth {
DepthMinMax toAccept;
DepthMinMax toAcceptEod;
};
/**
* \brief Encapsulates min/max depths relative to all of our special vertices.
*/
struct NFAVertexBidiDepth : NFAVertexDepth, NFAVertexRevDepth {
};
/**
* \brief Calculate depths from start and startDs.
* Fills the vector \p depths (indexed by \p vertex_index).
*/
void calcDepths(const NGHolder &g, std::vector<NFAVertexDepth> &depths);
/**
* \brief Calculate depths to accept and acceptEod.
* Fills the vector \p depths (indexed by \p vertex_index).
*/
void calcDepths(const NGHolder &g, std::vector<NFAVertexRevDepth> &depths);
/**
* \brief Calculate depths to/from all special vertices.
* Fills the vector \p depths (indexed by \p vertex_index).
*/
void calcDepths(const NGHolder &g, std::vector<NFAVertexBidiDepth> &depths);
/** Calculate the (min, max) depths from the given \p src to every vertex in
* the graph and return them in a vector, indexed by \p vertex_index. */
void calcDepthsFrom(const NGHolder &g, const NFAVertex src,
std::vector<DepthMinMax> &depths);
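/* Illustrative usage sketch: assumes an already-built NGHolder g with
* correctly numbered vertices, and that util/graph_range.h is included for
* vertices_range().
*
* \code
*     std::vector<NFAVertexBidiDepth> depths;
*     calcDepths(g, depths);
*     for (auto v : vertices_range(g)) {
*         const NFAVertexBidiDepth &d = depths.at(g[v].index);
*         // d.fromStart, d.fromStartDotStar, d.toAccept and d.toAcceptEod
*         // each hold a DepthMinMax with .min and .max members
*     }
* \endcode
*/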
} // namespace ue2
#endif

85
src/nfagraph/ng_dominators.cpp Normal file
View File

@@ -0,0 +1,85 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Calculate dominator and post-dominator trees.
*
* A small wrapper around the BGL's lengauer_tarjan_dominator_tree algorithm.
*/
#include "ng_dominators.h"
#include "ue2common.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "util/ue2_containers.h"
#include <boost-patched/graph/dominator_tree.hpp> // locally patched version
#include <boost/graph/reverse_graph.hpp>
using namespace std;
using boost::make_assoc_property_map;
using boost::make_iterator_property_map;
namespace ue2 {
template <class Graph>
ue2::unordered_map<NFAVertex, NFAVertex> calcDominators(const Graph &g,
NFAVertex source) {
const size_t num_verts = num_vertices(g);
auto index_map = get(&NFAGraphVertexProps::index, g);
vector<size_t> dfnum(num_verts, 0);
vector<NFAVertex> parents(num_verts, Graph::null_vertex());
auto dfnum_map = make_iterator_property_map(dfnum.begin(), index_map);
auto parent_map = make_iterator_property_map(parents.begin(), index_map);
vector<NFAVertex> vertices_by_dfnum(num_verts, Graph::null_vertex());
// Output map.
unordered_map<NFAVertex, NFAVertex> doms;
auto dom_map = make_assoc_property_map(doms);
boost_ue2::lengauer_tarjan_dominator_tree(g, source, index_map, dfnum_map,
parent_map, vertices_by_dfnum,
dom_map);
return doms;
}
ue2::unordered_map<NFAVertex, NFAVertex> findDominators(const NGHolder &g) {
assert(hasCorrectlyNumberedVertices(g));
return calcDominators(g.g, g.start);
}
ue2::unordered_map<NFAVertex, NFAVertex> findPostDominators(const NGHolder &g) {
assert(hasCorrectlyNumberedVertices(g));
return calcDominators(boost::reverse_graph<NFAGraph, const NFAGraph &>(g.g),
g.acceptEod);
}
} // namespace ue2

51
src/nfagraph/ng_dominators.h Normal file
View File

@@ -0,0 +1,51 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Calculate dominator and post-dominator trees.
*
* A small wrapper around the BGL's lengauer_tarjan_dominator_tree algorithm.
*/
#ifndef NG_DOMINATORS_H
#define NG_DOMINATORS_H
#include "ng_holder.h"
#include "util/ue2_containers.h"
namespace ue2 {
class NGHolder;
ue2::unordered_map<NFAVertex, NFAVertex> findDominators(const NGHolder &g);
ue2::unordered_map<NFAVertex, NFAVertex> findPostDominators(const NGHolder &g);
} // namespace ue2
#endif // NG_DOMINATORS_H

454
src/nfagraph/ng_dump.cpp Normal file
View File

@@ -0,0 +1,454 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Dump code for NFA graphs.
*
* The dump support in this file is for internal use only, and thus is not even
* compiled in release builds, where DUMP_SUPPORT is not switched on.
*/
#include "config.h"
#include "ng_dump.h"
#include "hwlm/hwlm_build.h"
#include "ng.h"
#include "ng_util.h"
#include "parser/position.h"
#include "ue2common.h"
#include "nfa/accel.h"
#include "nfa/nfa_internal.h" // for MO_INVALID_IDX
#include "smallwrite/smallwrite_dump.h"
#include "rose/rose_build.h"
#include "rose/rose_internal.h"
#include "util/bitutils.h"
#include "util/dump_charclass.h"
#include "util/report.h"
#include "util/report_manager.h"
#include "util/ue2string.h"
#include "hs_compile.h" /* for HS_MODE_* flags */
#include <cmath>
#include <fstream>
#include <iomanip>
#include <map>
#include <ostream>
#include <set>
#include <sstream>
#include <utility>
#ifndef DUMP_SUPPORT
#error No dump support!
#endif
using namespace std;
namespace ue2 {
static
void describeAssert(ostream &os, u32 flags) {
#define DESCRIBE_ASSERT_CASE(x) case POS_FLAG_##x: s = #x; break
while (flags) {
const char *s;
switch (1 << findAndClearLSB_32(&flags)) {
DESCRIBE_ASSERT_CASE(NOFLOAT);
DESCRIBE_ASSERT_CASE(MUST_FLOAT);
DESCRIBE_ASSERT_CASE(FIDDLE_ACCEPT);
DESCRIBE_ASSERT_CASE(VIRTUAL_START);
DESCRIBE_ASSERT_CASE(MULTILINE_START);
DESCRIBE_ASSERT_CASE(ASSERT_WORD_TO_WORD);
DESCRIBE_ASSERT_CASE(ASSERT_WORD_TO_NONWORD);
DESCRIBE_ASSERT_CASE(ASSERT_NONWORD_TO_WORD);
DESCRIBE_ASSERT_CASE(ASSERT_NONWORD_TO_NONWORD);
DESCRIBE_ASSERT_CASE(ASSERT_WORD_TO_WORD_UCP);
DESCRIBE_ASSERT_CASE(ASSERT_WORD_TO_NONWORD_UCP);
DESCRIBE_ASSERT_CASE(ASSERT_NONWORD_TO_WORD_UCP);
DESCRIBE_ASSERT_CASE(ASSERT_NONWORD_TO_NONWORD_UCP);
default:
s = "unknown flag";
}
os << s << "\\n";
}
#undef DESCRIBE_ASSERT_CASE
}
static
void describeReport(ostream &os, const ReportID report,
const ReportManager *rm) {
if (!rm) {
os << "\\nReport: " << report;
} else {
os << "\\nReport: " << report << " (";
const Report &ir = rm->getReport(report);
switch (ir.type) {
case EXTERNAL_CALLBACK:
os << "EXTERNAL " << ir.onmatch;
if (ir.offsetAdjust) {
os << " adj " << ir.offsetAdjust;
}
break;
case EXTERNAL_CALLBACK_SOM_STORED:
os << "SOM_STORED " << ir.somDistance;
break;
case EXTERNAL_CALLBACK_SOM_REL:
os << "SOM_REL " << ir.somDistance;
break;
case EXTERNAL_CALLBACK_SOM_ABS:
os << "SOM_ABS " << ir.somDistance;
break;
case EXTERNAL_CALLBACK_SOM_REV_NFA:
os << "SOM_REV_NFA " << ir.revNfaIndex;
break;
case INTERNAL_SOM_LOC_SET:
os << "SOM_LOC_SET " << ir.onmatch;
break;
case INTERNAL_SOM_LOC_SET_IF_UNSET:
os << "SOM_LOC_SET_IF_UNSET " << ir.onmatch;
break;
case INTERNAL_SOM_LOC_SET_IF_WRITABLE:
os << "SOM_LOC_SET_IF_WRITABLE " << ir.onmatch;
break;
case INTERNAL_SOM_LOC_SET_SOM_REV_NFA:
os << "SOM_LOC_SET_SOM_REV_NFA " << ir.onmatch << " nfa="
<< ir.revNfaIndex;
break;
case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET:
os << "SOM_LOC_SET_SOM_REV_NFA_IF_UNSET " << ir.onmatch << " nfa="
<< ir.revNfaIndex;
break;
case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE:
os << "SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE " << ir.onmatch
<< " nfa=" << ir.revNfaIndex;
break;
case INTERNAL_SOM_LOC_COPY:
os << "SOM_LOC_COPY " << ir.somDistance << " to " << ir.onmatch;
break;
case INTERNAL_SOM_LOC_COPY_IF_WRITABLE:
os << "SOM_LOC_COPY_IF_WRITABLE " << ir.somDistance
<< " to " << ir.onmatch;
break;
case INTERNAL_SOM_LOC_MAKE_WRITABLE:
os << "SOM_LOC_MAKE_WRITABLE " << ir.onmatch;
break;
default:
os << "no dump code!";
break;
}
os << ")";
}
}
namespace {
template <typename VertexT, typename EdgeT, typename GraphT>
class NFAWriter {
public:
explicit NFAWriter(const GraphT &g_in) : g(g_in) {}
NFAWriter(const GraphT &g_in, const ReportManager &rm_in)
: g(g_in), rm(&rm_in) {}
NFAWriter(const GraphT &g_in,
const ue2::unordered_map<NFAVertex, u32> &region_map_in)
: g(g_in), region_map(&region_map_in) {}
void operator()(ostream& os, const VertexT& v) const {
u32 v_index = g[v].index;
os << "[";
os << "fontsize=11, width=2, height=2, ";
os << "label=\"" << v_index;
os << "\\n";
if (is_special(v, g)) {
switch (v_index) {
case NODE_START:
os << "START"; break;
case NODE_START_DOTSTAR:
os << "START-DS"; break;
case NODE_ACCEPT:
os << "ACCEPT"; break;
case NODE_ACCEPT_EOD:
os << "ACCEPT-EOD"; break;
default:
os << "UNKNOWN-SPECIAL"; break;
}
os << "\\n";
} else {
// If it's an assert vertex, then display its info.
u32 assert_flags = g[v].assert_flags;
if (assert_flags) {
describeAssert(os, assert_flags);
os << "\\n";
}
}
// Dump character reachability (in brief).
describeClass(os, g[v].char_reach, 5, CC_OUT_DOT);
for (const auto &report : g[v].reports) {
describeReport(os, report, rm);
}
os << "\",";
if (is_any_start(v, g)) {
os << "shape=octagon,";
}
os << "]";
// If we have a region map, use it to generate clusters.
if (region_map) {
auto region_id = region_map->at(v);
os << "subgraph cluster_" << region_id << " { label=\"region "
<< region_id << "\"; style=dashed;" << v_index << ";}";
}
}
void operator()(ostream& os, const EdgeT& e) const {
// Edge label.
os << "[fontsize=9,label=\"";
// If it's an edge from start, print top id.
if (is_any_start(source(e, g), g) && !is_any_start(target(e, g), g)) {
os << "TOP " << g[e].top << "\\n";
}
// If the edge carries assert flags, display them.
int assert_flags = g[e].assert_flags;
if (assert_flags) {
os << "\\n";
describeAssert(os, assert_flags);
}
os << "\"]";
}
private:
const GraphT &g;
const ReportManager *rm = nullptr;
const ue2::unordered_map<NFAVertex, u32> *region_map = nullptr;
};
}
template <typename GraphT>
void dumpGraphImpl(const char *name, const GraphT &g) {
typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
ofstream os(name);
NFAWriter<VertexT, EdgeT, GraphT> writer(g);
writeGraphviz(os, g, writer, get(&NFAGraphVertexProps::index, g));
}
template <typename GraphT>
void dumpGraphImpl(const char *name, const GraphT &g, const ReportManager &rm) {
typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
ofstream os(name);
NFAWriter<VertexT, EdgeT, GraphT> writer(g, rm);
writeGraphviz(os, g, writer, get(&NFAGraphVertexProps::index, g));
}
template <typename GraphT>
void dumpGraphImpl(const char *name, const GraphT &g,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
ofstream os(name);
NFAWriter<VertexT, EdgeT, GraphT> writer(g, region_map);
writeGraphviz(os, g, writer, get(&NFAGraphVertexProps::index, g));
}
// manual instantiation of the templated dumpGraphImpl above.
template void dumpGraphImpl(const char *, const NFAGraph &);
void dumpDotWrapperImpl(const NGWrapper &nw, const char *name,
const Grey &grey) {
if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) {
stringstream ss;
ss << grey.dumpPath << "Expr_" << nw.expressionIndex << "_" << name << ".dot";
DEBUG_PRINTF("dumping dot graph to '%s'\n", ss.str().c_str());
dumpGraphImpl(ss.str().c_str(), nw.g);
}
}
void dumpComponentImpl(const NGHolder &g, const char *name, u32 expr,
u32 comp, const Grey &grey) {
if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) {
stringstream ss;
ss << grey.dumpPath << "Comp_" << expr << "-" << comp << "_"
<< name << ".dot";
DEBUG_PRINTF("dumping dot graph to '%s'\n", ss.str().c_str());
dumpGraphImpl(ss.str().c_str(), g.g);
}
}
void dumpSomSubComponentImpl(const NGHolder &g, const char *name, u32 expr,
u32 comp, u32 plan, const Grey &grey) {
if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) {
stringstream ss;
ss << grey.dumpPath << "Comp_" << expr << "-" << comp << "_"
<< name << "_" << plan << ".dot";
DEBUG_PRINTF("dumping dot graph to '%s'\n", ss.str().c_str());
dumpGraphImpl(ss.str().c_str(), g.g);
}
}
void dumpHolderImpl(const NGHolder &h, unsigned int stageNumber,
const char *stageName, const Grey &grey) {
if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) {
stringstream ss;
ss << grey.dumpPath << "Holder_X_" << stageNumber
<< "-" << stageName << ".dot";
dumpGraphImpl(ss.str().c_str(), h.g);
}
}
void dumpHolderImpl(const NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &region_map,
unsigned int stageNumber, const char *stageName,
const Grey &grey) {
if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) {
stringstream ss;
ss << grey.dumpPath << "Holder_X_" << stageNumber
<< "-" << stageName << ".dot";
dumpGraphImpl(ss.str().c_str(), h.g, region_map);
}
}
void dumpSmallWrite(const RoseEngine *rose, const Grey &grey) {
if (!grey.dumpFlags) {
return;
}
const struct SmallWriteEngine *smwr = getSmallWrite(rose);
stringstream ss;
ss << grey.dumpPath << "smallwrite.txt";
FILE *f = fopen(ss.str().c_str(), "w");
smwrDumpText(smwr, f);
fclose(f);
smwrDumpNFA(smwr, false, grey.dumpPath);
}
static UNUSED
const char *irTypeToString(u8 type) {
#define IR_TYPE_CASE(x) case x: return #x
switch (type) {
IR_TYPE_CASE(EXTERNAL_CALLBACK);
IR_TYPE_CASE(EXTERNAL_CALLBACK_SOM_REL);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET_IF_UNSET);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET_IF_WRITABLE);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET_SOM_REV_NFA);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE);
IR_TYPE_CASE(INTERNAL_SOM_LOC_COPY);
IR_TYPE_CASE(INTERNAL_SOM_LOC_COPY_IF_WRITABLE);
IR_TYPE_CASE(INTERNAL_SOM_LOC_MAKE_WRITABLE);
IR_TYPE_CASE(EXTERNAL_CALLBACK_SOM_STORED);
IR_TYPE_CASE(EXTERNAL_CALLBACK_SOM_ABS);
IR_TYPE_CASE(EXTERNAL_CALLBACK_SOM_REV_NFA);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET_FROM);
IR_TYPE_CASE(INTERNAL_SOM_LOC_SET_FROM_IF_WRITABLE);
IR_TYPE_CASE(INTERNAL_ROSE_CHAIN);
default: return "<unknown>";
}
#undef IR_TYPE_CASE
}
static really_inline
int isReverseNfaReport(const Report &ri) {
switch (ri.type) {
case INTERNAL_SOM_LOC_SET_SOM_REV_NFA:
case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET:
case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE:
case EXTERNAL_CALLBACK_SOM_REV_NFA:
return 1;
default:
break; // fall through
}
return 0;
}
static really_inline
int isSomRelSetReport(const Report &ri) {
switch (ri.type) {
case INTERNAL_SOM_LOC_SET:
case INTERNAL_SOM_LOC_SET_IF_UNSET:
case INTERNAL_SOM_LOC_SET_IF_WRITABLE:
return 1;
default:
break; // fall through
}
return 0;
}
void dumpReportManager(const ReportManager &rm, const Grey &grey) {
if (!grey.dumpFlags) {
return;
}
stringstream ss;
ss << grey.dumpPath << "internal_reports.txt";
FILE *f = fopen(ss.str().c_str(), "w");
const vector<Report> &reports = rm.reports();
for (u32 i = 0; i < reports.size(); i++) {
const Report &ir = reports[i];
fprintf(f, "int %u: %s onmatch: %u", i, irTypeToString(ir.type),
ir.onmatch);
u32 dkey = rm.getDkey(ir);
if (dkey != MO_INVALID_IDX) {
fprintf(f, " dkey %u", dkey);
}
if (ir.ekey != MO_INVALID_IDX) {
fprintf(f, " ekey %u", ir.ekey);
}
if (ir.hasBounds()) {
fprintf(f, " hasBounds (minOffset=%llu, maxOffset=%llu, "
"minLength=%llu)",
ir.minOffset, ir.maxOffset, ir.minLength);
}
if (ir.offsetAdjust != 0) {
fprintf(f, " offsetAdjust: %d", ir.offsetAdjust);
}
if (isReverseNfaReport(ir)) {
fprintf(f, " reverse nfa: %u", ir.revNfaIndex);
}
if (isSomRelSetReport(ir)) {
fprintf(f, " set, adjust: %lld", ir.somDistance);
}
fprintf(f, "\n");
}
fclose(f);
}
} // namespace ue2

173
src/nfagraph/ng_dump.h Normal file
View File

@@ -0,0 +1,173 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Dump code for NFA graphs.
*/
#ifndef NG_DUMP_H
#define NG_DUMP_H
#include "grey.h"
#include "ng_holder.h" // for graph types
#include "ue2common.h"
#include "util/ue2_containers.h"
#ifdef DUMP_SUPPORT
#include <fstream>
#endif
struct RoseEngine;
namespace ue2 {
class NGHolder;
class NG;
class NGWrapper;
class ReportManager;
// Implementations for stubs below -- all have the suffix "Impl".
#ifdef DUMP_SUPPORT
template <typename GraphT>
void dumpGraphImpl(const char *name, const GraphT &g);
template <typename GraphT>
void dumpGraphImpl(const char *name, const GraphT &g, const ReportManager &rm);
void dumpDotWrapperImpl(const NGWrapper &w, const char *name, const Grey &grey);
void dumpComponentImpl(const NGHolder &g, const char *name, u32 expr, u32 comp,
const Grey &grey);
void dumpSomSubComponentImpl(const NGHolder &g, const char *name, u32 expr,
u32 comp, u32 plan, const Grey &grey);
void dumpHolderImpl(const NGHolder &h, unsigned int stageNumber,
const char *stageName, const Grey &grey);
// Variant that takes a region map as well.
void dumpHolderImpl(const NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &region_map,
unsigned int stageNumber, const char *stageName,
const Grey &grey);
template <typename GraphT>
static inline void dumpGraph(UNUSED const char *name, UNUSED const GraphT &g) {
dumpGraphImpl(name, g);
}
#endif // DUMP_SUPPORT
// Stubs which call through to dump code if compiled in.
UNUSED static inline
void dumpDotWrapper(UNUSED const NGWrapper &w, UNUSED const char *name,
UNUSED const Grey &grey) {
#ifdef DUMP_SUPPORT
dumpDotWrapperImpl(w, name, grey);
#endif
}
UNUSED static inline
void dumpComponent(UNUSED const NGHolder &h, UNUSED const char *name,
UNUSED u32 expr, UNUSED u32 comp, UNUSED const Grey &grey) {
#ifdef DUMP_SUPPORT
dumpComponentImpl(h, name, expr, comp, grey);
#endif
}
UNUSED static inline
void dumpSomSubComponent(UNUSED const NGHolder &h, UNUSED const char *name,
UNUSED u32 expr, UNUSED u32 comp, UNUSED u32 plan,
UNUSED const Grey &grey) {
#ifdef DUMP_SUPPORT
dumpSomSubComponentImpl(h, name, expr, comp, plan, grey);
#endif
}
UNUSED static inline
void dumpHolder(UNUSED const NGHolder &h, UNUSED unsigned int stageNumber,
UNUSED const char *name, UNUSED const Grey &grey) {
#ifdef DUMP_SUPPORT
dumpHolderImpl(h, stageNumber, name, grey);
#endif
}
UNUSED static inline
void dumpHolder(UNUSED const NGHolder &h,
UNUSED const ue2::unordered_map<NFAVertex, u32> &region_map,
UNUSED unsigned int stageNumber, UNUSED const char *name,
UNUSED const Grey &grey) {
#ifdef DUMP_SUPPORT
dumpHolderImpl(h, region_map, stageNumber, name, grey);
#endif
}
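/*
* Illustrative call-site sketch (not from the original source): compile
* stages can call these stubs unconditionally, since they compile away to
* nothing when DUMP_SUPPORT is not defined. Assumes an NGHolder h and a
* CompileContext cc; the stage names and index variables are hypothetical.
*
*     dumpComponent(h, "reduced", expr_index, comp_index, cc.grey);
*     dumpHolder(h, 23, "som_prep", cc.grey);
*/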
#ifdef DUMP_SUPPORT
void dumpReportManager(const ReportManager &rm, const Grey &grey);
void dumpSmallWrite(const RoseEngine *rose, const Grey &grey);
#else
static UNUSED
void dumpReportManager(const ReportManager &, const Grey &) {
}
static UNUSED
void dumpSmallWrite(const RoseEngine *, const Grey &) {
}
#endif
#ifdef DUMP_SUPPORT
// replace boost's graphviz writer
template <typename GraphT, typename WriterT, typename VertexID>
static void writeGraphviz(std::ostream &out, const GraphT &g, WriterT w,
const VertexID &vertex_id) {
const std::string delimiter(" -> ");
out << "digraph G {" << std::endl;
typename boost::graph_traits<GraphT>::vertex_iterator i, end;
for(boost::tie(i,end) = vertices(g); i != end; ++i) {
out << get(vertex_id, *i);
w(out, *i); // print vertex attributes
out << ";" << std::endl;
}
typename boost::graph_traits<GraphT>::edge_iterator ei, edge_end;
for(boost::tie(ei, edge_end) = edges(g); ei != edge_end; ++ei) {
out << (get(vertex_id, source(*ei, g))) << delimiter
<< (get(vertex_id, target(*ei, g))) << " ";
w(out, *ei); // print edge attributes
out << ";" << std::endl;
}
out << "}" << std::endl;
}
#endif // DUMP_SUPPORT
} // namespace ue2
#endif // NG_DUMP_H

517
src/nfagraph/ng_edge_redundancy.cpp Normal file
View File

@@ -0,0 +1,517 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Edge redundancy graph reductions.
*/
#include "ng_edge_redundancy.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "ue2common.h"
#include "parser/position.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
/* reverse edge redundancy removal is possible but is not implemented as it
* regressed rose pattern support in the regression suite: 19026 - 19027
* (foo.{1,5}b?ar)
*
* If rose becomes smarter we can reimplement.
*/
static never_inline
bool checkVerticesFwd(const NGHolder &g, const set<NFAVertex> &sad,
const set<NFAVertex> &happy) {
/* need to check, for each vertex in sad, whether it has an edge to a happy
* vertex */
for (auto u : sad) {
bool ok = false;
for (auto v : adjacent_vertices_range(u, g)) {
if (contains(happy, v)) {
ok = true;
break;
}
}
if (!ok) {
return false;
}
}
return true;
}
static never_inline
bool checkVerticesRev(const NGHolder &g, const set<NFAVertex> &sad,
const set<NFAVertex> &happy) {
/* need to check, for each vertex in sad, whether it has an edge from a happy
* vertex */
for (auto v : sad) {
bool ok = false;
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (contains(happy, u)) {
ok = true;
break;
}
}
if (!ok) {
return false;
}
}
return true;
}
/** \brief Redundant self-loop removal.
*
* A self loop on a vertex v can be removed if:
*
* For every vertex u in pred(v) either:
* 1: u has a self loop and cr(v) subset of cr(u)
* OR
* 2: u has an edge to vertex satisfying criterion 1
*
* Note: we remove all dead loops at the end of the pass and do not check the
* live status of the loops we are depending on during the analysis.
*
* We don't end up in situations where we remove a group of loops which depend
* on each other as:
*
* - there must be at least one vertex not in the group which is a pred of some
* member of the group (as we don't remove loops on specials)
*
* For each pred vertex of the group:
* - the vertex must be 'sad' as it is not part of the group
* - therefore it must have edges to each member of the group (to happy, trans)
* - therefore the group is enabled simultaneously
* - due to internal group edges, all members will still be active after the
* next character.
*
* Actually, the vertex redundancy code will merge the entire group into one
* cyclic state.
*/
static
bool removeEdgeRedundancyNearCyclesFwd(NGHolder &g, bool ignore_starts) {
unsigned dead_count = 0;
set<NFAVertex> happy;
set<NFAVertex> sad;
for (auto v : vertices_range(g)) {
if (is_special(v, g) || !hasSelfLoop(v, g)) {
continue;
}
const CharReach &cr_v = g[v].char_reach;
happy.clear();
sad.clear();
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == v) {
continue;
}
if (!hasSelfLoop(u, g)) {
sad.insert(u);
continue;
}
if (ignore_starts) {
if (u == g.startDs || is_virtual_start(u, g)) {
sad.insert(u);
continue;
}
}
const CharReach &cr_u = g[u].char_reach;
if ((cr_u & cr_v) != cr_v) {
sad.insert(u);
continue;
}
happy.insert(u);
}
if (!happy.empty() && checkVerticesFwd(g, sad, happy)) {
dead_count++;
remove_edge(v, v, g);
}
}
DEBUG_PRINTF("found %u removable edges.\n", dead_count);
return dead_count;
}
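/*
* Worked example for the pass above (illustrative, not from the original
* source): in the Glushkov graph for /[ab]+a+x/, the 'a' vertex from "a+" has
* a self loop and a single non-self predecessor, the '[ab]' vertex, which
* also has a self loop and whose reach {a,b} is a superset of {a}. Criterion
* 1 therefore holds for every predecessor, so the self loop on 'a' is
* removed; this is safe as /[ab]+a+x/ and /[ab]+ax/ match the same strings.
*/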
/** \brief Redundant self-loop removal (reverse version).
*
* A self loop on a vertex v can be removed if:
*
* For every vertex u in succ(v) either:
* 1: u has a self loop and cr(v) is a subset of cr(u).
* OR
* 2: u is not an accept and u has an edge from a vertex satisfying
* criterion 1.
* OR
* 3: u is in an accept and u has an edge from a vertex v' satisfying
* criterion 1 and report(v) == report(v').
*/
static
bool removeEdgeRedundancyNearCyclesRev(NGHolder &g) {
unsigned dead_count = 0;
set<NFAVertex> happy;
set<NFAVertex> sad;
for (auto v : vertices_range(g)) {
if (is_special(v, g) || !hasSelfLoop(v, g)) {
continue;
}
const CharReach &cr_v = g[v].char_reach;
happy.clear();
sad.clear();
for (auto u : adjacent_vertices_range(v, g)) {
if (u == v) {
continue;
}
if (!hasSelfLoop(u, g)) {
sad.insert(u);
continue;
}
assert(!is_special(u, g));
const CharReach &cr_u = g[u].char_reach;
if (!cr_v.isSubsetOf(cr_u)) {
sad.insert(u);
continue;
}
happy.insert(u);
}
if (!happy.empty() && checkVerticesRev(g, sad, happy)) {
dead_count++;
remove_edge(v, v, g);
}
}
DEBUG_PRINTF("found %u removable edges.\n", dead_count);
return dead_count;
}
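/*
* Worked example for the reverse pass above (illustrative, not from the
* original source): in the graph for /xa+[ab]+y/, the only non-self successor
* of the 'a' vertex is the '[ab]' vertex, which has a self loop with superset
* reach, so the self loop on 'a' is removed; /xa+[ab]+y/ and /xa[ab]+y/ are
* equivalent.
*/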
static
bool parentsSubsetOf(const NGHolder &g, NFAVertex v,
const flat_set<NFAVertex> &other_parents, NFAVertex other,
map<NFAVertex, bool> &done) {
map<NFAVertex, bool>::const_iterator dit = done.find(v);
if (dit != done.end()) {
return dit->second;
}
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == v && contains(other_parents, other)) {
continue;
}
if (!contains(other_parents, u)) {
done[v] = false;
return false;
}
}
done[v] = true;
return true;
}
static
bool checkFwdCandidate(const NGHolder &g, NFAVertex fixed_src,
const flat_set<NFAVertex> &fixed_parents,
const NFAEdge &candidate,
map<NFAVertex, bool> &done) {
NFAVertex w = source(candidate, g);
NFAVertex v = target(candidate, g);
const CharReach &cr_w = g[w].char_reach;
const CharReach &cr_u = g[fixed_src].char_reach;
/* There is no reason why self loops cannot be considered by this
* transformation but the removal is already handled by many other
* transformations. */
if (w == v) {
return false;
}
if (is_special(w, g)) {
return false;
}
if (!cr_w.isSubsetOf(cr_u)) {
return false;
}
/* check that each parent of w is also a parent of u */
if (!parentsSubsetOf(g, w, fixed_parents, fixed_src, done)) {
return false;
}
DEBUG_PRINTF("edge (%u, %u) killed by edge (%u, %u)\n",
g[w].index, g[v].index,
g[fixed_src].index, g[v].index);
return true;
}
static never_inline
void checkLargeOutU(const NGHolder &g, NFAVertex u,
const flat_set<NFAVertex> &parents_u,
flat_set<NFAVertex> &possible_w,
map<NFAVertex, bool> &done,
set<NFAEdge> *dead) {
/* only vertices with at least one parent in common with u need to be
* considered, and we also only consider potential siblings with subset
* reach. */
possible_w.clear();
const CharReach &cr_u = g[u].char_reach;
for (auto p : parents_u) {
for (auto v : adjacent_vertices_range(p, g)) {
const CharReach &cr_w = g[v].char_reach;
if (cr_w.isSubsetOf(cr_u)) {
possible_w.insert(v);
}
}
}
// If there's only one, it's us, and we have no work to do.
if (possible_w.size() <= 1) {
assert(possible_w.empty() || *possible_w.begin() == u);
return;
}
for (const auto &e : out_edges_range(u, g)) {
const NFAVertex v = target(e, g);
if (is_special(v, g)) {
continue;
}
if (contains(*dead, e)) {
continue;
}
/* Now check for any edges which can be removed due to the existence of
* edge e */
for (const auto &e2 : in_edges_range(v, g)) {
if (e == e2 || contains(*dead, e2)) {
continue;
}
const NFAVertex w = source(e2, g);
if (!contains(possible_w, w)) {
continue;
}
if (checkFwdCandidate(g, u, parents_u, e2, done)) {
dead->insert(e2);
}
}
}
}
static never_inline
void checkSmallOutU(const NGHolder &g, NFAVertex u,
const flat_set<NFAVertex> &parents_u,
map<NFAVertex, bool> &done,
set<NFAEdge> *dead) {
for (const auto &e : out_edges_range(u, g)) {
const NFAVertex v = target(e, g);
if (is_special(v, g)) {
continue;
}
if (contains(*dead, e)) {
continue;
}
/* Now check for any edges which can be removed due to the existence of
* edge e */
for (const auto &e2 : in_edges_range(v, g)) {
if (e == e2 || contains(*dead, e2)) {
continue;
}
if (checkFwdCandidate(g, u, parents_u, e2, done)) {
dead->insert(e2);
}
}
}
}
/** \brief Forward edge redundancy pass.
*
* An edge e from w to v is redundant if there exists an edge e' such that:
* e' is from u to v
* and: reach(w) is a subset of reach(u)
* and: proper_pred(w) is a subset of pred(u)
* and: self_loop(w) implies self_loop(u) or edge from (w to u)
*
* Note: edges to accepts also require report ID checks.
*/
static
bool removeEdgeRedundancyFwd(NGHolder &g, bool ignore_starts) {
set<NFAEdge> dead;
map<NFAVertex, bool> done;
flat_set<NFAVertex> parents_u;
flat_set<NFAVertex> possible_w;
for (auto u : vertices_range(g)) {
if (ignore_starts && (u == g.startDs || is_virtual_start(u, g))) {
continue;
}
parents_u.clear();
pred(g, u, &parents_u);
done.clear();
if (hasGreaterOutDegree(1, u, g)) {
checkLargeOutU(g, u, parents_u, possible_w, done, &dead);
} else {
checkSmallOutU(g, u, parents_u, done, &dead);
}
}
if (dead.empty()) {
return false;
}
DEBUG_PRINTF("found %zu removable non-selfloops.\n", dead.size());
remove_edges(dead, g);
pruneUseless(g);
return true;
}
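/*
* Worked example for the pass above (illustrative, not from the original
* source): in the graph for /(a|[ab])c/, both the 'a' and '[ab]' vertices
* have edges to 'c' and share the same predecessors (start and startDs).
* Since reach('a') is a subset of reach('[ab]') and neither vertex has a self
* loop, the edge from 'a' to 'c' is redundant and is removed; pruneUseless()
* then discards the now-dead 'a' vertex, leaving a graph equivalent to
* /[ab]c/.
*/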
/** Entry point: Runs all the edge redundancy passes. If SoM is tracked,
* don't consider startDs or virtual starts as cyclic vertices. */
bool removeEdgeRedundancy(NGHolder &g, som_type som, const CompileContext &cc) {
if (!cc.grey.removeEdgeRedundancy) {
return false;
}
bool changed = false;
changed |= removeEdgeRedundancyNearCyclesFwd(g, som);
changed |= removeEdgeRedundancyNearCyclesRev(g);
changed |= removeEdgeRedundancyFwd(g, som);
return changed;
}
/** \brief Removes optional stuff from the front of floating patterns, since it's
* redundant with startDs.
*
* For each successor of startDs, remove any in-edges that aren't from either
* start or startDs. This allows us to prune redundant vertices at the start of
* a pattern:
*
* /(hat)?stand --> /stand/
*
*/
bool removeSiblingsOfStartDotStar(NGHolder &g) {
vector<NFAEdge> dead;
for (auto v : adjacent_vertices_range(g.startDs, g)) {
DEBUG_PRINTF("checking %u\n", g[v].index);
if (is_special(v, g)) {
continue;
}
for (const auto &e : in_edges_range(v, g)) {
NFAVertex u = source(e, g);
if (is_special(u, g)) {
continue;
}
DEBUG_PRINTF("removing %u->%u\n", g[u].index,
g[v].index);
dead.push_back(e);
}
}
if (dead.empty()) {
return false;
}
DEBUG_PRINTF("found %zu removable edges.\n", dead.size());
remove_edges(dead, g);
pruneUseless(g);
return true;
}
/** Removes all edges into virtual starts other than those from start/startDs,
* providing there is an edge from startDs. This operation is an optimisation
* for SOM mode. (see UE-1544) */
bool optimiseVirtualStarts(NGHolder &g) {
vector<NFAEdge> dead;
for (auto v : adjacent_vertices_range(g.startDs, g)) {
u32 flags = g[v].assert_flags;
if (!(flags & POS_FLAG_VIRTUAL_START)) {
continue;
}
for (const auto &e : in_edges_range(v, g)) {
if (!is_any_start(source(e, g), g)) {
dead.push_back(e);
}
}
}
if (dead.empty()) {
return false;
}
DEBUG_PRINTF("removing %zu edges into virtual starts\n", dead.size());
remove_edges(dead, g);
pruneUseless(g);
return true;
}
} // namespace ue2

65
src/nfagraph/ng_edge_redundancy.h Normal file
View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Edge redundancy graph reductions.
*/
#ifndef NG_EDGE_REDUNDANCY_H
#define NG_EDGE_REDUNDANCY_H
#include "som/som.h"
namespace ue2 {
class NGHolder;
struct CompileContext;
/** \brief Entry point: Runs all the edge redundancy passes. */
bool removeEdgeRedundancy(NGHolder &g, som_type som, const CompileContext &cc);
/** \brief Removes optional stuff from the front of floating patterns, since
* it's redundant with startDs.
*
* For each successor of startDs, remove any in-edges that aren't from either
* start or startDs. This allows us to prune redundant vertices at the start of
* a pattern:
*
* /(hat)?stand --> /stand/
*
*/
bool removeSiblingsOfStartDotStar(NGHolder &g);
/** \brief Removes all edges into virtual starts other than those from
* start/startDs, providing there is an edge from startDs.
*
* This operation is an optimisation for SOM mode. (see UE-1544) */
bool optimiseVirtualStarts(NGHolder &g);
} // namespace ue2
#endif

695
src/nfagraph/ng_equivalence.cpp Normal file
View File

@@ -0,0 +1,695 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Equivalence class graph reduction pass.
*/
#include "ng_equivalence.h"
#include "grey.h"
#include "ng_depth.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "util/compile_context.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include <algorithm>
#include <set>
#include <stack>
#include <vector>
#include <boost/ptr_container/ptr_vector.hpp>
using namespace std;
using boost::ptr_vector;
namespace ue2 {
enum EquivalenceType {
LEFT_EQUIVALENCE = 0,
RIGHT_EQUIVALENCE,
MAX_EQUIVALENCE
};
namespace {
class VertexInfo;
// custom comparison functor for unordered_set and flat_set
struct VertexInfoPtrCmp {
// for flat_set
bool operator()(const VertexInfo *a, const VertexInfo *b) const;
// for unordered_set
size_t operator()(const VertexInfo *a) const;
};
/** Precalculated (and maintained) information about a vertex. */
class VertexInfo {
public:
VertexInfo(NFAVertex v_in, const NGHolder &g)
: v(v_in), vert_index(g[v].index), cr(g[v].char_reach), edge_top(~0),
equivalence_class(~0), vertex_flags(g[v].assert_flags) {}
flat_set<VertexInfo *, VertexInfoPtrCmp> pred; //!< predecessors of this vertex
flat_set<VertexInfo *, VertexInfoPtrCmp> succ; //!< successors of this vertex
NFAVertex v;
u32 vert_index;
CharReach cr;
CharReach pred_cr;
CharReach succ_cr;
unsigned edge_top;
unsigned equivalence_class;
unsigned vertex_flags;
};
}
typedef ue2::unordered_set<VertexInfo *, VertexInfoPtrCmp> VertexInfoSet;
typedef ue2::unordered_map<unsigned, VertexInfoSet> ClassMap;
// compare two vertex info pointers on their vertex index
bool VertexInfoPtrCmp::operator()(const VertexInfo *a,
const VertexInfo *b) const {
return a->vert_index < b->vert_index;
}
// provide a "hash" for vertex info pointer by returning its vertex index
size_t VertexInfoPtrCmp::operator()(const VertexInfo *a) const {
return a->vert_index;
}
namespace {
// to avoid traversing infomap each time we need to check the class during
// partitioning, we will cache the information pertaining to a particular class
class ClassInfo {
public:
struct ClassDepth {
ClassDepth() {}
ClassDepth(const NFAVertexDepth &d)
: d1(d.fromStart), d2(d.fromStartDotStar) {}
ClassDepth(const NFAVertexRevDepth &rd)
: d1(rd.toAccept), d2(rd.toAcceptEod) {}
DepthMinMax d1;
DepthMinMax d2;
};
ClassInfo(const NGHolder &g, VertexInfo &vi, ClassDepth &d_in,
EquivalenceType eq)
: vertex_flags(vi.vertex_flags), edge_top(vi.edge_top), cr(vi.cr),
depth(d_in) {
// hackety-hack!
node_type = g[vi.v].index;
if (node_type > N_SPECIALS) {
// we treat all regular vertices the same
node_type = N_SPECIALS;
}
// get all the adjacent vertices' CharReach
adjacent_cr = eq == LEFT_EQUIVALENCE ? vi.pred_cr : vi.succ_cr;
if (eq == RIGHT_EQUIVALENCE) {
rs = g[vi.v].reports;
}
}
bool operator<(const ClassInfo &b) const;
private:
flat_set<ReportID> rs; /* for right equiv only */
unsigned vertex_flags;
u32 edge_top;
CharReach cr;
CharReach adjacent_cr;
unsigned node_type;
ClassDepth depth;
};
// work queue class. this contraption has two goals:
// 1. uniqueness of elements
// 2. FILO operation
class WorkQueue {
public:
explicit WorkQueue(unsigned c) {
q.reserve(c);
}
// unique push
void push(unsigned id) {
if (ids.insert(id).second) {
q.push_back(id);
}
}
// pop
unsigned pop() {
unsigned id = q.back();
ids.erase(id);
q.pop_back();
return id;
}
void append(WorkQueue &other) {
for (const auto &e : other) {
push(e);
}
}
void clear() {
ids.clear();
q.clear();
}
bool empty() const {
return ids.empty();
}
vector<unsigned>::const_iterator begin() const {
return q.begin();
}
vector<unsigned>::const_iterator end() const {
return q.end();
}
size_t capacity() const {
return q.capacity();
}
private:
set<unsigned> ids; //!< stores id's, for uniqueness
vector<unsigned> q; //!< vector of id's that we use as FILO.
};
}
bool ClassInfo::operator<(const ClassInfo &b) const {
const ClassInfo &a = *this;
ORDER_CHECK(node_type);
ORDER_CHECK(depth.d1);
ORDER_CHECK(depth.d2);
ORDER_CHECK(cr);
ORDER_CHECK(adjacent_cr);
ORDER_CHECK(edge_top);
ORDER_CHECK(vertex_flags);
ORDER_CHECK(rs);
return false;
}
static
bool outIsIrreducible(NFAVertex &v, const NGHolder &g) {
unsigned nonSpecialVertices = 0;
for (auto w : adjacent_vertices_range(v, g)) {
if (!is_special(w, g) && w != v) {
nonSpecialVertices++;
}
}
return nonSpecialVertices == 1;
}
static
bool inIsIrreducible(NFAVertex &v, const NGHolder &g) {
unsigned nonSpecialVertices = 0;
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (!is_special(u, g) && u != v) {
nonSpecialVertices++;
}
}
return nonSpecialVertices == 1;
}
/** Cheaply check whether this graph can't be reduced at all, because it is
* just a chain of vertices with no other edges. */
static
bool isIrreducible(const NGHolder &g) {
for (auto v : vertices_range(g)) {
// skip specials
if (is_special(v, g)) {
continue;
}
// we want meaningful in_degree to be 1. we also want to make sure we
// don't count self-loop + 1 incoming edge as not irreducible
if (in_degree(v, g) != 1 && !inIsIrreducible(v, g)) {
return false;
}
// we want meaningful out_degree to be 1. we also want to make sure we
// don't count self-loop + 1 outgoing edge as not irreducible
if (out_degree(v, g) != 1 && !outIsIrreducible(v, g)) {
return false;
}
}
return true;
}
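// For example, the graph for an anchored literal such as /^abcd/ is a plain
// chain (each non-special vertex has exactly one predecessor and one
// successor), so the more expensive processing below can be skipped entirely.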
#ifndef NDEBUG
static
bool hasEdgeAsserts(NFAVertex v, const NGHolder &g) {
for (const auto &e : in_edges_range(v, g)) {
if (g[e].assert_flags != 0) {
return true;
}
}
for (const auto &e : out_edges_range(v, g)) {
if (g[e].assert_flags != 0) {
return true;
}
}
return false;
}
#endif
// populate VertexInfo table
static
void getVertexInfos(const NGHolder &g, ptr_vector<VertexInfo> &infos) {
vector<VertexInfo *> vertex_map; // indexed by vertex_index property
vertex_map.resize(num_vertices(g));
for (auto v : vertices_range(g)) {
VertexInfo *vi = new VertexInfo(v, g);
// insert our new shiny VertexInfo into the info map
infos.push_back(vi);
vertex_map[g[v].index] = vi;
}
// now, go through each vertex and populate its predecessor and successor lists
for (VertexInfo &cur_vi : infos) {
// find predecessors
for (const auto &e : in_edges_range(cur_vi.v, g)) {
NFAVertex u = source(e, g);
VertexInfo *vmi = vertex_map[g[u].index];
cur_vi.pred_cr |= vmi->cr;
cur_vi.pred.insert(vmi);
// also set up edge tops
if (is_triggered(g) && u == g.start) {
cur_vi.edge_top = g[e].top;
}
}
// find successors
for (auto w : adjacent_vertices_range(cur_vi.v, g)) {
VertexInfo *vmi = vertex_map[g[w].index];
cur_vi.succ_cr |= vmi->cr;
cur_vi.succ.insert(vmi);
}
assert(!hasEdgeAsserts(cur_vi.v, g));
}
}
// store equivalence class in VertexInfo for each vertex
static
void partitionGraph(ptr_vector<VertexInfo> &infos, ClassMap &classes,
WorkQueue &work_queue, const NGHolder &g,
EquivalenceType eq) {
map<ClassInfo, unsigned> classinfomap;
// get distances from start (or accept) for all vertices
// only one of them is used at a time, never both
vector<NFAVertexDepth> depths;
vector<NFAVertexRevDepth> rdepths;
if (eq == LEFT_EQUIVALENCE) {
calcDepths(g, depths);
} else {
calcDepths(g, rdepths);
}
// partition the graph based on CharReach
for (VertexInfo &vi : infos) {
ClassInfo::ClassDepth depth;
if (eq == LEFT_EQUIVALENCE) {
depth = depths[vi.vert_index];
} else {
depth = rdepths[vi.vert_index];
}
ClassInfo ci(g, vi, depth, eq);
auto ii = classinfomap.find(ci);
if (ii == classinfomap.end()) {
unsigned new_class = classinfomap.size();
vi.equivalence_class = new_class;
classinfomap[ci] = new_class;
// insert this vertex into the class map
VertexInfoSet &vertices = classes[new_class];
vertices.insert(&vi);
} else {
unsigned eq_class = ii->second;
vi.equivalence_class = eq_class;
// insert this vertex into the class map
VertexInfoSet &vertices = classes[eq_class];
vertices.insert(&vi);
// we now know that this particular class has more than one
// vertex, so we add it to the work queue
work_queue.push(eq_class);
}
}
DEBUG_PRINTF("partitioned, %lu equivalence classes\n", classinfomap.size());
}
// generalized equivalence processing (left and right)
// goes through every vertex in a class and checks whether the sets of
// successor (or predecessor) classes match across all vertices. if they
// mismatch, a vertex is split into a separate class, along with all vertices
// having the same set of successor/predecessor classes. the classes on the
// opposite side (successors for left equivalence, predecessors for right
// equivalence) get revalidated in case of a split.
static
void equivalence(ClassMap &classmap, WorkQueue &work_queue,
EquivalenceType eq_type) {
// now, go through the work queue until it's empty
map<flat_set<unsigned>, VertexInfoSet> tentative_classmap;
flat_set<unsigned> cur_classes;
// local work queue, to store classes we want to revalidate in case of split
WorkQueue reval_queue(work_queue.capacity());
while (!work_queue.empty()) {
// dequeue our class from the work queue
unsigned cur_class = work_queue.pop();
// get all vertices in current equivalence class
VertexInfoSet &cur_class_vertices = classmap[cur_class];
if (cur_class_vertices.size() < 2) {
continue;
}
// clear data from previous iterations
tentative_classmap.clear();
DEBUG_PRINTF("doing equivalence pass for class %u, %zd vertices\n",
cur_class, cur_class_vertices.size());
// go through vertices in this class
for (VertexInfo *vi : cur_class_vertices) {
cur_classes.clear();
// get vertex lists for equivalence vertices and vertices for
// revalidation in case of split
const auto &eq_vertices =
(eq_type == LEFT_EQUIVALENCE) ? vi->pred : vi->succ;
const auto &reval_vertices =
(eq_type == LEFT_EQUIVALENCE) ? vi->succ : vi->pred;
// go through equivalence and note the classes
for (const VertexInfo *tmp : eq_vertices) {
cur_classes.insert(tmp->equivalence_class);
}
// note all the classes that need to be reevaluated
for (const VertexInfo *tmp : reval_vertices) {
reval_queue.push(tmp->equivalence_class);
}
VertexInfoSet &tentative_classes = tentative_classmap[cur_classes];
tentative_classes.insert(vi);
}
// if we found more than one class, split and revalidate everything
if (tentative_classmap.size() > 1) {
auto tmi = tentative_classmap.begin();
// start from the second class
for (++tmi; tmi != tentative_classmap.end(); ++tmi) {
unsigned new_class = classmap.size();
const VertexInfoSet &vertices_to_split = tmi->second;
VertexInfoSet &new_class_vertices = classmap[new_class];
for (VertexInfo *vi : vertices_to_split) {
vi->equivalence_class = new_class;
cur_class_vertices.erase(vi);
new_class_vertices.insert(vi);
}
if (tmi->first.find(cur_class) != tmi->first.end()) {
reval_queue.push(new_class);
}
}
work_queue.append(reval_queue);
}
reval_queue.clear();
}
}
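/*
* Illustrative example (not from the original source): under left equivalence
* the two 'a' vertices in the graph for /x(ab|ac)/ have identical reach and
* identical predecessor classes, so they remain in the same class and are
* later merged, yielding a graph equivalent to /xa(b|c)/. Right equivalence
* performs the mirror-image merge, e.g. the two 'b' vertices in /(ab|cb)x/.
*/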
static
bool require_separate_eod_vertex(const VertexInfoSet &vert_infos,
const NGHolder &g) {
/* We require separate eod and normal accept vertices for a class if we have
* both normal accepts and eod accepts AND the report sets for the eod and
* non-eod accepts differ. */
flat_set<ReportID> non_eod;
flat_set<ReportID> eod;
for (const VertexInfo *vi : vert_infos) {
NFAVertex v = vi->v;
if (edge(v, g.accept, g).second) {
insert(&non_eod, g[v].reports);
}
if (edge(v, g.acceptEod, g).second) {
insert(&eod, g[v].reports);
}
}
if (non_eod.empty() || eod.empty()) {
return false;
}
return non_eod != eod;
}
static
void mergeClass(ptr_vector<VertexInfo> &infos, NGHolder &g, unsigned eq_class,
VertexInfoSet &cur_class_vertices, set<NFAVertex> *toRemove) {
DEBUG_PRINTF("Replacing %zd vertices from equivalence class %u with a "
"single vertex.\n", cur_class_vertices.size(), eq_class);
// replace equivalence class with a single vertex:
// 1. create new vertex with matching properties
// 2. wire all predecessors to new vertex
// 2a. update info for new vertex with new predecessors
// 2b. update each predecessor's successor list
// 3. wire all successors to new vertex
// 3a. update info for new vertex with new successors
// 3b. update each successor's predecessor list
// 4. remove old vertex
// any differences between vertex properties were resolved during
// initial partitioning, so we assume that every vertex in the equivalence
// class has the same CharReach et al.
// so, we find the first vertex in our class and get all its properties
/* For left equivalence, if the members have different reporting behaviour
* we sometimes require two vertices to be created (one connected to accept
* and one to accepteod) */
NFAVertex old_v = (*cur_class_vertices.begin())->v;
NFAVertex new_v = clone_vertex(g, old_v); /* set up new vertex with same
* props */
g[new_v].reports.clear(); /* populated as we pull in succs */
VertexInfo *new_vertex_info = new VertexInfo(new_v, g);
// store this vertex in our global vertex list
infos.push_back(new_vertex_info);
NFAVertex new_v_eod = NGHolder::null_vertex();
VertexInfo *new_vertex_info_eod = nullptr;
if (require_separate_eod_vertex(cur_class_vertices, g)) {
new_v_eod = clone_vertex(g, old_v);
g[new_v_eod].reports.clear();
new_vertex_info_eod = new VertexInfo(new_v_eod, g);
infos.push_back(new_vertex_info_eod);
}
const unsigned edgetop = (*cur_class_vertices.begin())->edge_top;
for (VertexInfo *old_vertex_info : cur_class_vertices) {
assert(old_vertex_info->equivalence_class == eq_class);
// mark this vertex for removal
toRemove->insert(old_vertex_info->v);
// for each predecessor, add edge to new vertex and update info
for (VertexInfo *pred_info : old_vertex_info->pred) {
// update info for new vertex
new_vertex_info->pred.insert(pred_info);
if (new_vertex_info_eod) {
new_vertex_info_eod->pred.insert(pred_info);
}
// update info for predecessor
pred_info->succ.erase(old_vertex_info);
// if edge doesn't exist, create it
NFAEdge e = add_edge_if_not_present(pred_info->v, new_v, g).first;
// put edge top, if applicable
if (edgetop != (unsigned) -1) {
g[e].top = edgetop;
}
pred_info->succ.insert(new_vertex_info);
if (new_v_eod) {
NFAEdge ee = add_edge_if_not_present(pred_info->v, new_v_eod,
g).first;
// put edge top, if applicable
if (edgetop != (unsigned) -1) {
g[ee].top = edgetop;
}
pred_info->succ.insert(new_vertex_info_eod);
}
}
// for each successor, add edge from new vertex and update info
for (VertexInfo *succ_info : old_vertex_info->succ) {
NFAVertex succ_v = succ_info->v;
// update info for successor
succ_info->pred.erase(old_vertex_info);
if (new_v_eod && succ_v == g.acceptEod) {
// update info for new vertex
new_vertex_info_eod->succ.insert(succ_info);
insert(&g[new_v_eod].reports,
g[old_vertex_info->v].reports);
add_edge_if_not_present(new_v_eod, succ_v, g);
succ_info->pred.insert(new_vertex_info_eod);
} else {
// update info for new vertex
new_vertex_info->succ.insert(succ_info);
// if edge doesn't exist, create it
add_edge_if_not_present(new_v, succ_v, g);
succ_info->pred.insert(new_vertex_info);
if (is_any_accept(succ_v, g)) {
insert(&g[new_v].reports,
g[old_vertex_info->v].reports);
}
}
}
}
// update classmap
new_vertex_info->equivalence_class = eq_class;
cur_class_vertices.insert(new_vertex_info);
}
// walk through vertices of an equivalence class and replace them with a single
// vertex (or, in rare cases for left equiv, a pair if we cannot satisfy the
// report behaviour with a single vertex).
static
bool mergeEquivalentClasses(ClassMap &classmap, ptr_vector<VertexInfo> &infos,
NGHolder &g) {
bool merged = false;
set<NFAVertex> toRemove;
// go through all classes and merge classes with more than one vertex
for (auto &cm : classmap) {
// get all vertices in current equivalence class
unsigned eq_class = cm.first;
VertexInfoSet &cur_class_vertices = cm.second;
// we don't care for single-vertex classes
if (cur_class_vertices.size() > 1) {
merged = true;
mergeClass(infos, g, eq_class, cur_class_vertices, &toRemove);
}
}
// remove all dead vertices
DEBUG_PRINTF("removing %zd vertices.\n", toRemove.size());
remove_vertices(toRemove, g);
return merged;
}
bool reduceGraphEquivalences(NGHolder &g, const CompileContext &cc) {
if (!cc.grey.equivalenceEnable) {
DEBUG_PRINTF("equivalence processing disabled in grey box\n");
return false;
}
g.renumberVertices();
// Cheap check: if all the non-special vertices have in-degree one and
// out-degree one, there's no redundancy in this here graph and we can
// vamoose.
if (isIrreducible(g)) {
DEBUG_PRINTF("skipping equivalence processing, graph is irreducible\n");
return false;
}
// take note if we have merged any vertices
bool merge = false;
for (int eqi = 0; eqi < MAX_EQUIVALENCE; ++eqi) {
// map of all information pertaining to a vertex
ptr_vector<VertexInfo> infos;
ClassMap classes;
// create a list of equivalence classes to check
WorkQueue work_queue(num_vertices(g));
EquivalenceType eq_type = (EquivalenceType) eqi;
// reserve space in the vector for twice the vertices we have
infos.reserve(num_vertices(g) * 2);
// get information on every vertex in the graph
// new vertices are allocated here, and stored in infos
getVertexInfos(g, infos);
// partition the graph
partitionGraph(infos, classes, work_queue, g, eq_type);
// do equivalence processing
equivalence(classes, work_queue, eq_type);
// replace equivalent classes with single vertices
// new vertices are (possibly) allocated here, and stored in infos
merge |= mergeEquivalentClasses(classes, infos, g);
}
return merge;
}
} // namespace ue2

47
src/nfagraph/ng_equivalence.h Normal file
View File

@@ -0,0 +1,47 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Equivalence class graph reduction pass.
*/
#ifndef NG_EQUIVALENCE_H_
#define NG_EQUIVALENCE_H_
namespace ue2 {
class NGHolder;
struct CompileContext;
/** Attempt to make the NFA graph \p g smaller by performing a number of local
* transformations. */
bool reduceGraphEquivalences(NGHolder &g, const CompileContext &cc);
} // namespace ue2
#endif /* NG_EQUIVALENCE_H_ */

323
src/nfagraph/ng_execute.cpp Normal file
View File

@@ -0,0 +1,323 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Execute an NFA over a given input, returning the set of states that
* are active afterwards.
*
* Note: although our external interfaces for execute_graph() use std::set, we
* use a dynamic bitset containing the vertex indices internally for
* performance.
*/
#include "ng_execute.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
#include "util/ue2string.h"
#include <sstream>
#include <string>
#include <boost/dynamic_bitset.hpp>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/reverse_graph.hpp>
using namespace std;
using boost::dynamic_bitset;
namespace ue2 {
struct StateInfo {
StateInfo(NFAVertex v, const CharReach &cr) : vertex(v), reach(cr) {}
StateInfo() : vertex(NFAGraph::null_vertex()) {}
NFAVertex vertex;
CharReach reach;
};
#ifdef DEBUG
static
std::string dumpStates(const dynamic_bitset<> &s) {
std::ostringstream oss;
for (size_t i = s.find_first(); i != s.npos; i = s.find_next(i)) {
oss << i << " ";
}
return oss.str();
}
#endif
static
void step(const NGHolder &g, const vector<StateInfo> &info,
const dynamic_bitset<> &in, dynamic_bitset<> *out) {
out->reset();
for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) {
NFAVertex u = info[i].vertex;
for (auto v : adjacent_vertices_range(u, g)) {
out->set(g[v].index);
}
}
}
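/** \brief Clear from \p states any state whose reachability does not
 * intersect \p cr. */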
static
void filter_by_reach(const vector<StateInfo> &info, dynamic_bitset<> *states,
const CharReach &cr) {
for (size_t i = states->find_first(); i != states->npos;
i = states->find_next(i)) {
if ((info[i].reach & cr).none()) {
states->reset(i);
}
}
}
template<typename inputT>
static
void execute_graph_i(const NGHolder &g, const vector<StateInfo> &info,
const inputT &input, dynamic_bitset<> *states,
bool kill_sds) {
dynamic_bitset<> &curr = *states;
dynamic_bitset<> next(curr.size());
DEBUG_PRINTF("%zu states in\n", states->count());
for (const auto &e : input) {
DEBUG_PRINTF("processing %s\n", describeClass(e).c_str());
step(g, info, curr, &next);
if (kill_sds) {
next.reset(NODE_START_DOTSTAR);
}
filter_by_reach(info, &next, e);
next.swap(curr);
if (curr.none()) {
DEBUG_PRINTF("went dead\n");
break;
}
}
DEBUG_PRINTF("%zu states out\n", states->size());
}
static
void fillStateBitset(const NGHolder &g, const set<NFAVertex> &in,
dynamic_bitset<> &out) {
out.reset();
for (auto v : in) {
u32 idx = g[v].index;
out.set(idx);
}
}
static
void fillVertexSet(const dynamic_bitset<> &in,
const vector<StateInfo> &info, set<NFAVertex> &out) {
out.clear();
for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) {
out.insert(info[i].vertex);
}
}
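/** \brief Build the table mapping vertex index to (vertex, reachability),
 * as used by the bitset-based execution above. */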
static
void fillInfoTable(const NGHolder &g, vector<StateInfo> &info) {
info.resize(num_vertices(g));
for (auto v : vertices_range(g)) {
u32 idx = g[v].index;
const CharReach &cr = g[v].char_reach;
assert(idx < info.size());
info[idx] = StateInfo(v, cr);
}
}
void execute_graph(const NGHolder &g, const ue2_literal &input,
set<NFAVertex> *states, bool kill_sds) {
assert(hasCorrectlyNumberedVertices(g));
vector<StateInfo> info;
fillInfoTable(g, info);
dynamic_bitset<> work_states(num_vertices(g));
fillStateBitset(g, *states, work_states);
execute_graph_i(g, info, input, &work_states, kill_sds);
fillVertexSet(work_states, info, *states);
}
void execute_graph(const NGHolder &g, const vector<CharReach> &input,
set<NFAVertex> *states) {
assert(hasCorrectlyNumberedVertices(g));
vector<StateInfo> info;
fillInfoTable(g, info);
dynamic_bitset<> work_states(num_vertices(g));
fillStateBitset(g, *states, work_states);
execute_graph_i(g, info, input, &work_states, false);
fillVertexSet(work_states, info, *states);
}
typedef boost::reverse_graph<const NFAGraph, const NFAGraph &> RevNFAGraph;
namespace {
class eg_visitor : public boost::default_dfs_visitor {
public:
eg_visitor(const NGHolder &running_g_in, const vector<StateInfo> &info_in,
const NGHolder &input_g_in,
map<NFAVertex, dynamic_bitset<> > &states_in)
: vertex_count(num_vertices(running_g_in)), running_g(running_g_in),
info(info_in), input_g(input_g_in), states(states_in),
succs(vertex_count) {}
void finish_vertex(NFAVertex input_v, const RevNFAGraph &) {
if (input_v == input_g.accept) {
return;
}
assert(input_v != input_g.acceptEod);
DEBUG_PRINTF("finished p%u\n", input_g[input_v].index);
/* finish_vertex is called on a vertex only once all of its parents (in the
 * forward graph) have been finished. Our parents will have pushed all of
 * their successors for us into our stateset. */
states[input_v].resize(vertex_count);
dynamic_bitset<> our_states = states[input_v];
states[input_v].reset();
filter_by_reach(info, &our_states,
input_g[input_v].char_reach);
if (input_v != input_g.startDs &&
edge(input_v, input_v, input_g).second) {
bool changed;
do {
DEBUG_PRINTF("actually not finished -> have self loop\n");
succs.reset();
step(running_g, info, our_states, &succs);
filter_by_reach(info, &succs,
input_g[input_v].char_reach);
dynamic_bitset<> our_states2 = our_states | succs;
changed = our_states2 != our_states;
our_states.swap(our_states2);
} while (changed);
}
DEBUG_PRINTF(" active rstates: %s\n", dumpStates(our_states).c_str());
succs.reset();
step(running_g, info, our_states, &succs);
/* we need to push into all our (forward) children their successors
* from us. */
for (auto v : adjacent_vertices_range(input_v, input_g)) {
DEBUG_PRINTF("pushing our states to pstate %u\n",
input_g[v].index);
if (v == input_g.startDs) {
/* no need for intra start edges */
continue;
}
states[v].resize(vertex_count); // May not yet exist
if (v != input_g.accept) {
states[v] |= succs;
} else {
/* accept is a magical pseudo state which does not consume
* characters and we are using to collect the output states. We
* must fill it with our states rather than our succs. */
DEBUG_PRINTF("prev outputted rstates: %s\n",
dumpStates(states[v]).c_str());
DEBUG_PRINTF("outputted rstates: %s\n",
dumpStates(our_states).c_str());
states[v] |= our_states;
DEBUG_PRINTF("new outputted rstates: %s\n",
dumpStates(states[v]).c_str());
}
}
/* note: the states at this vertex are no longer required */
}
private:
const size_t vertex_count;
const NGHolder &running_g;
const vector<StateInfo> &info;
const NGHolder &input_g;
map<NFAVertex, dynamic_bitset<> > &states; /* vertex in input_g -> set of
states in running_g */
dynamic_bitset<> succs; // temp use internally
};
} // namespace
void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
const set<NFAVertex> &input_start_states,
set<NFAVertex> *states) {
DEBUG_PRINTF("g has %zu vertices, input_dag has %zu vertices\n",
num_vertices(running_g), num_vertices(input_dag));
assert(hasCorrectlyNumberedVertices(running_g));
assert(in_degree(input_dag.acceptEod, input_dag) == 1);
map<NFAVertex, boost::default_color_type> colours;
/* could just use a topo order, but really it is time to pull a slightly
 * bigger gun: DFS */
RevNFAGraph revg(input_dag.g);
map<NFAVertex, dynamic_bitset<> > dfs_states;
vector<StateInfo> info;
fillInfoTable(running_g, info);
dynamic_bitset<> input_fs(num_vertices(running_g));
fillStateBitset(running_g, *states, input_fs);
for (auto v : input_start_states) {
dfs_states[v] = input_fs;
}
depth_first_visit(revg, input_dag.accept,
eg_visitor(running_g, info, input_dag, dfs_states),
make_assoc_property_map(colours));
fillVertexSet(dfs_states[input_dag.accept], info, *states);
#ifdef DEBUG
DEBUG_PRINTF(" output rstates:");
for (auto v : *states) {
printf(" %u", running_g[v].index);
}
printf("\n");
#endif
}
void execute_graph(const NGHolder &running_g, const NGHolder &input_dag,
set<NFAVertex> *states) {
set<NFAVertex> input_start_states = {input_dag.start, input_dag.startDs};
execute_graph(running_g, input_dag, input_start_states, states);
}
} // namespace ue2

67
src/nfagraph/ng_execute.h Normal file
View File

@@ -0,0 +1,67 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Execute an NFA over a given input, returning the set of states that
* are active afterwards.
*/
#ifndef NG_EXECUTE_H
#define NG_EXECUTE_H
#include "ng_holder.h"
#include <set>
#include <vector>
namespace ue2 {
class CharReach;
struct ue2_literal;
void execute_graph(const NGHolder &g, const ue2_literal &input,
std::set<NFAVertex> *states, bool kill_sds = false);
void execute_graph(const NGHolder &g, const std::vector<CharReach> &input,
std::set<NFAVertex> *states);
/** on exit, states contains any state which may still be enabled after
* receiving an input which corresponds to some path through the input_dag from
* start or startDs to accept. input_dag MUST be acyclic aside from self-loops.
*/
void execute_graph(const NGHolder &g, const NGHolder &input_dag,
std::set<NFAVertex> *states);
/* as above, but able to specify the source states for the input graph */
void execute_graph(const NGHolder &g, const NGHolder &input_dag,
const std::set<NFAVertex> &input_start_states,
std::set<NFAVertex> *states);
} // namespace ue2
#endif
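A caller seeds the state set with the graph's start vertices and reads back the survivors. A minimal sketch of that usage (the helper name and the literal are illustrative only, and the two-argument ue2_literal(string, nocase) constructor from util/ue2string.h is assumed):

#include "ng_execute.h"
#include "ng_holder.h"
#include "util/ue2string.h"

#include <set>

// Hypothetical helper: which vertices of g can be active after consuming the
// case-sensitive literal "foo" from the usual start states? The graph must
// have correctly numbered vertices, as execute_graph() asserts.
static std::set<ue2::NFAVertex> statesAfterFoo(const ue2::NGHolder &g) {
    std::set<ue2::NFAVertex> states = {g.start, g.startDs};
    ue2::execute_graph(g, ue2::ue2_literal("foo", false), &states);
    return states;
}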

155
src/nfagraph/ng_expr_info.cpp Normal file
View File

@@ -0,0 +1,155 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Code for discovering properties of an NGWrapper used by
* hs_expression_info.
*/
#include "ng_expr_info.h"
#include "hs_internal.h"
#include "ng.h"
#include "ng_asserts.h"
#include "ng_depth.h"
#include "ng_edge_redundancy.h"
#include "ng_holder.h"
#include "ng_reports.h"
#include "ng_util.h"
#include "ue2common.h"
#include "parser/position.h" // for POS flags
#include "util/boundary_reports.h"
#include "util/compile_context.h"
#include "util/depth.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
#include <limits.h>
#include <set>
using namespace std;
namespace ue2 {
/* get rid of leading \b and multiline ^ vertices */
static
void removeLeadingVirtualVerticesFromRoot(NGWrapper &w, NFAVertex root) {
vector<NFAVertex> victims;
for (auto v : adjacent_vertices_range(root, w)) {
if (w[v].assert_flags & POS_FLAG_VIRTUAL_START) {
DEBUG_PRINTF("(?m)^ vertex or leading \\[bB] vertex\n");
victims.push_back(v);
}
}
for (auto u : victims) {
for (auto v : adjacent_vertices_range(u, w)) {
add_edge_if_not_present(root, v, w);
}
}
remove_vertices(victims, w);
}
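/** \brief Update \p info with the minimum and maximum offsets at which a
 * match can be reported from vertex \p v, applying each report's
 * offsetAdjust. */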
static
void checkVertex(const ReportManager &rm, const NGWrapper &w, NFAVertex v,
const vector<DepthMinMax> &depths, DepthMinMax &info) {
if (is_any_accept(v, w)) {
return;
}
if (is_any_start(v, w)) {
info.min = 0;
info.max = max(info.max, depth(0));
return;
}
u32 idx = w[v].index;
assert(idx < depths.size());
const DepthMinMax &d = depths.at(idx);
for (ReportID report_id : w[v].reports) {
const Report &ir = rm.getReport(report_id);
assert(ir.type == EXTERNAL_CALLBACK);
s32 adjust = ir.offsetAdjust;
info.min = min(info.min, d.min + adjust);
info.max = max(info.max, d.max + adjust);
}
}
static
bool hasOffsetAdjust(const ReportManager &rm, const NGWrapper &w) {
for (const auto &report_id : all_reports(w)) {
if (rm.getReport(report_id).offsetAdjust) {
return true;
}
}
return false;
}
void fillExpressionInfo(ReportManager &rm, NGWrapper &w, hs_expr_info *info) {
assert(info);
/* ensure utf8 starts at cp boundary */
ensureCodePointStart(rm, w);
resolveAsserts(rm, w);
optimiseVirtualStarts(w);
removeLeadingVirtualVerticesFromRoot(w, w.start);
removeLeadingVirtualVerticesFromRoot(w, w.startDs);
vector<DepthMinMax> depths;
calcDepthsFrom(w, w.start, depths);
DepthMinMax d;
for (auto u : inv_adjacent_vertices_range(w.accept, w)) {
checkVertex(rm, w, u, depths, d);
}
for (auto u : inv_adjacent_vertices_range(w.acceptEod, w)) {
checkVertex(rm, w, u, depths, d);
}
if (d.max.is_finite()) {
info->max_width = d.max;
} else {
info->max_width = UINT_MAX;
}
if (d.min.is_finite()) {
info->min_width = d.min;
} else {
info->min_width = UINT_MAX;
}
info->unordered_matches = hasOffsetAdjust(rm, w);
info->matches_at_eod = can_match_at_eod(w);
info->matches_only_at_eod = can_only_match_at_eod(w);
}
} // namespace ue2

50
src/nfagraph/ng_expr_info.h Normal file
View File

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Code for discovering properties of an NGWrapper used by
* hs_expression_info.
*/
#ifndef NG_EXPR_INFO_H
#define NG_EXPR_INFO_H
struct hs_expr_info;
#include "ue2common.h"
namespace ue2 {
class NGWrapper;
class ReportManager;
void fillExpressionInfo(ReportManager &rm, NGWrapper &w, hs_expr_info *info);
} // namespace ue2
#endif // NG_EXPR_INFO_H

878
src/nfagraph/ng_extparam.cpp Normal file
View File

@@ -0,0 +1,878 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Propagate extended parameters to vertex reports and reduce graph if
* possible.
*
* This code handles the propagation of the extension parameters specified by
* the user with the hs_expr_ext structure into the reports on the graph's
* vertices.
*
* There are also some analyses that prune edges that cannot contribute to a
* match given these constraints, or transform the graph in order to make a
* constraint implicit.
*/
#include "ng.h"
#include "ng_depth.h"
#include "ng_dump.h"
#include "ng_extparam.h"
#include "ng_prune.h"
#include "ng_reports.h"
#include "ng_som_util.h"
#include "ng_width.h"
#include "ng_util.h"
#include "ue2common.h"
#include "parser/position.h"
#include "util/compile_context.h"
#include "util/compile_error.h"
#include "util/container.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include <sstream>
#include <string>
using namespace std;
namespace ue2 {
static const u32 MAX_MAXOFFSET_TO_ANCHOR = 2000;
static const u32 MAX_MINLENGTH_TO_CONVERT = 2000;
/** \brief Find the (min, max) offset adjustment for the reports on a given
* vertex. */
static
pair<s32,s32> getMinMaxOffsetAdjust(const ReportManager &rm,
const NGHolder &g, NFAVertex v) {
s32 minAdj = 0, maxAdj = 0;
const auto &reports = g[v].reports;
for (auto ri = reports.begin(), re = reports.end(); ri != re; ++ri) {
const Report &ir = rm.getReport(*ri);
if (ri == reports.begin()) {
minAdj = ir.offsetAdjust;
maxAdj = ir.offsetAdjust;
} else {
minAdj = min(minAdj, ir.offsetAdjust);
maxAdj = max(maxAdj, ir.offsetAdjust);
}
}
return make_pair(minAdj, maxAdj);
}
/** \brief Find the (min, max) length of any match for the given holder. */
static
DepthMinMax findMatchLengths(const ReportManager &rm, const NGHolder &g) {
DepthMinMax match_depths;
vector<DepthMinMax> depths = getDistancesFromSOM(g);
pair<s32, s32> adj;
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
u32 idx = g[v].index;
DepthMinMax d = depths[idx]; // copy
adj = getMinMaxOffsetAdjust(rm, g, v);
DEBUG_PRINTF("vertex %u: depths=%s, adj=[%d,%d]\n", idx,
d.str().c_str(), adj.first, adj.second);
d.min += adj.first;
d.max += adj.second;
match_depths = unionDepthMinMax(match_depths, d);
}
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
if (v == g.accept) {
continue;
}
u32 idx = g[v].index;
DepthMinMax d = depths[idx]; // copy
adj = getMinMaxOffsetAdjust(rm, g, v);
DEBUG_PRINTF("vertex %u: depths=%s, adj=[%d,%d]\n", idx,
d.str().c_str(), adj.first, adj.second);
d.min += adj.first;
d.max += adj.second;
match_depths = unionDepthMinMax(match_depths, d);
}
DEBUG_PRINTF("match_depths=%s\n", match_depths.str().c_str());
assert(match_depths.min.is_reachable());
assert(match_depths.max.is_reachable());
return match_depths;
}
/** \brief Replace the graph's reports with new reports that specify bounds. */
static
void updateReportBounds(ReportManager &rm, NGWrapper &g, NFAVertex accept,
set<NFAVertex> &done) {
for (auto v : inv_adjacent_vertices_range(accept, g)) {
// Don't operate on g.accept itself.
if (v == g.accept) {
assert(accept == g.acceptEod);
continue;
}
// Don't operate on a vertex we've already done.
if (contains(done, v)) {
continue;
}
done.insert(v);
flat_set<ReportID> new_reports;
auto &reports = g[v].reports;
for (auto id : reports) {
Report ir = rm.getReport(id); // make a copy
assert(!ir.hasBounds());
// Note that we need to cope with offset adjustment here.
ir.minOffset = g.min_offset - ir.offsetAdjust;
if (g.max_offset == MAX_OFFSET) {
ir.maxOffset = MAX_OFFSET;
} else {
ir.maxOffset = g.max_offset - ir.offsetAdjust;
}
assert(ir.maxOffset >= ir.minOffset);
ir.minLength = g.min_length;
if (g.min_length && !g.som) {
ir.quashSom = true;
}
DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, "
"min_length=%llu\n",
id, ir.minOffset, ir.maxOffset, ir.minLength);
new_reports.insert(rm.getInternalId(ir));
}
DEBUG_PRINTF("swapping reports on vertex %u\n",
g[v].index);
reports.swap(new_reports);
}
}
static
bool hasVirtualStarts(const NGHolder &g) {
for (auto v : adjacent_vertices_range(g.start, g)) {
if (g[v].assert_flags & POS_FLAG_VIRTUAL_START) {
return true;
}
}
return false;
}
/** If the pattern is unanchored, has a max_offset and has not asked for SOM,
 * we can use that knowledge to anchor it, which will limit its lifespan. Note
 * that we can't use this transformation if there's a min_length, as it's
 * currently handled using "sly SOM".
 *
 * For example, /abc/ with min_offset=10 and max_offset=20 becomes
 * ^.{7,17}abc: the bounds are (min_offset - maxWidth) and
 * (max_offset - minWidth).
 *
 * Note that it is possible to handle graphs that have a combination of
 * anchored and unanchored paths, but it's too tricky for the moment.
 */
static
bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth,
const depth &maxWidth) {
assert(!g.som);
assert(g.max_offset != MAX_OFFSET);
assert(minWidth <= maxWidth);
assert(maxWidth.is_reachable());
DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n",
minWidth.str().c_str(), maxWidth.str().c_str(), g.min_offset,
g.max_offset);
if (g.max_offset > MAX_MAXOFFSET_TO_ANCHOR) {
return false;
}
if (g.max_offset < minWidth) {
assert(0);
return false;
}
// If the pattern has virtual starts, we probably don't want to touch it.
if (hasVirtualStarts(g)) {
DEBUG_PRINTF("virtual starts, bailing\n");
return false;
}
// Similarly, bail if the pattern is vacuous. TODO: this could be done, we
// would just need to be a little careful with reports.
if (isVacuous(g)) {
DEBUG_PRINTF("vacuous, bailing\n");
return false;
}
u32 min_bound, max_bound;
if (maxWidth.is_infinite()) {
min_bound = 0;
max_bound = g.max_offset - minWidth;
} else {
min_bound = g.min_offset > maxWidth ? g.min_offset - maxWidth : 0;
max_bound = g.max_offset - minWidth;
}
DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound);
vector<NFAVertex> initials;
for (auto v : adjacent_vertices_range(g.startDs, g)) {
if (v == g.startDs) {
continue;
}
initials.push_back(v);
}
if (initials.empty()) {
DEBUG_PRINTF("no initial vertices\n");
return false;
}
// Wire up 'min_offset' mandatory dots from anchored start.
NFAVertex u = g.start;
for (u32 i = 0; i < min_bound; i++) {
NFAVertex v = add_vertex(g);
g[v].char_reach.setall();
add_edge(u, v, g);
u = v;
}
NFAVertex head = u;
// Wire up optional dots for (max_offset - min_offset).
for (u32 i = 0; i < max_bound - min_bound; i++) {
NFAVertex v = add_vertex(g);
g[v].char_reach.setall();
if (head != u) {
add_edge(head, v, g);
}
add_edge(u, v, g);
u = v;
}
// Remove edges from starts and wire both head and u to our initials.
for (auto v : initials) {
remove_edge(g.startDs, v, g);
remove_edge(g.start, v, g);
if (head != u) {
add_edge(head, v, g);
}
add_edge(u, v, g);
}
g.renumberVertices();
g.renumberEdges();
return true;
}
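/** \brief Find the single cyclic (self-looping) vertex other than startDs;
 * returns null_vertex() if there is no such vertex or more than one. */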
static
NFAVertex findSingleCyclic(const NGHolder &g) {
NFAVertex v = NFAGraph::null_vertex();
for (const auto &e : edges_range(g)) {
if (source(e, g) == target(e, g)) {
if (source(e, g) == g.startDs) {
continue;
}
if (v != NFAGraph::null_vertex()) {
// More than one cyclic vertex.
return NFAGraph::null_vertex();
}
v = source(e, g);
}
}
if (v != NFAGraph::null_vertex()) {
DEBUG_PRINTF("cyclic is %u\n", g[v].index);
assert(!is_special(v, g));
}
return v;
}
static
bool hasOffsetAdjust(const ReportManager &rm, NGWrapper &g,
int *adjust) {
const auto &reports = all_reports(g);
if (reports.empty()) {
assert(0);
return false;
}
int offsetAdjust = rm.getReport(*reports.begin()).offsetAdjust;
for (auto report : reports) {
const Report &ir = rm.getReport(report);
if (ir.offsetAdjust != offsetAdjust) {
DEBUG_PRINTF("different adjusts!\n");
return false;
}
}
*adjust = offsetAdjust;
return true;
}
/** If the pattern has a min_length and is of "ratchet" form with one unbounded
 * repeat, that repeat can become a bounded repeat.
 *
 * /foo.*bar/{min_length=100} --> /foo.{94,}bar/
 *
 * The bound follows from min_length minus the width of the fixed prefix and
 * suffix (here 3 + 3 = 6): we prepend (min_length - width - 1) mandatory
 * copies of the cyclic vertex's reach, and the cyclic vertex itself supplies
 * at least one more character, giving the .{94,} above.
 */
static
bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) {
assert(g.min_length);
if (g.min_length > MAX_MINLENGTH_TO_CONVERT) {
return false;
}
// If the pattern has virtual starts, we probably don't want to touch it.
if (hasVirtualStarts(g)) {
DEBUG_PRINTF("virtual starts, bailing\n");
return false;
}
// The graph must contain a single cyclic vertex (other than startDs), and
// that vertex can have one pred and one successor.
NFAVertex cyclic = findSingleCyclic(g);
if (cyclic == NFAGraph::null_vertex()) {
return false;
}
NFAGraph::adjacency_iterator ai, ae;
tie(ai, ae) = adjacent_vertices(g.start, g);
if (*ai == g.startDs) {
++ai;
}
NFAVertex v = *ai;
if (++ai != ae) {
DEBUG_PRINTF("more than one initial vertex\n");
return false;
}
u32 width = 0;
// Walk from the start vertex to the cyclic state and ensure we have a
// chain of vertices.
while (v != cyclic) {
DEBUG_PRINTF("vertex %u\n", g[v].index);
width++;
tie(ai, ae) = adjacent_vertices(v, g);
set<NFAVertex> succ(ai, ae);
if (contains(succ, cyclic)) {
if (succ.size() == 1) {
v = cyclic;
} else if (succ.size() == 2) {
// Cyclic and jump edge.
succ.erase(cyclic);
NFAVertex v2 = *succ.begin();
if (!edge(cyclic, v2, g).second) {
DEBUG_PRINTF("bad form\n");
return false;
}
v = cyclic;
} else {
DEBUG_PRINTF("bad form\n");
return false;
}
} else {
if (succ.size() != 1) {
DEBUG_PRINTF("bad form\n");
return false;
}
v = *succ.begin();
}
}
// Check the cyclic state is A-OK.
v = getSoleDestVertex(g, cyclic);
if (v == NFAGraph::null_vertex()) {
DEBUG_PRINTF("cyclic has more than one successor\n");
return false;
}
// Walk from the cyclic state to an accept and ensure we have a chain of
// vertices.
while (!is_any_accept(v, g)) {
DEBUG_PRINTF("vertex %u\n", g[v].index);
width++;
tie(ai, ae) = adjacent_vertices(v, g);
set<NFAVertex> succ(ai, ae);
if (succ.size() != 1) {
DEBUG_PRINTF("bad form\n");
return false;
}
v = *succ.begin();
}
int offsetAdjust = 0;
if (!hasOffsetAdjust(rm, g, &offsetAdjust)) {
return false;
}
DEBUG_PRINTF("adjusting width by %d\n", offsetAdjust);
width += offsetAdjust;
DEBUG_PRINTF("width=%u, vertex %u is cyclic\n", width,
g[cyclic].index);
if (width >= g.min_length) {
DEBUG_PRINTF("min_length=%llu is guaranteed, as width=%u\n",
g.min_length, width);
g.min_length = 0;
return true;
}
vector<NFAVertex> preds;
vector<NFAEdge> dead;
for (auto u : inv_adjacent_vertices_range(cyclic, g)) {
DEBUG_PRINTF("pred %u\n", g[u].index);
if (u == cyclic) {
continue;
}
preds.push_back(u);
// We want to delete the out-edges of each predecessor, but need to
// make sure we don't delete the startDs self loop.
for (const auto &e : out_edges_range(u, g)) {
if (target(e, g) != g.startDs) {
dead.push_back(e);
}
}
}
remove_edges(dead, g);
assert(!preds.empty());
const CharReach &cr = g[cyclic].char_reach;
for (u32 i = 0; i < g.min_length - width - 1; ++i) {
v = add_vertex(g);
g[v].char_reach = cr;
for (auto u : preds) {
add_edge(u, v, g);
}
preds.clear();
preds.push_back(v);
}
assert(!preds.empty());
for (auto u : preds) {
add_edge(u, cyclic, g);
}
g.renumberVertices();
g.renumberEdges();
clearReports(g);
g.min_length = 0;
return true;
}
static
bool hasExtParams(const NGWrapper &g) {
if (g.min_length != 0) {
return true;
}
if (g.min_offset != 0) {
return true;
}
if (g.max_offset != MAX_OFFSET) {
return true;
}
return false;
}
static
depth maxDistFromStart(const NFAVertexBidiDepth &d) {
if (!d.fromStartDotStar.max.is_unreachable()) {
// A path from startDs, any path, implies we can match at any offset.
return depth::infinity();
}
return d.fromStart.max;
}
static
const depth& maxDistToAccept(const NFAVertexBidiDepth &d) {
if (d.toAccept.max.is_unreachable()) {
return d.toAcceptEod.max;
} else if (d.toAcceptEod.max.is_unreachable()) {
return d.toAccept.max;
}
return max(d.toAccept.max, d.toAcceptEod.max);
}
static
const depth& minDistFromStart(const NFAVertexBidiDepth &d) {
return min(d.fromStartDotStar.min, d.fromStart.min);
}
static
const depth& minDistToAccept(const NFAVertexBidiDepth &d) {
return min(d.toAccept.min, d.toAcceptEod.min);
}
static
bool isEdgePrunable(const NGWrapper &g,
const vector<NFAVertexBidiDepth> &depths,
const NFAEdge &e) {
const NFAVertex u = source(e, g);
const NFAVertex v = target(e, g);
DEBUG_PRINTF("edge (%u,%u)\n", g[u].index,
g[v].index);
// Leave our special-to-special edges alone.
if (is_special(u, g) && is_special(v, g)) {
DEBUG_PRINTF("ignoring special-to-special\n");
return false;
}
// We must be careful around start: we don't want to remove (start, v) if
// (startDs, v) exists as well, since later code will assume the presence
// of both edges, but other cases are OK.
if (u == g.start && edge(g.startDs, v, g).second) {
DEBUG_PRINTF("ignoring unanchored start edge\n");
return false;
}
u32 u_idx = g[u].index;
u32 v_idx = g[v].index;
assert(u_idx < depths.size() && v_idx < depths.size());
const NFAVertexBidiDepth &du = depths.at(u_idx);
const NFAVertexBidiDepth &dv = depths.at(v_idx);
if (g.min_offset) {
depth max_offset = maxDistFromStart(du) + maxDistToAccept(dv);
if (max_offset.is_finite() && max_offset < g.min_offset) {
DEBUG_PRINTF("max_offset=%s too small\n", max_offset.str().c_str());
return true;
}
}
if (g.max_offset != MAX_OFFSET) {
depth min_offset = minDistFromStart(du) + minDistToAccept(dv);
assert(min_offset.is_finite());
if (min_offset > g.max_offset) {
DEBUG_PRINTF("min_offset=%s too large\n", min_offset.str().c_str());
return true;
}
}
if (g.min_length && is_any_accept(v, g)) {
// Simple take on min_length. If we're an edge to accept and our max
// dist from start is too small, we can be pruned.
const depth &width = du.fromStart.max;
if (width.is_finite() && width < g.min_length) {
DEBUG_PRINTF("max width %s from start too small for min_length\n",
width.str().c_str());
return true;
}
}
return false;
}
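/** \brief Remove edges that cannot lie on any path satisfying the
 * min_offset/max_offset/min_length constraints, then prune any vertices left
 * useless as a result. */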
static
void pruneExtUnreachable(NGWrapper &g) {
vector<NFAVertexBidiDepth> depths;
calcDepths(g, depths);
vector<NFAEdge> dead;
for (const auto &e : edges_range(g)) {
if (isEdgePrunable(g, depths, e)) {
DEBUG_PRINTF("pruning\n");
dead.push_back(e);
}
}
if (dead.empty()) {
return;
}
remove_edges(dead, g);
pruneUseless(g);
}
/** Remove vacuous edges in graphs where the min_offset or min_length
* constraints dictate that they can never produce a match. */
static
void pruneVacuousEdges(NGWrapper &g) {
if (!g.min_length && !g.min_offset) {
return;
}
vector<NFAEdge> dead;
for (const auto &e : edges_range(g)) {
const NFAVertex u = source(e, g);
const NFAVertex v = target(e, g);
// Special case: Crudely remove vacuous edges from start in graphs with a
// min_offset.
if (g.min_offset && u == g.start && is_any_accept(v, g)) {
DEBUG_PRINTF("vacuous edge in graph with min_offset!\n");
dead.push_back(e);
continue;
}
// If a min_length is set, vacuous edges can be removed.
if (g.min_length && is_any_start(u, g) && is_any_accept(v, g)) {
DEBUG_PRINTF("vacuous edge in graph with min_length!\n");
dead.push_back(e);
continue;
}
}
if (dead.empty()) {
return;
}
remove_edges(dead, g);
pruneUseless(g);
}
static
void pruneUnmatchable(NGWrapper &g, const vector<DepthMinMax> &depths,
const ReportManager &rm, NFAVertex accept) {
vector<NFAEdge> dead;
for (const auto &e : in_edges_range(accept, g)) {
NFAVertex v = source(e, g);
if (v == g.accept) {
assert(accept == g.acceptEod); // stylised edge
continue;
}
u32 idx = g[v].index;
DepthMinMax d = depths[idx]; // copy
pair<s32, s32> adj = getMinMaxOffsetAdjust(rm, g, v);
DEBUG_PRINTF("vertex %u: depths=%s, adj=[%d,%d]\n", idx,
d.str().c_str(), adj.first, adj.second);
d.min += adj.first;
d.max += adj.second;
if (d.max.is_finite() && d.max < g.min_length) {
DEBUG_PRINTF("prune, max match length %s < min_length=%llu\n",
d.max.str().c_str(), g.min_length);
dead.push_back(e);
continue;
}
if (g.max_offset != MAX_OFFSET && d.min > g.max_offset) {
DEBUG_PRINTF("prune, min match length %s > max_offset=%llu\n",
d.min.str().c_str(), g.max_offset);
dead.push_back(e);
continue;
}
}
remove_edges(dead, g);
}
/** Remove edges to accepts that can never produce a match long enough to
* satisfy our min_length and max_offset constraints. */
static
void pruneUnmatchable(NGWrapper &g, const ReportManager &rm) {
if (!g.min_length) {
return;
}
vector<DepthMinMax> depths = getDistancesFromSOM(g);
pruneUnmatchable(g, depths, rm, g.accept);
pruneUnmatchable(g, depths, rm, g.acceptEod);
pruneUseless(g);
}
static
bool isUnanchored(const NGHolder &g) {
for (auto v : adjacent_vertices_range(g.start, g)) {
if (!edge(g.startDs, v, g).second) {
DEBUG_PRINTF("fail, %u is anchored vertex\n",
g[v].index);
return false;
}
}
return true;
}
static
bool hasOffsetAdjustments(const ReportManager &rm, const NGHolder &g) {
for (auto report : all_reports(g)) {
const Report &ir = rm.getReport(report);
if (ir.offsetAdjust) {
return true;
}
}
return false;
}
void handleExtendedParams(ReportManager &rm, NGWrapper &g,
UNUSED const CompileContext &cc) {
if (!hasExtParams(g)) {
return;
}
depth minWidth = findMinWidth(g);
depth maxWidth = findMaxWidth(g);
bool is_anchored = !has_proper_successor(g.startDs, g)
&& out_degree(g.start, g);
bool has_offset_adj = hasOffsetAdjustments(rm, g);
DEBUG_PRINTF("minWidth=%s, maxWidth=%s, anchored=%d, offset_adj=%d\n",
minWidth.str().c_str(), maxWidth.str().c_str(), is_anchored,
has_offset_adj);
DepthMinMax match_depths = findMatchLengths(rm, g);
DEBUG_PRINTF("match depths %s\n", match_depths.str().c_str());
if (is_anchored && maxWidth.is_finite() && g.min_offset > maxWidth) {
ostringstream oss;
oss << "Expression is anchored and cannot satisfy min_offset="
<< g.min_offset << " as it can only produce matches of length "
<< maxWidth << " bytes at most.";
throw CompileError(g.expressionIndex, oss.str());
}
if (minWidth > g.max_offset) {
ostringstream oss;
oss << "Expression has max_offset=" << g.max_offset << " but requires "
<< minWidth << " bytes to match.";
throw CompileError(g.expressionIndex, oss.str());
}
if (maxWidth.is_finite() && match_depths.max < g.min_length) {
ostringstream oss;
oss << "Expression has min_length=" << g.min_length << " but can "
"only produce matches of length " << match_depths.max <<
" bytes at most.";
throw CompileError(g.expressionIndex, oss.str());
}
if (g.min_length && g.min_length <= match_depths.min) {
DEBUG_PRINTF("min_length=%llu constraint is unnecessary\n",
g.min_length);
g.min_length = 0;
}
if (!hasExtParams(g)) {
return;
}
pruneVacuousEdges(g);
pruneUnmatchable(g, rm);
if (!has_offset_adj) {
pruneExtUnreachable(g);
}
// We may have removed all the edges to accept, in which case this
// expression cannot match.
if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) {
throw CompileError(g.expressionIndex, "Extended parameter "
"constraints can not be satisfied for any match from "
"this expression.");
}
// Remove reports on vertices without an edge to accept (which have been
// pruned above).
clearReports(g);
// Recalc.
minWidth = findMinWidth(g);
maxWidth = findMaxWidth(g);
is_anchored = proper_out_degree(g.startDs, g) == 0 &&
out_degree(g.start, g);
has_offset_adj = hasOffsetAdjustments(rm, g);
// If the pattern is completely anchored and has a min_length set, this can
// be converted to a min_offset.
if (g.min_length && (g.min_offset <= g.min_length) && is_anchored) {
DEBUG_PRINTF("converting min_length to min_offset=%llu for "
"anchored case\n", g.min_length);
g.min_offset = g.min_length;
g.min_length = 0;
}
if (g.min_offset && g.min_offset <= minWidth && !has_offset_adj) {
DEBUG_PRINTF("min_offset=%llu constraint is unnecessary\n",
g.min_offset);
g.min_offset = 0;
}
if (!hasExtParams(g)) {
return;
}
// If the pattern has a min_length and is of "ratchet" form with one
// unbounded repeat, that repeat can become a bounded repeat.
// e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/
if (g.min_length && transformMinLengthToRepeat(rm, g)) {
DEBUG_PRINTF("converted min_length to bounded repeat\n");
// recalc
minWidth = findMinWidth(g);
}
// If the pattern is unanchored, has a max_offset and has not asked for
// SOM, we can use that knowledge to anchor it which will limit its
// lifespan. Note that we can't use this transformation if there's a
// min_length, as it's currently handled using "sly SOM".
// Note that it is possible to handle graphs that have a combination of
// anchored and unanchored paths, but it's too tricky for the moment.
if (g.max_offset != MAX_OFFSET && !g.som && !g.min_length &&
!has_offset_adj && isUnanchored(g)) {
if (anchorPatternWithBoundedRepeat(g, minWidth, maxWidth)) {
DEBUG_PRINTF("minWidth=%s, maxWidth=%s\n", minWidth.str().c_str(),
maxWidth.str().c_str());
if (minWidth == maxWidth) {
// For a fixed width pattern, we can retire the offsets as they
// are implicit in the graph now.
g.min_offset = 0;
g.max_offset = MAX_OFFSET;
}
}
}
//dumpGraph("final.dot", g.g);
if (!hasExtParams(g)) {
return;
}
set<NFAVertex> done;
updateReportBounds(rm, g, g.accept, done);
updateReportBounds(rm, g, g.acceptEod, done);
}
} // namespace ue2

48
src/nfagraph/ng_extparam.h Normal file
View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Propagate extended parameters to vertex reports and reduce graph if
* possible.
*/
#ifndef NG_EXTPARAM_H
#define NG_EXTPARAM_H
namespace ue2 {
struct CompileContext;
class NGWrapper;
class ReportManager;
void handleExtendedParams(ReportManager &rm, NGWrapper &g,
const CompileContext &cc);
} // namespace ue2
#endif

142
src/nfagraph/ng_fixed_width.cpp Normal file
View File

@@ -0,0 +1,142 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Rose mask construction from NGHolder.
*/
#include "ng_fixed_width.h"
#include "grey.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "rose/rose_build.h"
#include "util/container.h"
#include "ue2common.h"
#include <algorithm>
#include <iterator>
#include <set>
using namespace std;
namespace ue2 {
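/** \brief Check whether the graph is a simple chain of vertices from an
 * anchored or floating start through to accept; if so, fill \p mask with the
 * per-position reachability and \p reports with the reports on the final
 * vertex. */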
static
bool findMask(const NGHolder &g, vector<CharReach> *mask, bool *anchored,
ue2::flat_set<ReportID> *reports) {
DEBUG_PRINTF("looking for a mask pattern\n");
set<NFAVertex> s_succ;
insert(&s_succ, adjacent_vertices(g.start, g));
set<NFAVertex> sds_succ;
insert(&sds_succ, adjacent_vertices(g.startDs, g));
*anchored = sds_succ.size() == 1; /* sds itself */
bool floating = is_subset_of(s_succ, sds_succ);
DEBUG_PRINTF("sds %zu s %zu%s%s\n", sds_succ.size(), s_succ.size(),
*anchored ? " anchored" : "", floating ? " floating" : "");
if (!*anchored && !floating) {
DEBUG_PRINTF("semi-anchored\n");
return false;
}
set<NFAVertex> &succs = *anchored ? s_succ : sds_succ;
succs.erase(g.startDs);
if (succs.size() != 1) {
DEBUG_PRINTF("branchy root\n");
return false;
}
NFAVertex u = *anchored ? g.start : g.startDs;
NFAVertex v = *succs.begin();
while (true) {
DEBUG_PRINTF("validating vertex %u\n", g[v].index);
assert(v != g.acceptEod);
// If we've reached an accept, we MAY have found a valid Rose pattern
if (v == g.accept) {
DEBUG_PRINTF("accept\n");
insert(reports, g[u].reports);
return true;
}
mask->push_back(g[v].char_reach);
if (out_degree(v, g) != 1) {
DEBUG_PRINTF("out_degree != 1\n");
return false; /* not a chain */
}
u = v;
v = *adjacent_vertices(v, g).first;
if (in_degree(v, g) != 1) {
DEBUG_PRINTF("blargh\n"); /* picks up cases where there is no path
* to case accept (large cycles),
* ensures term */
return false;
}
}
}
bool handleFixedWidth(RoseBuild &rose, const NGHolder &g, const Grey &grey) {
if (!grey.roseMasks) {
return false;
}
if (in_degree(g.acceptEod,g) != 1) {
DEBUG_PRINTF("EOD anchoring not supported\n");
return false;
}
ue2::flat_set<ReportID> reports;
bool anchored = false;
vector<CharReach> mask;
if (!findMask(g, &mask, &anchored, &reports)) {
return false;
}
DEBUG_PRINTF("%smasky masky\n", anchored ? "anchored " : "");
assert(!mask.empty());
assert(!reports.empty());
if (rose.add(anchored, mask, reports)) {
DEBUG_PRINTF("added as rose mask\n");
return true;
} else {
DEBUG_PRINTF("failed to add masky\n");
return false;
}
}
} // namespace ue2

46
src/nfagraph/ng_fixed_width.h Normal file
View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Rose mask construction from NGHolder.
*/
#ifndef NG_FIXED_WIDTH_H
#define NG_FIXED_WIDTH_H
namespace ue2 {
class RoseBuild;
class NGHolder;
struct Grey;
bool handleFixedWidth(RoseBuild &build, const NGHolder &g, const Grey &grey);
} // namespace ue2
#endif // NG_FIXED_WIDTH_H

114
src/nfagraph/ng_graph.h Normal file
View File

@@ -0,0 +1,114 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Definition of the NFAGraph type used for all NFA graph
* representations.
*
* Note that most of the time we don't work on a bare NFAGraph: instead
* we use an NGHolder, which wraps the graph and defines our special vertices,
* etc.
*/
#ifndef NG_GRAPH_H
#define NG_GRAPH_H
#include "util/charreach.h"
#include "util/ue2_containers.h"
#include "ue2common.h"
#include <boost/graph/adjacency_iterator.hpp>
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/graph_traits.hpp>
namespace ue2 {
/** \brief Properties associated with each vertex in an NFAGraph. */
struct NFAGraphVertexProps {
/** \brief Set of characters on which this vertex is reachable. */
CharReach char_reach;
/** \brief Set of reports raised by this vertex. */
ue2::flat_set<ReportID> reports;
/** \brief Unique index for this vertex, used for BGL algorithms. */
u32 index = 0;
/** \brief Flags associated with assertions. */
u32 assert_flags = 0;
};
/** \brief Properties associated with each edge in an NFAGraph. */
struct NFAGraphEdgeProps {
/** \brief Unique index for this edge, used for BGL algorithms. */
u32 index = 0;
/** \brief For graphs that will be implemented as multi-top engines, this
* specifies the top event. Only used on edges from the start vertex. */
u32 top = 0;
/** \brief Flags associated with assertions. */
u32 assert_flags = 0;
};
// For flexibility: boost::listS, boost::listS for out-edge and vertex lists.
// boost::bidirectionalS for directed graph so that we can get at in-edges.
typedef boost::adjacency_list<boost::listS,
boost::listS,
boost::bidirectionalS,
NFAGraphVertexProps,
NFAGraphEdgeProps> NFAGraph;
typedef NFAGraph::vertex_descriptor NFAVertex;
typedef NFAGraph::edge_descriptor NFAEdge;
/** \brief vertex_index values for special nodes in the NFAGraph. */
enum SpecialNodes {
/** \brief Anchored start vertex. WARNING: this may be triggered at various
* locations (not just zero) for triggered graphs. */
NODE_START,
/** \brief Unanchored start-dotstar vertex. WARNING: this may not have a
* proper self-loop. */
NODE_START_DOTSTAR,
/** \brief Accept vertex. All vertices that can match at arbitrary offsets
* must have an edge to this vertex. */
NODE_ACCEPT,
/** \brief Accept-EOD vertex. Vertices that must raise a match at EOD only
* must have an edge to this vertex. */
NODE_ACCEPT_EOD,
/** \brief Sentinel, number of special vertices. */
N_SPECIALS
};
} // namespace ue2
#endif
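Since the vertex and edge properties above are bundled, they are accessed by subscripting the graph. A small illustrative sketch (not part of this header; the include path and helper name are assumed) that builds a two-vertex graph and fills in those fields:

#include "nfagraph/ng_graph.h"

// Illustrative only: one 'a' transition between two plain vertices. The index
// fields must be kept unique by the caller, as noted above.
static ue2::NFAGraph makeTinyGraph() {
    ue2::NFAGraph g;
    ue2::NFAVertex u = add_vertex(g);
    ue2::NFAVertex v = add_vertex(g);
    g[u].index = 0;
    g[v].index = 1;
    g[v].char_reach.set('a'); // v is reachable on 'a'
    ue2::NFAEdge e = add_edge(u, v, g).first;
    g[e].index = 0;
    return g;
}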

842
src/nfagraph/ng_haig.cpp Normal file
View File

@@ -0,0 +1,842 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Build code for Haig SOM DFA.
*/
#include "ng_haig.h"
#include "grey.h"
#include "nfa/goughcompile.h"
#include "ng_holder.h"
#include "ng_mcclellan_internal.h"
#include "ng_restructuring.h"
#include "ng_som_util.h"
#include "ng_squash.h"
#include "ng_util.h"
#include "util/bitfield.h"
#include "util/container.h"
#include "util/determinise.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/ue2_containers.h"
#include <algorithm>
#include <functional>
#include <map>
#include <set>
#include <vector>
#include <boost/dynamic_bitset.hpp>
using namespace std;
using boost::dynamic_bitset;
namespace ue2 {
#define NFA_STATE_LIMIT 256
#define HAIG_MAX_NFA_STATE 600
#define HAIG_MAX_LIVE_SOM_SLOTS 32
namespace {
struct haig_too_wide {
};
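/** \brief Fill \p init with the start vertices that have states assigned, and
 * \p initDS with those that stay switched on (self-loop, or any start in a
 * triggered graph); also builds the index-to-vertex table \p v_by_index. */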
template<typename stateset>
static
void populateInit(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
stateset *init, stateset *initDS,
vector<NFAVertex> *v_by_index) {
DEBUG_PRINTF("graph kind: %u\n", (int)g.kind);
for (auto v : vertices_range(g)) {
u32 v_index = g[v].index;
if (state_ids.at(v) == NO_STATE) {
continue;
}
if (is_any_start(v, g)) {
init->set(v_index);
if (hasSelfLoop(v, g) || is_triggered(g)) {
DEBUG_PRINTF("setting %u\n", v_index);
initDS->set(v_index);
}
}
assert(v_index < init->size());
}
v_by_index->clear();
v_by_index->resize(num_vertices(g), NFAGraph::null_vertex());
for (auto v : vertices_range(g)) {
u32 v_index = g[v].index;
assert((*v_by_index)[v_index] == NFAGraph::null_vertex());
(*v_by_index)[v_index] = v;
}
}
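/** \brief Set in \p accept the bits of all vertices with an edge to accept,
 * and in \p acceptEod those (other than accept itself) with an edge to
 * acceptEod. */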
template<typename StateSet>
void populateAccepts(const NGHolder &g, StateSet *accept, StateSet *acceptEod) {
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
accept->set(g[v].index);
}
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
if (v == g.accept) {
continue;
}
acceptEod->set(g[v].index);
}
}
class Automaton_Base {
public:
Automaton_Base(const NGHolder &graph_in,
const ue2::unordered_map<NFAVertex, u32> &state_ids_in)
: graph(graph_in), state_ids(state_ids_in) {
calculateAlphabet(graph, alpha, unalpha, &alphasize);
assert(alphasize <= ALPHABET_SIZE);
}
static bool canPrune(const flat_set<ReportID> &) { return false; }
const NGHolder &graph;
const ue2::unordered_map<NFAVertex, u32> &state_ids;
array<u16, ALPHABET_SIZE> alpha;
array<u16, ALPHABET_SIZE> unalpha;
u16 alphasize;
set<dstate_id_t> done_a;
set<dstate_id_t> done_b;
u16 start_anchored;
u16 start_floating;
};
class Automaton_Big : public Automaton_Base {
public:
typedef dynamic_bitset<> StateSet;
typedef map<StateSet, dstate_id_t> StateMap;
Automaton_Big(const NGHolder &graph_in,
const ue2::unordered_map<NFAVertex, u32> &state_ids_in,
som_type som, const vector<vector<CharReach>> &triggers,
bool unordered_som)
: Automaton_Base(graph_in, state_ids_in), numStates(num_vertices(graph)),
init(numStates), initDS(numStates), squash(numStates),
accept(numStates), acceptEod(numStates), toppable(numStates),
dead(numStates) {
populateInit(graph, state_ids, &init, &initDS, &v_by_index);
populateAccepts(graph, &accept, &acceptEod);
start_anchored = DEAD_STATE + 1;
if (initDS == init) {
start_floating = start_anchored;
} else if (initDS.any()) {
start_floating = start_anchored + 1;
} else {
start_floating = DEAD_STATE;
}
if (!unordered_som) {
for (const auto &sq : findSquashers(graph, som)) {
NFAVertex v = sq.first;
u32 vert_id = graph[v].index;
squash.set(vert_id);
squash_mask[vert_id] = shrinkStateSet(sq.second);
}
}
cr_by_index = populateCR(graph, v_by_index, alpha);
if (is_triggered(graph)) {
markToppableStarts(graph, state_ids, false, triggers, &toppable);
}
}
private:
// Convert an NFAStateSet (as used by the squash code) into a StateSet.
StateSet shrinkStateSet(const NFAStateSet &in) const {
StateSet out(dead.size());
for (size_t i = in.find_first(); i != in.npos && i < out.size();
i = in.find_next(i)) {
out.set(i);
}
return out;
}
public:
void transition(const StateSet &in, StateSet *next) {
transition_graph(*this, v_by_index, in, next);
}
const vector<StateSet> initial() {
vector<StateSet> rv(1, init);
if (start_floating != DEAD_STATE && start_floating != start_anchored) {
rv.push_back(initDS);
}
return rv;
}
private:
void reports_i(const StateSet &in, bool eod, flat_set<ReportID> &rv) {
StateSet acc = in & (eod ? acceptEod : accept);
for (size_t i = acc.find_first(); i != StateSet::npos;
i = acc.find_next(i)) {
NFAVertex v = v_by_index[i];
DEBUG_PRINTF("marking report\n");
const auto &my_reports = graph[v].reports;
rv.insert(my_reports.begin(), my_reports.end());
}
}
public:
void reports(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, false, rv);
}
void reportsEod(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, true, rv);
}
public:
u32 numStates;
vector<NFAVertex> v_by_index;
vector<CharReach> cr_by_index; /* pre alpha'ed */
StateSet init;
StateSet initDS;
StateSet squash; /* states which allow us to mask out other states */
StateSet accept;
StateSet acceptEod;
StateSet toppable; /* states which are allowed to be on when a top arrives,
* triggered dfas only */
map<u32, StateSet> squash_mask;
StateSet dead;
};
class Automaton_Graph : public Automaton_Base {
public:
typedef bitfield<NFA_STATE_LIMIT> StateSet;
typedef ue2::unordered_map<StateSet, dstate_id_t> StateMap;
Automaton_Graph(const NGHolder &graph_in,
const ue2::unordered_map<NFAVertex, u32> &state_ids_in,
som_type som, const vector<vector<CharReach>> &triggers,
bool unordered_som)
: Automaton_Base(graph_in, state_ids_in) {
populateInit(graph, state_ids, &init, &initDS, &v_by_index);
populateAccepts(graph, &accept, &acceptEod);
start_anchored = DEAD_STATE + 1;
if (initDS == init) {
start_floating = start_anchored;
} else if (initDS.any()) {
start_floating = start_anchored + 1;
} else {
start_floating = DEAD_STATE;
}
if (!unordered_som) {
for (const auto &sq : findSquashers(graph, som)) {
NFAVertex v = sq.first;
u32 vert_id = graph[v].index;
squash.set(vert_id);
squash_mask[vert_id] = shrinkStateSet(sq.second);
}
}
cr_by_index = populateCR(graph, v_by_index, alpha);
if (is_triggered(graph)) {
dynamic_bitset<> temp(NFA_STATE_LIMIT);
markToppableStarts(graph, state_ids, false, triggers, &temp);
toppable = bitfield<NFA_STATE_LIMIT>(temp);
}
}
private:
// Convert an NFAStateSet (as used by the squash code) into a StateSet.
StateSet shrinkStateSet(const NFAStateSet &in) const {
StateSet out;
for (size_t i = in.find_first(); i != in.npos && i < out.size();
i = in.find_next(i)) {
out.set(i);
}
return out;
}
public:
void transition(const StateSet &in, StateSet *next) {
transition_graph(*this, v_by_index, in, next);
}
const vector<StateSet> initial() {
vector<StateSet> rv(1, init);
if (start_floating != DEAD_STATE && start_floating != start_anchored) {
rv.push_back(initDS);
}
return rv;
}
private:
void reports_i(const StateSet &in, bool eod, flat_set<ReportID> &rv) {
StateSet acc = in & (eod ? acceptEod : accept);
for (size_t i = acc.find_first(); i != StateSet::npos;
i = acc.find_next(i)) {
NFAVertex v = v_by_index[i];
DEBUG_PRINTF("marking report\n");
const auto &my_reports = graph[v].reports;
rv.insert(my_reports.begin(), my_reports.end());
}
}
public:
void reports(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, false, rv);
}
void reportsEod(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, true, rv);
}
public:
vector<NFAVertex> v_by_index;
vector<CharReach> cr_by_index; /* pre alpha'ed */
StateSet init;
StateSet initDS;
StateSet squash; /* states which allow us to mask out other states */
StateSet accept;
StateSet acceptEod;
StateSet toppable; /* states which are allowed to be on when a top arrives,
* triggered dfas only */
map<u32, StateSet> squash_mask;
StateSet dead;
};
class Automaton_Haig_Merge {
public:
typedef vector<u16> StateSet;
typedef ue2::unordered_map<StateSet, dstate_id_t> StateMap;
explicit Automaton_Haig_Merge(const vector<const raw_som_dfa *> &in)
: nfas(in.begin(), in.end()), dead(in.size()) {
calculateAlphabet();
populateAsFs();
}
void populateAsFs(void) {
bool fs_same = true;
bool fs_dead = true;
as.resize(nfas.size());
fs.resize(nfas.size());
for (u32 i = 0; i < nfas.size(); i++) {
as[i] = nfas[i]->start_anchored;
fs[i] = nfas[i]->start_floating;
if (fs[i]) {
fs_dead = false;
}
if (as[i] != fs[i]) {
fs_same = false;
}
}
start_anchored = DEAD_STATE + 1;
if (fs_same) {
start_floating = start_anchored;
} else if (fs_dead) {
start_floating = DEAD_STATE;
} else {
start_floating = start_anchored + 1;
}
}
void calculateAlphabet(void) {
DEBUG_PRINTF("calculating alphabet\n");
vector<CharReach> esets(1, CharReach::dot());
for (const auto &haig : nfas) {
DEBUG_PRINTF("...next dfa alphabet\n");
assert(haig);
const auto &alpha_remap = haig->alpha_remap;
for (size_t i = 0; i < esets.size(); i++) {
assert(esets[i].any());
if (esets[i].count() == 1) {
DEBUG_PRINTF("skipping singleton eq set\n");
continue;
}
CharReach t;
u8 leader_s = alpha_remap[esets[i].find_first()];
DEBUG_PRINTF("checking eq set, leader %02hhx \n", leader_s);
for (size_t s = esets[i].find_first();
s != CharReach::npos; s = esets[i].find_next(s)) {
if (alpha_remap[s] != leader_s) {
t.set(s);
}
}
if (t.any() && t != esets[i]) {
esets[i] &= ~t;
esets.push_back(t);
}
}
}
alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
}
void transition(const StateSet &in, StateSet *next) {
u16 t[ALPHABET_SIZE];
for (u32 i = 0; i < alphasize; i++) {
next[i].resize(nfas.size());
}
for (u32 j = 0; j < nfas.size(); j++) {
getFullTransitionFromState(*nfas[j], in[j], t);
for (u32 i = 0; i < alphasize; i++) {
next[i][j] = t[unalpha[i]];
}
}
}
const vector<StateSet> initial() {
vector<StateSet> rv(1, as);
if (start_floating != DEAD_STATE && start_floating != start_anchored) {
rv.push_back(fs);
}
return rv;
}
private:
void reports_i(const StateSet &in, flat_set<ReportID> dstate::*r_set,
flat_set<ReportID> &r) {
for (u32 i = 0; i < nfas.size(); i++) {
const auto &rs = nfas[i]->states[in[i]].*r_set;
insert(&r, rs);
}
}
public:
void reports(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, &dstate::reports, rv);
}
void reportsEod(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, &dstate::reports_eod, rv);
}
static bool canPrune(const flat_set<ReportID> &) { return false; }
private:
vector<const raw_som_dfa *> nfas;
vector<dstate_id_t> as;
vector<dstate_id_t> fs;
public:
array<u16, ALPHABET_SIZE> alpha;
array<u16, ALPHABET_SIZE> unalpha;
u16 alphasize;
StateSet dead;
u16 start_anchored;
u16 start_floating;
};
} // namespace
enum bslm_mode {
ONLY_EXISTING,
INCLUDE_INVALID
};
static
bool is_any_start_inc_virtual(NFAVertex v, const NGHolder &g) {
return is_virtual_start(v, g) || is_any_start(v, g);
}
static
s32 getSlotID(const NGHolder &g,
UNUSED const ue2::unordered_map<NFAVertex, u32> &state_ids,
NFAVertex v) {
if (is_triggered(g) && v == g.start) {
assert(state_ids.at(v) != NO_STATE);
} else if (is_any_start_inc_virtual(v, g)) {
return CREATE_NEW_SOM;
}
return g[v].index;
}
template<typename stateset>
static
void haig_do_preds(const NGHolder &g, const stateset &nfa_states,
const vector<NFAVertex> &state_mapping,
som_tran_info &preds) {
for (size_t i = nfa_states.find_first(); i != stateset::npos;
i = nfa_states.find_next(i)) {
NFAVertex v = state_mapping[i];
s32 slot_id = g[v].index;
DEBUG_PRINTF("d vertex %u\n", g[v].index);
vector<u32> &out_map = preds[slot_id];
for (auto u : inv_adjacent_vertices_range(v, g)) {
out_map.push_back(g[u].index);
}
sort(out_map.begin(), out_map.end());
assert(!out_map.empty() || v == g.start);
}
}
template<typename stateset>
static
void haig_do_report(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
NFAVertex accept_v, const stateset &source_nfa_states,
const vector<NFAVertex> &state_mapping,
set<som_report> &out) {
for (size_t i = source_nfa_states.find_first(); i != stateset::npos;
i = source_nfa_states.find_next(i)) {
NFAVertex v = state_mapping[i];
if (!edge(v, accept_v, g).second) {
continue;
}
for (ReportID report_id : g[v].reports) {
out.insert(som_report(report_id, getSlotID(g, state_ids, v)));
}
}
}
static
void haig_note_starts(const NGHolder &g, map<u32, u32> *out) {
if (is_triggered(g)) {
return;
}
DEBUG_PRINTF("seeing who creates new som values\n");
vector<DepthMinMax> depths = getDistancesFromSOM(g);
for (auto v : vertices_range(g)) {
if (is_any_start_inc_virtual(v, g)) {
DEBUG_PRINTF("%u creates new som value\n", g[v].index);
out->emplace(g[v].index, 0U);
continue;
}
if (is_any_accept(v, g)) {
continue;
}
const DepthMinMax &d = depths[g[v].index];
if (d.min == d.max && d.min.is_finite()) {
DEBUG_PRINTF("%u is fixed at %u\n", g[v].index, (u32)d.min);
out->emplace(g[v].index, d.min);
}
}
}
template<class Auto>
static
bool doHaig(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
som_type som, const vector<vector<CharReach>> &triggers,
bool unordered_som, raw_som_dfa *rdfa) {
u32 state_limit = HAIG_FINAL_DFA_STATE_LIMIT; /* haig never backs down from
a fight */
typedef typename Auto::StateSet StateSet;
vector<StateSet> nfa_state_map;
Auto n(g, state_ids, som, triggers, unordered_som);
try {
if (determinise(n, rdfa->states, state_limit, &nfa_state_map)) {
DEBUG_PRINTF("state limit exceeded\n");
return false;
}
} catch (haig_too_wide &) {
DEBUG_PRINTF("too many live som states\n");
return false;
}
rdfa->start_anchored = n.start_anchored;
rdfa->start_floating = n.start_floating;
rdfa->alpha_size = n.alphasize;
rdfa->alpha_remap = n.alpha;
rdfa->state_som.reserve(rdfa->states.size());
for (u32 i = 0; i < rdfa->states.size(); i++) {
rdfa->state_som.push_back(dstate_som());
const StateSet &source_states = nfa_state_map[i];
if (source_states.count() > HAIG_MAX_LIVE_SOM_SLOTS) {
DEBUG_PRINTF("too many live states\n");
return false;
}
DEBUG_PRINTF("generating som info for %u\n", i);
haig_do_preds(g, source_states, n.v_by_index,
rdfa->state_som.back().preds);
haig_do_report(g, state_ids, g.accept, source_states, n.v_by_index,
rdfa->state_som.back().reports);
haig_do_report(g, state_ids, g.acceptEod, source_states, n.v_by_index,
rdfa->state_som.back().reports_eod);
}
haig_note_starts(g, &rdfa->new_som_nfa_states);
rdfa->trigger_nfa_state = NODE_START;
return true;
}
unique_ptr<raw_som_dfa> attemptToBuildHaig(NGHolder &g, som_type som,
u32 somPrecision,
const vector<vector<CharReach> > &triggers,
const Grey &grey, bool unordered_som) {
assert(is_triggered(g) != triggers.empty());
assert(!unordered_som || is_triggered(g));
if (!grey.allowGough) {
/* must be at least one engine capable of handling raw som dfas */
return nullptr;
}
auto state_ids = numberStates(g);
dropUnusedStarts(g, state_ids);
DEBUG_PRINTF("attempting to build haig \n");
assert(allMatchStatesHaveReports(g));
assert(hasCorrectlyNumberedVertices(g));
u32 numStates = num_vertices(g);
if (numStates > HAIG_MAX_NFA_STATE) {
DEBUG_PRINTF("giving up... looks too big\n");
return nullptr;
}
auto rdfa = ue2::make_unique<raw_som_dfa>(g.kind, unordered_som);
DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates);
bool rv;
if (numStates <= NFA_STATE_LIMIT) {
/* fast path */
rv = doHaig<Automaton_Graph>(g, state_ids, som, triggers, unordered_som,
rdfa.get());
} else {
/* not the fast path */
rv = doHaig<Automaton_Big>(g, state_ids, som, triggers, unordered_som,
rdfa.get());
}
if (!rv) {
return nullptr;
}
DEBUG_PRINTF("determinised, building impl dfa (a,f) = (%hu,%hu)\n",
rdfa->start_anchored, rdfa->start_floating);
rdfa->stream_som_loc_width = somPrecision;
assert(rdfa->kind == g.kind);
return rdfa;
}
static
void haig_merge_do_preds(const vector<const raw_som_dfa *> &dfas,
const vector<u32> &per_dfa_adj,
const vector<dstate_id_t> &source_nfa_states,
som_tran_info &som_tran) {
for (u32 d = 0; d < dfas.size(); ++d) {
u32 adj = per_dfa_adj[d];
const som_tran_info &som_tran_d
= dfas[d]->state_som[source_nfa_states[d]].preds;
for (som_tran_info::const_iterator it = som_tran_d.begin();
it != som_tran_d.end(); ++it) {
assert(it->first != CREATE_NEW_SOM);
u32 dest_slot = it->first < N_SPECIALS ? it->first
: it->first + adj;
vector<u32> &out = som_tran[dest_slot];
if (!out.empty()) {
/* stylised specials already done; it does not matter who builds
the preds */
assert(dest_slot < N_SPECIALS);
continue;
}
for (vector<u32>::const_iterator jt = it->second.begin();
jt != it->second.end(); ++jt) {
if (*jt < N_SPECIALS || *jt == CREATE_NEW_SOM) {
out.push_back(*jt);
} else {
out.push_back(*jt + adj);
}
}
}
}
}
static
void haig_merge_note_starts(const vector<const raw_som_dfa *> &dfas,
const vector<u32> &per_dfa_adj,
map<u32, u32> *out) {
for (u32 d = 0; d < dfas.size(); ++d) {
u32 adj = per_dfa_adj[d];
const map<u32, u32> &new_soms = dfas[d]->new_som_nfa_states;
for (map<u32, u32>::const_iterator it = new_soms.begin();
it != new_soms.end(); ++it) {
if (it->first < N_SPECIALS) {
assert(!it->second);
out->emplace(it->first, 0U);
} else {
assert(d + 1 >= per_dfa_adj.size()
|| it->first + adj < per_dfa_adj[d + 1]);
out->emplace(it->first + adj, it->second);
}
}
}
}
static never_inline
void haig_merge_do_report(const vector<const raw_som_dfa *> &dfas,
const vector<u32> &per_dfa_adj,
const vector<dstate_id_t> &source_nfa_states,
bool eod, set<som_report> &out) {
for (u32 d = 0; d < dfas.size(); ++d) {
u32 adj = per_dfa_adj[d];
const set<som_report> &reps = eod
? dfas[d]->state_som[source_nfa_states[d]].reports_eod
: dfas[d]->state_som[source_nfa_states[d]].reports;
for (set<som_report>::const_iterator it = reps.begin();
it != reps.end(); ++it) {
u32 slot = it->slot;
if (slot != CREATE_NEW_SOM && slot >= N_SPECIALS) {
slot += adj;
}
out.insert(som_report(it->report, slot));
}
}
}
static
u32 total_slots_used(const raw_som_dfa &rdfa) {
u32 rv = 0;
for (vector<dstate_som>::const_iterator it = rdfa.state_som.begin();
it != rdfa.state_som.end(); ++it) {
for (som_tran_info::const_iterator jt = it->preds.begin();
jt != it->preds.end(); ++jt) {
assert(jt->first != CREATE_NEW_SOM);
ENSURE_AT_LEAST(&rv, jt->first + 1);
}
}
const map<u32, u32> &new_soms = rdfa.new_som_nfa_states;
for (map<u32, u32>::const_iterator it = new_soms.begin();
it != new_soms.end(); ++it) {
ENSURE_AT_LEAST(&rv, it->first + 1);
}
return rv;
}
unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &dfas,
u32 limit) {
assert(!dfas.empty());
Automaton_Haig_Merge n(dfas);
DEBUG_PRINTF("merging %zu dfas\n", dfas.size());
bool unordered_som = false;
for (const auto &haig : dfas) {
assert(haig);
assert(haig->kind == dfas.front()->kind);
unordered_som |= haig->unordered_som_triggers;
if (haig->states.size() > limit) {
DEBUG_PRINTF("too many states!\n");
return nullptr;
}
}
typedef Automaton_Haig_Merge::StateSet StateSet;
vector<StateSet> nfa_state_map;
auto rdfa = ue2::make_unique<raw_som_dfa>(dfas[0]->kind, unordered_som);
int rv = determinise(n, rdfa->states, limit, &nfa_state_map);
if (rv) {
DEBUG_PRINTF("%d:state limit (%u) exceeded\n", rv, limit);
return nullptr; /* over state limit */
}
rdfa->start_anchored = n.start_anchored;
rdfa->start_floating = n.start_floating;
rdfa->alpha_size = n.alphasize;
rdfa->alpha_remap = n.alpha;
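/* Each component dfa's SOM slots are offset by the total number of slots
* used by the dfas before it, so that slot ids from different components
* do not collide in the merged dfa. */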
vector<u32> per_dfa_adj;
u32 curr_adj = 0;
for (const auto &haig : dfas) {
per_dfa_adj.push_back(curr_adj);
curr_adj += total_slots_used(*haig);
if (curr_adj < per_dfa_adj.back()) {
/* overflowed our som slot count */
return nullptr;
}
}
rdfa->state_som.reserve(rdfa->states.size());
for (u32 i = 0; i < rdfa->states.size(); i++) {
rdfa->state_som.push_back(dstate_som());
const vector<dstate_id_t> &source_nfa_states = nfa_state_map[i];
DEBUG_PRINTF("finishing state %u\n", i);
haig_merge_do_preds(dfas, per_dfa_adj, source_nfa_states,
rdfa->state_som.back().preds);
if (rdfa->state_som.back().preds.size() > HAIG_MAX_LIVE_SOM_SLOTS) {
DEBUG_PRINTF("som slot limit exceeded (%zu)\n",
rdfa->state_som.back().preds.size());
return nullptr;
}
haig_merge_do_report(dfas, per_dfa_adj, source_nfa_states,
false /* not eod */,
rdfa->state_som.back().reports);
haig_merge_do_report(dfas, per_dfa_adj, source_nfa_states,
true /* eod */,
rdfa->state_som.back().reports_eod);
}
haig_merge_note_starts(dfas, per_dfa_adj, &rdfa->new_som_nfa_states);
rdfa->trigger_nfa_state = NODE_START;
DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n",
rdfa->start_anchored, rdfa->start_floating);
rdfa->stream_som_loc_width = dfas[0]->stream_som_loc_width;
return rdfa;
}
} // namespace ue2

68
src/nfagraph/ng_haig.h Normal file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Build code for Haig SOM DFA.
*/
#ifndef NG_HAIG_H
#define NG_HAIG_H
#include "ue2common.h"
#include "som/som.h"
#include <memory>
#include <vector>
namespace ue2 {
class CharReach;
class NGHolder;
struct Grey;
struct raw_som_dfa;
#define HAIG_FINAL_DFA_STATE_LIMIT 16383
#define HAIG_HARD_DFA_STATE_LIMIT 8192
/* unordered_som_triggers being true indicates that a live haig may be subjected
* to later tops arriving with earlier soms (without the haig going dead in
* between)
*/
std::unique_ptr<raw_som_dfa> attemptToBuildHaig(NGHolder &g, som_type som,
u32 somPrecision,
const std::vector<std::vector<CharReach> > &triggers,
const Grey &grey, bool unordered_som_triggers = false);
std::unique_ptr<raw_som_dfa>
attemptToMergeHaig(const std::vector<const raw_som_dfa *> &dfas,
u32 limit = HAIG_HARD_DFA_STATE_LIMIT);
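/* Illustrative usage sketch only, not part of the API above: a caller with a
* SOM-tracking NGHolder would typically try a direct build first and merge
* already-built Haigs afterwards. SOM_LEFT, somPrecision, triggers, grey and
* the haig_a/haig_b pointers below are placeholders supplied by the caller.
*
*     auto haig = attemptToBuildHaig(g, SOM_LEFT, somPrecision, triggers, grey);
*     if (!haig) {
*         // fall back to another SOM implementation strategy
*     }
*
*     std::vector<const raw_som_dfa *> parts = {haig_a.get(), haig_b.get()};
*     auto merged = attemptToMergeHaig(parts); // HAIG_HARD_DFA_STATE_LIMIT
*     if (!merged) {
*         // keep the component Haigs separate
*     }
*/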
} // namespace ue2
#endif

230
src/nfagraph/ng_holder.cpp Normal file
View File

@@ -0,0 +1,230 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
using namespace std;
namespace ue2 {
// internal use only
static NFAVertex addSpecialVertex(NFAGraph &g, SpecialNodes id) {
NFAVertex v = add_vertex(g);
g[v].index = id;
return v;
}
NGHolder::NGHolder(void)
: g(),
// add initial special nodes
start(addSpecialVertex(g, NODE_START)),
startDs(addSpecialVertex(g, NODE_START_DOTSTAR)),
accept(addSpecialVertex(g, NODE_ACCEPT)),
acceptEod(addSpecialVertex(g, NODE_ACCEPT_EOD)),
// misc data
numVertices(N_SPECIALS),
numEdges(0),
isValidNumEdges(true),
isValidNumVertices(true) {
// wire up some fake edges for the stylized bits of the NFA
add_edge(start, startDs, *this);
add_edge(startDs, startDs, *this);
add_edge(accept, acceptEod, *this);
g[start].char_reach.setall();
g[startDs].char_reach.setall();
}
NGHolder::NGHolder(nfa_kind k)
: kind(k), g(),
// add initial special nodes
start(addSpecialVertex(g, NODE_START)),
startDs(addSpecialVertex(g, NODE_START_DOTSTAR)),
accept(addSpecialVertex(g, NODE_ACCEPT)),
acceptEod(addSpecialVertex(g, NODE_ACCEPT_EOD)),
// misc data
numVertices(N_SPECIALS),
numEdges(0),
isValidNumEdges(true),
isValidNumVertices(true) {
// wire up some fake edges for the stylized bits of the NFA
add_edge(start, startDs, *this);
add_edge(startDs, startDs, *this);
add_edge(accept, acceptEod, *this);
g[start].char_reach.setall();
g[startDs].char_reach.setall();
}
NGHolder::~NGHolder(void) {
DEBUG_PRINTF("destroying holder @ %p\n", this);
}
size_t num_edges(NGHolder &h) {
if (!h.isValidNumEdges) {
h.numEdges = num_edges(h.g);
h.isValidNumEdges = true;
}
return h.numEdges;
}
size_t num_edges(const NGHolder &h) {
if (!h.isValidNumEdges) {
return num_edges(h.g);
}
return h.numEdges;
}
size_t num_vertices(NGHolder &h) {
if (!h.isValidNumVertices) {
h.numVertices = num_vertices(h.g);
h.isValidNumVertices = true;
}
return h.numVertices;
}
size_t num_vertices(const NGHolder &h) {
if (!h.isValidNumVertices) {
return num_vertices(h.g);
}
return h.numVertices;
}
void remove_edge(const NFAEdge &e, NGHolder &h) {
remove_edge(e, h.g);
assert(!h.isValidNumEdges || h.numEdges > 0);
h.numEdges--;
}
void remove_edge(NFAVertex u, NFAVertex v, NGHolder &h) {
remove_edge(u, v, h.g);
assert(!h.isValidNumEdges || h.numEdges > 0);
h.numEdges--;
}
void remove_vertex(NFAVertex v, NGHolder &h) {
remove_vertex(v, h.g);
assert(!h.isValidNumVertices || h.numVertices > 0);
h.numVertices--;
}
void clear_vertex(NFAVertex v, NGHolder &h) {
h.isValidNumEdges = false;
clear_vertex_faster(v, h.g);
}
void clear_in_edges(NFAVertex v, NGHolder &h) {
h.isValidNumEdges = false;
clear_in_edges(v, h.g);
}
void clear_out_edges(NFAVertex v, NGHolder &h) {
h.isValidNumEdges = false;
clear_out_edges(v, h.g);
}
void clear_graph(NGHolder &h) {
NFAGraph::vertex_iterator vi, ve;
for (tie(vi, ve) = vertices(h); vi != ve;) {
NFAVertex v = *vi;
++vi;
clear_vertex(v, h);
if (!is_special(v, h)) {
remove_vertex(v, h);
}
}
assert(num_vertices(h) == N_SPECIALS);
// Recreate special stylised edges.
add_edge(h.start, h.startDs, h);
add_edge(h.startDs, h.startDs, h);
add_edge(h.accept, h.acceptEod, h);
}
std::pair<NFAEdge, bool> add_edge(NFAVertex u, NFAVertex v, NGHolder &h) {
assert(edge(u, v, h.g).second == false);
pair<NFAEdge, bool> e = add_edge(u, v, h.g);
h.g[e.first].index = h.numEdges++;
assert(!h.isValidNumEdges || h.numEdges > 0); // no wrapping
h.g[e.first].top = 0;
return e;
}
std::pair<NFAEdge, bool> add_edge(NFAVertex u, NFAVertex v,
const NFAGraph::edge_property_type &ep,
NGHolder &h) {
assert(edge(u, v, h.g).second == false);
pair<NFAEdge, bool> e = add_edge(u, v, ep, h.g);
h.g[e.first].index = h.numEdges++;
assert(!h.isValidNumEdges || h.numEdges > 0); // no wrapping
return e;
}
NFAVertex add_vertex(NGHolder &h) {
NFAVertex v = add_vertex(h.g);
h[v].index = h.numVertices++;
assert(h.numVertices > 0); // no wrapping
return v;
}
NFAVertex add_vertex(const NFAGraph::vertex_property_type &vp, NGHolder &h) {
NFAVertex v = add_vertex(h);
u32 i = h.g[v].index; /* preserve index */
h.g[v] = vp;
h.g[v].index = i;
return v;
}
void NGHolder::renumberEdges() {
numEdges = renumberGraphEdges(g);
isValidNumEdges = true;
}
void NGHolder::renumberVertices() {
numVertices = renumberGraphVertices(g);
isValidNumVertices = true;
}
NFAVertex NGHolder::getSpecialVertex(u32 id) const {
switch (id) {
case NODE_START: return start;
case NODE_START_DOTSTAR: return startDs;
case NODE_ACCEPT: return accept;
case NODE_ACCEPT_EOD: return acceptEod;
default: return nullptr;
}
}
} // namespace ue2

329
src/nfagraph/ng_holder.h Normal file
View File

@@ -0,0 +1,329 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef NG_HOLDER_H
#define NG_HOLDER_H
#include "ng_graph.h"
#include "ue2common.h"
#include "nfa/nfa_kind.h"
#include <boost/graph/adjacency_iterator.hpp>
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/graph_traits.hpp>
namespace ue2 {
/** \brief Encapsulates an NFAGraph, stores special vertices and other
* metadata.
*
* When constructed, the graph will have the following stylised "special"
* edges:
*
* - (start, startDs)
* - (startDs, startDs) (self-loop)
* - (accept, acceptEod)
*/
class NGHolder : boost::noncopyable {
public:
NGHolder(void);
explicit NGHolder(nfa_kind kind);
virtual ~NGHolder(void);
// Pack edge and vertex indices.
// Note: maintaining edge index order can be expensive due to the frequency
// of edge removal/addition, so only renumberEdges() when required by
// operations on edge lists.
void renumberEdges();
void renumberVertices();
NFAVertex getSpecialVertex(u32 id) const;
nfa_kind kind = NFA_OUTFIX; /* Role that this plays in Rose */
/** \brief Underlying graph object */
NFAGraph g;
const NFAVertex start; //!< Anchored start vertex.
const NFAVertex startDs; //!< Unanchored start-dotstar vertex.
const NFAVertex accept; //!< Accept vertex.
const NFAVertex acceptEod; //!< Accept at EOD vertex.
using directed_category = NFAGraph::directed_category;
using edge_parallel_category = NFAGraph::edge_parallel_category;
using traversal_category = NFAGraph::traversal_category;
using vertex_descriptor = NFAGraph::vertex_descriptor;
using edge_descriptor = NFAGraph::edge_descriptor;
using adjacency_iterator = NFAGraph::adjacency_iterator;
using edge_iterator = NFAGraph::edge_iterator;
using in_edge_iterator = NFAGraph::in_edge_iterator;
using inv_adjacency_iterator = NFAGraph::inv_adjacency_iterator;
using out_edge_iterator = NFAGraph::out_edge_iterator;
using vertex_iterator = NFAGraph::vertex_iterator;
using edge_property_type = NFAGraph::edge_property_type;
using vertex_property_type = NFAGraph::vertex_property_type;
// These free functions, which follow the BGL model, are the interface to
// the graph held by this class.
friend size_t num_vertices(NGHolder &h);
friend size_t num_vertices(const NGHolder &h);
friend size_t num_edges(NGHolder &h);
friend size_t num_edges(const NGHolder &h);
friend void remove_vertex(NFAVertex v, NGHolder &h);
friend void clear_vertex(NFAVertex v, NGHolder &h);
friend void clear_in_edges(NFAVertex v, NGHolder &h);
friend void clear_out_edges(NFAVertex v, NGHolder &h);
friend void remove_edge(const NFAEdge &e, NGHolder &h);
friend void remove_edge(NFAVertex u, NFAVertex v, NGHolder &h);
template<class Predicate>
friend void remove_out_edge_if(NFAVertex v, Predicate pred, NGHolder &h) {
boost::remove_out_edge_if(v, pred, h.g);
h.isValidNumEdges = false;
}
template<class Predicate>
friend void remove_in_edge_if(NFAVertex v, Predicate pred, NGHolder &h) {
boost::remove_in_edge_if(v, pred, h.g);
h.isValidNumEdges = false;
}
template<class Predicate>
friend void remove_edge_if(Predicate pred, NGHolder &h) {
boost::remove_edge_if(pred, h.g);
h.isValidNumEdges = false;
}
friend std::pair<NFAEdge, bool> add_edge(NFAVertex u, NFAVertex v,
NGHolder &h);
friend std::pair<NFAEdge, bool> add_edge(NFAVertex u, NFAVertex v,
const edge_property_type &ep,
NGHolder &h);
friend NFAVertex add_vertex(NGHolder &h);
friend NFAVertex add_vertex(const vertex_property_type &vp, NGHolder &h);
static NFAVertex null_vertex(void) { return NFAGraph::null_vertex(); }
// Subscript operators for BGL bundled properties.
using graph_bundled = NFAGraph::graph_bundled;
using vertex_bundled = NFAGraph::vertex_bundled;
using edge_bundled = NFAGraph::edge_bundled;
vertex_bundled &operator[](NFAVertex v) {
return get(boost::vertex_bundle, g)[v];
}
const vertex_bundled &operator[](NFAVertex v) const {
return get(boost::vertex_bundle, g)[v];
}
edge_bundled &operator[](const NFAEdge &e) {
return get(boost::edge_bundle, g)[e];
}
const edge_bundled &operator[](const NFAEdge &e) const {
return get(boost::edge_bundle, g)[e];
}
protected:
/* Since the NFAGraph vertex/edge list selectors are std::lists, computing
* num_vertices and num_edges is O(N). We use these members to store a
* cached copy of the size.
*
* In the future, with C++11's constant-time std::list::size, these may
* become obsolete. */
u32 numVertices;
u32 numEdges;
bool isValidNumEdges;
bool isValidNumVertices;
};
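/* Minimal usage sketch (illustrative only): build a holder matching a single
* byte 'a' via the BGL-style free functions declared above. The report id 0
* is an arbitrary placeholder.
*
*     NGHolder g(NFA_OUTFIX);
*     NFAVertex v = add_vertex(g);
*     g[v].char_reach.set('a');
*     g[v].reports.insert(0);
*     add_edge(g.start, v, g);
*     add_edge(v, g.accept, g);
*     assert(num_vertices(g) == N_SPECIALS + 1);
*/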
/** \brief True if the vertex \p v is one of our special vertices. */
template <typename GraphT>
static really_inline
bool is_special(const NFAVertex v, const GraphT &g) {
return g[v].index < N_SPECIALS;
}
static really_inline
std::pair<NFAGraph::adjacency_iterator, NFAGraph::adjacency_iterator>
adjacent_vertices(NFAVertex v, const NGHolder &h) {
return adjacent_vertices(v, h.g);
}
static really_inline
std::pair<NFAEdge, bool> edge(NFAVertex u, NFAVertex v, const NGHolder &h) {
return boost::edge(u, v, h.g);
}
static really_inline
std::pair<NFAGraph::edge_iterator, NFAGraph::edge_iterator>
edges(const NGHolder &h) {
return edges(h.g);
}
static really_inline
size_t in_degree(NFAVertex v, const NGHolder &h) {
return in_degree(v, h.g);
}
static really_inline
std::pair<NFAGraph::in_edge_iterator, NFAGraph::in_edge_iterator>
in_edges(NFAVertex v, const NGHolder &h) {
return in_edges(v, h.g);
}
static really_inline
std::pair<NFAGraph::inv_adjacency_iterator, NFAGraph::inv_adjacency_iterator>
inv_adjacent_vertices(NFAVertex v, const NGHolder &h) {
return inv_adjacent_vertices(v, h.g);
}
static really_inline
size_t out_degree(NFAVertex v, const NGHolder &h) {
return out_degree(v, h.g);
}
static really_inline
std::pair<NFAGraph::out_edge_iterator, NFAGraph::out_edge_iterator>
out_edges(NFAVertex v, const NGHolder &h) {
return out_edges(v, h.g);
}
static really_inline
NFAVertex source(const NFAEdge &e, const NGHolder &h) {
return source(e, h.g);
}
static really_inline
NFAVertex target(const NFAEdge &e, const NGHolder &h) {
return target(e, h.g);
}
static really_inline
std::pair<NFAGraph::vertex_iterator, NFAGraph::vertex_iterator>
vertices(const NGHolder &h) {
return vertices(h.g);
}
/**
* \brief Clears all non-special vertices and edges from the graph.
*
* Note: not the same as the BGL's clear() function, which removes all vertices
* and edges.
*/
void clear_graph(NGHolder &h);
/**
* \brief Clear and remove all of the vertices pointed to by the given iterator
* range.
*
* If renumber is false, no renumbering of vertex or edge indices is done.
*
* Note: should not be called with iterators that will be invalidated by vertex
* removal (such as NFAGraph::vertex_iterator).
*/
template <class Iter>
void remove_vertices(Iter begin, Iter end, NGHolder &h, bool renumber = true) {
if (begin == end) {
return;
}
for (Iter it = begin; it != end; ++it) {
NFAVertex v = *it;
if (!is_special(v, h)) {
clear_vertex(v, h);
remove_vertex(v, h);
} else {
assert(0);
}
}
if (renumber) {
h.renumberEdges();
h.renumberVertices();
}
}
/** \brief Clear and remove all of the vertices pointed to by the vertex
* descriptors in the given container.
*
* This is a convenience wrapper around the iterator variant above.
*/
template <class Container>
void remove_vertices(const Container &c, NGHolder &h, bool renumber = true) {
remove_vertices(c.begin(), c.end(), h, renumber);
}
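/* Illustrative sketch: collect dead vertices first, then erase them in bulk so
* that iteration never happens over a graph being mutated. The empty-reach
* filter below is only an example.
*
*     std::vector<NFAVertex> dead;
*     for (auto v : vertices_range(h)) {
*         if (!is_special(v, h) && h[v].char_reach.none()) {
*             dead.push_back(v);
*         }
*     }
*     remove_vertices(dead, h); // clears, removes and renumbers
*/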
/**
* \brief Clear and remove all of the edges pointed to by the given iterator
* range.
*
* If renumber is false, no renumbering of edge indices is done.
*
* Note: should not be called with iterators that will be invalidated by edge
* removal (such as NFAGraph::edge_iterator).
*/
template <class Iter>
void remove_edges(Iter begin, Iter end, NGHolder &h, bool renumber = true) {
if (begin == end) {
return;
}
for (Iter it = begin; it != end; ++it) {
const NFAEdge &e = *it;
remove_edge(e, h);
}
if (renumber) {
h.renumberEdges();
}
}
/** \brief Clear and remove all of the edges pointed to by the edge descriptors
* in the given container.
*
* This is a convenience wrapper around the iterator variant above.
*/
template <class Container>
void remove_edges(const Container &c, NGHolder &h, bool renumber = true) {
remove_edges(c.begin(), c.end(), h, renumber);
}
static UNUSED
bool is_triggered(const NGHolder &g) {
return is_triggered(g.kind);
}
static UNUSED
bool generates_callbacks(const NGHolder &g) {
return generates_callbacks(g.kind);
}
} // namespace ue2
#endif

215
src/nfagraph/ng_is_equal.cpp Normal file
View File

@@ -0,0 +1,215 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Loose equality testing for NGHolder graphs.
*
* Loose equality check for holders' graph structure and vertex_index,
* vertex_char_reach and (optionally) reports.
*/
#include "ng_is_equal.h"
#include "grey.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/ue2_containers.h"
#include <set>
#include <boost/functional/hash/hash.hpp>
using namespace std;
namespace ue2 {
namespace {
struct check_report {
virtual ~check_report() {}
virtual bool operator()(const flat_set<ReportID> &reports_a,
const flat_set<ReportID> &reports_b) const = 0;
};
struct full_check_report : public check_report {
bool operator()(const flat_set<ReportID> &reports_a,
const flat_set<ReportID> &reports_b) const override {
return reports_a == reports_b;
}
};
struct equiv_check_report : public check_report {
equiv_check_report(ReportID a_in, ReportID b_in)
: a_rep(a_in), b_rep(b_in) {}
bool operator()(const flat_set<ReportID> &reports_a,
const flat_set<ReportID> &reports_b) const override {
return contains(reports_a, a_rep) == contains(reports_b, b_rep);
}
private:
ReportID a_rep;
ReportID b_rep;
};
} // namespace
static
bool is_equal_i(const NGHolder &a, const NGHolder &b,
const check_report &check_rep) {
assert(hasCorrectlyNumberedVertices(a));
assert(hasCorrectlyNumberedVertices(b));
size_t num_verts = num_vertices(a);
if (num_verts != num_vertices(b)) {
return false;
}
vector<NFAVertex> vert_a;
vector<NFAVertex> vert_b;
vector<NFAVertex> adj_a;
vector<NFAVertex> adj_b;
vert_a.reserve(num_verts);
vert_b.reserve(num_verts);
adj_a.reserve(num_verts);
adj_b.reserve(num_verts);
insert(&vert_a, vert_a.end(), vertices(a));
insert(&vert_b, vert_b.end(), vertices(b));
sort(vert_a.begin(), vert_a.end(), make_index_ordering(a));
sort(vert_b.begin(), vert_b.end(), make_index_ordering(b));
for (size_t i = 0; i < vert_a.size(); i++) {
NFAVertex va = vert_a[i];
NFAVertex vb = vert_b[i];
DEBUG_PRINTF("vertex %u\n", a[va].index);
// Vertex index must be the same.
if (a[va].index != b[vb].index) {
DEBUG_PRINTF("bad index\n");
return false;
}
// Reach must be the same.
if (a[va].char_reach != b[vb].char_reach) {
DEBUG_PRINTF("bad reach\n");
return false;
}
if (!check_rep(a[va].reports, b[vb].reports)) {
DEBUG_PRINTF("bad reports\n");
return false;
}
// Other vertex properties may vary.
/* Check successors */
adj_a.clear();
adj_b.clear();
insert(&adj_a, adj_a.end(), adjacent_vertices(va, a));
insert(&adj_b, adj_b.end(), adjacent_vertices(vb, b));
if (adj_a.size() != adj_b.size()) {
DEBUG_PRINTF("bad adj\n");
return false;
}
sort(adj_a.begin(), adj_a.end(), make_index_ordering(a));
sort(adj_b.begin(), adj_b.end(), make_index_ordering(b));
for (size_t j = 0; j < adj_a.size(); j++) {
if (a[adj_a[j]].index != b[adj_b[j]].index) {
DEBUG_PRINTF("bad adj\n");
return false;
}
}
}
/* check top for edges out of start */
vector<pair<u32, u32>> top_a;
vector<pair<u32, u32>> top_b;
for (const auto &e : out_edges_range(a.start, a)) {
top_a.emplace_back(a[target(e, a)].index, a[e].top);
}
for (const auto &e : out_edges_range(b.start, b)) {
top_b.emplace_back(b[target(e, b)].index, b[e].top);
}
sort(top_a.begin(), top_a.end());
sort(top_b.begin(), top_b.end());
if (top_a != top_b) {
DEBUG_PRINTF("bad top\n");
return false;
}
DEBUG_PRINTF("good\n");
return true;
}
/** \brief Loose hash of an NGHolder; two holders hash equally whenever is_equal would return true. */
u64a hash_holder(const NGHolder &g) {
size_t rv = 0;
for (auto v : vertices_range(g)) {
boost::hash_combine(rv, g[v].index);
boost::hash_combine(rv, g[v].char_reach);
for (auto w : adjacent_vertices_range(v, g)) {
boost::hash_combine(rv, g[w].index);
}
}
return rv;
}
bool is_equal(const NGHolder &a, const NGHolder &b) {
DEBUG_PRINTF("testing %p %p\n", &a, &b);
if (&a == &b) {
return true;
}
return is_equal_i(a, b, full_check_report());
}
bool is_equal(const NGHolder &a, ReportID a_rep,
const NGHolder &b, ReportID b_rep) {
DEBUG_PRINTF("testing %p %p\n", &a, &b);
if (&a == &b && a_rep == b_rep) {
return true;
}
return is_equal_i(a, b, equiv_check_report(a_rep, b_rep));
}
} // namespace ue2

69
src/nfagraph/ng_is_equal.h Normal file
View File

@@ -0,0 +1,69 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Loose equality testing for NGHolder graphs.
*
* Loose equality check for holders' graph structure and vertex_index,
* vertex_char_reach and (optionally reports).
*/
#ifndef NG_IS_EQUAL_H
#define NG_IS_EQUAL_H
#include "ue2common.h"
#include <memory>
#include <boost/core/noncopyable.hpp>
namespace ue2 {
class NGHolder;
bool is_equal(const NGHolder &a, const NGHolder &b);
bool is_equal(const NGHolder &a, ReportID a_r, const NGHolder &b, ReportID b_r);
u64a hash_holder(const NGHolder &g);
// Util Functors
struct NGHolderHasher {
size_t operator()(const std::shared_ptr<const NGHolder> &h) const {
return hash_holder(*h);
}
};
struct NGHolderEqual {
bool operator()(const std::shared_ptr<const NGHolder> &a,
const std::shared_ptr<const NGHolder> &b) const {
return is_equal(*a, *b);
}
};
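/* Illustrative sketch: the two functors above are intended to be used
* together, e.g. to deduplicate structurally equal holders held by shared_ptr.
* The container choice and the g_ptr variable below are only examples.
*
*     std::unordered_set<std::shared_ptr<const NGHolder>, NGHolderHasher,
*                        NGHolderEqual> seen;
*     if (!seen.insert(g_ptr).second) {
*         // an equivalent graph has already been seen
*     }
*/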
} // namespace ue2
#endif // NG_IS_EQUAL_H

363
src/nfagraph/ng_lbr.cpp Normal file
View File

@@ -0,0 +1,363 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Large Bounded Repeat (LBR) engine build code.
*/
#include "ng_lbr.h"
#include "grey.h"
#include "ng_holder.h"
#include "ng_repeat.h"
#include "ng_reports.h"
#include "nfa/shufticompile.h"
#include "nfa/trufflecompile.h"
#include "nfa/lbr_internal.h"
#include "nfa/nfa_internal.h"
#include "nfa/repeatcompile.h"
#include "util/alloc.h"
#include "util/bitutils.h" // for lg2
#include "util/compile_context.h"
#include "util/container.h"
#include "util/depth.h"
#include "util/dump_charclass.h"
#include "util/verify_types.h"
using namespace std;
namespace ue2 {
static
u32 depth_to_u32(const depth &d) {
assert(d.is_reachable());
if (d.is_infinite()) {
return REPEAT_INF;
}
u32 d_val = d;
assert(d_val < REPEAT_INF);
return d_val;
}
template<class LbrStruct> static
u64a* getTable(NFA *nfa) {
char *ptr = (char *)nfa + sizeof(struct NFA) + sizeof(LbrStruct) +
sizeof(RepeatInfo);
ptr = ROUNDUP_PTR(ptr, alignof(u64a));
return (u64a *)ptr;
}
template <class LbrStruct> static
void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin,
const depth &repeatMax, u32 minPeriod, enum RepeatType rtype) {
assert(nfa);
RepeatStateInfo rsi(rtype, repeatMin, repeatMax, minPeriod);
DEBUG_PRINTF("selected %s model for {%s,%s} repeat\n",
repeatTypeName(rtype), repeatMin.str().c_str(),
repeatMax.str().c_str());
// Fill the lbr_common structure first. Note that the RepeatInfo structure
// directly follows the LbrStruct.
const u32 info_offset = sizeof(LbrStruct);
c->repeatInfoOffset = info_offset;
c->report = report;
RepeatInfo *info = (RepeatInfo *)((char *)c + info_offset);
info->type = verify_u8(rtype);
info->repeatMin = depth_to_u32(repeatMin);
info->repeatMax = depth_to_u32(repeatMax);
info->stateSize = rsi.stateSize;
info->packedCtrlSize = rsi.packedCtrlSize;
info->horizon = rsi.horizon;
info->minPeriod = minPeriod;
memcpy(&info->packedFieldSizes, rsi.packedFieldSizes.data(),
byte_length(rsi.packedFieldSizes));
info->patchCount = rsi.patchCount;
info->patchSize = rsi.patchSize;
info->encodingSize = rsi.encodingSize;
info->patchesOffset = rsi.patchesOffset;
// Fill the NFA structure.
nfa->nPositions = repeatMin;
nfa->streamStateSize = verify_u32(rsi.packedCtrlSize + rsi.stateSize);
nfa->scratchStateSize = (u32)sizeof(lbr_state);
nfa->minWidth = verify_u32(repeatMin);
nfa->maxWidth = repeatMax.is_finite() ? verify_u32(repeatMax) : 0;
// Fill the lbr table for sparse lbr model.
if (rtype == REPEAT_SPARSE_OPTIMAL_P) {
u64a *table = getTable<LbrStruct>(nfa);
// Adjust table length according to the optimal patch length.
size_t len = nfa->length;
assert((u32)repeatMax >= rsi.patchSize);
len -= sizeof(u64a) * ((u32)repeatMax - rsi.patchSize);
nfa->length = verify_u32(len);
info->length = verify_u32(sizeof(RepeatInfo)
+ sizeof(u64a) * (rsi.patchSize + 1));
memcpy(table, rsi.table.data(), byte_length(rsi.table));
}
}
template <class LbrStruct> static
aligned_unique_ptr<NFA> makeLbrNfa(NFAEngineType nfa_type,
enum RepeatType rtype,
const depth &repeatMax) {
size_t tableLen = 0;
if (rtype == REPEAT_SPARSE_OPTIMAL_P) {
tableLen = sizeof(u64a) * (repeatMax + 1);
}
size_t len = sizeof(NFA) + sizeof(LbrStruct) + sizeof(RepeatInfo) +
tableLen + sizeof(u64a);
aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(len);
nfa->type = verify_u8(nfa_type);
nfa->length = verify_u32(len);
return nfa;
}
static
aligned_unique_ptr<NFA> buildLbrDot(const CharReach &cr, const depth &repeatMin,
const depth &repeatMax, u32 minPeriod,
bool is_reset, ReportID report) {
if (!cr.all()) {
return nullptr;
}
enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod,
is_reset);
aligned_unique_ptr<NFA> nfa
= makeLbrNfa<lbr_dot>(LBR_NFA_Dot, rtype, repeatMax);
struct lbr_dot *ld = (struct lbr_dot *)getMutableImplNfa(nfa.get());
fillNfa<lbr_dot>(nfa.get(), &ld->common, report, repeatMin, repeatMax,
minPeriod, rtype);
DEBUG_PRINTF("built dot lbr\n");
return nfa;
}
static
aligned_unique_ptr<NFA> buildLbrVerm(const CharReach &cr,
const depth &repeatMin,
const depth &repeatMax, u32 minPeriod,
bool is_reset, ReportID report) {
const CharReach escapes(~cr);
if (escapes.count() != 1) {
return nullptr;
}
enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod,
is_reset);
aligned_unique_ptr<NFA> nfa
= makeLbrNfa<lbr_verm>(LBR_NFA_Verm, rtype, repeatMax);
struct lbr_verm *lv = (struct lbr_verm *)getMutableImplNfa(nfa.get());
lv->c = escapes.find_first();
fillNfa<lbr_verm>(nfa.get(), &lv->common, report, repeatMin, repeatMax,
minPeriod, rtype);
DEBUG_PRINTF("built verm lbr\n");
return nfa;
}
static
aligned_unique_ptr<NFA> buildLbrNVerm(const CharReach &cr,
const depth &repeatMin,
const depth &repeatMax, u32 minPeriod,
bool is_reset, ReportID report) {
const CharReach escapes(cr);
if (escapes.count() != 1) {
return nullptr;
}
enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod,
is_reset);
aligned_unique_ptr<NFA> nfa
= makeLbrNfa<lbr_verm>(LBR_NFA_NVerm, rtype, repeatMax);
struct lbr_verm *lv = (struct lbr_verm *)getMutableImplNfa(nfa.get());
lv->c = escapes.find_first();
fillNfa<lbr_verm>(nfa.get(), &lv->common, report, repeatMin, repeatMax,
minPeriod, rtype);
DEBUG_PRINTF("built negated verm lbr\n");
return nfa;
}
static
aligned_unique_ptr<NFA> buildLbrShuf(const CharReach &cr,
const depth &repeatMin,
const depth &repeatMax, u32 minPeriod,
bool is_reset, ReportID report) {
enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod,
is_reset);
aligned_unique_ptr<NFA> nfa
= makeLbrNfa<lbr_shuf>(LBR_NFA_Shuf, rtype, repeatMax);
struct lbr_shuf *ls = (struct lbr_shuf *)getMutableImplNfa(nfa.get());
fillNfa<lbr_shuf>(nfa.get(), &ls->common, report, repeatMin, repeatMax,
minPeriod, rtype);
if (shuftiBuildMasks(~cr, &ls->mask_lo, &ls->mask_hi) == -1) {
return nullptr;
}
DEBUG_PRINTF("built shuf lbr\n");
return nfa;
}
static
aligned_unique_ptr<NFA> buildLbrTruf(const CharReach &cr,
const depth &repeatMin,
const depth &repeatMax, u32 minPeriod,
bool is_reset, ReportID report) {
enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod,
is_reset);
aligned_unique_ptr<NFA> nfa
= makeLbrNfa<lbr_truf>(LBR_NFA_Truf, rtype, repeatMax);
struct lbr_truf *lc = (struct lbr_truf *)getMutableImplNfa(nfa.get());
fillNfa<lbr_truf>(nfa.get(), &lc->common, report, repeatMin, repeatMax,
minPeriod, rtype);
truffleBuildMasks(~cr, &lc->mask1, &lc->mask2);
DEBUG_PRINTF("built truffle lbr\n");
return nfa;
}
static
aligned_unique_ptr<NFA> constructLBR(const CharReach &cr,
const depth &repeatMin,
const depth &repeatMax, u32 minPeriod,
bool is_reset, ReportID report) {
DEBUG_PRINTF("bounds={%s,%s}, cr=%s (count %zu), report=%u\n",
repeatMin.str().c_str(), repeatMax.str().c_str(),
describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count(),
report);
assert(repeatMin <= repeatMax);
assert(repeatMax.is_reachable());
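/* Try the most specialised engines first (dot, then the vermicelli variants),
* falling back to shufti and finally truffle, which can handle any reach. */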
aligned_unique_ptr<NFA> nfa
= buildLbrDot(cr, repeatMin, repeatMax, minPeriod, is_reset, report);
if (!nfa) {
nfa = buildLbrVerm(cr, repeatMin, repeatMax, minPeriod, is_reset,
report);
}
if (!nfa) {
nfa = buildLbrNVerm(cr, repeatMin, repeatMax, minPeriod, is_reset,
report);
}
if (!nfa) {
nfa = buildLbrShuf(cr, repeatMin, repeatMax, minPeriod, is_reset,
report);
}
if (!nfa) {
nfa = buildLbrTruf(cr, repeatMin, repeatMax, minPeriod, is_reset,
report);
}
if (!nfa) {
assert(0);
return nullptr;
}
return nfa;
}
aligned_unique_ptr<NFA> constructLBR(const PureRepeat &repeat,
const vector<vector<CharReach>> &triggers,
const CompileContext &cc) {
if (!cc.grey.allowLbr) {
return nullptr;
}
assert(!repeat.reach.none());
if (repeat.reports.size() != 1) {
DEBUG_PRINTF("too many reports\n");
return nullptr;
}
bool is_reset;
u32 min_period = minPeriod(triggers, repeat.reach, &is_reset);
if (depth(min_period) > repeat.bounds.max) {
DEBUG_PRINTF("trigger is longer than repeat; only need one offset\n");
is_reset = true;
}
ReportID report = *repeat.reports.begin();
DEBUG_PRINTF("building LBR %s\n", repeat.bounds.str().c_str());
return constructLBR(repeat.reach, repeat.bounds.min, repeat.bounds.max,
min_period, is_reset, report);
}
/** \brief Construct an LBR engine from the given graph \p g. */
aligned_unique_ptr<NFA> constructLBR(const NGHolder &g,
const vector<vector<CharReach>> &triggers,
const CompileContext &cc) {
if (!cc.grey.allowLbr) {
return nullptr;
}
PureRepeat repeat;
if (!isPureRepeat(g, repeat)) {
return nullptr;
}
return constructLBR(repeat, triggers, cc);
}
/** \brief True if graph \p g could be turned into an LBR engine. */
bool isLBR(const NGHolder &g, const Grey &grey) {
if (!grey.allowLbr) {
return false;
}
PureRepeat repeat;
if (!isPureRepeat(g, repeat)) {
DEBUG_PRINTF("not pure bounded repeat\n");
return false;
}
if (repeat.reports.size() != 1) {
DEBUG_PRINTF("too many reports\n");
return false;
}
return true;
}
} // namespace ue2

71
src/nfagraph/ng_lbr.h Normal file
View File

@@ -0,0 +1,71 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Large Bounded Repeat (LBR) engine build code.
*/
#ifndef NG_LBR_H
#define NG_LBR_H
#include "ue2common.h"
#include "util/alloc.h"
#include <memory>
#include <vector>
struct NFA;
namespace ue2 {
class CharReach;
class NGHolder;
class ReportManager;
struct CompileContext;
struct DepthMinMax;
struct Grey;
struct PureRepeat;
/** \brief Construct an LBR engine from the given graph \p g. */
aligned_unique_ptr<NFA>
constructLBR(const NGHolder &g,
const std::vector<std::vector<CharReach>> &triggers,
const CompileContext &cc);
/** \brief Construct an LBR engine from the given PureRepeat. */
aligned_unique_ptr<NFA>
constructLBR(const PureRepeat &repeat,
const std::vector<std::vector<CharReach>> &triggers,
const CompileContext &cc);
/** \brief True if graph \p g could be turned into an LBR engine. */
bool isLBR(const NGHolder &g, const Grey &grey);
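/* Illustrative usage sketch (not prescriptive): check isLBR() before building;
* an untriggered graph passes an empty trigger list. g and cc are placeholders
* for the caller's graph and compile context.
*
*     if (isLBR(g, cc.grey)) {
*         auto nfa = constructLBR(g, {}, cc); // no triggers
*         if (nfa) {
*             // use the LBR engine
*         }
*     }
*/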
} // namespace ue2
#endif // NG_LBR_H

571
src/nfagraph/ng_limex.cpp Normal file
View File

@@ -0,0 +1,571 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Limex NFA construction code.
*/
#include "ng_limex.h"
#include "grey.h"
#include "ng_equivalence.h"
#include "ng_holder.h"
#include "ng_misc_opt.h"
#include "ng_prune.h"
#include "ng_redundancy.h"
#include "ng_repeat.h"
#include "ng_reports.h"
#include "ng_restructuring.h"
#include "ng_squash.h"
#include "ng_util.h"
#include "ng_width.h"
#include "ue2common.h"
#include "nfa/limex_compile.h"
#include "nfa/limex_limits.h"
#include "nfa/nfa_internal.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/verify_types.h"
#include "util/ue2_containers.h"
#include <map>
#include <vector>
using namespace std;
namespace ue2 {
#ifndef NDEBUG
// Some sanity checking for the graph; returns false if something is wrong.
// Only used in assertions.
static
bool sanityCheckGraph(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids) {
ue2::unordered_set<u32> seen_states;
for (auto v : vertices_range(g)) {
// Non-specials should have non-empty reachability.
if (!is_special(v, g)) {
if (g[v].char_reach.none()) {
DEBUG_PRINTF("vertex %u has empty reach\n",
g[v].index);
return false;
}
}
// Vertices with edges to accept or acceptEod must have reports.
if (is_match_vertex(v, g) && v != g.accept) {
if (g[v].reports.empty()) {
DEBUG_PRINTF("vertex %u has no reports\n",
g[v].index);
return false;
}
}
// Participant vertices should have distinct state indices.
if (!contains(state_ids, v)) {
DEBUG_PRINTF("vertex %u has no state index!\n",
g[v].index);
return false;
}
u32 s = state_ids.at(v);
if (s != NO_STATE && !seen_states.insert(s).second) {
DEBUG_PRINTF("vertex %u has dupe state %u\n",
g[v].index, s);
return false;
}
}
return true;
}
#endif
static
void findSquashStates(const NGHolder &g,
const vector<BoundedRepeatData> &repeats,
map<NFAVertex, NFAStateSet> &squashMap) {
squashMap = findSquashers(g);
filterSquashers(g, squashMap);
/* We also filter out the cyclic states representing bounded repeats, as
* they are not really cyclic. */
for (const auto &br : repeats) {
squashMap.erase(br.cyclic);
}
}
/**
* \brief Drop edges from start to vertices that also have an edge from
* startDs.
*
* Note that this also includes the (start, startDs) edge, which is not
* necessary for the actual NFA implementation (and is in fact something we
* don't want affecting state numbering, etc.).
*/
static
void dropRedundantStartEdges(NGHolder &g) {
remove_out_edge_if(g.start, [&](const NFAEdge &e) {
return edge(g.startDs, target(e, g), g).second;
}, g);
// Ensure that we always remove (start, startDs), even if startDs has had
// its self-loop removed as an optimization.
remove_edge(g.start, g.startDs, g);
}
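/* Illustrative effect (a sketch, not from the original comments): if start's
 * successors are {startDs, A, B} and startDs's successors are {startDs, A},
 * then after this pass only the (start, B) edge survives; (start, startDs)
 * and (start, A) are dropped, since A remains reachable via startDs. */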
static
void makeTopStates(NGHolder &g, map<u32, NFAVertex> &tops,
const map<u32, CharReach> &top_reach) {
map<u32, vector<NFAVertex>> top_succs;
for (const auto &e : out_edges_range(g.start, g)) {
NFAVertex v = target(e, g);
if (v == g.startDs) {
continue;
}
u32 t = g[e].top;
top_succs[t].push_back(v);
}
for (const auto &top : top_succs) {
u32 t = top.first;
CharReach top_cr;
if (contains(top_reach, t)) {
top_cr = top_reach.at(t);
} else {
top_cr = CharReach::dot();
}
assert(!contains(tops, t));
NFAVertex s = NFAGraph::null_vertex();
flat_set<NFAVertex> succs;
insert(&succs, top.second);
for (auto v : top.second) {
if (!top_cr.isSubsetOf(g[v].char_reach)) {
continue;
}
flat_set<NFAVertex> vsuccs;
insert(&vsuccs, adjacent_vertices(v, g));
if (succs != vsuccs) {
continue;
}
if (g[v].reports != g[g.start].reports) {
continue;
}
s = v;
break;
}
if (!s) {
s = add_vertex(g[g.start], g);
g[s].char_reach = top_cr;
for (auto v : top.second) {
add_edge(s, v, g);
}
}
tops[t] = s;
}
// We are completely replacing the start vertex, so clear its reports.
clear_out_edges(g.start, g);
add_edge(g.start, g.startDs, g);
g[g.start].reports.clear();
// Only retain reports (which we copied on add_vertex above) for new top
// vertices connected to accepts.
for (const auto &m : tops) {
NFAVertex v = m.second;
if (!edge(v, g.accept, g).second && !edge(v, g.acceptEod, g).second) {
g[v].reports.clear();
}
}
}
static
set<NFAVertex> findZombies(const NGHolder &h,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
const CompileContext &cc) {
set<NFAVertex> zombies;
if (!cc.grey.allowZombies) {
return zombies;
}
// We only use zombie masks in streaming mode.
if (!cc.streaming) {
return zombies;
}
if (in_degree(h.acceptEod, h) != 1 || all_reports(h).size() != 1) {
DEBUG_PRINTF("can be made undead - bad reports\n");
return zombies;
}
for (auto u : inv_adjacent_vertices_range(h.accept, h)) {
assert(h[u].reports.size() == 1);
for (auto v : adjacent_vertices_range(u, h)) {
if (edge(v, h.accept, h).second
&& h[v].char_reach.all()) {
if (!contains(br_cyclic, v)) {
goto ok;
}
const BoundedRepeatSummary &sum = br_cyclic.at(v);
if (u == v && sum.repeatMax.is_infinite()) {
goto ok;
}
}
}
DEBUG_PRINTF("does not go to dot accept\n");
return zombies;
ok:;
}
for (const auto &v : inv_adjacent_vertices_range(h.accept, h)) {
if (state_ids.at(v) != NO_STATE) {
zombies.insert(v);
}
}
return zombies;
}
static
void reverseStateOrdering(ue2::unordered_map<NFAVertex, u32> &state_ids) {
vector<NFAVertex> ordering;
for (auto &e : state_ids) {
if (e.second == NO_STATE) {
continue;
}
ordering.push_back(e.first);
}
// Sort in reverse order by state ID.
sort(ordering.begin(), ordering.end(),
[&state_ids](NFAVertex a, NFAVertex b) {
return state_ids.at(a) > state_ids.at(b);
});
u32 stateNum = 0;
for (const auto &v : ordering) {
DEBUG_PRINTF("renumber, %u -> %u\n", state_ids.at(v), stateNum);
state_ids[v] = stateNum++;
}
}
static
map<u32, CharReach>
findTopReach(const map<u32, vector<vector<CharReach>>> &triggers) {
map<u32, CharReach> top_reach;
for (const auto &m : triggers) {
const auto top = m.first;
CharReach cr;
for (const auto &trigger : m.second) {
if (trigger.empty()) {
// We don't know anything about this trigger. Assume it can
// have any reach.
cr.setall();
break;
}
cr |= *trigger.rbegin();
}
top_reach.emplace(top, cr);
}
return top_reach;
}
static
unique_ptr<NGHolder>
prepareGraph(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers,
bool impl_test_only, const CompileContext &cc,
ue2::unordered_map<NFAVertex, u32> &state_ids,
vector<BoundedRepeatData> &repeats, map<u32, NFAVertex> &tops) {
assert(is_triggered(h_in) || fixed_depth_tops.empty());
unique_ptr<NGHolder> h = cloneHolder(h_in);
// Bounded repeat handling.
analyseRepeats(*h, rm, fixed_depth_tops, triggers, &repeats, cc.streaming,
impl_test_only, cc.grey);
// If we're building a rose/suffix, do the top dance.
if (is_triggered(*h)) {
makeTopStates(*h, tops, findTopReach(triggers));
}
dropRedundantStartEdges(*h);
// Do state numbering
state_ids = numberStates(*h, tops);
dropUnusedStarts(*h, state_ids);
// In debugging, we sometimes like to reverse the state numbering to stress
// the NFA construction code.
if (cc.grey.numberNFAStatesWrong) {
reverseStateOrdering(state_ids);
}
assert(sanityCheckGraph(*h, state_ids));
return h;
}
static
aligned_unique_ptr<NFA>
constructNFA(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers,
bool compress_state, bool do_accel, bool impl_test_only, u32 hint,
const CompileContext &cc) {
if (!generates_callbacks(h_in)) {
rm = nullptr;
} else {
assert(rm);
}
ue2::unordered_map<NFAVertex, u32> state_ids;
vector<BoundedRepeatData> repeats;
map<u32, NFAVertex> tops;
unique_ptr<NGHolder> h
= prepareGraph(h_in, rm, fixed_depth_tops, triggers, impl_test_only, cc,
state_ids, repeats, tops);
// Quick exit: if we've got an embarrassment of riches, i.e. more states
// than we can implement in our largest NFA model, bail here.
u32 numStates = countStates(*h, state_ids, false);
if (numStates > NFA_MAX_STATES) {
DEBUG_PRINTF("Can't build an NFA with %u states\n", numStates);
return nullptr;
}
map<NFAVertex, BoundedRepeatSummary> br_cyclic;
for (const auto &br : repeats) {
br_cyclic[br.cyclic] = BoundedRepeatSummary(br.repeatMin, br.repeatMax);
}
map<NFAVertex, NFAStateSet> reportSquashMap;
map<NFAVertex, NFAStateSet> squashMap;
// build map of squashed and squashers
if (cc.grey.squashNFA) {
findSquashStates(*h, repeats, squashMap);
if (rm && cc.grey.highlanderSquash) {
reportSquashMap = findHighlanderSquashers(*h, *rm);
}
}
set<NFAVertex> zombies = findZombies(*h, br_cyclic, state_ids, cc);
if (!cc.streaming || !cc.grey.compressNFAState) {
compress_state = false;
}
return generate(*h, state_ids, repeats, reportSquashMap, squashMap, tops,
zombies, do_accel, compress_state, hint, cc);
}
aligned_unique_ptr<NFA>
constructNFA(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers,
bool compress_state, const CompileContext &cc) {
const u32 hint = INVALID_NFA;
const bool do_accel = cc.grey.accelerateNFA;
const bool impl_test_only = false;
return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state,
do_accel, impl_test_only, hint, cc);
}
#ifndef RELEASE_BUILD
// Variant that allows a hint to be specified.
aligned_unique_ptr<NFA>
constructNFA(const NGHolder &h_in, const ReportManager *rm,
const map<u32, u32> &fixed_depth_tops,
const map<u32, vector<vector<CharReach>>> &triggers,
bool compress_state, u32 hint, const CompileContext &cc) {
const bool do_accel = cc.grey.accelerateNFA;
const bool impl_test_only = false;
return constructNFA(h_in, rm, fixed_depth_tops, triggers,
compress_state, do_accel, impl_test_only, hint, cc);
}
#endif // RELEASE_BUILD
static
aligned_unique_ptr<NFA> constructReversedNFA_i(const NGHolder &h_in, u32 hint,
const CompileContext &cc) {
// Make a mutable copy of the graph that we can renumber etc.
NGHolder h;
cloneHolder(h, h_in);
assert(h.kind == NFA_REV_PREFIX); /* triggered, raises internal callbacks */
// Do state numbering.
auto state_ids = numberStates(h);
dropUnusedStarts(h, state_ids);
// Quick exit: if we've got an embarrassment of riches, i.e. more states
// than we can implement in our largest NFA model, bail here.
u32 numStates = countStates(h, state_ids, false);
if (numStates > NFA_MAX_STATES) {
DEBUG_PRINTF("Can't build an NFA with %u states\n", numStates);
return nullptr;
}
assert(sanityCheckGraph(h, state_ids));
map<u32, NFAVertex> tops; /* only the standards tops for nfas */
set<NFAVertex> zombies;
vector<BoundedRepeatData> repeats;
map<NFAVertex, NFAStateSet> reportSquashMap;
map<NFAVertex, NFAStateSet> squashMap;
return generate(h, state_ids, repeats, reportSquashMap, squashMap, tops,
zombies, false, false, hint, cc);
}
aligned_unique_ptr<NFA> constructReversedNFA(const NGHolder &h_in,
const CompileContext &cc) {
u32 hint = INVALID_NFA; // no hint
return constructReversedNFA_i(h_in, hint, cc);
}
#ifndef RELEASE_BUILD
// Variant that allows a hint to be specified.
aligned_unique_ptr<NFA> constructReversedNFA(const NGHolder &h_in, u32 hint,
const CompileContext &cc) {
return constructReversedNFA_i(h_in, hint, cc);
}
#endif // RELEASE_BUILD
u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm,
const CompileContext &cc) {
// Quick check: we can always implement an NFA with fewer than NFA_MAX_STATES
// states. Note that top masks can generate extra states, so we account for
// those here too. (This fast path returns a nonzero placeholder rather than
// an exact state count.)
if (num_vertices(g) + NFA_MAX_TOP_MASKS < NFA_MAX_STATES) {
return true;
}
if (!generates_callbacks(g)) {
rm = nullptr;
} else {
assert(rm);
}
// The BEST way to tell if an NFA is implementable is to implement it!
const bool impl_test_only = true;
const map<u32, u32> fixed_depth_tops; // empty
const map<u32, vector<vector<CharReach>>> triggers; // empty
/* Perform the first part of the construction process and see if the
* resultant NGHolder has <= NFA_MAX_STATES. If it does, we know we can
* implement it as an NFA. */
ue2::unordered_map<NFAVertex, u32> state_ids;
vector<BoundedRepeatData> repeats;
map<u32, NFAVertex> tops;
unique_ptr<NGHolder> h
= prepareGraph(g, rm, fixed_depth_tops, triggers, impl_test_only, cc,
state_ids, repeats, tops);
assert(h);
u32 numStates = countStates(*h, state_ids, false);
if (numStates <= NFA_MAX_STATES) {
return numStates;
}
return 0;
}
void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm,
const CompileContext &cc) {
NGHolder g_pristine;
cloneHolder(g_pristine, g);
reduceGraphEquivalences(g, cc);
removeRedundancy(g, som);
if (rm && generates_callbacks(g)) {
pruneHighlanderDominated(g, *rm);
}
if (!isImplementableNFA(g, rm, cc)) {
DEBUG_PRINTF("reductions made graph unimplementable, roll back\n");
clear_graph(g);
cloneHolder(g, g_pristine);
}
}
u32 countAccelStates(const NGHolder &g, const ReportManager *rm,
const CompileContext &cc) {
if (!generates_callbacks(g)) {
rm = nullptr;
} else {
assert(rm);
}
const bool impl_test_only = true;
const map<u32, u32> fixed_depth_tops; // empty
const map<u32, vector<vector<CharReach>>> triggers; // empty
ue2::unordered_map<NFAVertex, u32> state_ids;
vector<BoundedRepeatData> repeats;
map<u32, NFAVertex> tops;
unique_ptr<NGHolder> h
= prepareGraph(g, rm, fixed_depth_tops, triggers, impl_test_only, cc,
state_ids, repeats, tops);
if (!h || countStates(*h, state_ids, false) > NFA_MAX_STATES) {
DEBUG_PRINTF("not constructible\n");
return NFA_MAX_ACCEL_STATES + 1;
}
assert(h->kind == g.kind);
// Should have no bearing on accel calculation, so we leave these empty.
const set<NFAVertex> zombies;
const map<NFAVertex, NFAStateSet> reportSquashMap;
const map<NFAVertex, NFAStateSet> squashMap;
return countAccelStates(*h, state_ids, repeats, reportSquashMap, squashMap,
tops, zombies, cc);
}
} // namespace ue2

138
src/nfagraph/ng_limex.h Normal file
View File

@@ -0,0 +1,138 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Limex NFA construction code.
*/
#ifndef NG_LIMEX_H
#define NG_LIMEX_H
#include "ue2common.h"
#include "som/som.h"
#include "util/alloc.h"
#include <map>
#include <memory>
#include <vector>
struct NFA;
namespace ue2 {
class CharReach;
class NG;
class NGHolder;
class ReportManager;
struct CompileContext;
/** \brief Determine if the given graph is implementable as an NFA.
*
* Returns zero if the NFA is not implementable (usually because it has too
* many states for any of our models). Otherwise returns the number of states.
*
* ReportManager is used by NFA_SUFFIX and NFA_OUTFIX only. NFA_PREFIX and
* NFA_INFIX use unmanaged rose-local reports.
*/
u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm,
const CompileContext &cc);
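/* Illustrative usage (a sketch, not part of the original API docs): check
 * implementability before committing to a build, assuming an NGHolder `g`,
 * ReportManager `rm` and CompileContext `cc` from the surrounding compile.
 *
 *     if (u32 nstates = isImplementableNFA(g, &rm, cc)) {
 *         DEBUG_PRINTF("graph fits in %u states\n", nstates);
 *     }
 */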
/** \brief Late-stage graph reductions.
*
* This will call \ref removeRedundancy and apply its changes to the given
* holder only if it is implementable afterwards. */
void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm,
const CompileContext &cc);
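/* Sketch of typical use (assumed, not prescribed by this header): run the
 * reductions in place; the function itself rolls back if the reduced graph
 * would no longer be implementable. SOM_NONE here assumes no start-of-match
 * tracking is required.
 *
 *     reduceImplementableGraph(g, SOM_NONE, &rm, cc); // g modified in place
 */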
/**
* \brief For a given graph, count the number of accel states it will have in
* an implementation.
*
* \return the number of accel states, or NFA_MAX_ACCEL_STATES + 1 if an
* implementation would not be constructible.
*/
u32 countAccelStates(const NGHolder &g, const ReportManager *rm,
const CompileContext &cc);
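/* Illustrative check (a sketch): compare the count against the limit, which
 * we assume the caller pulls in from nfa/limex_limits.h.
 *
 *     u32 accel = countAccelStates(g, &rm, cc);
 *     bool enough_accel = accel <= NFA_MAX_ACCEL_STATES;
 */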
/** \brief Construct an NFA from the given NFAGraph.
*
* Returns nullptr if the NFA is not implementable (usually because it has too
* many states for any of our models). Otherwise returns the constructed NFA.
*
* ReportManager is used by NFA_SUFFIX and NFA_OUTFIX only. NFA_PREFIX and
* NFA_INFIX use unmanaged rose-local reports.
*
* Note: this variant of the function does not take a model hint; see the
* hinted variant below (non-release builds only).
*/
aligned_unique_ptr<NFA>
constructNFA(const NGHolder &g, const ReportManager *rm,
const std::map<u32, u32> &fixed_depth_tops,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
bool compress_state, const CompileContext &cc);
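/* Minimal usage sketch (assumptions: no fixed-depth tops and no triggers, so
 * both maps are left empty; state compression requested):
 *
 *     auto nfa = constructNFA(g, &rm, {}, {}, true, cc);
 *     if (!nfa) {
 *         // too many states for any of our NFA models
 *     }
 */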
/** \brief Build a reverse NFA from the graph given, which should have already
* been reversed.
*
* Used for reverse NFAs used in SOM mode.
*/
aligned_unique_ptr<NFA> constructReversedNFA(const NGHolder &h,
const CompileContext &cc);
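/* Usage sketch (assuming `g_rev` is an NGHolder of kind NFA_REV_PREFIX whose
 * edges already run in reverse, as required above):
 *
 *     auto rnfa = constructReversedNFA(g_rev, cc);
 */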
#ifndef RELEASE_BUILD
/** \brief Construct an NFA (with model type hint) from the given NFAGraph.
*
* Returns nullptr if the NFA is not implementable (usually because it has too
* many states for any of our models). Otherwise returns the constructed NFA.
*
* ReportManager is used by NFA_SUFFIX and NFA_OUTFIX only. NFA_PREFIX and
* NFA_INFIX use unmanaged rose-local reports.
*
* Note: this variant of the function allows a model to be specified with the
* \a hint parameter.
*/
aligned_unique_ptr<NFA>
constructNFA(const NGHolder &g, const ReportManager *rm,
const std::map<u32, u32> &fixed_depth_tops,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
bool compress_state, u32 hint, const CompileContext &cc);
/** \brief Build a reverse NFA (with model type hint) from the graph given,
* which should have already been reversed.
*
* Used for reverse NFAs used in SOM mode.
*/
aligned_unique_ptr<NFA> constructReversedNFA(const NGHolder &h, u32 hint,
const CompileContext &cc);
#endif // RELEASE_BUILD
} // namespace ue2
#endif // NG_LIMEX_H

778
src/nfagraph/ng_limex_accel.cpp Normal file
View File

@@ -0,0 +1,778 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA acceleration analysis code.
*/
#include "ng_limex_accel.h"
#include "ng_holder.h"
#include "ng_misc_opt.h"
#include "ng_util.h"
#include "ue2common.h"
#include "nfa/accel.h"
#include "util/bitutils.h" // for CASE_CLEAR
#include "util/charreach.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
#include <algorithm>
#include <map>
using namespace std;
namespace ue2 {
#define WIDE_FRIEND_MIN 200
static
void findAccelFriendGeneration(const NGHolder &g, const CharReach &cr,
const flat_set<NFAVertex> &cands,
const flat_set<NFAVertex> &preds,
flat_set<NFAVertex> *next_cands,
flat_set<NFAVertex> *next_preds,
flat_set<NFAVertex> *friends) {
for (auto v : cands) {
if (contains(preds, v)) {
continue;
}
const CharReach &acr = g[v].char_reach;
DEBUG_PRINTF("checking %u\n", g[v].index);
if (acr.count() < WIDE_FRIEND_MIN || !acr.isSubsetOf(cr)) {
DEBUG_PRINTF("bad reach %zu\n", acr.count());
continue;
}
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (!contains(preds, u)) {
DEBUG_PRINTF("bad pred\n");
goto next_cand;
}
}
next_preds->insert(v);
insert(next_cands, adjacent_vertices(v, g));
DEBUG_PRINTF("%u is a friend indeed\n", g[v].index);
friends->insert(v);
next_cand:;
}
}
void findAccelFriends(const NGHolder &g, NFAVertex v,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
u32 offset, flat_set<NFAVertex> *friends) {
/* A friend of an accel state is a successor state which can only be on when
* the accel is on. This requires that it has a subset of the accel state's
* preds and a charreach which is a subset of the accel state.
*
* A friend can be safely ignored when accelerating provided there is
* sufficient back-off. A friend is useful if it has a wide reach.
*/
/* BR cyclic states which may go stale cannot have friends as they may
* suddenly turn off, leaving their so-called friends stranded and alone.
* TODO: restrict to only stale-going BR cyclics
*/
if (contains(br_cyclic, v) && !br_cyclic.at(v).unbounded()) {
return;
}
u32 friend_depth = offset + 1;
flat_set<NFAVertex> preds;
insert(&preds, inv_adjacent_vertices(v, g));
const CharReach &cr = g[v].char_reach;
flat_set<NFAVertex> cands;
insert(&cands, adjacent_vertices(v, g));
flat_set<NFAVertex> next_preds;
flat_set<NFAVertex> next_cands;
for (u32 i = 0; i < friend_depth; i++) {
findAccelFriendGeneration(g, cr, cands, preds, &next_cands, &next_preds,
friends);
preds.insert(next_preds.begin(), next_preds.end());
next_preds.clear();
cands.swap(next_cands);
next_cands.clear();
}
}
static
void buildTwoByteStops(flat_set<pair<u8, u8>> &twobyte, const CharReach &cr1,
const CharReach &cr2) {
for (size_t c1 = cr1.find_first(); c1 != cr1.npos; c1 = cr1.find_next(c1)) {
for (size_t c2 = cr2.find_first(); c2 != cr2.npos;
c2 = cr2.find_next(c2)) {
twobyte.emplace((u8)c1, (u8)c2);
}
}
}
static
void findStopLiteralsAtVertex(NFAVertex v, const NGHolder &g,
DoubleAccelInfo &build) {
DEBUG_PRINTF("state %u\n", g[v].index);
// double-byte accel is possible: calculate all single- and double-byte
// accel literals.
const CharReach &cr1 = g[v].char_reach;
if (edge(v, g.accept, g).second) {
// If the vertex for this first byte leads to accept, it must contribute a
// single-byte escape. We can still go on and calculate additional
// double-byte ones, though.
/* TODO: fix for rose */
build.stop1 |= cr1;
}
flat_set<pair<u8, u8>> twobyte; // for just this starting state
bool single = false;
for (auto w : adjacent_vertices_range(v, g)) {
if (w == g.accept || w == g.acceptEod) {
continue;
}
const CharReach &cr2 = g[w].char_reach;
size_t count = cr1.count() * cr2.count() + build.stop2.size();
if (count > 0 && count <= 8) { // can't do more than 8 two-byte
buildTwoByteStops(twobyte, cr1, cr2);
} else {
// too many two-byte literals, add the first byte as single
single = true;
break;
}
}
if (single || twobyte.empty()) {
assert(!cr1.none());
build.stop1 |= cr1;
} else {
assert(!twobyte.empty());
build.stop2.insert(twobyte.begin(), twobyte.end());
}
}
static
bool is_bit5_insensitive(const flat_set<pair<u8, u8>> &stop) {
if (stop.size() != 4) {
return false;
}
const u8 a = stop.begin()->first & CASE_CLEAR;
const u8 b = stop.begin()->second & CASE_CLEAR;
for (flat_set<pair<u8, u8>>::const_iterator it = stop.begin();
it != stop.end(); ++it) {
if ((it->first & CASE_CLEAR) != a || (it->second & CASE_CLEAR) != b) {
return false;
}
}
return true;
}
static
bool is_dverm(const DoubleAccelInfo &a) {
if (a.stop1.any()) {
return false;
}
if (a.stop2.size() == 1) {
return true;
}
return is_bit5_insensitive(a.stop2);
}
static
bool is_double_better(const DoubleAccelInfo &a, const DoubleAccelInfo &b) {
/* Note: this is not an operator< */
if (a.stop2.empty()) {
return false;
}
if (b.stop2.empty()) {
return true;
}
if (a.stop1.count() > b.stop1.count()) {
return false;
}
if (a.stop1.count() < b.stop1.count()) {
return true;
}
bool a_dvm = is_dverm(a);
bool b_dvm = is_dverm(b);
if (b_dvm && !a_dvm) {
return false;
}
if (!b_dvm && a_dvm) {
return true;
}
if (a.stop2.size() > b.stop2.size()) {
return false;
}
if (a.stop2.size() < b.stop2.size()) {
return true;
}
return a.offset < b.offset;
}
/** \brief Find the escape literals for a two byte accel at the given accel
* offset */
static
void findDoubleAccel(const NGHolder &g, NFAVertex v, u32 accel_offset,
DoubleAccelInfo &build) {
DEBUG_PRINTF("find double accel +%u for vertex %u\n", accel_offset,
g[v].index);
build.offset = accel_offset;
// Our accel state contributes single-byte escapes
build.stop1 |= ~g[v].char_reach;
flat_set<NFAVertex> searchStates; // states that contribute stop literals
searchStates.insert(v); /* TODO: verify */
/* Note: We cannot search past an accepting state */
/* TODO: remove restriction for non-callback generating */
flat_set<NFAVertex> nextStates;
insert(&nextStates, adjacent_vertices(v, g));
nextStates.erase(v);
nextStates.erase(g.accept);
nextStates.erase(g.acceptEod);
searchStates.swap(nextStates);
nextStates.clear();
// subsequent iterations are simpler, just follow all edges
for (u32 j = 1; j <= accel_offset; j++) {
for (auto u : searchStates) {
insert(&nextStates, adjacent_vertices(u, g));
if (edge(u, g.accept, g).second) {
nextStates.clear();
break;
}
nextStates.erase(g.accept);
nextStates.erase(g.acceptEod);
}
searchStates.swap(nextStates);
nextStates.clear();
}
vector<NFAVertex> sorted;
insert(&sorted, sorted.end(), searchStates);
sort(sorted.begin(), sorted.end(), make_index_ordering(g));
for (auto sv : sorted) {
findStopLiteralsAtVertex(sv, g, build);
}
}
DoubleAccelInfo findBestDoubleAccelInfo(const NGHolder &g, NFAVertex v) {
DoubleAccelInfo rv;
for (u32 offset = 0; offset <= MAX_ACCEL_DEPTH; offset++) {
DoubleAccelInfo b_temp;
findDoubleAccel(g, v, offset, b_temp);
if (is_double_better(b_temp, rv)) {
rv = b_temp;
}
}
return rv;
}
static
void findPaths(const NGHolder &g, NFAVertex v,
const vector<CharReach> &refined_cr,
vector<vector<CharReach> > *paths,
const flat_set<NFAVertex> &forbidden, u32 depth) {
static const u32 MAGIC_TOO_WIDE_NUMBER = 16;
if (!depth) {
paths->push_back(vector<CharReach>());
return;
}
if (v == g.accept || v == g.acceptEod) {
paths->push_back(vector<CharReach>());
if (!generates_callbacks(g) || v == g.acceptEod) {
paths->back().push_back(CharReach()); /* red tape options */
}
return;
}
/* for the escape 'literals' we want to use the minimal cr so we
* can be more selective */
const CharReach &cr = refined_cr[g[v].index];
if (out_degree(v, g) >= MAGIC_TOO_WIDE_NUMBER
|| hasSelfLoop(v, g)) {
/* give up on pushing past this point */
paths->push_back(vector<CharReach>());
vector<CharReach> &p = paths->back();
p.push_back(cr);
return;
}
for (auto w : adjacent_vertices_range(v, g)) {
if (contains(forbidden, w)) {
/* path has looped back to one of the active+boring acceleration
* states. We can ignore this path if we have sufficient back-
* off. */
paths->push_back(vector<CharReach>());
paths->back().push_back(CharReach());
continue;
}
u32 new_depth = depth - 1;
vector<vector<CharReach> > curr;
do {
curr.clear();
findPaths(g, w, refined_cr, &curr, forbidden, new_depth);
} while (new_depth-- && curr.size() >= MAGIC_TOO_WIDE_NUMBER);
for (vector<vector<CharReach> >::iterator it = curr.begin();
it != curr.end(); ++it) {
paths->push_back(vector<CharReach>());
vector<CharReach> &p = paths->back();
p.swap(*it);
p.push_back(cr);
}
}
}
static
AccelScheme merge(const AccelScheme &a, const AccelScheme &b) {
return AccelScheme(a.cr | b.cr, MAX(a.offset, b.offset));
}
static
void findBest(vector<vector<CharReach> >::const_iterator pb,
vector<vector<CharReach> >::const_iterator pe,
const AccelScheme &curr, AccelScheme *best) {
assert(curr.offset <= MAX_ACCEL_DEPTH);
DEBUG_PRINTF("paths left %zu\n", pe - pb);
if (pb == pe) {
*best = curr;
return;
}
DEBUG_PRINTF("p len %zu\n", pb->end() - pb->begin());
vector<AccelScheme> priority_path;
u32 i = 0;
for (vector<CharReach>::const_iterator p = pb->begin(); p != pb->end();
++p, i++) {
priority_path.push_back(AccelScheme(*p & ~curr.cr, i));
}
sort(priority_path.begin(), priority_path.end());
for (vector<AccelScheme>::iterator it = priority_path.begin();
it != priority_path.end(); ++it) {
vector<AccelScheme>::iterator jt = it + 1;
for (; jt != priority_path.end(); ++jt) {
if (!it->cr.isSubsetOf(jt->cr)) {
break;
}
}
priority_path.erase(it + 1, jt);
DEBUG_PRINTF("||%zu\n", it->cr.count());
}
DEBUG_PRINTF("---\n");
for (vector<AccelScheme>::const_iterator it = priority_path.begin();
it != priority_path.end(); ++it) {
DEBUG_PRINTF("%u:|| = %zu; p remaining len %zu\n", i, it->cr.count(),
priority_path.end() - it);
AccelScheme in = merge(curr, *it);
if (in > *best) {
DEBUG_PRINTF("worse\n");
continue;
}
AccelScheme temp = *best;
findBest(pb + 1, pe, in, &temp);
if (temp < *best) {
DEBUG_PRINTF("new best\n");
*best = temp;
if (curr.cr == best->cr) {
return; /* could only get better by offset */
}
}
}
}
#ifdef DEBUG
static
void dumpPaths(const vector<vector<CharReach> > &paths) {
for (vector<vector<CharReach> >::const_iterator p = paths.begin();
p != paths.end(); ++p) {
DEBUG_PRINTF("path: [");
for (vector<CharReach>::const_iterator it = p->begin(); it != p->end();
++it) {
printf(" [");
describeClass(stdout, *it, 20, CC_OUT_TEXT);
printf("]");
}
printf(" ]\n");
}
}
#endif
static
void blowoutPathsLessStrictSegment(vector<vector<CharReach> > *paths) {
/* path segments which are a superset of an earlier segment should never be
* picked as an acceleration segment, so to improve processing we just replace
* them with dot */
for (vector<vector<CharReach> >::iterator p = paths->begin();
p != paths->end(); ++p) {
for (vector<CharReach>::iterator it = p->begin(); it != p->end();
++it) {
vector<CharReach>::iterator jt = it;
for (++jt; jt != p->end(); ++jt) {
if (it->isSubsetOf(*jt)) {
*jt = CharReach::dot();
}
}
}
}
}
static
void unifyPathsLastSegment(vector<vector<CharReach> > *paths) {
/* try to unify paths which only differ in the last segment */
for (vector<vector<CharReach> >::iterator p = paths->begin();
p != paths->end() && p + 1 != paths->end();) {
vector<CharReach> &a = *p;
vector<CharReach> &b = *(p + 1);
if (a.size() != b.size()) {
++p;
continue;
}
u32 i = 0;
for (; i < a.size() - 1; i++) {
if (a[i] != b[i]) {
break;
}
}
if (i == a.size() - 1) {
/* we can unify these paths */
a[i] |= b[i];
paths->erase(p + 1);
} else {
++p;
}
}
}
static
void improvePaths(vector<vector<CharReach> > *paths) {
#ifdef DEBUG
DEBUG_PRINTF("orig paths\n");
dumpPaths(*paths);
#endif
blowoutPathsLessStrictSegment(paths);
sort(paths->begin(), paths->end());
unifyPathsLastSegment(paths);
#ifdef DEBUG
DEBUG_PRINTF("opt paths\n");
dumpPaths(*paths);
#endif
}
AccelScheme nfaFindAccel(const NGHolder &g, const vector<NFAVertex> &verts,
const vector<CharReach> &refined_cr,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
bool allow_wide) {
CharReach terminating;
for (auto v : verts) {
if (!hasSelfLoop(v, g)) {
DEBUG_PRINTF("no self loop\n");
return AccelScheme(); /* invalid scheme */
}
// check that this state is reachable on most characters
terminating |= ~g[v].char_reach;
}
DEBUG_PRINTF("set vertex has %zu stop chars\n", terminating.count());
size_t limit = allow_wide ? ACCEL_MAX_FLOATING_STOP_CHAR
: ACCEL_MAX_STOP_CHAR;
if (terminating.count() > limit) {
return AccelScheme(); /* invalid scheme */
}
vector<vector<CharReach> > paths;
flat_set<NFAVertex> ignore_vert_set(verts.begin(), verts.end());
/* Note: we cannot in general ignore entries into the bounded repeat cyclic
* states (TODO: ignore when possible), as that is when the magic happens
*/
for (map<NFAVertex, BoundedRepeatSummary>::const_iterator it
= br_cyclic.begin();
it != br_cyclic.end(); ++it) {
/* TODO: can allow if repeatMin <= 1 ? */
ignore_vert_set.erase(it->first);
}
for (auto v : verts) {
for (auto w : adjacent_vertices_range(v, g)) {
if (w != v) {
findPaths(g, w, refined_cr, &paths, ignore_vert_set,
MAX_ACCEL_DEPTH);
}
}
}
/* paths built wrong: reverse them */
for (vector<vector<CharReach> >::iterator it = paths.begin();
it != paths.end(); ++it) {
reverse(it->begin(), it->end());
}
improvePaths(&paths);
DEBUG_PRINTF("we have %zu paths\n", paths.size());
if (paths.size() > 40) {
return AccelScheme(); /* too many paths to explore */
}
/* if we were smart we would do something netflowy on the paths to find the
* best cut. But we aren't, so we will just brute force it.
*/
AccelScheme curr(terminating, 0U);
AccelScheme best;
findBest(paths.begin(), paths.end(), curr, &best);
/* find best is a bit lazy in terms of minimising the offset, see if we can
* make it better. need to find the min max offset that we need.*/
u32 offset = 0;
for (vector<vector<CharReach> >::iterator p = paths.begin();
p != paths.end(); ++p) {
u32 i = 0;
for (vector<CharReach>::iterator it = p->begin(); it != p->end();
++it, i++) {
if (it->isSubsetOf(best.cr)) {
break;
}
}
offset = MAX(offset, i);
}
assert(offset <= best.offset);
best.offset = offset;
return best;
}
NFAVertex get_sds_or_proxy(const NGHolder &g) {
DEBUG_PRINTF("looking for sds proxy\n");
if (proper_out_degree(g.startDs, g)) {
return g.startDs;
}
NFAVertex v = NFAGraph::null_vertex();
for (auto w : adjacent_vertices_range(g.start, g)) {
if (w != g.startDs) {
if (!v) {
v = w;
} else {
return g.startDs;
}
}
}
if (!v) {
return g.startDs;
}
while (true) {
if (hasSelfLoop(v, g)) {
DEBUG_PRINTF("woot %u\n", g[v].index);
return v;
}
if (out_degree(v, g) != 1) {
break;
}
NFAVertex u = getSoleDestVertex(g, v);
if (!g[u].char_reach.all()) {
break;
}
v = u;
}
return g.startDs;
}
/** \brief Check if vertex \a v is an accelerable state (for a limex NFA). */
bool nfaCheckAccel(const NGHolder &g, NFAVertex v,
const vector<CharReach> &refined_cr,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
AccelScheme *as, bool allow_wide) {
// For a state to be accelerable, our current criterion is that it be a
// large character class with a self-loop and narrow set of possible other
// successors (i.e. no special successors, union of successor reachability
// is small).
if (!hasSelfLoop(v, g)) {
return false;
}
// check that this state is reachable on most characters
/* we want to use the maximal reach here (in the graph) */
CharReach terminating = g[v].char_reach;
terminating.flip();
DEBUG_PRINTF("vertex %u is cyclic and has %zu stop chars%s\n",
g[v].index, terminating.count(),
allow_wide ? " (w)" : "");
size_t limit = allow_wide ? ACCEL_MAX_FLOATING_STOP_CHAR
: ACCEL_MAX_STOP_CHAR;
if (terminating.count() > limit) {
DEBUG_PRINTF("too leaky\n");
return false;
}
flat_set<NFAVertex> curr, next;
insert(&curr, adjacent_vertices(v, g));
curr.erase(v); // erase self-loop
// We consider offsets of zero through three; this is fairly arbitrary at
// present and could probably be increased (FIXME)
/* WARNING: would/could do horrible things to compile time */
bool stop = false;
vector<CharReach> depthReach(MAX_ACCEL_DEPTH);
unsigned int depth;
for (depth = 0; !stop && depth < MAX_ACCEL_DEPTH; depth++) {
CharReach &cr = depthReach[depth];
for (auto t : curr) {
if (is_special(t, g)) {
// We've bumped into the edge of the graph, so we should stop
// searching.
// Exception: iff our cyclic state is not a dot, then we can
// safely accelerate towards an EOD accept.
/* Exception: for NFAs that don't generate callbacks, ordinary
* accepts are fine too */
if (t == g.accept && !generates_callbacks(g)) {
stop = true; // don't search beyond this depth
continue;
} else if (t == g.accept) {
goto depth_done;
}
assert(t == g.acceptEod);
stop = true; // don't search beyond this depth
} else {
// Non-special vertex
insert(&next, adjacent_vertices(t, g));
/* for the escape 'literals' we want to use the minimal cr so we
* can be more selective */
cr |= refined_cr[g[t].index];
}
}
cr |= terminating;
DEBUG_PRINTF("depth %u has unioned reach %zu\n", depth, cr.count());
curr.swap(next);
next.clear();
}
depth_done:
if (depth == 0) {
return false;
}
DEBUG_PRINTF("selecting from depth 0..%u\n", depth);
/* Look for the most awesome acceleration evar */
for (unsigned int i = 0; i < depth; i++) {
if (depthReach[i].none()) {
DEBUG_PRINTF("red tape acceleration engine depth %u\n", i);
*as = AccelScheme(CharReach(), i);
return true;
}
}
// First, loop over our depths and see if we have a suitable 2-byte
// vermicelli option (caseful, or caseless via bit-5 insensitivity): this is
// the (second) fastest accel we have
if (depth > 1) {
for (unsigned int i = 0; i < (depth - 1); i++) {
const CharReach &cra = depthReach[i];
const CharReach &crb = depthReach[i + 1];
if ((cra.count() == 1 && crb.count() == 1)
|| (cra.count() == 2 && crb.count() == 2
&& cra.isBit5Insensitive() && crb.isBit5Insensitive())) {
DEBUG_PRINTF("two-byte vermicelli, depth %u\n", i);
*as = AccelScheme(CharReach::dot(), i);
return true;
}
}
}
// Second option: a two-byte shufti (i.e. less than eight 2-byte
// literals)
if (depth > 1) {
for (unsigned int i = 0; i < (depth - 1); i++) {
if (depthReach[i].count()*depthReach[i+1].count() <= 8) {
DEBUG_PRINTF("two-byte shufti, depth %u\n", i);
*as = AccelScheme(CharReach::dot(), i);
return true;
}
}
}
// Look for one-byte accel schemes (verm/shufti).
vector<NFAVertex> verts(1, v);
*as = nfaFindAccel(g, verts, refined_cr, br_cyclic, allow_wide);
DEBUG_PRINTF("as width %zu\n", as->cr.count());
return as->cr.count() <= ACCEL_MAX_STOP_CHAR || allow_wide;
}
} // namespace ue2

114
src/nfagraph/ng_limex_accel.h Normal file
View File

@@ -0,0 +1,114 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA acceleration analysis code.
*/
#ifndef NG_LIMEX_ACCEL_H
#define NG_LIMEX_ACCEL_H
#include "ng_holder.h"
#include "ng_misc_opt.h"
#include "ue2common.h"
#include "util/charreach.h"
#include "util/order_check.h"
#include "util/ue2_containers.h"
#include <map>
#include <vector>
namespace ue2 {
/* compile time accel defs */
#define MAX_ACCEL_DEPTH 4
#define MAX_MERGED_ACCEL_STOPS 200
#define ACCEL_MAX_STOP_CHAR 24
#define ACCEL_MAX_FLOATING_STOP_CHAR 192 /* accelerating sds is important */
void findAccelFriends(const NGHolder &g, NFAVertex v,
const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
u32 offset,
ue2::flat_set<NFAVertex> *friends);
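/* Usage sketch (assuming `v` is the accelerable vertex and `br_cyclic` is the
 * bounded-repeat summary map built by the caller; offset 0 requests friends
 * for a plain single-byte scheme):
 *
 *     ue2::flat_set<NFAVertex> friends;
 *     findAccelFriends(g, v, br_cyclic, 0, &friends);
 */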
struct DoubleAccelInfo {
DoubleAccelInfo() : offset(0) {}
u32 offset; //!< offset correction to apply
CharReach stop1; //!< single-byte accel stop literals
flat_set<std::pair<u8, u8>> stop2; //!< double-byte accel stop literals
};
DoubleAccelInfo findBestDoubleAccelInfo(const NGHolder &g, NFAVertex v);
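/* Usage sketch: prefer the double-byte scheme only when it actually found
 * two-byte stop literals (an assumption about how a caller might use it).
 *
 *     DoubleAccelInfo dai = findBestDoubleAccelInfo(g, v);
 *     if (!dai.stop2.empty()) {
 *         // build double-byte accel from dai.stop1 / dai.stop2
 *     }
 */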
struct AccelScheme {
AccelScheme(const CharReach &cr_in, u32 offset_in)
: cr(cr_in), offset(offset_in) {
assert(offset <= MAX_ACCEL_DEPTH);
}
AccelScheme() : cr(CharReach::dot()), offset(MAX_ACCEL_DEPTH + 1) {}
bool operator<(const AccelScheme &b) const {
const AccelScheme &a = *this;
// Don't use ORDER_CHECK as it will (stupidly) eval count() too many
// times.
const size_t a_count = cr.count(), b_count = b.cr.count();
if (a_count != b_count) {
return a_count < b_count;
}
/* TODO: give bonus if one is a 'caseless' character */
ORDER_CHECK(offset);
ORDER_CHECK(cr);
return false;
}
bool operator>(const AccelScheme &b) const {
return b < *this;
}
CharReach cr;
u32 offset;
};
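/* Ordering sketch (illustrative only): "smaller" schemes are better, i.e.
 * fewer stop characters win first, then shallower offsets.
 *
 *     CharReach cr_a;
 *     cr_a.set('a');                         // one stop character
 *     AccelScheme narrow(cr_a, 2);
 *     AccelScheme wide(CharReach::dot(), 0); // 256 stop characters
 *     assert(narrow < wide);                 // narrower reach beats offset
 */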
NFAVertex get_sds_or_proxy(const NGHolder &g);
AccelScheme nfaFindAccel(const NGHolder &g, const std::vector<NFAVertex> &verts,
const std::vector<CharReach> &refined_cr,
const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
bool allow_wide);
/** \brief Check if vertex \a v is an accelerable state (for a limex NFA). */
bool nfaCheckAccel(const NGHolder &g, NFAVertex v,
const std::vector<CharReach> &refined_cr,
const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
AccelScheme *as, bool allow_wide);
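/* Usage sketch (assumptions: vertex indices are contiguous, and `refined`
 * holds one CharReach per vertex index, here simply copied from the graph;
 * no bounded repeats known):
 *
 *     std::vector<CharReach> refined(num_vertices(g));
 *     for (auto w : vertices_range(g)) {
 *         refined[g[w].index] = g[w].char_reach;
 *     }
 *     std::map<NFAVertex, BoundedRepeatSummary> br_cyclic; // empty
 *     AccelScheme as;
 *     if (nfaCheckAccel(g, v, refined, br_cyclic, &as, false)) {
 *         // v can be accelerated with scheme `as`
 *     }
 */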
} // namespace ue2
#endif

852
src/nfagraph/ng_literal_analysis.cpp Normal file
View File

@@ -0,0 +1,852 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Literal analysis and scoring.
*/
#include "ng_literal_analysis.h"
#include "ng_holder.h"
#include "ng_split.h"
#include "ng_util.h"
#include "ue2common.h"
#include "rose/rose_common.h"
#include "util/compare.h"
#include "util/depth.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/ue2string.h"
#include <algorithm>
#include <fstream>
#include <queue>
#include <boost/graph/boykov_kolmogorov_max_flow.hpp>
using namespace std;
using boost::vertex_index;
namespace ue2 {
/** Maximum number of paths to generate. */
static const u32 MAX_WIDTH = 11;
/** Scoring adjustment for 'uniqueness' in literal. */
static const u64a WEIGHT_OF_UNIQUENESS = 250;
namespace {
/* Small literal graph type used for the suffix tree built in
* compressAndScore. */
typedef boost::adjacency_list_traits<boost::vecS, boost::vecS,
boost::bidirectionalS> LitGraphTraits;
typedef LitGraphTraits::vertex_descriptor LitVertex;
typedef LitGraphTraits::edge_descriptor LitEdge;
struct LitGraphVertexProps {
LitGraphVertexProps() {}
explicit LitGraphVertexProps(const ue2_literal::elem &c_in) : c(c_in) {}
ue2_literal::elem c; // string element (char + bool)
};
struct LitGraphEdgeProps {
LitGraphEdgeProps() {}
explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {}
u64a score = NO_LITERAL_AT_EDGE_SCORE;
};
typedef boost::adjacency_list<boost::vecS, boost::vecS, boost::bidirectionalS,
LitGraphVertexProps, LitGraphEdgeProps,
boost::no_property> LitGraph;
typedef pair<LitVertex, NFAVertex> VertexPair;
typedef std::queue<VertexPair> LitVertexQ;
} // namespace
#ifdef DUMP_SUPPORT
/** \brief Dump the literal graph in Graphviz format. */
static UNUSED
void dumpGraph(const char *filename, const LitGraph &lg, const LitVertex &root,
const LitVertex &sink) {
ofstream fout(filename);
fout << "digraph G {" << endl;
for (auto v : vertices_range(lg)) {
fout << boost::get(vertex_index, lg, v);
if (v == root) {
fout << "[label=\"ROOT\"];";
} else if (v == sink) {
fout << "[label=\"SINK\"];";
} else {
ue2_literal s;
s.push_back(lg[v].c);
fout << "[label=\"" << dumpString(s) << "\"];";
}
fout << endl;
}
for (const auto &e : edges_range(lg)) {
LitVertex u = source(e, lg), v = target(e, lg);
fout << boost::get(vertex_index, lg, u) << " -> " <<
boost::get(vertex_index, lg, v) <<
"[label=\"" << lg[e].score << "\"]" <<
";" << endl;
}
fout << "}" << endl;
}
#endif // DUMP_SUPPORT
static
bool allowExpand(size_t numItems, size_t totalPathsSoFar) {
if (numItems == 0) {
return false;
}
if (numItems + totalPathsSoFar > MAX_WIDTH) {
return false;
}
return true;
}
static
LitVertex addToLitGraph(LitGraph &lg, LitVertex sink,
LitVertex pred, const ue2_literal::elem &c) {
// Check if we already have this in the graph.
for (auto v : adjacent_vertices_range(pred, lg)) {
if (v == sink) {
continue;
}
if (lg[v].c == c) {
return v;
}
}
LitVertex lv = add_vertex(LitGraphVertexProps(c), lg);
add_edge(pred, lv, lg);
return lv;
}
static
void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex sink,
LitVertex pred, const CharReach &cr, NFAVertex v) {
for (size_t i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
if (myisupper(i) && cr.test(mytolower(i))) {
// ignore upper half of a nocase pair
continue;
}
bool nocase = myislower(i) && cr.test(mytoupper(i));
ue2_literal::elem c((char)i, nocase);
LitVertex lv = addToLitGraph(lg, sink, pred, c);
workQ.push(VertexPair(lv, v));
}
}
static
void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex root,
LitVertex sink, const NGHolder &g, const NFAEdge &e) {
NFAVertex u = source(e, g);
NFAVertex v = target(e, g);
const CharReach &cr = g[v].char_reach;
if (!allowExpand(cr.count(), 0)) {
return;
}
addToQueue(workQ, lg, sink, root, cr, u);
}
static
u32 crCardinality(const CharReach &cr) {
// Special-case for handling dots, much faster than running the find_next
// loop below.
if (cr.all()) {
return 230; // [^A-Z]
}
u32 rv = 0;
for (size_t i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
if (myisupper(i) && cr.test(mytolower(i))) {
// ignore upper half of a nocase pair
continue;
}
rv++;
}
return rv;
}
/** Filter out literals that include other literals as suffixes. We do this by
* identifying vertices connected to the sink and removing their other
* out-edges. */
static
void filterLitGraph(LitGraph &lg, const LitVertex sink) {
for (auto v : inv_adjacent_vertices_range(sink, lg)) {
remove_out_edge_if(v, [&lg, &sink](const LitEdge &e) {
return target(e, lg) != sink;
}, lg);
}
// We could do a DFS-and-prune here, if we wanted. Right now, we just
// handle it in extractLiterals by throwing away paths that don't run all
// the way from sink to root.
}
/** Extracts all the literals from the given literal graph. Walks the graph
* from each predecessor of the sink (note: it's a suffix tree except for this
* convenience) towards the source, storing each string as we go. */
static
void extractLiterals(const LitGraph &lg, const LitVertex root,
const LitVertex sink, set<ue2_literal> &s) {
ue2_literal lit;
for (auto u : inv_adjacent_vertices_range(sink, lg)) {
lit.clear();
while (u != root) {
lit.push_back(lg[u].c);
assert(in_degree(u, lg) <= 1);
LitGraph::inv_adjacency_iterator ai2, ae2;
tie(ai2, ae2) = inv_adjacent_vertices(u, lg);
if (ai2 == ae2) {
// Path has been cut, time for the next literal.
goto next_literal;
}
u = *ai2;
}
s.insert(lit);
next_literal:
;
}
}
#ifndef NDEBUG
static
bool hasSuffixLiterals(const set<ue2_literal> &s) {
for (auto it = s.begin(), ite = s.end(); it != ite; ++it) {
for (auto jt = std::next(it); jt != ite; ++jt) {
if (isSuffix(*it, *jt) || isSuffix(*jt, *it)) {
DEBUG_PRINTF("'%s' and '%s' have suffix issues\n",
dumpString(*it).c_str(),
dumpString(*jt).c_str());
return true;
}
}
}
return false;
}
#endif
static
void processWorkQueue(const NGHolder &g, const NFAEdge &e,
set<ue2_literal> &s) {
if (is_special(target(e, g), g)) {
return;
}
LitGraph lg;
LitVertex root = add_vertex(lg);
LitVertex sink = add_vertex(lg);
LitVertexQ workQ;
initWorkQueue(workQ, lg, root, sink, g, e);
while (!workQ.empty()) {
const LitVertex lv = workQ.front().first;
const NFAVertex &t = workQ.front().second;
const CharReach &cr = g[t].char_reach;
u32 cr_card = crCardinality(cr);
size_t numItems = cr_card * in_degree(t, g);
size_t committed_count = workQ.size() + in_degree(sink, lg) - 1;
if (g[t].index == NODE_START) {
// reached start, add to literal set
add_edge_if_not_present(lv, sink, lg);
goto next_work_elem;
}
// Expand next vertex
if (allowExpand(numItems, committed_count)) {
for (auto u : inv_adjacent_vertices_range(t, g)) {
addToQueue(workQ, lg, sink, lv, cr, u);
}
goto next_work_elem;
}
// Expand this vertex
if (allowExpand(cr_card, committed_count)) {
for (size_t i = cr.find_first(); i != CharReach::npos;
i = cr.find_next(i)) {
if (myisupper(i) && cr.test(mytolower(i))) {
// ignore upper half of a nocase pair
continue;
}
bool nocase = myislower(i) && cr.test(mytoupper(i));
ue2_literal::elem c((char)i, nocase);
LitVertex lt = addToLitGraph(lg, sink, lv, c);
add_edge_if_not_present(lt, sink, lg);
}
goto next_work_elem;
}
// add to literal set
add_edge_if_not_present(lv, sink, lg);
next_work_elem:
workQ.pop();
}
filterLitGraph(lg, sink);
//dumpGraph("litgraph.dot", lg, root, sink);
extractLiterals(lg, root, sink, s);
// Our literal set should contain no literal that is a suffix of another.
assert(!hasSuffixLiterals(s));
DEBUG_PRINTF("edge %u (%u->%u) produced %zu literals\n", g[e].index,
g[source(e, g)].index, g[target(e, g)].index, s.size());
}
static
u64a litUniqueness(const string &s) {
CharReach seen(s);
return seen.count();
}
/** Count the significant bits of this literal (i.e. seven for nocase alpha,
* eight for everything else). */
static
u64a litCountBits(const ue2_literal &lit) {
u64a n = 0;
for (const auto &c : lit) {
n += c.nocase ? 7 : 8;
}
return n;
}
/** Returns a fairly arbitrary score for the given literal, used to compare the
* suitability of different candidates. */
static
u64a scoreLiteral(const ue2_literal &s) {
// old scoring scheme: SUM(s in S: 1/s.len()^2)
// now weight (currently 75/25) with number of unique chars
// in the string
u64a len = litCountBits(s);
u64a lenUnique = litUniqueness(s.get_string()) * 8;
u64a weightedLen = (1000ULL - WEIGHT_OF_UNIQUENESS) * len +
WEIGHT_OF_UNIQUENESS * lenUnique;
weightedLen /= 8;
DEBUG_PRINTF("scored literal '%s' %llu\n",
escapeString(s.get_string()).c_str(), weightedLen);
return weightedLen;
}
/**
* calculateScore has the following properties:
* - score of literal is the same as the score of the reversed literal;
* - score of substring of literal is worse than the original literal's score;
* - score of any literal should be non-zero.
*/
static
u64a calculateScore(const ue2_literal &s) {
if (s.empty()) {
return NO_LITERAL_AT_EDGE_SCORE;
}
u64a weightedLen = scoreLiteral(s);
DEBUG_PRINTF("len %zu, wl %llu\n", s.length(), weightedLen);
u64a rv = 1000000000000000ULL/(weightedLen * weightedLen * weightedLen);
if (!rv) {
rv = 1;
}
DEBUG_PRINTF("len %zu, score %llu\n", s.length(), rv);
return rv;
}
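/* Worked example (illustrative arithmetic only): for the caseful literal
 * "abc", litCountBits() gives 24 and litUniqueness() gives 3, so
 * scoreLiteral() yields (750 * 24 + 250 * 24) / 8 = 3000 and calculateScore()
 * returns 10^15 / 3000^3 ~= 37037. The repetitive literal "aaa" scores
 * (750 * 24 + 250 * 8) / 8 = 2500, giving ~64000: less unique literals end up
 * with higher (more expensive to cut) edge scores. */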
/** Adds a literal in reverse order, building up a suffix tree. */
static
void addReversedLiteral(const ue2_literal &lit, LitGraph &lg,
const LitVertex &root, const LitVertex &sink) {
DEBUG_PRINTF("literal: '%s'\n", escapeString(lit).c_str());
ue2_literal suffix;
LitVertex v = root;
for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
suffix.push_back(*it);
LitVertex w;
for (auto v2 : adjacent_vertices_range(v, lg)) {
if (v2 != sink && lg[v2].c == *it) {
w = v2;
goto next_char;
}
}
w = add_vertex(LitGraphVertexProps(*it), lg);
add_edge(v, w, LitGraphEdgeProps(calculateScore(suffix)), lg);
next_char:
v = w;
}
// Wire the last vertex to the sink.
add_edge(v, sink, lg);
}
static
void extractLiterals(const vector<LitEdge> &cutset, const LitGraph &lg,
const LitVertex &root, set<ue2_literal> &s) {
for (const auto &e : cutset) {
LitVertex u = source(e, lg), v = target(e, lg);
ue2_literal lit;
lit.push_back(lg[v].c);
while (u != root) {
lit.push_back(lg[u].c);
assert(in_degree(u, lg) == 1);
LitGraph::inv_adjacency_iterator ai, ae;
tie(ai, ae) = inv_adjacent_vertices(u, lg);
if (ai == ae) {
// Path has been cut, time for the next literal.
goto next_literal;
}
u = *ai;
}
DEBUG_PRINTF("extracted: '%s'\n", escapeString(lit).c_str());
s.insert(lit);
next_literal:
;
}
}
#ifdef DEBUG
static UNUSED
const char *describeColor(boost::default_color_type c) {
switch (c) {
case boost::white_color:
return "white";
case boost::gray_color:
return "gray";
case boost::green_color:
return "green";
case boost::red_color:
return "red";
case boost::black_color:
return "black";
default:
return "unknown";
}
}
#endif
/**
* The BGL's boykov_kolmogorov_max_flow requires that all edges have their
* reverse edge in the graph. This function adds them, returning the new edges
* and constructing a map of (edge, rev edge).
*/
static
vector<LitEdge> addReverseEdges(LitGraph &lg,
ue2::unordered_map<LitEdge, LitEdge> &reverse_edge_map) {
vector<LitEdge> reverseMe;
reverse_edge_map.clear();
reverse_edge_map.reserve(num_edges(lg) * 2);
for (const auto &e : edges_range(lg)) {
LitVertex u = source(e, lg), v = target(e, lg);
assert(u != v);
bool exists;
LitEdge rev;
tie(rev, exists) = edge(v, u, lg);
if (exists) {
reverse_edge_map[e] = rev;
} else {
reverseMe.push_back(e);
}
}
vector<LitEdge> reverseEdges;
reverseEdges.reserve(reverseMe.size());
for (const auto &e : reverseMe) {
LitVertex u = source(e, lg), v = target(e, lg);
LitEdge rev = add_edge(v, u, lg[e], lg).first;
reverseEdges.push_back(rev);
reverse_edge_map[e] = rev;
reverse_edge_map[rev] = e;
}
return reverseEdges;
}
static
void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink,
vector<LitEdge> &cutset) {
cutset.clear();
//dumpGraph("litgraph.dot", lg, root, sink);
assert(!in_degree(root, lg));
assert(!out_degree(sink, lg));
// Add reverse edges for the convenience of the BGL's max flow algorithm.
ue2::unordered_map<LitEdge, LitEdge> reverse_edge_map;
vector<LitEdge> tempEdges = addReverseEdges(lg, reverse_edge_map);
const auto v_index_map = get(vertex_index, lg);
const size_t num_verts = num_vertices(lg);
vector<boost::default_color_type> colors(num_verts);
vector<s32> distances(num_verts);
vector<LitEdge> predecessors(num_verts);
ue2::unordered_map<LitEdge, u64a> residuals;
residuals.reserve(num_edges(lg));
UNUSED u64a flow = boykov_kolmogorov_max_flow(lg,
get(&LitGraphEdgeProps::score, lg),
make_assoc_property_map(residuals),
make_assoc_property_map(reverse_edge_map),
make_iterator_property_map(predecessors.begin(), v_index_map),
make_iterator_property_map(colors.begin(), v_index_map),
make_iterator_property_map(distances.begin(), v_index_map),
get(vertex_index, lg), root, sink);
DEBUG_PRINTF("done, flow = %llu\n", flow);
// Remove temporary reverse edges.
for (const auto &e : tempEdges) {
remove_edge(e, lg);
}
vector<LitEdge> white_cut, black_cut;
u64a white_flow = 0, black_flow = 0;
for (const auto &e : edges_range(lg)) {
const LitVertex u = source(e, lg), v = target(e, lg);
const auto ucolor = colors[boost::get(vertex_index, lg, u)];
const auto vcolor = colors[boost::get(vertex_index, lg, v)];
DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n",
boost::get(vertex_index, lg, u), describeColor(ucolor),
boost::get(vertex_index, lg, v), describeColor(vcolor),
lg[e].score);
if (ucolor != boost::white_color && vcolor == boost::white_color) {
assert(target(e, lg) != sink);
white_cut.push_back(e);
white_flow += lg[e].score;
}
if (ucolor == boost::black_color && vcolor != boost::black_color) {
assert(target(e, lg) != sink);
black_cut.push_back(e);
black_flow += lg[e].score;
}
}
DEBUG_PRINTF("white flow = %llu, black flow = %llu\n",
white_flow, black_flow);
assert(white_flow && black_flow);
if (white_flow <= black_flow) {
DEBUG_PRINTF("selected white cut\n");
cutset.swap(white_cut);
} else {
DEBUG_PRINTF("selected black cut\n");
cutset.swap(black_cut);
}
DEBUG_PRINTF("min cut has %zu edges\n", cutset.size());
assert(!cutset.empty());
}
/** Takes a set of literals and derives a better set from them, returning its
* score. Literals with a common suffix S will be replaced with S (for
* example, {foobar, fooobar} -> {oobar}).
*/
u64a compressAndScore(set<ue2_literal> &s) {
if (s.empty()) {
return NO_LITERAL_AT_EDGE_SCORE;
}
if (s.size() == 1) {
return calculateScore(*s.begin());
}
UNUSED u64a initialScore = scoreSet(s);
DEBUG_PRINTF("begin, initial literals have score %llu\n",
initialScore);
LitGraph lg;
const LitVertex root = add_vertex(lg);
const LitVertex sink = add_vertex(lg);
for (const auto &lit : s) {
addReversedLiteral(lit, lg, root, sink);
}
DEBUG_PRINTF("suffix tree has %zu vertices and %zu edges\n",
num_vertices(lg), num_edges(lg));
vector<LitEdge> cutset;
findMinCut(lg, root, sink, cutset);
s.clear();
extractLiterals(cutset, lg, root, s);
u64a score = scoreSet(s);
DEBUG_PRINTF("compressed score is %llu\n", score);
assert(score <= initialScore);
return score;
}
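/* A minimal standalone sketch of the common-suffix case described above,
* using plain std::string in place of ue2_literal and ignoring the min-cut
* machinery (which may return several literals rather than one):
*
*     static std::string commonSuffix(const std::set<std::string> &lits) {
*         assert(!lits.empty());
*         std::string suffix = *lits.begin();
*         for (const auto &lit : lits) {
*             size_t n = 0;
*             while (n < suffix.size() && n < lit.size() &&
*                    suffix[suffix.size() - 1 - n] == lit[lit.size() - 1 - n]) {
*                 n++;
*             }
*             suffix = suffix.substr(suffix.size() - n); // keep shared tail
*         }
*         return suffix; // e.g. {"foobar", "fooobar"} -> "oobar"
*     }
*/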
u64a scoreSet(const set<ue2_literal> &s) {
if (s.empty()) {
return NO_LITERAL_AT_EDGE_SCORE;
}
u64a score = 1ULL;
for (const auto &lit : s) {
score += calculateScore(lit);
}
return score;
}
set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAEdge &e) {
set<ue2_literal> s;
processWorkQueue(g, e, s);
return s;
}
set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAVertex &v,
bool only_first_encounter) {
set<ue2_literal> s;
if (is_special(v, g)) {
return s;
}
set<ue2_literal> ls;
for (const auto &e : in_edges_range(v, g)) {
if (source(e, g) == v && only_first_encounter) {
continue; /* ignore self loop on root vertex as we are interested in
* the first time we visit the vertex on the way to
* accept. In fact, we can ignore any back edges - but
* they would require a bit of effort to discover. */
}
ls = getLiteralSet(g, e);
if (ls.empty()) {
s.clear();
return s;
} else {
s.insert(ls.begin(), ls.end());
}
}
return s;
}
vector<u64a> scoreEdges(const NGHolder &g) {
assert(hasCorrectlyNumberedEdges(g));
vector<u64a> scores(num_edges(g));
for (const auto &e : edges_range(g)) {
u32 eidx = g[e].index;
assert(eidx < scores.size());
set<ue2_literal> ls = getLiteralSet(g, e);
scores[eidx] = compressAndScore(ls);
}
return scores;
}
static
bool splitOffLeadingLiteral_i(const NGHolder &g, bool anch,
ue2_literal *lit_out,
NGHolder *rhs) {
NFAVertex u;
NFAVertex v;
if (!anch) {
DEBUG_PRINTF("looking for leading floating literal\n");
set<NFAVertex> s_succ;
insert(&s_succ, adjacent_vertices(g.start, g));
set<NFAVertex> sds_succ;
insert(&sds_succ, adjacent_vertices(g.startDs, g));
bool floating = is_subset_of(s_succ, sds_succ);
if (!floating) {
DEBUG_PRINTF("not floating\n");
return false;
}
sds_succ.erase(g.startDs);
if (sds_succ.size() != 1) {
DEBUG_PRINTF("branchy root\n");
return false;
}
u = g.startDs;
v = *sds_succ.begin();
} else {
DEBUG_PRINTF("looking for leading anchored literal\n");
if (proper_out_degree(g.startDs, g)) {
DEBUG_PRINTF("not anchored\n");
return false;
}
set<NFAVertex> s_succ;
insert(&s_succ, adjacent_vertices(g.start, g));
s_succ.erase(g.startDs);
if (s_succ.size() != 1) {
DEBUG_PRINTF("branchy root\n");
return false;
}
u = g.start;
v = *s_succ.begin();
}
while (true) {
DEBUG_PRINTF("validating vertex %u\n", g[v].index);
assert(v != g.acceptEod && v != g.accept);
const CharReach &cr = g[v].char_reach;
if (cr.count() != 1 && !cr.isCaselessChar()) {
break;
}
// Rose can only handle mixed-sensitivity literals up to the max mask
// length.
if (lit_out->length() >= MAX_MASK2_WIDTH) {
if (mixed_sensitivity(*lit_out)) {
DEBUG_PRINTF("long and mixed sensitivity\n");
break;
}
if (ourisalpha((char)cr.find_first())) {
if (cr.isCaselessChar() != lit_out->any_nocase()) {
DEBUG_PRINTF("stop at mixed sensitivity on '%c'\n",
(char)cr.find_first());
break;
}
}
}
if (edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second) {
DEBUG_PRINTF("connection to accept\n");
break;
}
lit_out->push_back(cr.find_first(), cr.isCaselessChar());
u = v;
if (out_degree(v, g) != 1) {
DEBUG_PRINTF("out_degree != 1\n");
break;
}
v = *adjacent_vertices(v, g).first;
if (in_degree(v, g) != 1) {
DEBUG_PRINTF("blargh\n"); /* picks up cases where there is no path
* to case accept (large cycles),
* ensures term */
break;
}
}
if (lit_out->empty()) {
return false;
}
assert(u != g.startDs);
ue2::unordered_map<NFAVertex, NFAVertex> rhs_map;
vector<NFAVertex> pivots;
insert(&pivots, pivots.end(), adjacent_vertices(u, g));
splitRHS(g, pivots, rhs, &rhs_map);
DEBUG_PRINTF("literal is '%s' (len %zu)\n", dumpString(*lit_out).c_str(),
lit_out->length());
assert(is_triggered(*rhs));
return true;
}
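/* For example, assuming a Glushkov-style graph for the floating pattern
* /foo(bar|baz)/: the loop above walks the single-char vertices 'f', 'o', 'o'
* and stops at the branch, so lit_out becomes "foo" and rhs holds the
* (bar|baz) tail as a triggered graph. */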
bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out,
NGHolder *rhs) {
return splitOffLeadingLiteral_i(g, false, lit_out, rhs);
}
bool splitOffAnchoredLeadingLiteral(const NGHolder &g, ue2_literal *lit_out,
NGHolder *rhs) {
return splitOffLeadingLiteral_i(g, true, lit_out, rhs);
}
bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) {
if (in_degree(g.acceptEod, g) != 1) {
return false;
}
NFAVertex v = getSoleSourceVertex(g, g.accept);
if (!v) {
return false;
}
set<ue2_literal> s = getLiteralSet(g, v, false);
if (s.size() != 1) {
return false;
}
const ue2_literal &lit = *s.begin();
if (lit.length() > MAX_MASK2_WIDTH && mixed_sensitivity(lit)) {
DEBUG_PRINTF("long & mixed-sensitivity, Rose can't handle this.\n");
return false;
}
*lit_out = lit;
return true;
}
} // namespace ue2

82
src/nfagraph/ng_literal_analysis.h Normal file
View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Literal analysis and scoring.
*/
#ifndef NG_LITERAL_ANALYSIS_H
#define NG_LITERAL_ANALYSIS_H
#include <set>
#include <vector>
#include "ng_holder.h"
#include "util/ue2string.h"
namespace ue2 {
#define NO_LITERAL_AT_EDGE_SCORE 10000000ULL
/* Score for special-to-special edges */
#define INVALID_EDGE_CAP 100000000ULL
class NGHolder;
/**
* Fetch the literal set for a given vertex. Note: does NOT take into account
* any constraints due to streaming mode requirements.
*
* If only_first_encounter is requested, the output set may drop literals
* generated by revisiting the destination vertex.
*/
std::set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAVertex &v,
bool only_first_encounter = true);
std::set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAEdge &e);
/** Score all the edges in the given graph, returning a vector of scores
* indexed by edge index. */
std::vector<u64a> scoreEdges(const NGHolder &h);
/** Returns a score for a literal set. Lower scores are better. */
u64a scoreSet(const std::set<ue2_literal> &s);
/** Compress a literal set to fewer literals. */
u64a compressAndScore(std::set<ue2_literal> &s);
bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out,
NGHolder *rhs);
bool splitOffAnchoredLeadingLiteral(const NGHolder &g, ue2_literal *lit_out,
NGHolder *rhs);
bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out);
} // namespace ue2
#endif

222
src/nfagraph/ng_literal_component.cpp Normal file
View File

@@ -0,0 +1,222 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Literal Component Splitting. Identifies literals that span the
* graph and moves them into Rose.
*/
#include "grey.h"
#include "ng.h"
#include "ng_literal_component.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "ue2common.h"
#include "rose/rose_build.h"
#include "util/container.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/ue2string.h"
using namespace std;
namespace ue2 {
static
bool isLiteralChar(const NGWrapper &g, NFAVertex v,
bool &nocase, bool &casefixed) {
const CharReach &cr = g[v].char_reach;
const size_t num = cr.count();
if (num > 2) {
return false; // char class
}
if (!casefixed) {
if (num == 2 && cr.isCaselessChar()) {
nocase = true;
casefixed = true;
return true;
} else if (num == 1) {
if (cr.isAlpha()) {
nocase = false;
casefixed = true;
}
// otherwise, still acceptable but we can't fix caselessness yet
return true;
}
} else {
// nocase property is fixed
if (nocase) {
if ((num == 2 && cr.isCaselessChar()) ||
(num == 1 && !cr.isAlpha())) {
return true;
}
} else {
return (num == 1);
}
}
return false;
}
static
void addToString(string &s, const NGHolder &g, NFAVertex v) {
const CharReach &cr = g[v].char_reach;
assert(cr.count() == 1 || cr.isCaselessChar());
char c = (char)cr.find_first();
s.push_back(c);
}
static
bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored,
set<NFAVertex> &dead) {
DEBUG_PRINTF("examine vertex %u\n", g[v].index);
bool nocase = false, casefixed = false;
assert(!is_special(v, g));
size_t reqInDegree;
if (anchored) {
reqInDegree = 1;
assert(edge(g.start, v, g).second);
} else {
reqInDegree = 2;
assert(edge(g.start, v, g).second);
assert(edge(g.startDs, v, g).second);
}
if (hasGreaterInDegree(reqInDegree, v, g)) {
DEBUG_PRINTF("extra in-edges\n");
return false;
}
if (!isLiteralChar(g, v, nocase, casefixed)) {
DEBUG_PRINTF("not literal\n");
return false;
}
string literal;
addToString(literal, g, v);
// Remaining vertices must come in a chain, each with one in-edge and one
// out-edge only.
NFAVertex u;
while (true) {
if (out_degree(v, g) != 1) {
DEBUG_PRINTF("branches, not literal\n");
return false;
}
u = v; // previous vertex
v = *(adjacent_vertices(v, g).first);
DEBUG_PRINTF("loop, v=%u\n", g[v].index);
if (is_special(v, g)) {
if (v == g.accept || v == g.acceptEod) {
break; // OK
} else {
assert(0); // start?
return false;
}
} else {
// Ordinary, must be literal
if (!isLiteralChar(g, v, nocase, casefixed)) {
DEBUG_PRINTF("not literal\n");
return false;
}
if (in_degree(v, g) != 1) {
DEBUG_PRINTF("branches, not literal\n");
return false;
}
}
addToString(literal, g, v);
}
// Successfully found a literal; there might be multiple report IDs, in
// which case we add all the reports.
assert(!is_special(u, g));
bool eod = v == g.acceptEod;
assert(eod || v == g.accept);
DEBUG_PRINTF("success: found %s literal '%s'\n",
anchored ? "anchored" : "unanchored",
escapeString(literal).c_str());
// Literals of length 1 are better served going through later optimisation
// passes, where they might be combined together into a character class.
if (literal.length() == 1) {
DEBUG_PRINTF("skipping literal of length 1\n");
return false;
}
ng.rose->add(anchored, eod, ue2_literal(literal, nocase), g[u].reports);
// Remove the terminal vertex. Later, we rely on pruneUseless to remove the
// other vertices in this chain, since they'll no longer lead to an accept.
dead.insert(u);
return true;
}
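/* For example, assuming an unanchored graph for /abc/: the 'a' vertex is a
* successor of both start and startDs, the chain walk collects "abc" and
* reaches accept, and the literal is handed to Rose via ng.rose->add() with
* anchored=false and eod=false. A pattern like /a/ would be rejected by the
* length-1 check above. */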
/** \brief Split off literals. True if any changes were made to the graph. */
bool splitOffLiterals(NG &ng, NGWrapper &g) {
if (!ng.cc.grey.allowRose) {
return false;
}
bool changed = false;
set<NFAVertex> dead;
ue2::unordered_set<NFAVertex> unanchored; // for faster lookup.
insert(&unanchored, adjacent_vertices(g.startDs, g));
// Anchored literals.
for (auto v : adjacent_vertices_range(g.start, g)) {
if (!is_special(v, g) && !contains(unanchored, v)) {
changed |= splitOffLiteral(ng, g, v, true, dead);
}
}
// Unanchored literals.
for (auto v : adjacent_vertices_range(g.startDs, g)) {
if (!is_special(v, g)) {
changed |= splitOffLiteral(ng, g, v, false, dead);
}
}
if (changed) {
remove_vertices(dead, g);
pruneUseless(g);
return true;
}
return false;
}
} // namespace ue2

47
src/nfagraph/ng_literal_component.h Normal file
View File

@@ -0,0 +1,47 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Literal Component Splitting. Identifies literals that span the
* graph and moves them into Rose.
*/
#ifndef NG_LITERAL_COMPONENT_H
#define NG_LITERAL_COMPONENT_H
namespace ue2 {
class NG;
class NGWrapper;
/** \brief Split off literals. True if any changes were made to the graph. */
bool splitOffLiterals(NG &ng, NGWrapper &graph);
} // namespace ue2
#endif // NG_LITERAL_COMPONENT_H

232
src/nfagraph/ng_literal_decorated.cpp Normal file
View File

@@ -0,0 +1,232 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Analysis for literals decorated by leading/trailing assertions or
* character classes.
*/
#include "ng_literal_decorated.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_util.h"
#include "rose/rose_build.h"
#include "rose/rose_in_graph.h"
#include "rose/rose_in_util.h"
#include "util/compile_context.h"
#include "util/dump_charclass.h"
#include "util/make_unique.h"
#include <algorithm>
#include <memory>
#include <sstream>
#include <boost/graph/depth_first_search.hpp>
using namespace std;
namespace ue2 {
namespace {
/** \brief Max fixed-width paths to generate from a graph. */
static constexpr size_t MAX_PATHS = 10;
/** \brief Max degree for any non-special vertex in the graph. */
static constexpr size_t MAX_VERTEX_DEGREE = 6;
using Path = vector<NFAVertex>;
} // namespace
static
bool findPaths(const NGHolder &g, vector<Path> &paths) {
vector<NFAVertex> order = getTopoOrdering(g);
vector<vector<Path>> built(num_vertices(g));
for (auto it = order.rbegin(); it != order.rend(); ++it) {
NFAVertex v = *it;
auto &out = built[g[v].index];
assert(out.empty());
if (v == g.start || v == g.startDs) {
out.push_back({v});
continue;
}
// The paths to v are the paths to v's predecessors, with v added to
// the end of each.
for (auto u : inv_adjacent_vertices_range(v, g)) {
// We have a stylized connection from start -> startDs, but we
// don't need anchored and unanchored versions of the same path.
if (u == g.start && edge(g.startDs, v, g).second) {
continue;
}
// Similarly, avoid the accept->acceptEod edge.
if (u == g.accept) {
assert(v == g.acceptEod);
continue;
}
for (const auto &p : built[g[u].index]) {
out.push_back(p);
out.back().push_back(v);
if (out.size() > MAX_PATHS) {
// All these paths should eventually end up at a sink, so
// we've blown past our limit.
DEBUG_PRINTF("path limit exceeded\n");
return false;
}
}
}
}
insert(&paths, paths.end(), built[NODE_ACCEPT]);
insert(&paths, paths.end(), built[NODE_ACCEPT_EOD]);
DEBUG_PRINTF("%zu paths generated\n", paths.size());
return paths.size() <= MAX_PATHS;
}
static
bool hasLargeDegreeVertex(const NGHolder &g) {
for (const auto &v : vertices_range(g)) {
if (is_special(v, g)) { // specials can have large degree
continue;
}
if (has_greater_degree(MAX_VERTEX_DEGREE, v, g)) {
DEBUG_PRINTF("vertex %u has degree %zu\n", g[v].index,
boost::degree(v, g.g));
return true;
}
}
return false;
}
#if defined(DEBUG) || defined(DUMP_SUPPORT)
static UNUSED
string dumpPath(const NGHolder &g, const Path &path) {
ostringstream oss;
for (const auto &v : path) {
switch (g[v].index) {
case NODE_START:
oss << "<start>";
break;
case NODE_START_DOTSTAR:
oss << "<startDs>";
break;
case NODE_ACCEPT:
oss << "<accept>";
break;
case NODE_ACCEPT_EOD:
oss << "<acceptEod>";
break;
default:
oss << describeClass(g[v].char_reach);
break;
}
}
return oss.str();
}
#endif
struct PathMask {
PathMask(const NGHolder &g, const Path &path)
: is_anchored(path.front() == g.start),
is_eod(path.back() == g.acceptEod) {
assert(path.size() >= 2);
mask.reserve(path.size() - 2);
for (const auto &v : path) {
if (is_special(v, g)) {
continue;
}
mask.push_back(g[v].char_reach);
}
// Reports are attached to the second-to-last vertex.
reports = g[*next(path.rbegin())].reports;
assert(!reports.empty());
}
vector<CharReach> mask;
ue2::flat_set<ReportID> reports;
bool is_anchored;
bool is_eod;
};
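/* For example, assuming a graph for the anchored pattern /^foo[0-9]/ with the
* single path <start> f o o [0-9] <accept>, the PathMask above would have
* mask = {f, o, o, [0-9]}, is_anchored = true, is_eod = false, and reports
* taken from the [0-9] vertex. */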
bool handleDecoratedLiterals(RoseBuild &rose, const NGHolder &g,
const CompileContext &cc) {
if (!cc.grey.allowDecoratedLiteral) {
return false;
}
if (!isAcyclic(g)) {
DEBUG_PRINTF("not acyclic\n");
return false;
}
if (hasLargeDegreeVertex(g)) {
DEBUG_PRINTF("large degree\n");
return false;
}
vector<Path> paths;
if (!findPaths(g, paths)) {
DEBUG_PRINTF("couldn't split into a small number of paths\n");
return false;
}
assert(!paths.empty());
assert(paths.size() <= MAX_PATHS);
vector<PathMask> masks;
masks.reserve(paths.size());
for (const auto &path : paths) {
DEBUG_PRINTF("path: %s\n", dumpPath(g, path).c_str());
PathMask pm(g, path);
if (!rose.validateMask(pm.mask, pm.reports, pm.is_anchored,
pm.is_eod)) {
DEBUG_PRINTF("failed validation\n");
return false;
}
masks.push_back(move(pm));
}
for (const auto &pm : masks) {
rose.addMask(pm.mask, pm.reports, pm.is_anchored, pm.is_eod);
}
DEBUG_PRINTF("all ok, %zu masks added\n", masks.size());
return true;
}
} // namespace ue2

52
src/nfagraph/ng_literal_decorated.h Normal file
View File

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Analysis for literals decorated by leading/trailing assertions or
* character classes.
*/
#ifndef NFAGRAPH_NG_LITERAL_DECORATED_H
#define NFAGRAPH_NG_LITERAL_DECORATED_H
namespace ue2 {
class RoseBuild;
class NGHolder;
struct CompileContext;
/**
* \brief If the graph contains only a decorated literal, feed it to the Rose
* builder. Returns true on success.
*/
bool handleDecoratedLiterals(RoseBuild &rose, const NGHolder &g,
const CompileContext &cc);
} // namespace ue2
#endif // NFAGRAPH_NG_LITERAL_DECORATED_H

665
src/nfagraph/ng_mcclellan.cpp Normal file
View File

@@ -0,0 +1,665 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Build code for McClellan DFA.
*/
#include "ng_mcclellan.h"
#include "grey.h"
#include "nfa/dfa_min.h"
#include "nfa/rdfa.h"
#include "ng_holder.h"
#include "ng_mcclellan_internal.h"
#include "ng_restructuring.h"
#include "ng_squash.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/bitfield.h"
#include "util/determinise.h"
#include "util/graph_range.h"
#include "util/make_unique.h"
#include "util/report_manager.h"
#include "util/ue2_containers.h"
#include <algorithm>
#include <functional>
#include <map>
#include <set>
#include <vector>
#include <boost/dynamic_bitset.hpp>
using namespace std;
using boost::dynamic_bitset;
namespace ue2 {
#define FINAL_DFA_STATE_LIMIT 16383
#define DFA_STATE_LIMIT 1024
#define NFA_STATE_LIMIT 256
u16 buildAlphabetFromEquivSets(const std::vector<CharReach> &esets,
array<u16, ALPHABET_SIZE> &alpha,
array<u16, ALPHABET_SIZE> &unalpha) {
u16 i = 0;
for (; i < esets.size(); i++) {
const CharReach &cr = esets[i];
#ifdef DEBUG
DEBUG_PRINTF("eq set: ");
for (size_t s = cr.find_first(); s != CharReach::npos;
s = cr.find_next(s)) {
printf("%02hhx ", (u8)s);
}
printf("-> %u\n", i);
#endif
u16 leader = cr.find_first();
for (size_t s = cr.find_first(); s != CharReach::npos;
s = cr.find_next(s)) {
alpha[s] = i;
}
unalpha[i] = leader;
}
for (u16 j = N_CHARS; j < ALPHABET_SIZE; j++, i++) {
alpha[j] = i;
unalpha[i] = j;
}
return i; // alphabet size
}
void calculateAlphabet(const NGHolder &g, array<u16, ALPHABET_SIZE> &alpha,
array<u16, ALPHABET_SIZE> &unalpha, u16 *alphasize) {
vector<CharReach> esets(1, CharReach::dot());
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
const CharReach &cr = g[v].char_reach;
for (size_t i = 0; i < esets.size(); i++) {
if (esets[i].count() == 1) {
continue;
}
CharReach t = cr & esets[i];
if (t.any() && t != esets[i]) {
esets[i] &= ~t;
esets.push_back(t);
}
}
}
// for deterministic compiles
sort(esets.begin(), esets.end());
assert(alphasize);
*alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
}
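/* A small worked example of the refinement above: starting from
* esets = {dot}, a vertex with reach [a-z] splits it into {[^a-z], [a-z]};
* a later vertex with reach [m] leaves [^a-z] alone (empty intersection) and
* splits [a-z] into {[a-z] & ~[m], [m]}. The resulting three equivalence
* sets, plus the special symbols (e.g. TOP) appended by
* buildAlphabetFromEquivSets, form the remapped alphabet. */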
static
bool allExternalReports(const ReportManager &rm,
const flat_set<ReportID> &reports) {
for (auto report_id : reports) {
if (!isExternalReport(rm.getReport(report_id))) {
return false;
}
}
return true;
}
static
dstate_id_t successor(const vector<dstate> &dstates, dstate_id_t c,
const array<u16, ALPHABET_SIZE> &alpha, symbol_t s) {
return dstates[c].next[alpha[s]];
}
void getFullTransitionFromState(const raw_dfa &n, dstate_id_t state,
dstate_id_t *out_table) {
for (u32 i = 0; i < ALPHABET_SIZE; i++) {
out_table[i] = successor(n.states, state, n.alpha_remap, i);
}
}
template<typename stateset>
static
void populateInit(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
stateset *init, stateset *init_deep,
vector<NFAVertex> *v_by_index) {
for (auto v : vertices_range(g)) {
if (state_ids.at(v) == NO_STATE) {
continue;
}
u32 vert_id = g[v].index;
assert(vert_id < init->size());
if (is_any_start(v, g)) {
init->set(vert_id);
if (hasSelfLoop(v, g) || is_triggered(g)) {
DEBUG_PRINTF("setting %u\n", vert_id);
init_deep->set(vert_id);
}
}
}
v_by_index->clear();
v_by_index->resize(num_vertices(g), NFAGraph::null_vertex());
for (auto v : vertices_range(g)) {
u32 vert_id = g[v].index;
assert((*v_by_index)[vert_id] == NFAGraph::null_vertex());
(*v_by_index)[vert_id] = v;
}
if (is_triggered(g)) {
*init_deep = *init;
}
}
template<typename StateSet>
void populateAccepts(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
StateSet *accept, StateSet *acceptEod) {
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
if (state_ids.at(v) != NO_STATE) {
accept->set(g[v].index);
}
}
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
if (v == g.accept) {
continue;
}
if (state_ids.at(v) != NO_STATE) {
acceptEod->set(g[v].index);
}
}
}
static
bool canPruneEdgesFromAccept(const ReportManager &rm, const NGHolder &g) {
bool seen = false;
u32 ekey = 0;
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
if (is_special(v, g)) {
continue;
}
for (auto report_id : g[v].reports) {
const Report &ir = rm.getReport(report_id);
if (!isSimpleExhaustible(ir)) {
return false;
}
if (!seen) {
seen = true;
ekey = ir.ekey;
} else if (ekey != ir.ekey) {
return false;
}
}
}
/* need to check accept eod does not have any unseen reports as well */
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
if (is_special(v, g)) {
continue;
}
for (auto report_id : g[v].reports) {
const Report &ir = rm.getReport(report_id);
if (!isSimpleExhaustible(ir)) {
return false;
}
if (!seen) {
seen = true;
ekey = ir.ekey;
} else if (ekey != ir.ekey) {
return false;
}
}
}
return true;
}
static
bool overhangMatchesTrigger(const vector<vector<CharReach> > &all_triggers,
vector<CharReach>::const_reverse_iterator itb,
vector<CharReach>::const_reverse_iterator ite) {
for (const auto &trigger : all_triggers) {
vector<CharReach>::const_reverse_iterator it = itb;
vector<CharReach>::const_reverse_iterator kt = trigger.rbegin();
for (; it != ite && kt != trigger.rend(); ++it, ++kt) {
if ((*it & *kt).none()) {
/* this trigger does not match the overhang, try next */
goto try_next_trigger;
}
}
return true;
try_next_trigger:;
}
return false; /* no trigger matches the overhang */
}
static
bool triggerAllowed(const NGHolder &g, const NFAVertex v,
const vector<vector<CharReach> > &all_triggers,
const vector<CharReach> &trigger) {
set<NFAVertex> curr;
set<NFAVertex> next;
curr.insert(v);
for (auto it = trigger.rbegin(); it != trigger.rend(); ++it) {
next.clear();
for (auto u : curr) {
assert(u != g.startDs); /* triggered graphs should not use sds */
if (u == g.start) {
if (overhangMatchesTrigger(all_triggers, it, trigger.rend())) {
return true;
}
continue;
}
if ((g[u].char_reach & *it).none()) {
continue;
}
insert(&next, inv_adjacent_vertices(u, g));
}
if (next.empty()) {
return false;
}
next.swap(curr);
}
return true;
}
void markToppableStarts(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
bool single_trigger,
const vector<vector<CharReach>> &triggers,
dynamic_bitset<> *out) {
if (single_trigger) {
return; /* no live states can lead to new states */
}
for (auto v : vertices_range(g)) {
if (state_ids.at(v) == NO_STATE) {
continue;
}
u32 vert_id = g[v].index;
for (const auto &trigger : triggers) {
if (triggerAllowed(g, v, triggers, trigger)) {
DEBUG_PRINTF("idx %u is valid location for top\n", vert_id);
out->set(vert_id);
break;
}
}
}
assert(out->test(g[g.start].index));
}
namespace {
class Automaton_Big {
public:
typedef dynamic_bitset<> StateSet;
typedef map<StateSet, dstate_id_t> StateMap;
Automaton_Big(const ReportManager *rm_in, const NGHolder &graph_in,
const ue2::unordered_map<NFAVertex, u32> &state_ids_in,
bool single_trigger,
const vector<vector<CharReach>> &triggers, bool prunable_in)
: rm(rm_in), graph(graph_in), state_ids(state_ids_in),
numStates(num_vertices(graph)), init(numStates), initDS(numStates),
squash(numStates), accept(numStates), acceptEod(numStates),
toppable(numStates), prunable(prunable_in), dead(numStates) {
populateInit(graph, state_ids, &init, &initDS, &v_by_index);
populateAccepts(graph, state_ids, &accept, &acceptEod);
start_anchored = DEAD_STATE + 1;
if (initDS == init) {
start_floating = start_anchored;
} else if (initDS.any()) {
start_floating = start_anchored + 1;
} else {
start_floating = DEAD_STATE;
}
calculateAlphabet(graph, alpha, unalpha, &alphasize);
for (const auto &sq : findSquashers(graph)) {
NFAVertex v = sq.first;
u32 vert_id = graph[v].index;
squash.set(vert_id);
squash_mask[vert_id] = shrinkStateSet(sq.second);
}
cr_by_index = populateCR(graph, v_by_index, alpha);
if (is_triggered(graph)) {
markToppableStarts(graph, state_ids, single_trigger, triggers,
&toppable);
}
}
private:
// Convert an NFAStateSet (as used by the squash code) into a StateSet
StateSet shrinkStateSet(const NFAStateSet &in) const {
StateSet out(dead.size());
for (size_t i = in.find_first(); i != in.npos && i < out.size();
i = in.find_next(i)) {
out.set(i);
}
return out;
}
public:
void transition(const StateSet &in, StateSet *next) {
transition_graph(*this, v_by_index, in, next);
}
const vector<StateSet> initial() {
vector<StateSet> rv(1, init);
if (start_floating != DEAD_STATE && start_floating != start_anchored) {
rv.push_back(initDS);
}
return rv;
}
private:
void reports_i(const StateSet &in, bool eod, flat_set<ReportID> &rv) {
StateSet acc = in & (eod ? acceptEod : accept);
for (size_t i = acc.find_first(); i != StateSet::npos;
i = acc.find_next(i)) {
NFAVertex v = v_by_index[i];
DEBUG_PRINTF("marking report\n");
const auto &my_reports = graph[v].reports;
rv.insert(my_reports.begin(), my_reports.end());
}
}
public:
void reports(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, false, rv);
}
void reportsEod(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, true, rv);
}
bool canPrune(const flat_set<ReportID> &test_reports) const {
if (!rm || !prunable || !canPruneEdgesFromAccept(*rm, graph)) {
return false;
}
return allExternalReports(*rm, test_reports);
}
private:
const ReportManager *rm;
public:
const NGHolder &graph;
const ue2::unordered_map<NFAVertex, u32> &state_ids;
u32 numStates;
vector<NFAVertex> v_by_index;
vector<CharReach> cr_by_index; /* pre alpha'ed */
StateSet init;
StateSet initDS;
StateSet squash; /* states which allow us to mask out other states */
StateSet accept;
StateSet acceptEod;
StateSet toppable; /* states which are allowed to be on when a top arrives,
* triggered dfas only */
map<u32, StateSet> squash_mask;
bool prunable;
StateSet dead;
array<u16, ALPHABET_SIZE> alpha;
array<u16, ALPHABET_SIZE> unalpha;
u16 alphasize;
u16 start_anchored;
u16 start_floating;
};
class Automaton_Graph {
public:
typedef bitfield<NFA_STATE_LIMIT> StateSet;
typedef ue2::unordered_map<StateSet, dstate_id_t> StateMap;
Automaton_Graph(const ReportManager *rm_in, const NGHolder &graph_in,
const ue2::unordered_map<NFAVertex, u32> &state_ids_in,
bool single_trigger,
const vector<vector<CharReach>> &triggers, bool prunable_in)
: rm(rm_in), graph(graph_in), state_ids(state_ids_in),
prunable(prunable_in) {
populateInit(graph, state_ids, &init, &initDS, &v_by_index);
populateAccepts(graph, state_ids, &accept, &acceptEod);
start_anchored = DEAD_STATE + 1;
if (initDS == init) {
start_floating = start_anchored;
} else if (initDS.any()) {
start_floating = start_anchored + 1;
} else {
start_floating = DEAD_STATE;
}
calculateAlphabet(graph, alpha, unalpha, &alphasize);
assert(alphasize <= ALPHABET_SIZE);
for (const auto &sq : findSquashers(graph)) {
NFAVertex v = sq.first;
u32 vert_id = graph[v].index;
squash.set(vert_id);
squash_mask[vert_id] = shrinkStateSet(sq.second);
}
cr_by_index = populateCR(graph, v_by_index, alpha);
if (is_triggered(graph)) {
dynamic_bitset<> temp(NFA_STATE_LIMIT);
markToppableStarts(graph, state_ids, single_trigger, triggers,
&temp);
toppable = bitfield<NFA_STATE_LIMIT>(temp);
}
}
private:
// Convert an NFAStateSet (as used by the squash code) into a StateSet
StateSet shrinkStateSet(const NFAStateSet &in) const {
StateSet out;
for (size_t i = in.find_first(); i != in.npos && i < out.size();
i = in.find_next(i)) {
out.set(i);
}
return out;
}
public:
void transition(const StateSet &in, StateSet *next) {
transition_graph(*this, v_by_index, in, next);
}
const vector<StateSet> initial() {
vector<StateSet> rv(1, init);
if (start_floating != DEAD_STATE && start_floating != start_anchored) {
rv.push_back(initDS);
}
return rv;
}
private:
void reports_i(const StateSet &in, bool eod, flat_set<ReportID> &rv) {
StateSet acc = in & (eod ? acceptEod : accept);
for (size_t i = acc.find_first(); i != StateSet::npos;
i = acc.find_next(i)) {
NFAVertex v = v_by_index[i];
DEBUG_PRINTF("marking report\n");
const auto &my_reports = graph[v].reports;
rv.insert(my_reports.begin(), my_reports.end());
}
}
public:
void reports(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, false, rv);
}
void reportsEod(const StateSet &in, flat_set<ReportID> &rv) {
reports_i(in, true, rv);
}
bool canPrune(const flat_set<ReportID> &test_reports) const {
if (!rm || !prunable || !canPruneEdgesFromAccept(*rm, graph)) {
return false;
}
return allExternalReports(*rm, test_reports);
}
private:
const ReportManager *rm;
public:
const NGHolder &graph;
const ue2::unordered_map<NFAVertex, u32> &state_ids;
vector<NFAVertex> v_by_index;
vector<CharReach> cr_by_index; /* pre alpha'ed */
StateSet init;
StateSet initDS;
StateSet squash; /* states which allow us to mask out other states */
StateSet accept;
StateSet acceptEod;
StateSet toppable; /* states which are allowed to be on when a top arrives,
* triggered dfas only */
map<u32, StateSet> squash_mask;
bool prunable;
StateSet dead;
array<u16, ALPHABET_SIZE> alpha;
array<u16, ALPHABET_SIZE> unalpha;
u16 alphasize;
u16 start_anchored;
u16 start_floating;
};
} // namespace
unique_ptr<raw_dfa> buildMcClellan(const NGHolder &g, const ReportManager *rm,
bool single_trigger,
const vector<vector<CharReach>> &triggers,
const Grey &grey, bool finalChance) {
if (!grey.allowMcClellan) {
return nullptr;
}
// Construct a mutable copy of the graph so that we can drop unused starts.
auto g_copy = cloneHolder(g);
NGHolder &graph = *g_copy;
auto state_ids = numberStates(graph);
dropUnusedStarts(graph, state_ids);
DEBUG_PRINTF("attempting to build ?%d? mcclellan\n", (int)graph.kind);
assert(allMatchStatesHaveReports(graph));
bool prunable = grey.highlanderPruneDFA && generates_callbacks(graph);
assert(rm || !generates_callbacks(graph));
if (!generates_callbacks(graph)) {
rm = nullptr;
}
assert(triggers.empty() == !is_triggered(graph));
/* We must be getting desperate if it is an outfix, so use the final chance
* state limit logic */
u32 state_limit
= (graph.kind == NFA_OUTFIX || finalChance) ? FINAL_DFA_STATE_LIMIT
: DFA_STATE_LIMIT;
unique_ptr<raw_dfa> rdfa = ue2::make_unique<raw_dfa>(graph.kind);
const u32 numStates = num_vertices(graph);
DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates);
if (numStates <= NFA_STATE_LIMIT) {
/* Fast path. Automaton_Graph uses a bitfield internally to represent
* states and is quicker than Automaton_Big. */
Automaton_Graph n(rm, graph, state_ids, single_trigger, triggers,
prunable);
if (determinise(n, rdfa->states, state_limit)) {
DEBUG_PRINTF("state limit exceeded\n");
return nullptr; /* over state limit */
}
rdfa->start_anchored = n.start_anchored;
rdfa->start_floating = n.start_floating;
rdfa->alpha_size = n.alphasize;
rdfa->alpha_remap = n.alpha;
} else {
/* Slow path. Too many states to use Automaton_Graph. */
Automaton_Big n(rm, graph, state_ids, single_trigger, triggers,
prunable);
if (determinise(n, rdfa->states, state_limit)) {
DEBUG_PRINTF("state limit exceeded\n");
return nullptr; /* over state limit */
}
rdfa->start_anchored = n.start_anchored;
rdfa->start_floating = n.start_floating;
rdfa->alpha_size = n.alphasize;
rdfa->alpha_remap = n.alpha;
}
minimize_hopcroft(*rdfa, grey);
DEBUG_PRINTF("after determinised into %zu states, building impl dfa "
"(a,f) = (%hu,%hu)\n", rdfa->states.size(),
rdfa->start_anchored, rdfa->start_floating);
return rdfa;
}
unique_ptr<raw_dfa> buildMcClellan(const NGHolder &g, const ReportManager *rm,
const Grey &grey) {
assert(!is_triggered(g));
vector<vector<CharReach>> triggers;
return buildMcClellan(g, rm, false, triggers, grey);
}
} // namespace ue2

81
src/nfagraph/ng_mcclellan.h Normal file
View File

@@ -0,0 +1,81 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Build code for McClellan DFA.
*/
#ifndef NG_MCCLELLAN_H
#define NG_MCCLELLAN_H
#include "ue2common.h"
#include <memory>
#include <vector>
namespace ue2 {
class CharReach;
class NGHolder;
class ReportManager;
struct Grey;
struct raw_dfa;
/**
* \brief Determinises an NFA Graph into a raw_dfa.
*
* \param g
* The NGHolder.
* \param rm
* A pointer to the ReportManager, if managed reports are used (e.g.
* for outfixes/suffixes). Otherwise nullptr.
* \param single_trigger
* True if it is known that the NFA will only ever be triggered once.
* \param triggers
* Representing when tops may arrive. Only used by NFA_INFIX and
* NFA_SUFFIX; should be empty for other types.
* \param grey
* Grey box object.
* \param finalChance
* Allows us to build bigger DFAs as the only alternative is an outfix.
*
* \return A raw_dfa, or nullptr on failure (state limit blown).
*/
std::unique_ptr<raw_dfa> buildMcClellan(const NGHolder &g,
const ReportManager *rm, bool single_trigger,
const std::vector<std::vector<CharReach>> &triggers,
const Grey &grey, bool finalChance = false);
/** Convenience wrapper for non-triggered engines */
std::unique_ptr<raw_dfa> buildMcClellan(const NGHolder &g,
const ReportManager *rm,
const Grey &grey);
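/* Typical usage sketch for an untriggered graph (g, rm and grey stand for
* whatever the caller has in hand; pass nullptr for the ReportManager when
* the graph does not generate callbacks):
*
*     std::unique_ptr<raw_dfa> rdfa = buildMcClellan(g, &rm, grey);
*     if (!rdfa) {
*         // state limit blown; fall back to another engine
*     }
*/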
} // namespace ue2
#endif // NG_MCCLELLAN_H

144
src/nfagraph/ng_mcclellan_internal.h Normal file
View File

@@ -0,0 +1,144 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shared build code for DFAs (McClellan, Haig).
*/
#ifndef NG_MCCLELLAN_INTERNAL_H
#define NG_MCCLELLAN_INTERNAL_H
#include "ue2common.h"
#include "nfa/mcclellancompile.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_restructuring.h" // for NO_STATE
#include "util/charreach.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include <boost/dynamic_bitset.hpp>
#include <map>
#include <vector>
namespace ue2 {
struct raw_dfa;
/** Fills alpha, unalpha and returns alphabet size. */
u16 buildAlphabetFromEquivSets(const std::vector<CharReach> &esets,
std::array<u16, ALPHABET_SIZE> &alpha,
std::array<u16, ALPHABET_SIZE> &unalpha);
/** \brief Calculates an alphabet remapping based on the symbols which the
* graph discriminates on. Throws in some special DFA symbols as well. */
void calculateAlphabet(const NGHolder &g, std::array<u16, ALPHABET_SIZE> &alpha,
std::array<u16, ALPHABET_SIZE> &unalpha, u16 *alphasize);
void getFullTransitionFromState(const raw_dfa &n, dstate_id_t state,
dstate_id_t *out_table);
/** produce a map of states on which it is valid to receive tops */
void markToppableStarts(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
bool single_trigger,
const std::vector<std::vector<CharReach>> &triggers,
boost::dynamic_bitset<> *out);
template<typename autom>
void transition_graph(autom &nfa, const std::vector<NFAVertex> &vByStateId,
const typename autom::StateSet &in,
typename autom::StateSet *next) {
typedef typename autom::StateSet StateSet;
const NGHolder &graph = nfa.graph;
const auto &state_ids = nfa.state_ids;
const auto &alpha = nfa.alpha;
const StateSet &squash = nfa.squash;
const std::map<u32, StateSet> &squash_mask = nfa.squash_mask;
const std::vector<CharReach> &cr_by_index = nfa.cr_by_index;
for (symbol_t s = 0; s < nfa.alphasize; s++) {
next[s].reset();
}
/* generate top transitions; when tops are not allowed, the top symbol
* simply behaves as a self-loop */
bool top_allowed = is_triggered(graph);
StateSet succ = nfa.dead;
for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) {
NFAVertex u = vByStateId[i];
for (const auto &v : adjacent_vertices_range(u, graph)) {
if (state_ids.at(v) == NO_STATE) {
continue;
}
succ.set(graph[v].index);
}
if (top_allowed && !nfa.toppable.test(i)) {
/* we don't need to generate a top at this location as we are in
* an nfa state which cannot be on when a trigger arrives. */
top_allowed = false;
}
}
StateSet active_squash = succ & squash;
if (active_squash.any()) {
for (size_t j = active_squash.find_first(); j != active_squash.npos;
j = active_squash.find_next(j)) {
succ &= squash_mask.find(j)->second;
}
}
for (size_t j = succ.find_first(); j != succ.npos; j = succ.find_next(j)) {
const CharReach &cr = cr_by_index[j];
for (size_t s = cr.find_first(); s != cr.npos; s = cr.find_next(s)) {
next[s].set(j); /* already alpha'ed */
}
}
next[alpha[TOP]] = in;
if (top_allowed) {
/* we don't add in the anchored starts, as the only time it is
* appropriate is if no characters have been consumed. */
next[alpha[TOP]] |= nfa.initDS;
active_squash = next[alpha[TOP]] & squash;
if (active_squash.any()) {
for (size_t j = active_squash.find_first(); j != active_squash.npos;
j = active_squash.find_next(j)) {
next[alpha[TOP]] &= squash_mask.find(j)->second;
}
}
}
}
} // namespace ue2
#endif

549
src/nfagraph/ng_misc_opt.cpp Normal file
View File

@@ -0,0 +1,549 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Miscellaneous optimisations.
*
* We sometimes see patterns of the form:
*
* /^.*<[^<]*foobaz/s
*
* This is bad for Rose as the escapes from the cyclic state are the same as
* the trigger. However, we can transform this into:
*
* /^.*<.*foobaz/s
*
* ... as the first dot star can eat all but the last '<'.
*
* Slightly more formally:
*
* Given a cyclic state v with character reachability v_cr and proper preds
* {p1 .. pn} with character reachability {p1_cr .. pn_cr}.
*
* let v_cr' = union(intersection(p1_cr .. pn_cr), v_cr)
*
* v_cr can be replaced with v_cr' without changing the behaviour of the system
* if:
*
* for any given proper pred pi: if pi is set in the nfa then after consuming
* any symbol in v_cr', pi will still be set in the nfa and every successor of
* v is a successor of pi.
*
* The easiest way for this condition to be satisfied is for each proper pred
* pi to have all its preds all have an edge to a pred of pi with a character
* reachability containing v_cr'. There are, however, other ways to establish
* the condition holds.
*
* Note: a similar transformation can be applied in reverse, details left as an
* exercise for the interested reader. */
#include "ng_misc_opt.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "ue2common.h"
#include <map>
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
static
void findCandidates(NGHolder &g, const vector<NFAVertex> &ordering,
vector<NFAVertex> *cand) {
for (auto it = ordering.rbegin(), ite = ordering.rend(); it != ite; ++it) {
NFAVertex v = *it;
if (is_special(v, g)
|| !hasSelfLoop(v, g)
|| g[v].char_reach.all()) {
continue;
}
// For `v' to be a candidate, its predecessors must all have the same
// successor set as `v'.
set<NFAVertex> succ_v, succ_u;
succ(g, v, &succ_v);
for (auto u : inv_adjacent_vertices_range(v, g)) {
succ_u.clear();
succ(g, u, &succ_u);
if (succ_v != succ_u) {
goto next_cand;
}
}
DEBUG_PRINTF("vertex %u is a candidate\n", g[v].index);
cand->push_back(v);
next_cand:;
}
}
static
void findCandidates_rev(NGHolder &g, const vector<NFAVertex> &ordering,
vector<NFAVertex> *cand) {
for (auto it = ordering.begin(), ite = ordering.end(); it != ite; ++it) {
NFAVertex v = *it;
if (is_special(v, g)
|| !hasSelfLoop(v, g)
|| g[v].char_reach.all()) {
continue;
}
// For `v' to be a candidate, its successors must all have the same
// predecessor set as `v'.
set<NFAVertex> pred_v, pred_u;
pred(g, v, &pred_v);
for (auto u : adjacent_vertices_range(v, g)) {
pred_u.clear();
pred(g, u, &pred_u);
if (pred_v != pred_u) {
goto next_cand;
}
}
DEBUG_PRINTF("vertex %u is a candidate\n", g[v].index);
cand->push_back(v);
next_cand:;
}
}
/** Find the intersection of the reachability of the predecessors of \p v. */
static
void predCRIntersection(const NGHolder &g, NFAVertex v, CharReach &add) {
add.setall();
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u != v) {
add &= g[u].char_reach;
}
}
}
/** Find the intersection of the reachability of the successors of \p v. */
static
void succCRIntersection(const NGHolder &g, NFAVertex v, CharReach &add) {
add.setall();
for (auto u : adjacent_vertices_range(v, g)) {
if (u != v) {
add &= g[u].char_reach;
}
}
}
/** The sustain set is used to show that once vertex p is on it stays on given
* the alphabet new_cr. Every vertex pp in the sustain set has the following
* properties:
* -# an edge to p
* -# enough edges to vertices in the sustain set to ensure that a vertex in
* the sustain set will be on after consuming a character. */
static
set<NFAVertex> findSustainSet(const NGHolder &g, NFAVertex p,
bool ignore_starts, const CharReach &new_cr) {
set<NFAVertex> cand;
pred(g, p, &cand);
if (ignore_starts) {
cand.erase(g.startDs);
}
/* remove elements from cand until the sustain set property holds */
bool changed;
do {
DEBUG_PRINTF("|cand| %zu\n", cand.size());
changed = false;
set<NFAVertex>::const_iterator it = cand.begin();
while (it != cand.end()) {
NFAVertex u = *it;
++it;
CharReach sus_cr;
for (auto v : adjacent_vertices_range(u, g)) {
if (contains(cand, v)) {
sus_cr |= g[v].char_reach;
}
}
if (!new_cr.isSubsetOf(sus_cr)) {
cand.erase(u);
changed = true;
}
}
} while (changed);
/* Note: it may be possible to find a (larger) sustain set for a smaller
* new_cr */
return cand;
}
/** Finds the sustain set for the reverse transformation: the same property as
* findSustainSet, but with all edge directions reversed. */
static
set<NFAVertex> findSustainSet_rev(const NGHolder &g, NFAVertex p,
const CharReach &new_cr) {
set<NFAVertex> cand;
succ(g, p, &cand);
/* remove elements from cand until the sustain set property holds */
bool changed;
do {
changed = false;
set<NFAVertex>::const_iterator it = cand.begin();
while (it != cand.end()) {
NFAVertex u = *it;
++it;
CharReach sus_cr;
for (auto v : inv_adjacent_vertices_range(u, g)) {
if (contains(cand, v)) {
sus_cr |= g[v].char_reach;
}
}
if (!new_cr.isSubsetOf(sus_cr)) {
cand.erase(u);
changed = true;
}
}
} while (changed);
/* Note: it may be possible to find a (larger) sustain set for a smaller
* new_cr */
return cand;
}
static
bool enlargeCyclicVertex(NGHolder &g, som_type som, NFAVertex v) {
DEBUG_PRINTF("considering vertex %u\n", g[v].index);
const CharReach &v_cr = g[v].char_reach;
CharReach add;
predCRIntersection(g, v, add);
add |= v_cr;
if (add == v_cr) {
DEBUG_PRINTF("no benefit\n");
return false;
}
DEBUG_PRINTF("cr of width %zu up for grabs\n", add.count() - v_cr.count());
for (auto p : inv_adjacent_vertices_range(v, g)) {
if (p == v) {
continue;
}
DEBUG_PRINTF("looking at pred %u\n", g[p].index);
bool ignore_sds = som; /* if we are tracking som, entries into a state
from sds are significant. */
set<NFAVertex> sustain = findSustainSet(g, p, ignore_sds, add);
DEBUG_PRINTF("sustain set is %zu\n", sustain.size());
if (sustain.empty()) {
DEBUG_PRINTF("yawn\n");
}
for (auto pp : inv_adjacent_vertices_range(p, g)) {
/* we need to ensure that whenever pp sets p, a member of the
sustain set is also set. Note: p's cr may not be a subset of
new_cr */
CharReach sustain_cr;
for (auto pv : adjacent_vertices_range(pp, g)) {
if (contains(sustain, pv)) {
sustain_cr |= g[pv].char_reach;
}
}
if (!g[p].char_reach.isSubsetOf(sustain_cr)) {
DEBUG_PRINTF("unable to establish that preds are forced on\n");
return false;
}
}
}
/* the cr can be increased */
g[v].char_reach = add;
DEBUG_PRINTF("vertex %u was widened\n", g[v].index);
return true;
}
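/* Sketch of the algebra above for the /^.*<[^<]*foobaz/ example in the file
* comment: the cyclic [^<] vertex has a single proper pred, the '<' vertex,
* so add = intersection(pred reaches) | v_cr = '<' | [^<] = dot; provided the
* sustain-set check holds, the vertex is widened, yielding the
* /^.*<.*foobaz/ form. */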
static
bool enlargeCyclicVertex_rev(NGHolder &g, NFAVertex v) {
DEBUG_PRINTF("considering vertex %u\n", g[v].index);
const CharReach &v_cr = g[v].char_reach;
CharReach add;
succCRIntersection(g, v, add);
add |= v_cr;
if (add == v_cr) {
DEBUG_PRINTF("no benefit\n");
return false;
}
DEBUG_PRINTF("cr of width %zu up for grabs\n", add.count() - v_cr.count());
for (auto p : adjacent_vertices_range(v, g)) {
if (p == v) {
continue;
}
DEBUG_PRINTF("looking at succ %u\n", g[p].index);
set<NFAVertex> sustain = findSustainSet_rev(g, p, add);
DEBUG_PRINTF("sustain set is %zu\n", sustain.size());
if (sustain.empty()) {
DEBUG_PRINTF("yawn\n");
}
for (auto pp : adjacent_vertices_range(p, g)) {
/* we need to ensure the mirror image of the forward condition; see
enlargeCyclicVertex */
CharReach sustain_cr;
for (auto pv : inv_adjacent_vertices_range(pp, g)) {
if (contains(sustain, pv)) {
sustain_cr |= g[pv].char_reach;
}
}
if (!g[p].char_reach.isSubsetOf(sustain_cr)) {
DEBUG_PRINTF("unable to establish that succs are thingy\n");
return false;
}
}
}
/* the cr can be increased */
g[v].char_reach = add;
DEBUG_PRINTF("vertex %u was widened\n", g[v].index);
return true;
}
static
bool enlargeCyclicCR(NGHolder &g, som_type som,
const vector<NFAVertex> &ordering) {
DEBUG_PRINTF("hello\n");
vector<NFAVertex> candidates;
findCandidates(g, ordering, &candidates);
bool rv = false;
for (auto v : candidates) {
rv |= enlargeCyclicVertex(g, som, v);
}
return rv;
}
static
bool enlargeCyclicCR_rev(NGHolder &g, const vector<NFAVertex> &ordering) {
DEBUG_PRINTF("olleh\n");
vector<NFAVertex> candidates;
findCandidates_rev(g, ordering, &candidates);
bool rv = false;
for (auto v : candidates) {
rv |= enlargeCyclicVertex_rev(g, v);
}
return rv;
}
bool improveGraph(NGHolder &g, som_type som) {
/* use a topo ordering so that we can get chains of cyclic states
* done in one sweep */
const vector<NFAVertex> ordering = getTopoOrdering(g);
return enlargeCyclicCR(g, som, ordering)
| enlargeCyclicCR_rev(g, ordering);
}
/** finds a smaller reachability for a state by the reverse transformation of
* enlargeCyclicCR. */
CharReach reduced_cr(NFAVertex v, const NGHolder &g,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic) {
DEBUG_PRINTF("find minimal cr for %u\n", g[v].index);
CharReach v_cr = g[v].char_reach;
if (proper_in_degree(v, g) != 1) {
return v_cr;
}
NFAVertex pred = getSoleSourceVertex(g, v);
assert(pred);
/* require pred to be fed by one vertex OR (start + startDS) */
NFAVertex predpred;
size_t idp = in_degree(pred, g);
if (hasSelfLoop(pred, g)) {
return v_cr; /* not cliche */
} else if (idp == 1) {
predpred = getSoleSourceVertex(g, pred);
} else if (idp == 2
&& edge(g.start, pred, g).second
&& edge(g.startDs, pred, g).second) {
predpred = g.startDs;
} else {
return v_cr; /* not cliche */
}
assert(predpred);
/* require predpred to be cyclic and its cr to be a superset of
pred and v */
if (!hasSelfLoop(predpred, g)) {
return v_cr; /* not cliche */
}
if (contains(br_cyclic, predpred)
&& !br_cyclic.at(predpred).unbounded()) {
return v_cr; /* fake cyclic */
}
const CharReach &p_cr = g[pred].char_reach;
const CharReach &pp_cr = g[predpred].char_reach;
if (!v_cr.isSubsetOf(pp_cr) || !p_cr.isSubsetOf(pp_cr)) {
return v_cr; /* not cliche */
}
DEBUG_PRINTF("confirming [x]* prop\n");
/* we require all of v succs to be succ of p */
set<NFAVertex> v_succ;
insert(&v_succ, adjacent_vertices(v, g));
set<NFAVertex> p_succ;
insert(&p_succ, adjacent_vertices(pred, g));
if (!is_subset_of(v_succ, p_succ)) {
DEBUG_PRINTF("fail\n");
return v_cr; /* not cliche */
}
if (contains(v_succ, g.accept) || contains(v_succ, g.acceptEod)) {
/* need to check that reports of v are a subset of p's */
if (!is_subset_of(g[v].reports,
g[pred].reports)) {
DEBUG_PRINTF("fail - reports not subset\n");
return v_cr; /* not cliche */
}
}
DEBUG_PRINTF("woot success\n");
v_cr &= ~p_cr;
return v_cr;
}
vector<CharReach> reduced_cr(const NGHolder &g,
const map<NFAVertex, BoundedRepeatSummary> &br_cyclic) {
assert(hasCorrectlyNumberedVertices(g));
vector<CharReach> refined_cr(num_vertices(g), CharReach());
for (auto v : vertices_range(g)) {
u32 v_idx = g[v].index;
refined_cr[v_idx] = reduced_cr(v, g, br_cyclic);
}
return refined_cr;
}
static
bool anyOutSpecial(NFAVertex v, const NGHolder &g) {
for (auto w : adjacent_vertices_range(v, g)) {
if (is_special(w, g) && w != v) {
return true;
}
}
return false;
}
bool mergeCyclicDotStars(NGHolder &g) {
set<NFAVertex> verticesToRemove;
set<NFAEdge> edgesToRemove;
// avoid graphs where startDs is not a free spirit
if (out_degree(g.startDs, g) > 1) {
return false;
}
// check if any of the connected vertices are dots
for (auto v : adjacent_vertices_range(g.start, g)) {
if (is_special(v, g)) {
continue;
}
const CharReach &cr = g[v].char_reach;
// if this is a cyclic dot
if (cr.all() && edge(v, v, g).second) {
// prevent insane graphs
if (anyOutSpecial(v, g)) {
continue;
}
// we don't know if we're going to remove this vertex yet
vector<NFAEdge> deadEdges;
// check if all adjacent vertices have edges from start
for (const auto &e : out_edges_range(v, g)) {
NFAVertex t = target(e, g);
// skip self
if (t == v) {
continue;
}
// skip vertices that don't have edges from start
if (!edge(g.start, t, g).second) {
continue;
}
// add an edge from startDs to this vertex
add_edge_if_not_present(g.startDs, t, g);
// mark this edge for removal
deadEdges.push_back(e);
}
// if the number of edges to be removed equals out degree, vertex
// needs to be removed; else, only remove the edges
if (deadEdges.size() == proper_out_degree(v, g)) {
verticesToRemove.insert(v);
} else {
edgesToRemove.insert(deadEdges.begin(), deadEdges.end());
}
}
}
if (verticesToRemove.empty() && edgesToRemove.empty()) {
return false;
}
DEBUG_PRINTF("removing %zu edges and %zu vertices\n", edgesToRemove.size(),
verticesToRemove.size());
remove_edges(edgesToRemove, g);
remove_vertices(verticesToRemove, g);
/* some predecessors to the cyclic vertices may no longer be useful (no out
* edges), so we can remove them */
pruneUseless(g);
return true;
}
} // namespace ue2
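The reduction in reduced_cr() above ends with the set subtraction v_cr &= ~p_cr. Below is a minimal standalone sketch of that arithmetic, with std::bitset<256> standing in for CharReach (a hypothetical illustration, not the Hyperscan API):
#include <bitset>
#include <cassert>
#include <cstdio>

using Reach = std::bitset<256>; // stand-in for CharReach

static Reach reachOf(const char *s) {
    Reach r;
    for (; *s; ++s) {
        r.set((unsigned char)*s);
    }
    return r;
}

int main() {
    // Chain [abc]* -> [ab] -> [abc], meeting the preconditions checked in
    // reduced_cr(): both the pred and the vertex sit inside the cyclic reach.
    Reach pp_cr = reachOf("abc"); // cyclic pred-of-pred
    Reach p_cr = reachOf("ab");   // sole predecessor
    Reach v_cr = reachOf("abc");  // vertex under consideration
    assert((p_cr & ~pp_cr).none());
    assert((v_cr & ~pp_cr).none());

    // Equivalent of the final "v_cr &= ~p_cr" step: only the symbols the
    // predecessor cannot match need to remain on the vertex.
    Reach refined = v_cr & ~p_cr;
    printf("refined reach has %zu symbols\n", refined.count()); // 1, i.e. 'c'
    return 0;
}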

77
src/nfagraph/ng_misc_opt.h Normal file
View File

@@ -0,0 +1,77 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Miscellaneous optimisations.
*/
#ifndef NG_MISC_OPT_H
#define NG_MISC_OPT_H
#include <map>
#include <vector>
#include "ng_holder.h"
#include "som/som.h"
#include "util/depth.h"
namespace ue2 {
/** Small structure describing the bounds on a repeat. */
struct BoundedRepeatSummary {
BoundedRepeatSummary(void) : repeatMin(0), repeatMax(depth::infinity()) {}
BoundedRepeatSummary(const depth &min_in, const depth &max_in)
: repeatMin(min_in), repeatMax(max_in) {
assert(repeatMin <= repeatMax);
assert(repeatMax.is_reachable());
}
bool unbounded(void) const { return repeatMax.is_infinite(); }
depth repeatMin; //!< minimum repeat bound.
depth repeatMax; //!< maximum repeat bound.
};
/* returns true if anything changed */
bool improveGraph(NGHolder &g, som_type som);
/** Sometimes the reach of a vertex is greater than it needs to be: it may
* have been widened to reduce stop chars for the benefit of the rest of our
* code base (accel, sidecar, etc). In these circumstances, we can treat the
* reach as the smaller one, as the graphs are equivalent. */
CharReach reduced_cr(NFAVertex v, const NGHolder &g,
const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic);
std::vector<CharReach> reduced_cr(const NGHolder &g,
const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic);
/** Remove cyclic stars connected to start */
bool mergeCyclicDotStars(NGHolder &g);
} // namespace ue2
#endif
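A small hypothetical usage sketch for BoundedRepeatSummary, assuming it is compiled within the source tree so the relative include resolves; depth(2) with depth::infinity() models an X{2,} repeat:
#include "ng_misc_opt.h" // assumed in-tree include path
#include <cassert>

namespace ue2 {

void boundedRepeatSummaryExample() {
    BoundedRepeatSummary open(depth(2), depth::infinity()); // models X{2,}
    assert(open.unbounded());
    assert(open.repeatMin == depth(2));

    BoundedRepeatSummary closed(depth(3), depth(5));        // models X{3,5}
    assert(!closed.unbounded());
}

} // namespace ue2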

220
src/nfagraph/ng_netflow.cpp Normal file
View File

@@ -0,0 +1,220 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Network flow (min flow, max cut) algorithms.
*/
#include "ng_netflow.h"
#include "ng_holder.h"
#include "ng_literal_analysis.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/graph_range.h"
#include <algorithm>
#include <boost/graph/boykov_kolmogorov_max_flow.hpp>
using namespace std;
using boost::default_color_type;
namespace ue2 {
static
void addReverseEdge(const NGHolder &g, vector<NFAEdge> &reverseEdge,
NFAEdge fwd, NFAEdge rev) {
u32 fwdIndex = g[fwd].index;
u32 revIndex = g[rev].index;
// Make sure our vector is big enough.
size_t sz = max(fwdIndex, revIndex) + 1;
if (reverseEdge.size() < sz) {
reverseEdge.resize(sz);
}
// Add entries to list.
reverseEdge[fwdIndex] = rev;
reverseEdge[revIndex] = fwd;
}
/** Add temporary reverse edges to the graph \p g, as they are required by the
* BGL's boykov_kolmogorov_max_flow algorithm. */
static
void addReverseEdges(NGHolder &g, vector<NFAEdge> &reverseEdge,
vector<u64a> &capacityMap) {
// We're probably going to need space for 2x edge count.
const size_t numEdges = num_edges(g);
reverseEdge.reserve(numEdges * 2);
capacityMap.reserve(numEdges * 2);
// To avoid walking the graph for _ages_, we build a temporary map of all
// edges indexed by vertex pair for existence checks.
map<pair<size_t, size_t>, NFAEdge> allEdges;
for (const auto &e : edges_range(g)) {
NFAVertex u = source(e, g), v = target(e, g);
size_t uidx = g[u].index, vidx = g[v].index;
allEdges[make_pair(uidx, vidx)] = e;
}
// Now we walk over all edges and add their reverse edges to the reverseEdge
// vector, also adding them to the graph when they don't already exist.
for (const auto &m : allEdges) {
const NFAEdge &fwd = m.second;
const size_t uidx = m.first.first, vidx = m.first.second;
auto it = allEdges.find(make_pair(vidx, uidx));
if (it == allEdges.end()) {
// No reverse edge, add one.
NFAVertex u = source(fwd, g), v = target(fwd, g);
NFAEdge rev = add_edge(v, u, g).first;
it = allEdges.insert(make_pair(make_pair(vidx, uidx), rev)).first;
// Add to capacity map.
u32 revIndex = g[rev].index;
if (capacityMap.size() < revIndex + 1) {
capacityMap.resize(revIndex + 1);
}
capacityMap[revIndex] = 0;
}
addReverseEdge(g, reverseEdge, fwd, it->second);
}
}
/** Remove all edges with indices >= \p idx. */
static
void removeEdgesFromIndex(NGHolder &g, vector<u64a> &capacityMap, u32 idx) {
remove_edge_if([&](const NFAEdge &e) { return g[e].index >= idx; }, g);
capacityMap.resize(idx);
}
/** A wrapper around boykov_kolmogorov_max_flow, returns the max flow and
* colour map (from which we can find the min cut). */
static
u64a getMaxFlow(NGHolder &h, const vector<u64a> &capacityMap_in,
vector<default_color_type> &colorMap) {
vector<u64a> capacityMap = capacityMap_in;
NFAVertex src = h.start;
NFAVertex sink = h.acceptEod;
// netflow relies on these stylised edges, as all starts should be covered
// by our source and all accepts by our sink.
assert(edge(h.start, h.startDs, h).second);
assert(edge(h.accept, h.acceptEod, h).second);
// The boykov_kolmogorov_max_flow algorithm requires us to have reverse
// edges for all edges in the graph, so we create them here (and remove
// them after the call).
const unsigned int numRealEdges = num_edges(h);
vector<NFAEdge> reverseEdges;
addReverseEdges(h, reverseEdges, capacityMap);
const unsigned int numTotalEdges = num_edges(h);
const unsigned int numVertices = num_vertices(h);
vector<u64a> edgeResiduals(numTotalEdges);
vector<NFAEdge> predecessors(numVertices);
vector<s32> distances(numVertices);
assert(colorMap.size() == numVertices);
const NFAGraph &g = h.g;
auto v_index_map = get(&NFAGraphVertexProps::index, g);
auto e_index_map = get(&NFAGraphEdgeProps::index, g);
u64a flow = boykov_kolmogorov_max_flow(g,
make_iterator_property_map(capacityMap.begin(), e_index_map),
make_iterator_property_map(edgeResiduals.begin(), e_index_map),
make_iterator_property_map(reverseEdges.begin(), e_index_map),
make_iterator_property_map(predecessors.begin(), v_index_map),
make_iterator_property_map(colorMap.begin(), v_index_map),
make_iterator_property_map(distances.begin(), v_index_map),
v_index_map,
src, sink);
// Remove reverse edges from graph.
removeEdgesFromIndex(h, capacityMap, numRealEdges);
assert(num_edges(h.g) == numRealEdges);
DEBUG_PRINTF("flow = %llu\n", flow);
return flow;
}
/** Returns a min cut for the graph in \p h, using the per-edge capacities
* given in \p scores. */
vector<NFAEdge> findMinCut(NGHolder &h, const vector<u64a> &scores) {
assert(hasCorrectlyNumberedEdges(h));
assert(hasCorrectlyNumberedVertices(h));
vector<default_color_type> colorMap(num_vertices(h));
u64a flow = getMaxFlow(h, scores, colorMap);
vector<NFAEdge> picked_white;
vector<NFAEdge> picked_black;
u64a observed_black_flow = 0;
u64a observed_white_flow = 0;
for (const auto &e : edges_range(h)) {
NFAVertex from = source(e, h);
NFAVertex to = target(e, h);
u64a ec = scores[h[e].index];
if (ec == 0) {
continue; // skips, among other things, reverse edges
}
default_color_type fromColor = colorMap[h[from].index];
default_color_type toColor = colorMap[h[to].index];
if (fromColor != boost::white_color && toColor == boost::white_color) {
assert(ec <= INVALID_EDGE_CAP);
DEBUG_PRINTF("found white cut edge %u->%u cap %llu\n",
h[from].index, h[to].index, ec);
observed_white_flow += ec;
picked_white.push_back(e);
}
if (fromColor == boost::black_color && toColor != boost::black_color) {
assert(ec <= INVALID_EDGE_CAP);
DEBUG_PRINTF("found black cut edge %u->%u cap %llu\n",
h[from].index, h[to].index, ec);
observed_black_flow += ec;
picked_black.push_back(e);
}
}
DEBUG_PRINTF("min flow = %llu b flow = %llu w flow %llu\n", flow,
observed_black_flow, observed_white_flow);
if (MIN(observed_white_flow, observed_black_flow) != flow) {
DEBUG_PRINTF("bad cut\n");
}
if (observed_white_flow < observed_black_flow) {
return picked_white;
} else {
return picked_black;
}
}
} // namespace ue2
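The tail of findMinCut() above keeps whichever side of the cut carries less capacity. Here is a standalone sketch of that selection step, with plain structs standing in for NFAEdge and the BGL colour map (a hypothetical illustration):
#include <cstdint>
#include <cstdio>
#include <vector>

struct ToyEdge {
    int from, to;
    uint64_t cap;
};

enum class Colour { white, grey, black }; // mirrors the BGL colour map

static std::vector<ToyEdge> pickCheaperCut(const std::vector<ToyEdge> &edges,
                                           const std::vector<Colour> &colour) {
    std::vector<ToyEdge> white, black;
    uint64_t whiteFlow = 0, blackFlow = 0;
    for (const auto &e : edges) {
        if (e.cap == 0) {
            continue; // zero-capacity (e.g. reverse) edges never form the cut
        }
        if (colour[e.from] != Colour::white && colour[e.to] == Colour::white) {
            whiteFlow += e.cap;
            white.push_back(e);
        }
        if (colour[e.from] == Colour::black && colour[e.to] != Colour::black) {
            blackFlow += e.cap;
            black.push_back(e);
        }
    }
    return whiteFlow < blackFlow ? white : black;
}

int main() {
    // Source 0 feeds 1 and 2; both feed sink 3. The cheapest cut is the pair
    // of edges into the sink (capacity 2 + 3 = 5).
    std::vector<ToyEdge> edges = {{0, 1, 5}, {0, 2, 5}, {1, 3, 2}, {2, 3, 3}};
    std::vector<Colour> colour = {Colour::black, Colour::black, Colour::black,
                                  Colour::white};
    printf("cut has %zu edges\n", pickCheaperCut(edges, colour).size()); // 2
    return 0;
}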

49
src/nfagraph/ng_netflow.h Normal file
View File

@@ -0,0 +1,49 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Network flow (min flow, max cut) algorithms.
*/
#ifndef NG_NETFLOW_H
#define NG_NETFLOW_H
#include "ng_holder.h"
#include "ue2common.h"
#include <vector>
namespace ue2 {
class NGHolder;
/** Returns a min cut for the graph in \p h, using the per-edge capacities
* given in \p scores. */
std::vector<NFAEdge> findMinCut(NGHolder &h, const std::vector<u64a> &scores);
} // namespace ue2
#endif
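A hypothetical caller sketch (names and weights are illustrative, not from this commit): findMinCut() expects one score per edge, indexed by the edge's index property, so a driver can build a score vector of size num_edges(h). A real caller would likely weight edges it must keep, such as the edges leaving start, far more heavily than a uniform value:
#include "ng_netflow.h" // assumed in-tree include path
#include <vector>

namespace ue2 {

// Assumes h already has correctly numbered edges (findMinCut asserts this).
std::vector<NFAEdge> uniformMinCut(NGHolder &h) {
    // One score per edge, indexed by h[e].index; a uniform weight of 1 makes
    // every edge equally attractive to cut.
    std::vector<u64a> scores(num_edges(h), 1);
    return findMinCut(h, scores);
}

} // namespace ue2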

374
src/nfagraph/ng_prefilter.cpp Normal file
View File

@@ -0,0 +1,374 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Prefilter Reductions.
*
* This file contains routines for reducing the size of an NFA graph that we
* know will be used as a prefilter.
*
* The approach used is to consider the graph as a chain of region subgraphs,
* and to reduce the size of the graph by replacing regions with constructs
* that can be implemented in fewer states.
*
* Right now, the approach used is to replace a region with a bounded repeat of
* vertices (with bounds derived from the min/max width of the region
* subgraph). These vertices are given the union of the region's character
* reachability.
*
* For regions with bounded max width, this strategy is quite dependent on the
* LimEx NFA's bounded repeat functionality.
*/
#include "ng_prefilter.h"
#include "ng_holder.h"
#include "ng_region.h"
#include "ng_util.h"
#include "ng_width.h"
#include "ue2common.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/ue2_containers.h"
#include "util/graph_range.h"
#include <queue>
#include <boost/range/adaptor/map.hpp>
using namespace std;
using boost::adaptors::map_values;
namespace ue2 {
/** Keep attempting to reduce the size of the graph until the number of
* vertices falls below this value. */
static const size_t MAX_COMPONENT_VERTICES = 128;
/** Only replace a region with at least this many vertices. */
static const size_t MIN_REPLACE_VERTICES = 2;
/** Estimate of how many vertices are required to represent a bounded repeat in
* the implementation NFA. */
static const size_t BOUNDED_REPEAT_COUNT = 4;
/** Scoring penalty for boundary regions. */
static const size_t PENALTY_BOUNDARY = 32;
namespace {
/** Information describing a region. */
struct RegionInfo {
explicit RegionInfo(u32 id_in) : id(id_in) {}
u32 id; //!< region id
deque<NFAVertex> vertices; //!< vertices in the region
CharReach reach; //!< union of region reach
depth minWidth = 0; //!< min width of region subgraph
depth maxWidth = depth::infinity(); //!< max width of region subgraph
bool atBoundary = false; //!< region is next to an accept
// Bigger score is better.
size_t score() const {
// FIXME: charreach should be a signal?
size_t numVertices = vertices.size();
if (atBoundary) {
return numVertices - min(PENALTY_BOUNDARY, numVertices);
} else {
return numVertices;
}
}
};
/** Comparator used to order regions for consideration in a priority queue. */
struct RegionInfoQueueComp {
bool operator()(const RegionInfo &r1, const RegionInfo &r2) const {
size_t score1 = r1.score(), score2 = r2.score();
if (score1 != score2) {
return score1 < score2;
}
if (r1.reach.count() != r2.reach.count()) {
return r1.reach.count() < r2.reach.count();
}
return r1.id < r2.id;
}
};
} // namespace
static
void findWidths(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &region_map,
RegionInfo &ri) {
NGHolder rg;
ue2::unordered_map<NFAVertex, NFAVertex> mapping;
fillHolder(&rg, g, ri.vertices, &mapping);
// Wire our entries to start and our exits to accept.
for (auto v : ri.vertices) {
NFAVertex v_new = mapping[v];
assert(v_new != NFAGraph::null_vertex());
if (isRegionEntry(g, v, region_map) &&
!edge(rg.start, v_new, rg).second) {
add_edge(rg.start, v_new, rg);
}
if (isRegionExit(g, v, region_map) &&
!edge(v_new, rg.accept, rg).second) {
add_edge(v_new, rg.accept, rg);
}
}
ri.minWidth = findMinWidth(rg);
ri.maxWidth = findMaxWidth(rg);
}
// acc can be either h.accept or h.acceptEod.
static
void markBoundaryRegions(const NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &region_map,
map<u32, RegionInfo> &regions, NFAVertex acc) {
for (auto v : inv_adjacent_vertices_range(acc, h)) {
if (is_special(v, h)) {
continue;
}
u32 id = region_map.at(v);
map<u32, RegionInfo>::iterator ri = regions.find(id);
if (ri == regions.end()) {
continue; // Not tracking this region as it's too small.
}
ri->second.atBoundary = true;
}
}
static
map<u32, RegionInfo> findRegionInfo(const NGHolder &h,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
map<u32, RegionInfo> regions;
for (auto v : vertices_range(h)) {
if (is_special(v, h)) {
continue;
}
u32 id = region_map.at(v);
RegionInfo &ri = regions.insert(
make_pair(id, RegionInfo(id))).first->second;
ri.vertices.push_back(v);
ri.reach |= h[v].char_reach;
}
// There's no point tracking more information about regions that we won't
// consider replacing, so we remove them from the region map.
for (map<u32, RegionInfo>::iterator it = regions.begin();
it != regions.end();) {
if (it->second.vertices.size() < MIN_REPLACE_VERTICES) {
regions.erase(it++);
} else {
++it;
}
}
DEBUG_PRINTF("%zu regions\n", regions.size());
markBoundaryRegions(h, region_map, regions, h.accept);
markBoundaryRegions(h, region_map, regions, h.acceptEod);
// Determine min/max widths.
for (RegionInfo &ri : regions | map_values) {
findWidths(h, region_map, ri);
DEBUG_PRINTF("region %u %shas widths [%s,%s]\n", ri.id,
ri.atBoundary ? "(boundary) " : "",
ri.minWidth.str().c_str(), ri.maxWidth.str().c_str());
}
return regions;
}
static
void copyInEdges(NGHolder &g, NFAVertex from, NFAVertex to,
const ue2::unordered_set<NFAVertex> &rverts) {
for (const auto &e : in_edges_range(from, g)) {
NFAVertex u = source(e, g);
if (contains(rverts, u)) {
continue;
}
if (edge(u, to, g).second) {
continue;
}
add_edge(u, to, g[e], g);
}
}
static
void copyOutEdges(NGHolder &g, NFAVertex from, NFAVertex to,
const ue2::unordered_set<NFAVertex> &rverts) {
for (const auto &e : out_edges_range(from, g)) {
NFAVertex t = target(e, g);
if (contains(rverts, t)) {
continue;
}
add_edge_if_not_present(to, t, g[e], g);
if (is_any_accept(t, g)) {
const auto &reports = g[from].reports;
g[to].reports.insert(reports.begin(), reports.end());
}
}
}
static
void replaceRegion(NGHolder &g, const RegionInfo &ri,
size_t *verticesAdded, size_t *verticesRemoved) {
// TODO: more complex replacements.
assert(ri.vertices.size() >= MIN_REPLACE_VERTICES);
assert(ri.minWidth.is_finite());
size_t replacementSize;
if (ri.minWidth == ri.maxWidth || ri.maxWidth.is_infinite()) {
replacementSize = ri.minWidth; // {N} or {N,}
} else {
replacementSize = ri.maxWidth; // {N,M} case
}
DEBUG_PRINTF("orig size %zu, replace size %zu\n", ri.vertices.size(),
replacementSize);
deque<NFAVertex> verts;
for (size_t i = 0; i < replacementSize; i++) {
NFAVertex v = add_vertex(g);
g[v].char_reach = ri.reach;
if (i > 0) {
add_edge(verts.back(), v, g);
}
verts.push_back(v);
}
if (ri.maxWidth.is_infinite()) {
add_edge(verts.back(), verts.back(), g);
}
// Set of vertices in region, for quick lookups.
const ue2::unordered_set<NFAVertex> rverts(ri.vertices.begin(),
ri.vertices.end());
for (size_t i = 0; i < replacementSize; i++) {
NFAVertex v_new = verts[i];
for (auto v_old : ri.vertices) {
if (i == 0) {
copyInEdges(g, v_old, v_new, rverts);
}
if (i + 1 >= ri.minWidth) {
copyOutEdges(g, v_old, v_new, rverts);
}
}
}
remove_vertices(ri.vertices, g, false);
*verticesAdded = verts.size();
*verticesRemoved = ri.vertices.size();
}
namespace {
struct SourceHasEdgeToAccept {
explicit SourceHasEdgeToAccept(const NGHolder &g_in) : g(g_in) {}
bool operator()(const NFAEdge &e) const {
return edge(source(e, g), g.accept, g).second;
}
const NGHolder &g;
};
}
static
void reduceRegions(NGHolder &h) {
map<u32, RegionInfo> regions = findRegionInfo(h, assignRegions(h));
RegionInfoQueueComp cmp;
priority_queue<RegionInfo, deque<RegionInfo>, RegionInfoQueueComp> pq(cmp);
size_t numVertices = 0;
for (const RegionInfo &ri : regions | map_values) {
numVertices += ri.vertices.size();
pq.push(ri);
}
while (numVertices > MAX_COMPONENT_VERTICES && !pq.empty()) {
const RegionInfo &ri = pq.top();
DEBUG_PRINTF("region %u: vertices=%zu reach=%s score=%zu, "
"widths=[%s,%s]\n",
ri.id, ri.vertices.size(), describeClass(ri.reach).c_str(),
ri.score(), ri.minWidth.str().c_str(),
ri.maxWidth.str().c_str());
size_t verticesAdded = 0;
size_t verticesRemoved = 0;
replaceRegion(h, ri, &verticesAdded, &verticesRemoved);
DEBUG_PRINTF("%zu vertices removed, %zu vertices added\n",
verticesRemoved, verticesAdded);
// We are trusting that implementation NFAs will be able to use the
// LimEx bounded repeat code here.
numVertices -= verticesRemoved;
numVertices += BOUNDED_REPEAT_COUNT;
DEBUG_PRINTF("numVertices is now %zu\n", numVertices);
pq.pop();
}
// We may have vertices that have edges to both accept and acceptEod: in
// this case, we can optimize for performance by removing the acceptEod
// edges.
remove_in_edge_if(h.acceptEod, SourceHasEdgeToAccept(h), h.g);
}
void prefilterReductions(NGHolder &h, const CompileContext &cc) {
if (!cc.grey.prefilterReductions) {
return;
}
if (num_vertices(h) <= MAX_COMPONENT_VERTICES) {
DEBUG_PRINTF("graph is already small enough (%zu vertices)\n",
num_vertices(h));
return;
}
DEBUG_PRINTF("graph with %zu vertices\n", num_vertices(h));
h.renumberVertices();
h.renumberEdges();
reduceRegions(h);
h.renumberVertices();
h.renumberEdges();
}
} // namespace ue2
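replaceRegion() above derives the length of the replacement chain from the region's width bounds. Below is a standalone sketch of that sizing rule, with plain integers standing in for the depth class (a hypothetical illustration):
#include <cassert>
#include <cstddef>
#include <limits>

static const size_t INF = std::numeric_limits<size_t>::max(); // stand-in for depth::infinity()

struct ReplacementPlan {
    size_t chainLength;   // number of vertices in the replacement chain
    bool selfLoopOnLast;  // a trailing self-loop models an unbounded max
};

static ReplacementPlan planReplacement(size_t minWidth, size_t maxWidth) {
    ReplacementPlan p;
    if (minWidth == maxWidth || maxWidth == INF) {
        p.chainLength = minWidth; // {N} or {N,}
    } else {
        p.chainLength = maxWidth; // {N,M}
    }
    p.selfLoopOnLast = (maxWidth == INF);
    return p;
}

int main() {
    assert(planReplacement(3, 3).chainLength == 3);   // {3}
    assert(planReplacement(2, INF).chainLength == 2); // {2,}
    assert(planReplacement(2, INF).selfLoopOnLast);
    assert(planReplacement(2, 5).chainLength == 5);   // {2,5}
    return 0;
}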

45
src/nfagraph/ng_prefilter.h Normal file
View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Prefilter Reductions.
*/
#ifndef NG_PREFILTER_H
#define NG_PREFILTER_H
namespace ue2 {
class NGHolder;
struct CompileContext;
void prefilterReductions(NGHolder &h, const CompileContext &cc);
} // namespace ue2
#endif

438
src/nfagraph/ng_prune.cpp Normal file
View File

@@ -0,0 +1,438 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Functions for pruning unreachable vertices or reports from the graph.
*/
#include "ng_prune.h"
#include "ng_dominators.h"
#include "ng_holder.h"
#include "ng_reports.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
#include <deque>
#include <map>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/reverse_graph.hpp>
using namespace std;
using boost::default_color_type;
using boost::reverse_graph;
namespace ue2 {
/** Remove any vertices that can't be reached by traversing the graph in
* reverse from acceptEod. */
void pruneUnreachable(NGHolder &g) {
deque<NFAVertex> dead;
if (!hasGreaterInDegree(1, g.acceptEod, g) &&
!hasGreaterInDegree(0, g.accept, g) &&
edge(g.accept, g.acceptEod, g).second) {
// Trivial case: there are no in-edges to our accepts (other than
// accept->acceptEod), so all non-specials are unreachable.
for (auto v : vertices_range(g)) {
if (!is_special(v, g)) {
dead.push_back(v);
}
}
} else {
// Walk a reverse graph from acceptEod with Boost's depth_first_visit
// call.
typedef reverse_graph<NFAGraph, NFAGraph&> RevNFAGraph;
RevNFAGraph revg(g.g);
map<NFAVertex, default_color_type> colours;
depth_first_visit(revg, g.acceptEod,
make_dfs_visitor(boost::null_visitor()),
make_assoc_property_map(colours));
DEBUG_PRINTF("color map has %zu entries after DFV\n", colours.size());
// All non-special vertices that aren't in the colour map (because they
// weren't reached) can be removed.
for (auto v : vertices_range(revg)) {
if (is_special(v, revg)) {
continue;
}
if (!contains(colours, v)) {
dead.push_back(v);
}
}
}
if (dead.empty()) {
DEBUG_PRINTF("no unreachable vertices\n");
return;
}
remove_vertices(dead, g, false);
DEBUG_PRINTF("removed %zu unreachable vertices\n", dead.size());
}
template<class nfag_t>
static
bool pruneForwardUseless(NGHolder &h, const nfag_t &g, NFAVertex s,
vector<default_color_type> &vertexColor) {
// Begin with all vertices set to white, as DFV only marks visited
// vertices.
fill(vertexColor.begin(), vertexColor.end(), boost::white_color);
auto index_map = get(&NFAGraphVertexProps::index, g);
depth_first_visit(g, s, make_dfs_visitor(boost::null_visitor()),
make_iterator_property_map(vertexColor.begin(),
index_map));
vector<NFAVertex> dead;
// All non-special vertices that are still white can be removed.
for (auto v : vertices_range(g)) {
u32 idx = g[v].index;
if (!is_special(v, g) && vertexColor[idx] == boost::white_color) {
DEBUG_PRINTF("vertex %u is unreachable from %u\n",
g[v].index, g[s].index);
dead.push_back(v);
}
}
if (dead.empty()) {
return false;
}
DEBUG_PRINTF("removing %zu vertices\n", dead.size());
remove_vertices(dead, h, false);
return true;
}
/** Remove any vertices which can't be reached by traversing the graph forward
* from start or in reverse from acceptEod. If \p renumber is false, no
* vertex/edge renumbering is done. */
void pruneUseless(NGHolder &g, bool renumber) {
DEBUG_PRINTF("pruning useless vertices\n");
assert(hasCorrectlyNumberedVertices(g));
vector<default_color_type> vertexColor(num_vertices(g));
bool work_done = pruneForwardUseless(g, g.g, g.start, vertexColor);
work_done |= pruneForwardUseless(
g, reverse_graph<NFAGraph, NFAGraph &>(g.g), g.acceptEod, vertexColor);
if (!work_done) {
return;
}
if (renumber) {
g.renumberEdges();
g.renumberVertices();
}
}
/** This code removes any vertices which do not accept any symbols. Any
* vertices which no longer lie on a path from a start to an accept are also
* pruned. */
void pruneEmptyVertices(NGHolder &g) {
DEBUG_PRINTF("pruning empty vertices\n");
vector<NFAVertex> dead;
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
const CharReach &cr = g[v].char_reach;
if (cr.none()) {
DEBUG_PRINTF("empty: %u\n", g[v].index);
dead.push_back(v);
}
}
if (dead.empty()) {
return;
}
remove_vertices(dead, g);
pruneUseless(g);
}
/** Remove any edges from vertices that generate accepts (for Highlander
* graphs). */
void pruneHighlanderAccepts(NGHolder &g, const ReportManager &rm) {
// Safety check: all reports must be simple exhaustible reports, or this is
// not safe. This optimisation should be called early enough that no
// internal reports have been added.
for (auto report_id : all_reports(g)) {
const Report &ir = rm.getReport(report_id);
if (ir.ekey == INVALID_EKEY || ir.hasBounds() ||
!isExternalReport(ir)) {
DEBUG_PRINTF("report %u is not external highlander with "
"no bounds\n", report_id);
return;
}
}
vector<NFAEdge> dead;
for (auto u : inv_adjacent_vertices_range(g.accept, g)) {
if (is_special(u, g)) {
continue;
}
// We can prune any out-edges that aren't accepts
for (const auto &e : out_edges_range(u, g)) {
if (!is_any_accept(target(e, g), g)) {
dead.push_back(e);
}
}
}
if (dead.empty()) {
return;
}
DEBUG_PRINTF("found %zu removable edges due to single match\n", dead.size());
remove_edges(dead, g);
pruneUseless(g);
}
static
bool isDominatedByReporter(const NGHolder &g,
const ue2::unordered_map<NFAVertex, NFAVertex> &dom,
NFAVertex v, ReportID report_id) {
for (auto it = dom.find(v); it != end(dom); it = dom.find(v)) {
NFAVertex u = it->second;
// Note: reporters with edges only to acceptEod are not considered to
// dominate.
if (edge(u, g.accept, g).second && contains(g[u].reports, report_id)) {
DEBUG_PRINTF("%u is dominated by %u, and both report %u\n",
g[v].index, g[u].index, report_id);
return true;
}
v = u;
}
return false;
}
/**
* True if the vertex has (a) a self-loop, (b) only out-edges to accept and
* itself and (c) only simple exhaustible reports.
*/
static
bool hasOnlySelfLoopAndExhaustibleAccepts(const NGHolder &g,
const ReportManager &rm,
NFAVertex v) {
if (!edge(v, v, g).second) {
return false;
}
for (auto w : adjacent_vertices_range(v, g)) {
if (w != v && w != g.accept) {
return false;
}
}
for (const auto &report_id : g[v].reports) {
if (!isSimpleExhaustible(rm.getReport(report_id))) {
return false;
}
}
return true;
}
void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) {
vector<NFAVertex> reporters;
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
for (const auto &report_id : g[v].reports) {
const Report &r = rm.getReport(report_id);
if (isSimpleExhaustible(r)) {
reporters.push_back(v);
break;
}
}
}
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
for (const auto &report_id : g[v].reports) {
const Report &r = rm.getReport(report_id);
if (isSimpleExhaustible(r)) {
reporters.push_back(v);
break;
}
}
}
if (reporters.empty()) {
return;
}
sort(begin(reporters), end(reporters), make_index_ordering(g));
reporters.erase(unique(begin(reporters), end(reporters)), end(reporters));
DEBUG_PRINTF("%zu vertices have simple exhaustible reports\n",
reporters.size());
const auto &dom = findDominators(g);
bool modified = false;
// If a reporter vertex is dominated by another with the same report, we
// can remove that report; if all reports are removed, we can remove the
// vertex entirely.
for (const auto v : reporters) {
const auto reports = g[v].reports; // copy, as we're going to mutate
for (const auto &report_id : reports) {
if (!isSimpleExhaustible(rm.getReport(report_id))) {
continue;
}
if (isDominatedByReporter(g, dom, v, report_id)) {
DEBUG_PRINTF("removed dominated report %u from vertex %u\n",
report_id, g[v].index);
g[v].reports.erase(report_id);
}
}
if (g[v].reports.empty()) {
DEBUG_PRINTF("removed edges to accepts from %u, no reports left\n",
g[v].index);
remove_edge(v, g.accept, g);
remove_edge(v, g.acceptEod, g);
modified = true;
}
}
// If a reporter vertex has a self-loop, but otherwise only leads to accept
// (note: NOT acceptEod) and has simple exhaustible reports, we can delete
// the self-loop.
for (const auto v : reporters) {
if (hasOnlySelfLoopAndExhaustibleAccepts(g, rm, v)) {
remove_edge(v, v, g);
modified = true;
DEBUG_PRINTF("removed self-loop on %u\n", g[v].index);
}
}
if (!modified) {
return;
}
pruneUseless(g);
// We may have only removed self-loops, in which case pruneUseless wouldn't
// renumber, so we do edge renumbering explicitly here.
g.renumberEdges();
}
/** Removes the given Report ID from vertices connected to accept, and then
* prunes useless vertices that have had their report sets reduced to empty. */
void pruneReport(NGHolder &g, ReportID report) {
set<NFAEdge> dead;
for (const auto &e : in_edges_range(g.accept, g)) {
NFAVertex u = source(e, g);
auto &reports = g[u].reports;
if (contains(reports, report)) {
reports.erase(report);
if (reports.empty()) {
dead.insert(e);
}
}
}
for (const auto &e : in_edges_range(g.acceptEod, g)) {
NFAVertex u = source(e, g);
if (u == g.accept) {
continue;
}
auto &reports = g[u].reports;
if (contains(reports, report)) {
reports.erase(report);
if (reports.empty()) {
dead.insert(e);
}
}
}
if (dead.empty()) {
return;
}
remove_edges(dead, g);
pruneUnreachable(g);
g.renumberVertices();
g.renumberEdges();
}
/** Removes all Report IDs bar the given one from vertices connected to accept,
* and then prunes useless vertices that have had their report sets reduced to
* empty. */
void pruneAllOtherReports(NGHolder &g, ReportID report) {
set<NFAEdge> dead;
for (const auto &e : in_edges_range(g.accept, g)) {
NFAVertex u = source(e, g);
auto &reports = g[u].reports;
if (contains(reports, report)) {
reports.clear();
reports.insert(report);
} else {
reports.clear();
dead.insert(e);
}
}
for (const auto &e : in_edges_range(g.acceptEod, g)) {
NFAVertex u = source(e, g);
if (u == g.accept) {
continue;
}
auto &reports = g[u].reports;
if (contains(reports, report)) {
reports.clear();
reports.insert(report);
} else {
reports.clear();
dead.insert(e);
}
}
if (dead.empty()) {
return;
}
remove_edges(dead, g);
pruneUnreachable(g);
g.renumberVertices();
g.renumberEdges();
}
} // namespace ue2
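pruneUnreachable() walks the reversed graph from acceptEod and discards whatever the walk never reaches. Here is a standalone sketch of the same idea over a plain adjacency list (a hypothetical illustration, not the NGHolder API):
#include <cstdio>
#include <queue>
#include <vector>

static std::vector<int> unreachableFromSink(
        const std::vector<std::vector<int>> &adj, int sink) {
    const size_t n = adj.size();
    // Build the reverse adjacency list.
    std::vector<std::vector<int>> radj(n);
    for (size_t u = 0; u < n; u++) {
        for (int v : adj[u]) {
            radj[v].push_back((int)u);
        }
    }
    // BFS from the sink over reverse edges.
    std::vector<bool> seen(n, false);
    std::queue<int> q;
    seen[sink] = true;
    q.push(sink);
    while (!q.empty()) {
        int u = q.front();
        q.pop();
        for (int v : radj[u]) {
            if (!seen[v]) {
                seen[v] = true;
                q.push(v);
            }
        }
    }
    // Anything never marked cannot reach the sink and can be pruned.
    std::vector<int> dead;
    for (size_t v = 0; v < n; v++) {
        if (!seen[v]) {
            dead.push_back((int)v);
        }
    }
    return dead;
}

int main() {
    // 0 -> 1 -> 3 (sink); vertex 2 has no path to the sink.
    std::vector<std::vector<int>> adj = {{1}, {3}, {}, {}};
    printf("%zu dead vertices\n", unreachableFromSink(adj, 3).size()); // 1
    return 0;
}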

75
src/nfagraph/ng_prune.h Normal file
View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Functions for pruning unreachable vertices or reports from the graph.
*/
#ifndef NG_PRUNE_H
#define NG_PRUNE_H
#include "ue2common.h"
namespace ue2 {
class NGHolder;
class ReportManager;
/** Remove any vertices that can't be reached by traversing the graph in
* reverse from acceptEod. */
void pruneUnreachable(NGHolder &g);
/** Remove any vertices which can't be reached by traversing the graph forward
* from start or in reverse from acceptEod. If \p renumber is false, no
* vertex/edge renumbering is done. */
void pruneUseless(NGHolder &g, bool renumber = true);
/** Remove any vertices with empty reachability. */
void pruneEmptyVertices(NGHolder &g);
/** Remove any edges from vertices that generate accepts (for Highlander
* graphs). */
void pruneHighlanderAccepts(NGHolder &g, const ReportManager &rm);
/**
* Prune highlander reports that are dominated by earlier ones in the graph.
*/
void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm);
/** Removes the given Report ID from vertices connected to accept, and then
* prunes useless vertices that have had their report sets reduced to empty. */
void pruneReport(NGHolder &g, ReportID report);
/** Removes all Report IDs bar the given one from vertices connected to accept,
* and then prunes useless vertices that have had their report sets reduced to
* empty. */
void pruneAllOtherReports(NGHolder &g, ReportID report);
} // namespace ue2
#endif // NG_PRUNE_H

578
src/nfagraph/ng_puff.cpp Normal file
View File

@@ -0,0 +1,578 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Puff construction from NGHolder.
*/
#include "ng_puff.h"
#include "grey.h"
#include "ng_depth.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_repeat.h"
#include "ng_reports.h"
#include "ng_util.h"
#include "ue2common.h"
#include "nfa/nfa_api_queue.h"
#include "nfa/mpvcompile.h"
#include "rose/rose_build.h"
#include "util/compile_context.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
#include <vector>
using namespace std;
namespace ue2 {
static const unsigned MIN_PUFF_LENGTH = 16;
static const unsigned HEAD_BACKOFF = 16;
static
size_t countChain(const NGHolder &g, NFAVertex v) {
size_t count = 0;
while (v) {
DEBUG_PRINTF("counting vertex %u\n", g[v].index);
if (is_special(v, g)) {
break;
}
count++;
v = getSoleDestVertex(g, v);
}
DEBUG_PRINTF("done %zu\n", count);
return count;
}
static
void wireNewAccepts(NGHolder &g, NFAVertex head,
const flat_set<ReportID> &chain_reports) {
for (auto u : inv_adjacent_vertices_range(head, g)) {
if (is_special(u, g)) {
continue;
}
DEBUG_PRINTF("adding edge: %u -> accept\n", g[u].index);
assert(!edge(u, g.accept, g).second);
assert(!edge(u, g.acceptEod, g).second);
add_edge(u, g.accept, g);
// Replace reports with our chain reports.
auto &u_reports = g[u].reports;
u_reports.clear();
u_reports.insert(chain_reports.begin(), chain_reports.end());
}
}
static
bool isFixedDepth(const NGHolder &g, NFAVertex v) {
// If the vertex is reachable from startDs, it can't be fixed depth.
vector<DepthMinMax> depthFromStartDs;
calcDepthsFrom(g, g.startDs, depthFromStartDs);
u32 idx = g[v].index;
const DepthMinMax &ds = depthFromStartDs.at(idx);
if (ds.min.is_reachable()) {
DEBUG_PRINTF("vertex reachable from startDs\n");
return false;
}
vector<DepthMinMax> depthFromStart;
calcDepthsFrom(g, g.start, depthFromStart);
/* we can still consider the head of a puff chain as at fixed depth if
* it has a self-loop: so we look at all the preds of v (other than v
* itself) */
assert(v && !is_special(v, g));
u32 count = 0;
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == v) {
continue; // self-loop
}
count++;
idx = g[u].index;
const DepthMinMax &d = depthFromStart.at(idx);
if (d.min != d.max) {
return false;
}
}
return count != 0; // at least one fixed-depth pred
}
static
bool singleStart(const NGHolder &g) {
set<NFAVertex> seen;
for (auto v : adjacent_vertices_range(g.start, g)) {
if (!is_special(v, g)) {
DEBUG_PRINTF("saw %u\n", g[v].index);
seen.insert(v);
}
}
for (auto v : adjacent_vertices_range(g.startDs, g)) {
if (!is_special(v, g)) {
DEBUG_PRINTF("saw %u\n", g[v].index);
seen.insert(v);
}
}
DEBUG_PRINTF("comp has %zu starts\n", seen.size());
return seen.size() == 1;
}
static
bool triggerResetsPuff(const NGHolder &g, NFAVertex head) {
const CharReach puff_escapes = ~g[head].char_reach;
for (auto u : inv_adjacent_vertices_range(head, g)) {
if (!g[u].char_reach.isSubsetOf(puff_escapes)) {
DEBUG_PRINTF("no reset on trigger %u %u\n", g[u].index,
g[head].index);
return false;
}
}
DEBUG_PRINTF("reset on trigger\n");
return true;
}
/** ".*[X]{N}" can be treated as ".*[X]{N,}" (misc_opt does reverse transform)
* */
static
bool triggerFloodsPuff(const NGHolder &g, NFAVertex head) {
DEBUG_PRINTF("head = %u\n", g[head].index);
const CharReach &puff_cr = g[head].char_reach;
/* we can use the pred of the head as the base of our check if its cr
* matches, as:
* head cr subset of pred cr: if head is being pushed on, then the puff
* must still be being pushed on;
* pred cr subset of head cr: if the puff matches, then head must also
* always be on, provided it is connected to a wide enough cyclic state
*/
if (proper_in_degree(head, g) == 1
&& puff_cr == g[getSoleSourceVertex(g, head)].char_reach) {
head = getSoleSourceVertex(g, head);
DEBUG_PRINTF("temp new head = %u\n", g[head].index);
}
for (auto s : inv_adjacent_vertices_range(head, g)) {
DEBUG_PRINTF("s = %u\n", g[s].index);
if (!puff_cr.isSubsetOf(g[s].char_reach)) {
DEBUG_PRINTF("no flood on trigger %u %u\n",
g[s].index, g[head].index);
return false;
}
if (!hasSelfLoop(s, g) && s != g.start) {
DEBUG_PRINTF("no self loop\n");
return false;
}
if (s == g.start && !edge(g.startDs, head, g).second) {
DEBUG_PRINTF("not float\n");
return false;
}
}
DEBUG_PRINTF("reset on trigger\n");
return true;
}
static
u32 allowedSquashDistance(const CharReach &cr, u32 min_width, const NGHolder &g,
NFAVertex pv, bool prefilter) {
CharReach accept_cr;
DEBUG_PRINTF("hello |cr|=%zu %d\n", cr.count(), (int)cr.find_first());
if (prefilter) {
/* a later prefilter stage may weaken the lead-up, so we can't be sure
* that all the triggers will be squashing the puffette. */
return 0;
}
/* TODO: inspect further back in the pattern */
for (auto u : inv_adjacent_vertices_range(pv, g)) {
accept_cr |= g[u].char_reach;
}
DEBUG_PRINTF("|accept_cr|=%zu\n", accept_cr.count());
if ((accept_cr & cr).any()) {
return 0; /* the accept byte doesn't always kill the puffette. TODO:
* maybe if we look further back we could find something that
* would kill the puffette... */
}
DEBUG_PRINTF("returning squash distance of %u\n", min_width);
return min_width;
}
/** Gives a stronger puff trigger when the trigger is connected to a wide
* cyclic state (aside from sds) */
static
void improveHead(NGHolder &g, NFAVertex *a, vector<NFAVertex> *nodes) {
DEBUG_PRINTF("attempting to improve puff trigger\n");
assert(!nodes->empty());
const CharReach &puff_cr = g[nodes->back()].char_reach;
if (puff_cr.all()) {
return; /* we can't really do much with this one */
}
/* add the runway */
DEBUG_PRINTF("backing off - allowing a decent header\n");
assert(nodes->size() > HEAD_BACKOFF);
for (u32 i = 0; i < HEAD_BACKOFF - 1; i++) {
nodes->pop_back();
}
*a = nodes->back();
nodes->pop_back();
}
static
void constructPuff(NGHolder &g, const NFAVertex a, const NFAVertex puffv,
const CharReach &cr, const ReportID report, u32 width,
bool fixed_depth, bool unbounded, bool auto_restart,
RoseBuild &rose, ReportManager &rm,
flat_set<ReportID> &chain_reports, bool prefilter) {
DEBUG_PRINTF("constructing Puff for report %u\n", report);
DEBUG_PRINTF("a = %u\n", g[a].index);
const bool pureAnchored = a == g.start && singleStart(g);
if (!pureAnchored) {
if (a == g.startDs || a == g.start) {
DEBUG_PRINTF("add outfix ar(false)\n");
raw_puff rp(width, unbounded, report, cr, auto_restart);
rose.addOutfix(rp);
return;
}
DEBUG_PRINTF("add chain tail\n");
u32 qi = ~0U;
u32 event = MQE_TOP;
raw_puff rp(width, unbounded, report, cr);
rose.addChainTail(rp, &qi, &event);
assert(qi != ~0U);
u32 squashDistance = allowedSquashDistance(cr, width, g, puffv,
prefilter);
Report ir = makeRoseTrigger(event, squashDistance);
/* only need to trigger once if floatingUnboundedDot */
bool floatingUnboundedDot = unbounded && cr.all() && !fixed_depth;
if (floatingUnboundedDot) {
ir.ekey = rm.getUnassociatedExhaustibleKey();
}
ReportID id = rm.getInternalId(ir);
chain_reports.insert(id);
} else {
DEBUG_PRINTF("add outfix ar(%d)\n", (int)auto_restart);
assert(!auto_restart || unbounded);
raw_puff rp(width, unbounded, report, cr, auto_restart);
rose.addOutfix(rp);
}
}
static
bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a,
set<NFAVertex> &dead, const CompileContext &cc,
bool prefilter) {
DEBUG_PRINTF("hello\n");
vector<NFAVertex> nodes;
const CharReach &cr = g[a].char_reach;
bool isDot = cr.all();
bool unbounded = false;
bool exhaustible = can_exhaust(g, rm);
while (a) {
if (is_special(a, g)) {
DEBUG_PRINTF("stopped puffing due to special vertex\n");
break;
}
if (g[a].char_reach != cr) {
DEBUG_PRINTF("stopped puffing due to change in character "
"reachability\n");
break;
}
if (proper_in_degree(a, g) != 1) {
DEBUG_PRINTF("stopped puffing due to in degree != 1\n");
break;
}
size_t outDegree = out_degree(a, g);
if (outDegree != 1 && (!hasSelfLoop(a, g) || outDegree != 2)) {
DEBUG_PRINTF("stopping puffing due to out degree\n");
break;
}
if (hasSelfLoop(a, g)) {
DEBUG_PRINTF("has self-loop, marking unbounded\n");
unbounded = true;
}
nodes.push_back(a);
DEBUG_PRINTF("vertex %u has in_degree %zu\n", g[a].index,
in_degree(a, g));
a = getSoleSourceVertex(g, a);
if (!a) {
break;
}
// Snark: we can't handle this case, because we can only handle a
// single report ID on a vertex
if (is_match_vertex(a, g)) {
DEBUG_PRINTF("stop puffing due to vertex that leads to accept\n");
if (!nodes.empty()) {
nodes.pop_back();
}
break;
}
}
if (!nodes.empty() && proper_in_degree(nodes.back(), g) != 1) {
for (auto u : inv_adjacent_vertices_range(nodes.back(), g)) {
if (is_special(u, g)) {
DEBUG_PRINTF("pop\n");
a = nodes.back();
nodes.pop_back();
break;
}
}
}
if (a != g.startDs && edge(g.startDs, a, g).second
&& proper_out_degree(a, g) == 1
&& g[a].char_reach == cr) {
nodes.push_back(a);
a = g.startDs;
}
bool auto_restart = false;
DEBUG_PRINTF("a = %u\n", g[a].index);
if (nodes.size() < MIN_PUFF_LENGTH || a == g.startDs) {
DEBUG_PRINTF("bad %zu %u\n", nodes.size(), g[a].index);
if (nodes.size() < MIN_PUFF_LENGTH) {
return false;
} else {
DEBUG_PRINTF("mark unbounded\n");
unbounded = true;
a = g.start;
auto_restart = !isDot;
}
}
bool supported = false;
bool fixed_depth = isFixedDepth(g, nodes.back());
if (exhaustible) {
supported = true;
} else if (fixed_depth) {
supported = true;
} else if (unbounded) {
/* any C{n,} can be supported, as all ranges will be squashed together:
* we only need to track the first */
supported = true;
} else if (triggerResetsPuff(g, nodes.back())) {
supported = true;
} else if (triggerFloodsPuff(g, nodes.back())) {
DEBUG_PRINTF("trigger floods puff\n");
supported = true;
unbounded = true;
}
if (!supported) {
DEBUG_PRINTF("not supported\n");
return false;
}
if (cc.grey.puffImproveHead && a != g.start) {
if (edge(g.startDs, a, g).second) {
goto skip_improve; /* direct sds cases are better handled by auto
* restarting puffettes */
}
if (fixed_depth) {
goto skip_improve; /* no danger of trigger floods */
}
/* if we come after something literalish don't bother */
if (g[a].char_reach.count() <= 2
&& in_degree(a, g) == 1
&& g[getSoleSourceVertex(g, a)].char_reach.count() <= 2) {
goto skip_improve;
}
if (nodes.size() < MIN_PUFF_LENGTH + HEAD_BACKOFF) {
return false; /* not enough of the puff left to be worth bothering
about */
}
improveHead(g, &a, &nodes);
skip_improve:;
}
assert(!nodes.empty());
const auto &reports = g[nodes[0]].reports;
assert(!reports.empty());
for (auto report : reports) {
const Report &ir = rm.getReport(report);
const bool highlander = ir.ekey != INVALID_EKEY;
if (!unbounded && highlander && !isSimpleExhaustible(ir)) {
DEBUG_PRINTF("report %u is bounded highlander but not simple "
"exhaustible\n",
report);
return false;
}
if (ir.type == INTERNAL_ROSE_CHAIN) {
DEBUG_PRINTF("puffettes cannot be chained together\n");
return false;
}
}
NFAVertex puffv = nodes.back();
assert(puffv != NFAGraph::null_vertex());
u32 width = countChain(g, nodes.back());
flat_set<ReportID> chain_reports;
for (auto report : reports) {
constructPuff(g, a, puffv, cr, report, width, fixed_depth, unbounded,
auto_restart, rose, rm, chain_reports, prefilter);
}
if (!chain_reports.empty()) {
wireNewAccepts(g, puffv, chain_reports);
}
dead.insert(nodes.begin(), nodes.end());
return true;
}
bool splitOffPuffs(RoseBuild &rose, ReportManager &rm, NGHolder &g,
bool prefilter, const CompileContext &cc) {
if (!cc.grey.allowPuff) {
return false;
}
size_t count = 0;
set<NFAVertex> dead;
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
if (doComponent(rose, rm, g, v, dead, cc, prefilter)) {
count++;
}
}
if (!dead.empty()) {
remove_vertices(dead, g);
pruneUseless(g);
}
DEBUG_PRINTF("puffs: %zu\n", count);
return num_vertices(g) <= N_SPECIALS;
}
bool isPuffable(const NGHolder &g, bool fixed_depth,
const ReportManager &rm, const Grey &grey) {
if (!grey.allowPuff) {
return false;
}
if (!onlyOneTop(g)) {
DEBUG_PRINTF("more than one top\n");
return false;
}
const set<ReportID> reports = all_reports(g);
if (reports.size() != 1) {
DEBUG_PRINTF("too many reports\n");
return false;
}
const Report &ir = rm.getReport(*reports.begin());
if (ir.type == INTERNAL_ROSE_CHAIN) {
DEBUG_PRINTF("puffettes cannot be chained together\n");
return false;
}
PureRepeat repeat;
if (!isPureRepeat(g, repeat)) {
DEBUG_PRINTF("not pure bounded repeat\n");
return false;
}
if (repeat.bounds.min == depth(0)) {
DEBUG_PRINTF("repeat min bound is zero\n");
return false;
}
// We can puff if:
// (a) repeat is {N,}; or
// (b) repeat is {N} and fixed-depth, or highlander (and will accept the
// first match)
DEBUG_PRINTF("repeat is %s\n", repeat.bounds.str().c_str());
if (repeat.bounds.max.is_infinite()) {
return true;
}
if (repeat.bounds.min == repeat.bounds.max) {
if (fixed_depth) {
DEBUG_PRINTF("fixed depth\n");
return true;
}
const bool highlander = ir.ekey != INVALID_EKEY;
// If we're highlander, we must be simple-exhaustible as well.
if (highlander && isSimpleExhaustible(ir)) {
return true;
}
}
return false;
}
} // namespace ue2

56
src/nfagraph/ng_puff.h Normal file
View File

@@ -0,0 +1,56 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Puff construction from NGHolder.
*/
#ifndef NG_PUFF_H
#define NG_PUFF_H
namespace ue2 {
struct CompileContext;
struct Grey;
class RoseBuild;
class NGHolder;
class ReportManager;
/** \brief Split off portions of the graph that are implementable as Puff
* engines. Returns true if the entire graph is consumed. */
bool splitOffPuffs(RoseBuild &rose, ReportManager &rm, NGHolder &g,
bool prefilter, const CompileContext &cc);
/** \brief True if the entire graph in \a g could be constructed as a Puff
* engine. */
bool isPuffable(const NGHolder &g, bool fixed_depth, const ReportManager &rm,
const Grey &grey);
} // namespace ue2
#endif
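/* Hedged usage sketch (toyHandleComponent is an invented name, not part of
 * this commit): the contract documented above is that splitOffPuffs() returns
 * true only when the whole graph has been consumed by Puff engines, so a
 * hypothetical caller can use it as an early-out before trying other
 * construction strategies. */
#include "ng_puff.h"

namespace ue2 {

static
bool toyHandleComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g,
                        const CompileContext &cc) {
    if (splitOffPuffs(rose, rm, g, false /* prefilter */, cc)) {
        return true; // entire graph implemented as puffs; nothing left to do
    }
    // Otherwise the remainder of the graph must be built by other engines.
    return false;
}

} // namespace ue2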

915
src/nfagraph/ng_redundancy.cpp Normal file
View File

@@ -0,0 +1,915 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph reductions.
*
* This code attempts to make the NFA graph smaller by performing a number of
* local transformations:
*
* ### (1) removal of redundant vertices:
*
* v is redundant wrt to u if succ(v) is a subset of succ(u)
* AND pred(v) is a subset of pred(u)
* AND cr(v) is a subset of cr(u)
*
* ### (2) 'diamond' transformation:
*
* given succ(v) == succ(u) and pred(v) == pred(u),
* v and u can be replaced by w with succ(w) = succ(v), pred(w) = pred(v),
* and cr(w) = union(cr(v), cr(u))
*
* ### (3) locally identifiable left equivalence:
*
* given pred(v) == pred(u) (**) and cr(v) == cr(u),
* v and u can be replaced by w with pred(w) = pred(v), cr(w) = cr(v),
* and succ(w) = union(succ(v), succ(u))
*
* ### (4) locally identifiable right equivalence:
*
* given succ(v) == succ(u) (**) and cr(v) == cr(u),
* v and u can be replaced by w with succ(w) = succ(v), cr(w) = cr(v),
* and pred(w) = union(pred(v), pred(u))
*
* NOTE (**): for left and right equivalence, we can also do the transform if
* set(u) contains u, set(v) contains v and the sets are otherwise equal. This
* enables equivalent vertices with self-loops to be merged.
*
* If v and u raise accepts, they can only be merged if they raise the same
* report IDs.
*
* Transformations are applied repeatedly until the graph stops changing.
*
* Note that the final graph may depend on the order in which these
* transformations are applied. In order to reduce the non-determinism the
* following order is imposed: (1); (2); (3) + (4).
*/
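/* Minimal sketch of transformation (1) above on a toy graph model. ToyVertex
 * and toyIsRedundant are hypothetical names and are independent of the real
 * implementation below; the point is only to show the three subset checks
 * that make v redundant with respect to u. */
#include <algorithm>
#include <bitset>
#include <set>

namespace {

struct ToyVertex {
    std::set<int> pred;  // predecessor vertex ids (kept sorted by std::set)
    std::set<int> succ;  // successor vertex ids
    std::bitset<256> cr; // character reachability
};

static
bool toyIsRedundant(const ToyVertex &v, const ToyVertex &u) {
    // pred(v) subset of pred(u), succ(v) subset of succ(u)
    bool predSub = std::includes(u.pred.begin(), u.pred.end(),
                                 v.pred.begin(), v.pred.end());
    bool succSub = std::includes(u.succ.begin(), u.succ.end(),
                                 v.succ.begin(), v.succ.end());
    // cr(v) subset of cr(u): nothing reachable by v may be missing from u
    bool reachSub = (v.cr & ~u.cr).none();
    return predSub && succSub && reachSub;
}

} // namespace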
#include "ng_redundancy.h"
#include "ng_holder.h"
#include "ng_calc_components.h"
#include "ng_dominators.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <vector>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/reverse_graph.hpp>
using namespace std;
namespace ue2 {
namespace {
/** Precalculated (and maintained) information about a vertex. */
class VertexInfo {
public:
flat_set<NFAVertex> pred; //!< predecessors of this vertex
flat_set<NFAVertex> succ; //!< successors of this vertex
bool isAccept = false; //!< does this vertex lead to accept?
bool isRemoved = false; //!< have we already removed this vertex?
size_t inDegree() const { return pred.size(); }
size_t outDegree() const { return succ.size(); }
};
class VertexInfoMap {
public:
explicit VertexInfoMap(const NGHolder &gg)
: g(gg), infos(num_vertices(gg)) {}
VertexInfo &operator[](NFAVertex v) {
u32 i = g[v].index;
assert(i < infos.size());
return infos[i];
}
const VertexInfo &operator[](NFAVertex v) const {
u32 i = g[v].index;
assert(i < infos.size());
return infos[i];
}
private:
const NGHolder &g;
vector<VertexInfo> infos;
};
} // namespace
/** Populates the info map with each vertex's predecessor and successor sets,
 * and whether the vertex leads to an accept state. */
static
void populateContainers(const NGHolder &g, VertexInfoMap &infoMap) {
for (auto v : vertices_range(g)) {
VertexInfo &info = infoMap[v];
assert(info.pred.empty() && info.succ.empty());
// Build successor and predecessor sets
insert(&info.pred, inv_adjacent_vertices(v, g));
insert(&info.succ, adjacent_vertices(v, g));
// Note whether the vertex is an accept state
if (!is_special(v, g)) {
if (contains(info.succ, g.accept)
|| contains(info.succ, g.acceptEod)) {
info.isAccept = true;
}
}
}
}
/** Helper function to take the intersection of two sorted vertex sets
* in-place. */
static
void inplaceIntersection(vector<NFAVertex> &vset1,
const flat_set<NFAVertex> &vset2) {
const NFAVertex GONE = NFAGraph::null_vertex();
vector<NFAVertex>::iterator it = vset1.begin(), ite = vset1.end();
flat_set<NFAVertex>::const_iterator jt = vset2.begin(), jte = vset2.end();
while ((it != ite) && (jt != jte)) {
assert(*it != GONE);
if (*it < *jt) {
// present in vset1 but not in vset2. Set to null, remove in a
// second pass.
*it = GONE;
++it;
} else if (*jt < *it) {
// present in vset2 but not in vset1, skip.
++jt;
} else {
// present in both sets.
++it; ++jt;
}
}
    // Anything remaining in vset1 past this point is not in vset2; drop it.
vset1.erase(it, ite);
// Remove nulls created above.
vset1.erase(remove(vset1.begin(), vset1.end(), GONE), vset1.end());
}
/** Find the intersection of the successors of our predecessors. */
static
void succPredIntersection(const NFAVertex v, const flat_set<NFAVertex> &predSet,
const VertexInfoMap &infoMap,
vector<NFAVertex> &intersection,
bool considerSelf = true /* follow self loops */) {
/* find a good seed for the intersection */
const flat_set<NFAVertex> *best = nullptr;
for (auto u : predSet) {
if (!considerSelf && u == v) {
continue;
}
const flat_set<NFAVertex> &succSet = infoMap[u].succ;
if (!best || succSet.size() <= best->size()) {
best = &succSet;
// Break out if we've reduced our intersection to [v]
if (best->size() == 1) {
assert(*(best->begin()) == v);
intersection.push_back(v);
return;
}
}
}
if (best) {
insert(&intersection, intersection.end(), *best);
}
for (auto u : predSet) {
if (!considerSelf && u == v) {
continue;
}
inplaceIntersection(intersection, infoMap[u].succ);
// Check: intersection should always be at least size 1
assert(!intersection.empty());
// Break out if we've reduced our intersection to [v]
if (intersection.size() == 1) {
assert(*intersection.begin() == v);
return;
}
}
}
/** Find the intersection of the predecessors of our successors. */
static
void predSuccIntersection(const NFAVertex v,
const flat_set<NFAVertex> &succSet,
const VertexInfoMap &infoMap,
vector<NFAVertex> &intersection,
bool considerSelf = true /* follow self loops */) {
/* find a good seed for the intersection */
const flat_set<NFAVertex> *best = nullptr;
for (auto w : succSet) {
if (!considerSelf && w == v) {
continue;
}
const flat_set<NFAVertex> &predSet = infoMap[w].pred;
if (!best || predSet.size() <= best->size()) {
best = &predSet;
// Break out if we've reduced our intersection to [v]
if (best->size() == 1) {
assert(*(best->begin()) == v);
intersection.push_back(v);
return;
}
}
}
if (best) {
insert(&intersection, intersection.end(), *best);
}
for (auto w : succSet) {
if (!considerSelf && w == v) {
continue;
}
inplaceIntersection(intersection, infoMap[w].pred);
// Check: intersection should always be at least size 1
assert(!intersection.empty());
// Break out if we've reduced our intersection to [v]
if (intersection.size() == 1) {
assert(*intersection.begin() == v);
return;
}
}
}
/** Update containers to take into account the removal of vertex v. */
static
void markForRemoval(const NFAVertex v, VertexInfoMap &infoMap,
set<NFAVertex> &removable) {
VertexInfo &info = infoMap[v];
assert(!info.isRemoved);
assert(!contains(removable, v));
info.isRemoved = true;
removable.insert(v);
// remove v from its predecessors' successors
for (auto u : info.pred) {
infoMap[u].succ.erase(v);
}
// remove v from its successors' predecessors
for (auto w : info.succ) {
infoMap[w].pred.erase(v);
}
}
static
bool hasInEdgeTops(const NGHolder &g, NFAVertex v) {
bool exists;
NFAEdge e;
tie(e, exists) = edge_by_target(g.start, v, g);
if (exists && g[e].top != 0) {
return true;
}
return false;
}
/** Transform (1), removal of redundant vertices. */
static
bool doUselessMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap,
set<NFAVertex> &removable) {
/* useless merges can be done in any order, no need to take any care with
* ordering */
// Temporary vectors used for intersections below
vector<NFAVertex> succPredSet, predSuccSet, intersection;
bool changed = false;
for (auto v : vertices_range(g)) {
VertexInfo &info = infoMap[v];
if (info.isRemoved) {
continue;
}
assert(!contains(removable, v));
if (is_special(v, g)) {
continue;
}
/* we do not need to check for out edge tops - as only specials (start)
* can have tops and they are already disqualified. */
if (hasInEdgeTops(g, v)) {
continue; // Conservatively skip anything with nonzero tops.
}
if (info.pred.empty() || info.succ.empty()) {
DEBUG_PRINTF("vertex %u has empty pred/succ list\n",
g[v].index);
assert(0); // non-special states should always have succ/pred lists
continue;
}
// The following cases are more complex and rely on the intersection of
// Succ(Pred(v)) and Pred(Succ(v))
// Compute intersections, operating on the smaller set first
// Note that we use vectors here, as set_intersection underneath
// guarantees sorted output, and vectors were quite a bit
// faster than sets or lists.
succPredSet.clear();
predSuccSet.clear();
if (info.pred.size() <= info.succ.size()) {
succPredIntersection(v, info.pred, infoMap, succPredSet);
if (succPredSet.size() == 1) {
// nobody in here but us chickens
assert(*succPredSet.begin() == v);
continue;
}
predSuccIntersection(v, info.succ, infoMap, predSuccSet);
if (predSuccSet.size() == 1) {
assert(*predSuccSet.begin() == v);
continue;
}
} else {
predSuccIntersection(v, info.succ, infoMap, predSuccSet);
if (predSuccSet.size() == 1) {
assert(*predSuccSet.begin() == v);
continue;
}
succPredIntersection(v, info.pred, infoMap, succPredSet);
if (succPredSet.size() == 1) {
assert(*succPredSet.begin() == v);
continue;
}
}
// Find the intersection of Succ(Pred(v)) and Pred(Succ(v))
intersection.clear();
set_intersection(succPredSet.begin(), succPredSet.end(),
predSuccSet.begin(), predSuccSet.end(),
back_inserter(intersection));
/* Boring if it is just us in the intersection */
if (intersection.size() < 2) {
continue;
}
// Compare char_reach, mark v for removal if any members of
// the intersection have an equal or greater reach
const CharReach &currReach = g[v].char_reach;
const auto &currReports = g[v].reports;
for (auto t : intersection) {
const VertexInfo &info2 = infoMap[t];
/* start is never a succ of a state, so will never be in the
* predsucc/succpred intersection */
assert(t != g.start);
if (t == v || info2.isRemoved) {
continue;
}
// For each candidate C to make V redundant, check:
// if V is an accept state, C must be an accept state for
// the same pattern
// pred(C) is a superset of pred(V)
// succ(C) is a superset of succ(V)
// reach(C) is a superset of reach(V)
//
            // Note: pred/succ tests are covered by the intersections
// calculated above.
/* note: links to accepts are also tracked in succs */
if (info.isAccept && currReports != g[t].reports) {
continue;
}
if (som) {
if (t == g.startDs) {
continue;
}
if (is_virtual_start(t, g) != is_virtual_start(v, g)) {
continue;
}
}
/* we do not need to check for out edge tops - as only start
* can have tops and it has already been ruled out. */
if (hasInEdgeTops(g, t)) {
continue; // Conservatively skip anything with nonzero tops.
}
CharReach &otherReach = g[t].char_reach;
if (currReach.isSubsetOf(otherReach)) {
DEBUG_PRINTF("removing redundant vertex %u (keeping %u)\n",
g[v].index, g[t].index);
markForRemoval(v, infoMap, removable);
changed = true;
break;
}
}
}
return changed;
}
/** Transform (2), diamond merge pass. */
static
bool doDiamondMergePass(NGHolder &g, som_type som, VertexInfoMap &infoMap,
set<NFAVertex> &removable) {
// Temporary vectors used for intersections below
vector<NFAVertex> succPredSet, predSuccSet, intersection;
bool changed = false;
for (auto v : vertices_range(g)) {
VertexInfo &info = infoMap[v];
if (info.isRemoved) {
continue;
}
assert(!contains(removable, v));
if (is_special(v, g)) {
continue;
}
/* we do not need to check for out edge tops - as only specials (start)
* can have tops and they are already disqualified. */
if (hasInEdgeTops(g, v)) {
continue; // Conservatively skip anything with nonzero tops.
}
if (info.pred.empty() || info.succ.empty()) {
assert(0); // non-special states should always have succ/pred lists
continue;
}
// The following cases are more complex and rely on the intersection of
// Succ(Pred(v)) and Pred(Succ(v))
// Compute intersections, operating on the smaller set first
// Note that we use vectors here, as set_intersection underneath
// guarantees sorted output, and vectors were quite a bit faster than
// sets or lists.
succPredSet.clear();
predSuccSet.clear();
if (info.pred.size() <= info.succ.size()) {
succPredIntersection(v, info.pred, infoMap, succPredSet);
if (succPredSet.size() == 1) {
// nobody in here but us chickens
assert(*succPredSet.begin() == v);
continue;
}
predSuccIntersection(v, info.succ, infoMap, predSuccSet);
if (predSuccSet.size() == 1) {
assert(*predSuccSet.begin() == v);
continue;
}
} else {
predSuccIntersection(v, info.succ, infoMap, predSuccSet);
if (predSuccSet.size() == 1) {
assert(*predSuccSet.begin() == v);
continue;
}
succPredIntersection(v, info.pred, infoMap, succPredSet);
if (succPredSet.size() == 1) {
assert(*succPredSet.begin() == v);
continue;
}
}
// Find the intersection of Succ(Pred(v)) and Pred(Succ(v))
intersection.clear();
set_intersection(succPredSet.begin(), succPredSet.end(),
predSuccSet.begin(), predSuccSet.end(),
back_inserter(intersection));
/* Boring if it is just us in the intersection */
if (intersection.size() < 2) {
continue;
}
/* ensure that we look for candidates in the same order */
sort(intersection.begin(), intersection.end(), make_index_ordering(g));
const CharReach &currReach = g[v].char_reach;
const auto &currReports = g[v].reports;
for (auto t : intersection) {
const VertexInfo &info2 = infoMap[t];
if (t == v || info2.isRemoved || is_special(t, g)) {
continue;
}
/* note: links to accepts are also tracked in succs */
if (info.isAccept && currReports != g[t].reports) {
continue;
}
/* we do not need to check for out edge tops - as only specials
* (start) can have tops and they are already disqualified. */
if (hasInEdgeTops(g, t)) {
continue; // Conservatively skip anything with nonzero tops.
}
if (som) {
if (is_virtual_start(v, g) != is_virtual_start(t, g)) {
continue; // can only merge like with like.
}
}
// If in-degree of v == in-degree of target
// and out-degree of v == out-degree of target
// (because pred and succ are supersets)
// then combine charreach of v into target and remove v
if (info.inDegree() == info2.inDegree()
&& info.outDegree() == info2.outDegree()) {
// add character reachability of v into target
CharReach &otherReach = g[t].char_reach;
otherReach |= currReach;
// v can be removed
DEBUG_PRINTF("removing redundant vertex %u and merging "
"reachability with vertex %u\n",
g[v].index, g[t].index);
markForRemoval(v, infoMap, removable);
changed = true;
break;
}
}
}
return changed;
}
namespace {
struct ReachMismatch {};
class ReachSubsetVisitor : public boost::default_dfs_visitor {
public:
explicit ReachSubsetVisitor(const CharReach &r) : cr(r) {}
template <class Graph, class Vertex>
void discover_vertex(const Vertex &v, const Graph &g) const {
if (is_any_start(v, g)) {
return; // start vertices are OK
} else if (is_special(v, g)) {
assert(0);
throw ReachMismatch(); // other special nodes??
}
const CharReach &vcr = g[v].char_reach;
DEBUG_PRINTF("checking if vcr (%zu) is subset of (%zu)\n", vcr.count(),
cr.count());
if (vcr != (vcr & cr)) {
throw ReachMismatch();
}
}
private:
const CharReach &cr;
};
/** Terminator function for DFS used in pathReachSubset. */
template <class Graph, class Vertex> class VertexIs {
public:
explicit VertexIs(const Vertex &v) : vertex(v) {}
bool operator()(const Vertex &v, const Graph &) const {
return v == vertex;
}
private:
Vertex vertex;
};
} // namespace
/** Returns true if every vertex on paths leading to edge \p e has reachability
* which is a subset of the reachability of \p dom */
static
bool reversePathReachSubset(const NFAEdge &e, const NFAVertex &dom,
const NGHolder &g) {
const CharReach &domReach = g[dom].char_reach;
if (domReach.all()) {
return true;
}
NFAVertex start = source(e, g);
using RevGraph = boost::reverse_graph<NFAGraph, const NFAGraph &>;
map<RevGraph::vertex_descriptor, boost::default_color_type> vertexColor;
// Walk the graph backwards from v, examining each node. We fail (return
// false) if we encounter a node with reach NOT a subset of domReach, and
// we stop searching at dom.
try {
depth_first_visit(RevGraph(g.g), start,
ReachSubsetVisitor(domReach),
make_assoc_property_map(vertexColor),
VertexIs<RevGraph, RevGraph::vertex_descriptor>(dom));
} catch(ReachMismatch&) {
return false;
}
return true;
}
/** Returns true if every vertex on paths leading from edge \p e has
* reachability which is a subset of the reachability of \p dom */
static
bool forwardPathReachSubset(const NFAEdge &e, const NFAVertex &dom,
const NGHolder &g) {
const CharReach &domReach = g[dom].char_reach;
if (domReach.all()) {
return true;
}
NFAVertex start = target(e, g);
map<NFAGraph::vertex_descriptor, boost::default_color_type> vertexColor;
// Walk the graph forward from v, examining each node. We fail (return
// false) if we encounter a node with reach NOT a subset of domReach, and
// we stop searching at dom.
try {
depth_first_visit(g.g, start,
ReachSubsetVisitor(domReach),
make_assoc_property_map(vertexColor),
VertexIs<NFAGraph, NFAVertex>(dom));
} catch(ReachMismatch&) {
return false;
}
return true;
}
static
bool allOutsSpecial(NFAVertex v, const NGHolder &g) {
for (auto w : adjacent_vertices_range(v, g)) {
if (!is_special(w, g)) {
return false;
}
}
return true;
}
static
bool allInsSpecial(NFAVertex v, const NGHolder &g) {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (!is_special(u, g)) {
return false;
}
}
return true;
}
/** Cheaply check whether this graph can't be reduced at all, because it is
* just a chain of vertices with no other edges. */
static
bool isIrreducible(const NGHolder &g) {
for (auto v : vertices_range(g)) {
// skip specials
if (is_special(v, g)) {
continue;
}
if (in_degree(v, g) != 1 && !allInsSpecial(v, g)) {
return false;
}
if (out_degree(v, g) != 1 && !allOutsSpecial(v, g)) {
return false;
}
}
/* if calcComponents got sleepy and went home, the above checks don't hold
* as it assumes there is only one connected component. */
if (isAlternationOfClasses(g)) {
return false;
}
return true;
}
static
u32 findCyclic(const NGHolder &g, vector<bool> &cyclic) {
u32 count = 0;
cyclic.resize(num_vertices(g));
for (auto v : vertices_range(g)) {
assert(g[v].index < cyclic.size());
bool c = edge(v, v, g).second;
if (c) {
count++;
}
cyclic[g[v].index] = c;
}
return count;
}
static
void findCyclicDom(NGHolder &g, vector<bool> &cyclic,
set<NFAEdge> &dead, som_type som) {
ue2::unordered_map<NFAVertex, NFAVertex> dominators = findDominators(g);
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
// Path in through a dominator (e.g. '.+a?foobar')
NFAVertex dom = dominators[v];
if (dom && cyclic[g[dom].index]
&& edge(dom, v, g).second) {
if (som && dom == g.startDs) {
continue;
}
DEBUG_PRINTF("vertex %u is dominated by directly-connected cyclic "
"vertex %u\n", g[v].index,
g[dom].index);
// iff all paths through in-edge e of v involve vertices whose
// reachability is a subset of reach(dom), we can delete edge e.
for (const auto &e : in_edges_range(v, g)) {
if (source(e, g) == dom) {
continue;
}
if (reversePathReachSubset(e, dom, g)) {
DEBUG_PRINTF("edge (%u, %u) can be removed: leading paths "
"share dom reach\n",
g[source(e, g)].index, g[target(e, g)].index);
dead.insert(e);
if (source(e, g) == v) {
cyclic[g[v].index] = false;
}
continue;
}
}
}
}
}
static
void findCyclicPostDom(NGHolder &g, vector<bool> &cyclic,
set<NFAEdge> &dead) {
ue2::unordered_map<NFAVertex, NFAVertex> postdominators =
findPostDominators(g);
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
        // Path out through a post-dominator (e.g. 'a?.+foobar')
NFAVertex postdom = postdominators[v];
if (postdom && cyclic[g[postdom].index]
&& edge(v, postdom, g).second) {
DEBUG_PRINTF("vertex %u is postdominated by directly-connected "
"cyclic vertex %u\n", g[v].index,
g[postdom].index);
            // iff all paths through out-edge e of v involve vertices whose
            // reachability is a subset of reach(postdom), we can delete edge e.
for (const auto &e : out_edges_range(v, g)) {
if (target(e, g) == postdom) {
continue;
}
if (forwardPathReachSubset(e, postdom, g)) {
DEBUG_PRINTF("edge (%u, %u) can be removed: trailing paths "
"share postdom reach\n",
g[source(e, g)].index, g[target(e, g)].index);
if (target(e, g) == v) {
cyclic[g[v].index] = false;
}
dead.insert(e);
continue;
}
}
}
}
}
bool removeRedundancy(NGHolder &g, som_type som) {
DEBUG_PRINTF("rr som = %d\n", (int)som);
g.renumberVertices();
// Cheap check: if all the non-special vertices have in-degree one and
// out-degree one, there's no redundancy in this here graph and we can
// vamoose.
if (isIrreducible(g)) {
return false;
}
VertexInfoMap infoMap(g);
// Populate maps of successors and predecessors, and accept status
populateContainers(g, infoMap);
/* Run multiple passes: terminate when a full pass doesn't remove
* any vertices */
bool doUseless = true;
bool doDiamond = true;
set<NFAVertex> removable;
while (doUseless || doDiamond) {
if (doUseless
&& doUselessMergePass(g, som, infoMap, removable)) {
doDiamond = true;
}
doUseless = false;
if (doDiamond
&& doDiamondMergePass(g, som, infoMap, removable)) {
doUseless = true;
}
doDiamond = false;
}
DEBUG_PRINTF("found %zu removable vertices overall.\n", removable.size());
remove_vertices(removable, g);
return !removable.empty();
}
/** UE-524: remove edges into nodes that are dominated by cyclic nodes with
* reachability that is a superset of all paths feeding into that edge. */
bool removeCyclicDominated(NGHolder &g, som_type som) {
set<NFAEdge> dead;
vector<bool> cyclic;
bool changed = false;
findCyclic(g, cyclic);
findCyclicDom(g, cyclic, dead, som);
if (!dead.empty()) {
remove_edges(dead, g);
pruneUseless(g);
dead.clear();
cyclic.clear(); // need to recalculate cyclic as ids have changed
findCyclic(g, cyclic);
changed = true;
}
findCyclicPostDom(g, cyclic, dead);
if (!dead.empty()) {
remove_edges(dead, g);
pruneUseless(g);
dead.clear();
changed = true;
}
return changed;
}
} // namespace ue2

54
src/nfagraph/ng_redundancy.h Normal file
View File

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph reductions.
*/
#ifndef NG_REDUNDANCY_H
#define NG_REDUNDANCY_H
#include "som/som.h"
namespace ue2 {
class NGHolder;
struct CompileContext;
/** Attempt to make the NFA graph \p g smaller by performing a number of local
* transformations. */
bool removeRedundancy(NGHolder &g, som_type som);
/** UE-524: remove edges into nodes that are dominated by cyclic nodes with
* reachability that is a superset of all paths feeding into that edge. Returns
* true if any edges/vertices were removed. */
bool removeCyclicDominated(NGHolder &g, som_type som);
} // namespace ue2
#endif
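/* Hypothetical usage sketch (toyReducePasses is an invented name): both
 * passes report via their return value whether they changed the graph, so a
 * caller could, for illustration, simply alternate them until neither makes
 * further progress. This only sketches the contract above, not the pass
 * ordering actually used by the compiler. */
#include "ng_redundancy.h"

namespace ue2 {

static
void toyReducePasses(NGHolder &g, som_type som) {
    bool changed;
    do {
        changed = false;
        changed |= removeRedundancy(g, som);      // transformations (1)-(4)
        changed |= removeCyclicDominated(g, som); // UE-524 edge removal
    } while (changed);
}

} // namespace ue2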

476
src/nfagraph/ng_region.cpp Normal file
View File

@@ -0,0 +1,476 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Region analysis.
*
* Definition: a \a region is a subset of vertices in a graph such that:
* - the edges entering the region are a cutset of the graph
* - for every in-edge (u, v) to the region there exist edges (u, w) for all
* w in {w : w in region and w has an in-edge}
* - the regions in a graph partition the graph
*
* Note:
* - we partition a graph into the maximal number of regions
* - similar properties for exit edges should hold as a consequence
* - graph == sequence of regions
* - a region is considered to have an epsilon vertex to allow jumps
* - vertices which only lead to back edges need to be floated up in the topo
* order
*
* Algorithm overview:
* -# topo-order over the DAG skeleton;
* -# incrementally add vertices to the current region until the boundary edges
* form a valid cut-set;
* -# for each back-edge, if the source and target are in different regions,
* merge the regions (and all intervening regions) into a common region.
*/
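/* Minimal sketch of the cut-set condition from the definition above, on a toy
 * adjacency-set graph. ToyGraph and toyValidRegion are hypothetical names and
 * are separate from the real implementation below: every vertex outside the
 * region with an edge into it must have edges to all of the region's entry
 * vertices (reading "w has an in-edge" as "w is entered from outside the
 * region"). */
#include <map>
#include <set>

namespace {

using ToyGraph = std::map<int, std::set<int>>; // vertex id -> successor ids

static
bool toyValidRegion(const ToyGraph &g, const std::set<int> &region) {
    std::set<int> entries;       // region vertices entered from outside
    std::set<int> externalPreds; // outside vertices with an edge into region
    for (const auto &m : g) {
        if (region.count(m.first)) {
            continue; // only edges that enter the region from outside matter
        }
        for (int w : m.second) {
            if (region.count(w)) {
                entries.insert(w);
                externalPreds.insert(m.first);
            }
        }
    }
    // Each external predecessor must have an edge to every entry vertex.
    for (int u : externalPreds) {
        const std::set<int> &succ = g.at(u);
        for (int w : entries) {
            if (!succ.count(w)) {
                return false;
            }
        }
    }
    return true;
}

} // namespace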
#include "ng_region.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/ue2_containers.h"
#include "util/graph_range.h"
#include <set>
#include <utility>
#include <vector>
#include <boost/graph/filtered_graph.hpp>
#include <boost/graph/topological_sort.hpp>
using namespace std;
namespace ue2 {
typedef ue2::unordered_set<NFAEdge> BackEdgeSet;
typedef boost::filtered_graph<NFAGraph, AcyclicFilter<BackEdgeSet>>
AcyclicGraph;
namespace {
struct exit_info {
explicit exit_info(NFAVertex v) : exit(v) {}
NFAVertex exit;
ue2::unordered_set<NFAVertex> open;
};
}
static
void checkAndAddExitCandidate(const AcyclicGraph &g,
const ue2::unordered_set<NFAVertex> &r,
NFAVertex v, vector<exit_info> *exits) {
// set when we find our first candidate.
ue2::unordered_set<NFAVertex> *open = nullptr;
/* find the set of vertices reachable from v which are not in r */
for (auto w : adjacent_vertices_range(v, g)) {
if (!contains(r, w)) {
if (!open) {
exits->push_back(exit_info(v));
open = &exits->back().open;
}
open->insert(w);
}
}
if (open) {
DEBUG_PRINTF("exit %u\n", g[v].index);
}
}
static
void findExits(const AcyclicGraph &g, const ue2::unordered_set<NFAVertex> &r,
vector<exit_info> *exits) {
exits->clear();
for (auto v : r) {
checkAndAddExitCandidate(g, r, v, exits);
}
}
static
void refineExits(const AcyclicGraph &g, const ue2::unordered_set<NFAVertex> &r,
NFAVertex new_v, vector<exit_info> *exits) {
for (u32 i = 0; i < exits->size(); i++) {
        (*exits)[i].open.erase(new_v); /* new_v is no longer an open vertex */
if ((*exits)[i].open.empty()) { /* no open edges: no longer an exit */
/* shuffle to back and kill */
(*exits)[i] = exits->back();
exits->pop_back();
i--;
}
}
checkAndAddExitCandidate(g, r, new_v, exits);
}
/** The set of exits from a candidate region is valid if: there are no exits;
 * or there is exactly one exit and no open jumps; or the single exit is
 * itself the sole open jump; or every exit opens on to the same set of
 * vertices, which must also match the open jumps if any are present.
 */
static
bool exitValid(UNUSED const AcyclicGraph &g, const vector<exit_info> &exits,
const ue2::unordered_set<NFAVertex> &open_jumps) {
if (exits.empty() || (exits.size() < 2 && open_jumps.empty())) {
return true;
}
if (exits.size() == 1 && open_jumps.size() == 1) {
DEBUG_PRINTF("oj %u, e %u\n", g[*open_jumps.begin()].index,
g[exits[0].exit].index);
if (*open_jumps.begin() == exits[0].exit) {
return true;
}
}
assert(!exits.empty());
const auto &enters = exits.front().open;
if (!open_jumps.empty() && enters != open_jumps) {
return false;
}
for (auto it = begin(exits) + 1; it != end(exits); ++it) {
if (it->open != enters) {
return false;
}
}
return true;
}
static
void setRegion(const ue2::unordered_set<NFAVertex> &r, u32 rid,
ue2::unordered_map<NFAVertex, u32> &regions) {
for (auto v : r) {
regions[v] = rid;
}
}
static
void buildInitialCandidate(const AcyclicGraph &g,
vector<NFAVertex>::const_reverse_iterator &it,
const vector<NFAVertex>::const_reverse_iterator &ite,
ue2::unordered_set<NFAVertex> *candidate,
                           /* on entry: exits of the previous region;
                            * on return: exits from the candidate */
vector<exit_info> *exits,
ue2::unordered_set<NFAVertex> *open_jumps) {
if (it == ite) {
candidate->clear();
exits->clear();
return;
}
if (exits->empty()) {
DEBUG_PRINTF("odd\n");
candidate->clear();
DEBUG_PRINTF("adding %u to initial\n", g[*it].index);
candidate->insert(*it);
open_jumps->erase(*it);
checkAndAddExitCandidate(g, *candidate, *it, exits);
++it;
return;
}
ue2::unordered_set<NFAVertex> enters = (*exits)[0].open;
candidate->clear();
for (; it != ite; ++it) {
DEBUG_PRINTF("adding %u to initial\n", g[*it].index);
candidate->insert(*it);
if (contains(enters, *it)) {
break;
}
}
if (it != ite) {
enters.erase(*it);
open_jumps->swap(enters);
DEBUG_PRINTF("oj size = %zu\n", open_jumps->size());
++it;
} else {
open_jumps->clear();
}
findExits(g, *candidate, exits);
}
static
void findDagLeaders(const NGHolder &h, const AcyclicGraph &g,
const vector<NFAVertex> &topo,
ue2::unordered_map<NFAVertex, u32> &regions) {
assert(!topo.empty());
u32 curr_id = 0;
vector<NFAVertex>::const_reverse_iterator t_it = topo.rbegin();
vector<exit_info> exits;
ue2::unordered_set<NFAVertex> candidate;
ue2::unordered_set<NFAVertex> open_jumps;
DEBUG_PRINTF("adding %u to current\n", g[*t_it].index);
assert(t_it != topo.rend());
candidate.insert(*t_it++);
DEBUG_PRINTF("adding %u to current\n", g[*t_it].index);
assert(t_it != topo.rend());
candidate.insert(*t_it++);
findExits(g, candidate, &exits);
while (t_it != topo.rend()) {
assert(!candidate.empty());
if (exitValid(g, exits, open_jumps)) {
if (contains(candidate, h.accept) && !open_jumps.empty()) {
/* we have tried to make an optional region containing accept as
* we have an open jump to eod. This candidate region needs to
* be put in with the previous region. */
curr_id--;
DEBUG_PRINTF("merging in with region %u\n", curr_id);
} else {
DEBUG_PRINTF("setting region %u\n", curr_id);
}
setRegion(candidate, curr_id++, regions);
buildInitialCandidate(g, t_it, topo.rend(), &candidate, &exits,
&open_jumps);
} else {
NFAVertex curr = *t_it;
DEBUG_PRINTF("adding %u to current\n", g[curr].index);
candidate.insert(curr);
open_jumps.erase(curr);
refineExits(g, candidate, *t_it, &exits);
DEBUG_PRINTF(" open jumps %zu exits %zu\n", open_jumps.size(),
exits.size());
++t_it;
}
}
/* assert exits valid */
setRegion(candidate, curr_id, regions);
}
static
void mergeUnderBackEdges(const NGHolder &g, const vector<NFAVertex> &topo,
const BackEdgeSet &backEdges,
ue2::unordered_map<NFAVertex, u32> &regions) {
for (const auto &e : backEdges) {
NFAVertex u = source(e, g);
NFAVertex v = target(e, g);
u32 ru = regions[u];
u32 rv = regions[v];
if (ru == rv) {
continue;
}
DEBUG_PRINTF("merging v = %u(%u), u = %u(%u)\n", g[v].index, rv,
g[u].index, ru);
assert(rv < ru);
for (auto t : topo) {
u32 r = regions[t];
if (r <= ru && r > rv) {
regions[t] = rv;
} else if (r > ru) {
regions[t] = rv + r - ru;
}
}
}
}
static
void reorderSpecials(const NGHolder &w, const AcyclicGraph &acyclic_g,
vector<NFAVertex> &topoOrder) {
// Start is last element of reverse topo ordering.
auto it = find(topoOrder.begin(), topoOrder.end(), w.start);
if (it != topoOrder.end() - 1) {
DEBUG_PRINTF("repositioning start\n");
assert(it != topoOrder.end());
topoOrder.erase(it);
topoOrder.insert(topoOrder.end(), w.start);
}
// StartDs is second-to-last element of reverse topo ordering.
it = find(topoOrder.begin(), topoOrder.end(), w.startDs);
if (it != topoOrder.end() - 2) {
DEBUG_PRINTF("repositioning start ds\n");
assert(it != topoOrder.end());
topoOrder.erase(it);
topoOrder.insert(topoOrder.end() - 1, w.startDs);
}
// AcceptEOD is first element of reverse topo ordering.
it = find(topoOrder.begin(), topoOrder.end(), w.acceptEod);
if (it != topoOrder.begin()) {
DEBUG_PRINTF("repositioning accept\n");
assert(it != topoOrder.end());
topoOrder.erase(it);
topoOrder.insert(topoOrder.begin(), w.acceptEod);
}
// Accept is second element of reverse topo ordering, if it's connected.
it = find(topoOrder.begin(), topoOrder.end(), w.accept);
if (it != topoOrder.begin() + 1) {
DEBUG_PRINTF("repositioning accept\n");
assert(it != topoOrder.end());
topoOrder.erase(it);
if (in_degree(w.accept, acyclic_g) != 0) {
topoOrder.insert(topoOrder.begin() + 1, w.accept);
}
}
}
static
void liftSinks(const AcyclicGraph &acyclic_g, vector<NFAVertex> &topoOrder) {
ue2::unordered_set<NFAVertex> sinks;
for (auto v : vertices_range(acyclic_g)) {
if (is_special(v, acyclic_g)) {
continue;
}
if (isLeafNode(v, acyclic_g)) {
DEBUG_PRINTF("sink found %u\n", acyclic_g[v].index);
sinks.insert(v);
}
}
if (sinks.empty()) {
DEBUG_PRINTF("no sinks found\n");
return;
}
bool changed;
do {
DEBUG_PRINTF("look\n");
changed = false;
for (auto v : vertices_range(acyclic_g)) {
if (is_special(v, acyclic_g) || contains(sinks, v)) {
continue;
}
for (auto w : adjacent_vertices_range(v, acyclic_g)) {
if (!contains(sinks, w)) {
goto next;
}
}
DEBUG_PRINTF("sink found %u\n", acyclic_g[v].index);
sinks.insert(v);
changed = true;
next:;
}
} while (changed);
for (auto ri = topoOrder.rbegin() + 1; ri != topoOrder.rend(); ++ri) {
if (!contains(sinks, *ri)) {
continue;
}
NFAVertex s = *ri;
DEBUG_PRINTF("handling sink %u\n", acyclic_g[s].index);
ue2::unordered_set<NFAVertex> parents;
for (const auto &e : in_edges_range(s, acyclic_g)) {
parents.insert(source(e, acyclic_g));
}
        /* this vertex has no children other than those reached via back
         * edges, so bubble it up the topo order to sit near its parents */
vector<NFAVertex>::reverse_iterator rj = ri;
--rj;
while (rj != topoOrder.rbegin() && !contains(parents, *rj)) {
/* sink is in rj + 1 */
assert(*(rj + 1) == s);
DEBUG_PRINTF("lifting\n");
using std::swap;
swap(*rj, *(rj + 1));
--rj;
}
}
}
/** Build a reverse topo ordering (with only the specials that are in use). We
* also want to ensure vertices which only lead to back edges are placed near
* their parents. */
static
vector<NFAVertex> buildTopoOrder(const NGHolder &w,
const AcyclicGraph &acyclic_g,
vector<boost::default_color_type> &colours) {
vector<NFAVertex> topoOrder;
topological_sort(
acyclic_g, back_inserter(topoOrder),
color_map(make_iterator_property_map(
colours.begin(), get(&NFAGraphVertexProps::index, acyclic_g))));
reorderSpecials(w, acyclic_g, topoOrder);
if (topoOrder.empty()) {
return topoOrder;
}
liftSinks(acyclic_g, topoOrder);
DEBUG_PRINTF("TOPO ORDER\n");
for (auto ri = topoOrder.rbegin(); ri != topoOrder.rend(); ++ri) {
DEBUG_PRINTF("[%u]\n", acyclic_g[*ri].index);
}
DEBUG_PRINTF("----------\n");
return topoOrder;
}
ue2::unordered_map<NFAVertex, u32> assignRegions(const NGHolder &g) {
assert(hasCorrectlyNumberedVertices(g));
const u32 numVertices = num_vertices(g);
DEBUG_PRINTF("assigning regions for %u vertices in holder\n", numVertices);
vector<boost::default_color_type> colours(numVertices);
// Build an acyclic graph for this NGHolder.
BackEdgeSet deadEdges;
depth_first_search(
g.g, visitor(BackEdges<BackEdgeSet>(deadEdges))
.root_vertex(g.start)
.color_map(make_iterator_property_map(
colours.begin(), get(&NFAGraphVertexProps::index, g.g))));
AcyclicFilter<BackEdgeSet> af(&deadEdges);
AcyclicGraph acyclic_g(g.g, af);
// Build a (reverse) topological ordering.
vector<NFAVertex> topoOrder = buildTopoOrder(g, acyclic_g, colours);
// Everybody starts in region 0.
ue2::unordered_map<NFAVertex, u32> regions;
regions.reserve(numVertices);
for (auto v : vertices_range(g)) {
regions.emplace(v, 0);
}
findDagLeaders(g, acyclic_g, topoOrder, regions);
mergeUnderBackEdges(g, topoOrder, deadEdges, regions);
return regions;
}
} // namespace ue2

219
src/nfagraph/ng_region.h Normal file
View File

@@ -0,0 +1,219 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Region analysis and utility functions.
*/
#ifndef NG_REGION_H
#define NG_REGION_H
#include "ng_holder.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include <vector>
namespace ue2 {
/** \brief Assign a region ID to every vertex in the graph. */
ue2::unordered_map<NFAVertex, u32> assignRegions(const NGHolder &g);
/** \brief True if vertices \p a and \p b are in the same region. */
template <class Graph>
bool inSameRegion(const Graph &g, NFAVertex a, NFAVertex b,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
assert(contains(region_map, a) && contains(region_map, b));
return region_map.at(a) == region_map.at(b) &&
is_special(a, g) == is_special(b, g);
}
/** \brief True if vertex \p b is in a later region than vertex \p a. */
template <class Graph>
bool inLaterRegion(const Graph &g, NFAVertex a, NFAVertex b,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
assert(contains(region_map, a) && contains(region_map, b));
u32 aa = g[a].index;
u32 bb = g[b].index;
if (bb == NODE_START || bb == NODE_START_DOTSTAR) {
return false;
}
if (aa == NODE_START || aa == NODE_START_DOTSTAR) {
return true;
}
if (bb == NODE_ACCEPT || bb == NODE_ACCEPT_EOD) {
return true;
}
if (aa == NODE_ACCEPT || aa == NODE_ACCEPT_EOD) {
return false;
}
return region_map.at(a) < region_map.at(b);
}
/** \brief True if vertex \p b is in an earlier region than vertex \p a. */
template <class Graph>
bool inEarlierRegion(const Graph &g, NFAVertex a, NFAVertex b,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
assert(contains(region_map, a) && contains(region_map, b));
u32 aa = g[a].index;
u32 bb = g[b].index;
if (bb == NODE_START || bb == NODE_START_DOTSTAR) {
return true;
}
if (aa == NODE_START || aa == NODE_START_DOTSTAR) {
return false;
}
if (bb == NODE_ACCEPT || bb == NODE_ACCEPT_EOD) {
return false;
}
if (aa == NODE_ACCEPT || aa == NODE_ACCEPT_EOD) {
return true;
}
return region_map.at(b) < region_map.at(a);
}
/** \brief True if vertex \p v is an entry vertex for its region. */
template <class Graph>
bool isRegionEntry(const Graph &g, NFAVertex v,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
// Note that some graph types do not have inv_adjacent_vertices, so we must
// use in_edges here.
for (const auto &e : in_edges_range(v, g)) {
if (!inSameRegion(g, v, source(e, g), region_map)) {
return true;
}
}
return false;
}
/** \brief True if vertex \p v is an exit vertex for its region. */
template <class Graph>
bool isRegionExit(const Graph &g, NFAVertex v,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
for (auto w : adjacent_vertices_range(v, g)) {
if (!inSameRegion(g, v, w, region_map)) {
return true;
}
}
return false;
}
/** \brief True if vertex \p v is in a region all on its own. */
template <class Graph>
bool isSingletonRegion(const Graph &g, NFAVertex v,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
for (const auto &e : in_edges_range(v, g)) {
auto u = source(e, g);
if (u != v && inSameRegion(g, v, u, region_map)) {
return false;
}
for (auto w : ue2::adjacent_vertices_range(u, g)) {
if (w != v && inSameRegion(g, v, w, region_map)) {
return false;
}
}
}
for (auto w : adjacent_vertices_range(v, g)) {
if (w != v && inSameRegion(g, v, w, region_map)) {
return false;
}
for (const auto &e : in_edges_range(w, g)) {
auto u = source(e, g);
if (u != v && inSameRegion(g, v, u, region_map)) {
return false;
}
}
return true;
}
return true;
}
/**
* \brief True if the region containing vertex \p v is optional. The vertex \p v
* should be a region leader.
*/
template <class Graph>
bool isOptionalRegion(const Graph &g, NFAVertex v,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
assert(isRegionEntry(g, v, region_map));
DEBUG_PRINTF("check if r%u is optional (inspecting v%u)\n",
region_map.at(v), g[v].index);
// Region zero is never optional.
assert(contains(region_map, v));
if (region_map.at(v) == 0) {
return false;
}
// Optional if v has a predecessor in an earlier region that has a
// successor in a later one.
for (const auto &e : in_edges_range(v, g)) {
auto u = source(e, g);
if (inSameRegion(g, v, u, region_map)) {
continue;
}
DEBUG_PRINTF(" searching from u=%u\n", g[u].index);
assert(inEarlierRegion(g, v, u, region_map));
for (auto w : adjacent_vertices_range(u, g)) {
DEBUG_PRINTF(" searching to w=%u\n", g[w].index);
if (inLaterRegion(g, v, w, region_map)) {
return true;
}
}
return false;
}
return false;
}
} // namespace ue2
#endif
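/* Hypothetical usage sketch (toyRegionEntryCounts is an invented name): once
 * assignRegions() has labelled every vertex, the predicates above can be used
 * to summarise the region structure, for example by counting the entry
 * vertices of each region. */
#include "ng_region.h"

#include <map>

namespace ue2 {

static
std::map<u32, u32> toyRegionEntryCounts(const NGHolder &g) {
    const auto region_map = assignRegions(g);
    std::map<u32, u32> counts;
    for (auto v : vertices_range(g)) {
        if (isRegionEntry(g, v, region_map)) {
            counts[region_map.at(v)]++; // another entry vertex in this region
        }
    }
    return counts;
}

} // namespace ue2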

270
src/nfagraph/ng_region_redundancy.cpp Normal file
View File

@@ -0,0 +1,270 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Region Redundancy optimisation pass.
*
* Identifies and removes entire regions that are adjacent to a cyclic state
* with a superset of their character reachability.
*/
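/* Minimal illustration of the core test applied by this pass (the names here
 * are hypothetical and independent of the code below): a region can only be
 * folded into an adjacent cyclic state when the union of the region's
 * character reachability is contained in the cyclic state's reachability. On
 * a toy 256-bit reach mask, that containment is a single bit test. */
#include <bitset>

namespace {

static
bool toyRegionCoveredByCyclic(const std::bitset<256> &region_reach,
                              const std::bitset<256> &cyclic_reach) {
    // subset check: no character reachable inside the region may be missing
    // from the cyclic state's reach
    return (region_reach & ~cyclic_reach).none();
}

} // namespace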
#include "ng_region_redundancy.h"
#include "ng_holder.h"
#include "ng_region.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/graph_range.h"
#include <set>
using namespace std;
namespace ue2 {
namespace {
/** Precalculated information about a region. */
struct RegionInfo {
NFAVertex entry; //!< arbitrary entry vertex
CharReach cr; //!< union of the reach of all vertices in region
};
} // namespace
static
bool regionHasUnexpectedAccept(const NGHolder &g, const u32 region,
const flat_set<ReportID> &expected_reports,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
/* TODO: only check vertices connected to accept/acceptEOD */
for (auto v : vertices_range(g)) {
if (region != region_map.at(v)) {
continue;
}
if (is_any_accept(v, g)) {
return true; /* encountering an actual special in the region is
* possible but definitely unexpected */
}
for (auto w : adjacent_vertices_range(v, g)) {
if (is_any_accept(w, g) && g[v].reports != expected_reports) {
return true;
}
}
}
return false;
}
static
void processCyclicStateForward(NGHolder &h, NFAVertex cyc,
const map<u32, RegionInfo> &info,
const ue2::unordered_map<NFAVertex, u32> &region_map,
set<u32> &deadRegions) {
u32 region = region_map.at(cyc);
CharReach cr = h[cyc].char_reach;
auto reports = h[cyc].reports;
DEBUG_PRINTF("going forward from %u/%u\n", h[cyc].index,
region);
map<u32, RegionInfo>::const_iterator it;
while ((it = info.find(++region)) != info.end()) {
NFAVertex v = it->second.entry;
const CharReach &region_cr = it->second.cr;
assert(isRegionEntry(h, v, region_map) && !is_special(v, h));
DEBUG_PRINTF("checking %u\n", h[v].index);
if (!region_cr.isSubsetOf(cr)) {
DEBUG_PRINTF("doesn't cover the reach of region %u\n", region);
break;
}
if (isOptionalRegion(h, v, region_map)
&& !regionHasUnexpectedAccept(h, region, reports, region_map)) {
DEBUG_PRINTF("cyclic state %u leads to optional region leader %u\n",
h[cyc].index, h[v].index);
deadRegions.insert(region);
} else if (isSingletonRegion(h, v, region_map)) {
/* we can use this region as straw and suck in optional regions on
* the other side. This allows us to transform /a{n,m}/ to /a{n}/ */
cr = h[v].char_reach;
reports = h[v].reports;
DEBUG_PRINTF("%u is straw\n", region);
assert(cr.isSubsetOf(h[cyc].char_reach));
if (hasSelfLoop(v, h)) {
DEBUG_PRINTF("%u is straw has a self-loop - kill\n", region);
remove_edge(v, v, h);
}
} else {
break;
}
}
}
static
void processCyclicStateReverse(NGHolder &h, NFAVertex cyc,
const map<u32, RegionInfo> &info,
const ue2::unordered_map<NFAVertex, u32> &region_map,
set<u32> &deadRegions) {
u32 region = region_map.at(cyc);
CharReach cr = h[cyc].char_reach;
auto reports = h[cyc].reports;
DEBUG_PRINTF("going back from %u/%u\n", h[cyc].index, region);
map<u32, RegionInfo>::const_iterator it;
while ((it = info.find(--region)) != info.end()) {
NFAVertex v = it->second.entry;
const CharReach &region_cr = it->second.cr;
assert(isRegionEntry(h, v, region_map) && !is_special(v, h));
DEBUG_PRINTF("checking %u\n", h[v].index);
if (!region_cr.isSubsetOf(cr)) {
DEBUG_PRINTF("doesn't cover the reach of region %u\n", region);
break;
}
if (isOptionalRegion(h, v, region_map)
&& !regionHasUnexpectedAccept(h, region, reports, region_map)) {
DEBUG_PRINTF("cyclic state %u trails optional region leader %u\n",
h[cyc].index, h[v].index);
deadRegions.insert(region);
} else if (isSingletonRegion(h, v, region_map)) {
/* we can use this region as a reverse straw and suck in optional
* regions on the other side. This allows us to transform
* /^a?a{n}.*b/ to /^a{n}.*b/ */
cr = h[v].char_reach;
reports = h[v].reports;
DEBUG_PRINTF("%u is straw\n", region);
assert(cr.isSubsetOf(h[cyc].char_reach));
if (hasSelfLoop(v, h)) {
DEBUG_PRINTF("%u is straw has a self-loop - kill\n", region);
remove_edge(v, v, h);
}
} else {
break;
}
if (!region) { // No wrapping
break;
}
}
}
static
map<u32, RegionInfo> buildRegionInfoMap(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &region_map) {
map<u32, RegionInfo> info;
for (auto v : vertices_range(g)) {
u32 region = region_map.at(v);
if (is_special(v, g) || region == 0) {
continue;
}
RegionInfo &ri = info[region];
ri.cr |= g[v].char_reach;
if (isRegionEntry(g, v, region_map)) {
ri.entry = v;
}
}
return info;
}
static
bool hasNoStartAnchoring(const NGHolder &h) {
for (auto v : adjacent_vertices_range(h.start, h)) {
if (!edge(h.startDs, v, h).second) {
return false;
}
}
return true;
}
void removeRegionRedundancy(NGHolder &g, som_type som) {
auto region_map = assignRegions(g);
map<u32, RegionInfo> info = buildRegionInfoMap(g, region_map);
set<u32> deadRegions;
/* if we are not tracking som, we can treat sds as a cyclic region if there
* is no anchoring */
if (!som && hasNoStartAnchoring(g)) {
processCyclicStateForward(g, g.startDs, info, region_map, deadRegions);
}
// Walk the region mapping, looking for regions that consist of a single
// cyclic node.
for (const auto &m : info) {
// Must not have already been removed
if (contains(deadRegions, m.first)) {
continue;
}
NFAVertex v = m.second.entry;
/* require a singleton cyclic region */
if (!hasSelfLoop(v, g) || !isSingletonRegion(g, v, region_map)) {
continue;
}
if (som && is_virtual_start(v, g)) {
continue;
}
processCyclicStateForward(g, v, info, region_map, deadRegions);
processCyclicStateReverse(g, v, info, region_map, deadRegions);
}
if (deadRegions.empty()) {
return;
}
vector<NFAVertex> dead;
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
u32 region = region_map.at(v);
if (contains(deadRegions, region)) {
dead.push_back(v);
}
}
if (!dead.empty()) {
DEBUG_PRINTF("removing %zu vertices from %zu dead regions\n",
dead.size(), deadRegions.size());
remove_vertices(dead, g);
}
}
} // namespace ue2

49
src/nfagraph/ng_region_redundancy.h Normal file
View File

@@ -0,0 +1,49 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Region Redundancy optimisation pass.
*
* Identifies and removes entire regions that are adjacent to a cyclic state
* with a superset of their character reachability.
*/
#ifndef NG_REGION_REDUNDANCY_H
#define NG_REGION_REDUNDANCY_H
#include "som/som.h"
namespace ue2 {
class NGHolder;
void removeRegionRedundancy(NGHolder &g, som_type som);
} // namespace ue2
#endif
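
A minimal usage sketch for this pass, assuming a populated NGHolder g and that SOM_NONE is the som_type value for compiles without start-of-match tracking:

#include "ng_holder.h"
#include "ng_region_redundancy.h"
#include "som/som.h"

namespace ue2 {

static
void pruneRedundantRegionsSketch(NGHolder &g) {
    /* With SOM off, startDs may also be treated as a cyclic region when the
     * pattern has no start anchoring, as described in the pass above. */
    removeRegionRedundancy(g, SOM_NONE);
}

} // namespace ue2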

2531
src/nfagraph/ng_repeat.cpp Normal file

File diff suppressed because it is too large

160
src/nfagraph/ng_repeat.h Normal file

@@ -0,0 +1,160 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Bounded repeat analysis.
*/
#ifndef NG_REPEAT_H
#define NG_REPEAT_H
#include "ng_holder.h"
#include "ue2common.h"
#include "nfa/repeat_internal.h"
#include "util/depth.h"
#include "util/ue2_containers.h"
#include <map>
#include <vector>
namespace ue2 {
class NGHolder;
class ReportManager;
struct Grey;
/**
* \brief Everything you need to know about a bounded repeat that we have
* transformed.
*/
struct BoundedRepeatData {
BoundedRepeatData(enum RepeatType type_in, const depth &a, const depth &z,
u32 minPeriod_in, NFAVertex cyc, NFAVertex pos,
const std::vector<NFAVertex> &tug_in)
: type(type_in), repeatMin(a), repeatMax(z), minPeriod(minPeriod_in),
cyclic(cyc), pos_trigger(pos), tug_triggers(tug_in) {}
BoundedRepeatData() = delete; // no default construction allowed.
enum RepeatType type; //!< selected type based on bounds and structure
depth repeatMin; //!< minimum repeat bound
depth repeatMax; //!< maximum repeat bound
u32 minPeriod; //!< min trigger period
NFAVertex cyclic; //!< cyclic vertex representing repeat in graph
NFAVertex pos_trigger; //!< positive trigger vertex
std::vector<NFAVertex> tug_triggers; //!< list of tug trigger vertices
};
/**
* \brief Run the bounded repeat analysis and transform the graph where
* bounded repeats are found.
*
* \param h
* Graph to operate on.
* \param rm
* ReportManager, or nullptr if the graph's reports are internal (e.g. for
* Rose use).
* \param fixed_depth_tops
* Map of top to possible trigger depth.
* \param triggers
* Map of top to the vector of triggers (i.e. preceding literals/masks)
* \param repeats
* Repeat info is filled in for caller here.
* \param streaming
* True if we're in streaming mode.
* \param simple_model_selection
* Don't perform complex (and slow) model selection analysis, e.g.
* determining whether the repeat is sole entry.
* \param grey
* Grey box object.
* \param reformed_start_ds
* If supplied, this will be set to true if the graph was optimised for a
* leading first repeat, resulting in the output graph having no self-loop
* on startDs.
*/
void analyseRepeats(NGHolder &h, const ReportManager *rm,
const std::map<u32, u32> &fixed_depth_tops,
const std::map<u32, std::vector<std::vector<CharReach>>> &triggers,
std::vector<BoundedRepeatData> *repeats, bool streaming,
bool simple_model_selection, const Grey &grey,
bool *reformed_start_ds = nullptr);
/**
* \brief Information on repeats in a holder, returned from \ref findRepeats.
*/
struct GraphRepeatInfo {
depth repeatMin; /**< minimum bound */
depth repeatMax; /**< effective max bound */
std::vector<NFAVertex> vertices; /**< vertices involved in repeat */
};
/**
* \brief Provides information on repeats in the graph.
*/
void findRepeats(const NGHolder &h, u32 minRepeatVertices,
std::vector<GraphRepeatInfo> *repeats_out);
struct PureRepeat {
CharReach reach;
DepthMinMax bounds;
ue2::flat_set<ReportID> reports;
bool operator==(const PureRepeat &a) const {
return reach == a.reach && bounds == a.bounds && reports == a.reports;
}
bool operator!=(const PureRepeat &a) const { return !(*this == a); }
bool operator<(const PureRepeat &a) const {
if (reach != a.reach) {
return reach < a.reach;
}
if (bounds != a.bounds) {
return bounds < a.bounds;
}
return reports < a.reports;
}
};
/**
* \brief Returns true and fills the given PureRepeat structure if the graph is
* wholly a repeat over a single character class.
*
* For example, something like:
*
* /^[a-z]{10,20}/
*
* - Note: graph must not use SDS or EOD.
* - Note: \p PureRepeat::bounds::max is set to infinity if there is no upper
* bound on the repeat.
*/
bool isPureRepeat(const NGHolder &h, PureRepeat &r);
} // namespace ue2
#endif // NG_REPEAT_H
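
A minimal caller sketch for the interfaces above, assuming a populated NGHolder h and a Grey object, with empty trigger maps for simplicity:

#include "ng_repeat.h"
#include "ng_holder.h"
#include "grey.h"
#include "util/charreach.h"

#include <map>
#include <vector>

namespace ue2 {

static
void repeatAnalysisSketch(NGHolder &h, const Grey &grey) {
    // Whole-graph repeat check, e.g. for /^[a-z]{10,20}/.
    PureRepeat pr;
    if (isPureRepeat(h, pr)) {
        DEBUG_PRINTF("pure repeat over %zu chars, bounds %s\n",
                     pr.reach.count(), pr.bounds.str().c_str());
    }

    // General transform: no report remapping (nullptr), no known triggers,
    // block mode, simple model selection.
    std::vector<BoundedRepeatData> repeats;
    analyseRepeats(h, nullptr, std::map<u32, u32>(),
                   std::map<u32, std::vector<std::vector<CharReach>>>(),
                   &repeats, false, true, grey);
    DEBUG_PRINTF("%zu bounded repeats transformed\n", repeats.size());
}

} // namespace ue2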

86
src/nfagraph/ng_reports.cpp Normal file

@@ -0,0 +1,86 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Utility functions for working with Report ID sets.
*/
#include "ng_reports.h"
#include "ng_holder.h"
#include "util/container.h"
#include "util/compile_context.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
using namespace std;
namespace ue2 {
/** Returns the set of all reports in the graph. */
set<ReportID> all_reports(const NGHolder &g) {
set<ReportID> rv;
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
insert(&rv, g[v].reports);
}
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
insert(&rv, g[v].reports);
}
return rv;
}
/** True if *all* reports in the graph are exhaustible. */
bool can_exhaust(const NGHolder &g, const ReportManager &rm) {
for (ReportID report_id : all_reports(g)) {
if (rm.getReport(report_id).ekey == INVALID_EKEY) {
return false;
}
}
return true;
}
/** Derive a maximum offset for the graph from the max_offset values of its
* reports. Returns MAX_OFFSET for inf. */
u64a findMaxOffset(const NGHolder &g, const ReportManager &rm) {
u64a maxOffset = 0;
set<ReportID> reports = all_reports(g);
assert(!reports.empty());
for (ReportID report_id : all_reports(g)) {
const Report &ir = rm.getReport(report_id);
if (ir.hasBounds()) {
maxOffset = max(maxOffset, ir.maxOffset);
} else {
return MAX_OFFSET;
}
}
return maxOffset;
}
} // namespace ue2

57
src/nfagraph/ng_reports.h Normal file

@@ -0,0 +1,57 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Utility functions for working with Report ID sets.
*/
#ifndef NG_REPORTS_H
#define NG_REPORTS_H
#include "ue2common.h"
#include <set>
namespace ue2 {
class NGHolder;
class ReportManager;
/** Returns the set of all reports in the graph. */
std::set<ReportID> all_reports(const NGHolder &g);
/** True if *all* reports in the graph are exhaustible. */
bool can_exhaust(const NGHolder &g, const ReportManager &rm);
/** Derive a maximum offset for the graph from the max_offset values of its
* reports. Returns MAX_OFFSET for inf. */
u64a findMaxOffset(const NGHolder &g, const ReportManager &rm);
} // namespace ue2
#endif // NG_REPORTS_H
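
A small caller sketch combining the helpers above, assuming g and rm are an existing graph and its ReportManager:

#include "ng_reports.h"
#include "ng_holder.h"
#include "util/report_manager.h"

namespace ue2 {

static
void reportSummarySketch(const NGHolder &g, const ReportManager &rm) {
    DEBUG_PRINTF("%zu distinct reports\n", all_reports(g).size());
    if (can_exhaust(g, rm)) {
        DEBUG_PRINTF("all reports are exhaustible\n");
    }
    u64a max_offset = findMaxOffset(g, rm);
    if (max_offset != MAX_OFFSET) {
        DEBUG_PRINTF("bounded: no match past offset %llu\n", max_offset);
    }
}

} // namespace ue2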

340
src/nfagraph/ng_restructuring.cpp Normal file

@@ -0,0 +1,340 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief State numbering and late graph restructuring code.
*/
#include "ng_restructuring.h"
#include "grey.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/graph_range.h"
#include <algorithm>
#include <cassert>
#include <boost/graph/transpose_graph.hpp>
using namespace std;
namespace ue2 {
/** Connect the start vertex to each of the vertices in \p tops. This is useful
* temporarily for when we need to run a graph algorithm that expects a single
* source vertex. */
void wireStartToTops(NGHolder &g, const map<u32, NFAVertex> &tops,
vector<NFAEdge> &topEdges) {
for (const auto &top : tops) {
NFAVertex v = top.second;
assert(!isLeafNode(v, g));
const NFAEdge &e = add_edge(g.start, v, g).first;
topEdges.push_back(e);
}
}
static
void getStateOrdering(NGHolder &g, const map<u32, NFAVertex> &tops,
vector<NFAVertex> &ordering) {
// First, wire up our "tops" to start so that we have a single source,
// which will give a nicer topo order.
vector<NFAEdge> topEdges;
wireStartToTops(g, tops, topEdges);
renumberGraphVertices(g);
vector<NFAVertex> temp = getTopoOrdering(g);
remove_edges(topEdges, g);
// Move {start, startDs} to the end, so they'll be first when we reverse
// the ordering.
temp.erase(remove(temp.begin(), temp.end(), g.startDs));
temp.erase(remove(temp.begin(), temp.end(), g.start));
temp.push_back(g.startDs);
temp.push_back(g.start);
// Walk ordering, remove vertices that shouldn't be participating in state
// numbering, such as accepts.
for (auto v : temp) {
if (is_any_accept(v, g)) {
continue; // accepts don't need states
}
ordering.push_back(v);
}
// Output of topo order was in reverse.
reverse(ordering.begin(), ordering.end());
}
// Returns the number of states.
static
ue2::unordered_map<NFAVertex, u32>
getStateIndices(const NGHolder &h, const vector<NFAVertex> &ordering) {
ue2::unordered_map<NFAVertex, u32> states;
for (const auto &v : vertices_range(h)) {
states[v] = NO_STATE;
}
u32 stateNum = 0;
for (auto v : ordering) {
DEBUG_PRINTF("assigning state num %u to vertex %u\n", stateNum,
h[v].index);
states[v] = stateNum++;
}
return states;
}
/** UE-1648: A state with a single successor that happens to be a predecessor
* can be given any ol' state ID by the topological ordering, so we sink it
* next to its pred. This enables better merging. */
static
void optimiseTightLoops(const NGHolder &g, vector<NFAVertex> &ordering) {
deque<pair<NFAVertex, NFAVertex>> candidates;
auto start = ordering.begin();
for (auto it = ordering.begin(), ite = ordering.end(); it != ite; ++it) {
NFAVertex v = *it;
if (is_special(v, g)) {
continue;
}
if (out_degree(v, g) == 1) {
NFAVertex t = *(adjacent_vertices(v, g).first);
if (v == t) {
continue;
}
if (edge(t, v, g).second && find(start, it, t) != ite) {
candidates.push_back(make_pair(v, t));
}
}
}
for (const auto &cand : candidates) {
NFAVertex v = cand.first, u = cand.second;
auto u_it = find(ordering.begin(), ordering.end(), u);
auto v_it = find(ordering.begin(), ordering.end(), v);
// Only move candidates backwards in the ordering, and only move them
// when necessary.
if (u_it >= v_it || distance(u_it, v_it) == 1) {
continue;
}
DEBUG_PRINTF("moving vertex %u next to %u\n",
g[v].index, g[u].index);
ordering.erase(v_it);
ordering.insert(++u_it, v);
}
}
ue2::unordered_map<NFAVertex, u32>
numberStates(NGHolder &h, const map<u32, NFAVertex> &tops) {
DEBUG_PRINTF("numbering states for holder %p\n", &h);
vector<NFAVertex> ordering;
getStateOrdering(h, tops, ordering);
optimiseTightLoops(h, ordering);
ue2::unordered_map<NFAVertex, u32> states = getStateIndices(h, ordering);
return states;
}
u32 countStates(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
bool addTops) {
if (state_ids.empty()) {
return 0;
}
u32 max_state = 0;
for (const auto &m : state_ids) {
if (m.second != NO_STATE) {
max_state = max(m.second, max_state);
}
}
u32 num_states = max_state + 1;
assert(contains(state_ids, g.start));
if (addTops && state_ids.at(g.start) != NO_STATE) {
num_states--;
set<u32> tops;
for (auto e : out_edges_range(g.start, g)) {
tops.insert(g[e].top);
}
num_states += tops.size();
}
return num_states;
}
/**
* Returns true if start leads to all of startDs's proper successors or if
* start has no successors other than startDs.
*/
static
bool startIsRedundant(const NGHolder &g) {
set<NFAVertex> start, startDs;
for (const auto &e : out_edges_range(g.start, g)) {
NFAVertex v = target(e, g);
if (v == g.startDs) {
continue;
}
start.insert(v);
}
for (const auto &e : out_edges_range(g.startDs, g)) {
NFAVertex v = target(e, g);
if (v == g.startDs) {
continue;
}
startDs.insert(v);
}
// Trivial case: start has no successors other than startDs.
if (start.empty()) {
DEBUG_PRINTF("start has no out-edges other than to startDs\n");
return true;
}
if (start != startDs) {
DEBUG_PRINTF("out-edges of start and startDs aren't equivalent\n");
return false;
}
return true;
}
/** One final, FINAL optimisation. Drop either start or startDs if it's unused
* in this graph. We leave this until this late because having both vertices in
* the graph, with fixed state indices, is useful for merging and other
* analyses. */
void dropUnusedStarts(NGHolder &g, ue2::unordered_map<NFAVertex, u32> &states) {
u32 adj = 0;
if (startIsRedundant(g)) {
DEBUG_PRINTF("dropping unused start\n");
states[g.start] = NO_STATE;
adj++;
}
if (proper_out_degree(g.startDs, g) == 0) {
DEBUG_PRINTF("dropping unused startDs\n");
states[g.startDs] = NO_STATE;
adj++;
}
if (!adj) {
DEBUG_PRINTF("both start and startDs must remain\n");
return;
}
// We have removed one or both of the starts. Walk the non-special vertices
// in the graph with state indices assigned to them and subtract
// adj from all of them.
for (auto v : vertices_range(g)) {
u32 &state = states[v]; // note ref
if (state == NO_STATE) {
continue;
}
if (is_any_start(v, g)) {
assert(state <= 1);
state = 0; // one start remains
} else {
assert(!is_special(v, g));
assert(state >= adj);
state -= adj;
}
}
}
/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to
* accepts. */
void reverseHolder(const NGHolder &g_in, NGHolder &g) {
// Make the BGL do the grunt work.
ue2::unordered_map<NFAVertex, NFAVertex> vertexMap;
boost::transpose_graph(g_in.g, g.g,
orig_to_copy(boost::make_assoc_property_map(vertexMap)).
vertex_index_map(get(&NFAGraphVertexProps::index, g_in.g)));
// The transpose_graph operation will have created extra copies of our
// specials. We have to rewire their neighbours to the 'real' specials and
// delete them.
NFAVertex start = vertexMap[g_in.acceptEod];
NFAVertex startDs = vertexMap[g_in.accept];
NFAVertex accept = vertexMap[g_in.startDs];
NFAVertex acceptEod = vertexMap[g_in.start];
// Successors of starts.
for (const auto &e : out_edges_range(start, g)) {
NFAVertex v = target(e, g);
add_edge(g.start, v, g[e], g);
}
for (const auto &e : out_edges_range(startDs, g)) {
NFAVertex v = target(e, g);
add_edge(g.startDs, v, g[e], g);
}
// Predecessors of accepts.
for (const auto &e : in_edges_range(accept, g)) {
NFAVertex u = source(e, g);
add_edge(u, g.accept, g[e], g);
}
for (const auto &e : in_edges_range(acceptEod, g)) {
NFAVertex u = source(e, g);
add_edge(u, g.acceptEod, g[e], g);
}
// Remove our impostors.
clear_vertex(start, g);
remove_vertex(start, g);
clear_vertex(startDs, g);
remove_vertex(startDs, g);
clear_vertex(accept, g);
remove_vertex(accept, g);
clear_vertex(acceptEod, g);
remove_vertex(acceptEod, g);
// Renumber so that g's properties (number of vertices, edges) are
// accurate.
g.renumberVertices();
g.renumberEdges();
assert(num_vertices(g) == num_vertices(g_in));
assert(num_edges(g) == num_edges(g_in));
}
} // namespace ue2

86
src/nfagraph/ng_restructuring.h Normal file

@@ -0,0 +1,86 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief State numbering and late graph restructuring code.
*/
#ifndef NG_RESTRUCTURING_H
#define NG_RESTRUCTURING_H
#include "ng_holder.h"
#include "ue2common.h"
#include "util/ue2_containers.h"
#include <map>
#include <vector>
namespace ue2 {
class NGHolder;
/** Construct a reversed copy of an arbitrary NGHolder, mapping starts to
* accepts. */
void reverseHolder(const NGHolder &g, NGHolder &out);
/** Connect the start vertex to each of the vertices in \p tops. This is useful
* temporarily for when we need to run a graph algorithm that expects a single
* source vertex. */
void wireStartToTops(NGHolder &g, const std::map<u32, NFAVertex> &tops,
std::vector<NFAEdge> &topEdges);
/**
* \brief Special state index value meaning that the vertex will not
* participate in an (NFA/DFA/etc) implementation.
*/
static constexpr u32 NO_STATE = ~0;
/**
* \brief Gives each participating vertex in the graph a unique state index.
*/
ue2::unordered_map<NFAVertex, u32>
numberStates(NGHolder &h,
const std::map<u32, NFAVertex> &tops = std::map<u32, NFAVertex>{});
/**
* \brief Counts the number of states (vertices with state indices) in the
* graph.
*
* If addTops is true, also accounts for states that will be constructed for
* each unique top.
*/
u32 countStates(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids,
bool addTops = true);
/** Optimisation: drop unnecessary start states. */
void dropUnusedStarts(NGHolder &g, ue2::unordered_map<NFAVertex, u32> &states);
} // namespace ue2
#endif
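
A condensed sketch of the numbering flow implied by these declarations, assuming h is a finished holder with no additional tops:

#include "ng_restructuring.h"
#include "ng_holder.h"

namespace ue2 {

static
u32 numberAndCountSketch(NGHolder &h) {
    // Assign state indices, drop any start state that turned out to be
    // unnecessary, then count what remains (including per-top start states).
    auto state_ids = numberStates(h);
    dropUnusedStarts(h, state_ids);
    return countStates(h, state_ids);
}

} // namespace ue2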

297
src/nfagraph/ng_revacc.cpp Normal file

@@ -0,0 +1,297 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Reverse acceleration analysis.
*/
#include "ng_revacc.h"
#include "grey.h"
#include "ng_holder.h"
#include "ue2common.h"
#include "nfa/accel.h"
#include "nfa/nfa_internal.h"
#include "util/bitutils.h"
#include "util/charreach.h"
#include "util/graph_range.h"
using namespace std;
namespace ue2 {
static
bool isPseudoNoCaseChar(const CharReach &cr) {
return cr.count() == 2 && !(cr.find_first() & 32)
&& cr.test(cr.find_first() | 32);
}
static
bool lookForEodSchemes(const RevAccInfo &rev_info, const u32 minWidth,
NFA *nfa) {
DEBUG_PRINTF("pure eod triggered pattern\n");
/* 2 char */
for (u8 nocase = 0; nocase < 2; nocase++) {
for (u8 i = 1; i < MAX_RACCEL_OFFSET; i++) {
const CharReach &cr = rev_info.acceptEodReach[i];
const CharReach &cr2 = rev_info.acceptEodReach[i - 1];
if (!nocase && cr.count() == 1 && cr2.count() == 1) {
assert(i < minWidth);
if (i >= minWidth) {
goto single;
}
nfa->rAccelType = ACCEL_RDEOD;
nfa->rAccelData.array[0] = (u8)cr.find_first();
nfa->rAccelData.array[1] = (u8)cr2.find_first();
nfa->rAccelOffset = i + 1;
DEBUG_PRINTF("raccel eod x2 %u %04hx\n",
nfa->rAccelOffset, nfa->rAccelData.dc);
return true;
} else if (nocase && (cr.count() == 1 || isPseudoNoCaseChar(cr))
&& (cr2.count() == 1 || isPseudoNoCaseChar(cr2))) {
assert(i < minWidth);
if (i >= minWidth) {
goto single;
}
nfa->rAccelType = ACCEL_RDEOD_NOCASE;
nfa->rAccelData.array[0] = (u8)cr.find_first() & CASE_CLEAR; /* uppercase */
nfa->rAccelData.array[1] = (u8)cr2.find_first() & CASE_CLEAR;
nfa->rAccelOffset = i + 1;
DEBUG_PRINTF("raccel nc eod x2 %u %04hx\n",
nfa->rAccelOffset, nfa->rAccelData.dc);
return true;
}
}
}
single:
/* 1 char */
for (u8 nocase = 0; nocase < 2; nocase++) {
for (u8 i = 0; i < MAX_RACCEL_OFFSET; i++) {
const CharReach &cr = rev_info.acceptEodReach[i];
if (!nocase && cr.count() == 1) {
assert(i < minWidth);
if (i >= minWidth) {
return false;
}
nfa->rAccelType = ACCEL_REOD;
nfa->rAccelData.c = (u8) cr.find_first();
nfa->rAccelOffset = i + 1;
DEBUG_PRINTF("raccel eod %u %02hhx\n",
nfa->rAccelOffset, nfa->rAccelData.c);
return true;
} else if (nocase && isPseudoNoCaseChar(cr)) {
assert(i < minWidth);
if (i >= minWidth) {
return false;
}
nfa->rAccelType = ACCEL_REOD_NOCASE;
nfa->rAccelData.c = (u8)cr.find_first(); /* uppercase */
nfa->rAccelOffset = i + 1;
DEBUG_PRINTF("raccel nc eod %u %02hhx\n",
nfa->rAccelOffset, nfa->rAccelData.c);
return true;
}
}
}
return false;
}
static
bool lookForFloatingSchemes(const RevAccInfo &rev_info,
const u32 minWidth, NFA *nfa) {
/* 2 char */
for (u8 nocase = 0; nocase < 2; nocase++) {
for (u8 i = 1; i < MAX_RACCEL_OFFSET; i++) {
CharReach cr = rev_info.acceptEodReach[i] | rev_info.acceptReach[i];
CharReach cr2 = rev_info.acceptEodReach[i - 1]
| rev_info.acceptReach[i - 1];
if (!nocase && cr.count() == 1 && cr2.count() == 1) {
assert((u8)(i - 1) < minWidth);
if (i > minWidth) {
goto single;
}
nfa->rAccelType = ACCEL_RDVERM;
nfa->rAccelData.array[0] = (u8)cr.find_first();
nfa->rAccelData.array[1] = (u8)cr2.find_first();
nfa->rAccelOffset = i;
DEBUG_PRINTF("raccel dverm %u %02hhx%02hhx\n",
nfa->rAccelOffset, nfa->rAccelData.array[0],
nfa->rAccelData.array[1]);
return true;
} else if (nocase && (cr.count() == 1 || isPseudoNoCaseChar(cr))
&& (cr2.count() == 1 || isPseudoNoCaseChar(cr2))) {
assert((u8)(i - 1) < minWidth);
if (i > minWidth) {
goto single;
}
nfa->rAccelType = ACCEL_RDVERM_NOCASE;
nfa->rAccelData.array[0] = (u8)cr.find_first() & CASE_CLEAR;
nfa->rAccelData.array[1] = (u8)cr2.find_first() & CASE_CLEAR;
nfa->rAccelOffset = i;
DEBUG_PRINTF("raccel dverm %u %02hhx%02hhx nc\n",
nfa->rAccelOffset, nfa->rAccelData.array[0],
nfa->rAccelData.array[1]);
return true;
}
}
}
single:
/* 1 char */
for (u8 nocase = 0; nocase < 2; nocase++) {
for (u8 i = 0; i < MAX_RACCEL_OFFSET; i++) {
CharReach cr = rev_info.acceptEodReach[i] | rev_info.acceptReach[i];
if (!nocase && cr.count() == 1) {
assert(i < minWidth);
if (i >= minWidth) {
return false;
}
nfa->rAccelType = ACCEL_RVERM;
nfa->rAccelData.c = (u8)cr.find_first();
nfa->rAccelOffset = i + 1;
DEBUG_PRINTF("raccel verm %u %02hhx\n", nfa->rAccelOffset,
nfa->rAccelData.c);
return true;
} else if (nocase && isPseudoNoCaseChar(cr)) {
assert(i < minWidth);
if (i >= minWidth) {
return false;
}
nfa->rAccelType = ACCEL_RVERM_NOCASE;
nfa->rAccelData.c = (u8)cr.find_first(); /* 'uppercase' char */
nfa->rAccelOffset = i + 1;
DEBUG_PRINTF("raccel nc verm %u %02hhx\n", nfa->rAccelOffset,
nfa->rAccelData.c);
return true;
}
}
}
return false;
}
void buildReverseAcceleration(NFA *nfa, const RevAccInfo &rev_info,
u32 min_width, bool eod_only) {
assert(nfa);
if (!rev_info.valid) {
return;
}
nfa->rAccelOffset = 1;
assert(rev_info.acceptReach[0].any() || rev_info.acceptEodReach[0].any());
if (rev_info.acceptReach[0].none() && rev_info.acceptEodReach[0].none()) {
DEBUG_PRINTF("expected path to accept\n");
return;
}
if (rev_info.acceptReach[0].none()) {
/* eod only */
if (lookForEodSchemes(rev_info, min_width, nfa)) {
assert(nfa->rAccelOffset <= min_width);
return;
}
}
if (eod_only) {
return;
}
if (!lookForFloatingSchemes(rev_info, min_width, nfa)) {
DEBUG_PRINTF("failed to accelerate\n");
}
}
static
void populateRevAccelInfo(const NGHolder &g, NFAVertex terminal,
vector<CharReach> *reach) {
set<NFAVertex> vset;
for (auto v : inv_adjacent_vertices_range(terminal, g)) {
if (!is_special(v, g)) {
vset.insert(v);
}
}
for (u8 offset = 0; offset < MAX_RACCEL_OFFSET; offset++) {
set<NFAVertex> next;
for (auto v : vset) {
const CharReach &cr = g[v].char_reach;
(*reach)[offset] |= cr;
DEBUG_PRINTF("off %u adding %zu to %zu\n", offset, cr.count(),
(*reach)[offset].count());
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == g.start || u == g.startDs) {
                    /* kill all subsequent offsets by setting them to dot;
                     * setting to dot is in some ways inaccurate, as there may
                     * be no data at all, but neither case can be accelerated */
for (u8 i = offset + 1; i < MAX_RACCEL_OFFSET; i++) {
(*reach)[i].setall();
}
break;
} else if (!is_special(u, g)) {
next.insert(u);
}
}
}
swap(vset, next);
}
}
void populateReverseAccelerationInfo(RevAccInfo &rai, const NGHolder &g) {
DEBUG_PRINTF("pop rev info\n");
populateRevAccelInfo(g, g.accept, &rai.acceptReach);
populateRevAccelInfo(g, g.acceptEod, &rai.acceptEodReach);
rai.valid = true;
}
void mergeReverseAccelerationInfo(RevAccInfo &dest, const RevAccInfo &vic) {
DEBUG_PRINTF("merging ra\n");
dest.valid &= vic.valid;
for (u8 i = 0; i < MAX_RACCEL_OFFSET; i++) {
dest.acceptReach[i] |= vic.acceptReach[i];
dest.acceptEodReach[i] |= vic.acceptEodReach[i];
}
}
RevAccInfo::RevAccInfo(void)
: valid(false), acceptReach(MAX_RACCEL_OFFSET),
acceptEodReach(MAX_RACCEL_OFFSET) {}
} // namespace ue2

65
src/nfagraph/ng_revacc.h Normal file

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Reverse acceleration analysis.
*/
#ifndef NG_REVACC_H
#define NG_REVACC_H
#include "util/charreach.h"
#include <vector>
struct NFA;
namespace ue2 {
class NGHolder;
#define MAX_RACCEL_OFFSET 16
struct RevAccInfo {
RevAccInfo(void);
bool valid;
std::vector<CharReach> acceptReach; /**< bytes which can appear n
* bytes before a match */
std::vector<CharReach> acceptEodReach; /**< bytes which can appear n
* bytes before eod match */
};
void buildReverseAcceleration(struct NFA *nfa, const RevAccInfo &rev_info,
u32 min_width, bool eod_only = false);
void populateReverseAccelerationInfo(RevAccInfo &rai, const NGHolder &g);
void mergeReverseAccelerationInfo(RevAccInfo &dest, const RevAccInfo &vic);
} // namespace ue2
#endif
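
A rough sketch of how these pieces fit together, assuming nfa points at the engine structure being finalised and min_width is the graph's minimum match width:

#include "ng_revacc.h"
#include "ng_holder.h"
#include "ue2common.h"
#include "nfa/nfa_internal.h"

namespace ue2 {

static
void revAccelSketch(NFA *nfa, const NGHolder &g, u32 min_width) {
    RevAccInfo rai;
    populateReverseAccelerationInfo(rai, g);
    // Several graphs feeding one engine would be combined here with
    // mergeReverseAccelerationInfo() before building.
    buildReverseAcceleration(nfa, rai, min_width);
}

} // namespace ue2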

3036
src/nfagraph/ng_rose.cpp Normal file

File diff suppressed because it is too large

70
src/nfagraph/ng_rose.h Normal file

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Rose construction from NGHolder.
*/
#ifndef NG_ROSE_H
#define NG_ROSE_H
#include "ue2common.h"
namespace ue2 {
class NGHolder;
class ReportManager;
class RoseBuild;
struct CompileContext;
struct ue2_literal;
/** \brief Attempt to consume the entire pattern in graph \a h with Rose.
* Returns true if successful. */
bool splitOffRose(RoseBuild &rose, const NGHolder &h, bool prefilter,
const CompileContext &cc);
/** \brief Attempt to consume the entire pattern in graph \a h with Rose.
* This is the last attempt to handle a pattern before we resort to an outfix.
* Returns true if successful. */
bool finalChanceRose(RoseBuild &rose, const NGHolder &h, bool prefilter,
const CompileContext &cc);
/** \brief True if the pattern in \a h is consumable by Rose. This function
* may be conservative (return false even if supported) for efficiency. */
bool checkRose(const ReportManager &rm, const NGHolder &h, bool prefilter,
const CompileContext &cc);
/** \brief Returns the delay or MO_INVALID_IDX if the graph cannot match with
* the trailing literal. */
u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit,
u32 max_delay, bool overhang_ok = true);
} // namespace ue2
#endif // NG_ROSE_H
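
A sketch of one plausible way a caller might chain these interfaces (the exact order here is an assumption, not taken from the build code); rose, h, rm and cc are assumed to exist in the caller:

#include "ng_rose.h"
#include "ng_holder.h"
#include "rose/rose_build.h"
#include "util/compile_context.h"
#include "util/report_manager.h"

namespace ue2 {

static
bool tryRoseSketch(RoseBuild &rose, const NGHolder &h, const ReportManager &rm,
                   const CompileContext &cc) {
    const bool prefilter = false;
    if (checkRose(rm, h, prefilter, cc)
        && splitOffRose(rose, h, prefilter, cc)) {
        return true; // pattern fully consumed by Rose
    }
    // Last try before falling back to an outfix engine.
    return finalChanceRose(rose, h, prefilter, cc);
}

} // namespace ue2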

93
src/nfagraph/ng_sep.cpp Normal file

@@ -0,0 +1,93 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Short Exhaustible Passthroughs.
*
* Analysis code for determining whether a graph should be treated specially
* because it is short and contains exhaustible reports; typically we turn
* these into outfixes rather than risk them becoming Rose literals.
*
* For example, the pattern:
*
* /[a-f]/H
*
* ... is far better suited to becoming a small outfix that generates one match
* and goes dead than being split into six one-byte Rose literals that end up
* in the literal matcher.
*/
#include "ng_sep.h"
#include "grey.h"
#include "ng_holder.h"
#include "ng_reports.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/graph_range.h"
using namespace std;
namespace ue2 {
static
bool checkFromVertex(const NGHolder &g, NFAVertex start) {
for (auto v : adjacent_vertices_range(start, g)) {
if (v == g.startDs) {
continue;
}
assert(!is_special(v, g)); /* should not be vacuous */
if (!edge(g.startDs, v, g).second) { /* only floating starts */
return false;
} else if (out_degree(v, g) == 1
&& edge(v, g.accept, g).second) { /* only floating end */
; /* possible sep */
} else {
return false;
}
}
return true;
}
bool isSEP(const NGHolder &g, const ReportManager &rm, const Grey &grey) {
if (!grey.mergeSEP || !can_exhaust(g, rm)) {
return false;
}
if (!checkFromVertex(g, g.start) || !checkFromVertex(g, g.startDs)) {
return false;
}
assert(out_degree(g.start, g) || proper_out_degree(g.startDs, g));
DEBUG_PRINTF("graph is an SEP\n");
return true;
}
} // namespace ue2

46
src/nfagraph/ng_sep.h Normal file

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Short Exhaustible Passthroughs.
*/
#ifndef NG_SEP_H
#define NG_SEP_H
namespace ue2 {
struct Grey;
class NGHolder;
class ReportManager;
bool isSEP(const NGHolder &g, const ReportManager &rm, const Grey &grey);
} // namespace ue2
#endif
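
A minimal sketch using the predicate above, assuming the caller routes qualifying graphs to the outfix path instead of splitting them into literals:

#include "ng_sep.h"
#include "grey.h"
#include "ng_holder.h"
#include "util/report_manager.h"

namespace ue2 {

static
bool preferOutfixSketch(const NGHolder &g, const ReportManager &rm,
                        const Grey &grey) {
    if (isSEP(g, rm, grey)) {
        // e.g. /[a-f]/H: one exhaustible match, better as a tiny outfix.
        return true;
    }
    return false;
}

} // namespace ue2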

245
src/nfagraph/ng_small_literal_set.cpp Normal file

@@ -0,0 +1,245 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Rose construction from NGHolder for cases representing small literal
* sets.
*/
#include "ng_small_literal_set.h"
#include "grey.h"
#include "ng_util.h"
#include "ng_holder.h"
#include "rose/rose_build.h"
#include "util/compare.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include "util/ue2string.h"
#include "ue2common.h"
#include <map>
#include <set>
#include <vector>
#include <boost/range/adaptor/map.hpp>
using namespace std;
using boost::adaptors::map_keys;
namespace ue2 {
/** \brief The maximum number of literals to accept per pattern. */
static const size_t MAX_LITERAL_SET_SIZE = 30;
/**
* \brief The maximum number of literals to accept per pattern where at least
* one is weak (has period < MIN_STRONG_PERIOD).
*/
static const size_t MAX_WEAK_LITERAL_SET_SIZE = 20;
/**
* \brief The minimum string period to consider a literal "strong" (and not
* apply the weak size limit).
*/
static const size_t MIN_STRONG_PERIOD = 3;
namespace {
struct sls_literal {
bool anchored;
bool eod;
ue2_literal s;
explicit sls_literal(bool a) : anchored(a), eod(false) {}
sls_literal append(char c, bool nocase) const {
sls_literal rv(anchored);
rv.s = s;
rv.s.push_back(ue2_literal::elem(c, nocase));
return rv;
}
};
static
bool operator<(const sls_literal &a, const sls_literal &b) {
ORDER_CHECK(anchored);
ORDER_CHECK(eod);
ORDER_CHECK(s);
return false;
}
} // namespace
static
bool checkLongMixedSensitivityLiterals(
const map<sls_literal, ue2::flat_set<ReportID>> &literals) {
const size_t len = MAX_MASK2_WIDTH;
for (const sls_literal &lit : literals | map_keys) {
if (mixed_sensitivity(lit.s) && lit.s.length() > len) {
return false;
}
}
return true;
}
static
bool findLiterals(const NGHolder &g,
map<sls_literal, ue2::flat_set<ReportID>> *literals) {
vector<NFAVertex> order = getTopoOrdering(g);
vector<set<sls_literal>> built(num_vertices(g));
for (auto it = order.rbegin(); it != order.rend(); ++it) {
NFAVertex v = *it;
set<sls_literal> &out = built[g[v].index];
assert(out.empty());
if (v == g.start) {
out.insert(sls_literal(true));
continue;
} else if (v == g.startDs) {
out.insert(sls_literal(false));
continue;
}
bool eod = v == g.acceptEod;
bool accept = v == g.accept || v == g.acceptEod;
const CharReach &cr = g[v].char_reach;
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (u == g.accept) {
continue;
}
if (u == g.start && edge(g.startDs, v, g).second) {
/* floating start states may have connections to start and
* startDs - don't create duplicate anchored literals */
DEBUG_PRINTF("skipping as floating\n");
continue;
}
set<sls_literal> &in = built[g[u].index];
assert(!in.empty());
for (const sls_literal &lit : in) {
if (accept) {
sls_literal accept_lit = lit; // copy
accept_lit.eod = eod;
insert(&(*literals)[accept_lit], g[u].reports);
continue;
}
for (size_t c = cr.find_first(); c != cr.npos;
c = cr.find_next(c)) {
bool nocase = ourisalpha(c) && cr.test(mytoupper(c))
&& cr.test(mytolower(c));
if (nocase && (char)c == mytolower(c)) {
continue; /* uppercase already handled us */
}
out.insert(lit.append((u8)c, nocase));
if (out.size() + literals->size() > MAX_LITERAL_SET_SIZE) {
return false;
}
}
}
}
}
return true;
}
static
size_t min_period(const map<sls_literal, ue2::flat_set<ReportID>> &literals) {
size_t rv = SIZE_MAX;
for (const sls_literal &lit : literals | map_keys) {
rv = min(rv, minStringPeriod(lit.s));
}
DEBUG_PRINTF("min period %zu\n", rv);
return rv;
}
// If this component is just a small set of literals and can be handled by
// Rose, feed it directly into rose.
bool handleSmallLiteralSets(RoseBuild &rose, const NGHolder &g,
const CompileContext &cc) {
if (!cc.grey.allowSmallLiteralSet) {
return false;
}
if (!isAcyclic(g)) {
/* literal sets would typically be acyclic... */
DEBUG_PRINTF("not acyclic\n");
return false;
}
map<sls_literal, ue2::flat_set<ReportID>> literals;
if (!findLiterals(g, &literals)) {
DEBUG_PRINTF(":(\n");
return false;
}
assert(!literals.empty());
if (literals.size() > MAX_LITERAL_SET_SIZE) {
/* try a mask instead */
DEBUG_PRINTF("too many literals\n");
return false;
}
size_t period = min_period(literals);
if (period < MIN_STRONG_PERIOD &&
literals.size() > MAX_WEAK_LITERAL_SET_SIZE) {
DEBUG_PRINTF("too many literals with weak period\n");
return false;
}
if (!checkLongMixedSensitivityLiterals(literals)) {
DEBUG_PRINTF("long mixed\n");
return false;
}
DEBUG_PRINTF("adding %zu literals\n", literals.size());
for (const auto &m : literals) {
const sls_literal &lit = m.first;
const auto &reports = m.second;
rose.add(lit.anchored, lit.eod, lit.s, reports);
}
return true;
}
} // namespace ue2

50
src/nfagraph/ng_small_literal_set.h Normal file

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Rose construction from NGHolder for cases representing small literal
* sets.
*/
#ifndef NG_SMALL_LITERAL_SET_H
#define NG_SMALL_LITERAL_SET_H
namespace ue2 {
class RoseBuild;
class NGHolder;
struct CompileContext;
/** \brief If the graph represents a small set of literals, feed them directly
* to rose. Returns true if successful. */
bool handleSmallLiteralSets(RoseBuild &rose, const NGHolder &h,
const CompileContext &cc);
} // namespace ue2
#endif // NG_SMALL_LITERAL_SET_H
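
A brief call-site sketch for the function above; on success the component needs no further NFA construction:

#include "ng_small_literal_set.h"
#include "ng_holder.h"
#include "rose/rose_build.h"
#include "util/compile_context.h"

namespace ue2 {

static
bool absorbLiteralSetSketch(RoseBuild &rose, const NGHolder &g,
                            const CompileContext &cc) {
    // True means the component's literals (up to MAX_LITERAL_SET_SIZE of
    // them) were fed straight into Rose and the graph can be dropped.
    return handleSmallLiteralSets(rose, g, cc);
}

} // namespace ue2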

3108
src/nfagraph/ng_som.cpp Normal file

File diff suppressed because it is too large

77
src/nfagraph/ng_som.h Normal file

@@ -0,0 +1,77 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief SOM ("Start of Match") analysis.
*/
#ifndef NG_SOM_H
#define NG_SOM_H
#include "som/som.h"
namespace ue2 {
class NG;
class NGHolder;
class NGWrapper;
struct Grey;
enum sombe_rv {
SOMBE_FAIL,
SOMBE_HANDLED_INTERNAL,
SOMBE_HANDLED_ALL
};
/** \brief Perform SOM analysis on the given graph.
*
* This function will replace report IDs and mutate the graph, then return
* SOMBE_HANDLED_INTERNAL if SOM can be established and the full graph still
* needs to be handled (rose, etc).
*
* Returns SOMBE_HANDLED_ALL if everything has been done and the pattern has
* been handled in all its glory.
*
* Returns SOMBE_FAIL and does not mutate the graph if SOM cannot be
* established.
*
* May throw a "Pattern too large" exception if prefixes of the
* pattern are too large to compile.
*/
sombe_rv doSom(NG &ng, NGHolder &h, const NGWrapper &w, u32 comp_id,
som_type som);
/** Returns SOMBE_FAIL (and the original graph) if SOM cannot be established.
* May also throw pattern too large if prefixes of the pattern are too large to
* compile. */
sombe_rv doSomWithHaig(NG &ng, NGHolder &h, const NGWrapper &w, u32 comp_id,
som_type som);
} // namespace ue2
#endif // NG_SOM_H
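
A compressed sketch of dispatching on the documented return values, assuming ng, h, w and comp_id come from the surrounding compile:

#include "ng_som.h"
#include "ng.h"
#include "ng_holder.h"
#include "ue2common.h"

namespace ue2 {

static
bool somSketch(NG &ng, NGHolder &h, const NGWrapper &w, u32 comp_id,
               som_type som) {
    sombe_rv rv = doSom(ng, h, w, comp_id, som);
    if (rv == SOMBE_HANDLED_ALL) {
        return true;  // nothing left to build for this pattern
    }
    if (rv == SOMBE_HANDLED_INTERNAL) {
        // SOM is established, but the graph still needs an engine (Rose etc).
        return false;
    }
    // SOMBE_FAIL: doSom left the graph untouched; doSomWithHaig() is one
    // possible fallback before giving up on SOM for this pattern.
    return false;
}

} // namespace ue2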

198
src/nfagraph/ng_som_add_redundancy.cpp Normal file

@@ -0,0 +1,198 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Add redundancy to graph to assist in SOM analysis.
*
* Currently patterns of the form:
*
* /(GET|POST).*foo/
*
* baffle our SOM analysis as the T's get merged into one by our graph
* reductions and they lose the fixed depth property. One way to solve this is
* to tell the T vertex to go fork itself before we do the main SOM pass.
*
* Overall plan:
*
* 1. build a topo ordering
* 2. walk vertices in topo order
* 3. fix up vertices where possible
* 4. go home
*
* Vertex fix up plan:
*
* 1. consider depth of vertex
* - if vertex is at fixed depth continue to next vertex
* - if vertex can be at an unbounded depth continue to next vertex
* - if vertex has a pred which is not a fixed depth continue to next vertex
* 2. group preds by their depth
* 3. for each group:
* - create a clone of the vertex (vertex props and out edges)
* - create edges from each vertex in the group to the clone
* - work out the depth for the clone
* 4. blow away original vertex
*
* Originally in UE-1862.
*/
#include "ng_som_add_redundancy.h"
#include "ng_dump.h"
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/container.h"
#include "util/depth.h"
#include "util/graph.h"
#include "util/graph_range.h"
using namespace std;
namespace ue2 {
/** \brief Hard limit on the maximum number of new vertices to create. */
static const size_t MAX_NEW_VERTICES = 32;
static
const DepthMinMax &getDepth(NFAVertex v, const NGHolder &g,
const vector<DepthMinMax> &depths) {
return depths.at(g[v].index);
}
static
bool hasFloatingPred(NFAVertex v, const NGHolder &g,
const vector<DepthMinMax> &depths) {
for (auto u : inv_adjacent_vertices_range(v, g)) {
const DepthMinMax &d = getDepth(u, g, depths);
if (d.min != d.max) {
return true;
}
}
return false;
}
static
bool forkVertex(NFAVertex v, NGHolder &g, vector<DepthMinMax> &depths,
set<NFAVertex> &dead, size_t *numNewVertices) {
map<depth, vector<NFAEdge>> predGroups;
for (const auto &e : in_edges_range(v, g)) {
const DepthMinMax &d = getDepth(source(e, g), g, depths);
assert(d.min == d.max);
predGroups[d.min].push_back(e);
}
DEBUG_PRINTF("forking vertex with %zu pred groups\n", predGroups.size());
if (*numNewVertices + predGroups.size() > MAX_NEW_VERTICES) {
return false;
}
*numNewVertices += predGroups.size();
for (auto &group : predGroups) {
const depth &predDepth = group.first;
const vector<NFAEdge> &preds = group.second;
// Clone v for this depth with all its associated out-edges.
u32 clone_idx = depths.size(); // next index to be used
NFAVertex clone = add_vertex(g[v], g);
depth clone_depth = predDepth + 1;
g[clone].index = clone_idx;
depths.push_back(DepthMinMax(clone_depth, clone_depth));
DEBUG_PRINTF("cloned vertex %u with depth %s\n", clone_idx,
clone_depth.str().c_str());
// Add copies of the out-edges from v.
for (const auto &e : out_edges_range(v, g)) {
add_edge(clone, target(e, g), g[e], g);
}
// Add in-edges from preds in this group.
for (const auto &e : preds) {
add_edge(source(e, g), clone, g[e], g);
}
}
clear_vertex(v, g);
dead.insert(v);
return true;
}
bool addSomRedundancy(NGHolder &g, vector<DepthMinMax> &depths) {
DEBUG_PRINTF("entry\n");
const vector<NFAVertex> ordering = getTopoOrdering(g);
set<NFAVertex> dead;
size_t numNewVertices = 0;
for (auto it = ordering.rbegin(), ite = ordering.rend(); it != ite; ++it) {
NFAVertex v = *it;
if (is_special(v, g)) {
continue;
}
if (!hasGreaterInDegree(0, v, g)) {
continue; // unreachable, probably killed
}
const DepthMinMax &d = getDepth(v, g, depths);
DEBUG_PRINTF("vertex %u has depths %s\n", g[v].index,
d.str().c_str());
if (d.min == d.max) {
DEBUG_PRINTF("fixed depth\n");
continue;
}
if (d.max.is_unreachable()) {
DEBUG_PRINTF("unbounded depth\n");
continue;
}
if (hasFloatingPred(v, g, depths)) {
DEBUG_PRINTF("has floating pred\n");
continue;
}
if (!forkVertex(v, g, depths, dead, &numNewVertices)) {
DEBUG_PRINTF("new vertex limit reached\n");
break;
}
}
assert(numNewVertices <= MAX_NEW_VERTICES);
if (dead.empty()) {
return false; // no changes made to the graph
}
remove_vertices(dead, g);
return true;
}
} // namespace ue2


@@ -0,0 +1,47 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Add redundancy to graph to assist in SOM analysis.
*/
#ifndef NG_SOM_ADD_REDUNDANCY_H
#define NG_SOM_ADD_REDUNDANCY_H
#include "util/depth.h"
#include <vector>
namespace ue2 {
class NGHolder;
bool addSomRedundancy(NGHolder &g, std::vector<DepthMinMax> &depths);
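/* Usage sketch (illustrative only, not part of the original header): 'depths'
* is indexed by vertex index - for example the per-vertex SOM depths produced
* by getDistancesFromSOM() in ng_som_util.h - and is extended in place when
* the pass clones vertices:
*
*     std::vector<DepthMinMax> depths = getDistancesFromSOM(g);
*     bool changed = addSomRedundancy(g, depths);
*/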
} // namespace ue2
#endif


@@ -0,0 +1,358 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Utility functions related to SOM ("Start of Match").
*/
#include "ng_som_util.h"
#include "ng_depth.h"
#include "ng_execute.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph_range.h"
using namespace std;
namespace ue2 {
static
void wireSuccessorsToStart(NGHolder &g, NFAVertex u) {
for (auto v : adjacent_vertices_range(u, g)) {
add_edge_if_not_present(g.start, v, g);
}
}
vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g_orig) {
// We operate on a temporary copy of the original graph here, so we don't
// have to mutate the original.
NGHolder g;
ue2::unordered_map<NFAVertex, NFAVertex> vmap; // vertex in g_orig to vertex in g
cloneHolder(g, g_orig, &vmap);
vector<NFAVertex> vstarts;
for (auto v : vertices_range(g)) {
if (is_virtual_start(v, g)) {
vstarts.push_back(v);
}
}
vstarts.push_back(g.startDs);
// wire the successors of every virtual start or startDs to g.start.
for (auto v : vstarts) {
wireSuccessorsToStart(g, v);
}
// drop the in-edges of every virtual start so that they don't participate
// in the depth calculation.
for (auto v : vstarts) {
clear_in_edges(v, g);
}
//dumpGraph("som_depth.dot", g.g);
vector<DepthMinMax> temp_depths; // numbered by vertex index in g
calcDepthsFrom(g, g.start, temp_depths);
// Transfer depths, indexed by vertex index in g_orig.
vector<DepthMinMax> depths(num_vertices(g_orig));
for (auto v_orig : vertices_range(g_orig)) {
assert(contains(vmap, v_orig));
NFAVertex v_new = vmap[v_orig];
u32 orig_idx = g_orig[v_orig].index;
DepthMinMax &d = depths.at(orig_idx);
if (v_orig == g_orig.startDs || is_virtual_start(v_orig, g_orig)) {
// StartDs and virtual starts always have zero depth.
d = DepthMinMax(0, 0);
} else {
u32 new_idx = g[v_new].index;
d = temp_depths.at(new_idx);
}
}
return depths;
}
bool firstMatchIsFirst(const NGHolder &p) {
/* If the first match (by end offset) is not the first match (by start
* offset) then we can't create a lock after it.
*
* Consider: 4009:/(foobar|ob).*bugger/s
*
* We don't care about races on the last byte as they can be resolved easily
* at runtime, e.g. /(foobar|obar).*hi/
*
* It should be obvious we don't care about one match being a prefix
* of another as they share the same start offset.
*
* Therefore, the case where we cannot establish that the SOM does not
* regress is when there exist s1 and s2 in the language of p such that s2 is
* a proper infix of s1.
*
* It is tempting to add the further restriction that there does not exist a
* prefix of s1 that is in the language of p (as in that case we would
* presume the lock has already been set). However, we have no way of
* knowing if the lock can be cleared by some characters, and if so, whether
* it is still set. TODO: if we knew what the lock's escapes were, we could
* verify that the rest of s1 does not clear the lock. (1)
*/
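/* Concretely, for the (foobar|ob) prefix above: scanning "foobar", the match
* "ob" ends first but starts later than the eventual "foobar" match, and "ob"
* is a proper infix of "foobar", so a SOM lock taken after the "ob" match
* could record a start offset that is too late. */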
DEBUG_PRINTF("entry\n");
/* If there are any big cycles throw up our hands in despair */
if (hasBigCycles(p)) {
DEBUG_PRINTF("fail, big cycles\n");
return false;
}
set<NFAVertex> states;
/* turn on all states (except starts - avoid suffix matches) */
/* If we were doing (1) we would also exclude states leading to accepts -
avoid prefix matches */
for (auto v : vertices_range(p)) {
assert(!is_virtual_start(v, p));
if (!is_special(v, p)) {
DEBUG_PRINTF("turning on %u\n", p[v].index);
states.insert(v);
}
}
/* run the prefix through the main graph */
execute_graph(p, p, &states);
for (auto v : states) {
/* need to check if this vertex may represent an infix match - ie
* it does not have an edge to accept. */
DEBUG_PRINTF("check %u\n", p[v].index);
if (!edge(v, p.accept, p).second) {
DEBUG_PRINTF("fail %u\n", p[v].index);
return false;
}
}
DEBUG_PRINTF("done first is first check\n");
return true;
}
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &region_map,
smgb_cache &cache) {
/* Need to ensure all matches of the graph g up to u contain no infixes
* which are also matches of the graph to u.
*
* This is basically the same as firstMatchIsFirst except that g is not
* always a dag. As we haven't gotten around to writing an execute_graph
* that operates on general graphs, we take some (hopefully) conservative
* short cuts.
*
* Note: if u can be jumped over, we will take jump edges
* into account as a possibility of SOM going backwards
*
* TODO: write a generalised ng_execute_graph/make this less hacky
*/
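/* In outline, the code below (a) treats any back edge inside or before u's
* region as an immediate "SOM may go backwards", (b) builds a copy of g in
* which u is rewired straight to accept and virtual starts are folded into
* startDs, and (c) reports that SOM may regress iff that copy (acyclic apart
* from self-loops) fails the firstMatchIsFirst() check. */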
assert(&g == &cache.g);
if (contains(cache.smgb, u)) {
return cache.smgb[u];
}
DEBUG_PRINTF("checking if som can go backwards on %u\n",
g[u].index);
set<NFAEdge> be;
BackEdges<set<NFAEdge>> backEdgeVisitor(be);
depth_first_search(
g.g, visitor(backEdgeVisitor)
.root_vertex(g.start)
.vertex_index_map(get(&NFAGraphVertexProps::index, g.g)));
bool rv;
if (0) {
exit:
DEBUG_PRINTF("using cached result\n");
cache.smgb[u] = rv;
return rv;
}
assert(contains(region_map, u));
const u32 u_region = region_map.at(u);
for (const auto &e : be) {
NFAVertex s = source(e, g);
NFAVertex t = target(e, g);
/* only need to worry about big cycles including/before u */
DEBUG_PRINTF("back edge %u %u\n", g[s].index,
g[t].index);
if (s != t && region_map.at(s) <= u_region) {
DEBUG_PRINTF("eek big cycle\n");
rv = true; /* big cycle -> eek */
goto exit;
}
}
ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
NGHolder c_g;
cloneHolder(c_g, g, &orig_to_copy);
for (NFAVertex v : vertices_range(g)) {
if (!is_virtual_start(v, g)) {
continue;
}
NFAVertex c_v = orig_to_copy[v];
orig_to_copy[v] = c_g.startDs;
for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
add_edge_if_not_present(c_g.startDs, c_w, c_g);
}
clear_vertex(c_v, c_g);
}
NFAVertex c_u = orig_to_copy[u];
clear_in_edges(c_g.acceptEod, c_g);
add_edge(c_g.accept, c_g.acceptEod, c_g);
clear_in_edges(c_g.accept, c_g);
clear_out_edges(c_u, c_g);
if (hasSelfLoop(u, g)) {
add_edge(c_u, c_u, c_g);
}
add_edge(c_u, c_g.accept, c_g);
set<NFAVertex> u_succ;
insert(&u_succ, adjacent_vertices(u, g));
u_succ.erase(u);
for (auto t : inv_adjacent_vertices_range(u, g)) {
if (t == u) {
continue;
}
for (auto v : adjacent_vertices_range(t, g)) {
if (contains(u_succ, v)) {
add_edge(orig_to_copy[t], c_g.accept, c_g);
break;
}
}
}
pruneUseless(c_g);
be.clear();
depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start).
vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g)));
for (const auto &e : be) {
NFAVertex s = source(e, c_g);
NFAVertex t = target(e, c_g);
DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index);
if (s != t) {
assert(0);
DEBUG_PRINTF("eek big cycle\n");
rv = true; /* big cycle -> eek */
goto exit;
}
}
DEBUG_PRINTF("checking acyclic+selfloop graph\n");
rv = !firstMatchIsFirst(c_g);
DEBUG_PRINTF("som may regress? %d\n", (int)rv);
goto exit;
}
bool sentClearsTail(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &region_map,
const NGHolder &sent, u32 last_head_region,
u32 *bad_region) {
/* if a subsequent match from the prefix clears the rest of the pattern
* we can just keep track of the last match of the prefix.
* To see if this property holds, we could:
*
* 1A: turn on all states in the tail and run all strings that may
* match the prefix past the tail, if we are still in any states then
* this property does not hold.
*
* 1B: we turn on the initial states of the tail and run any strings which
* may finish any partial matches in the prefix and see if we end up with
* anything which would also imply that this property does not hold.
*
* OR
*
* 2: we just turn everything on and run the prefix inputs past it and see what
* we are left with. I think that is equivalent to scheme 1 and is easier to
* implement. TODO: ponder
*
* Anyway, we are going with scheme 2 until further notice.
*/
u32 first_bad_region = ~0U;
set<NFAVertex> states;
/* turn on all states */
DEBUG_PRINTF("region %u is cutover\n", last_head_region);
for (auto v : vertices_range(g)) {
if (v != g.accept && v != g.acceptEod) {
states.insert(v);
}
}
for (UNUSED auto v : states) {
DEBUG_PRINTF("start state: %u\n", g[v].index);
}
/* run the prefix through the main graph */
execute_graph(g, sent, &states);
/* .. and check if we are left with anything in the tail region */
for (auto v : states) {
if (v == g.start || v == g.startDs) {
continue; /* not in tail */
}
DEBUG_PRINTF("v %u is still on\n", g[v].index);
assert(v != g.accept && v != g.acceptEod); /* no cr */
assert(contains(region_map, v));
const u32 v_region = region_map.at(v);
if (v_region > last_head_region) {
DEBUG_PRINTF("bailing, %u > %u\n", v_region, last_head_region);
first_bad_region = min(first_bad_region, v_region);
}
}
if (first_bad_region != ~0U) {
DEBUG_PRINTF("first bad region is %u\n", first_bad_region);
*bad_region = first_bad_region;
return false;
}
return true;
}
} // namespace ue2


@@ -0,0 +1,84 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Utility functions related to SOM ("Start of Match").
*/
#ifndef NG_SOM_UTIL_H
#define NG_SOM_UTIL_H
#include "ng_util.h"
#include "util/depth.h"
#include "util/ue2_containers.h"
#include <map>
#include <vector>
namespace ue2 {
class NGHolder;
/**
* Returns min/max distance from start of match, indexed by vertex_index.
*/
std::vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g);
/**
* Returns true if the first match by end-offset must always be the first match
* by start-offset.
*/
bool firstMatchIsFirst(const NGHolder &p);
struct smgb_cache : public mbsb_cache {
explicit smgb_cache(const NGHolder &gg) : mbsb_cache(gg) {}
std::map<NFAVertex, bool> smgb;
};
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &region_map,
smgb_cache &cache);
/**
* Returns true if matching 'sent' causes all tail states in the main graph \a
* g to go dead. A tail state is any state with a region greater than
* \a last_head_region.
*
* - The graph \a sent must be a "kinda-DAG", where the only back-edges present
* are self-loops.
* - If the result is false, \a bad_region will be updated with the smallest
* region ID associated with a tail state that is still on.
*/
bool sentClearsTail(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &region_map,
const NGHolder &sent, u32 last_head_region,
u32 *bad_region);
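/* Usage sketch (illustrative only; 'prefix_region' stands in for the caller's
* cutover region id):
*
*     u32 bad_region = 0;
*     if (sentClearsTail(g, region_map, sent, prefix_region, &bad_region)) {
*         // tracking only the most recent prefix match is sufficient
*     } else {
*         // a tail state in region bad_region can survive a prefix match
*     }
*/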
} // namespace ue2
#endif // NG_SOM_UTIL_H

216
src/nfagraph/ng_split.cpp Normal file

@@ -0,0 +1,216 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Functions for splitting NFAGraphs into LHS and RHS.
*/
#include "ng_split.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
#include <map>
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
static
void clearAccepts(NGHolder &g) {
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
g[v].reports.clear();
}
for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
g[v].reports.clear();
}
clear_in_edges(g.accept, g);
clear_in_edges(g.acceptEod, g);
add_edge(g.accept, g.acceptEod, g);
}
static
void filterSplitMap(const NGHolder &g, ue2::unordered_map<NFAVertex, NFAVertex> *out_map) {
ue2::unordered_set<NFAVertex> verts;
insert(&verts, vertices(g));
ue2::unordered_map<NFAVertex, NFAVertex>::iterator it = out_map->begin();
while (it != out_map->end()) {
ue2::unordered_map<NFAVertex, NFAVertex>::iterator jt = it;
++it;
if (!contains(verts, jt->second)) {
out_map->erase(jt);
}
}
}
static
void splitLHS(const NGHolder &base, const vector<NFAVertex> &pivots,
const vector<NFAVertex> &rhs_pivots,
NGHolder *lhs, ue2::unordered_map<NFAVertex, NFAVertex> *lhs_map) {
assert(lhs && lhs_map);
cloneHolder(*lhs, base, lhs_map);
clearAccepts(*lhs);
for (auto pivot : pivots) {
DEBUG_PRINTF("pivot is %u lv %zu lm %zu\n", base[pivot].index,
num_vertices(*lhs), lhs_map->size());
assert(contains(*lhs_map, pivot));
for (auto v : rhs_pivots) {
assert(contains(*lhs_map, v));
remove_edge((*lhs_map)[pivot], (*lhs_map)[v], *lhs);
}
(*lhs)[(*lhs_map)[pivot]].reports.insert(0);
add_edge((*lhs_map)[pivot], lhs->accept, *lhs);
}
pruneUseless(*lhs);
filterSplitMap(*lhs, lhs_map);
switch (base.kind) {
case NFA_PREFIX:
case NFA_OUTFIX:
lhs->kind = NFA_PREFIX;
break;
case NFA_INFIX:
case NFA_SUFFIX:
lhs->kind = NFA_INFIX;
break;
case NFA_REV_PREFIX:
assert(0);
break;
}
}
void splitLHS(const NGHolder &base, NFAVertex pivot,
NGHolder *lhs, ue2::unordered_map<NFAVertex, NFAVertex> *lhs_map) {
vector<NFAVertex> pivots(1, pivot);
vector<NFAVertex> rhs_pivots;
insert(&rhs_pivots, rhs_pivots.end(), adjacent_vertices(pivot, base));
splitLHS(base, pivots, rhs_pivots, lhs, lhs_map);
}
void splitRHS(const NGHolder &base, const vector<NFAVertex> &pivots,
NGHolder *rhs, ue2::unordered_map<NFAVertex, NFAVertex> *rhs_map) {
assert(rhs && rhs_map);
cloneHolder(*rhs, base, rhs_map);
clear_out_edges(rhs->start, *rhs);
clear_out_edges(rhs->startDs, *rhs);
add_edge(rhs->start, rhs->startDs, *rhs);
add_edge(rhs->startDs, rhs->startDs, *rhs);
for (auto pivot : pivots) {
assert(contains(*rhs_map, pivot));
add_edge(rhs->start, (*rhs_map)[pivot], *rhs);
}
pruneUseless(*rhs);
filterSplitMap(*rhs, rhs_map);
switch (base.kind) {
case NFA_PREFIX:
case NFA_INFIX:
rhs->kind = NFA_INFIX;
break;
case NFA_SUFFIX:
case NFA_OUTFIX:
rhs->kind = NFA_SUFFIX;
break;
case NFA_REV_PREFIX:
assert(0);
break;
}
}
/** \brief Fills \a succ with the common successors of the vertices in \a
* pivots. */
static
void findCommonSuccessors(const NGHolder &g, const vector<NFAVertex> &pivots,
vector<NFAVertex> &succ) {
assert(!pivots.empty());
// Note: for determinism, we must sort our successor sets by vertex_index.
set<NFAVertex, VertexIndexOrdering<NGHolder> > adj(g), adj_temp(g);
insert(&adj, adjacent_vertices(pivots.at(0), g));
for (auto it = pivots.begin() + 1, ite = pivots.end(); it != ite; ++it) {
NFAVertex pivot = *it;
adj_temp.clear();
for (auto v : adjacent_vertices_range(pivot, g)) {
if (contains(adj, v)) {
adj_temp.insert(v);
}
}
adj.swap(adj_temp);
}
succ.insert(succ.end(), adj.begin(), adj.end());
}
void splitGraph(const NGHolder &base, const vector<NFAVertex> &pivots,
NGHolder *lhs, ue2::unordered_map<NFAVertex, NFAVertex> *lhs_map,
NGHolder *rhs, ue2::unordered_map<NFAVertex, NFAVertex> *rhs_map) {
DEBUG_PRINTF("splitting graph at %zu vertices\n", pivots.size());
assert(!has_parallel_edge(base));
/* RHS pivots are built from the common set of successors of pivots. */
vector<NFAVertex> rhs_pivots;
findCommonSuccessors(base, pivots, rhs_pivots);
/* generate lhs */
splitLHS(base, pivots, rhs_pivots, lhs, lhs_map);
/* generate the rhs */
splitRHS(base, rhs_pivots, rhs, rhs_map);
assert(!has_parallel_edge(*lhs));
assert(!has_parallel_edge(*rhs));
}
void splitGraph(const NGHolder &base, NFAVertex pivot,
NGHolder *lhs, ue2::unordered_map<NFAVertex, NFAVertex> *lhs_map,
NGHolder *rhs, ue2::unordered_map<NFAVertex, NFAVertex> *rhs_map) {
vector<NFAVertex> pivots(1, pivot);
splitGraph(base, pivots, lhs, lhs_map, rhs, rhs_map);
}
} // namespace ue2

74
src/nfagraph/ng_split.h Normal file

@@ -0,0 +1,74 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Functions for splitting NFAGraphs into LHS and RHS.
*/
#ifndef NG_SPLIT_H
#define NG_SPLIT_H
#include <vector>
#include "ng_holder.h"
#include "util/ue2_containers.h"
namespace ue2 {
class NGHolder;
/** Note: pivot should be a vertex that dominates acceptEod. Vertices are
* allocated to the rhs if they are reachable from the pivot. Conversely, a
* vertex is in the lhs if it is reachable from start without going through
* the pivot. The pivot ends up in the LHS and any adjacent vertices in the RHS.
*
* When multiple split vertices are provided:
* - RHS contains all vertices reachable from every pivot
* - LHS contains all vertices which are reachable from start ignoring any
* vertices which have an edge to every pivot
*/
void splitGraph(const NGHolder &base, NFAVertex pivot, NGHolder *lhs,
ue2::unordered_map<NFAVertex, NFAVertex> *lhs_map,
NGHolder *rhs,
ue2::unordered_map<NFAVertex, NFAVertex> *rhs_map);
void splitGraph(const NGHolder &base, const std::vector<NFAVertex> &pivots,
NGHolder *lhs,
ue2::unordered_map<NFAVertex, NFAVertex> *lhs_map,
NGHolder *rhs,
ue2::unordered_map<NFAVertex, NFAVertex> *rhs_map);
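/* Usage sketch (illustrative only; 'g' and 'pivot' are assumed to exist):
*
*     NGHolder lhs, rhs;
*     ue2::unordered_map<NFAVertex, NFAVertex> lhs_map, rhs_map;
*     splitGraph(g, pivot, &lhs, &lhs_map, &rhs, &rhs_map);
*
* The maps take vertices of the original graph to their clones in lhs/rhs;
* entries for vertices pruned from a side are dropped from that side's map.
*/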
void splitLHS(const NGHolder &base, NFAVertex pivot, NGHolder *lhs,
ue2::unordered_map<NFAVertex, NFAVertex> *lhs_map);
void splitRHS(const NGHolder &base, const std::vector<NFAVertex> &pivots,
NGHolder *rhs, ue2::unordered_map<NFAVertex, NFAVertex> *rhs_map);
} // namespace ue2
#endif // NG_SPLIT_H

655
src/nfagraph/ng_squash.cpp Normal file

@@ -0,0 +1,655 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph state squashing analysis.
*
* The basic idea behind the state squashing is that when we are in a cyclic
* state v there are certain other states which are completely irrelevant. This
* is used primarily by the determinisation process to produce smaller DFAs by
* not tracking irrelevant states. It's also used by the LimEx NFA model.
*
* Working out which states we can ignore mainly uses the post-dominator
* analysis.
*
* ### Dot Squash Masks:
*
* The following vertices are added to the squash mask:
* - (1) Any vertex post-dominated by the cyclic dot state
* - (2) Any other vertex post-dominated by the cyclic dot state's successors
* - (3) Any vertex post-dominated by a predecessor of the cyclic dot state -
* provided the predecessor's successors are a subset of the cyclic state's
* successors [For (3), the term successor also includes report information]
*
* (2) and (3) allow us to get squash masks from .* as well as .+
*
* The squash masks are not optimal, especially in the case where there are
* alternations on both sides - for example in:
*
* /foo(bar|baz).*(abc|xyz)/s
*
* 'foo' is irrelevant once the dot star is hit, but it has no post-dominators
* so isn't picked up ('bar' and 'baz' are picked up by (2)). We may be able to
* do a more complete analysis based on cutting the graph and seeing which
* vertices are unreachable but the current approach is quick and probably
* adequate.
*
*
* ### Non-Dot Squash Masks:
*
* As for dot states. However, if anything in a pdom tree falls outside the
* character range of the cyclic state the whole pdom tree is ignored. Also when
* considering the predecessor's pdom tree it is necessary to verify that the
* predecessor's character reachability falls within that of the cyclic state.
*
* We could do better in this case by not throwing away the whole pdom tree -
* however the bits which we can keep are not clear from the pdom tree of the
* cyclic state - it probably can be based on the dom or pdom tree of the bad
* vertex.
*
* An example of us doing badly is:
*
* /HTTP.*Referer[^\n]*google/s
*
* as '[^\n]*' doesn't get a squash mask at all due to .* but we should be able
* to squash 'Referer'.
*
* ### Extension:
*
* If a state leads solely to a squashable state (or its immediate successors)
* with the same reachability we can make this state a squash state for any of
* the original state's squashees which we postdominate. Could probably tighten
* this up but it would require thought. May not need to keep the original
* squasher around but that would also require thought.
*
* ### SOM Notes:
*
* If (left) start of match is required, it is illegal to squash any state which
* may result in an early start of match reaching the squashing state.
*/
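/* How the masks are consumed (rough sketch, not defined in this file): a
* client tracking a set of live states would typically intersect it with the
* mask of every live squasher,
*
*     for each (v, mask) in the squash map:
*         if (live.test(index of v)) { live &= mask; }
*
* since each mask records the states that remain relevant while v is on.
*/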
#include "config.h"
#include "ng_squash.h"
#include "ng_dominators.h"
#include "ng_dump.h"
#include "ng_holder.h"
#include "ng_prune.h"
#include "ng_region.h"
#include "ng_restructuring.h"
#include "ng_som_util.h"
#include "ng_util.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/report_manager.h"
#include "ue2common.h"
#include <deque>
#include <map>
#include <boost/graph/depth_first_search.hpp>
#include <boost/graph/reverse_graph.hpp>
using namespace std;
namespace ue2 {
typedef ue2::unordered_map<NFAVertex,
ue2::unordered_set<NFAVertex> > PostDomTree;
static
void buildPDomTree(const NGHolder &g, PostDomTree &tree) {
ue2::unordered_map<NFAVertex, NFAVertex> postdominators =
findPostDominators(g);
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
NFAVertex pdom = postdominators[v];
if (pdom) {
DEBUG_PRINTF("vertex %u -> %u\n", g[pdom].index,
g[v].index);
tree[pdom].insert(v);
}
}
}
/**
* Builds a squash mask based on the pdom tree of v and the given char reach.
* The built squash mask is a bit conservative for non-dot cases and could
* be improved with a bit of thought.
*/
static
void buildSquashMask(NFAStateSet &mask, const NGHolder &g, NFAVertex v,
const CharReach &cr, const NFAStateSet &init,
const vector<NFAVertex> &vByIndex, const PostDomTree &tree,
som_type som, const vector<DepthMinMax> &som_depths,
const ue2::unordered_map<NFAVertex, u32> &region_map,
smgb_cache &cache) {
DEBUG_PRINTF("build base squash mask for vertex %u\n",
g[v].index);
vector<NFAVertex> q;
PostDomTree::const_iterator it = tree.find(v);
if (it != tree.end()) {
q.insert(q.end(), it->second.begin(), it->second.end());
}
const u32 v_index = g[v].index;
while (!q.empty()) {
NFAVertex u = q.back();
q.pop_back();
const CharReach &cru = g[u].char_reach;
if ((cru & ~cr).any()) {
/* bail: bad cr on vertex u */
/* TODO: this could be better
*
* we still need to ensure that we record any paths leading to u.
* Hence all vertices R which can reach u must be excluded from the
* squash mask. Note: R != pdom(u) and there may exist an x in (R -
* pdom(u)) which is in pdom(y) where y is in q. Clear ?
*/
mask.set();
return;
}
const u32 u_index = g[u].index;
if (som) {
/* We cannot add a state u to the squash mask of v if it may have an
* earlier start of match offset, i.e. for us to add a state u to v's
* squash mask we require maxSomDist(u) <= minSomDist(v)
*/
const depth &max_som_dist_u = som_depths[u_index].max;
const depth &min_som_dist_v = som_depths[v_index].min;
if (max_som_dist_u.is_infinite()) {
/* it is hard to tell due to the INF if u can actually store an
* earlier SOM than w (state we are building the squash mask
* for) - need to think more deeply
*/
if (mustBeSetBefore(u, v, g, cache)
&& !somMayGoBackwards(u, g, region_map, cache)) {
DEBUG_PRINTF("u %u v %u\n", u_index, v_index);
goto squash_ok;
}
}
if (max_som_dist_u > min_som_dist_v) {
/* u can't be squashed as it may be storing an earlier SOM */
goto add_children_to_queue;
}
}
squash_ok:
mask.set(u_index);
DEBUG_PRINTF("pdom'ed %u\n", u_index);
add_children_to_queue:
it = tree.find(u);
if (it != tree.end()) {
q.insert(q.end(), it->second.begin(), it->second.end());
}
}
if (cr.all()) {
/* the init states aren't in the pdom tree. If all their succ states
* are set (or v), we can consider them post dominated */
/* Note: init states will always result in a later som */
for (size_t i = init.find_first(); i != init.npos;
i = init.find_next(i)) {
/* Yes vacuous patterns do exist */
NFAVertex iv = vByIndex[i];
for (auto w : adjacent_vertices_range(iv, g)) {
if (w == g.accept || w == g.acceptEod) {
DEBUG_PRINTF("skipping %zu due to vacuous accept\n", i);
goto next_init_state;
}
u32 vert_id = g[w].index;
if (w != iv && w != v && !mask.test(vert_id)) {
DEBUG_PRINTF("skipping %zu due to %u\n", i, vert_id);
goto next_init_state;
}
}
DEBUG_PRINTF("pdom'ed %zu\n", i);
mask.set(i);
next_init_state:;
}
}
mask.flip();
}
static
void buildSucc(NFAStateSet &succ, const NGHolder &g, NFAVertex v) {
for (auto w : adjacent_vertices_range(v, g)) {
if (!is_special(w, g)) {
succ.set(g[w].index);
}
}
}
static
void buildPred(NFAStateSet &pred, const NGHolder &g, NFAVertex v) {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (!is_special(u, g)) {
pred.set(g[u].index);
}
}
}
static
void findDerivedSquashers(const NGHolder &g, const vector<NFAVertex> &vByIndex,
const PostDomTree &pdom_tree, const NFAStateSet &init,
map<NFAVertex, NFAStateSet> *squash, som_type som,
const vector<DepthMinMax> &som_depths,
const ue2::unordered_map<NFAVertex, u32> &region_map,
smgb_cache &cache) {
deque<NFAVertex> remaining;
for (const auto &m : *squash) {
remaining.push_back(m.first);
}
while (!remaining.empty()) {
NFAVertex v = remaining.back();
remaining.pop_back();
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (is_special(u, g)) {
continue;
}
if (g[v].char_reach != g[u].char_reach) {
continue;
}
if (out_degree(u, g) != 1) {
continue;
}
NFAStateSet u_squash(init.size());
u32 u_index = g[u].index;
buildSquashMask(u_squash, g, u, g[u].char_reach, init, vByIndex,
pdom_tree, som, som_depths, region_map, cache);
u_squash.set(u_index); /* never clear ourselves */
if ((~u_squash).any()) { // i.e. some bits unset in mask
DEBUG_PRINTF("%u is an upstream squasher of %u\n", u_index,
g[v].index);
(*squash)[u] = u_squash;
remaining.push_back(u);
}
}
}
}
map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
map<NFAVertex, NFAStateSet> squash;
// Number of bits to use for all our masks. If we're a triggered graph,
// tops have already been assigned, so we don't have to account for them.
const u32 numStates = num_vertices(g);
// Build post-dominator tree.
PostDomTree pdom_tree;
buildPDomTree(g, pdom_tree);
// Build list of vertices by state ID and a set of init states.
vector<NFAVertex> vByIndex(numStates, NFAGraph::null_vertex());
NFAStateSet initStates(numStates);
smgb_cache cache(g);
// Mappings used for SOM mode calculations, otherwise left empty.
unordered_map<NFAVertex, u32> region_map;
vector<DepthMinMax> som_depths;
if (som) {
region_map = assignRegions(g);
som_depths = getDistancesFromSOM(g);
}
for (auto v : vertices_range(g)) {
const u32 vert_id = g[v].index;
DEBUG_PRINTF("vertex %u/%u\n", vert_id, numStates);
assert(vert_id < numStates);
vByIndex[vert_id] = v;
if (is_any_start(v, g) || !in_degree(v, g)) {
initStates.set(vert_id);
}
}
for (u32 i = 0; i < numStates; i++) {
NFAVertex v = vByIndex[i];
assert(v != NFAGraph::null_vertex());
const CharReach &cr = g[v].char_reach;
/* only non-init cyclics can be squashers */
if (!hasSelfLoop(v, g) || initStates.test(i)) {
continue;
}
DEBUG_PRINTF("state %u is cyclic\n", i);
NFAStateSet mask(numStates), succ(numStates), pred(numStates);
buildSquashMask(mask, g, v, cr, initStates, vByIndex, pdom_tree, som,
som_depths, region_map, cache);
buildSucc(succ, g, v);
buildPred(pred, g, v);
const auto &reports = g[v].reports;
for (size_t j = succ.find_first(); j != succ.npos;
j = succ.find_next(j)) {
NFAVertex vj = vByIndex[j];
NFAStateSet pred2(numStates);
buildPred(pred2, g, vj);
if (pred2 == pred) {
DEBUG_PRINTF("adding the sm from %zu to %u's sm\n", j, i);
NFAStateSet tmp(numStates);
buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
som, som_depths, region_map, cache);
mask &= tmp;
}
}
for (size_t j = pred.find_first(); j != pred.npos;
j = pred.find_next(j)) {
NFAVertex vj = vByIndex[j];
NFAStateSet succ2(numStates);
buildSucc(succ2, g, vj);
/* we can use j as a basis for squashing if its succs are a subset
* of ours */
if ((succ2 & ~succ).any()) {
continue;
}
if (som) {
/* We cannot use j to add to the squash mask of v if it may
* have an earlier start of match offset, i.e. for us to use j as a
* basis for the squash mask of v we require:
* maxSomDist(j) <= minSomDist(v)
*/
/* ** TODO ** */
const depth &max_som_dist_j =
som_depths[g[vj].index].max;
const depth &min_som_dist_v =
som_depths[g[v].index].min;
if (max_som_dist_j > min_som_dist_v ||
max_som_dist_j.is_infinite()) {
/* j can't be used as it may be storing an earlier SOM */
continue;
}
}
const CharReach &crv = g[vj].char_reach;
/* we also require that j's report information be a subset of ours
*/
bool seen_special = false;
for (auto w : adjacent_vertices_range(vj, g)) {
if (is_special(w, g)) {
if (!edge(v, w, g).second) {
goto next_j;
}
seen_special = true;
}
}
// FIXME: should be subset check?
if (seen_special && g[vj].reports != reports) {
continue;
}
/* ok we can use j */
if ((crv & ~cr).none()) {
NFAStateSet tmp(numStates);
buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
som, som_depths, region_map, cache);
mask &= tmp;
mask.reset(j);
}
next_j:;
}
mask.set(i); /* never clear ourselves */
if ((~mask).any()) { // i.e. some bits unset in mask
DEBUG_PRINTF("%u squashes %zu other states\n", i, (~mask).count());
squash.emplace(v, mask);
}
}
findDerivedSquashers(g, vByIndex, pdom_tree, initStates, &squash, som,
som_depths, region_map, cache);
return squash;
}
#define MIN_PURE_ACYCLIC_SQUASH 10 /** magic number */
/** Some squash states are clearly not advantageous in the NFA, as they still
* incur the cost of an exception for little benefit:
* -# acyclic states
* -# states that squash only a few acyclic states
*/
void filterSquashers(const NGHolder &g,
map<NFAVertex, NFAStateSet> &squash) {
DEBUG_PRINTF("filtering\n");
map<u32, NFAVertex> rev; /* vertex_index -> vertex */
for (auto v : vertices_range(g)) {
rev[g[v].index] = v;
}
for (auto v : vertices_range(g)) {
if (!contains(squash, v)) {
continue;
}
DEBUG_PRINTF("looking at squash set for vertex %u\n",
g[v].index);
if (!hasSelfLoop(v, g)) {
DEBUG_PRINTF("acyclic\n");
squash.erase(v);
continue;
}
NFAStateSet squashed = squash[v];
squashed.flip(); /* default sense for mask of survivors */
for (NFAStateSet::size_type sq = squashed.find_first();
sq != squashed.npos; sq = squashed.find_next(sq)) {
NFAVertex u = rev[sq];
if (hasSelfLoop(u, g)) {
DEBUG_PRINTF("squashing a cyclic (%zu) is always good\n", sq);
goto next_vertex;
}
}
if (squashed.count() < MIN_PURE_ACYCLIC_SQUASH) {
DEBUG_PRINTF("squash set too small\n");
squash.erase(v);
continue;
}
next_vertex:;
DEBUG_PRINTF("squash set ok\n");
}
}
static
void getHighlanderReporters(const NGHolder &g, const NFAVertex accept,
const ReportManager &rm,
set<NFAVertex> &verts) {
for (auto v : inv_adjacent_vertices_range(accept, g)) {
if (v == g.accept) {
continue;
}
const auto &reports = g[v].reports;
if (reports.empty()) {
assert(0);
continue;
}
// Must be _all_ highlander callback reports.
for (auto report : reports) {
const Report &ir = rm.getReport(report);
if (ir.ekey == INVALID_EKEY || ir.type != EXTERNAL_CALLBACK) {
goto next_vertex;
}
// If there are any bounds, these are handled outside the NFA and
// probably shouldn't be pre-empted.
if (ir.hasBounds()) {
goto next_vertex;
}
}
verts.insert(v);
next_vertex:
continue;
}
}
static
void removeEdgesToAccept(NGHolder &g, NFAVertex v) {
const auto &reports = g[v].reports;
assert(!reports.empty());
// We remove any accept edge with a non-empty subset of the reports of v.
set<NFAEdge> dead;
for (const auto &e : in_edges_range(g.accept, g)) {
NFAVertex u = source(e, g);
const auto &r = g[u].reports;
if (!r.empty() && is_subset_of(r, reports)) {
DEBUG_PRINTF("vertex %u\n", g[u].index);
dead.insert(e);
}
}
for (const auto &e : in_edges_range(g.acceptEod, g)) {
NFAVertex u = source(e, g);
const auto &r = g[u].reports;
if (!r.empty() && is_subset_of(r, reports)) {
DEBUG_PRINTF("vertex %u\n", g[u].index);
dead.insert(e);
}
}
assert(!dead.empty());
remove_edges(dead, g);
}
static
vector<NFAVertex> findUnreachable(const NGHolder &g) {
const boost::reverse_graph<NFAGraph, const NFAGraph &> revg(g.g);
ue2::unordered_map<NFAVertex, boost::default_color_type> colours;
colours.reserve(num_vertices(g));
depth_first_visit(revg, g.acceptEod,
make_dfs_visitor(boost::null_visitor()),
make_assoc_property_map(colours));
// Unreachable vertices are not in the colour map.
vector<NFAVertex> unreach;
for (auto v : vertices_range(revg)) {
if (!contains(colours, v)) {
unreach.push_back(v);
}
}
return unreach;
}
/** Populates squash masks for states that can be switched off by highlander
* (single match) reporters. */
map<NFAVertex, NFAStateSet>
findHighlanderSquashers(const NGHolder &g, const ReportManager &rm) {
map<NFAVertex, NFAStateSet> squash;
set<NFAVertex> verts;
getHighlanderReporters(g, g.accept, rm, verts);
getHighlanderReporters(g, g.acceptEod, rm, verts);
if (verts.empty()) {
DEBUG_PRINTF("no highlander reports\n");
return squash;
}
const u32 numStates = num_vertices(g);
for (auto v : verts) {
DEBUG_PRINTF("vertex %u with %zu reports\n", g[v].index,
g[v].reports.size());
// Find the set of vertices that lead to v or any other reporter with a
// subset of v's reports. We do this by creating a copy of the graph,
// cutting the appropriate out-edges to accept and seeing which
// vertices become unreachable.
ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
NGHolder h;
cloneHolder(h, g, &orig_to_copy);
removeEdgesToAccept(h, orig_to_copy[v]);
vector<NFAVertex> unreach = findUnreachable(h);
DEBUG_PRINTF("can squash %zu vertices\n", unreach.size());
if (unreach.empty()) {
continue;
}
if (!contains(squash, v)) {
squash[v] = NFAStateSet(numStates);
squash[v].set();
}
NFAStateSet &mask = squash[v];
for (auto uv : unreach) {
DEBUG_PRINTF("squashes index %u\n", h[uv].index);
mask.reset(h[uv].index);
}
}
return squash;
}
} // namespace ue2

71
src/nfagraph/ng_squash.h Normal file

@@ -0,0 +1,71 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph state squashing analysis.
*/
#ifndef NG_SQUASH_H
#define NG_SQUASH_H
#include "ng_holder.h"
#include "som/som.h"
#include "ue2common.h"
#include "util/ue2_containers.h"
#include <map>
#include <boost/dynamic_bitset.hpp>
namespace ue2 {
class NGHolder;
class ReportManager;
/** Dynamically-sized bitset, as an NFA can have an arbitrary number of states. */
typedef boost::dynamic_bitset<> NFAStateSet;
/**
* Populates the squash mask for each vertex (i.e. the set of states to be left
* on during squashing).
*
* The NFAStateSet in the output map is indexed by vertex_index.
*/
std::map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g,
som_type som = SOM_NONE);
/** Filters out squash states intended only for use in DFA construction. */
void filterSquashers(const NGHolder &g,
std::map<NFAVertex, NFAStateSet> &squash);
/** Populates squash masks for states that can be switched off by highlander
* (single match) reporters. */
std::map<NFAVertex, NFAStateSet>
findHighlanderSquashers(const NGHolder &g, const ReportManager &rm);
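/* Usage sketch (illustrative only; 'g', 'rm' and 'som' are assumed to exist):
*
*     std::map<NFAVertex, NFAStateSet> squash = findSquashers(g, som);
*     filterSquashers(g, squash);                   // drop unprofitable masks
*     auto hl_squash = findHighlanderSquashers(g, rm);
*
* Each mask is indexed by vertex_index; a cleared bit marks a state that may
* be switched off while the key vertex is active.
*/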
} // namespace ue2
#endif // NG_SQUASH_H

190
src/nfagraph/ng_stop.cpp Normal file

@@ -0,0 +1,190 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Stop Alphabet calculation.
*/
#include "ng_stop.h"
#include "ng_depth.h"
#include "ng_holder.h"
#include "ng_misc_opt.h"
#include "ng_util.h"
#include "ue2common.h"
#include "nfa/castlecompile.h"
#include "som/som.h"
#include "util/charreach.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph.h"
#include "util/graph_range.h"
#include "util/verify_types.h"
#include <map>
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
/** Stop alphabet depth threshold. */
static const u32 MAX_STOP_DEPTH = 8;
namespace {
/** Depths from start, startDs for this graph. */
struct InitDepths {
explicit InitDepths(const NGHolder &g) {
calcDepthsFrom(g, g.start, start);
calcDepthsFrom(g, g.startDs, startDs);
}
depth maxDist(const NGHolder &g, NFAVertex v) const {
u32 idx = g[v].index;
assert(idx < start.size() && idx < startDs.size());
const depth &d_start = start.at(idx).max;
const depth &d_startDs = startDs.at(idx).max;
if (d_start.is_unreachable()) {
return d_startDs;
} else if (d_startDs.is_unreachable()) {
return d_start;
}
return max(d_start, d_startDs);
}
private:
vector<DepthMinMax> start;
vector<DepthMinMax> startDs;
};
} // namespace
/** Find the set of characters that are not present in the reachability of
* graph \p g after a certain depth (currently 8). If a character in this set
* is encountered, it means that the NFA is either dead or has not progressed
* more than 8 characters from its start states. */
CharReach findStopAlphabet(const NGHolder &g, som_type som) {
const depth max_depth(MAX_STOP_DEPTH);
const InitDepths depths(g);
const map<NFAVertex, BoundedRepeatSummary> no_vertices;
CharReach stopcr;
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
if (depths.maxDist(g, v) >= max_depth) {
if (som == SOM_NONE) {
stopcr |= reduced_cr(v, g, no_vertices);
} else {
stopcr |= g[v].char_reach;
}
}
}
// Turn alphabet into stops.
stopcr.flip();
return stopcr;
}
/** Calculate the stop alphabet for each depth from 0 to MAX_STOP_DEPTH. Then
* build an eight-bit mask per character C, with each bit representing the
* depth before the location of character C (if encountered) that the NFA would
* be in a predictable start state. */
vector<u8> findLeftOffsetStopAlphabet(const NGHolder &g, som_type som) {
const depth max_depth(MAX_STOP_DEPTH);
const InitDepths depths(g);
const map<NFAVertex, BoundedRepeatSummary> no_vertices;
vector<CharReach> reach(MAX_STOP_DEPTH);
for (auto v : vertices_range(g)) {
if (is_special(v, g)) {
continue;
}
CharReach v_cr;
if (som == SOM_NONE) {
v_cr = reduced_cr(v, g, no_vertices);
} else {
v_cr = g[v].char_reach;
}
u32 d = min(max_depth, depths.maxDist(g, v));
for (u32 i = 0; i < d; i++) {
reach[i] |= v_cr;
}
}
#ifdef DEBUG
for (u32 i = 0; i < MAX_STOP_DEPTH; i++) {
DEBUG_PRINTF("depth %u, stop chars: ", i);
describeClass(stdout, ~reach[i], 20, CC_OUT_TEXT);
printf("\n");
}
#endif
vector<u8> stop(N_CHARS, 0);
for (u32 i = 0; i < MAX_STOP_DEPTH; i++) {
CharReach cr = ~reach[i]; // invert reach for stop chars.
const u8 mask = 1U << i;
for (size_t c = cr.find_first(); c != cr.npos; c = cr.find_next(c)) {
stop[c] |= mask;
}
}
return stop;
}
vector<u8> findLeftOffsetStopAlphabet(const CastleProto &castle,
UNUSED som_type som) {
const depth max_width = findMaxWidth(castle);
DEBUG_PRINTF("castle has reach %s and max width %s\n",
describeClass(castle.reach()).c_str(),
max_width.str().c_str());
const CharReach escape = ~castle.reach(); // invert reach for stop chars.
u32 d = min(max_width, depth(MAX_STOP_DEPTH));
const u8 mask = verify_u8((1U << d) - 1);
vector<u8> stop(N_CHARS, 0);
for (size_t c = escape.find_first(); c != escape.npos;
c = escape.find_next(c)) {
stop[c] |= mask;
}
return stop;
}
} // namespace ue2

62
src/nfagraph/ng_stop.h Normal file

@@ -0,0 +1,62 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Stop Alphabet calculation.
*/
#ifndef NG_STOP_H
#define NG_STOP_H
#include "ue2common.h"
#include "som/som.h"
#include <vector>
namespace ue2 {
struct CastleProto;
class CharReach;
class NGHolder;
/** Find the set of characters that are not present in the reachability of
* graph \p g after a certain depth (currently 8). If a character in this set
* is encountered, it means that the NFA is either dead or has not progressed
* more than 8 characters from its start states. */
CharReach findStopAlphabet(const NGHolder &g, som_type som);
/** Calculate the stop alphabet for each depth from 0 to MAX_STOP_DEPTH. Then
* build an eight-bit mask per character C, with each bit representing the
* depth before the location of character C (if encountered) that the NFA would
* be in a predictable start state. */
std::vector<u8> findLeftOffsetStopAlphabet(const NGHolder &g, som_type som);
std::vector<u8> findLeftOffsetStopAlphabet(const CastleProto &c, som_type som);
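/* Usage sketch (illustrative only):
*
*     CharReach stop = findStopAlphabet(g, SOM_NONE);
*     // stop.test(c): seeing byte c means the NFA is dead or has not
*     // progressed more than MAX_STOP_DEPTH characters from its starts.
*
*     std::vector<u8> per_depth = findLeftOffsetStopAlphabet(g, SOM_NONE);
*     // bit i of per_depth[c] is set iff c is a stop character at depth i.
*/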
} // namespace ue2
#endif


@@ -0,0 +1,614 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph merging ("uncalc")
*
* The file contains our collection of NFA graph merging strategies.
*
* NFAGraph merging is generally guided by the length of the common prefix
* between NFAGraph pairs.
*/
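/* Sketch of how the prefix length feeds merging (illustrative; the concrete
* merge heuristics appear later in this file): candidate graph pairs are
* scored with commonPrefixLength() and pairs sharing a longer common prefix
* are preferred for merging, with FAST_STATE_LIMIT below bounding how large a
* merged NFA is allowed to grow.
*/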
#include "grey.h"
#include "ng_holder.h"
#include "ng_limex.h"
#include "ng_redundancy.h"
#include "ng_region.h"
#include "ng_restructuring.h"
#include "ng_uncalc_components.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/graph_range.h"
#include "util/ue2string.h"
#include <algorithm>
#include <deque>
#include <map>
#include <queue>
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
static const u32 FAST_STATE_LIMIT = 256; /**< largest possible desirable NFA */
/** Sentinel value meaning no component has yet been selected. */
static const u32 NO_COMPONENT = 0xffffffffu;
static
vector<NFAVertex> getSortedVA(const NGHolder &g,
const ue2::unordered_map<NFAVertex, u32> &state_ids) {
vector<NFAVertex> out;
out.reserve(num_vertices(g));
for (auto v : vertices_range(g)) {
assert(contains(state_ids, v));
if (state_ids.at(v) == NO_STATE) {
continue;
}
out.push_back(v);
}
// Order vertices by their state indices.
sort(begin(out), end(out), [&state_ids](NFAVertex a, NFAVertex b) {
return state_ids.at(a) < state_ids.at(b);
});
#ifndef NDEBUG
// State indices should match vector indices.
for (u32 i = 0; i < out.size(); i++) {
assert(state_ids.at(out.at(i)) == i);
}
#endif
return out;
}
static never_inline
bool cplVerticesMatch(const NGHolder &ga, NFAVertex va,
const NGHolder &gb, NFAVertex vb) {
// Must have the same reachability.
if (ga[va].char_reach != gb[vb].char_reach) {
return false;
}
// If they're start vertices, they must be the same one.
if (is_any_start(va, ga) || is_any_start(vb, gb)) {
if (ga[va].index != gb[vb].index) {
return false;
}
}
bool va_accept = edge(va, ga.accept, ga).second;
bool vb_accept = edge(vb, gb.accept, gb).second;
bool va_acceptEod = edge(va, ga.acceptEod, ga).second;
bool vb_acceptEod = edge(vb, gb.acceptEod, gb).second;
// Must have the same accept/acceptEod edges.
if (va_accept != vb_accept || va_acceptEod != vb_acceptEod) {
return false;
}
return true;
}
static never_inline
u32 cplCommonReachAndSimple(const NGHolder &ga, const vector<NFAVertex> &a,
const NGHolder &gb, const vector<NFAVertex> &b) {
u32 ml = min(a.size(), b.size());
if (ml > 65535) {
ml = 65535;
}
// Count the number of common vertices which share reachability, report and
// "startedness" properties.
u32 max = 0;
for (; max < ml; max++) {
if (!cplVerticesMatch(ga, a[max], gb, b[max])) {
break;
}
}
return max;
}
u32 commonPrefixLength(const NGHolder &ga,
const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
const NGHolder &gb,
const ue2::unordered_map<NFAVertex, u32> &b_state_ids) {
vector<NFAVertex> a = getSortedVA(ga, a_state_ids);
vector<NFAVertex> b = getSortedVA(gb, b_state_ids);
/* upper bound on the common region based on local properties */
u32 max = cplCommonReachAndSimple(ga, a, gb, b);
DEBUG_PRINTF("cpl upper bound %u\n", max);
while (max > 0) {
bool ok = true;
/* shrink max region based on in-edges from outside the region */
for (size_t j = max; j > 0; j--) {
for (auto u : inv_adjacent_vertices_range(a[j - 1], ga)) {
u32 state_id = a_state_ids.at(u);
if (state_id != NO_STATE && state_id >= max) {
max = j - 1;
DEBUG_PRINTF("lowering max to %u\n", max);
goto next_vertex;
}
}
for (auto u : inv_adjacent_vertices_range(b[j - 1], gb)) {
u32 state_id = b_state_ids.at(u);
if (state_id != NO_STATE && state_id >= max) {
max = j - 1;
DEBUG_PRINTF("lowering max to %u\n", max);
goto next_vertex;
}
}
next_vertex:;
}
/* Ensure that every pair of vertices has the same out-edges to vertices in
the region. */
for (size_t i = 0; ok && i < max; i++) {
size_t a_count = 0;
size_t b_count = 0;
NFAGraph::out_edge_iterator ei, ee;
for (tie(ei, ee) = out_edges(a[i], ga); ok && ei != ee; ++ei) {
u32 sid = a_state_ids.at(target(*ei, ga));
if (sid == NO_STATE || sid >= max) {
continue;
}
a_count++;
NFAEdge b_edge;
bool has_b_edge;
tie(b_edge, has_b_edge) = edge(b[i], b[sid], gb);
if (!has_b_edge) {
max = i;
ok = false;
DEBUG_PRINTF("lowering max to %u due to edge %zu->%u\n",
max, i, sid);
break;
}
if (ga[*ei].top != gb[b_edge].top) {
max = i;
ok = false;
DEBUG_PRINTF("tops don't match on edge %zu->%u\n",
i, sid);
}
}
NFAGraph::adjacency_iterator ai, ae;
for (tie(ai, ae) = adjacent_vertices(b[i], gb); ok && ai != ae;
++ai) {
u32 sid = b_state_ids.at(*ai);
if (sid == NO_STATE || sid >= max) {
continue;
}
b_count++;
}
if (a_count != b_count) {
max = i;
DEBUG_PRINTF("lowering max to %u due to a,b count "
"(a_count=%zu, b_count=%zu)\n", max, a_count,
b_count);
ok = false;
}
}
if (ok) {
DEBUG_PRINTF("survived checks, returning cpl %u\n", max);
return max;
}
}
DEBUG_PRINTF("failed to find any common region\n");
return 0;
}
static never_inline
void mergeNfa(NGHolder &dest, vector<NFAVertex> &destStateMap,
ue2::unordered_map<NFAVertex, u32> &dest_state_ids,
NGHolder &vic, vector<NFAVertex> &vicStateMap,
size_t common_len) {
map<NFAVertex, NFAVertex> vmap; // vic -> dest
vmap[vic.start] = dest.start;
vmap[vic.startDs] = dest.startDs;
vmap[vic.accept] = dest.accept;
vmap[vic.acceptEod] = dest.acceptEod;
vmap[nullptr] = nullptr;
u32 stateNum = countStates(dest, dest_state_ids);
// For vertices in the common len, add to vmap and merge in the reports, if
// any.
for (u32 i = 0; i < common_len; i++) {
NFAVertex v_old = vicStateMap[i], v = destStateMap[i];
vmap[v_old] = v;
const auto &reports = vic[v_old].reports;
dest[v].reports.insert(reports.begin(), reports.end());
}
// Add in vertices beyond the common len, giving them state numbers
// starting at stateNum.
for (u32 i = common_len; i < vicStateMap.size(); i++) {
NFAVertex v_old = vicStateMap[i];
if (is_special(v_old, vic)) {
// Dest already has start vertices, just merge the reports.
u32 idx = vic[v_old].index;
NFAVertex v = dest.getSpecialVertex(idx);
const auto &reports = vic[v_old].reports;
dest[v].reports.insert(reports.begin(), reports.end());
continue;
}
NFAVertex v = add_vertex(vic[v_old], dest);
dest_state_ids[v] = stateNum++;
vmap[v_old] = v;
}
/* add edges */
DEBUG_PRINTF("common_len=%zu\n", common_len);
for (const auto &e : edges_range(vic)) {
NFAVertex u_old = source(e, vic), v_old = target(e, vic);
NFAVertex u = vmap[u_old], v = vmap[v_old];
bool uspecial = is_special(u, dest);
bool vspecial = is_special(v, dest);
// Skip stylised edges that are already present.
if (uspecial && vspecial && edge(u, v, dest).second) {
continue;
}
// We're in the common region if v's state ID is low enough, unless v
// is a special (an accept), in which case we use u's state ID.
assert(contains(dest_state_ids, v));
bool in_common_region = dest_state_ids.at(v) < common_len;
if (vspecial && dest_state_ids.at(u) < common_len) {
in_common_region = true;
}
DEBUG_PRINTF("adding idx=%u (state %u) -> idx=%u (state %u)%s\n",
dest[u].index, dest_state_ids.at(u),
dest[v].index, dest_state_ids.at(v),
in_common_region ? " [common]" : "");
if (in_common_region) {
if (!is_special(v, dest)) {
DEBUG_PRINTF("skipping common edge\n");
assert(edge(u, v, dest).second);
// Should never merge edges with different top values.
assert(vic[e].top == dest[edge(u, v, dest).first].top);
continue;
} else {
assert(is_any_accept(v, dest));
// If the edge exists in both graphs, skip it.
if (edge(u, v, dest).second) {
DEBUG_PRINTF("skipping common edge to accept\n");
continue;
}
}
}
assert(!edge(u, v, dest).second);
add_edge(u, v, vic[e], dest);
}
dest.renumberEdges();
dest.renumberVertices();
}
static never_inline
void mergeNfaComponent(NGHolder &pholder, NGHolder &vholder, size_t cpl) {
assert(&pholder != &vholder);
auto v_state_ids = numberStates(vholder);
auto p_state_ids = numberStates(pholder);
auto vhvmap = getSortedVA(vholder, v_state_ids);
auto phvmap = getSortedVA(pholder, p_state_ids);
mergeNfa(pholder, phvmap, p_state_ids, vholder, vhvmap, cpl);
}
namespace {
struct NfaMergeCandidateH {
NfaMergeCandidateH(size_t cpl_in, NGHolder *first_in, NGHolder *second_in,
u32 tb_in)
: cpl(cpl_in), first(first_in), second(second_in), tie_breaker(tb_in) {}
size_t cpl; //!< common prefix length
NGHolder *first; //!< first component to merge
NGHolder *second; //!< second component to merge
u32 tie_breaker; //!< for determinism
bool operator<(const NfaMergeCandidateH &other) const {
if (cpl != other.cpl) {
return cpl < other.cpl;
} else {
return tie_breaker < other.tie_breaker;
}
}
};
} // end namespace
/** Returns true if graphs \p ha and \p hb can (and should) be merged. */
static
bool shouldMerge(NGHolder &ha,
const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
NGHolder &hb,
const ue2::unordered_map<NFAVertex, u32> &b_state_ids,
size_t cpl, const ReportManager *rm,
const CompileContext &cc) {
size_t combinedStateCount =
countStates(ha, a_state_ids) + countStates(hb, b_state_ids) - cpl;
if (combinedStateCount > FAST_STATE_LIMIT) {
// More complex implementability check.
NGHolder h_temp;
cloneHolder(h_temp, ha);
assert(h_temp.kind == hb.kind);
mergeNfaComponent(h_temp, hb, cpl);
reduceImplementableGraph(h_temp, SOM_NONE, rm, cc);
u32 numStates = isImplementableNFA(h_temp, rm, cc);
DEBUG_PRINTF("isImplementableNFA returned %u states\n", numStates);
if (!numStates) {
DEBUG_PRINTF("not implementable\n");
return false;
} else if (numStates > FAST_STATE_LIMIT) {
DEBUG_PRINTF("too many states to merge\n");
return false;
}
}
return true;
}
/** Returns true if the graphs have start vertices that are compatible for
* merging. Rose may generate all sorts of wacky vacuous cases, and the merge
* code isn't currently up to handling them. */
static
bool compatibleStarts(const NGHolder &ga, const NGHolder &gb) {
// Start and startDs must have the same self-loops.
return (edge(ga.startDs, ga.startDs, ga).second ==
edge(gb.startDs, gb.startDs, gb).second) &&
(edge(ga.start, ga.start, ga).second ==
edge(gb.start, gb.start, gb).second);
}
static never_inline
void buildNfaMergeQueue(const vector<NGHolder *> &cluster,
priority_queue<NfaMergeCandidateH> *pq) {
const size_t cs = cluster.size();
assert(cs < NO_COMPONENT);
// First, make sure all holders have numbered states and collect their
// counts.
vector<ue2::unordered_map<NFAVertex, u32>> states_map(cs);
for (size_t i = 0; i < cs; i++) {
assert(cluster[i]);
NGHolder &g = *(cluster[i]);
states_map[i] = numberStates(g);
}
vector<u16> seen_cpl(cs * cs, 0);
vector<u32> best_comp(cs, NO_COMPONENT);
/* TODO: understand, explain */
for (u32 ci = 0; ci < cs; ci++) {
for (u32 cj = ci + 1; cj < cs; cj++) {
u16 cpl = 0;
bool calc = false;
if (best_comp[ci] != NO_COMPONENT) {
u32 bc = best_comp[ci];
if (seen_cpl[bc + cs * cj] < seen_cpl[bc + cs * ci]) {
cpl = seen_cpl[bc + cs * cj];
DEBUG_PRINTF("using cached cpl from %u %u\n", bc, cpl);
calc = true;
}
}
if (!calc && best_comp[cj] != NO_COMPONENT) {
u32 bc = best_comp[cj];
if (seen_cpl[bc + cs * ci] < seen_cpl[bc + cs * cj]) {
cpl = seen_cpl[bc + cs * ci];
DEBUG_PRINTF("using cached cpl from %u %u\n", bc, cpl);
calc = true;
}
}
NGHolder &g_i = *(cluster[ci]);
NGHolder &g_j = *(cluster[cj]);
if (!compatibleStarts(g_i, g_j)) {
continue;
}
if (!calc) {
cpl = commonPrefixLength(g_i, states_map[ci],
g_j, states_map[cj]);
}
seen_cpl[ci + cs * cj] = cpl;
seen_cpl[cj + cs * ci] = cpl;
if (best_comp[cj] == NO_COMPONENT
|| seen_cpl[best_comp[cj] + cs * cj] < cpl) {
best_comp[cj] = ci;
}
DEBUG_PRINTF("cpl %u %u = %u\n", ci, cj, cpl);
pq->push(NfaMergeCandidateH(cpl, cluster[ci], cluster[cj],
ci * cs + cj));
}
}
}
/** True if the graphs have compatible starts for merging, i.e. they are NOT
* both vacuous with different reports on the starts. */
static
bool mergeableStarts(const NGHolder &h1, const NGHolder &h2) {
bool vac1 = isVacuous(h1), vac2 = isVacuous(h2);
// Safety tests: reports should be empty on non-vacuous graphs.
if (!vac1) {
assert(h1[h1.start].reports.empty());
assert(h1[h1.startDs].reports.empty());
}
if (!vac2) {
assert(h2[h2.start].reports.empty());
assert(h2[h2.startDs].reports.empty());
}
if (vac1 && vac2) {
// Graphs must have the same reports on their starts to be mergeable
// (and top on start->accept).
if (h1[h1.start].reports
!= h2[h2.start].reports) {
return false;
}
if (h1[h1.startDs].reports
!= h2[h2.startDs].reports) {
return false;
}
pair<NFAEdge, bool> e1, e2;
e1 = edge(h1.start, h1.accept, h1);
e2 = edge(h2.start, h2.accept, h2);
if (e1.second || e2.second) {
if (e1.second && e2.second &&
h1[e1.first].top != h2[e2.first].top) {
return false;
}
}
e1 = edge(h1.start, h1.acceptEod, h1);
e2 = edge(h2.start, h2.acceptEod, h2);
if (e1.second || e2.second) {
if (e1.second && e2.second &&
h1[e1.first].top != h2[e2.first].top) {
return false;
}
}
}
return true;
}
/** Merge graph \p ga into graph \p gb. Returns false on failure. */
bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm,
const CompileContext &cc) {
assert(ga.kind == gb.kind);
auto a_state_ids = numberStates(ga);
auto b_state_ids = numberStates(gb);
// At the moment, since our vertices can only have one report ID each,
// we must ensure that our start vertices have the same report ID,
// otherwise they can't be merged. This happens in vacuous NFAs, used
// by Rose.
// XXX: the multi-top code has this limitation, too.
if (!mergeableStarts(ga, gb)) {
DEBUG_PRINTF("starts aren't mergeable\n");
return false;
}
// NOTE: states must be numbered already.
u32 cpl = commonPrefixLength(ga, a_state_ids, gb, b_state_ids);
if (!shouldMerge(gb, b_state_ids, ga, a_state_ids, cpl, rm, cc)) {
return false;
}
mergeNfaComponent(gb, ga, cpl);
reduceImplementableGraph(gb, SOM_NONE, rm, cc);
b_state_ids = numberStates(gb);
return true;
}
/** Merge the group of graphs in \p cluster where possible. The (from, to)
* mapping of merged graphs is returned in \p merged. */
void mergeNfaCluster(const vector<NGHolder *> &cluster,
const ReportManager *rm,
map<NGHolder *, NGHolder *> &merged,
const CompileContext &cc) {
if (cluster.size() < 2) {
return;
}
DEBUG_PRINTF("new cluster, size %zu\n", cluster.size());
merged.clear();
priority_queue<NfaMergeCandidateH> pq;
buildNfaMergeQueue(cluster, &pq);
while (!pq.empty()) {
NGHolder &pholder = *pq.top().first;
NGHolder &vholder = *pq.top().second;
pq.pop();
if (contains(merged, &pholder) || contains(merged, &vholder)) {
DEBUG_PRINTF("dead\n");
continue;
}
if (!mergeNfaPair(vholder, pholder, rm, cc)) {
DEBUG_PRINTF("merge failed\n");
continue;
}
merged.emplace(&vholder, &pholder);
// Seek closure.
for (auto &m : merged) {
if (m.second == &vholder) {
m.second = &pholder;
}
}
}
}
} // namespace ue2
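
To make the common-prefix heuristic used above concrete, here is a small
self-contained sketch (illustrative only; it uses a toy state type rather than
NGHolder) of the first stage, which bounds the common prefix by comparing
per-state local properties in state-index order, much as
cplCommonReachAndSimple does before the edge-based refinement.

#include <algorithm>
#include <bitset>
#include <cstddef>
#include <vector>

struct ToyState {
    std::bitset<256> reach;  // reachability of this state
    bool to_accept = false;  // whether it has an edge to accept
};

// Upper bound on the common prefix: count leading states (in index order)
// whose local properties match. The real pass then shrinks this bound by
// examining edges into and within the candidate region.
static size_t commonPrefixUpperBound(const std::vector<ToyState> &a,
                                     const std::vector<ToyState> &b) {
    size_t n = std::min(a.size(), b.size());
    size_t cpl = 0;
    while (cpl < n && a[cpl].reach == b[cpl].reach &&
           a[cpl].to_accept == b[cpl].to_accept) {
        cpl++;
    }
    return cpl;
}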

81
src/nfagraph/ng_uncalc_components.h Normal file
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief NFA graph merging ("uncalc")
*/
#ifndef NG_UNCALC_COMPONENTS_H
#define NG_UNCALC_COMPONENTS_H
#include <map>
#include <vector>
#include "nfagraph/ng_graph.h"
#include "util/ue2_containers.h"
namespace ue2 {
struct CompileContext;
struct Grey;
class NGHolder;
class ReportManager;
/**
* \brief Returns the common prefix length for a pair of graphs.
*
* The CPL is calculated based on the topological ordering given by the state
* indices for each graph.
*/
u32 commonPrefixLength(const NGHolder &ga,
const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
const NGHolder &gb,
const ue2::unordered_map<NFAVertex, u32> &b_state_ids);
/**
* \brief Merge the group of graphs in \p cluster where possible.
*
* The (from, to) mapping of merged graphs is returned in \p merged.
*/
void mergeNfaCluster(const std::vector<NGHolder *> &cluster,
const ReportManager *rm,
std::map<NGHolder *, NGHolder *> &merged,
const CompileContext &cc);
/**
* \brief Merge graph \p ga into graph \p gb.
*
* Returns false on failure. On success, \p gb is reduced via \ref
* reduceImplementableGraph and renumbered.
*/
bool mergeNfaPair(NGHolder &ga, NGHolder &gb, const ReportManager *rm,
const CompileContext &cc);
} // namespace ue2
#endif
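
A sketch of how a caller might drive the pair-merging entry point declared
above. This is illustrative only: it assumes two already-constructed graphs of
the same kind and an optional ReportManager, and does not show how those are
built; tryMergePair is a hypothetical wrapper, not part of this commit.

#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_uncalc_components.h"
#include "util/compile_context.h"

namespace ue2 {

// Attempt to fold `victim` into `dest`. On success, dest now implements the
// behaviour of both graphs and victim can be discarded by the caller.
static bool tryMergePair(NGHolder &victim, NGHolder &dest,
                         const ReportManager *rm, const CompileContext &cc) {
    if (victim.kind != dest.kind) {
        return false; // mergeNfaPair requires graphs of the same kind
    }
    return mergeNfaPair(victim, dest, rm, cc);
}

} // namespace ue2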

114
src/nfagraph/ng_undirected.h Normal file
@@ -0,0 +1,114 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Create an undirected graph from an NFAGraph.
*/
#ifndef NG_UNDIRECTED_H_CB42C71CF38E3D
#define NG_UNDIRECTED_H_CB42C71CF38E3D
#include "ng_holder.h"
#include "ng_util.h"
#include "ue2common.h"
#include "util/graph_range.h"
#include "util/ue2_containers.h"
namespace ue2 {
/**
* \brief BGL graph type for the undirected NFA graph.
*
* Note that we use a set for the out-edge lists: this avoids the construction
* of parallel edges. The only vertex property constructed is \a
* vertex_index_t.
*/
typedef boost::adjacency_list<boost::setS, // out edges
boost::listS, // vertices
boost::undirectedS, // graph is undirected
boost::property<boost::vertex_index_t, u32> >
NFAUndirectedGraph;
typedef NFAUndirectedGraph::vertex_descriptor NFAUndirectedVertex;
/**
* Make a copy of an NFAGraph with undirected edges, optionally excluding the
* start and/or accept vertices. Mappings from the original graph to the new
* one are provided.
*
* Note that new vertex indices are assigned contiguously in \a vertices(g) order.
*/
template <typename GraphT>
void createUnGraph(const GraphT &g,
bool excludeStarts,
bool excludeAccepts,
NFAUndirectedGraph &ug,
ue2::unordered_map<NFAVertex, NFAUndirectedVertex> &old2new,
ue2::unordered_map<u32, NFAVertex> &newIdx2old) {
u32 idx = 0;
for (auto v : ue2::vertices_range(g)) {
// skip all accept nodes
if (excludeAccepts && is_any_accept(v, g)) {
continue;
}
// skip starts if required
if (excludeStarts && is_any_start(v, g)) {
continue;
}
NFAUndirectedVertex nuv = boost::add_vertex(ug);
old2new[v] = nuv;
newIdx2old[idx] = v;
boost::put(boost::vertex_index, ug, nuv, idx++);
}
for (const auto &e : ue2::edges_range(g)) {
NFAVertex src = source(e, g);
NFAVertex targ = target(e, g);
if ((excludeAccepts && is_any_accept(src, g))
|| (excludeStarts && is_any_start(src, g))) {
continue;
}
if ((excludeAccepts && is_any_accept(targ, g))
|| (excludeStarts && is_any_start(targ, g))) {
continue;
}
NFAUndirectedVertex new_src = old2new[src];
NFAUndirectedVertex new_targ = old2new[targ];
boost::add_edge(new_src, new_targ, ug);
}
}
} // namespace ue2
#endif /* NG_UNDIRECTED_H_CB42C71CF38E3D */
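
One plausible consumer of this undirected view is connected-component
labelling via the BGL, relying on the contiguous vertex_index values that
createUnGraph assigns. This is an illustrative sketch only (the actual
component-splitting pass lives elsewhere in this commit); labelComponents is a
hypothetical helper.

#include "nfagraph/ng_undirected.h"

#include <boost/graph/connected_components.hpp>
#include <boost/property_map/property_map.hpp>

#include <vector>

namespace ue2 {

// Label each vertex of the undirected graph with a component id and return
// the number of components found.
static unsigned labelComponents(const NFAUndirectedGraph &ug,
                                std::vector<unsigned> &comp_out) {
    comp_out.assign(num_vertices(ug), 0);
    auto idx_map = boost::get(boost::vertex_index, ug);
    auto comp_map =
        boost::make_iterator_property_map(comp_out.begin(), idx_map);
    return boost::connected_components(ug, comp_map);
}

} // namespace ue2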

305
src/nfagraph/ng_utf8.cpp Normal file
@@ -0,0 +1,305 @@
/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief UTF-8 transforms and operations.
*/
#include "ng_utf8.h"
#include "ng.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/graph_range.h"
#include "util/unicode_def.h"
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
static
void allowIllegal(NGWrapper &w, NFAVertex v, u8 pred_char) {
if (in_degree(v, w) != 1) {
DEBUG_PRINTF("unexpected pred\n");
assert(0); /* should be true due to the early stage of this analysis */
return;
}
CharReach &cr = w[v].char_reach;
if (pred_char == 0xe0) {
assert(cr.isSubsetOf(CharReach(0xa0, 0xbf)));
if (cr == CharReach(0xa0, 0xbf)) {
cr |= CharReach(0x80, 0x9f);
}
} else if (pred_char == 0xf0) {
assert(cr.isSubsetOf(CharReach(0x90, 0xbf)));
if (cr == CharReach(0x90, 0xbf)) {
cr |= CharReach(0x80, 0x8f);
}
} else if (pred_char == 0xf4) {
assert(cr.isSubsetOf(CharReach(0x80, 0x8f)));
if (cr == CharReach(0x80, 0x8f)) {
cr |= CharReach(0x90, 0xbf);
}
} else {
assert(0); /* unexpected pred */
}
}
/** \brief Relax forbidden UTF-8 sequences.
*
* Some byte sequences cannot appear in valid UTF-8, as they either encode code
* points above \\x{10ffff} or represent overlong encodings. As we require valid
* UTF-8 input, we have no defined behaviour in these cases; as a result, we can
* accept them if it simplifies the graph. */
void relaxForbiddenUtf8(NGWrapper &w) {
if (!w.utf8) {
return;
}
const CharReach e0(0xe0);
const CharReach f0(0xf0);
const CharReach f4(0xf4);
for (auto v : vertices_range(w)) {
const CharReach &cr = w[v].char_reach;
if (cr == e0 || cr == f0 || cr == f4) {
u8 pred_char = cr.find_first();
for (auto t : adjacent_vertices_range(v, w)) {
allowIllegal(w, t, pred_char);
}
}
}
}
static
bool hasPredInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (contains(s, u)) {
return true;
}
}
return false;
}
static
bool hasSuccInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
for (auto w : adjacent_vertices_range(v, g)) {
if (contains(s, w)) {
return true;
}
}
return false;
}
static
void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) {
set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */
for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
const CharReach &cr = h[v].char_reach;
if (!isutf8ascii(cr) && !isutf8start(cr)) {
bad.insert(v);
}
}
for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) {
const CharReach &cr = h[v].char_reach;
if (!isutf8ascii(cr) && !isutf8start(cr)) {
bad.insert(v);
}
}
// We also want to be careful with asserts connected to starts,
// as they may not finish a code point.
for (auto v : vertices_range(h)) {
if (is_virtual_start(v, h)) {
bad.insert(v);
insert(&bad, adjacent_vertices(v, h));
}
}
/* We cannot handle vertices connected to accept, as they would report matches
* in the middle of codepoints. acceptEod is not a problem as the input must
* end at a codepoint boundary. */
bad.insert(h.accept);
// If we're in SOM mode, we don't want to mess with vertices that have a
// direct edge from startDs.
if (som) {
insert(&bad, adjacent_vertices(h.startDs, h));
}
set<NFAVertex> already_seeds; /* already marked as seeds */
for (auto v : vertices_range(h)) {
const CharReach &cr = h[v].char_reach;
if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) {
continue;
}
if (hasSuccInSet(h, v, bad)) {
continue;
}
// Skip vertices that are directly connected to other vertices already
// in the seeds list: we can't collapse two of these directly next to
// each other.
if (hasPredInSet(h, v, already_seeds) ||
hasSuccInSet(h, v, already_seeds)) {
continue;
}
DEBUG_PRINTF("%u is a seed\n", h[v].index);
seeds->push_back(v);
already_seeds.insert(v);
}
}
static
bool expandCyclic(NGHolder &h, NFAVertex v) {
DEBUG_PRINTF("inspecting %u\n", h[v].index);
bool changes = false;
set<NFAVertex> v_preds;
set<NFAVertex> v_succs;
pred(h, v, &v_preds);
succ(h, v, &v_succs);
set<NFAVertex> start_siblings;
set<NFAVertex> end_siblings;
CharReach &v_cr = h[v].char_reach;
/* We need to find start vertices which have all of our preds.
* As we have a self loop, it must be one of our succs. */
for (auto a : adjacent_vertices_range(v, h)) {
set<NFAVertex> a_preds;
pred(h, a, &a_preds);
if (a_preds == v_preds && isutf8start(h[a].char_reach)) {
DEBUG_PRINTF("%u is a start v\n", h[a].index);
start_siblings.insert(a);
}
}
/* We also need to find full cont vertices which have all our own succs;
* as we have a self loop, it must be one of our preds. */
for (auto a : inv_adjacent_vertices_range(v, h)) {
set<NFAVertex> a_succs;
succ(h, a, &a_succs);
if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) {
DEBUG_PRINTF("%u is a full tail cont\n", h[a].index);
end_siblings.insert(a);
}
}
for (auto s : start_siblings) {
if (out_degree(s, h) != 1) {
continue;
}
const CharReach &cr = h[s].char_reach;
if (cr.isSubsetOf(UTF_TWO_START_CR)) {
if (end_siblings.find(*adjacent_vertices(s, h).first)
== end_siblings.end()) {
DEBUG_PRINTF("%u is odd\n", h[s].index);
continue;
}
} else if (cr.isSubsetOf(UTF_THREE_START_CR)) {
NFAVertex m = *adjacent_vertices(s, h).first;
if (h[m].char_reach != UTF_CONT_CR
|| out_degree(m, h) != 1) {
continue;
}
if (end_siblings.find(*adjacent_vertices(m, h).first)
== end_siblings.end()) {
DEBUG_PRINTF("%u is odd\n", h[s].index);
continue;
}
} else if (cr.isSubsetOf(UTF_FOUR_START_CR)) {
NFAVertex m1 = *adjacent_vertices(s, h).first;
if (h[m1].char_reach != UTF_CONT_CR
|| out_degree(m1, h) != 1) {
continue;
}
NFAVertex m2 = *adjacent_vertices(m1, h).first;
if (h[m2].char_reach != UTF_CONT_CR
|| out_degree(m2, h) != 1) {
continue;
}
if (end_siblings.find(*adjacent_vertices(m2, h).first)
== end_siblings.end()) {
DEBUG_PRINTF("%u is odd\n", h[s].index);
continue;
}
} else {
DEBUG_PRINTF("%u is bad\n", h[s].index);
continue;
}
v_cr |= cr;
clear_vertex(s, h);
changes = true;
}
if (changes) {
v_cr |= UTF_CONT_CR; /* we need to add in cont reach */
v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require
* valid unicode data */
v_cr.set(0xc1);
v_cr |= CharReach(0xf5, 0xff);
}
return changes;
}
/** \brief Contract cycles of UTF-8 code points down to a single cyclic vertex
* where possible, based on the assumption that we will always be matching
* against well-formed input. */
void utf8DotRestoration(NGHolder &h, bool som) {
vector<NFAVertex> seeds; /* cyclic ascii vertices */
findSeeds(h, som, &seeds);
bool changes = false;
for (auto v : seeds) {
changes |= expandCyclic(h, v);
}
if (changes) {
pruneUseless(h);
}
}
} // namespace ue2
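
For reference, the byte classes that the passes above reason about (ASCII
bytes, two/three/four-byte lead bytes and continuation bytes) follow directly
from the UTF-8 encoding. A standalone classifier, purely for illustration and
independent of the CharReach constants used in this file:

#include <cstdint>

enum class Utf8ByteClass {
    ASCII,       // 0x00-0x7f: single-byte code point
    CONT,        // 0x80-0xbf: continuation byte
    TWO_START,   // 0xc0-0xdf: leads a two-byte sequence (0xc0/0xc1 are overlong)
    THREE_START, // 0xe0-0xef: leads a three-byte sequence
    FOUR_START,  // 0xf0-0xff: leads a four-byte sequence (0xf5-0xff is invalid)
};

static Utf8ByteClass classifyUtf8Byte(uint8_t c) {
    if (c < 0x80) {
        return Utf8ByteClass::ASCII;
    }
    if (c < 0xc0) {
        return Utf8ByteClass::CONT;
    }
    if (c < 0xe0) {
        return Utf8ByteClass::TWO_START;
    }
    if (c < 0xf0) {
        return Utf8ByteClass::THREE_START;
    }
    return Utf8ByteClass::FOUR_START;
}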

Some files were not shown because too many files have changed in this diff.