mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
ng_extparam: split up work and do per-comp reduce
This change breaks extparam processing up into:
- propagateExtendedParams: propagates min_length, min_offset and max_offset into the reports on the graph
- reduceExtendedParams: runs graph reductions based on extparams

Then, we apply the reduce pass to the whole graph, and later as well to each component after calc_components.
This commit is contained in:
parent
0a163b5535
commit
a871f70c25
@ -388,7 +388,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
|
||||
// fuzz graph - this must happen before any transformations are made
|
||||
make_fuzzy(*g, expr.edit_distance, cc.grey);
|
||||
|
||||
handleExtendedParams(rm, *g, expr, cc);
|
||||
propagateExtendedParams(*g, expr, rm);
|
||||
fillExpressionInfo(rm, *g, expr, &local_info);
|
||||
}
|
||||
catch (const CompileError &e) {
|
||||
|
@ -214,6 +214,7 @@ bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr,
|
||||
|
||||
assert(allMatchStatesHaveReports(g));
|
||||
|
||||
reduceExtendedParams(g, ng.rm, som);
|
||||
reduceGraph(g, som, expr.utf8, cc);
|
||||
|
||||
dumpComponent(g, "02_reduced", expr.index, comp_id, ng.cc.grey);
|
||||
@ -223,6 +224,13 @@ bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr,
|
||||
removeRegionRedundancy(g, som);
|
||||
}
|
||||
|
||||
// We might be done at this point: if we've run out of vertices, we can
|
||||
// stop processing.
|
||||
if (num_vertices(g) == N_SPECIALS) {
|
||||
DEBUG_PRINTF("all vertices claimed\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// "Short Exhaustible Passthrough" patterns always become outfixes.
|
||||
if (!som && isSEP(g, ng.rm, cc.grey)) {
|
||||
DEBUG_PRINTF("graph is SEP\n");
|
||||
@ -358,10 +366,22 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
|
||||
|
||||
optimiseVirtualStarts(g); /* good for som */
|
||||
|
||||
handleExtendedParams(rm, g, expr, cc);
|
||||
if (expr.min_length) {
|
||||
// We have a minimum length constraint, which we currently use SOM to
|
||||
// satisfy.
|
||||
propagateExtendedParams(g, expr, rm);
|
||||
reduceExtendedParams(g, rm, som);
|
||||
|
||||
// We may have removed all the edges to accept, in which case this
|
||||
// expression cannot match.
|
||||
if (can_never_match(g)) {
|
||||
throw CompileError(expr.index, "Extended parameter constraints can not "
|
||||
"be satisfied for any match from this "
|
||||
"expression.");
|
||||
}
|
||||
|
||||
if (any_of_in(all_reports(g), [&](ReportID id) {
|
||||
return rm.getReport(id).minLength;
|
||||
})) {
|
||||
// We have at least one report with a minimum length constraint, which
|
||||
// we currently use SOM to satisfy.
|
||||
som = SOM_LEFT;
|
||||
ssm.somPrecision(8);
|
||||
}
|
||||
@ -377,10 +397,16 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
|
||||
relaxForbiddenUtf8(g, expr);
|
||||
}
|
||||
|
||||
if (expr.highlander && !expr.min_length && !expr.min_offset) {
|
||||
if (all_of_in(all_reports(g), [&](ReportID id) {
|
||||
const auto &report = rm.getReport(id);
|
||||
return report.ekey != INVALID_EKEY && !report.minLength &&
|
||||
!report.minOffset;
|
||||
})) {
|
||||
// In highlander mode: if we don't have constraints on our reports that
|
||||
// may prevent us accepting our first match (i.e. extended params) we
|
||||
// can prune the other out-edges of all vertices connected to accept.
|
||||
// TODO: shift the report checking down into pruneHighlanderAccepts()
|
||||
// to allow us to handle the parts we can in mixed cases.
|
||||
pruneHighlanderAccepts(g, rm);
|
||||
}
|
||||
|
||||
|
@ -26,12 +26,13 @@
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
/**
|
||||
* \file
|
||||
* \brief Propagate extended parameters to vertex reports and reduce graph if
|
||||
* possible.
|
||||
*
|
||||
* This code handles the propagation of the extension parameters specified by
|
||||
* the user with the hs_expr_ext structure into the reports on the graph's
|
||||
* the user with the \ref hs_expr_ext structure into the reports on the graph's
|
||||
* vertices.
|
||||
*
|
||||
* There are also some analyses that prune edges that cannot contribute to a
|
||||
@ -68,8 +69,28 @@ namespace ue2 {
|
||||
static const u32 MAX_MAXOFFSET_TO_ANCHOR = 2000;
|
||||
static const u32 MAX_MINLENGTH_TO_CONVERT = 2000;
|
||||
|
||||
/** \brief Find the (min, max) offset adjustment for the reports on a given
|
||||
* vertex. */
|
||||
/** True if all the given reports have the same extparam bounds. */
|
||||
template<typename Container>
|
||||
bool hasSameBounds(const Container &reports, const ReportManager &rm) {
|
||||
assert(!reports.empty());
|
||||
|
||||
const auto &first = rm.getReport(*reports.begin());
|
||||
for (auto id : reports) {
|
||||
const auto &report = rm.getReport(id);
|
||||
if (report.minOffset != first.minOffset ||
|
||||
report.maxOffset != first.maxOffset ||
|
||||
report.minLength != first.minLength) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Find the (min, max) offset adjustment for the reports on a given
|
||||
* vertex.
|
||||
*/
|
||||
static
|
||||
pair<s32,s32> getMinMaxOffsetAdjust(const ReportManager &rm,
|
||||
const NGHolder &g, NFAVertex v) {
|
||||
@ -130,55 +151,76 @@ DepthMinMax findMatchLengths(const ReportManager &rm, const NGHolder &g) {
|
||||
return match_depths;
|
||||
}
|
||||
|
||||
template<typename Function>
|
||||
void replaceReports(NGHolder &g, NFAVertex accept, flat_set<NFAVertex> &seen,
|
||||
Function func) {
|
||||
for (auto v : inv_adjacent_vertices_range(accept, g)) {
|
||||
if (v == g.accept) {
|
||||
// Don't operate on accept: the accept->acceptEod edge is stylised.
|
||||
assert(accept == g.acceptEod);
|
||||
assert(g[v].reports.empty());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!seen.insert(v).second) {
|
||||
continue; // We have already processed v.
|
||||
}
|
||||
|
||||
auto &reports = g[v].reports;
|
||||
if (reports.empty()) {
|
||||
continue;
|
||||
}
|
||||
decltype(g[v].reports) new_reports;
|
||||
for (auto id : g[v].reports) {
|
||||
new_reports.insert(func(v, id));
|
||||
}
|
||||
reports = std::move(new_reports);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic function for replacing all the reports in the graph.
|
||||
*
|
||||
* Pass this a function that takes a vertex and a ReportID returns another
|
||||
* ReportID (or the same one) to replace it with.
|
||||
*/
|
||||
template<typename Function>
|
||||
void replaceReports(NGHolder &g, Function func) {
|
||||
flat_set<NFAVertex> seen;
|
||||
replaceReports(g, g.accept, seen, func);
|
||||
replaceReports(g, g.acceptEod, seen, func);
|
||||
}
|
||||
|
||||
/** \brief Replace the graph's reports with new reports that specify bounds. */
|
||||
static
|
||||
void updateReportBounds(ReportManager &rm, NGHolder &g,
|
||||
const ExpressionInfo &expr, NFAVertex accept,
|
||||
set<NFAVertex> &done) {
|
||||
for (auto v : inv_adjacent_vertices_range(accept, g)) {
|
||||
// Don't operate on g.accept itself.
|
||||
if (v == g.accept) {
|
||||
assert(accept == g.acceptEod);
|
||||
continue;
|
||||
const ExpressionInfo &expr) {
|
||||
DEBUG_PRINTF("updating report bounds\n");
|
||||
replaceReports(g, [&](NFAVertex, ReportID id) {
|
||||
Report report = rm.getReport(id); // make a copy
|
||||
assert(!report.hasBounds());
|
||||
|
||||
// Note that we need to cope with offset adjustment here.
|
||||
|
||||
report.minOffset = expr.min_offset - report.offsetAdjust;
|
||||
if (expr.max_offset == MAX_OFFSET) {
|
||||
report.maxOffset = MAX_OFFSET;
|
||||
} else {
|
||||
report.maxOffset = expr.max_offset - report.offsetAdjust;
|
||||
}
|
||||
assert(report.maxOffset >= report.minOffset);
|
||||
|
||||
report.minLength = expr.min_length;
|
||||
if (expr.min_length && !expr.som) {
|
||||
report.quashSom = true;
|
||||
}
|
||||
|
||||
// Don't operate on a vertex we've already done.
|
||||
if (contains(done, v)) {
|
||||
continue;
|
||||
}
|
||||
done.insert(v);
|
||||
DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, "
|
||||
"min_length=%llu\n", id, report.minOffset,
|
||||
report.maxOffset, report.minLength);
|
||||
|
||||
flat_set<ReportID> new_reports;
|
||||
auto &reports = g[v].reports;
|
||||
|
||||
for (auto id : reports) {
|
||||
Report ir = rm.getReport(id); // make a copy
|
||||
assert(!ir.hasBounds());
|
||||
|
||||
// Note that we need to cope with offset adjustment here.
|
||||
|
||||
ir.minOffset = expr.min_offset - ir.offsetAdjust;
|
||||
if (expr.max_offset == MAX_OFFSET) {
|
||||
ir.maxOffset = MAX_OFFSET;
|
||||
} else {
|
||||
ir.maxOffset = expr.max_offset - ir.offsetAdjust;
|
||||
}
|
||||
assert(ir.maxOffset >= ir.minOffset);
|
||||
|
||||
ir.minLength = expr.min_length;
|
||||
if (expr.min_length && !expr.som) {
|
||||
ir.quashSom = true;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, "
|
||||
"min_length=%llu\n",
|
||||
id, ir.minOffset, ir.maxOffset, ir.minLength);
|
||||
new_reports.insert(rm.getInternalId(ir));
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("swapping reports on vertex %zu\n", g[v].index);
|
||||
reports.swap(new_reports);
|
||||
}
|
||||
return rm.getInternalId(report);
|
||||
});
|
||||
}
|
||||
|
||||
static
|
||||
@ -191,32 +233,93 @@ bool hasVirtualStarts(const NGHolder &g) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/** If the pattern is unanchored, has a max_offset and has not asked for SOM,
|
||||
* we can use that knowledge to anchor it which will limit its lifespan. Note
|
||||
* that we can't use this transformation if there's a min_length, as it's
|
||||
* currently handled using "sly SOM".
|
||||
/** Set the min_length param for all reports to zero. */
|
||||
static
|
||||
void clearMinLengthParam(NGHolder &g, ReportManager &rm) {
|
||||
DEBUG_PRINTF("clearing min length\n");
|
||||
replaceReports(g, [&rm](NFAVertex, ReportID id) {
|
||||
const auto &report = rm.getReport(id);
|
||||
if (report.minLength) {
|
||||
Report new_report = report;
|
||||
new_report.minLength = 0;
|
||||
return rm.getInternalId(new_report);
|
||||
}
|
||||
return id;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the min_offset param to zero and the max_offset param to MAX_OFFSET for
|
||||
* all reports.
|
||||
*/
|
||||
static
|
||||
void clearOffsetParams(NGHolder &g, ReportManager &rm) {
|
||||
DEBUG_PRINTF("clearing min and max offset\n");
|
||||
replaceReports(g, [&rm](NFAVertex, ReportID id) {
|
||||
const auto &report = rm.getReport(id);
|
||||
if (report.minLength) {
|
||||
Report new_report = report;
|
||||
new_report.minOffset = 0;
|
||||
new_report.maxOffset = MAX_OFFSET;
|
||||
return rm.getInternalId(new_report);
|
||||
}
|
||||
return id;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* If the pattern is unanchored, has a max_offset and has not asked for SOM, we
|
||||
* can use that knowledge to anchor it which will limit its lifespan. Note that
|
||||
* we can't use this transformation if there's a min_length, as it's currently
|
||||
* handled using "sly SOM".
|
||||
*
|
||||
* Note that it is possible to handle graphs that have a combination of
|
||||
* anchored and unanchored paths, but it's too tricky for the moment.
|
||||
*/
|
||||
static
|
||||
bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr,
|
||||
const depth &minWidth,
|
||||
const depth &maxWidth) {
|
||||
assert(!expr.som);
|
||||
assert(expr.max_offset != MAX_OFFSET);
|
||||
assert(minWidth <= maxWidth);
|
||||
assert(maxWidth.is_reachable());
|
||||
|
||||
DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n",
|
||||
minWidth.str().c_str(), maxWidth.str().c_str(),
|
||||
expr.min_offset, expr.max_offset);
|
||||
|
||||
if (expr.max_offset > MAX_MAXOFFSET_TO_ANCHOR) {
|
||||
bool anchorPatternWithBoundedRepeat(NGHolder &g, ReportManager &rm) {
|
||||
if (!isFloating(g)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (expr.max_offset < minWidth) {
|
||||
const auto &reports = all_reports(g);
|
||||
if (reports.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (any_of_in(reports, [&](ReportID id) {
|
||||
const auto &report = rm.getReport(id);
|
||||
return report.maxOffset == MAX_OFFSET || report.minLength ||
|
||||
report.offsetAdjust;
|
||||
})) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!hasSameBounds(reports, rm)) {
|
||||
DEBUG_PRINTF("mixed report bounds\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const depth minWidth = findMinWidth(g);
|
||||
const depth maxWidth = findMaxWidth(g);
|
||||
|
||||
assert(minWidth <= maxWidth);
|
||||
assert(maxWidth.is_reachable());
|
||||
|
||||
const auto &first_report = rm.getReport(*reports.begin());
|
||||
const auto min_offset = first_report.minOffset;
|
||||
const auto max_offset = first_report.maxOffset;
|
||||
assert(max_offset < MAX_OFFSET);
|
||||
|
||||
DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n",
|
||||
minWidth.str().c_str(), maxWidth.str().c_str(),
|
||||
min_offset, max_offset);
|
||||
|
||||
if (max_offset > MAX_MAXOFFSET_TO_ANCHOR) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (max_offset < minWidth) {
|
||||
assert(0);
|
||||
return false;
|
||||
}
|
||||
@ -237,10 +340,10 @@ bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr,
|
||||
u32 min_bound, max_bound;
|
||||
if (maxWidth.is_infinite()) {
|
||||
min_bound = 0;
|
||||
max_bound = expr.max_offset - minWidth;
|
||||
max_bound = max_offset - minWidth;
|
||||
} else {
|
||||
min_bound = expr.min_offset > maxWidth ? expr.min_offset - maxWidth : 0;
|
||||
max_bound = expr.max_offset - minWidth;
|
||||
min_bound = min_offset > maxWidth ? min_offset - maxWidth : 0;
|
||||
max_bound = max_offset - minWidth;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound);
|
||||
@ -293,6 +396,13 @@ bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr,
|
||||
renumber_vertices(g);
|
||||
renumber_edges(g);
|
||||
|
||||
if (minWidth == maxWidth) {
|
||||
// For a fixed width pattern, we can retire the offsets as
|
||||
// they are implicit in the graph now.
|
||||
clearOffsetParams(g, rm);
|
||||
}
|
||||
|
||||
clearReports(g);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -341,17 +451,27 @@ bool hasOffsetAdjust(const ReportManager &rm, NGHolder &g,
|
||||
return true;
|
||||
}
|
||||
|
||||
/** If the pattern has a min_length and is of "ratchet" form with one unbounded
|
||||
/**
|
||||
* If the pattern has a min_length and is of "ratchet" form with one unbounded
|
||||
* repeat, that repeat can become a bounded repeat.
|
||||
*
|
||||
* /foo.*bar/{min_length=100} --> /foo.{94,}bar/
|
||||
*/
|
||||
static
|
||||
bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
|
||||
ExpressionInfo &expr) {
|
||||
assert(expr.min_length);
|
||||
bool transformMinLengthToRepeat(NGHolder &g, ReportManager &rm) {
|
||||
const auto &reports = all_reports(g);
|
||||
|
||||
if (expr.min_length > MAX_MINLENGTH_TO_CONVERT) {
|
||||
if (reports.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!hasSameBounds(reports, rm)) {
|
||||
DEBUG_PRINTF("mixed report bounds\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto &min_length = rm.getReport(*reports.begin()).minLength;
|
||||
if (!min_length || min_length > MAX_MINLENGTH_TO_CONVERT) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -381,7 +501,6 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
|
||||
|
||||
u32 width = 0;
|
||||
|
||||
|
||||
// Walk from the start vertex to the cyclic state and ensure we have a
|
||||
// chain of vertices.
|
||||
while (v != cyclic) {
|
||||
@ -443,10 +562,10 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
|
||||
DEBUG_PRINTF("width=%u, vertex %zu is cyclic\n", width,
|
||||
g[cyclic].index);
|
||||
|
||||
if (width >= expr.min_length) {
|
||||
if (width >= min_length) {
|
||||
DEBUG_PRINTF("min_length=%llu is guaranteed, as width=%u\n",
|
||||
expr.min_length, width);
|
||||
expr.min_length = 0;
|
||||
min_length, width);
|
||||
clearMinLengthParam(g, rm);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -474,7 +593,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
|
||||
|
||||
const CharReach &cr = g[cyclic].char_reach;
|
||||
|
||||
for (u32 i = 0; i < expr.min_length - width - 1; ++i) {
|
||||
for (u32 i = 0; i < min_length - width - 1; ++i) {
|
||||
v = add_vertex(g);
|
||||
g[v].char_reach = cr;
|
||||
|
||||
@ -491,9 +610,8 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
|
||||
|
||||
renumber_vertices(g);
|
||||
renumber_edges(g);
|
||||
clearMinLengthParam(g, rm);
|
||||
clearReports(g);
|
||||
|
||||
expr.min_length = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -511,8 +629,8 @@ bool hasExtParams(const ExpressionInfo &expr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static
|
||||
depth maxDistFromStart(const NFAVertexBidiDepth &d) {
|
||||
template<class VertexDepth>
|
||||
depth maxDistFromStart(const VertexDepth &d) {
|
||||
if (!d.fromStartDotStar.max.is_unreachable()) {
|
||||
// A path from startDs, any path, implies we can match at any offset.
|
||||
return depth::infinity();
|
||||
@ -541,7 +659,7 @@ const depth& minDistToAccept(const NFAVertexBidiDepth &d) {
|
||||
}
|
||||
|
||||
static
|
||||
bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr,
|
||||
bool isEdgePrunable(const NGHolder &g, const Report &report,
|
||||
const vector<NFAVertexBidiDepth> &depths,
|
||||
const NFAEdge &e) {
|
||||
const NFAVertex u = source(e, g);
|
||||
@ -570,29 +688,29 @@ bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr,
|
||||
const NFAVertexBidiDepth &du = depths.at(u_idx);
|
||||
const NFAVertexBidiDepth &dv = depths.at(v_idx);
|
||||
|
||||
if (expr.min_offset) {
|
||||
if (report.minOffset) {
|
||||
depth max_offset = maxDistFromStart(du) + maxDistToAccept(dv);
|
||||
if (max_offset.is_finite() && max_offset < expr.min_offset) {
|
||||
if (max_offset.is_finite() && max_offset < report.minOffset) {
|
||||
DEBUG_PRINTF("max_offset=%s too small\n", max_offset.str().c_str());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (expr.max_offset != MAX_OFFSET) {
|
||||
if (report.maxOffset != MAX_OFFSET) {
|
||||
depth min_offset = minDistFromStart(du) + minDistToAccept(dv);
|
||||
assert(min_offset.is_finite());
|
||||
|
||||
if (min_offset > expr.max_offset) {
|
||||
if (min_offset > report.maxOffset) {
|
||||
DEBUG_PRINTF("min_offset=%s too large\n", min_offset.str().c_str());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (expr.min_length && is_any_accept(v, g)) {
|
||||
if (report.minLength && is_any_accept(v, g)) {
|
||||
// Simple take on min_length. If we're an edge to accept and our max
|
||||
// dist from start is too small, we can be pruned.
|
||||
const depth &width = du.fromStart.max;
|
||||
if (width.is_finite() && width < expr.min_length) {
|
||||
if (width.is_finite() && width < report.minLength) {
|
||||
DEBUG_PRINTF("max width %s from start too small for min_length\n",
|
||||
width.str().c_str());
|
||||
return true;
|
||||
@ -603,14 +721,26 @@ bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr,
|
||||
}
|
||||
|
||||
static
|
||||
void pruneExtUnreachable(NGHolder &g, const ExpressionInfo &expr) {
|
||||
void pruneExtUnreachable(NGHolder &g, const ReportManager &rm) {
|
||||
const auto &reports = all_reports(g);
|
||||
if (reports.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!hasSameBounds(reports, rm)) {
|
||||
DEBUG_PRINTF("report bounds vary\n");
|
||||
return;
|
||||
}
|
||||
|
||||
const auto &report = rm.getReport(*reports.begin());
|
||||
|
||||
vector<NFAVertexBidiDepth> depths;
|
||||
calcDepths(g, depths);
|
||||
|
||||
vector<NFAEdge> dead;
|
||||
|
||||
for (const auto &e : edges_range(g)) {
|
||||
if (isEdgePrunable(g, expr, depths, e)) {
|
||||
if (isEdgePrunable(g, report, depths, e)) {
|
||||
DEBUG_PRINTF("pruning\n");
|
||||
dead.push_back(e);
|
||||
}
|
||||
@ -622,32 +752,45 @@ void pruneExtUnreachable(NGHolder &g, const ExpressionInfo &expr) {
|
||||
|
||||
remove_edges(dead, g);
|
||||
pruneUseless(g);
|
||||
clearReports(g);
|
||||
}
|
||||
|
||||
/** Remove vacuous edges in graphs where the min_offset or min_length
|
||||
* constraints dictate that they can never produce a match. */
|
||||
/**
|
||||
* Remove vacuous edges in graphs where the min_offset or min_length
|
||||
* constraints dictate that they can never produce a match.
|
||||
*/
|
||||
static
|
||||
void pruneVacuousEdges(NGHolder &g, const ExpressionInfo &expr) {
|
||||
if (!expr.min_length && !expr.min_offset) {
|
||||
return;
|
||||
}
|
||||
|
||||
void pruneVacuousEdges(NGHolder &g, const ReportManager &rm) {
|
||||
vector<NFAEdge> dead;
|
||||
|
||||
auto has_min_offset = [&](NFAVertex v) {
|
||||
assert(!g[v].reports.empty()); // must be reporter
|
||||
return all_of_in(g[v].reports, [&](ReportID id) {
|
||||
return rm.getReport(id).minOffset > 0;
|
||||
});
|
||||
};
|
||||
|
||||
auto has_min_length = [&](NFAVertex v) {
|
||||
assert(!g[v].reports.empty()); // must be reporter
|
||||
return all_of_in(g[v].reports, [&](ReportID id) {
|
||||
return rm.getReport(id).minLength > 0;
|
||||
});
|
||||
};
|
||||
|
||||
for (const auto &e : edges_range(g)) {
|
||||
const NFAVertex u = source(e, g);
|
||||
const NFAVertex v = target(e, g);
|
||||
|
||||
// Special case: Crudely remove vacuous edges from start in graphs with a
|
||||
// min_offset.
|
||||
if (expr.min_offset && u == g.start && is_any_accept(v, g)) {
|
||||
// Special case: Crudely remove vacuous edges from start in graphs with
|
||||
// a min_offset.
|
||||
if (u == g.start && is_any_accept(v, g) && has_min_offset(u)) {
|
||||
DEBUG_PRINTF("vacuous edge in graph with min_offset!\n");
|
||||
dead.push_back(e);
|
||||
continue;
|
||||
}
|
||||
|
||||
// If a min_length is set, vacuous edges can be removed.
|
||||
if (expr.min_length && is_any_start(u, g) && is_any_accept(v, g)) {
|
||||
if (is_any_start(u, g) && is_any_accept(v, g) && has_min_length(u)) {
|
||||
DEBUG_PRINTF("vacuous edge in graph with min_length!\n");
|
||||
dead.push_back(e);
|
||||
continue;
|
||||
@ -658,13 +801,14 @@ void pruneVacuousEdges(NGHolder &g, const ExpressionInfo &expr) {
|
||||
return;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("removing %zu vacuous edges\n", dead.size());
|
||||
remove_edges(dead, g);
|
||||
pruneUseless(g);
|
||||
clearReports(g);
|
||||
}
|
||||
|
||||
static
|
||||
void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
|
||||
const vector<DepthMinMax> &depths,
|
||||
void pruneUnmatchable(NGHolder &g, const vector<DepthMinMax> &depths,
|
||||
const ReportManager &rm, NFAVertex accept) {
|
||||
vector<NFAEdge> dead;
|
||||
|
||||
@ -675,6 +819,11 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!hasSameBounds(g[v].reports, rm)) {
|
||||
continue;
|
||||
}
|
||||
const auto &report = rm.getReport(*g[v].reports.begin());
|
||||
|
||||
u32 idx = g[v].index;
|
||||
DepthMinMax d = depths[idx]; // copy
|
||||
pair<s32, s32> adj = getMinMaxOffsetAdjust(rm, g, v);
|
||||
@ -683,16 +832,16 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
|
||||
d.min += adj.first;
|
||||
d.max += adj.second;
|
||||
|
||||
if (d.max.is_finite() && d.max < expr.min_length) {
|
||||
if (d.max.is_finite() && d.max < report.minLength) {
|
||||
DEBUG_PRINTF("prune, max match length %s < min_length=%llu\n",
|
||||
d.max.str().c_str(), expr.min_length);
|
||||
d.max.str().c_str(), report.minLength);
|
||||
dead.push_back(e);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (expr.max_offset != MAX_OFFSET && d.min > expr.max_offset) {
|
||||
if (report.maxOffset != MAX_OFFSET && d.min > report.maxOffset) {
|
||||
DEBUG_PRINTF("prune, min match length %s > max_offset=%llu\n",
|
||||
d.min.str().c_str(), expr.max_offset);
|
||||
d.min.str().c_str(), report.maxOffset);
|
||||
dead.push_back(e);
|
||||
continue;
|
||||
}
|
||||
@ -701,47 +850,36 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
|
||||
remove_edges(dead, g);
|
||||
}
|
||||
|
||||
/** Remove edges to accepts that can never produce a match long enough to
|
||||
* satisfy our min_length and max_offset constraints. */
|
||||
/**
|
||||
* Remove edges to accepts that can never produce a match long enough to
|
||||
* satisfy our min_length and max_offset constraints.
|
||||
*/
|
||||
static
|
||||
void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
|
||||
const ReportManager &rm) {
|
||||
if (!expr.min_length) {
|
||||
void pruneUnmatchable(NGHolder &g, const ReportManager &rm) {
|
||||
if (!any_of_in(all_reports(g), [&](ReportID id) {
|
||||
return rm.getReport(id).minLength > 0;
|
||||
})) {
|
||||
return;
|
||||
}
|
||||
|
||||
vector<DepthMinMax> depths = getDistancesFromSOM(g);
|
||||
|
||||
pruneUnmatchable(g, expr, depths, rm, g.accept);
|
||||
pruneUnmatchable(g, expr, depths, rm, g.acceptEod);
|
||||
pruneUnmatchable(g, depths, rm, g.accept);
|
||||
pruneUnmatchable(g, depths, rm, g.acceptEod);
|
||||
|
||||
pruneUseless(g);
|
||||
}
|
||||
|
||||
static
|
||||
bool isUnanchored(const NGHolder &g) {
|
||||
for (auto v : adjacent_vertices_range(g.start, g)) {
|
||||
if (!edge(g.startDs, v, g).second) {
|
||||
DEBUG_PRINTF("fail, %zu is anchored vertex\n", g[v].index);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
clearReports(g);
|
||||
}
|
||||
|
||||
static
|
||||
bool hasOffsetAdjustments(const ReportManager &rm, const NGHolder &g) {
|
||||
for (auto report : all_reports(g)) {
|
||||
const Report &ir = rm.getReport(report);
|
||||
if (ir.offsetAdjust) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return any_of_in(all_reports(g), [&rm](ReportID id) {
|
||||
return rm.getReport(id).offsetAdjust != 0;
|
||||
});
|
||||
}
|
||||
|
||||
void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
|
||||
UNUSED const CompileContext &cc) {
|
||||
void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr,
|
||||
ReportManager &rm) {
|
||||
if (!hasExtParams(expr)) {
|
||||
return;
|
||||
}
|
||||
@ -750,11 +888,6 @@ void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
|
||||
depth maxWidth = findMaxWidth(g);
|
||||
bool is_anchored = !has_proper_successor(g.startDs, g)
|
||||
&& out_degree(g.start, g);
|
||||
bool has_offset_adj = hasOffsetAdjustments(rm, g);
|
||||
|
||||
DEBUG_PRINTF("minWidth=%s, maxWidth=%s, anchored=%d, offset_adj=%d\n",
|
||||
minWidth.str().c_str(), maxWidth.str().c_str(), is_anchored,
|
||||
has_offset_adj);
|
||||
|
||||
DepthMinMax match_depths = findMatchLengths(rm, g);
|
||||
DEBUG_PRINTF("match depths %s\n", match_depths.str().c_str());
|
||||
@ -792,91 +925,122 @@ void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
|
||||
return;
|
||||
}
|
||||
|
||||
pruneVacuousEdges(g, expr);
|
||||
pruneUnmatchable(g, expr, rm);
|
||||
updateReportBounds(rm, g, expr);
|
||||
}
|
||||
|
||||
if (!has_offset_adj) {
|
||||
pruneExtUnreachable(g, expr);
|
||||
/**
|
||||
* If the pattern is completely anchored and has a min_length set, this can
|
||||
* be converted to a min_offset.
|
||||
*/
|
||||
static
|
||||
void replaceMinLengthWithOffset(NGHolder &g, ReportManager &rm) {
|
||||
if (has_proper_successor(g.startDs, g)) {
|
||||
return; // not wholly anchored
|
||||
}
|
||||
|
||||
// We may have removed all the edges to accept, in which case this
|
||||
// expression cannot match.
|
||||
if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) {
|
||||
throw CompileError(expr.index, "Extended parameter "
|
||||
"constraints can not be satisfied for any match from "
|
||||
"this expression.");
|
||||
replaceReports(g, [&rm](NFAVertex, ReportID id) {
|
||||
const auto &report = rm.getReport(id);
|
||||
if (report.minLength) {
|
||||
Report new_report = report;
|
||||
u64a min_len_offset = report.minLength - report.offsetAdjust;
|
||||
new_report.minOffset = max(report.minOffset, min_len_offset);
|
||||
new_report.minLength = 0;
|
||||
return rm.getInternalId(new_report);
|
||||
}
|
||||
return id;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear offset bounds on reports that are not needed because they're satisfied
|
||||
* by vertex depth.
|
||||
*/
|
||||
static
|
||||
void removeUnneededOffsetBounds(NGHolder &g, ReportManager &rm) {
|
||||
vector<NFAVertexDepth> depths;
|
||||
calcDepths(g, depths);
|
||||
|
||||
replaceReports(g, [&](NFAVertex v, ReportID id) {
|
||||
const auto &d = depths.at(g[v].index);
|
||||
const depth &min_depth = min(d.fromStartDotStar.min, d.fromStart.min);
|
||||
const depth &max_depth = maxDistFromStart(d);
|
||||
|
||||
DEBUG_PRINTF("vertex %zu has min_depth=%s, max_depth=%s\n", g[v].index,
|
||||
min_depth.str().c_str(), max_depth.str().c_str());
|
||||
|
||||
Report report = rm.getReport(id); // copy
|
||||
bool modified = false;
|
||||
if (report.minOffset && !report.offsetAdjust &&
|
||||
report.minOffset <= min_depth) {
|
||||
report.minOffset = 0;
|
||||
modified = true;
|
||||
}
|
||||
if (report.maxOffset != MAX_OFFSET && max_depth.is_finite() &&
|
||||
report.maxOffset >= max_depth) {
|
||||
report.maxOffset = MAX_OFFSET;
|
||||
modified = true;
|
||||
}
|
||||
if (modified) {
|
||||
DEBUG_PRINTF("vertex %zu, changed bounds to [%llu,%llu]\n",
|
||||
g[v].index, report.minOffset, report.maxOffset);
|
||||
return rm.getInternalId(report);
|
||||
}
|
||||
|
||||
return id;
|
||||
});
|
||||
}
|
||||
|
||||
void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som) {
|
||||
if (!any_of_in(all_reports(g),
|
||||
[&](ReportID id) { return rm.getReport(id).hasBounds(); })) {
|
||||
DEBUG_PRINTF("no extparam bounds\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove reports on vertices without an edge to accept (which have been
|
||||
// pruned above).
|
||||
clearReports(g);
|
||||
DEBUG_PRINTF("graph has extparam bounds\n");
|
||||
|
||||
// Recalc.
|
||||
minWidth = findMinWidth(g);
|
||||
maxWidth = findMaxWidth(g);
|
||||
is_anchored = proper_out_degree(g.startDs, g) == 0 &&
|
||||
out_degree(g.start, g);
|
||||
has_offset_adj = hasOffsetAdjustments(rm, g);
|
||||
|
||||
// If the pattern is completely anchored and has a min_length set, this can
|
||||
// be converted to a min_offset.
|
||||
if (expr.min_length && (expr.min_offset <= expr.min_length) &&
|
||||
is_anchored) {
|
||||
DEBUG_PRINTF("convertinexpr.min_length to min_offset=%llu for "
|
||||
"anchored case\n", expr.min_length);
|
||||
expr.min_offset = expr.min_length;
|
||||
expr.min_length = 0;
|
||||
pruneVacuousEdges(g, rm);
|
||||
if (can_never_match(g)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (expr.min_offset && expr.min_offset <= minWidth && !has_offset_adj) {
|
||||
DEBUG_PRINTF("min_offset=%llu constraint is unnecessary\n",
|
||||
expr.min_offset);
|
||||
expr.min_offset = 0;
|
||||
pruneUnmatchable(g, rm);
|
||||
if (can_never_match(g)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!hasExtParams(expr)) {
|
||||
if (!hasOffsetAdjustments(rm, g)) {
|
||||
pruneExtUnreachable(g, rm);
|
||||
if (can_never_match(g)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
replaceMinLengthWithOffset(g, rm);
|
||||
if (can_never_match(g)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If the pattern has a min_length and is of "ratchet" form with one
|
||||
// unbounded repeat, that repeat can become a bounded repeat.
|
||||
// e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/
|
||||
if (expr.min_length && transformMinLengthToRepeat(rm, g, expr)) {
|
||||
DEBUG_PRINTF("converted min_length to bounded repeat\n");
|
||||
// recalc
|
||||
minWidth = findMinWidth(g);
|
||||
transformMinLengthToRepeat(g, rm);
|
||||
if (can_never_match(g)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If the pattern is unanchored, has a max_offset and has not asked for
|
||||
// SOM, we can use that knowledge to anchor it which will limit its
|
||||
// lifespan. Note that we can't use this transformation if there's a
|
||||
// min_length, as it's currently handled using "sly SOM".
|
||||
|
||||
// Note that it is possible to handle graphs that have a combination of
|
||||
// anchored and unanchored paths, but it's too tricky for the moment.
|
||||
|
||||
if (expr.max_offset != MAX_OFFSET && !expr.som && !expr.min_length &&
|
||||
!has_offset_adj && isUnanchored(g)) {
|
||||
if (anchorPatternWithBoundedRepeat(g, expr, minWidth, maxWidth)) {
|
||||
DEBUG_PRINTF("minWidth=%s, maxWidth=%s\n", minWidth.str().c_str(),
|
||||
maxWidth.str().c_str());
|
||||
if (minWidth == maxWidth) {
|
||||
// For a fixed width pattern, we can retire the offsets as they
|
||||
// are implicit in the graph now.
|
||||
expr.min_offset = 0;
|
||||
expr.max_offset = MAX_OFFSET;
|
||||
}
|
||||
if (som == SOM_NONE) {
|
||||
anchorPatternWithBoundedRepeat(g, rm);
|
||||
if (can_never_match(g)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
//dumpGraph("final.dot", g);
|
||||
|
||||
if (!hasExtParams(expr)) {
|
||||
return;
|
||||
}
|
||||
|
||||
set<NFAVertex> done;
|
||||
updateReportBounds(rm, g, expr, g.accept, done);
|
||||
updateReportBounds(rm, g, expr, g.acceptEod, done);
|
||||
removeUnneededOffsetBounds(g, rm);
|
||||
}
|
||||
|
||||
} // namespace ue2
|
||||
|
@ -34,15 +34,30 @@
|
||||
#ifndef NG_EXTPARAM_H
|
||||
#define NG_EXTPARAM_H
|
||||
|
||||
#include "som/som.h"
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
struct CompileContext;
|
||||
class ExpressionInfo;
|
||||
class NGHolder;
|
||||
class ReportManager;
|
||||
|
||||
void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
|
||||
const CompileContext &cc);
|
||||
/**
|
||||
* \brief Propagate extended parameter information to vertex reports. Will
|
||||
* throw CompileError if this expression's extended parameters are not
|
||||
* satisfiable.
|
||||
*
|
||||
* This will also remove extended parameter constraints that are guaranteed to
|
||||
* be satisfied from ExpressionInfo.
|
||||
*/
|
||||
void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr,
|
||||
ReportManager &rm);
|
||||
|
||||
/**
|
||||
* \brief Perform graph reductions (if possible) to do with extended parameter
|
||||
* constraints on reports.
|
||||
*/
|
||||
void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
|
@ -41,6 +41,7 @@
|
||||
#include "nfagraph/ng_depth.h"
|
||||
#include "nfagraph/ng_holder.h"
|
||||
#include "nfagraph/ng_mcclellan.h"
|
||||
#include "nfagraph/ng_reports.h"
|
||||
#include "nfagraph/ng_prune.h"
|
||||
#include "nfagraph/ng_util.h"
|
||||
#include "smallwrite/smallwrite_internal.h"
|
||||
@ -179,8 +180,23 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (expr.som || expr.min_length || isVacuous(g)) {
|
||||
poisoned = true; /* cannot support in smwr */
|
||||
if (expr.som) {
|
||||
DEBUG_PRINTF("no SOM support in small-write engine\n");
|
||||
poisoned = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (isVacuous(g)) {
|
||||
DEBUG_PRINTF("no vacuous graph support in small-write engine\n");
|
||||
poisoned = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (any_of_in(::ue2::all_reports(g), [&](ReportID id) {
|
||||
return rm.getReport(id).minLength > 0;
|
||||
})) {
|
||||
DEBUG_PRINTF("no min_length extparam support in small-write engine\n");
|
||||
poisoned = true;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -168,7 +168,7 @@ static const expected_info ei_test[] = {
|
||||
|
||||
// Some cases with extended parameters.
|
||||
{"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
|
||||
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 6, UINT_MAX, 0, 0, 0},
|
||||
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
|
||||
{"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
|
||||
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
|
||||
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0},
|
||||
@ -185,7 +185,7 @@ static const expected_info ei_test[] = {
|
||||
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
|
||||
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
|
||||
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
|
||||
4, UINT_MAX, 0, 0, 0},
|
||||
10, UINT_MAX, 0, 0, 0},
|
||||
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
|
||||
4, UINT_MAX, 0, 0, 0},
|
||||
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
|
||||
@ -194,7 +194,7 @@ static const expected_info ei_test[] = {
|
||||
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0},
|
||||
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0},
|
||||
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2},
|
||||
4, 8, 0, 0, 0},
|
||||
8, 8, 0, 0, 0},
|
||||
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
|
||||
4, 8, 0, 0, 0},
|
||||
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
|
||||
|
Loading…
x
Reference in New Issue
Block a user