ng_extparam: split up work and do per-comp reduce

This change breaks extparam processing up into:

 - propagateExtendedParams: propagates min_length, min_offset and
   max_offset into the reports on the graph
 - reduceExtendedParams: runs graph reductions based on extparams

Then, we apply the reduce pass to the whole graph, and again later to
each component after calc_components.
This commit is contained in:
Justin Viiret 2017-03-23 14:10:14 +11:00 committed by Matthew Barr
parent 0a163b5535
commit a871f70c25
6 changed files with 435 additions and 214 deletions

View File

@ -388,7 +388,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
// fuzz graph - this must happen before any transformations are made
make_fuzzy(*g, expr.edit_distance, cc.grey);
handleExtendedParams(rm, *g, expr, cc);
propagateExtendedParams(*g, expr, rm);
fillExpressionInfo(rm, *g, expr, &local_info);
}
catch (const CompileError &e) {

View File

@ -214,6 +214,7 @@ bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr,
assert(allMatchStatesHaveReports(g));
reduceExtendedParams(g, ng.rm, som);
reduceGraph(g, som, expr.utf8, cc);
dumpComponent(g, "02_reduced", expr.index, comp_id, ng.cc.grey);
@ -223,6 +224,13 @@ bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr,
removeRegionRedundancy(g, som);
}
// We might be done at this point: if we've run out of vertices, we can
// stop processing.
if (num_vertices(g) == N_SPECIALS) {
DEBUG_PRINTF("all vertices claimed\n");
return true;
}
// "Short Exhaustible Passthrough" patterns always become outfixes.
if (!som && isSEP(g, ng.rm, cc.grey)) {
DEBUG_PRINTF("graph is SEP\n");
@ -358,10 +366,22 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
optimiseVirtualStarts(g); /* good for som */
handleExtendedParams(rm, g, expr, cc);
if (expr.min_length) {
// We have a minimum length constraint, which we currently use SOM to
// satisfy.
propagateExtendedParams(g, expr, rm);
reduceExtendedParams(g, rm, som);
// We may have removed all the edges to accept, in which case this
// expression cannot match.
if (can_never_match(g)) {
throw CompileError(expr.index, "Extended parameter constraints can not "
"be satisfied for any match from this "
"expression.");
}
if (any_of_in(all_reports(g), [&](ReportID id) {
return rm.getReport(id).minLength;
})) {
// We have at least one report with a minimum length constraint, which
// we currently use SOM to satisfy.
som = SOM_LEFT;
ssm.somPrecision(8);
}
@ -377,10 +397,16 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
relaxForbiddenUtf8(g, expr);
}
if (expr.highlander && !expr.min_length && !expr.min_offset) {
if (all_of_in(all_reports(g), [&](ReportID id) {
const auto &report = rm.getReport(id);
return report.ekey != INVALID_EKEY && !report.minLength &&
!report.minOffset;
})) {
// In highlander mode: if we don't have constraints on our reports that
// may prevent us accepting our first match (i.e. extended params) we
// can prune the other out-edges of all vertices connected to accept.
// TODO: shift the report checking down into pruneHighlanderAccepts()
// to allow us to handle the parts we can in mixed cases.
pruneHighlanderAccepts(g, rm);
}

View File

@ -26,12 +26,13 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
/**
* \file
* \brief Propagate extended parameters to vertex reports and reduce graph if
* possible.
*
* This code handles the propagation of the extension parameters specified by
* the user with the hs_expr_ext structure into the reports on the graph's
* the user with the \ref hs_expr_ext structure into the reports on the graph's
* vertices.
*
* There are also some analyses that prune edges that cannot contribute to a
@ -68,8 +69,28 @@ namespace ue2 {
static const u32 MAX_MAXOFFSET_TO_ANCHOR = 2000;
static const u32 MAX_MINLENGTH_TO_CONVERT = 2000;
/** \brief Find the (min, max) offset adjustment for the reports on a given
* vertex. */
/**
 * Checks whether every report in \p reports carries the same extparam bounds
 * (min_offset, max_offset and min_length) as the first report in the
 * container.
 *
 * The container must not be empty; this is asserted.
 */
template<typename Container>
bool hasSameBounds(const Container &reports, const ReportManager &rm) {
    assert(!reports.empty());

    // The first report serves as the reference that all others must match.
    const auto &reference = rm.getReport(*reports.begin());

    for (const auto report_id : reports) {
        const auto &candidate = rm.getReport(report_id);
        const bool matches = candidate.minOffset == reference.minOffset &&
                             candidate.maxOffset == reference.maxOffset &&
                             candidate.minLength == reference.minLength;
        if (!matches) {
            return false;
        }
    }

    return true;
}
/**
* \brief Find the (min, max) offset adjustment for the reports on a given
* vertex.
*/
static
pair<s32,s32> getMinMaxOffsetAdjust(const ReportManager &rm,
const NGHolder &g, NFAVertex v) {
@ -130,55 +151,76 @@ DepthMinMax findMatchLengths(const ReportManager &rm, const NGHolder &g) {
return match_depths;
}
/**
 * Rewrites the report set of each predecessor of \p accept by passing every
 * ReportID through \p func.
 *
 * Vertices are recorded in \p seen as they are visited, so a vertex that feeds
 * both accept and acceptEod is only rewritten once across calls that share the
 * same set.
 */
template<typename Function>
void replaceReports(NGHolder &g, NFAVertex accept, flat_set<NFAVertex> &seen,
                    Function func) {
    for (auto pred : inv_adjacent_vertices_range(accept, g)) {
        if (pred == g.accept) {
            // The accept->acceptEod edge is stylised, so accept itself is
            // never rewritten.
            assert(accept == g.acceptEod);
            assert(g[pred].reports.empty());
            continue;
        }

        const bool first_visit = seen.insert(pred).second;
        if (!first_visit) {
            continue; // Already handled via an earlier edge or call.
        }

        auto &current = g[pred].reports;
        if (current.empty()) {
            continue;
        }

        // Build the replacement set separately, then move it into place.
        decltype(g[pred].reports) replaced;
        for (auto id : current) {
            replaced.insert(func(pred, id));
        }
        current = std::move(replaced);
    }
}
/**
 * Generic helper for rewriting all the reports in the graph.
 *
 * \p func is called with a vertex and one of its ReportIDs, and returns the
 * ReportID to use in its place (which may be the same one). Predecessors of
 * accept are processed before those of acceptEod, and each vertex is
 * rewritten at most once.
 */
template<typename Function>
void replaceReports(NGHolder &g, Function func) {
    flat_set<NFAVertex> visited;
    const NFAVertex sinks[] = {g.accept, g.acceptEod};
    for (const NFAVertex sink : sinks) {
        replaceReports(g, sink, visited, func);
    }
}
/** \brief Replace the graph's reports with new reports that specify bounds. */
static
void updateReportBounds(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr, NFAVertex accept,
set<NFAVertex> &done) {
for (auto v : inv_adjacent_vertices_range(accept, g)) {
// Don't operate on g.accept itself.
if (v == g.accept) {
assert(accept == g.acceptEod);
continue;
const ExpressionInfo &expr) {
DEBUG_PRINTF("updating report bounds\n");
replaceReports(g, [&](NFAVertex, ReportID id) {
Report report = rm.getReport(id); // make a copy
assert(!report.hasBounds());
// Note that we need to cope with offset adjustment here.
report.minOffset = expr.min_offset - report.offsetAdjust;
if (expr.max_offset == MAX_OFFSET) {
report.maxOffset = MAX_OFFSET;
} else {
report.maxOffset = expr.max_offset - report.offsetAdjust;
}
assert(report.maxOffset >= report.minOffset);
report.minLength = expr.min_length;
if (expr.min_length && !expr.som) {
report.quashSom = true;
}
// Don't operate on a vertex we've already done.
if (contains(done, v)) {
continue;
}
done.insert(v);
DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, "
"min_length=%llu\n", id, report.minOffset,
report.maxOffset, report.minLength);
flat_set<ReportID> new_reports;
auto &reports = g[v].reports;
for (auto id : reports) {
Report ir = rm.getReport(id); // make a copy
assert(!ir.hasBounds());
// Note that we need to cope with offset adjustment here.
ir.minOffset = expr.min_offset - ir.offsetAdjust;
if (expr.max_offset == MAX_OFFSET) {
ir.maxOffset = MAX_OFFSET;
} else {
ir.maxOffset = expr.max_offset - ir.offsetAdjust;
}
assert(ir.maxOffset >= ir.minOffset);
ir.minLength = expr.min_length;
if (expr.min_length && !expr.som) {
ir.quashSom = true;
}
DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, "
"min_length=%llu\n",
id, ir.minOffset, ir.maxOffset, ir.minLength);
new_reports.insert(rm.getInternalId(ir));
}
DEBUG_PRINTF("swapping reports on vertex %zu\n", g[v].index);
reports.swap(new_reports);
}
return rm.getInternalId(report);
});
}
static
@ -191,32 +233,93 @@ bool hasVirtualStarts(const NGHolder &g) {
return false;
}
/** If the pattern is unanchored, has a max_offset and has not asked for SOM,
* we can use that knowledge to anchor it which will limit its lifespan. Note
* that we can't use this transformation if there's a min_length, as it's
* currently handled using "sly SOM".
/** Rewrite every report that carries a min_length so that it becomes zero. */
static
void clearMinLengthParam(NGHolder &g, ReportManager &rm) {
    DEBUG_PRINTF("clearing min length\n");
    replaceReports(g, [&rm](NFAVertex, ReportID id) {
        const auto &current = rm.getReport(id);
        if (!current.minLength) {
            return id; // No min_length on this report: keep it as is.
        }

        // Work on a copy; the manager hands back the ID for the altered
        // report.
        Report stripped = current;
        stripped.minLength = 0;
        return rm.getInternalId(stripped);
    });
}
/**
 * Set the min_offset param to zero and the max_offset param to MAX_OFFSET for
 * all reports.
 *
 * Reports that carry no offset bounds are returned unchanged. Any min_length
 * on a report is preserved; only the offset bounds are reset.
 */
static
void clearOffsetParams(NGHolder &g, ReportManager &rm) {
    DEBUG_PRINTF("clearing min and max offset\n");
    replaceReports(g, [&rm](NFAVertex, ReportID id) {
        const auto &report = rm.getReport(id);
        // Key on the offset bounds themselves. Testing min_length here would
        // skip exactly the reports this function exists to rewrite: the
        // anchoring pass that calls it only runs on graphs whose reports have
        // no min_length, so such a guard would never fire.
        if (report.minOffset || report.maxOffset != MAX_OFFSET) {
            Report new_report = report; // copy, then strip the offsets
            new_report.minOffset = 0;
            new_report.maxOffset = MAX_OFFSET;
            return rm.getInternalId(new_report);
        }
        return id;
    });
}
/**
* If the pattern is unanchored, has a max_offset and has not asked for SOM, we
* can use that knowledge to anchor it which will limit its lifespan. Note that
* we can't use this transformation if there's a min_length, as it's currently
* handled using "sly SOM".
*
* Note that it is possible to handle graphs that have a combination of
* anchored and unanchored paths, but it's too tricky for the moment.
*/
static
bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr,
const depth &minWidth,
const depth &maxWidth) {
assert(!expr.som);
assert(expr.max_offset != MAX_OFFSET);
assert(minWidth <= maxWidth);
assert(maxWidth.is_reachable());
DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n",
minWidth.str().c_str(), maxWidth.str().c_str(),
expr.min_offset, expr.max_offset);
if (expr.max_offset > MAX_MAXOFFSET_TO_ANCHOR) {
bool anchorPatternWithBoundedRepeat(NGHolder &g, ReportManager &rm) {
if (!isFloating(g)) {
return false;
}
if (expr.max_offset < minWidth) {
const auto &reports = all_reports(g);
if (reports.empty()) {
return false;
}
if (any_of_in(reports, [&](ReportID id) {
const auto &report = rm.getReport(id);
return report.maxOffset == MAX_OFFSET || report.minLength ||
report.offsetAdjust;
})) {
return false;
}
if (!hasSameBounds(reports, rm)) {
DEBUG_PRINTF("mixed report bounds\n");
return false;
}
const depth minWidth = findMinWidth(g);
const depth maxWidth = findMaxWidth(g);
assert(minWidth <= maxWidth);
assert(maxWidth.is_reachable());
const auto &first_report = rm.getReport(*reports.begin());
const auto min_offset = first_report.minOffset;
const auto max_offset = first_report.maxOffset;
assert(max_offset < MAX_OFFSET);
DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n",
minWidth.str().c_str(), maxWidth.str().c_str(),
min_offset, max_offset);
if (max_offset > MAX_MAXOFFSET_TO_ANCHOR) {
return false;
}
if (max_offset < minWidth) {
assert(0);
return false;
}
@ -237,10 +340,10 @@ bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr,
u32 min_bound, max_bound;
if (maxWidth.is_infinite()) {
min_bound = 0;
max_bound = expr.max_offset - minWidth;
max_bound = max_offset - minWidth;
} else {
min_bound = expr.min_offset > maxWidth ? expr.min_offset - maxWidth : 0;
max_bound = expr.max_offset - minWidth;
min_bound = min_offset > maxWidth ? min_offset - maxWidth : 0;
max_bound = max_offset - minWidth;
}
DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound);
@ -293,6 +396,13 @@ bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr,
renumber_vertices(g);
renumber_edges(g);
if (minWidth == maxWidth) {
// For a fixed width pattern, we can retire the offsets as
// they are implicit in the graph now.
clearOffsetParams(g, rm);
}
clearReports(g);
return true;
}
@ -341,17 +451,27 @@ bool hasOffsetAdjust(const ReportManager &rm, NGHolder &g,
return true;
}
/** If the pattern has a min_length and is of "ratchet" form with one unbounded
/**
* If the pattern has a min_length and is of "ratchet" form with one unbounded
* repeat, that repeat can become a bounded repeat.
*
* /foo.*bar/{min_length=100} --> /foo.{94,}bar/
*/
static
bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
ExpressionInfo &expr) {
assert(expr.min_length);
bool transformMinLengthToRepeat(NGHolder &g, ReportManager &rm) {
const auto &reports = all_reports(g);
if (expr.min_length > MAX_MINLENGTH_TO_CONVERT) {
if (reports.empty()) {
return false;
}
if (!hasSameBounds(reports, rm)) {
DEBUG_PRINTF("mixed report bounds\n");
return false;
}
const auto &min_length = rm.getReport(*reports.begin()).minLength;
if (!min_length || min_length > MAX_MINLENGTH_TO_CONVERT) {
return false;
}
@ -381,7 +501,6 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
u32 width = 0;
// Walk from the start vertex to the cyclic state and ensure we have a
// chain of vertices.
while (v != cyclic) {
@ -443,10 +562,10 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
DEBUG_PRINTF("width=%u, vertex %zu is cyclic\n", width,
g[cyclic].index);
if (width >= expr.min_length) {
if (width >= min_length) {
DEBUG_PRINTF("min_length=%llu is guaranteed, as width=%u\n",
expr.min_length, width);
expr.min_length = 0;
min_length, width);
clearMinLengthParam(g, rm);
return true;
}
@ -474,7 +593,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
const CharReach &cr = g[cyclic].char_reach;
for (u32 i = 0; i < expr.min_length - width - 1; ++i) {
for (u32 i = 0; i < min_length - width - 1; ++i) {
v = add_vertex(g);
g[v].char_reach = cr;
@ -491,9 +610,8 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g,
renumber_vertices(g);
renumber_edges(g);
clearMinLengthParam(g, rm);
clearReports(g);
expr.min_length = 0;
return true;
}
@ -511,8 +629,8 @@ bool hasExtParams(const ExpressionInfo &expr) {
return false;
}
static
depth maxDistFromStart(const NFAVertexBidiDepth &d) {
template<class VertexDepth>
depth maxDistFromStart(const VertexDepth &d) {
if (!d.fromStartDotStar.max.is_unreachable()) {
// A path from startDs, any path, implies we can match at any offset.
return depth::infinity();
@ -541,7 +659,7 @@ const depth& minDistToAccept(const NFAVertexBidiDepth &d) {
}
static
bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr,
bool isEdgePrunable(const NGHolder &g, const Report &report,
const vector<NFAVertexBidiDepth> &depths,
const NFAEdge &e) {
const NFAVertex u = source(e, g);
@ -570,29 +688,29 @@ bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr,
const NFAVertexBidiDepth &du = depths.at(u_idx);
const NFAVertexBidiDepth &dv = depths.at(v_idx);
if (expr.min_offset) {
if (report.minOffset) {
depth max_offset = maxDistFromStart(du) + maxDistToAccept(dv);
if (max_offset.is_finite() && max_offset < expr.min_offset) {
if (max_offset.is_finite() && max_offset < report.minOffset) {
DEBUG_PRINTF("max_offset=%s too small\n", max_offset.str().c_str());
return true;
}
}
if (expr.max_offset != MAX_OFFSET) {
if (report.maxOffset != MAX_OFFSET) {
depth min_offset = minDistFromStart(du) + minDistToAccept(dv);
assert(min_offset.is_finite());
if (min_offset > expr.max_offset) {
if (min_offset > report.maxOffset) {
DEBUG_PRINTF("min_offset=%s too large\n", min_offset.str().c_str());
return true;
}
}
if (expr.min_length && is_any_accept(v, g)) {
if (report.minLength && is_any_accept(v, g)) {
// Simple take on min_length. If we're an edge to accept and our max
// dist from start is too small, we can be pruned.
const depth &width = du.fromStart.max;
if (width.is_finite() && width < expr.min_length) {
if (width.is_finite() && width < report.minLength) {
DEBUG_PRINTF("max width %s from start too small for min_length\n",
width.str().c_str());
return true;
@ -603,14 +721,26 @@ bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr,
}
static
void pruneExtUnreachable(NGHolder &g, const ExpressionInfo &expr) {
void pruneExtUnreachable(NGHolder &g, const ReportManager &rm) {
const auto &reports = all_reports(g);
if (reports.empty()) {
return;
}
if (!hasSameBounds(reports, rm)) {
DEBUG_PRINTF("report bounds vary\n");
return;
}
const auto &report = rm.getReport(*reports.begin());
vector<NFAVertexBidiDepth> depths;
calcDepths(g, depths);
vector<NFAEdge> dead;
for (const auto &e : edges_range(g)) {
if (isEdgePrunable(g, expr, depths, e)) {
if (isEdgePrunable(g, report, depths, e)) {
DEBUG_PRINTF("pruning\n");
dead.push_back(e);
}
@ -622,32 +752,45 @@ void pruneExtUnreachable(NGHolder &g, const ExpressionInfo &expr) {
remove_edges(dead, g);
pruneUseless(g);
clearReports(g);
}
/** Remove vacuous edges in graphs where the min_offset or min_length
* constraints dictate that they can never produce a match. */
/**
* Remove vacuous edges in graphs where the min_offset or min_length
* constraints dictate that they can never produce a match.
*/
static
void pruneVacuousEdges(NGHolder &g, const ExpressionInfo &expr) {
if (!expr.min_length && !expr.min_offset) {
return;
}
void pruneVacuousEdges(NGHolder &g, const ReportManager &rm) {
vector<NFAEdge> dead;
auto has_min_offset = [&](NFAVertex v) {
assert(!g[v].reports.empty()); // must be reporter
return all_of_in(g[v].reports, [&](ReportID id) {
return rm.getReport(id).minOffset > 0;
});
};
auto has_min_length = [&](NFAVertex v) {
assert(!g[v].reports.empty()); // must be reporter
return all_of_in(g[v].reports, [&](ReportID id) {
return rm.getReport(id).minLength > 0;
});
};
for (const auto &e : edges_range(g)) {
const NFAVertex u = source(e, g);
const NFAVertex v = target(e, g);
// Special case: Crudely remove vacuous edges from start in graphs with a
// min_offset.
if (expr.min_offset && u == g.start && is_any_accept(v, g)) {
// Special case: Crudely remove vacuous edges from start in graphs with
// a min_offset.
if (u == g.start && is_any_accept(v, g) && has_min_offset(u)) {
DEBUG_PRINTF("vacuous edge in graph with min_offset!\n");
dead.push_back(e);
continue;
}
// If a min_length is set, vacuous edges can be removed.
if (expr.min_length && is_any_start(u, g) && is_any_accept(v, g)) {
if (is_any_start(u, g) && is_any_accept(v, g) && has_min_length(u)) {
DEBUG_PRINTF("vacuous edge in graph with min_length!\n");
dead.push_back(e);
continue;
@ -658,13 +801,14 @@ void pruneVacuousEdges(NGHolder &g, const ExpressionInfo &expr) {
return;
}
DEBUG_PRINTF("removing %zu vacuous edges\n", dead.size());
remove_edges(dead, g);
pruneUseless(g);
clearReports(g);
}
static
void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
const vector<DepthMinMax> &depths,
void pruneUnmatchable(NGHolder &g, const vector<DepthMinMax> &depths,
const ReportManager &rm, NFAVertex accept) {
vector<NFAEdge> dead;
@ -675,6 +819,11 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
continue;
}
if (!hasSameBounds(g[v].reports, rm)) {
continue;
}
const auto &report = rm.getReport(*g[v].reports.begin());
u32 idx = g[v].index;
DepthMinMax d = depths[idx]; // copy
pair<s32, s32> adj = getMinMaxOffsetAdjust(rm, g, v);
@ -683,16 +832,16 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
d.min += adj.first;
d.max += adj.second;
if (d.max.is_finite() && d.max < expr.min_length) {
if (d.max.is_finite() && d.max < report.minLength) {
DEBUG_PRINTF("prune, max match length %s < min_length=%llu\n",
d.max.str().c_str(), expr.min_length);
d.max.str().c_str(), report.minLength);
dead.push_back(e);
continue;
}
if (expr.max_offset != MAX_OFFSET && d.min > expr.max_offset) {
if (report.maxOffset != MAX_OFFSET && d.min > report.maxOffset) {
DEBUG_PRINTF("prune, min match length %s > max_offset=%llu\n",
d.min.str().c_str(), expr.max_offset);
d.min.str().c_str(), report.maxOffset);
dead.push_back(e);
continue;
}
@ -701,47 +850,36 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
remove_edges(dead, g);
}
/** Remove edges to accepts that can never produce a match long enough to
* satisfy our min_length and max_offset constraints. */
/**
* Remove edges to accepts that can never produce a match long enough to
* satisfy our min_length and max_offset constraints.
*/
static
void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr,
const ReportManager &rm) {
if (!expr.min_length) {
void pruneUnmatchable(NGHolder &g, const ReportManager &rm) {
if (!any_of_in(all_reports(g), [&](ReportID id) {
return rm.getReport(id).minLength > 0;
})) {
return;
}
vector<DepthMinMax> depths = getDistancesFromSOM(g);
pruneUnmatchable(g, expr, depths, rm, g.accept);
pruneUnmatchable(g, expr, depths, rm, g.acceptEod);
pruneUnmatchable(g, depths, rm, g.accept);
pruneUnmatchable(g, depths, rm, g.acceptEod);
pruneUseless(g);
}
static
bool isUnanchored(const NGHolder &g) {
for (auto v : adjacent_vertices_range(g.start, g)) {
if (!edge(g.startDs, v, g).second) {
DEBUG_PRINTF("fail, %zu is anchored vertex\n", g[v].index);
return false;
}
}
return true;
clearReports(g);
}
static
bool hasOffsetAdjustments(const ReportManager &rm, const NGHolder &g) {
for (auto report : all_reports(g)) {
const Report &ir = rm.getReport(report);
if (ir.offsetAdjust) {
return true;
}
}
return false;
return any_of_in(all_reports(g), [&rm](ReportID id) {
return rm.getReport(id).offsetAdjust != 0;
});
}
void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
UNUSED const CompileContext &cc) {
void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr,
ReportManager &rm) {
if (!hasExtParams(expr)) {
return;
}
@ -750,11 +888,6 @@ void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
depth maxWidth = findMaxWidth(g);
bool is_anchored = !has_proper_successor(g.startDs, g)
&& out_degree(g.start, g);
bool has_offset_adj = hasOffsetAdjustments(rm, g);
DEBUG_PRINTF("minWidth=%s, maxWidth=%s, anchored=%d, offset_adj=%d\n",
minWidth.str().c_str(), maxWidth.str().c_str(), is_anchored,
has_offset_adj);
DepthMinMax match_depths = findMatchLengths(rm, g);
DEBUG_PRINTF("match depths %s\n", match_depths.str().c_str());
@ -792,91 +925,122 @@ void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
return;
}
pruneVacuousEdges(g, expr);
pruneUnmatchable(g, expr, rm);
updateReportBounds(rm, g, expr);
}
if (!has_offset_adj) {
pruneExtUnreachable(g, expr);
/**
* If the pattern is completely anchored and has a min_length set, this can
* be converted to a min_offset.
*/
static
void replaceMinLengthWithOffset(NGHolder &g, ReportManager &rm) {
if (has_proper_successor(g.startDs, g)) {
return; // not wholly anchored
}
// We may have removed all the edges to accept, in which case this
// expression cannot match.
if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) {
throw CompileError(expr.index, "Extended parameter "
"constraints can not be satisfied for any match from "
"this expression.");
replaceReports(g, [&rm](NFAVertex, ReportID id) {
const auto &report = rm.getReport(id);
if (report.minLength) {
Report new_report = report;
u64a min_len_offset = report.minLength - report.offsetAdjust;
new_report.minOffset = max(report.minOffset, min_len_offset);
new_report.minLength = 0;
return rm.getInternalId(new_report);
}
return id;
});
}
/**
 * Drop offset bounds from reports where the depth of the reporting vertex
 * already satisfies them.
 *
 * A min_offset is cleared when the report has no offset adjustment and the
 * bound does not exceed the vertex's minimum depth (taken over start and
 * startDs). A max_offset is cleared when the vertex's maximum distance from
 * start is finite and no greater than the bound.
 */
static
void removeUnneededOffsetBounds(NGHolder &g, ReportManager &rm) {
    vector<NFAVertexDepth> depths;
    calcDepths(g, depths);

    replaceReports(g, [&](NFAVertex v, ReportID id) {
        const auto &vertex_depth = depths.at(g[v].index);
        const depth &shallowest = min(vertex_depth.fromStartDotStar.min,
                                      vertex_depth.fromStart.min);
        const depth &deepest = maxDistFromStart(vertex_depth);

        DEBUG_PRINTF("vertex %zu has min_depth=%s, max_depth=%s\n", g[v].index,
                     shallowest.str().c_str(), deepest.str().c_str());

        const Report &current = rm.getReport(id);

        // Decide up front which bounds are implied by the vertex depth.
        const bool min_implied = current.minOffset && !current.offsetAdjust &&
                                 current.minOffset <= shallowest;
        const bool max_implied = current.maxOffset != MAX_OFFSET &&
                                 deepest.is_finite() &&
                                 current.maxOffset >= deepest;

        if (!min_implied && !max_implied) {
            return id; // Nothing to relax on this report.
        }

        Report relaxed = current; // copy before asking for a new ID
        if (min_implied) {
            relaxed.minOffset = 0;
        }
        if (max_implied) {
            relaxed.maxOffset = MAX_OFFSET;
        }

        DEBUG_PRINTF("vertex %zu, changed bounds to [%llu,%llu]\n",
                     g[v].index, relaxed.minOffset, relaxed.maxOffset);
        return rm.getInternalId(relaxed);
    });
}
void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som) {
if (!any_of_in(all_reports(g),
[&](ReportID id) { return rm.getReport(id).hasBounds(); })) {
DEBUG_PRINTF("no extparam bounds\n");
return;
}
// Remove reports on vertices without an edge to accept (which have been
// pruned above).
clearReports(g);
DEBUG_PRINTF("graph has extparam bounds\n");
// Recalc.
minWidth = findMinWidth(g);
maxWidth = findMaxWidth(g);
is_anchored = proper_out_degree(g.startDs, g) == 0 &&
out_degree(g.start, g);
has_offset_adj = hasOffsetAdjustments(rm, g);
// If the pattern is completely anchored and has a min_length set, this can
// be converted to a min_offset.
if (expr.min_length && (expr.min_offset <= expr.min_length) &&
is_anchored) {
DEBUG_PRINTF("convertinexpr.min_length to min_offset=%llu for "
"anchored case\n", expr.min_length);
expr.min_offset = expr.min_length;
expr.min_length = 0;
pruneVacuousEdges(g, rm);
if (can_never_match(g)) {
return;
}
if (expr.min_offset && expr.min_offset <= minWidth && !has_offset_adj) {
DEBUG_PRINTF("min_offset=%llu constraint is unnecessary\n",
expr.min_offset);
expr.min_offset = 0;
pruneUnmatchable(g, rm);
if (can_never_match(g)) {
return;
}
if (!hasExtParams(expr)) {
if (!hasOffsetAdjustments(rm, g)) {
pruneExtUnreachable(g, rm);
if (can_never_match(g)) {
return;
}
}
replaceMinLengthWithOffset(g, rm);
if (can_never_match(g)) {
return;
}
// If the pattern has a min_length and is of "ratchet" form with one
// unbounded repeat, that repeat can become a bounded repeat.
// e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/
if (expr.min_length && transformMinLengthToRepeat(rm, g, expr)) {
DEBUG_PRINTF("converted min_length to bounded repeat\n");
// recalc
minWidth = findMinWidth(g);
transformMinLengthToRepeat(g, rm);
if (can_never_match(g)) {
return;
}
// If the pattern is unanchored, has a max_offset and has not asked for
// SOM, we can use that knowledge to anchor it which will limit its
// lifespan. Note that we can't use this transformation if there's a
// min_length, as it's currently handled using "sly SOM".
// Note that it is possible to handle graphs that have a combination of
// anchored and unanchored paths, but it's too tricky for the moment.
if (expr.max_offset != MAX_OFFSET && !expr.som && !expr.min_length &&
!has_offset_adj && isUnanchored(g)) {
if (anchorPatternWithBoundedRepeat(g, expr, minWidth, maxWidth)) {
DEBUG_PRINTF("minWidth=%s, maxWidth=%s\n", minWidth.str().c_str(),
maxWidth.str().c_str());
if (minWidth == maxWidth) {
// For a fixed width pattern, we can retire the offsets as they
// are implicit in the graph now.
expr.min_offset = 0;
expr.max_offset = MAX_OFFSET;
}
if (som == SOM_NONE) {
anchorPatternWithBoundedRepeat(g, rm);
if (can_never_match(g)) {
return;
}
}
//dumpGraph("final.dot", g);
if (!hasExtParams(expr)) {
return;
}
set<NFAVertex> done;
updateReportBounds(rm, g, expr, g.accept, done);
updateReportBounds(rm, g, expr, g.acceptEod, done);
removeUnneededOffsetBounds(g, rm);
}
} // namespace ue2

View File

@ -34,15 +34,30 @@
#ifndef NG_EXTPARAM_H
#define NG_EXTPARAM_H
#include "som/som.h"
namespace ue2 {
struct CompileContext;
class ExpressionInfo;
class NGHolder;
class ReportManager;
void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr,
const CompileContext &cc);
/**
* \brief Propagate extended parameter information to vertex reports. Will
* throw CompileError if this expression's extended parameters are not
* satisfiable.
*
* This will also remove extended parameter constraints that are guaranteed to
* be satisfied from ExpressionInfo.
*/
void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr,
ReportManager &rm);
/**
* \brief Perform graph reductions (if possible) to do with extended parameter
* constraints on reports.
*/
void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som);
} // namespace ue2

View File

@ -41,6 +41,7 @@
#include "nfagraph/ng_depth.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_mcclellan.h"
#include "nfagraph/ng_reports.h"
#include "nfagraph/ng_prune.h"
#include "nfagraph/ng_util.h"
#include "smallwrite/smallwrite_internal.h"
@ -179,8 +180,23 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) {
return;
}
if (expr.som || expr.min_length || isVacuous(g)) {
poisoned = true; /* cannot support in smwr */
if (expr.som) {
DEBUG_PRINTF("no SOM support in small-write engine\n");
poisoned = true;
return;
}
if (isVacuous(g)) {
DEBUG_PRINTF("no vacuous graph support in small-write engine\n");
poisoned = true;
return;
}
if (any_of_in(::ue2::all_reports(g), [&](ReportID id) {
return rm.getReport(id).minLength > 0;
})) {
DEBUG_PRINTF("no min_length extparam support in small-write engine\n");
poisoned = true;
return;
}

View File

@ -168,7 +168,7 @@ static const expected_info ei_test[] = {
// Some cases with extended parameters.
{"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0},
@ -185,7 +185,7 @@ static const expected_info ei_test[] = {
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
4, UINT_MAX, 0, 0, 0},
10, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
@ -194,7 +194,7 @@ static const expected_info ei_test[] = {
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2},
4, 8, 0, 0, 0},
8, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},