diff --git a/src/hs.cpp b/src/hs.cpp index c1e1cdce..b9d3b356 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -388,7 +388,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, // fuzz graph - this must happen before any transformations are made make_fuzzy(*g, expr.edit_distance, cc.grey); - handleExtendedParams(rm, *g, expr, cc); + propagateExtendedParams(*g, expr, rm); fillExpressionInfo(rm, *g, expr, &local_info); } catch (const CompileError &e) { diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index dc74dcee..8b247c74 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -214,6 +214,7 @@ bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr, assert(allMatchStatesHaveReports(g)); + reduceExtendedParams(g, ng.rm, som); reduceGraph(g, som, expr.utf8, cc); dumpComponent(g, "02_reduced", expr.index, comp_id, ng.cc.grey); @@ -223,6 +224,13 @@ bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr, removeRegionRedundancy(g, som); } + // We might be done at this point: if we've run out of vertices, we can + // stop processing. + if (num_vertices(g) == N_SPECIALS) { + DEBUG_PRINTF("all vertices claimed\n"); + return true; + } + // "Short Exhaustible Passthrough" patterns always become outfixes. if (!som && isSEP(g, ng.rm, cc.grey)) { DEBUG_PRINTF("graph is SEP\n"); @@ -358,10 +366,22 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr g_ptr) { optimiseVirtualStarts(g); /* good for som */ - handleExtendedParams(rm, g, expr, cc); - if (expr.min_length) { - // We have a minimum length constraint, which we currently use SOM to - // satisfy. + propagateExtendedParams(g, expr, rm); + reduceExtendedParams(g, rm, som); + + // We may have removed all the edges to accept, in which case this + // expression cannot match. 
+ if (can_never_match(g)) { + throw CompileError(expr.index, "Extended parameter constraints can not " + "be satisfied for any match from this " + "expression."); + } + + if (any_of_in(all_reports(g), [&](ReportID id) { + return rm.getReport(id).minLength; + })) { + // We have at least one report with a minimum length constraint, which + // we currently use SOM to satisfy. som = SOM_LEFT; ssm.somPrecision(8); } @@ -377,10 +397,16 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr g_ptr) { relaxForbiddenUtf8(g, expr); } - if (expr.highlander && !expr.min_length && !expr.min_offset) { + if (all_of_in(all_reports(g), [&](ReportID id) { + const auto &report = rm.getReport(id); + return report.ekey != INVALID_EKEY && !report.minLength && + !report.minOffset; + })) { // In highlander mode: if we don't have constraints on our reports that // may prevent us accepting our first match (i.e. extended params) we // can prune the other out-edges of all vertices connected to accept. + // TODO: shift the report checking down into pruneHighlanderAccepts() + // to allow us to handle the parts we can in mixed cases. pruneHighlanderAccepts(g, rm); } diff --git a/src/nfagraph/ng_extparam.cpp b/src/nfagraph/ng_extparam.cpp index 31a1f81b..19fa2295 100644 --- a/src/nfagraph/ng_extparam.cpp +++ b/src/nfagraph/ng_extparam.cpp @@ -26,12 +26,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Propagate extended parameters to vertex reports and reduce graph if * possible. * * This code handles the propagation of the extension parameters specified by - * the user with the hs_expr_ext structure into the reports on the graph's + * the user with the \ref hs_expr_ext structure into the reports on the graph's * vertices. 
* * There are also some analyses that prune edges that cannot contribute to a @@ -68,8 +69,28 @@ namespace ue2 { static const u32 MAX_MAXOFFSET_TO_ANCHOR = 2000; static const u32 MAX_MINLENGTH_TO_CONVERT = 2000; -/** \brief Find the (min, max) offset adjustment for the reports on a given - * vertex. */ +/** True if all the given reports have the same extparam bounds. */ +template +bool hasSameBounds(const Container &reports, const ReportManager &rm) { + assert(!reports.empty()); + + const auto &first = rm.getReport(*reports.begin()); + for (auto id : reports) { + const auto &report = rm.getReport(id); + if (report.minOffset != first.minOffset || + report.maxOffset != first.maxOffset || + report.minLength != first.minLength) { + return false; + } + } + + return true; +} + +/** + * \brief Find the (min, max) offset adjustment for the reports on a given + * vertex. + */ static pair getMinMaxOffsetAdjust(const ReportManager &rm, const NGHolder &g, NFAVertex v) { @@ -130,55 +151,76 @@ DepthMinMax findMatchLengths(const ReportManager &rm, const NGHolder &g) { return match_depths; } +template +void replaceReports(NGHolder &g, NFAVertex accept, flat_set &seen, + Function func) { + for (auto v : inv_adjacent_vertices_range(accept, g)) { + if (v == g.accept) { + // Don't operate on accept: the accept->acceptEod edge is stylised. + assert(accept == g.acceptEod); + assert(g[v].reports.empty()); + continue; + } + + if (!seen.insert(v).second) { + continue; // We have already processed v. + } + + auto &reports = g[v].reports; + if (reports.empty()) { + continue; + } + decltype(g[v].reports) new_reports; + for (auto id : g[v].reports) { + new_reports.insert(func(v, id)); + } + reports = std::move(new_reports); + } +} + +/** + * Generic function for replacing all the reports in the graph. + * + * Pass this a function that takes a vertex and a ReportID and returns another + * ReportID (or the same one) to replace it with.
+ */ +template +void replaceReports(NGHolder &g, Function func) { + flat_set seen; + replaceReports(g, g.accept, seen, func); + replaceReports(g, g.acceptEod, seen, func); +} + /** \brief Replace the graph's reports with new reports that specify bounds. */ static void updateReportBounds(ReportManager &rm, NGHolder &g, - const ExpressionInfo &expr, NFAVertex accept, - set &done) { - for (auto v : inv_adjacent_vertices_range(accept, g)) { - // Don't operate on g.accept itself. - if (v == g.accept) { - assert(accept == g.acceptEod); - continue; + const ExpressionInfo &expr) { + DEBUG_PRINTF("updating report bounds\n"); + replaceReports(g, [&](NFAVertex, ReportID id) { + Report report = rm.getReport(id); // make a copy + assert(!report.hasBounds()); + + // Note that we need to cope with offset adjustment here. + + report.minOffset = expr.min_offset - report.offsetAdjust; + if (expr.max_offset == MAX_OFFSET) { + report.maxOffset = MAX_OFFSET; + } else { + report.maxOffset = expr.max_offset - report.offsetAdjust; + } + assert(report.maxOffset >= report.minOffset); + + report.minLength = expr.min_length; + if (expr.min_length && !expr.som) { + report.quashSom = true; } - // Don't operate on a vertex we've already done. - if (contains(done, v)) { - continue; - } - done.insert(v); + DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, " + "min_length=%llu\n", id, report.minOffset, + report.maxOffset, report.minLength); - flat_set new_reports; - auto &reports = g[v].reports; - - for (auto id : reports) { - Report ir = rm.getReport(id); // make a copy - assert(!ir.hasBounds()); - - // Note that we need to cope with offset adjustment here. 
- - ir.minOffset = expr.min_offset - ir.offsetAdjust; - if (expr.max_offset == MAX_OFFSET) { - ir.maxOffset = MAX_OFFSET; - } else { - ir.maxOffset = expr.max_offset - ir.offsetAdjust; - } - assert(ir.maxOffset >= ir.minOffset); - - ir.minLength = expr.min_length; - if (expr.min_length && !expr.som) { - ir.quashSom = true; - } - - DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, " - "min_length=%llu\n", - id, ir.minOffset, ir.maxOffset, ir.minLength); - new_reports.insert(rm.getInternalId(ir)); - } - - DEBUG_PRINTF("swapping reports on vertex %zu\n", g[v].index); - reports.swap(new_reports); - } + return rm.getInternalId(report); + }); } static @@ -191,32 +233,93 @@ bool hasVirtualStarts(const NGHolder &g) { return false; } -/** If the pattern is unanchored, has a max_offset and has not asked for SOM, - * we can use that knowledge to anchor it which will limit its lifespan. Note - * that we can't use this transformation if there's a min_length, as it's - * currently handled using "sly SOM". +/** Set the min_length param for all reports to zero. */ +static +void clearMinLengthParam(NGHolder &g, ReportManager &rm) { + DEBUG_PRINTF("clearing min length\n"); + replaceReports(g, [&rm](NFAVertex, ReportID id) { + const auto &report = rm.getReport(id); + if (report.minLength) { + Report new_report = report; + new_report.minLength = 0; + return rm.getInternalId(new_report); + } + return id; + }); +} + +/** + * Set the min_offset param to zero and the max_offset param to MAX_OFFSET for + * all reports. 
+ */ +static +void clearOffsetParams(NGHolder &g, ReportManager &rm) { + DEBUG_PRINTF("clearing min and max offset\n"); + replaceReports(g, [&rm](NFAVertex, ReportID id) { + const auto &report = rm.getReport(id); + if (report.minOffset || report.maxOffset != MAX_OFFSET) { + Report new_report = report; + new_report.minOffset = 0; + new_report.maxOffset = MAX_OFFSET; + return rm.getInternalId(new_report); + } + return id; + }); +} + +/** + * If the pattern is unanchored, has a max_offset and has not asked for SOM, we + * can use that knowledge to anchor it which will limit its lifespan. Note that + * we can't use this transformation if there's a min_length, as it's currently + * handled using "sly SOM". * * Note that it is possible to handle graphs that have a combination of * anchored and unanchored paths, but it's too tricky for the moment. */ static -bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr, - const depth &minWidth, - const depth &maxWidth) { - assert(!expr.som); - assert(expr.max_offset != MAX_OFFSET); - assert(minWidth <= maxWidth); - assert(maxWidth.is_reachable()); - - DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n", - minWidth.str().c_str(), maxWidth.str().c_str(), - expr.min_offset, expr.max_offset); - - if (expr.max_offset > MAX_MAXOFFSET_TO_ANCHOR) { +bool anchorPatternWithBoundedRepeat(NGHolder &g, ReportManager &rm) { + if (!isFloating(g)) { return false; } - if (expr.max_offset < minWidth) { + const auto &reports = all_reports(g); + if (reports.empty()) { + return false; + } + + if (any_of_in(reports, [&](ReportID id) { + const auto &report = rm.getReport(id); + return report.maxOffset == MAX_OFFSET || report.minLength || + report.offsetAdjust; + })) { + return false; + } + + if (!hasSameBounds(reports, rm)) { + DEBUG_PRINTF("mixed report bounds\n"); + return false; + } + + const depth minWidth = findMinWidth(g); + const depth maxWidth = findMaxWidth(g); + + assert(minWidth <= maxWidth); + assert(maxWidth.is_reachable()); + + const auto
&first_report = rm.getReport(*reports.begin()); + const auto min_offset = first_report.minOffset; + const auto max_offset = first_report.maxOffset; + assert(max_offset < MAX_OFFSET); + + DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n", + minWidth.str().c_str(), maxWidth.str().c_str(), + min_offset, max_offset); + + if (max_offset > MAX_MAXOFFSET_TO_ANCHOR) { + return false; + } + + if (max_offset < minWidth) { assert(0); return false; } @@ -237,10 +340,10 @@ bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr, u32 min_bound, max_bound; if (maxWidth.is_infinite()) { min_bound = 0; - max_bound = expr.max_offset - minWidth; + max_bound = max_offset - minWidth; } else { - min_bound = expr.min_offset > maxWidth ? expr.min_offset - maxWidth : 0; - max_bound = expr.max_offset - minWidth; + min_bound = min_offset > maxWidth ? min_offset - maxWidth : 0; + max_bound = max_offset - minWidth; } DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound); @@ -293,6 +396,13 @@ bool anchorPatternWithBoundedRepeat(NGHolder &g, const ExpressionInfo &expr, renumber_vertices(g); renumber_edges(g); + if (minWidth == maxWidth) { + // For a fixed width pattern, we can retire the offsets as + // they are implicit in the graph now. + clearOffsetParams(g, rm); + } + + clearReports(g); return true; } @@ -341,17 +451,27 @@ bool hasOffsetAdjust(const ReportManager &rm, NGHolder &g, return true; } -/** If the pattern has a min_length and is of "ratchet" form with one unbounded +/** + * If the pattern has a min_length and is of "ratchet" form with one unbounded * repeat, that repeat can become a bounded repeat. 
* * /foo.*bar/{min_length=100} --> /foo.{94,}bar/ */ static -bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g, - ExpressionInfo &expr) { - assert(expr.min_length); +bool transformMinLengthToRepeat(NGHolder &g, ReportManager &rm) { + const auto &reports = all_reports(g); - if (expr.min_length > MAX_MINLENGTH_TO_CONVERT) { + if (reports.empty()) { + return false; + } + + if (!hasSameBounds(reports, rm)) { + DEBUG_PRINTF("mixed report bounds\n"); + return false; + } + + const auto &min_length = rm.getReport(*reports.begin()).minLength; + if (!min_length || min_length > MAX_MINLENGTH_TO_CONVERT) { return false; } @@ -381,7 +501,6 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g, u32 width = 0; - // Walk from the start vertex to the cyclic state and ensure we have a // chain of vertices. while (v != cyclic) { @@ -443,10 +562,10 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g, DEBUG_PRINTF("width=%u, vertex %zu is cyclic\n", width, g[cyclic].index); - if (width >= expr.min_length) { + if (width >= min_length) { DEBUG_PRINTF("min_length=%llu is guaranteed, as width=%u\n", - expr.min_length, width); - expr.min_length = 0; + min_length, width); + clearMinLengthParam(g, rm); return true; } @@ -474,7 +593,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g, const CharReach &cr = g[cyclic].char_reach; - for (u32 i = 0; i < expr.min_length - width - 1; ++i) { + for (u32 i = 0; i < min_length - width - 1; ++i) { v = add_vertex(g); g[v].char_reach = cr; @@ -491,9 +610,8 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGHolder &g, renumber_vertices(g); renumber_edges(g); + clearMinLengthParam(g, rm); clearReports(g); - - expr.min_length = 0; return true; } @@ -511,8 +629,8 @@ bool hasExtParams(const ExpressionInfo &expr) { return false; } -static -depth maxDistFromStart(const NFAVertexBidiDepth &d) { +template +depth maxDistFromStart(const VertexDepth &d) { if 
(!d.fromStartDotStar.max.is_unreachable()) { // A path from startDs, any path, implies we can match at any offset. return depth::infinity(); @@ -541,7 +659,7 @@ const depth& minDistToAccept(const NFAVertexBidiDepth &d) { } static -bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr, +bool isEdgePrunable(const NGHolder &g, const Report &report, const vector &depths, const NFAEdge &e) { const NFAVertex u = source(e, g); @@ -570,29 +688,29 @@ bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr, const NFAVertexBidiDepth &du = depths.at(u_idx); const NFAVertexBidiDepth &dv = depths.at(v_idx); - if (expr.min_offset) { + if (report.minOffset) { depth max_offset = maxDistFromStart(du) + maxDistToAccept(dv); - if (max_offset.is_finite() && max_offset < expr.min_offset) { + if (max_offset.is_finite() && max_offset < report.minOffset) { DEBUG_PRINTF("max_offset=%s too small\n", max_offset.str().c_str()); return true; } } - if (expr.max_offset != MAX_OFFSET) { + if (report.maxOffset != MAX_OFFSET) { depth min_offset = minDistFromStart(du) + minDistToAccept(dv); assert(min_offset.is_finite()); - if (min_offset > expr.max_offset) { + if (min_offset > report.maxOffset) { DEBUG_PRINTF("min_offset=%s too large\n", min_offset.str().c_str()); return true; } } - if (expr.min_length && is_any_accept(v, g)) { + if (report.minLength && is_any_accept(v, g)) { // Simple take on min_length. If we're an edge to accept and our max // dist from start is too small, we can be pruned. 
const depth &width = du.fromStart.max; - if (width.is_finite() && width < expr.min_length) { + if (width.is_finite() && width < report.minLength) { DEBUG_PRINTF("max width %s from start too small for min_length\n", width.str().c_str()); return true; @@ -603,14 +721,26 @@ bool isEdgePrunable(const NGHolder &g, const ExpressionInfo &expr, } static -void pruneExtUnreachable(NGHolder &g, const ExpressionInfo &expr) { +void pruneExtUnreachable(NGHolder &g, const ReportManager &rm) { + const auto &reports = all_reports(g); + if (reports.empty()) { + return; + } + + if (!hasSameBounds(reports, rm)) { + DEBUG_PRINTF("report bounds vary\n"); + return; + } + + const auto &report = rm.getReport(*reports.begin()); + vector depths; calcDepths(g, depths); vector dead; for (const auto &e : edges_range(g)) { - if (isEdgePrunable(g, expr, depths, e)) { + if (isEdgePrunable(g, report, depths, e)) { DEBUG_PRINTF("pruning\n"); dead.push_back(e); } @@ -622,32 +752,45 @@ void pruneExtUnreachable(NGHolder &g, const ExpressionInfo &expr) { remove_edges(dead, g); pruneUseless(g); + clearReports(g); } -/** Remove vacuous edges in graphs where the min_offset or min_length - * constraints dictate that they can never produce a match. */ +/** + * Remove vacuous edges in graphs where the min_offset or min_length + * constraints dictate that they can never produce a match. 
+ */ static -void pruneVacuousEdges(NGHolder &g, const ExpressionInfo &expr) { - if (!expr.min_length && !expr.min_offset) { - return; - } - +void pruneVacuousEdges(NGHolder &g, const ReportManager &rm) { vector dead; + auto has_min_offset = [&](NFAVertex v) { + assert(!g[v].reports.empty()); // must be reporter + return all_of_in(g[v].reports, [&](ReportID id) { + return rm.getReport(id).minOffset > 0; + }); + }; + + auto has_min_length = [&](NFAVertex v) { + assert(!g[v].reports.empty()); // must be reporter + return all_of_in(g[v].reports, [&](ReportID id) { + return rm.getReport(id).minLength > 0; + }); + }; + for (const auto &e : edges_range(g)) { const NFAVertex u = source(e, g); const NFAVertex v = target(e, g); - // Special case: Crudely remove vacuous edges from start in graphs with a - // min_offset. - if (expr.min_offset && u == g.start && is_any_accept(v, g)) { + // Special case: Crudely remove vacuous edges from start in graphs with + // a min_offset. + if (u == g.start && is_any_accept(v, g) && has_min_offset(u)) { DEBUG_PRINTF("vacuous edge in graph with min_offset!\n"); dead.push_back(e); continue; } // If a min_length is set, vacuous edges can be removed. 
- if (expr.min_length && is_any_start(u, g) && is_any_accept(v, g)) { + if (is_any_start(u, g) && is_any_accept(v, g) && has_min_length(u)) { DEBUG_PRINTF("vacuous edge in graph with min_length!\n"); dead.push_back(e); continue; @@ -658,13 +801,14 @@ void pruneVacuousEdges(NGHolder &g, const ExpressionInfo &expr) { return; } + DEBUG_PRINTF("removing %zu vacuous edges\n", dead.size()); remove_edges(dead, g); pruneUseless(g); + clearReports(g); } static -void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr, - const vector &depths, +void pruneUnmatchable(NGHolder &g, const vector &depths, const ReportManager &rm, NFAVertex accept) { vector dead; @@ -675,6 +819,11 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr, continue; } + if (!hasSameBounds(g[v].reports, rm)) { + continue; + } + const auto &report = rm.getReport(*g[v].reports.begin()); + u32 idx = g[v].index; DepthMinMax d = depths[idx]; // copy pair adj = getMinMaxOffsetAdjust(rm, g, v); @@ -683,16 +832,16 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr, d.min += adj.first; d.max += adj.second; - if (d.max.is_finite() && d.max < expr.min_length) { + if (d.max.is_finite() && d.max < report.minLength) { DEBUG_PRINTF("prune, max match length %s < min_length=%llu\n", - d.max.str().c_str(), expr.min_length); + d.max.str().c_str(), report.minLength); dead.push_back(e); continue; } - if (expr.max_offset != MAX_OFFSET && d.min > expr.max_offset) { + if (report.maxOffset != MAX_OFFSET && d.min > report.maxOffset) { DEBUG_PRINTF("prune, min match length %s > max_offset=%llu\n", - d.min.str().c_str(), expr.max_offset); + d.min.str().c_str(), report.maxOffset); dead.push_back(e); continue; } @@ -701,47 +850,36 @@ void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr, remove_edges(dead, g); } -/** Remove edges to accepts that can never produce a match long enough to - * satisfy our min_length and max_offset constraints. 
*/ +/** + * Remove edges to accepts that can never produce a match long enough to + * satisfy our min_length and max_offset constraints. + */ static -void pruneUnmatchable(NGHolder &g, const ExpressionInfo &expr, - const ReportManager &rm) { - if (!expr.min_length) { +void pruneUnmatchable(NGHolder &g, const ReportManager &rm) { + if (!any_of_in(all_reports(g), [&](ReportID id) { + return rm.getReport(id).minLength > 0; + })) { return; } vector depths = getDistancesFromSOM(g); - pruneUnmatchable(g, expr, depths, rm, g.accept); - pruneUnmatchable(g, expr, depths, rm, g.acceptEod); + pruneUnmatchable(g, depths, rm, g.accept); + pruneUnmatchable(g, depths, rm, g.acceptEod); pruneUseless(g); -} - -static -bool isUnanchored(const NGHolder &g) { - for (auto v : adjacent_vertices_range(g.start, g)) { - if (!edge(g.startDs, v, g).second) { - DEBUG_PRINTF("fail, %zu is anchored vertex\n", g[v].index); - return false; - } - } - return true; + clearReports(g); } static bool hasOffsetAdjustments(const ReportManager &rm, const NGHolder &g) { - for (auto report : all_reports(g)) { - const Report &ir = rm.getReport(report); - if (ir.offsetAdjust) { - return true; - } - } - return false; + return any_of_in(all_reports(g), [&rm](ReportID id) { + return rm.getReport(id).offsetAdjust != 0; + }); } -void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr, - UNUSED const CompileContext &cc) { +void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr, + ReportManager &rm) { if (!hasExtParams(expr)) { return; } @@ -750,11 +888,6 @@ void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr, depth maxWidth = findMaxWidth(g); bool is_anchored = !has_proper_successor(g.startDs, g) && out_degree(g.start, g); - bool has_offset_adj = hasOffsetAdjustments(rm, g); - - DEBUG_PRINTF("minWidth=%s, maxWidth=%s, anchored=%d, offset_adj=%d\n", - minWidth.str().c_str(), maxWidth.str().c_str(), is_anchored, - has_offset_adj); DepthMinMax match_depths = 
findMatchLengths(rm, g); DEBUG_PRINTF("match depths %s\n", match_depths.str().c_str()); @@ -792,91 +925,122 @@ void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr, return; } - pruneVacuousEdges(g, expr); - pruneUnmatchable(g, expr, rm); + updateReportBounds(rm, g, expr); +} - if (!has_offset_adj) { - pruneExtUnreachable(g, expr); +/** + * If the pattern is completely anchored and has a min_length set, this can + * be converted to a min_offset. + */ +static +void replaceMinLengthWithOffset(NGHolder &g, ReportManager &rm) { + if (has_proper_successor(g.startDs, g)) { + return; // not wholly anchored } - // We may have removed all the edges to accept, in which case this - // expression cannot match. - if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) { - throw CompileError(expr.index, "Extended parameter " - "constraints can not be satisfied for any match from " - "this expression."); + replaceReports(g, [&rm](NFAVertex, ReportID id) { + const auto &report = rm.getReport(id); + if (report.minLength) { + Report new_report = report; + u64a min_len_offset = report.minLength - report.offsetAdjust; + new_report.minOffset = max(report.minOffset, min_len_offset); + new_report.minLength = 0; + return rm.getInternalId(new_report); + } + return id; + }); +} + +/** + * Clear offset bounds on reports that are not needed because they're satisfied + * by vertex depth. 
+ */ +static +void removeUnneededOffsetBounds(NGHolder &g, ReportManager &rm) { + vector depths; + calcDepths(g, depths); + + replaceReports(g, [&](NFAVertex v, ReportID id) { + const auto &d = depths.at(g[v].index); + const depth &min_depth = min(d.fromStartDotStar.min, d.fromStart.min); + const depth &max_depth = maxDistFromStart(d); + + DEBUG_PRINTF("vertex %zu has min_depth=%s, max_depth=%s\n", g[v].index, + min_depth.str().c_str(), max_depth.str().c_str()); + + Report report = rm.getReport(id); // copy + bool modified = false; + if (report.minOffset && !report.offsetAdjust && + report.minOffset <= min_depth) { + report.minOffset = 0; + modified = true; + } + if (report.maxOffset != MAX_OFFSET && max_depth.is_finite() && + report.maxOffset >= max_depth) { + report.maxOffset = MAX_OFFSET; + modified = true; + } + if (modified) { + DEBUG_PRINTF("vertex %zu, changed bounds to [%llu,%llu]\n", + g[v].index, report.minOffset, report.maxOffset); + return rm.getInternalId(report); + } + + return id; + }); +} + +void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som) { + if (!any_of_in(all_reports(g), + [&](ReportID id) { return rm.getReport(id).hasBounds(); })) { + DEBUG_PRINTF("no extparam bounds\n"); + return; } - // Remove reports on vertices without an edge to accept (which have been - // pruned above). - clearReports(g); + DEBUG_PRINTF("graph has extparam bounds\n"); - // Recalc. - minWidth = findMinWidth(g); - maxWidth = findMaxWidth(g); - is_anchored = proper_out_degree(g.startDs, g) == 0 && - out_degree(g.start, g); - has_offset_adj = hasOffsetAdjustments(rm, g); - - // If the pattern is completely anchored and has a min_length set, this can - // be converted to a min_offset. 
- if (expr.min_length && (expr.min_offset <= expr.min_length) && - is_anchored) { - DEBUG_PRINTF("convertinexpr.min_length to min_offset=%llu for " - "anchored case\n", expr.min_length); - expr.min_offset = expr.min_length; - expr.min_length = 0; + pruneVacuousEdges(g, rm); + if (can_never_match(g)) { + return; } - if (expr.min_offset && expr.min_offset <= minWidth && !has_offset_adj) { - DEBUG_PRINTF("min_offset=%llu constraint is unnecessary\n", - expr.min_offset); - expr.min_offset = 0; + pruneUnmatchable(g, rm); + if (can_never_match(g)) { + return; } - if (!hasExtParams(expr)) { + if (!hasOffsetAdjustments(rm, g)) { + pruneExtUnreachable(g, rm); + if (can_never_match(g)) { + return; + } + } + + replaceMinLengthWithOffset(g, rm); + if (can_never_match(g)) { return; } // If the pattern has a min_length and is of "ratchet" form with one // unbounded repeat, that repeat can become a bounded repeat. // e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/ - if (expr.min_length && transformMinLengthToRepeat(rm, g, expr)) { - DEBUG_PRINTF("converted min_length to bounded repeat\n"); - // recalc - minWidth = findMinWidth(g); + transformMinLengthToRepeat(g, rm); + if (can_never_match(g)) { + return; } // If the pattern is unanchored, has a max_offset and has not asked for // SOM, we can use that knowledge to anchor it which will limit its // lifespan. Note that we can't use this transformation if there's a // min_length, as it's currently handled using "sly SOM". - - // Note that it is possible to handle graphs that have a combination of - // anchored and unanchored paths, but it's too tricky for the moment. 
- - if (expr.max_offset != MAX_OFFSET && !expr.som && !expr.min_length && - !has_offset_adj && isUnanchored(g)) { - if (anchorPatternWithBoundedRepeat(g, expr, minWidth, maxWidth)) { - DEBUG_PRINTF("minWidth=%s, maxWidth=%s\n", minWidth.str().c_str(), - maxWidth.str().c_str()); - if (minWidth == maxWidth) { - // For a fixed width pattern, we can retire the offsets as they - // are implicit in the graph now. - expr.min_offset = 0; - expr.max_offset = MAX_OFFSET; - } + if (som == SOM_NONE) { + anchorPatternWithBoundedRepeat(g, rm); + if (can_never_match(g)) { + return; } } - //dumpGraph("final.dot", g); - if (!hasExtParams(expr)) { - return; - } - - set done; - updateReportBounds(rm, g, expr, g.accept, done); - updateReportBounds(rm, g, expr, g.acceptEod, done); + removeUnneededOffsetBounds(g, rm); } } // namespace ue2 diff --git a/src/nfagraph/ng_extparam.h b/src/nfagraph/ng_extparam.h index 798acd3f..ae818075 100644 --- a/src/nfagraph/ng_extparam.h +++ b/src/nfagraph/ng_extparam.h @@ -34,15 +34,30 @@ #ifndef NG_EXTPARAM_H #define NG_EXTPARAM_H +#include "som/som.h" + namespace ue2 { -struct CompileContext; class ExpressionInfo; class NGHolder; class ReportManager; -void handleExtendedParams(ReportManager &rm, NGHolder &g, ExpressionInfo &expr, - const CompileContext &cc); +/** + * \brief Propagate extended parameter information to vertex reports. Will + * throw CompileError if this expression's extended parameters are not + * satisfiable. + * + * This will also remove extended parameter constraints that are guaranteed to + * be satisfied from ExpressionInfo. + */ +void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr, + ReportManager &rm); + +/** + * \brief Perform graph reductions (if possible) to do with extended parameter + * constraints on reports. 
+ */ +void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som); } // namespace ue2 diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 7d340d79..43a502f7 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -41,6 +41,7 @@ #include "nfagraph/ng_depth.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_mcclellan.h" +#include "nfagraph/ng_reports.h" #include "nfagraph/ng_prune.h" #include "nfagraph/ng_util.h" #include "smallwrite/smallwrite_internal.h" @@ -179,8 +180,23 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) { return; } - if (expr.som || expr.min_length || isVacuous(g)) { - poisoned = true; /* cannot support in smwr */ + if (expr.som) { + DEBUG_PRINTF("no SOM support in small-write engine\n"); + poisoned = true; + return; + } + + if (isVacuous(g)) { + DEBUG_PRINTF("no vacuous graph support in small-write engine\n"); + poisoned = true; + return; + } + + if (any_of_in(::ue2::all_reports(g), [&](ReportID id) { + return rm.getReport(id).minLength > 0; + })) { + DEBUG_PRINTF("no min_length extparam support in small-write engine\n"); + poisoned = true; return; } diff --git a/unit/hyperscan/expr_info.cpp b/unit/hyperscan/expr_info.cpp index e6ffa9ea..d2383479 100644 --- a/unit/hyperscan/expr_info.cpp +++ b/unit/hyperscan/expr_info.cpp @@ -168,7 +168,7 @@ static const expected_info ei_test[] = { // Some cases with extended parameters. 
{"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0}, - {"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 6, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0}, {"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0}, {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0}, {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0}, @@ -185,7 +185,7 @@ static const expected_info ei_test[] = { {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0}, {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0}, {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2}, - 4, UINT_MAX, 0, 0, 0}, + 10, UINT_MAX, 0, 0, 0}, {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0}, {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2}, @@ -194,7 +194,7 @@ static const expected_info ei_test[] = { {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0}, {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0}, {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2}, - 4, 8, 0, 0, 0}, + 8, 8, 0, 0, 0}, {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, 4, 8, 0, 0, 0}, {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},