From 98eff64edf4957c44320664a552f71a03f73f825 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Mon, 14 Dec 2015 10:08:57 +1100 Subject: [PATCH] ng_prefilter: turn large max bound into inf During prefilter region replacement, turn regions with very large max bounds into repeats with inf max bound. This improves compile time and the likelihood that we will actually be able to build an implementation for such patterns. --- src/nfagraph/ng_prefilter.cpp | 42 ++++++++++++++++++++++++--------- unit/hyperscan/bad_patterns.txt | 1 - 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/nfagraph/ng_prefilter.cpp b/src/nfagraph/ng_prefilter.cpp index 54aeb28a..c0caf1b9 100644 --- a/src/nfagraph/ng_prefilter.cpp +++ b/src/nfagraph/ng_prefilter.cpp @@ -80,6 +80,10 @@ static const size_t BOUNDED_REPEAT_COUNT = 4; /** Scoring penalty for boundary regions. */ static const size_t PENALTY_BOUNDARY = 32; +/** Regions with max bounds greater than this value will have their max bound + * replaced with inf. */ +static const size_t MAX_REPLACE_BOUND = 10000; + namespace { /** Information describing a region. */ @@ -158,7 +162,7 @@ void markBoundaryRegions(const NGHolder &h, } u32 id = region_map.at(v); - map<u32, RegionInfo>::iterator ri = regions.find(id); + auto ri = regions.find(id); if (ri == regions.end()) { continue; // Not tracking this region as it's too small. } @@ -176,16 +180,14 @@ map<u32, RegionInfo> findRegionInfo(const NGHolder &h, continue; } u32 id = region_map.at(v); - RegionInfo &ri = regions.insert( - make_pair(id, RegionInfo(id))).first->second; + RegionInfo &ri = regions.emplace(id, RegionInfo(id)).first->second; ri.vertices.push_back(v); ri.reach |= h[v].char_reach; } // There's no point tracking more information about regions that we won't // consider replacing, so we remove them from the region map. 
- for (map<u32, RegionInfo>::iterator it = regions.begin(); - it != regions.end();) { + for (auto it = regions.begin(); it != regions.end();) { if (it->second.vertices.size() < MIN_REPLACE_VERTICES) { regions.erase(it++); } else { @@ -217,7 +219,10 @@ void copyInEdges(NGHolder &g, NFAVertex from, NFAVertex to, if (contains(rverts, u)) { continue; } - if (edge(u, to, g).second) { + + // Check with edge_by_target to cope with predecessors with large + // fan-out. + if (edge_by_target(u, to, g).second) { continue; } @@ -250,17 +255,27 @@ void replaceRegion(NGHolder &g, const RegionInfo &ri, assert(ri.vertices.size() >= MIN_REPLACE_VERTICES); assert(ri.minWidth.is_finite()); + depth minWidth = ri.minWidth; + depth maxWidth = ri.maxWidth; + + if (maxWidth > depth(MAX_REPLACE_BOUND)) { + DEBUG_PRINTF("using inf instead of large bound %s\n", + maxWidth.str().c_str()); + maxWidth = depth::infinity(); + } + size_t replacementSize; - if (ri.minWidth == ri.maxWidth || ri.maxWidth.is_infinite()) { - replacementSize = ri.minWidth; // {N} or {N,} + if (minWidth == maxWidth || maxWidth.is_infinite()) { + replacementSize = minWidth; // {N} or {N,} } else { - replacementSize = ri.maxWidth; // {N,M} case + replacementSize = maxWidth; // {N,M} case } DEBUG_PRINTF("orig size %zu, replace size %zu\n", ri.vertices.size(), replacementSize); - deque<NFAVertex> verts; + vector<NFAVertex> verts; + verts.reserve(replacementSize); for (size_t i = 0; i < replacementSize; i++) { NFAVertex v = add_vertex(g); g[v].char_reach = ri.reach; @@ -360,7 +375,8 @@ void prefilterReductions(NGHolder &h, const CompileContext &cc) { return; } - DEBUG_PRINTF("graph with %zu vertices\n", num_vertices(h)); + DEBUG_PRINTF("before: graph with %zu vertices, %zu edges\n", + num_vertices(h), num_edges(h)); h.renumberVertices(); h.renumberEdges(); @@ -369,6 +385,10 @@ void prefilterReductions(NGHolder &h, const CompileContext &cc) { h.renumberVertices(); h.renumberEdges(); + + DEBUG_PRINTF("after: graph with %zu vertices, %zu edges\n", + num_vertices(h), 
num_edges(h)); + } } // namespace ue2 diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index fb2a2357..9fc3a413 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -117,7 +117,6 @@ 117:/[\x{ff]/ #Value in \x{...} sequence is non-hex or missing } at index 1. 118:/foo/{min_offset=10,max_offset=9} #In hs_expr_ext, min_offset must be less than or equal to max_offset. 120:/foo/{min_length=10,max_offset=9} #In hs_expr_ext, min_length must be less than or equal to max_offset. -121:/.e(?:(((eEbd..(d[^Be]{1,7}|A)){8,22}aD.){7}|EecA?(?:\b)c|bB[Dd])){29,37}[adb](?:.|A|c|[BEA]|D)..((?:c|[Cba]))?([Ee]|D)B+(.|[dbB]|E|E).[EcCe]ce(?:C|D)dD[EA]Ac.[aE]d/smiHWP #Pattern too large. 122:/ÀÀ/8 #Expression is not valid UTF-8. 123:/hello \6 world/P #Invalid back reference to expression 6. 124:/hello \6 world|dog/P #Invalid back reference to expression 6.