/* * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** \file * \brief Graph fuzzer for approximate matching */ #include "ng_fuzzy.h" #include "ng.h" #include "ng_depth.h" #include "ng_util.h" #include #include using namespace std; namespace ue2 { // returns all successors up to a given depth in a vector of sets, indexed by // zero-based depth from source vertex static vector> gatherSuccessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) { vector> result(depth); flat_set cur, next; assert(depth > 0); // populate current set of successors for (auto v : adjacent_vertices_range(src, g)) { // ignore self-loops if (src == v) { continue; } DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); cur.insert(v); } result[0] = cur; for (unsigned d = 1; d < depth; d++) { // collect all successors for all current level vertices for (auto v : cur) { // don't go past special nodes if (is_special(v, g)) { continue; } for (auto succr : adjacent_vertices_range(v, g)) { // ignore self-loops if (v == succr) { continue; } DEBUG_PRINTF("Node %zu depth %u\n", g[succ].index, d + 1); next.insert(succr); } } result[d] = next; next.swap(cur); next.clear(); } return result; } // returns all predecessors up to a given depth in a vector of sets, indexed by // zero-based depth from source vertex static vector> gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) { vector> result(depth); flat_set cur, next; assert(depth > 0); // populate current set of successors for (auto v : inv_adjacent_vertices_range(src, g)) { // ignore self-loops if (src == v) { continue; } DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); cur.insert(v); } result[0] = cur; for (unsigned d = 1; d < depth; d++) { // collect all successors for all current level vertices for (auto v : cur) { for (auto predc : inv_adjacent_vertices_range(v, g)) { // ignore self-loops if (v == predc) { continue; } DEBUG_PRINTF("Node %zu depth %u\n", g[pred].index, d + 1); next.insert(predc); } } result[d] = next; next.swap(cur); next.clear(); } return result; } /* * This struct produces a fuzzed graph; that is, a graph that is able to match * the original pattern, as well as input data within a certain edit distance. * Construct the struct, then call fuzz_graph() to transform the graph. * * Terminology used: * - Shadow vertices: vertices mirroring the original graph at various edit * distances * - Shadow graph level: edit distance of a particular shadow graph * - Helpers: dot vertices assigned to shadow vertices, used for insert/replace */ struct ShadowGraph { NGHolder &g; u32 edit_distance; bool hamming; map, NFAVertex> shadow_map; map, NFAVertex> helper_map; map clones; // edge creation is deferred vector> edges_to_be_added; flat_set orig; ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in) : g(g_in), edit_distance(ed_in), hamming(hamm_in) {} void fuzz_graph() { if (edit_distance == 0) { return; } DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance, hamming ? "true" : "false"); // step 1: prepare the vertices, helpers and shadows according to // the original graph prepare_graph(); // step 2: add shadow and helper nodes build_shadow_graph(); // step 3: set up reports for newly created vertices (and make clones // if necessary) if (!hamming) { create_reports(); } // step 4: wire up shadow graph and helpers for insert/replace/remove connect_shadow_graph(); // step 5: commit all the edge wirings DEBUG_PRINTF("Committing edge wirings\n"); for (const auto &p : edges_to_be_added) { add_edge_if_not_present(p.first, p.second, g); } DEBUG_PRINTF("Done!\n"); } private: const NFAVertex& get_clone(const NFAVertex &v) { return contains(clones, v) ? clones[v] : v; } void connect_to_clones(const NFAVertex &u, const NFAVertex &v) { const NFAVertex &clone_u = get_clone(u); const NFAVertex &clone_v = get_clone(v); edges_to_be_added.emplace_back(u, v); DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[u].index, g[v].index); // do not connect clones to accepts, we do it during cloning if (is_any_accept(clone_v, g)) { return; } edges_to_be_added.emplace_back(clone_u, clone_v); DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[clone_u].index, g[clone_v].index); } void prepare_graph() { DEBUG_PRINTF("Building shadow graphs\n"); for (auto v : vertices_range(g)) { // all level 0 vertices are their own helpers and their own shadows helper_map[make_pair(v, 0)] = v; shadow_map[make_pair(v, 0)] = v; // find special nodes if (is_any_accept(v, g)) { DEBUG_PRINTF("Node %zu is a special node\n", g[v].index); for (unsigned edit = 1; edit <= edit_distance; edit++) { // all accepts are their own shadows and helpers at all // levels shadow_map[make_pair(v, edit)] = v; helper_map[make_pair(v, edit)] = v; } continue; } DEBUG_PRINTF("Node %zu is to be shadowed\n", g[v].index); orig.insert(v); } } void build_shadow_graph() { for (auto v : orig) { DEBUG_PRINTF("Adding shadow/helper nodes for node %zu\n", g[v].index); for (unsigned dist = 1; dist <= edit_distance; dist++) { auto shadow_v = v; // start and startDs cannot have shadows but do have helpers if (!is_any_start(v, g)) { shadow_v = clone_vertex(g, v); DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n", g[shadow_v].index, dist); } shadow_map[make_pair(v, dist)] = shadow_v; // if there's nowhere to go from this vertex, no helper needed if (proper_out_degree(v, g) < 1) { DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", g[shadow_v].index, dist); helper_map[make_pair(v, dist)] = shadow_v; continue; } // start and startDs only have helpers for insert, so not Hamming if (hamming && is_any_start(v, g)) { DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", g[shadow_v].index, dist); helper_map[make_pair(v, dist)] = shadow_v; continue; } auto helper_v = clone_vertex(g, v); DEBUG_PRINTF("New helper node ID: %zu (level %u)\n", g[helper_v].index, dist); // this is a helper, so make it a dot g[helper_v].char_reach = CharReach::dot(); // do not copy virtual start's assert flags if (is_virtual_start(v, g)) { DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n", g[helper_v].index, dist); g[helper_v].assert_flags = 0; } helper_map[make_pair(v, dist)] = helper_v; } } } // wire up successors according to the original graph, wire helpers // to shadow successors (insert/replace) void connect_succs(NFAVertex v, u32 dist) { DEBUG_PRINTF("Wiring up successors for node %zu shadow level %u\n", g[v].index, dist); const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; // multiple insert if (!hamming && dist > 1) { const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)]; connect_to_clones(prev_level_helper, cur_shadow_helper); } for (auto orig_dst : adjacent_vertices_range(v, g)) { const auto &shadow_dst = shadow_map[make_pair(orig_dst, dist)]; connect_to_clones(cur_shadow_v, shadow_dst); // ignore startDs for insert/replace if (orig_dst == g.startDs) { continue; } connect_to_clones(cur_shadow_helper, shadow_dst); } } // wire up predecessors according to the original graph, wire // predecessors to helpers (replace), wire predecessor helpers to // helpers (multiple replace) void connect_preds(NFAVertex v, u32 dist) { DEBUG_PRINTF("Wiring up predecessors for node %zu shadow level %u\n", g[v].index, dist); const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; auto orig_src_vertices = inv_adjacent_vertices_range(v, g); for (auto orig_src : orig_src_vertices) { // ignore edges from start to startDs if (v == g.startDs && orig_src == g.start) { continue; } // ignore self-loops for replace if (orig_src != v) { // do not wire a replace node for start vertices if we // have a virtual start if (is_virtual_start(v, g) && is_any_start(orig_src, g)) { continue; } if (dist) { const auto &prev_level_src = shadow_map[make_pair(orig_src, dist - 1)]; const auto &prev_level_helper = helper_map[make_pair(orig_src, dist - 1)]; connect_to_clones(prev_level_src, cur_shadow_helper); connect_to_clones(prev_level_helper, cur_shadow_helper); } } // wire predecessor according to original graph const auto &shadow_src = shadow_map[make_pair(orig_src, dist)]; connect_to_clones(shadow_src, cur_shadow_v); } } // wire up previous level helper to current shadow (insert) void connect_helpers(NFAVertex v, u32 dist) { DEBUG_PRINTF("Wiring up helpers for node %zu shadow level %u\n", g[v].index, dist); const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; auto prev_level_v = shadow_map[make_pair(v, dist - 1)]; connect_to_clones(prev_level_v, cur_shadow_helper); } /* * wiring edges for removal is a special case. * * when wiring edges for removal, as well as wiring up immediate * predecessors to immediate successors, we also need to wire up more * distant successors to their respective shadow graph levels. * * for example, consider graph start->a->b->c->d->accept. * * at edit distance 1, we need remove edges start->b, a->c, b->d, and * c->accept, all going from original graph (level 0) to shadow graph * level 1. * * at edit distance 2, we also need edges start->c, a->d and b->accept, * all going from level 0 to shadow graph level 2. * * this is propagated to all shadow levels; that is, given edit * distance 3, we will have edges from shadow levels 0->1, 0->2, * 0->3, 1->2, 1->3, and 2->3. * * therefore, we wire them in steps: first wire with step 1 (0->1, 1->2, * 2->3) at depth 1, then wire with step 2 (0->2, 1->3) at depth 2, etc. * * we also have to wire helpers to their removal successors, to * accommodate for a replace followed by a remove, on all shadow levels. * * and finally, we also have to wire source shadows into removal * successor helpers on a level above, to accommodate for a remove * followed by a replace. */ void connect_removals(NFAVertex v) { DEBUG_PRINTF("Wiring up remove edges for node %zu\n", g[v].index); // vertices returned by this function don't include self-loops auto dst_vertices_by_depth = gatherSuccessorsByDepth(g, v, edit_distance); auto orig_src_vertices = inv_adjacent_vertices_range(v, g); for (auto orig_src : orig_src_vertices) { // ignore self-loops if (orig_src == v) { continue; } for (unsigned step = 1; step <= edit_distance; step++) { for (unsigned dist = step; dist <= edit_distance; dist++) { auto &dst_vertices = dst_vertices_by_depth[step - 1]; for (auto &orig_dst : dst_vertices) { const auto &shadow_src = shadow_map[make_pair(orig_src, dist - step)]; const auto &shadow_helper = helper_map[make_pair(orig_src, dist - step)]; const auto &shadow_dst = shadow_map[make_pair(orig_dst, dist)]; // removal connect_to_clones(shadow_src, shadow_dst); // removal from helper vertex connect_to_clones(shadow_helper, shadow_dst); // removal into helper, requires additional edit if ((dist + 1) <= edit_distance) { const auto &next_level_helper = helper_map[make_pair(orig_dst, dist + 1)]; connect_to_clones(shadow_src, next_level_helper); } } } } } } void connect_shadow_graph() { DEBUG_PRINTF("Wiring up the graph\n"); for (auto v : orig) { DEBUG_PRINTF("Wiring up edges for node %zu\n", g[v].index); for (unsigned dist = 0; dist <= edit_distance; dist++) { // handle insert/replace connect_succs(v, dist); // handle replace/multiple insert connect_preds(v, dist); // handle helpers if (!hamming && dist > 0) { connect_helpers(v, dist); } } // handle removals if (!hamming) { connect_removals(v); } } } void connect_to_targets(NFAVertex src, const flat_set &targets) { for (auto dst : targets) { DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[src].index, g[dst].index); edges_to_be_added.emplace_back(src, dst); } } // create a clone of the vertex, but overwrite its report set void create_clone(NFAVertex v, const flat_set &reports, unsigned max_edit_distance, const flat_set &targets) { // some vertices may have the same reports, but different successors; // therefore, we may need to connect them multiple times, but still only // clone once bool needs_cloning = !contains(clones, v); DEBUG_PRINTF("Cloning node %zu\n", g[v].index); // go through all shadows and helpers, including // original vertex for (unsigned d = 0; d < max_edit_distance; d++) { auto shadow_v = shadow_map[make_pair(v, d)]; auto helper_v = helper_map[make_pair(v, d)]; NFAVertex new_shadow_v, new_helper_v; // make sure we don't clone the same vertex twice if (needs_cloning) { new_shadow_v = clone_vertex(g, shadow_v); DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n", g[new_shadow_v].index, d); clones[shadow_v] = new_shadow_v; } else { new_shadow_v = clones[shadow_v]; } g[new_shadow_v].reports = reports; connect_to_targets(new_shadow_v, targets); if (shadow_v == helper_v) { continue; } if (needs_cloning) { new_helper_v = clone_vertex(g, helper_v); DEBUG_PRINTF("New helper node ID: %zu (level %u)\n", g[new_helper_v].index, d); clones[helper_v] = new_helper_v; } else { new_helper_v = clones[helper_v]; } g[new_helper_v].reports = reports; connect_to_targets(new_helper_v, targets); } } void write_reports(NFAVertex v, const flat_set &reports, unsigned max_edit_distance, const flat_set &targets) { // we're overwriting reports, but we're not losing any // information as we already cached all the different report // sets, so vertices having different reports will be cloned and set up // with the correct report set // go through all shadows and helpers, including original // vertex for (unsigned d = 0; d < max_edit_distance; d++) { auto shadow_v = shadow_map[make_pair(v, d)]; auto helper_v = helper_map[make_pair(v, d)]; DEBUG_PRINTF("Setting up reports for shadow node: %zu " "(level %u)\n", g[shadow_v].index, d); DEBUG_PRINTF("Setting up reports for helper node: %zu " "(level %u)\n", g[helper_v].index, d); g[shadow_v].reports = reports; g[helper_v].reports = reports; connect_to_targets(shadow_v, targets); connect_to_targets(helper_v, targets); } } /* * we may have multiple report sets per graph. that means, whenever we * construct additional paths through the graph (alternations, removals), we * have to account for the fact that some vertices are predecessors to * vertices with different report sets. * * whenever that happens, we have to clone the paths for both report sets, * and set up these new vertices with their respective report sets as well. * * in order to do that, we first have to get all the predecessors for accept * and acceptEod vertices. then, go through them one by one, and take note * of the report lists. the first report set we find, wins, the rest we * clone. * * we also have to do this in two passes, because there may be vertices that * are predecessors to vertices with different report sets, so to avoid * overwriting reports we will be caching reports info instead. */ void create_reports() { map, flat_set> reports_to_vertices; flat_set accepts{g.accept, g.acceptEod}; // gather reports info from all vertices connected to accept for (auto accept : accepts) { for (auto src : inv_adjacent_vertices_range(accept, g)) { // skip special vertices if (is_special(src, g)) { continue; } reports_to_vertices[g[src].reports].insert(src); } } // we expect to see at most two report sets assert(reports_to_vertices.size() > 0 && reports_to_vertices.size() <= 2); // set up all reports bool clone = false; for (const auto &pair : reports_to_vertices) { const auto &reports = pair.first; const auto &svertices = pair.second; for (auto src : svertices) { // get all predecessors up to edit distance auto src_vertices_by_depth = gatherPredecessorsByDepth(g, src, edit_distance); // find which accepts source vertex connects to flat_set targets; for (const auto &accept : accepts) { NFAEdge e; std::tie(e, std::ignore) = edge(src, accept, g); if (e) { targets.insert(accept); } } assert(targets.size()); for (unsigned d = 0; d < src_vertices_by_depth.size(); d++) { const auto &predcs = src_vertices_by_depth[d]; for (auto v : predcs) { // only clone a node if it already contains reports if (clone && !g[v].reports.empty()) { create_clone(v, reports, edit_distance - d, targets); } else { write_reports(v, reports, edit_distance - d, targets); } } } } // clone vertices only if it's not our first report set clone = true; } } }; // check if we will edit our way into a vacuous pattern static bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) { auto depths = calcRevDepths(g); depth min_depth = depth::infinity(); auto idx = g[g.start].index; // check distance from start to accept/acceptEod if (depths[idx].toAccept.min.is_finite()) { min_depth = min(depths[idx].toAccept.min, min_depth); } if (depths[idx].toAcceptEod.min.is_finite()) { min_depth = min(depths[idx].toAcceptEod.min, min_depth); } idx = g[g.startDs].index; // check distance from startDs to accept/acceptEod if (depths[idx].toAccept.min.is_finite()) { min_depth = min(depths[idx].toAccept.min, min_depth); } if (depths[idx].toAcceptEod.min.is_finite()) { min_depth = min(depths[idx].toAcceptEod.min, min_depth); } assert(min_depth.is_finite()); // now, check if we can edit our way into a vacuous pattern if (min_depth <= (u64a) edit_distance + 1) { DEBUG_PRINTF("Pattern will turn vacuous if approximately matched\n"); return true; } return false; } void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming, bool utf8, const Grey &grey) { if (edit_distance == 0) { return; } if (!grey.allowApproximateMatching) { throw CompileError("Approximate matching is disabled."); } if (edit_distance > grey.maxEditDistance) { throw CompileError("Edit distance is too big."); } if (utf8) { throw CompileError("UTF-8 is disallowed for approximate matching."); } // graph isn't fuzzable if there are edge assertions anywhere in the graph for (auto e : edges_range(g)) { // cppcheck-suppress useStlAlgorithm if (g[e].assert_flags) { throw CompileError("Zero-width assertions are disallowed for " "approximate matching."); } } if (!hamming && will_turn_vacuous(g, edit_distance)) { throw CompileError("Approximate matching patterns that reduce to " "vacuous patterns are disallowed."); } } void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, const Grey &grey) { if (edit_distance == 0) { return; } assert(grey.allowApproximateMatching); assert(grey.maxEditDistance >= edit_distance); ShadowGraph sg(g, edit_distance, hamming); sg.fuzz_graph(); // For safety, enforce limit on actual vertex count. if (num_vertices(g) > grey.limitApproxMatchingVertices) { DEBUG_PRINTF("built %zu vertices > limit of %u\n", num_vertices(g), grey.limitApproxMatchingVertices); throw ResourceLimitError(); } } } // namespace ue2