diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index ca7c413a..6d0de539 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -218,7 +218,7 @@ namespace { /** \brief Concrete implementation */ class CorpusGeneratorImpl : public CorpusGenerator { public: - CorpusGeneratorImpl(const NGHolder &graph_in, CorpusProperties &props); + CorpusGeneratorImpl(const NGWrapper &graph_in, CorpusProperties &props); ~CorpusGeneratorImpl() {} void generateCorpus(vector &data); @@ -244,10 +244,13 @@ private: CorpusProperties &cProps; }; -CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in, +CorpusGeneratorImpl::CorpusGeneratorImpl(const NGWrapper &graph_in, CorpusProperties &props) : graph(graph_in), cProps(props) { - // empty + // if this pattern is to be matched approximately + if (graph_in.edit_distance && !props.editDistance) { + props.editDistance = props.rand(0, graph_in.edit_distance + 1); + } } void CorpusGeneratorImpl::generateCorpus(vector &data) { @@ -388,7 +391,7 @@ hit_limit: /** \brief Concrete implementation for UTF-8 */ class CorpusGeneratorUtf8 : public CorpusGenerator { public: - CorpusGeneratorUtf8(const NGHolder &graph_in, CorpusProperties &props); + CorpusGeneratorUtf8(const NGWrapper &graph_in, CorpusProperties &props); ~CorpusGeneratorUtf8() {} void generateCorpus(vector &data); @@ -407,17 +410,21 @@ private: void addRandom(const min_max &mm, vector *out); /** \brief The NFA graph we operate over. */ - const NGHolder &graph; + const NGWrapper &graph; /** \brief Reference to our corpus generator properties object (stores some * state) */ CorpusProperties &cProps; }; -CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGHolder &graph_in, +CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGWrapper &graph_in, CorpusProperties &props) : graph(graph_in), cProps(props) { - // empty + // we do not support Utf8 for approximate matching + if (graph.edit_distance) { + throw CorpusGenerationFailure("UTF-8 for edited patterns is not " + "supported."); + } } void CorpusGeneratorUtf8::generateCorpus(vector &data) { diff --git a/util/ng_corpus_generator.h b/util/ng_corpus_generator.h index a7445ab6..a02721bd 100644 --- a/util/ng_corpus_generator.h +++ b/util/ng_corpus_generator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -45,6 +45,12 @@ class NGWrapper; } // namespace ue2 +struct CorpusGenerationFailure { + explicit CorpusGenerationFailure(const std::string s) : + message(std::move(s)) {} + std::string message; +}; + /** \brief Abstract interface to corpus generator tool. */ class CorpusGenerator { public: diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp index 2b337365..b3c81574 100644 --- a/util/ng_find_matches.cpp +++ b/util/ng_find_matches.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,55 +47,732 @@ using namespace std; using namespace ue2; +using MatchSet = set>; +using StateBitSet = boost::dynamic_bitset<>; + namespace { -struct StateSet { - explicit StateSet(size_t sz) : s(sz), som(sz, 0) {} - boost::dynamic_bitset<> s; // bitset of states that are on - vector som; // som value for each state +// returns all successors up to a given depth in a vector of sets, indexed by +// zero-based depth from source vertex +static +vector> +gatherSuccessorsByDepth(const NGHolder &g, const NFAVertex &src, u32 depth) { + vector> result(depth); + flat_set cur, next; + + // populate current set of successors + for (auto v : adjacent_vertices_range(src, g)) { + // ignore self-loops + if (src == v) { + continue; + } + DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); + + cur.insert(v); + } + result[0] = cur; + + for (u32 d = 1; d < depth; d++) { + // collect all successors for all current level vertices + for (auto v : cur) { + // don't go past special nodes + if (is_special(v, g)) { + continue; + } + + for (auto succ : adjacent_vertices_range(v, g)) { + // ignore self-loops + if (v == succ) { + continue; + } + DEBUG_PRINTF("Node %zu depth %u\n", g[succ].index, d + 1); + next.insert(succ); + } + } + result[d] = next; + next.swap(cur); + next.clear(); + } + + return result; +} + +// returns all predecessors up to a given depth in a vector of sets, indexed by +// zero-based depth from source vertex +static +vector> gatherPredecessorsByDepth(const NGHolder &g, + NFAVertex src, u32 depth) { + vector> result(depth); + flat_set cur, next; + + assert(depth > 0); + + // populate current set of successors + for (auto v : inv_adjacent_vertices_range(src, g)) { + // ignore self-loops + if (src == v) { + continue; + } + DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); + cur.insert(v); + } + result[0] = cur; + + for (u32 d = 1; d < depth; d++) { + // collect all successors for all current level vertices + for (auto v : cur) { + for (auto pred : inv_adjacent_vertices_range(v, g)) { + // ignore self-loops + if (v == pred) { + continue; + } + DEBUG_PRINTF("Node %zu depth %u\n", g[pred].index, d + 1); + next.insert(pred); + } + } + result[d] = next; + next.swap(cur); + next.clear(); + } + + return result; +} + +// this is a per-vertex, per-shadow level state transition table +struct GraphCache { + GraphCache(u32 dist_in, const NGHolder &g) : + size(num_vertices(g)), edit_distance(dist_in) + { + auto dist_max = edit_distance + 1; + + allocateStateTransitionTable(dist_max); + populateTransitionCache(g, dist_max); + populateAcceptCache(g, dist_max); + } + + void allocateStateTransitionTable(u32 dist_max) { + // resize level 1 - per vertex + shadow_transitions.resize(size); + helper_transitions.resize(size); + + // resize level 2 - per shadow level + for (u32 i = 0; i < size; i++) { + shadow_transitions[i].resize(dist_max); + helper_transitions[i].resize(dist_max); + + // resize level 3 - per vertex + for (u32 d = 0; d < dist_max; d++) { + shadow_transitions[i][d].resize(size); + helper_transitions[i][d].resize(size); + } + } + + // accept states are indexed by edit distance + accept_states.resize(dist_max); + accept_eod_states.resize(dist_max); + + // vertex report maps are indexed by edit distance + vertex_reports_by_level.resize(dist_max); + vertex_eod_reports_by_level.resize(dist_max); + } + + /* + * certain transitions to helpers are disallowed: + * 1. transitions from accept/acceptEod + * 2. transitions to accept/acceptEod + * 3. from start to startDs + * 4. to a virtual/multiline start + * + * everything else is allowed. + */ + bool canTransitionToHelper(NFAVertex u, NFAVertex v, const NGHolder &g) const { + if (is_any_accept(u, g)) { + return false; + } + if (is_any_accept(v, g)) { + return false; + } + if (u == g.start && v == g.startDs) { + return false; + } + if (is_virtual_start(v, g)) { + return false; + } + return true; + } + + void populateTransitionCache(const NGHolder &g, u32 dist_max) { + // populate mapping of vertex index to vertex + vector idx_to_v(size); + for (auto v : vertices_range(g)) { + idx_to_v[g[v].index] = v; + } + + for (u32 i = 0; i < size; i++) { + auto cur_v = idx_to_v[i]; + + // set up transition tables + auto succs = gatherSuccessorsByDepth(g, cur_v, dist_max); + + assert(succs.size() == dist_max); + + for (u32 d = 0; d < dist_max; d++) { + auto &v_shadows = shadow_transitions[i][d]; + auto cur_v_bit = i; + + // enable transition to next level helper (this handles insertion) + if (d < edit_distance && !is_any_accept(cur_v, g)) { + auto &next_v_helpers = helper_transitions[i][d + 1]; + + next_v_helpers.set(cur_v_bit); + } + + // if vertex has a self-loop, we can also transition to it, + // but only if we're at shadow level 0 + if (edge(cur_v, cur_v, g).second && d == 0) { + v_shadows.set(cur_v_bit); + } + + // populate state transition tables + for (auto v : succs[d]) { + auto v_bit = g[v].index; + + // we cannot transition to startDs on any level other than + // level 0 + if (v != g.startDs || d == 0) { + // this handles direct transitions as well as removals + v_shadows.set(v_bit); + } + + // we can also transition to next-level helper (handles + // replace), provided we meet the criteria + if (d < edit_distance && canTransitionToHelper(cur_v, v, g)) { + auto &next_v_helpers = helper_transitions[i][d + 1]; + + next_v_helpers.set(v_bit); + } + } + } + } + } + + void populateAcceptCache(const NGHolder &g, u32 dist_max) { + // set up accept states masks + StateBitSet accept(size); + accept.set(g[g.accept].index); + StateBitSet accept_eod(size); + accept_eod.set(g[g.acceptEod].index); + + // gather accept and acceptEod states + for (u32 base_dist = 0; base_dist < dist_max; base_dist++) { + auto &states = accept_states[base_dist]; + auto &eod_states = accept_eod_states[base_dist]; + + states.resize(size); + eod_states.resize(size); + + // inspect each vertex + for (u32 i = 0; i < size; i++) { + // inspect all shadow levels from base_dist to dist_max + for (u32 d = 0; d < dist_max - base_dist; d++) { + auto &shadows = shadow_transitions[i][d]; + + // if this state transitions to accept, set its bit + if ((shadows & accept).any()) { + states.set(i); + } + if ((shadows & accept_eod).any()) { + eod_states.set(i); + } + } + } + } + + // populate accepts cache + for (auto v : inv_adjacent_vertices_range(g.accept, g)) { + const auto &rs = g[v].reports; + + for (u32 d = 0; d <= edit_distance; d++) { + // add self to report list at all levels + vertex_reports_by_level[d][v].insert(rs.begin(), rs.end()); + } + if (edit_distance == 0) { + // if edit distance is 0, no predecessors will have reports + continue; + } + + auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); + for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { + const auto &preds = preds_by_depth[pd]; + // for each predecessor, add reports up to maximum edit distance + // for current depth from source vertex + for (auto pred : preds) { + for (u32 d = 0; d < edit_distance - pd; d++) { + vertex_reports_by_level[d][pred].insert(rs.begin(), rs.end()); + } + } + } + } + for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { + const auto &rs = g[v].reports; + + if (v == g.accept) { + continue; + } + + for (u32 d = 0; d <= edit_distance; d++) { + // add self to report list at all levels + vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end()); + } + if (edit_distance == 0) { + // if edit distance is 0, no predecessors will have reports + continue; + } + + auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); + for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { + const auto &preds = preds_by_depth[pd]; + // for each predecessor, add reports up to maximum edit distance + // for current depth from source vertex + for (auto pred : preds) { + for (u32 d = 0; d < edit_distance - pd; d++) { + vertex_eod_reports_by_level[d][pred].insert(rs.begin(), rs.end()); + } + } + } + } + } + +#ifdef DEBUG + void dumpStateTransitionTable(const NGHolder &g) { + StateBitSet accept(size); + accept.set(g[g.accept].index); + StateBitSet accept_eod(size); + accept_eod.set(g[g.acceptEod].index); + + DEBUG_PRINTF("Dumping state transition tables\n"); + DEBUG_PRINTF("Shadows:\n"); + for (u32 i = 0; i < num_vertices(g); i++) { + DEBUG_PRINTF("%-7s %3u:", "Vertex", i); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getShadowTransitions(i, d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + } + + DEBUG_PRINTF("Helpers:\n"); + for (u32 i = 0; i < num_vertices(g); i++) { + DEBUG_PRINTF("%-7s %3u:", "Vertex", i); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getHelperTransitions(i, d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + } + + DEBUG_PRINTF("Accept transitions:\n"); + DEBUG_PRINTF("%-12s", "Vertex idx:"); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getAcceptTransitions(d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + + DEBUG_PRINTF("Accept EOD transitions:\n"); + DEBUG_PRINTF("%-12s", "Vertex idx:"); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getAcceptEodTransitions(d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + + DEBUG_PRINTF("%-12s ", "Accepts:"); + for (u32 i = 0; i < num_vertices(g); i++) { + printf("%3i", accept.test(i)); + } + printf("\n"); + + DEBUG_PRINTF("%-12s ", "EOD Accepts:"); + for (u32 i = 0; i < num_vertices(g); i++) { + printf("%3i", accept_eod.test(i)); + } + printf("\n"); + + DEBUG_PRINTF("Reports\n"); + for (auto v : vertices_range(g)) { + for (u32 d = 0; d <= edit_distance; d++) { + const auto &r = vertex_reports_by_level[d][v]; + const auto &e = vertex_eod_reports_by_level[d][v]; + DEBUG_PRINTF("%-7s %3zu %-8s %3zu %-8s %3zu\n", + "Vertex", g[v].index, "rs:", r.size(), "eod:", e.size()); + } + } + printf("\n"); + } +#endif + + const StateBitSet& getShadowTransitions(u32 idx, u32 level) const { + assert(idx < size); + assert(level <= edit_distance); + return shadow_transitions[idx][level]; + } + const StateBitSet& getHelperTransitions(u32 idx, u32 level) const { + assert(idx < size); + assert(level <= edit_distance); + return helper_transitions[idx][level]; + } + const StateBitSet& getAcceptTransitions(u32 level) const { + assert(level <= edit_distance); + return accept_states[level]; + } + const StateBitSet& getAcceptEodTransitions(u32 level) const { + assert(level <= edit_distance); + return accept_eod_states[level]; + } + + /* + * the bitsets are indexed by vertex and shadow level. the bitset's length is + * equal to the total number of vertices in the graph. + * + * for convenience, helper functions are provided. + */ + vector> shadow_transitions; + vector> helper_transitions; + + // accept states masks, indexed by shadow level + vector accept_states; + vector accept_eod_states; + + // map of all reports associated with any vertex, indexed by shadow level + vector>> vertex_reports_by_level; + vector>> vertex_eod_reports_by_level; + + u32 size; + u32 edit_distance; }; -using MatchSet = set>; + +/* + * SOM workflow is expected to be the following: + * - Caller calls getActiveStates, which reports SOM for each active states + * - Caller calls getSuccessorStates on each of the active states, which *doesn't* + * report SOM + * - Caller decides if the successor state should be activated, and calls + * activateState with SOM set to that of previous active state (not successor!) + * - activateState then resolves any conflicts between SOMs that may arise from + * multiple active states progressing to the same successor + */ +struct StateSet { + struct State { + enum node_type { + NODE_SHADOW = 0, + NODE_HELPER + }; + State(size_t idx_in, u32 level_in, size_t som_in, node_type type_in) : + idx(idx_in), level(level_in), som(som_in), type(type_in) {} + size_t idx; + u32 level; + size_t som; + node_type type; + }; + + StateSet(size_t sz, u32 dist_in) : + shadows(dist_in + 1), helpers(dist_in + 1), + shadows_som(dist_in + 1), helpers_som(dist_in + 1), + edit_distance(dist_in) { + for (u32 dist = 0; dist <= dist_in; dist++) { + shadows[dist].resize(sz, false); + helpers[dist].resize(sz, false); + shadows_som[dist].resize(sz, 0); + helpers_som[dist].resize(sz, 0); + } + } + + void reset() { + for (u32 dist = 0; dist <= edit_distance; dist++) { + shadows[dist].reset(); + helpers[dist].reset(); + fill(shadows_som[dist].begin(), shadows_som[dist].end(), 0); + fill(helpers_som[dist].begin(), helpers_som[dist].end(), 0); + } + } + + bool empty() const { + for (u32 dist = 0; dist <= edit_distance; dist++) { + if (shadows[dist].any()) { + return false; + } + if (helpers[dist].any()) { + return false; + } + } + return true; + } + + size_t count() const { + size_t result = 0; + + for (u32 dist = 0; dist <= edit_distance; dist++) { + result += shadows[dist].count(); + result += helpers[dist].count(); + } + + return result; + } + + bool setActive(const State &s) { + switch (s.type) { + case State::NODE_HELPER: + return helpers[s.level].test_set(s.idx); + case State::NODE_SHADOW: + return shadows[s.level].test_set(s.idx); + } + assert(0); + return false; + } + + size_t getCachedSom(const State &s) const { + switch (s.type) { + case State::NODE_HELPER: + return helpers_som[s.level][s.idx]; + case State::NODE_SHADOW: + return shadows_som[s.level][s.idx]; + } + assert(0); + return 0; + } + + void setCachedSom(const State &s, const size_t som_val) { + switch (s.type) { + case State::NODE_HELPER: + helpers_som[s.level][s.idx] = som_val; + break; + case State::NODE_SHADOW: + shadows_som[s.level][s.idx] = som_val; + break; + default: + assert(0); + } + } + +#ifdef DEBUG + void dumpActiveStates() const { + const auto states = getActiveStates(); + + DEBUG_PRINTF("Dumping active states\n"); + + for (const auto &state : states) { + DEBUG_PRINTF("type: %s idx: %zu level: %u som: %zu\n", + state.type == State::NODE_HELPER ? "HELPER" : "SHADOW", + state.idx, state.level, state.som); + } + } +#endif + + flat_set getActiveStates() const { + flat_set result; + + for (u32 dist = 0; dist <= edit_distance; dist++) { + // get all shadow vertices (including original graph) + const auto &cur_shadow_vertices = shadows[dist]; + for (size_t id = cur_shadow_vertices.find_first(); + id != cur_shadow_vertices.npos; + id = cur_shadow_vertices.find_next(id)) { + result.emplace(id, dist, shadows_som[dist][id], + State::NODE_SHADOW); + } + + // the rest is only valid for edited graphs + if (dist == 0) { + continue; + } + + // get all helper vertices + const auto &cur_helper_vertices = helpers[dist]; + for (size_t id = cur_helper_vertices.find_first(); + id != cur_helper_vertices.npos; + id = cur_helper_vertices.find_next(id)) { + result.emplace(id, dist, helpers_som[dist][id], + State::NODE_HELPER); + } + } + + return result; + } + + // does not return SOM + flat_set getSuccessors(const State &state, const GraphCache &gc) const { + flat_set result; + + // maximum shadow depth that we can go from current level + u32 max_depth = edit_distance - state.level + 1; + + for (u32 d = 0; d < max_depth; d++) { + const auto &shadow_succ = gc.getShadowTransitions(state.idx, d); + for (size_t id = shadow_succ.find_first(); + id != shadow_succ.npos; + id = shadow_succ.find_next(id)) { + auto new_level = state.level + d; + result.emplace(id, new_level, 0, State::NODE_SHADOW); + } + + const auto &helper_succ = gc.getHelperTransitions(state.idx, d); + for (size_t id = helper_succ.find_first(); + id != helper_succ.npos; + id = helper_succ.find_next(id)) { + auto new_level = state.level + d; + result.emplace(id, new_level, 0, State::NODE_HELPER); + } + } + + return result; + } + + flat_set getAcceptStates(const GraphCache &gc) const { + flat_set result; + + for (u32 dist = 0; dist <= edit_distance; dist++) { + // get all shadow vertices (including original graph) + auto cur_shadow_vertices = shadows[dist]; + cur_shadow_vertices &= gc.getAcceptTransitions(dist); + for (size_t id = cur_shadow_vertices.find_first(); + id != cur_shadow_vertices.npos; + id = cur_shadow_vertices.find_next(id)) { + result.emplace(id, dist, shadows_som[dist][id], + State::NODE_SHADOW); + } + auto cur_helper_vertices = helpers[dist]; + cur_helper_vertices &= gc.getAcceptTransitions(dist); + for (size_t id = cur_helper_vertices.find_first(); + id != cur_helper_vertices.npos; + id = cur_helper_vertices.find_next(id)) { + result.emplace(id, dist, helpers_som[dist][id], + State::NODE_HELPER); + } + } + + return result; + } + + flat_set getAcceptEodStates(const GraphCache &gc) const { + flat_set result; + + for (u32 dist = 0; dist <= edit_distance; dist++) { + // get all shadow vertices (including original graph) + auto cur_shadow_vertices = shadows[dist]; + cur_shadow_vertices &= gc.getAcceptEodTransitions(dist); + for (size_t id = cur_shadow_vertices.find_first(); + id != cur_shadow_vertices.npos; + id = cur_shadow_vertices.find_next(id)) { + result.emplace(id, dist, shadows_som[dist][id], + State::NODE_SHADOW); + } + auto cur_helper_vertices = helpers[dist]; + cur_helper_vertices &= gc.getAcceptEodTransitions(dist); + for (size_t id = cur_helper_vertices.find_first(); + id != cur_helper_vertices.npos; + id = cur_helper_vertices.find_next(id)) { + result.emplace(id, dist, helpers_som[dist][id], + State::NODE_HELPER); + } + } + + return result; + } + + // the caller must specify SOM at current offset, and must not attempt to + // resolve SOM inheritance conflicts + void activateState(const State &state) { + size_t cur_som = state.som; + if (setActive(state)) { + size_t cached_som = getCachedSom(state); + cur_som = min(cur_som, cached_som); + } + setCachedSom(state, cur_som); + } + + vector shadows; + vector helpers; + vector> shadows_som; + vector> helpers_som; + u32 edit_distance; +}; + +// for flat_set +bool operator<(const StateSet::State &a, const StateSet::State &b) { + ORDER_CHECK(idx); + ORDER_CHECK(level); + ORDER_CHECK(type); + ORDER_CHECK(som); + return false; +} struct fmstate { const size_t num_states; // number of vertices in graph StateSet states; // currently active states StateSet next; // states on after this iteration + GraphCache &gc; vector vertices; // mapping from index to vertex size_t offset = 0; unsigned char cur = 0; unsigned char prev = 0; - const bool som; const bool utf8; const bool allowStartDs; const ReportManager &rm; - boost::dynamic_bitset<> accept; // states leading to accept - boost::dynamic_bitset<> accept_with_eod; // states leading to accept or eod - - fmstate(const NGHolder &g, bool som_in, bool utf8_in, bool aSD_in, - const ReportManager &rm_in) - : num_states(num_vertices(g)), states(num_states), next(num_states), - vertices(num_vertices(g), NGHolder::null_vertex()), som(som_in), - utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in), accept(num_states), - accept_with_eod(num_states) { + fmstate(const NGHolder &g, GraphCache &gc_in, bool utf8_in, bool aSD_in, + const u32 edit_distance, const ReportManager &rm_in) + : num_states(num_vertices(g)), + states(num_states, edit_distance), + next(num_states, edit_distance), + gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()), + utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) { // init states - states.s.set(g[g.start].index); + states.activateState( + StateSet::State {g[g.start].index, 0, 0, + StateSet::State::NODE_SHADOW}); if (allowStartDs) { - states.s.set(g[g.startDs].index); + states.activateState( + StateSet::State {g[g.startDs].index, 0, 0, + StateSet::State::NODE_SHADOW}); } // fill vertex mapping - for (const auto &v : vertices_range(g)) { + for (auto v : vertices_range(g)) { vertices[g[v].index] = v; } - // init accept states - for (const auto &u : inv_adjacent_vertices_range(g.accept, g)) { - accept.set(g[u].index); - } - accept_with_eod = accept; - for (const auto &u : inv_adjacent_vertices_range(g.acceptEod, g)) { - accept_with_eod.set(g[u].index); - } } }; @@ -140,8 +817,7 @@ bool isUtf8CodePoint(const char c) { } static -bool canReach(const NGHolder &g, const NFAEdge &e, - struct fmstate &state) { +bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) { auto flags = g[e].assert_flags; if (!flags) { return true; @@ -177,36 +853,52 @@ bool canReach(const NGHolder &g, const NFAEdge &e, static void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state, bool allowEodMatches) { - auto acc_states = state.states.s; - acc_states &= allowEodMatches ? state.accept_with_eod : state.accept; + flat_set accepts {g.accept, g.acceptEod}; - for (size_t i = acc_states.find_first(); i != acc_states.npos; - i = acc_states.find_next(i)) { - const NFAVertex u = state.vertices[i]; - const size_t &som_offset = state.states.som[i]; - - // we can't accept anything from startDs in between UTF-8 codepoints - if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) { + for (auto v : accepts) { + bool eod = v == g.acceptEod; + if (eod && !allowEodMatches) { continue; } - for (const auto &e : out_edges_range(u, g)) { - NFAVertex v = target(e, g); - if (v == g.accept || (v == g.acceptEod && allowEodMatches)) { - // check edge assertions if we are allowed to reach accept - if (!canReach(g, e, state)) { - continue; - } - DEBUG_PRINTF("match found at %zu\n", state.offset); + auto active_states = eod ? state.states.getAcceptEodStates(state.gc) : + state.states.getAcceptStates(state.gc); - assert(!g[u].reports.empty()); - for (const auto &report_id : g[u].reports) { - const Report &ri = state.rm.getReport(report_id); + DEBUG_PRINTF("Number of active states: %zu\n", active_states.size()); - DEBUG_PRINTF("report %u has offset adjustment %d\n", - report_id, ri.offsetAdjust); - matches.emplace(som_offset, state.offset + ri.offsetAdjust); - } + for (const auto &cur : active_states) { + auto u = state.vertices[cur.idx]; + + // we can't accept anything from startDs in between UTF-8 codepoints + if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) { + continue; + } + + const auto &reports = + eod ? + state.gc.vertex_eod_reports_by_level[cur.level][u] : + state.gc.vertex_reports_by_level[cur.level][u]; + + NFAEdge e = edge(u, v, g); + + // we assume edge assertions only exist at level 0 + if (e && !canReach(g, e, state)) { + continue; + } + + DEBUG_PRINTF("%smatch found at %zu\n", + eod ? "eod " : "", state.offset); + + assert(!reports.empty()); + for (const auto &report_id : reports) { + const Report &ri = state.rm.getReport(report_id); + + DEBUG_PRINTF("report %u has offset adjustment %d\n", + report_id, ri.offsetAdjust); + DEBUG_PRINTF("match from (i:%zu,l:%u,t:%u): (%zu,%zu)\n", + cur.idx, cur.level, cur.type, cur.som, + state.offset + ri.offsetAdjust); + matches.emplace(cur.som, state.offset + ri.offsetAdjust); } } } @@ -214,20 +906,18 @@ void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state, static void step(const NGHolder &g, struct fmstate &state) { - state.next.s.reset(); + state.next.reset(); - for (size_t i = state.states.s.find_first(); i != state.states.s.npos; - i = state.states.s.find_next(i)) { - const NFAVertex &u = state.vertices[i]; - const size_t &u_som_offset = state.states.som[i]; + const auto active = state.states.getActiveStates(); - for (const auto &e : out_edges_range(u, g)) { - NFAVertex v = target(e, g); - if (v == g.acceptEod) { - // can't know the future: we don't know if we're at EOD. - continue; - } - if (v == g.accept) { + for (const auto &cur : active) { + auto u = state.vertices[cur.idx]; + auto succ_list = state.states.getSuccessors(cur, state.gc); + + for (auto succ : succ_list) { + auto v = state.vertices[succ.idx]; + + if (is_any_accept(v, g)) { continue; } @@ -235,37 +925,70 @@ void step(const NGHolder &g, struct fmstate &state) { continue; } - const CharReach &cr = g[v].char_reach; - const size_t v_idx = g[v].index; + // GraphCache doesn't differentiate between successors for shadows + // and helpers, and StateSet does not know anything about the graph, + // so the only place we can do it is here. we can't self-loop on a + // startDs if we're startDs's helper, so disallow it. + if (u == g.startDs && v == g.startDs && + succ.level != 0 && succ.level == cur.level) { + continue; + } - // check reachability and edge assertions - if (cr.test(state.cur) && canReach(g, e, state)) { - // if we aren't in SOM mode, just set every SOM to 0 - if (!state.som) { - state.next.s.set(v_idx); - state.next.som[v_idx] = 0; - continue; + // for the reasons outlined above, also putting this here. + // disallow transitions from start to startDs on levels other than zero + if (u == g.start && v == g.startDs && + cur.level != 0 && succ.level != 0) { + continue; + } + + bool can_reach = false; + + if (succ.type == StateSet::State::NODE_HELPER) { + can_reach = true; + } else { + // we assume edge assertions only exist on level 0 + const CharReach &cr = g[v].char_reach; + NFAEdge e = edge(u, v, g); + + if (cr.test(state.cur) && + (!e || canReach(g, e, state))) { + can_reach = true; } + } - // if this is first vertex since start, use current offset as SOM + // check edge assertions if we are allowed to reach accept + DEBUG_PRINTF("reaching %zu->%zu ('%c'->'%c'): %s\n", + g[u].index, g[v].index, + ourisprint(state.prev) ? state.prev : '?', + ourisprint(state.cur) ? state.cur : '?', + can_reach ? "yes" : "no"); + + if (can_reach) { + // we should use current offset as SOM if: + // - we're at level 0 and we're a start vertex + // - we're a fake start shadow size_t next_som; - if (u == g.start || u == g.startDs || is_virtual_start(u, g)) { + bool reset = is_any_start(u, g) && cur.level == 0; + reset |= is_virtual_start(u, g) && + cur.type == StateSet::State::NODE_SHADOW; + + if (reset) { next_som = state.offset; } else { // else, inherit SOM from predecessor - next_som = u_som_offset; + next_som = cur.som; } + succ.som = next_som; - // check if the vertex is already active - // if this vertex is not yet active, use current SOM - if (!state.next.s.test(v_idx)) { - state.next.s.set(v_idx); - state.next.som[v_idx] = next_som; - } else { - // else, work out leftmost SOM - state.next.som[v_idx] = - min(next_som, state.next.som[v_idx]); - } + DEBUG_PRINTF("src: idx %zu level: %u som: %zu type: %s\n", + cur.idx, cur.level, cur.som, + cur.type == StateSet::State::NODE_HELPER ? "H" : "S"); + DEBUG_PRINTF("dst: idx %zu level: %u som: %zu type: %s\n", + succ.idx, succ.level, succ.som, + succ.type == StateSet::State::NODE_HELPER ? "H" : "S"); + + // activate successor (SOM will be handled by activateState) + state.next.activateState(succ); } } } @@ -312,15 +1035,28 @@ void filterMatches(MatchSet &matches) { * Fills \a matches with offsets into the data stream where a match is found. */ void findMatches(const NGHolder &g, const ReportManager &rm, - const string &input, MatchSet &matches, const bool notEod, - const bool som, const bool utf8) { + const string &input, MatchSet &matches, + const u32 edit_distance, const bool notEod, const bool utf8) { assert(hasCorrectlyNumberedVertices(g)); + // cannot match fuzzy utf8 patterns, this should've been filtered out at + // compile time, so make it an assert + assert(!edit_distance || !utf8); + + DEBUG_PRINTF("Finding matches\n"); + + GraphCache gc(edit_distance, g); +#ifdef DEBUG + gc.dumpStateTransitionTable(g); +#endif const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0); - struct fmstate state(g, som, utf8, allowStartDs, rm); + struct fmstate state(g, gc, utf8, allowStartDs, edit_distance, rm); for (auto it = input.begin(), ite = input.end(); it != ite; ++it) { +#ifdef DEBUG + state.states.dumpActiveStates(); +#endif state.offset = distance(input.begin(), it); state.cur = *it; @@ -328,26 +1064,26 @@ void findMatches(const NGHolder &g, const ReportManager &rm, getMatches(g, matches, state, false); - DEBUG_PRINTF("index %zu, %zu states on\n", state.offset, - state.next.s.count()); - if (state.next.s.empty()) { - if (state.som) { - filterMatches(matches); - } + DEBUG_PRINTF("offset %zu, %zu states on\n", state.offset, + state.next.count()); + if (state.next.empty()) { + filterMatches(matches); return; } state.states = state.next; state.prev = state.cur; } +#ifdef DEBUG + state.states.dumpActiveStates(); +#endif state.offset = input.size(); state.cur = 0; // do additional step to get matches after stream end, this time count eod // matches also (or not, if we're in notEod mode) + DEBUG_PRINTF("Looking for EOD matches\n"); getMatches(g, matches, state, !notEod); - if (state.som) { - filterMatches(matches); - } + filterMatches(matches); } diff --git a/util/ng_find_matches.h b/util/ng_find_matches.h index e9e47010..39ee3f68 100644 --- a/util/ng_find_matches.h +++ b/util/ng_find_matches.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,6 +51,7 @@ struct BoundaryReports; void findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm, const std::string &input, std::set> &matches, - const bool notEod, const bool som, const bool utf8); + const unsigned int max_edit_distance, const bool notEod, + const bool utf8); #endif // NG_FIND_MATCHES_H