From 4fc11cfc65ca01a327e9eb1d1666ced5ee8be818 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 7 Nov 2017 13:39:00 +1100 Subject: [PATCH] Hamming: GraphTruth and corpus gen support --- util/ng_corpus_generator.cpp | 26 ++++++++++++++++++++++---- util/ng_find_matches.cpp | 33 ++++++++++++++++++++++++--------- util/ng_find_matches.h | 3 ++- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index c5fad785..e5e8e06c 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -255,16 +255,34 @@ CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in, CorpusProperties &props) : expr(expr_in), graph(graph_in), cProps(props) { // if this pattern is to be matched approximately - if (expr.edit_distance && !props.editDistance) { - props.editDistance = props.rand(0, expr.edit_distance + 1); + if ((expr.edit_distance || expr.hamm_distance) && !props.editDistance) { + props.editDistance = + props.rand(0, expr.hamm_distance + expr.edit_distance + 1); } } void CorpusGeneratorImpl::generateCorpus(vector &data) { newGenerator(data); - // If the caller has asked us, apply edit distance to corpora - if (cProps.editDistance) { + if (cProps.editDistance && !data.empty() && + data.size() < cProps.corpusLimit) { + // Create more entries by copying the corpora and applying edits + size_t diff = cProps.corpusLimit - data.size(); + size_t repeats = diff / data.size(); + size_t remains = diff % data.size(); + vector newdata; + for (size_t i = 0; i < repeats; i++) { + std::copy(data.begin(), data.end(), std::back_inserter(newdata)); + } + if (remains) { + std::copy_n(data.begin(), remains, std::back_inserter(newdata)); + } + for (auto &s : newdata) { + editCorpus(&s, cProps); + } + std::move(newdata.begin(), newdata.end(), back_inserter(data)); + } else if (cProps.editDistance) { + // If the caller has asked us, apply edit distance to corpora for (auto &s : data) { editCorpus(&s, cProps); } diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp index 0a896f73..9cbc955a 100644 --- a/util/ng_find_matches.cpp +++ b/util/ng_find_matches.cpp @@ -139,8 +139,9 @@ gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) { // this is a per-vertex, per-shadow level state transition table struct GraphCache { - GraphCache(u32 dist_in, const NGHolder &g) : - size(num_vertices(g)), edit_distance(dist_in) + GraphCache(u32 dist_in, u32 hamm_in, const NGHolder &g) + : hamming(hamm_in > 0), size(num_vertices(g)), + edit_distance(hamming ? hamm_in : dist_in) { auto dist_max = edit_distance + 1; @@ -220,7 +221,7 @@ struct GraphCache { auto cur_v_bit = i; // enable transition to next level helper (this handles insertion) - if (d < edit_distance && !is_any_accept(cur_v, g)) { + if (!hamming && d < edit_distance && !is_any_accept(cur_v, g)) { auto &next_v_helpers = helper_transitions[i][d + 1]; next_v_helpers.set(cur_v_bit); @@ -232,6 +233,10 @@ struct GraphCache { v_shadows.set(cur_v_bit); } + if (hamming && d > 0) { + continue; + } + // populate state transition tables for (auto v : succs[d]) { auto v_bit = g[v].index; @@ -295,6 +300,7 @@ struct GraphCache { // add self to report list at all levels vertex_reports_by_level[d][v].insert(rs.begin(), rs.end()); } + if (edit_distance == 0) { // if edit distance is 0, no predecessors will have reports continue; @@ -323,7 +329,7 @@ struct GraphCache { // add self to report list at all levels vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end()); } - if (edit_distance == 0) { + if (edit_distance == 0 || hamming) { // if edit distance is 0, no predecessors will have reports continue; } @@ -479,6 +485,7 @@ struct GraphCache { vector>> vertex_reports_by_level; vector>> vertex_eod_reports_by_level; + bool hamming; u32 size; u32 edit_distance; }; @@ -682,6 +689,7 @@ struct StateSet { result.emplace_back(id, dist, shadows_som[dist][id], State::NODE_SHADOW); } + auto cur_helper_vertices = helpers[dist]; cur_helper_vertices &= gc.getAcceptTransitions(dist); for (size_t id = cur_helper_vertices.find_first(); @@ -708,6 +716,7 @@ struct StateSet { result.emplace_back(id, dist, shadows_som[dist][id], State::NODE_SHADOW); } + auto cur_helper_vertices = helpers[dist]; cur_helper_vertices &= gc.getAcceptEodTransitions(dist); for (size_t id = cur_helper_vertices.find_first(); @@ -1076,27 +1085,33 @@ void filterMatches(MatchSet &matches) { */ bool findMatches(const NGHolder &g, const ReportManager &rm, const string &input, MatchSet &matches, - const u32 edit_distance, const bool notEod, const bool utf8) { + const u32 edit_distance, const u32 hamm_distance, + const bool notEod, const bool utf8) { assert(hasCorrectlyNumberedVertices(g)); // cannot match fuzzy utf8 patterns, this should've been filtered out at // compile time, so make it an assert assert(!edit_distance || !utf8); + // cannot be both edit and Hamming distance at once + assert(!edit_distance || !hamm_distance); - const size_t total_states = num_vertices(g) * (3 * edit_distance + 1); + bool hamming = hamm_distance > 0; + auto dist = hamming ? hamm_distance : edit_distance; + + const size_t total_states = num_vertices(g) * (3 * dist + 1); DEBUG_PRINTF("Finding matches (%zu total states)\n", total_states); if (total_states > STATE_COUNT_MAX) { DEBUG_PRINTF("too big\n"); return false; } - GraphCache gc(edit_distance, g); + GraphCache gc(edit_distance, hamm_distance, g); #ifdef DEBUG gc.dumpStateTransitionTable(g); #endif const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0); - struct fmstate state(g, gc, utf8, allowStartDs, edit_distance, rm); + struct fmstate state(g, gc, utf8, allowStartDs, dist, rm); StateSet::WorkingData wd; @@ -1104,7 +1119,7 @@ bool findMatches(const NGHolder &g, const ReportManager &rm, #ifdef DEBUG state.states.dumpActiveStates(); #endif - state.offset = distance(input.begin(), it); + state.offset = std::distance(input.begin(), it); state.cur = *it; step(g, state, wd); diff --git a/util/ng_find_matches.h b/util/ng_find_matches.h index 9860c202..93f95097 100644 --- a/util/ng_find_matches.h +++ b/util/ng_find_matches.h @@ -55,7 +55,8 @@ struct BoundaryReports; bool findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm, const std::string &input, std::set> &matches, - const unsigned int max_edit_distance, const bool notEod, + const unsigned int max_edit_distance, + const unsigned int max_hamm_distance, const bool notEod, const bool utf8); #endif // NG_FIND_MATCHES_H