Hamming: GraphTruth and corpus gen support

This commit is contained in:
Matthew Barr 2017-11-07 13:39:00 +11:00 committed by Xiang Wang
parent 1891f14755
commit 4fc11cfc65
3 changed files with 48 additions and 14 deletions

View File

@ -255,16 +255,34 @@ CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in,
CorpusProperties &props) CorpusProperties &props)
: expr(expr_in), graph(graph_in), cProps(props) { : expr(expr_in), graph(graph_in), cProps(props) {
// if this pattern is to be matched approximately // if this pattern is to be matched approximately
if (expr.edit_distance && !props.editDistance) { if ((expr.edit_distance || expr.hamm_distance) && !props.editDistance) {
props.editDistance = props.rand(0, expr.edit_distance + 1); props.editDistance =
props.rand(0, expr.hamm_distance + expr.edit_distance + 1);
} }
} }
void CorpusGeneratorImpl::generateCorpus(vector<string> &data) { void CorpusGeneratorImpl::generateCorpus(vector<string> &data) {
newGenerator(data); newGenerator(data);
if (cProps.editDistance && !data.empty() &&
data.size() < cProps.corpusLimit) {
// Create more entries by copying the corpora and applying edits
size_t diff = cProps.corpusLimit - data.size();
size_t repeats = diff / data.size();
size_t remains = diff % data.size();
vector<string> newdata;
for (size_t i = 0; i < repeats; i++) {
std::copy(data.begin(), data.end(), std::back_inserter(newdata));
}
if (remains) {
std::copy_n(data.begin(), remains, std::back_inserter(newdata));
}
for (auto &s : newdata) {
editCorpus(&s, cProps);
}
std::move(newdata.begin(), newdata.end(), back_inserter(data));
} else if (cProps.editDistance) {
// If the caller has asked us, apply edit distance to corpora // If the caller has asked us, apply edit distance to corpora
if (cProps.editDistance) {
for (auto &s : data) { for (auto &s : data) {
editCorpus(&s, cProps); editCorpus(&s, cProps);
} }

View File

@ -139,8 +139,9 @@ gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) {
// this is a per-vertex, per-shadow level state transition table // this is a per-vertex, per-shadow level state transition table
struct GraphCache { struct GraphCache {
GraphCache(u32 dist_in, const NGHolder &g) : GraphCache(u32 dist_in, u32 hamm_in, const NGHolder &g)
size(num_vertices(g)), edit_distance(dist_in) : hamming(hamm_in > 0), size(num_vertices(g)),
edit_distance(hamming ? hamm_in : dist_in)
{ {
auto dist_max = edit_distance + 1; auto dist_max = edit_distance + 1;
@ -220,7 +221,7 @@ struct GraphCache {
auto cur_v_bit = i; auto cur_v_bit = i;
// enable transition to next level helper (this handles insertion) // enable transition to next level helper (this handles insertion)
if (d < edit_distance && !is_any_accept(cur_v, g)) { if (!hamming && d < edit_distance && !is_any_accept(cur_v, g)) {
auto &next_v_helpers = helper_transitions[i][d + 1]; auto &next_v_helpers = helper_transitions[i][d + 1];
next_v_helpers.set(cur_v_bit); next_v_helpers.set(cur_v_bit);
@ -232,6 +233,10 @@ struct GraphCache {
v_shadows.set(cur_v_bit); v_shadows.set(cur_v_bit);
} }
if (hamming && d > 0) {
continue;
}
// populate state transition tables // populate state transition tables
for (auto v : succs[d]) { for (auto v : succs[d]) {
auto v_bit = g[v].index; auto v_bit = g[v].index;
@ -295,6 +300,7 @@ struct GraphCache {
// add self to report list at all levels // add self to report list at all levels
vertex_reports_by_level[d][v].insert(rs.begin(), rs.end()); vertex_reports_by_level[d][v].insert(rs.begin(), rs.end());
} }
if (edit_distance == 0) { if (edit_distance == 0) {
// if edit distance is 0, no predecessors will have reports // if edit distance is 0, no predecessors will have reports
continue; continue;
@ -323,7 +329,7 @@ struct GraphCache {
// add self to report list at all levels // add self to report list at all levels
vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end()); vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end());
} }
if (edit_distance == 0) { if (edit_distance == 0 || hamming) {
// if edit distance is 0, no predecessors will have reports // if edit distance is 0, no predecessors will have reports
continue; continue;
} }
@ -479,6 +485,7 @@ struct GraphCache {
vector<map<NFAVertex, flat_set<ReportID>>> vertex_reports_by_level; vector<map<NFAVertex, flat_set<ReportID>>> vertex_reports_by_level;
vector<map<NFAVertex, flat_set<ReportID>>> vertex_eod_reports_by_level; vector<map<NFAVertex, flat_set<ReportID>>> vertex_eod_reports_by_level;
bool hamming;
u32 size; u32 size;
u32 edit_distance; u32 edit_distance;
}; };
@ -682,6 +689,7 @@ struct StateSet {
result.emplace_back(id, dist, shadows_som[dist][id], result.emplace_back(id, dist, shadows_som[dist][id],
State::NODE_SHADOW); State::NODE_SHADOW);
} }
auto cur_helper_vertices = helpers[dist]; auto cur_helper_vertices = helpers[dist];
cur_helper_vertices &= gc.getAcceptTransitions(dist); cur_helper_vertices &= gc.getAcceptTransitions(dist);
for (size_t id = cur_helper_vertices.find_first(); for (size_t id = cur_helper_vertices.find_first();
@ -708,6 +716,7 @@ struct StateSet {
result.emplace_back(id, dist, shadows_som[dist][id], result.emplace_back(id, dist, shadows_som[dist][id],
State::NODE_SHADOW); State::NODE_SHADOW);
} }
auto cur_helper_vertices = helpers[dist]; auto cur_helper_vertices = helpers[dist];
cur_helper_vertices &= gc.getAcceptEodTransitions(dist); cur_helper_vertices &= gc.getAcceptEodTransitions(dist);
for (size_t id = cur_helper_vertices.find_first(); for (size_t id = cur_helper_vertices.find_first();
@ -1076,27 +1085,33 @@ void filterMatches(MatchSet &matches) {
*/ */
bool findMatches(const NGHolder &g, const ReportManager &rm, bool findMatches(const NGHolder &g, const ReportManager &rm,
const string &input, MatchSet &matches, const string &input, MatchSet &matches,
const u32 edit_distance, const bool notEod, const bool utf8) { const u32 edit_distance, const u32 hamm_distance,
const bool notEod, const bool utf8) {
assert(hasCorrectlyNumberedVertices(g)); assert(hasCorrectlyNumberedVertices(g));
// cannot match fuzzy utf8 patterns, this should've been filtered out at // cannot match fuzzy utf8 patterns, this should've been filtered out at
// compile time, so make it an assert // compile time, so make it an assert
assert(!edit_distance || !utf8); assert(!edit_distance || !utf8);
// cannot be both edit and Hamming distance at once
assert(!edit_distance || !hamm_distance);
const size_t total_states = num_vertices(g) * (3 * edit_distance + 1); bool hamming = hamm_distance > 0;
auto dist = hamming ? hamm_distance : edit_distance;
const size_t total_states = num_vertices(g) * (3 * dist + 1);
DEBUG_PRINTF("Finding matches (%zu total states)\n", total_states); DEBUG_PRINTF("Finding matches (%zu total states)\n", total_states);
if (total_states > STATE_COUNT_MAX) { if (total_states > STATE_COUNT_MAX) {
DEBUG_PRINTF("too big\n"); DEBUG_PRINTF("too big\n");
return false; return false;
} }
GraphCache gc(edit_distance, g); GraphCache gc(edit_distance, hamm_distance, g);
#ifdef DEBUG #ifdef DEBUG
gc.dumpStateTransitionTable(g); gc.dumpStateTransitionTable(g);
#endif #endif
const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0); const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
struct fmstate state(g, gc, utf8, allowStartDs, edit_distance, rm); struct fmstate state(g, gc, utf8, allowStartDs, dist, rm);
StateSet::WorkingData wd; StateSet::WorkingData wd;
@ -1104,7 +1119,7 @@ bool findMatches(const NGHolder &g, const ReportManager &rm,
#ifdef DEBUG #ifdef DEBUG
state.states.dumpActiveStates(); state.states.dumpActiveStates();
#endif #endif
state.offset = distance(input.begin(), it); state.offset = std::distance(input.begin(), it);
state.cur = *it; state.cur = *it;
step(g, state, wd); step(g, state, wd);

View File

@ -55,7 +55,8 @@ struct BoundaryReports;
// Scans `input` against graph `g`, one input character at a time, and reports
// results through `matches`.
//
// Contract, as far as the visible part of the definition shows:
//  - max_edit_distance and max_hamm_distance are mutually exclusive: the
//    definition asserts that at most one of them is non-zero. When
//    max_hamm_distance is non-zero it is the distance that gets used.
//  - utf8 must not be combined with a non-zero max_edit_distance (asserted in
//    the definition; fuzzy UTF-8 patterns are expected to be rejected at
//    compile time).
//  - Returns false without scanning when
//    num_vertices(g) * (3 * distance + 1) exceeds STATE_COUNT_MAX.
//  - NOTE(review): `matches` presumably holds (start, end) offset pairs and the
//    function presumably returns true on a completed scan -- confirm against
//    the full definition.
bool findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm, bool findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm,
const std::string &input, const std::string &input,
std::set<std::pair<size_t, size_t>> &matches, std::set<std::pair<size_t, size_t>> &matches,
const unsigned int max_edit_distance, const bool notEod, const unsigned int max_edit_distance,
const unsigned int max_hamm_distance, const bool notEod,
const bool utf8); const bool utf8);
#endif // NG_FIND_MATCHES_H #endif // NG_FIND_MATCHES_H