Add support for Hamming distance approximate matching

This commit is contained in:
Matthew Barr
2017-10-10 15:26:35 +11:00
committed by Xiang Wang
parent 5827bd1c2b
commit 1891f14755
13 changed files with 169 additions and 56 deletions

View File

@@ -347,14 +347,19 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
throw CompileError(expr.index, "Pattern can never match.");
}
bool hamming = expr.hamm_distance > 0;
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
DEBUG_PRINTF("edit distance = %u hamming = %s\n", e_dist, hamming ? "true" : "false");
// validate graph's suitability for fuzzing before resolving asserts
validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey);
validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
resolveAsserts(rm, g, expr);
dumpDotWrapper(g, expr, "02_post_assert_resolve", cc.grey);
assert(allMatchStatesHaveReports(g));
make_fuzzy(g, expr.edit_distance, cc.grey);
make_fuzzy(g, e_dist, hamming, cc.grey);
dumpDotWrapper(g, expr, "02a_post_fuzz", cc.grey);
pruneUseless(g);

View File

@@ -161,14 +161,17 @@ void fillExpressionInfo(ReportManager &rm, const CompileContext &cc,
throw CompileError(expr.index, "Pattern can never match.");
}
bool hamming = expr.hamm_distance > 0;
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
// validate graph's suitability for fuzzing
validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey);
validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
resolveAsserts(rm, g, expr);
assert(allMatchStatesHaveReports(g));
// fuzz graph - this must happen before any transformations are made
make_fuzzy(g, expr.edit_distance, cc.grey);
make_fuzzy(g, e_dist, hamming, cc.grey);
pruneUseless(g);
pruneEmptyVertices(g);

View File

@@ -144,6 +144,7 @@ vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g,
struct ShadowGraph {
NGHolder &g;
u32 edit_distance;
bool hamming;
map<pair<NFAVertex, u32>, NFAVertex> shadow_map;
map<pair<NFAVertex, u32>, NFAVertex> helper_map;
map<NFAVertex, NFAVertex> clones;
@@ -151,13 +152,17 @@ struct ShadowGraph {
vector<pair<NFAVertex, NFAVertex>> edges_to_be_added;
flat_set<NFAVertex> orig;
ShadowGraph(NGHolder &g_in, u32 ed_in) : g(g_in), edit_distance(ed_in) {}
// Binds the graph to be fuzzed and records the requested distance together
// with the Hamming flag. The constructor body is empty: nothing is done to
// the graph until fuzz_graph() is called.
ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in)
: g(g_in), edit_distance(ed_in), hamming(hamm_in) {}
void fuzz_graph() {
if (edit_distance == 0) {
return;
}
DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance,
hamming ? "true" : "false");
// step 1: prepare the vertices, helpers and shadows according to
// the original graph
prepare_graph();
@@ -167,7 +172,9 @@ struct ShadowGraph {
// step 3: set up reports for newly created vertices (and make clones
// if necessary)
create_reports();
if (!hamming) {
create_reports();
}
// step 4: wire up shadow graph and helpers for insert/replace/remove
connect_shadow_graph();
@@ -244,6 +251,16 @@ private:
// if there's nowhere to go from this vertex, no helper needed
if (proper_out_degree(v, g) < 1) {
DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
g[shadow_v].index, dist);
helper_map[make_pair(v, dist)] = shadow_v;
continue;
}
// start and startDs only have helpers for insert, so not Hamming
if (hamming && is_any_start(v, g)) {
DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
g[shadow_v].index, dist);
helper_map[make_pair(v, dist)] = shadow_v;
continue;
}
@@ -256,6 +273,8 @@ private:
g[helper_v].char_reach = CharReach::dot();
// do not copy virtual start's assert flags
if (is_virtual_start(v, g)) {
DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n",
g[helper_v].index, dist);
g[helper_v].assert_flags = 0;
}
helper_map[make_pair(v, dist)] = helper_v;
@@ -272,7 +291,7 @@ private:
const auto &cur_shadow_helper = helper_map[make_pair(v, dist)];
// multiple insert
if (dist > 1) {
if (!hamming && dist > 1) {
const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)];
connect_to_clones(prev_level_helper, cur_shadow_helper);
}
@@ -429,13 +448,15 @@ private:
connect_preds(v, dist);
// handle helpers
if (dist > 0) {
if (!hamming && dist > 0) {
connect_helpers(v, dist);
}
}
// handle removals
connect_removals(v);
if (!hamming) {
connect_removals(v);
}
}
}
@@ -636,8 +657,8 @@ bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) {
return false;
}
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
const Grey &grey) {
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
bool utf8, const Grey &grey) {
if (edit_distance == 0) {
return;
}
@@ -657,13 +678,14 @@ void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
"approximate matching.");
}
}
if (will_turn_vacuous(g, edit_distance)) {
if (!hamming && will_turn_vacuous(g, edit_distance)) {
throw CompileError("Approximate matching patterns that reduce to "
"vacuous patterns are disallowed.");
}
}
void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming,
const Grey &grey) {
if (edit_distance == 0) {
return;
}
@@ -671,7 +693,7 @@ void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
assert(grey.allowApproximateMatching);
assert(grey.maxEditDistance >= edit_distance);
ShadowGraph sg(g, edit_distance);
ShadowGraph sg(g, edit_distance, hamming);
sg.fuzz_graph();
// For safety, enforce limit on actual vertex count.

View File

@@ -40,10 +40,10 @@ struct Grey;
class NGHolder;
class ReportManager;
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
const Grey &grey);
// Validates that graph g is suitable for approximate matching. Returns
// immediately when edit_distance is zero. When hamming is set, the check
// that rejects patterns reducing to vacuous patterns is skipped. Rejection
// is reported by throwing CompileError.
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
bool utf8, const Grey &grey);
void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey);
// Transforms g for approximate matching at the given edit_distance by
// building a ShadowGraph over it and calling fuzz_graph(). Does nothing when
// edit_distance is zero. hamming is passed through to the ShadowGraph.
// Asserts that grey.allowApproximateMatching is set and that edit_distance
// does not exceed grey.maxEditDistance.
void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, const Grey &grey);
}
#endif // NG_FUZZY_H