mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-30 03:34:25 +03:00
Add support for Hamming distance approx matching
This commit is contained in:
@@ -78,7 +78,8 @@ void validateExt(const hs_expr_ext &ext) {
|
||||
static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET |
|
||||
HS_EXT_FLAG_MAX_OFFSET |
|
||||
HS_EXT_FLAG_MIN_LENGTH |
|
||||
HS_EXT_FLAG_EDIT_DISTANCE;
|
||||
HS_EXT_FLAG_EDIT_DISTANCE |
|
||||
HS_EXT_FLAG_HAMMING_DISTANCE;
|
||||
if (ext.flags & ~ALL_EXT_FLAGS) {
|
||||
throw CompileError("Invalid hs_expr_ext flag set.");
|
||||
}
|
||||
@@ -96,6 +97,13 @@ void validateExt(const hs_expr_ext &ext) {
|
||||
throw CompileError("In hs_expr_ext, min_length must be less than or "
|
||||
"equal to max_offset.");
|
||||
}
|
||||
|
||||
if ((ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) &&
|
||||
(ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE)) {
|
||||
throw CompileError("In hs_expr_ext, cannot have both edit distance and "
|
||||
"Hamming distance.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
|
||||
@@ -103,7 +111,7 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
|
||||
const hs_expr_ext *ext)
|
||||
: expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH,
|
||||
false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET,
|
||||
0, 0) {
|
||||
0, 0, 0) {
|
||||
ParseMode mode(flags);
|
||||
|
||||
component = parse(expression, mode);
|
||||
@@ -158,6 +166,9 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
|
||||
if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) {
|
||||
expr.edit_distance = ext->edit_distance;
|
||||
}
|
||||
if (ext->flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
|
||||
expr.hamm_distance = ext->hamming_distance;
|
||||
}
|
||||
}
|
||||
|
||||
// These are validated in validateExt, so an error will already have been
|
||||
|
@@ -45,11 +45,13 @@ public:
|
||||
ExpressionInfo(unsigned int index_in, bool allow_vacuous_in,
|
||||
bool highlander_in, bool utf8_in, bool prefilter_in,
|
||||
som_type som_in, ReportID report_in, u64a min_offset_in,
|
||||
u64a max_offset_in, u64a min_length_in, u32 edit_distance_in)
|
||||
u64a max_offset_in, u64a min_length_in, u32 edit_distance_in,
|
||||
u32 hamm_distance_in)
|
||||
: index(index_in), report(report_in), allow_vacuous(allow_vacuous_in),
|
||||
highlander(highlander_in), utf8(utf8_in), prefilter(prefilter_in),
|
||||
som(som_in), min_offset(min_offset_in), max_offset(max_offset_in),
|
||||
min_length(min_length_in), edit_distance(edit_distance_in) {}
|
||||
min_length(min_length_in), edit_distance(edit_distance_in),
|
||||
hamm_distance(hamm_distance_in) {}
|
||||
|
||||
/**
|
||||
* \brief Index of the expression represented by this graph.
|
||||
@@ -95,6 +97,7 @@ public:
|
||||
* 0 if not used.
|
||||
*/
|
||||
u32 edit_distance;
|
||||
u32 hamm_distance;
|
||||
};
|
||||
|
||||
}
|
||||
|
@@ -258,6 +258,13 @@ typedef struct hs_expr_ext {
|
||||
* hs_expr_ext::flags field.
|
||||
*/
|
||||
unsigned edit_distance;
|
||||
|
||||
/**
|
||||
* Allow patterns to approximately match within this Hamming distance. To
|
||||
* use this parameter, set the @ref HS_EXT_FLAG_HAMMING_DISTANCE flag in the
|
||||
* hs_expr_ext::flags field.
|
||||
*/
|
||||
unsigned hamming_distance;
|
||||
} hs_expr_ext_t;
|
||||
|
||||
/**
|
||||
@@ -281,6 +288,9 @@ typedef struct hs_expr_ext {
|
||||
/** Flag indicating that the hs_expr_ext::edit_distance field is used. */
|
||||
#define HS_EXT_FLAG_EDIT_DISTANCE 8ULL
|
||||
|
||||
/** Flag indicating that the hs_expr_ext::hamming_distance field is used. */
|
||||
#define HS_EXT_FLAG_HAMMING_DISTANCE 16ULL
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
|
@@ -347,14 +347,19 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
|
||||
throw CompileError(expr.index, "Pattern can never match.");
|
||||
}
|
||||
|
||||
bool hamming = expr.hamm_distance > 0;
|
||||
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
|
||||
|
||||
DEBUG_PRINTF("edit distance = %u hamming = %s\n", e_dist, hamming ? "true" : "false");
|
||||
|
||||
// validate graph's suitability for fuzzing before resolving asserts
|
||||
validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey);
|
||||
validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
|
||||
|
||||
resolveAsserts(rm, g, expr);
|
||||
dumpDotWrapper(g, expr, "02_post_assert_resolve", cc.grey);
|
||||
assert(allMatchStatesHaveReports(g));
|
||||
|
||||
make_fuzzy(g, expr.edit_distance, cc.grey);
|
||||
make_fuzzy(g, e_dist, hamming, cc.grey);
|
||||
dumpDotWrapper(g, expr, "02a_post_fuzz", cc.grey);
|
||||
|
||||
pruneUseless(g);
|
||||
|
@@ -161,14 +161,17 @@ void fillExpressionInfo(ReportManager &rm, const CompileContext &cc,
|
||||
throw CompileError(expr.index, "Pattern can never match.");
|
||||
}
|
||||
|
||||
bool hamming = expr.hamm_distance > 0;
|
||||
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
|
||||
|
||||
// validate graph's suitability for fuzzing
|
||||
validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey);
|
||||
validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
|
||||
|
||||
resolveAsserts(rm, g, expr);
|
||||
assert(allMatchStatesHaveReports(g));
|
||||
|
||||
// fuzz graph - this must happen before any transformations are made
|
||||
make_fuzzy(g, expr.edit_distance, cc.grey);
|
||||
make_fuzzy(g, e_dist, hamming, cc.grey);
|
||||
|
||||
pruneUseless(g);
|
||||
pruneEmptyVertices(g);
|
||||
|
@@ -144,6 +144,7 @@ vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g,
|
||||
struct ShadowGraph {
|
||||
NGHolder &g;
|
||||
u32 edit_distance;
|
||||
bool hamming;
|
||||
map<pair<NFAVertex, u32>, NFAVertex> shadow_map;
|
||||
map<pair<NFAVertex, u32>, NFAVertex> helper_map;
|
||||
map<NFAVertex, NFAVertex> clones;
|
||||
@@ -151,13 +152,17 @@ struct ShadowGraph {
|
||||
vector<pair<NFAVertex, NFAVertex>> edges_to_be_added;
|
||||
flat_set<NFAVertex> orig;
|
||||
|
||||
ShadowGraph(NGHolder &g_in, u32 ed_in) : g(g_in), edit_distance(ed_in) {}
|
||||
ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in)
|
||||
: g(g_in), edit_distance(ed_in), hamming(hamm_in) {}
|
||||
|
||||
void fuzz_graph() {
|
||||
if (edit_distance == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance,
|
||||
hamming ? "true" : "false");
|
||||
|
||||
// step 1: prepare the vertices, helpers and shadows according to
|
||||
// the original graph
|
||||
prepare_graph();
|
||||
@@ -167,7 +172,9 @@ struct ShadowGraph {
|
||||
|
||||
// step 3: set up reports for newly created vertices (and make clones
|
||||
// if necessary)
|
||||
create_reports();
|
||||
if (!hamming) {
|
||||
create_reports();
|
||||
}
|
||||
|
||||
// step 4: wire up shadow graph and helpers for insert/replace/remove
|
||||
connect_shadow_graph();
|
||||
@@ -244,6 +251,16 @@ private:
|
||||
|
||||
// if there's nowhere to go from this vertex, no helper needed
|
||||
if (proper_out_degree(v, g) < 1) {
|
||||
DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
|
||||
g[shadow_v].index, dist);
|
||||
helper_map[make_pair(v, dist)] = shadow_v;
|
||||
continue;
|
||||
}
|
||||
|
||||
// start and startDs only have helpers for insert, so not Hamming
|
||||
if (hamming && is_any_start(v, g)) {
|
||||
DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
|
||||
g[shadow_v].index, dist);
|
||||
helper_map[make_pair(v, dist)] = shadow_v;
|
||||
continue;
|
||||
}
|
||||
@@ -256,6 +273,8 @@ private:
|
||||
g[helper_v].char_reach = CharReach::dot();
|
||||
// do not copy virtual start's assert flags
|
||||
if (is_virtual_start(v, g)) {
|
||||
DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n",
|
||||
g[helper_v].index, dist);
|
||||
g[helper_v].assert_flags = 0;
|
||||
}
|
||||
helper_map[make_pair(v, dist)] = helper_v;
|
||||
@@ -272,7 +291,7 @@ private:
|
||||
const auto &cur_shadow_helper = helper_map[make_pair(v, dist)];
|
||||
|
||||
// multiple insert
|
||||
if (dist > 1) {
|
||||
if (!hamming && dist > 1) {
|
||||
const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)];
|
||||
connect_to_clones(prev_level_helper, cur_shadow_helper);
|
||||
}
|
||||
@@ -429,13 +448,15 @@ private:
|
||||
connect_preds(v, dist);
|
||||
|
||||
// handle helpers
|
||||
if (dist > 0) {
|
||||
if (!hamming && dist > 0) {
|
||||
connect_helpers(v, dist);
|
||||
}
|
||||
}
|
||||
|
||||
// handle removals
|
||||
connect_removals(v);
|
||||
if (!hamming) {
|
||||
connect_removals(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -636,8 +657,8 @@ bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
|
||||
const Grey &grey) {
|
||||
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
|
||||
bool utf8, const Grey &grey) {
|
||||
if (edit_distance == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -657,13 +678,14 @@ void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
|
||||
"approximate matching.");
|
||||
}
|
||||
}
|
||||
if (will_turn_vacuous(g, edit_distance)) {
|
||||
if (!hamming && will_turn_vacuous(g, edit_distance)) {
|
||||
throw CompileError("Approximate matching patterns that reduce to "
|
||||
"vacuous patterns are disallowed.");
|
||||
}
|
||||
}
|
||||
|
||||
void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
|
||||
void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming,
|
||||
const Grey &grey) {
|
||||
if (edit_distance == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -671,7 +693,7 @@ void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
|
||||
assert(grey.allowApproximateMatching);
|
||||
assert(grey.maxEditDistance >= edit_distance);
|
||||
|
||||
ShadowGraph sg(g, edit_distance);
|
||||
ShadowGraph sg(g, edit_distance, hamming);
|
||||
sg.fuzz_graph();
|
||||
|
||||
// For safety, enforce limit on actual vertex count.
|
||||
|
@@ -40,10 +40,10 @@ struct Grey;
|
||||
class NGHolder;
|
||||
class ReportManager;
|
||||
|
||||
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
|
||||
const Grey &grey);
|
||||
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
|
||||
bool utf8, const Grey &grey);
|
||||
|
||||
void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey);
|
||||
void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, const Grey &grey);
|
||||
}
|
||||
|
||||
#endif // NG_FUZZY_H
|
||||
|
@@ -170,7 +170,7 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) {
|
||||
|
||||
// XXX: don't shortcut literals with extended params (yet)
|
||||
if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length ||
|
||||
expr.edit_distance) {
|
||||
expr.edit_distance || expr.hamm_distance) {
|
||||
DEBUG_PRINTF("extended params not allowed\n");
|
||||
return false;
|
||||
}
|
||||
|
Reference in New Issue
Block a user