From 1891f147553ebea9893c8a47952a283b7b8dcacf Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 10 Oct 2017 15:26:35 +1100 Subject: [PATCH] Add support for Hamming distance approx matching --- doc/dev-reference/compilation.rst | 35 +++++++---- src/compiler/compiler.cpp | 15 ++++- src/compiler/expression_info.h | 7 ++- src/hs_compile.h | 10 ++++ src/nfagraph/ng.cpp | 9 ++- src/nfagraph/ng_expr_info.cpp | 7 ++- src/nfagraph/ng_fuzzy.cpp | 42 +++++++++---- src/nfagraph/ng_fuzzy.h | 6 +- src/parser/shortcut_literal.cpp | 2 +- unit/hyperscan/bad_patterns.txt | 1 + unit/hyperscan/expr_info.cpp | 79 ++++++++++++++++++------- unit/internal/nfagraph_find_matches.cpp | 2 +- util/ExpressionParser.rl | 10 +++- 13 files changed, 169 insertions(+), 56 deletions(-) diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 6b6d972a..65eb1fc4 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -287,6 +287,7 @@ which provides the following fields: * ``min_length``: The minimum match length (from start to end) required to successfully match this expression. * ``edit_distance``: Match this expression within a given Levenshtein distance. +* ``hamming_distance``: Match this expression within a given Hamming distance. These parameters either allow the set of matches produced by a pattern to be constrained at compile time (rather than relying on the application to process @@ -299,10 +300,15 @@ and a ``max_offset`` of 15 will not produce matches when scanned against streams ``foo0123bar`` or ``foo0123456bar``. Similarly, the pattern :regexp:`/foobar/` when given an ``edit_distance`` of 2 -will produce matches when scanned against ``foobar``, ``fooba``, ``fobr``, -``fo_baz``, ``foooobar``, and anything else that lies within edit distance of 2 -(as defined by Levenshtein distance). For more details, see the -:ref:`approximate_matching` section. 
+will produce matches when scanned against ``foobar``, ``f00bar``, ``fooba``, +``fobr``, ``fo_baz``, ``foooobar``, and anything else that lies within edit +distance of 2 (as defined by Levenshtein distance). + +When the same pattern :regexp:`/foobar/` is given a ``hamming_distance`` of 2, +it will produce matches when scanned against ``foobar``, ``boofar``, +``f00bar``, and anything else with at most two characters substituted from the +original pattern. For more details, see the :ref:`approximate_matching` +section. ================= Prefiltering Mode @@ -398,13 +404,20 @@ follows: #. **Edit distance** is defined as Levenshtein distance. That is, there are three possible edit types considered: insertion, removal and substitution. - More formal description can be found on + A more formal description can be found on `Wikipedia <https://en.wikipedia.org/wiki/Levenshtein_distance>`_. -#. **Approximate matching** will match all *corpora* within a given edit - distance. That is, given a pattern, approximate matching will match anything - that can be edited to arrive at a corpus that exactly matches the original - pattern. +#. **Hamming distance** is the number of positions by which two strings of + equal length differ. That is, it is the number of substitutions required to + convert one string to the other. There are no insertions or removals when + approximate matching using a Hamming distance. A more formal description can + be found on + `Wikipedia <https://en.wikipedia.org/wiki/Hamming_distance>`_. + +#. **Approximate matching** will match all *corpora* within a given edit or + Hamming distance. That is, given a pattern, approximate matching will match + anything that can be edited to arrive at a corpus that exactly matches the + original pattern. #. **Matching semantics** are exactly the same as described in :ref:`semantics`. @@ -437,7 +450,9 @@ matching support. Here they are, in a nutshell: reduce to so-called "vacuous" patterns (patterns that match everything). 
For example, pattern :regexp:`/foo/` with edit distance 3, if implemented, would reduce to matching zero-length buffers. Such patterns will result in a - "Pattern cannot be approximately matched" compile error. + "Pattern cannot be approximately matched" compile error. Approximate + matching within a Hamming distance does not remove symbols, so will not + reduce to a vacuous pattern. * Finally, due to the inherent complexities of defining matching behavior, approximate matching implements a reduced subset of regular expression syntax. Approximate matching does not support UTF-8 (and other diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index cce89e40..7affb08d 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -78,7 +78,8 @@ void validateExt(const hs_expr_ext &ext) { static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET | HS_EXT_FLAG_MIN_LENGTH | - HS_EXT_FLAG_EDIT_DISTANCE; + HS_EXT_FLAG_EDIT_DISTANCE | + HS_EXT_FLAG_HAMMING_DISTANCE; if (ext.flags & ~ALL_EXT_FLAGS) { throw CompileError("Invalid hs_expr_ext flag set."); } @@ -96,6 +97,13 @@ void validateExt(const hs_expr_ext &ext) { throw CompileError("In hs_expr_ext, min_length must be less than or " "equal to max_offset."); } + + if ((ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) && + (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE)) { + throw CompileError("In hs_expr_ext, cannot have both edit distance and " + "Hamming distance."); + } + } ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, @@ -103,7 +111,7 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, const hs_expr_ext *ext) : expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH, false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET, - 0, 0) { + 0, 0, 0) { ParseMode mode(flags); component = parse(expression, mode); @@ -158,6 +166,9 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char 
*expression, if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) { expr.edit_distance = ext->edit_distance; } + if (ext->flags & HS_EXT_FLAG_HAMMING_DISTANCE) { + expr.hamm_distance = ext->hamming_distance; + } } // These are validated in validateExt, so an error will already have been diff --git a/src/compiler/expression_info.h b/src/compiler/expression_info.h index 7775f59e..45d18cbf 100644 --- a/src/compiler/expression_info.h +++ b/src/compiler/expression_info.h @@ -45,11 +45,13 @@ public: ExpressionInfo(unsigned int index_in, bool allow_vacuous_in, bool highlander_in, bool utf8_in, bool prefilter_in, som_type som_in, ReportID report_in, u64a min_offset_in, - u64a max_offset_in, u64a min_length_in, u32 edit_distance_in) + u64a max_offset_in, u64a min_length_in, u32 edit_distance_in, + u32 hamm_distance_in) : index(index_in), report(report_in), allow_vacuous(allow_vacuous_in), highlander(highlander_in), utf8(utf8_in), prefilter(prefilter_in), som(som_in), min_offset(min_offset_in), max_offset(max_offset_in), - min_length(min_length_in), edit_distance(edit_distance_in) {} + min_length(min_length_in), edit_distance(edit_distance_in), + hamm_distance(hamm_distance_in) {} /** * \brief Index of the expression represented by this graph. @@ -95,6 +97,7 @@ public: * 0 if not used. */ u32 edit_distance; + u32 hamm_distance; }; } diff --git a/src/hs_compile.h b/src/hs_compile.h index 3d527044..51106739 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -258,6 +258,13 @@ typedef struct hs_expr_ext { * hs_expr_ext::flags field. */ unsigned edit_distance; + + /** + * Allow patterns to approximately match within this Hamming distance. To + * use this parameter, set the @ref HS_EXT_FLAG_HAMMING_DISTANCE flag in the + * hs_expr_ext::flags field. + */ + unsigned hamming_distance; } hs_expr_ext_t; /** @@ -281,6 +288,9 @@ typedef struct hs_expr_ext { /** Flag indicating that the hs_expr_ext::edit_distance field is used. 
*/ #define HS_EXT_FLAG_EDIT_DISTANCE 8ULL +/** Flag indicating that the hs_expr_ext::hamming_distance field is used. */ +#define HS_EXT_FLAG_HAMMING_DISTANCE 16ULL + /** @} */ /** diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index 8b247c74..8b7e4f91 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -347,14 +347,19 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr g_ptr) { throw CompileError(expr.index, "Pattern can never match."); } + bool hamming = expr.hamm_distance > 0; + u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance; + + DEBUG_PRINTF("edit distance = %u hamming = %s\n", e_dist, hamming ? "true" : "false"); + // validate graph's suitability for fuzzing before resolving asserts - validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey); + validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey); resolveAsserts(rm, g, expr); dumpDotWrapper(g, expr, "02_post_assert_resolve", cc.grey); assert(allMatchStatesHaveReports(g)); - make_fuzzy(g, expr.edit_distance, cc.grey); + make_fuzzy(g, e_dist, hamming, cc.grey); dumpDotWrapper(g, expr, "02a_post_fuzz", cc.grey); pruneUseless(g); diff --git a/src/nfagraph/ng_expr_info.cpp b/src/nfagraph/ng_expr_info.cpp index 5f5bbea7..f8abbd04 100644 --- a/src/nfagraph/ng_expr_info.cpp +++ b/src/nfagraph/ng_expr_info.cpp @@ -161,14 +161,17 @@ void fillExpressionInfo(ReportManager &rm, const CompileContext &cc, throw CompileError(expr.index, "Pattern can never match."); } + bool hamming = expr.hamm_distance > 0; + u32 e_dist = hamming ? 
expr.hamm_distance : expr.edit_distance; + // validate graph's suitability for fuzzing - validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey); + validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey); resolveAsserts(rm, g, expr); assert(allMatchStatesHaveReports(g)); // fuzz graph - this must happen before any transformations are made - make_fuzzy(g, expr.edit_distance, cc.grey); + make_fuzzy(g, e_dist, hamming, cc.grey); pruneUseless(g); pruneEmptyVertices(g); diff --git a/src/nfagraph/ng_fuzzy.cpp b/src/nfagraph/ng_fuzzy.cpp index 2c3d85bd..78fd8629 100644 --- a/src/nfagraph/ng_fuzzy.cpp +++ b/src/nfagraph/ng_fuzzy.cpp @@ -144,6 +144,7 @@ vector> gatherPredecessorsByDepth(const NGHolder &g, struct ShadowGraph { NGHolder &g; u32 edit_distance; + bool hamming; map, NFAVertex> shadow_map; map, NFAVertex> helper_map; map clones; @@ -151,13 +152,17 @@ struct ShadowGraph { vector> edges_to_be_added; flat_set orig; - ShadowGraph(NGHolder &g_in, u32 ed_in) : g(g_in), edit_distance(ed_in) {} + ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in) + : g(g_in), edit_distance(ed_in), hamming(hamm_in) {} void fuzz_graph() { if (edit_distance == 0) { return; } + DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance, + hamming ? 
"true" : "false"); + // step 1: prepare the vertices, helpers and shadows according to // the original graph prepare_graph(); @@ -167,7 +172,9 @@ struct ShadowGraph { // step 3: set up reports for newly created vertices (and make clones // if necessary) - create_reports(); + if (!hamming) { + create_reports(); + } // step 4: wire up shadow graph and helpers for insert/replace/remove connect_shadow_graph(); @@ -244,6 +251,16 @@ private: // if there's nowhere to go from this vertex, no helper needed if (proper_out_degree(v, g) < 1) { + DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", + g[shadow_v].index, dist); + helper_map[make_pair(v, dist)] = shadow_v; + continue; + } + + // start and startDs only have helpers for insert, so not Hamming + if (hamming && is_any_start(v, g)) { + DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", + g[shadow_v].index, dist); helper_map[make_pair(v, dist)] = shadow_v; continue; } @@ -256,6 +273,8 @@ private: g[helper_v].char_reach = CharReach::dot(); // do not copy virtual start's assert flags if (is_virtual_start(v, g)) { + DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n", + g[helper_v].index, dist); g[helper_v].assert_flags = 0; } helper_map[make_pair(v, dist)] = helper_v; @@ -272,7 +291,7 @@ private: const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; // multiple insert - if (dist > 1) { + if (!hamming && dist > 1) { const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)]; connect_to_clones(prev_level_helper, cur_shadow_helper); } @@ -429,13 +448,15 @@ private: connect_preds(v, dist); // handle helpers - if (dist > 0) { + if (!hamming && dist > 0) { connect_helpers(v, dist); } } // handle removals - connect_removals(v); + if (!hamming) { + connect_removals(v); + } } } @@ -636,8 +657,8 @@ bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) { return false; } -void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8, - const Grey &grey) { +void 
validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming, + bool utf8, const Grey &grey) { if (edit_distance == 0) { return; } @@ -657,13 +678,14 @@ void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8, "approximate matching."); } } - if (will_turn_vacuous(g, edit_distance)) { + if (!hamming && will_turn_vacuous(g, edit_distance)) { throw CompileError("Approximate matching patterns that reduce to " "vacuous patterns are disallowed."); } } -void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) { +void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, + const Grey &grey) { if (edit_distance == 0) { return; } @@ -671,7 +693,7 @@ void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) { assert(grey.allowApproximateMatching); assert(grey.maxEditDistance >= edit_distance); - ShadowGraph sg(g, edit_distance); + ShadowGraph sg(g, edit_distance, hamming); sg.fuzz_graph(); // For safety, enforce limit on actual vertex count. diff --git a/src/nfagraph/ng_fuzzy.h b/src/nfagraph/ng_fuzzy.h index a2c82127..a99767d8 100644 --- a/src/nfagraph/ng_fuzzy.h +++ b/src/nfagraph/ng_fuzzy.h @@ -40,10 +40,10 @@ struct Grey; class NGHolder; class ReportManager; -void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8, - const Grey &grey); +void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming, + bool utf8, const Grey &grey); -void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey); +void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, const Grey &grey); } #endif // NG_FUZZY_H diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index 4539836a..82679c88 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -170,7 +170,7 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) { // XXX: don't shortcut literals with extended params (yet) if (expr.min_offset || expr.max_offset != MAX_OFFSET || 
expr.min_length || - expr.edit_distance) { + expr.edit_distance || expr.hamm_distance) { DEBUG_PRINTF("extended params not allowed\n"); return false; } diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 3042dc82..7cc03834 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -144,3 +144,4 @@ 147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match. 148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8. 149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8. +150:/abcd/{edit_distance=1,hamming_distance=1} #In hs_expr_ext, cannot have both edit distance and Hamming distance. diff --git a/unit/hyperscan/expr_info.cpp b/unit/hyperscan/expr_info.cpp index 7cc6abd7..0ea8bce5 100644 --- a/unit/hyperscan/expr_info.cpp +++ b/unit/hyperscan/expr_info.cpp @@ -84,6 +84,13 @@ ostream& operator<<(ostream &os, const hs_expr_ext &ext) { os << "edit_distance=" << ext.edit_distance; first = false; } + if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) { + if (!first) { + os << ", "; + } + os << "hamming_distance=" << ext.hamming_distance; + first = false; + } return os; } @@ -171,7 +178,7 @@ TEST_P(ExprInfop, check_ext_null) { free(info); } -static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0 }; +static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0, 0 }; static const expected_info ei_test[] = { {"abc", NO_EXT_PARAM, 3, 3, 0, 0, 0}, @@ -214,38 +221,68 @@ static const expected_info ei_test[] = { {"(foo|bar)\\z", NO_EXT_PARAM, 3, 3, 0, 1, 1}, // Some cases with extended parameters. 
- {"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0}, - {"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0, 0}, 6, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0}, 10, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0}, 4, 6, 0, 0, 0}, - {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0}, - {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0}, - {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0}, + {"^abc.*def", 
{HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0}, 10, UINT_MAX, 0, 0, 0}, - {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0}, - {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0}, 4, 6, 0, 0, 0}, - {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0}, - {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0}, - {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, 7, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, 8, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2, 0}, 8, 8, 0, 0, 0}, - {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0}, 4, 8, 0, 0, 0}, - {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0}, 4, 6, 0, 0, 0}, + + {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2}, + 10, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2}, + 6, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2}, + 6, 6, 0, 0, 0}, + + {"^abc.*def", 
{HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2}, + 10, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2}, + 6, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2}, + 6, 6, 0, 0, 0}, + + {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, 6, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, 6, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, 6, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 6, 0, 2}, + 6, 6, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2}, + 6, 6, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2}, + 6, 6, 0, 0, 0}, }; INSTANTIATE_TEST_CASE_P(ExprInfo, ExprInfop, ValuesIn(ei_test)); diff --git a/unit/internal/nfagraph_find_matches.cpp b/unit/internal/nfagraph_find_matches.cpp index cd0cd796..81f1610c 100644 --- a/unit/internal/nfagraph_find_matches.cpp +++ b/unit/internal/nfagraph_find_matches.cpp @@ -213,7 +213,7 @@ TEST_P(MatchesTest, Check) { bool utf8 = (t.flags & HS_FLAG_UTF8) > 0; set> matches; - bool success = findMatches(*g, rm, t.input, matches, 0, t.notEod, utf8); + bool success = findMatches(*g, rm, t.input, matches, 0, 0, t.notEod, utf8); ASSERT_TRUE(success); set> expected(begin(t.matches), end(t.matches)); diff --git a/util/ExpressionParser.rl b/util/ExpressionParser.rl index 233b70c1..94d03508 100644 --- a/util/ExpressionParser.rl +++ b/util/ExpressionParser.rl @@ -49,7 +49,8 @@ enum ParamKey { PARAM_MIN_OFFSET, PARAM_MAX_OFFSET, PARAM_MIN_LENGTH, - 
PARAM_EDIT_DISTANCE + PARAM_EDIT_DISTANCE, + PARAM_HAMM_DISTANCE }; %%{ @@ -97,6 +98,10 @@ enum ParamKey { ext->flags |= HS_EXT_FLAG_EDIT_DISTANCE; ext->edit_distance = num; break; + case PARAM_HAMM_DISTANCE: + ext->flags |= HS_EXT_FLAG_HAMMING_DISTANCE; + ext->hamming_distance = num; + break; case PARAM_NONE: default: // No key specified, syntax invalid. @@ -158,7 +163,8 @@ bool HS_CDECL readExpression(const std::string &input, std::string &expr, param = ('min_offset' @{ key = PARAM_MIN_OFFSET; } | 'max_offset' @{ key = PARAM_MAX_OFFSET; } | 'min_length' @{ key = PARAM_MIN_LENGTH; } | - 'edit_distance' @{ key = PARAM_EDIT_DISTANCE; }); + 'edit_distance' @{ key = PARAM_EDIT_DISTANCE; } | + 'hamming_distance' @{ key = PARAM_HAMM_DISTANCE; }); value = (digit @accumulateNum)+ >{num = 0;}; param_spec = (' '* param '=' value ' '*) >{ key = PARAM_NONE; }