Add support for Hamming distance approx matching

This commit is contained in:
Matthew Barr 2017-10-10 15:26:35 +11:00 committed by Xiang Wang
parent 5827bd1c2b
commit 1891f14755
13 changed files with 169 additions and 56 deletions

View File

@ -287,6 +287,7 @@ which provides the following fields:
* ``min_length``: The minimum match length (from start to end) required to
successfully match this expression.
* ``edit_distance``: Match this expression within a given Levenshtein distance.
* ``hamming_distance``: Match this expression within a given Hamming distance.
These parameters either allow the set of matches produced by a pattern to be
constrained at compile time (rather than relying on the application to process
@ -299,10 +300,15 @@ and a ``max_offset`` of 15 will not produce matches when scanned against
streams ``foo0123bar`` or ``foo0123456bar``.
Similarly, the pattern :regexp:`/foobar/` when given an ``edit_distance`` of 2
will produce matches when scanned against ``foobar``, ``fooba``, ``fobr``,
``fo_baz``, ``foooobar``, and anything else that lies within edit distance of 2
(as defined by Levenshtein distance). For more details, see the
:ref:`approximate_matching` section.
will produce matches when scanned against ``foobar``, ``f00bar``, ``fooba``,
``fobr``, ``fo_baz``, ``foooobar``, and anything else that lies within edit
distance of 2 (as defined by Levenshtein distance).
When the same pattern :regexp:`/foobar/` is given a ``hamming_distance`` of 2,
it will produce matches when scanned against ``foobar``, ``boofar``,
``f00bar``, and anything else with at most two characters substituted from the
original pattern. For more details, see the :ref:`approximate_matching`
section.
=================
Prefiltering Mode
@ -398,13 +404,20 @@ follows:
#. **Edit distance** is defined as Levenshtein distance. That is, there are
three possible edit types considered: insertion, removal and substitution.
More formal description can be found on
A more formal description can be found on
`Wikipedia <https://en.wikipedia.org/wiki/Levenshtein_distance>`_.
#. **Approximate matching** will match all *corpora* within a given edit
distance. That is, given a pattern, approximate matching will match anything
that can be edited to arrive at a corpus that exactly matches the original
pattern.
#. **Hamming distance** is the number of positions at which two strings of
equal length differ. That is, it is the number of substitutions required to
convert one string to the other. There are no insertions or removals when
approximately matching within a Hamming distance. A more formal description
can be found on
`Wikipedia <https://en.wikipedia.org/wiki/Hamming_distance>`_.
#. **Approximate matching** will match all *corpora* within a given edit or
Hamming distance. That is, given a pattern, approximate matching will match
anything that can be edited to arrive at a corpus that exactly matches the
original pattern.
#. **Matching semantics** are exactly the same as described in :ref:`semantics`.
@ -437,7 +450,9 @@ matching support. Here they are, in a nutshell:
reduce to so-called "vacuous" patterns (patterns that match everything). For
example, pattern :regexp:`/foo/` with edit distance 3, if implemented,
would reduce to matching zero-length buffers. Such patterns will result in a
"Pattern cannot be approximately matched" compile error.
"Pattern cannot be approximately matched" compile error. Approximate
matching within a Hamming distance does not remove symbols, so it will not
reduce to a vacuous pattern.
* Finally, due to the inherent complexities of defining matching behavior,
approximate matching implements a reduced subset of regular expression
syntax. Approximate matching does not support UTF-8 (and other

View File

@ -78,7 +78,8 @@ void validateExt(const hs_expr_ext &ext) {
static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET |
HS_EXT_FLAG_MAX_OFFSET |
HS_EXT_FLAG_MIN_LENGTH |
HS_EXT_FLAG_EDIT_DISTANCE;
HS_EXT_FLAG_EDIT_DISTANCE |
HS_EXT_FLAG_HAMMING_DISTANCE;
if (ext.flags & ~ALL_EXT_FLAGS) {
throw CompileError("Invalid hs_expr_ext flag set.");
}
@ -96,6 +97,13 @@ void validateExt(const hs_expr_ext &ext) {
throw CompileError("In hs_expr_ext, min_length must be less than or "
"equal to max_offset.");
}
if ((ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) &&
(ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE)) {
throw CompileError("In hs_expr_ext, cannot have both edit distance and "
"Hamming distance.");
}
}
ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
@ -103,7 +111,7 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
const hs_expr_ext *ext)
: expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH,
false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET,
0, 0) {
0, 0, 0) {
ParseMode mode(flags);
component = parse(expression, mode);
@ -158,6 +166,9 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) {
expr.edit_distance = ext->edit_distance;
}
if (ext->flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
expr.hamm_distance = ext->hamming_distance;
}
}
// These are validated in validateExt, so an error will already have been

View File

@ -45,11 +45,13 @@ public:
ExpressionInfo(unsigned int index_in, bool allow_vacuous_in,
bool highlander_in, bool utf8_in, bool prefilter_in,
som_type som_in, ReportID report_in, u64a min_offset_in,
u64a max_offset_in, u64a min_length_in, u32 edit_distance_in)
u64a max_offset_in, u64a min_length_in, u32 edit_distance_in,
u32 hamm_distance_in)
: index(index_in), report(report_in), allow_vacuous(allow_vacuous_in),
highlander(highlander_in), utf8(utf8_in), prefilter(prefilter_in),
som(som_in), min_offset(min_offset_in), max_offset(max_offset_in),
min_length(min_length_in), edit_distance(edit_distance_in) {}
min_length(min_length_in), edit_distance(edit_distance_in),
hamm_distance(hamm_distance_in) {}
/**
* \brief Index of the expression represented by this graph.
@ -95,6 +97,7 @@ public:
* 0 if not used.
*/
u32 edit_distance;
/**
 * \brief Hamming distance within which this expression may approximately
 * match; 0 if not used.
 */
u32 hamm_distance;
};
}

View File

@ -258,6 +258,13 @@ typedef struct hs_expr_ext {
* hs_expr_ext::flags field.
*/
unsigned edit_distance;
/**
* Allow patterns to approximately match within this Hamming distance. To
* use this parameter, set the @ref HS_EXT_FLAG_HAMMING_DISTANCE flag in the
* hs_expr_ext::flags field.
*/
unsigned hamming_distance;
} hs_expr_ext_t;
/**
@ -281,6 +288,9 @@ typedef struct hs_expr_ext {
/** Flag indicating that the hs_expr_ext::edit_distance field is used. */
#define HS_EXT_FLAG_EDIT_DISTANCE 8ULL
/** Flag indicating that the hs_expr_ext::hamming_distance field is used. */
#define HS_EXT_FLAG_HAMMING_DISTANCE 16ULL
/** @} */
/**

View File

@ -347,14 +347,19 @@ bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
throw CompileError(expr.index, "Pattern can never match.");
}
bool hamming = expr.hamm_distance > 0;
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
DEBUG_PRINTF("edit distance = %u hamming = %s\n", e_dist, hamming ? "true" : "false");
// validate graph's suitability for fuzzing before resolving asserts
validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey);
validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
resolveAsserts(rm, g, expr);
dumpDotWrapper(g, expr, "02_post_assert_resolve", cc.grey);
assert(allMatchStatesHaveReports(g));
make_fuzzy(g, expr.edit_distance, cc.grey);
make_fuzzy(g, e_dist, hamming, cc.grey);
dumpDotWrapper(g, expr, "02a_post_fuzz", cc.grey);
pruneUseless(g);

View File

@ -161,14 +161,17 @@ void fillExpressionInfo(ReportManager &rm, const CompileContext &cc,
throw CompileError(expr.index, "Pattern can never match.");
}
bool hamming = expr.hamm_distance > 0;
u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance;
// validate graph's suitability for fuzzing
validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey);
validate_fuzzy_compile(g, e_dist, hamming, expr.utf8, cc.grey);
resolveAsserts(rm, g, expr);
assert(allMatchStatesHaveReports(g));
// fuzz graph - this must happen before any transformations are made
make_fuzzy(g, expr.edit_distance, cc.grey);
make_fuzzy(g, e_dist, hamming, cc.grey);
pruneUseless(g);
pruneEmptyVertices(g);

View File

@ -144,6 +144,7 @@ vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g,
struct ShadowGraph {
NGHolder &g;
u32 edit_distance;
bool hamming;
map<pair<NFAVertex, u32>, NFAVertex> shadow_map;
map<pair<NFAVertex, u32>, NFAVertex> helper_map;
map<NFAVertex, NFAVertex> clones;
@ -151,13 +152,17 @@ struct ShadowGraph {
vector<pair<NFAVertex, NFAVertex>> edges_to_be_added;
flat_set<NFAVertex> orig;
ShadowGraph(NGHolder &g_in, u32 ed_in) : g(g_in), edit_distance(ed_in) {}
ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in)
: g(g_in), edit_distance(ed_in), hamming(hamm_in) {}
void fuzz_graph() {
if (edit_distance == 0) {
return;
}
DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance,
hamming ? "true" : "false");
// step 1: prepare the vertices, helpers and shadows according to
// the original graph
prepare_graph();
@ -167,7 +172,9 @@ struct ShadowGraph {
// step 3: set up reports for newly created vertices (and make clones
// if necessary)
create_reports();
if (!hamming) {
create_reports();
}
// step 4: wire up shadow graph and helpers for insert/replace/remove
connect_shadow_graph();
@ -244,6 +251,16 @@ private:
// if there's nowhere to go from this vertex, no helper needed
if (proper_out_degree(v, g) < 1) {
DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
g[shadow_v].index, dist);
helper_map[make_pair(v, dist)] = shadow_v;
continue;
}
// start and startDs only have helpers for insert, so not Hamming
if (hamming && is_any_start(v, g)) {
DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
g[shadow_v].index, dist);
helper_map[make_pair(v, dist)] = shadow_v;
continue;
}
@ -256,6 +273,8 @@ private:
g[helper_v].char_reach = CharReach::dot();
// do not copy virtual start's assert flags
if (is_virtual_start(v, g)) {
DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n",
g[helper_v].index, dist);
g[helper_v].assert_flags = 0;
}
helper_map[make_pair(v, dist)] = helper_v;
@ -272,7 +291,7 @@ private:
const auto &cur_shadow_helper = helper_map[make_pair(v, dist)];
// multiple insert
if (dist > 1) {
if (!hamming && dist > 1) {
const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)];
connect_to_clones(prev_level_helper, cur_shadow_helper);
}
@ -429,13 +448,15 @@ private:
connect_preds(v, dist);
// handle helpers
if (dist > 0) {
if (!hamming && dist > 0) {
connect_helpers(v, dist);
}
}
// handle removals
connect_removals(v);
if (!hamming) {
connect_removals(v);
}
}
}
@ -636,8 +657,8 @@ bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) {
return false;
}
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
const Grey &grey) {
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
bool utf8, const Grey &grey) {
if (edit_distance == 0) {
return;
}
@ -657,13 +678,14 @@ void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
"approximate matching.");
}
}
if (will_turn_vacuous(g, edit_distance)) {
if (!hamming && will_turn_vacuous(g, edit_distance)) {
throw CompileError("Approximate matching patterns that reduce to "
"vacuous patterns are disallowed.");
}
}
void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming,
const Grey &grey) {
if (edit_distance == 0) {
return;
}
@ -671,7 +693,7 @@ void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
assert(grey.allowApproximateMatching);
assert(grey.maxEditDistance >= edit_distance);
ShadowGraph sg(g, edit_distance);
ShadowGraph sg(g, edit_distance, hamming);
sg.fuzz_graph();
// For safety, enforce limit on actual vertex count.

View File

@ -40,10 +40,10 @@ struct Grey;
class NGHolder;
class ReportManager;
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
const Grey &grey);
void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
bool utf8, const Grey &grey);
void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey);
void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, const Grey &grey);
}
#endif // NG_FUZZY_H

View File

@ -170,7 +170,7 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) {
// XXX: don't shortcut literals with extended params (yet)
if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length ||
expr.edit_distance) {
expr.edit_distance || expr.hamm_distance) {
DEBUG_PRINTF("extended params not allowed\n");
return false;
}

View File

@ -144,3 +144,4 @@
147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match.
148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8.
149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8.
150:/abcd/{edit_distance=1,hamming_distance=1} #In hs_expr_ext, cannot have both edit distance and Hamming distance.

View File

@ -84,6 +84,13 @@ ostream& operator<<(ostream &os, const hs_expr_ext &ext) {
os << "edit_distance=" << ext.edit_distance;
first = false;
}
if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
if (!first) {
os << ", ";
}
os << "hamming_distance=" << ext.hamming_distance;
first = false;
}
return os;
}
@ -171,7 +178,7 @@ TEST_P(ExprInfop, check_ext_null) {
free(info);
}
static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0 };
static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0, 0 };
static const expected_info ei_test[] = {
{"abc", NO_EXT_PARAM, 3, 3, 0, 0, 0},
@ -214,38 +221,68 @@ static const expected_info ei_test[] = {
{"(foo|bar)\\z", NO_EXT_PARAM, 3, 3, 0, 1, 1},
// Some cases with extended parameters.
{"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0, 0}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0},
10, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0},
10, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, 7, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2, 0},
8, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2},
10, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
6, 6, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2},
10, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 6, 0, 2},
6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
6, 6, 0, 0, 0},
};
INSTANTIATE_TEST_CASE_P(ExprInfo, ExprInfop, ValuesIn(ei_test));

View File

@ -213,7 +213,7 @@ TEST_P(MatchesTest, Check) {
bool utf8 = (t.flags & HS_FLAG_UTF8) > 0;
set<pair<size_t, size_t>> matches;
bool success = findMatches(*g, rm, t.input, matches, 0, t.notEod, utf8);
bool success = findMatches(*g, rm, t.input, matches, 0, 0, t.notEod, utf8);
ASSERT_TRUE(success);
set<pair<size_t, size_t>> expected(begin(t.matches), end(t.matches));

View File

@ -49,7 +49,8 @@ enum ParamKey {
PARAM_MIN_OFFSET,
PARAM_MAX_OFFSET,
PARAM_MIN_LENGTH,
PARAM_EDIT_DISTANCE
PARAM_EDIT_DISTANCE,
PARAM_HAMM_DISTANCE
};
%%{
@ -97,6 +98,10 @@ enum ParamKey {
ext->flags |= HS_EXT_FLAG_EDIT_DISTANCE;
ext->edit_distance = num;
break;
case PARAM_HAMM_DISTANCE:
ext->flags |= HS_EXT_FLAG_HAMMING_DISTANCE;
ext->hamming_distance = num;
break;
case PARAM_NONE:
default:
// No key specified, syntax invalid.
@ -158,7 +163,8 @@ bool HS_CDECL readExpression(const std::string &input, std::string &expr,
param = ('min_offset' @{ key = PARAM_MIN_OFFSET; } |
'max_offset' @{ key = PARAM_MAX_OFFSET; } |
'min_length' @{ key = PARAM_MIN_LENGTH; } |
'edit_distance' @{ key = PARAM_EDIT_DISTANCE; });
'edit_distance' @{ key = PARAM_EDIT_DISTANCE; } |
'hamming_distance' @{ key = PARAM_HAMM_DISTANCE; });
value = (digit @accumulateNum)+ >{num = 0;};
param_spec = (' '* param '=' value ' '*) >{ key = PARAM_NONE; }