Add support for Hamming distance approximate matching

This commit is contained in:
Matthew Barr
2017-10-10 15:26:35 +11:00
committed by Xiang Wang
parent 5827bd1c2b
commit 1891f14755
13 changed files with 169 additions and 56 deletions

View File

@@ -144,3 +144,4 @@
147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match.
148:/\Q<>\Eaaaa/8 #Expression is not valid UTF-8.
149:/[\Q<>\Eaaaa]/8 #Expression is not valid UTF-8.
150:/abcd/{edit_distance=1,hamming_distance=1} #In hs_expr_ext, cannot have both edit distance and Hamming distance.

View File

@@ -84,6 +84,13 @@ ostream& operator<<(ostream &os, const hs_expr_ext &ext) {
os << "edit_distance=" << ext.edit_distance;
first = false;
}
if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
if (!first) {
os << ", ";
}
os << "hamming_distance=" << ext.hamming_distance;
first = false;
}
return os;
}
@@ -171,7 +178,7 @@ TEST_P(ExprInfop, check_ext_null) {
free(info);
}
static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0 };
static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0, 0 };
static const expected_info ei_test[] = {
{"abc", NO_EXT_PARAM, 3, 3, 0, 0, 0},
@@ -214,38 +221,68 @@ static const expected_info ei_test[] = {
{"(foo|bar)\\z", NO_EXT_PARAM, 3, 3, 0, 1, 1},
// Some cases with extended parameters.
{"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0, 0}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0},
10, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
{"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0},
10, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
{"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, 7, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2, 0},
8, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, 8, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
{"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2},
10, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
6, UINT_MAX, 0, 0, 0},
{"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
6, 6, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2},
10, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
6, UINT_MAX, 0, 0, 0},
{"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 6, 0, 2},
6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
6, 6, 0, 0, 0},
{"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
6, 6, 0, 0, 0},
};
INSTANTIATE_TEST_CASE_P(ExprInfo, ExprInfop, ValuesIn(ei_test));

View File

@@ -213,7 +213,7 @@ TEST_P(MatchesTest, Check) {
bool utf8 = (t.flags & HS_FLAG_UTF8) > 0;
set<pair<size_t, size_t>> matches;
bool success = findMatches(*g, rm, t.input, matches, 0, t.notEod, utf8);
bool success = findMatches(*g, rm, t.input, matches, 0, 0, t.notEod, utf8);
ASSERT_TRUE(success);
set<pair<size_t, size_t>> expected(begin(t.matches), end(t.matches));