From a5660ee3d4a789c0dd38ae95cb5031a36a0bb9ea Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 13 Oct 2015 10:16:54 +1100 Subject: [PATCH 01/17] nfagraph_comp: use common constructGraph --- unit/internal/nfagraph_comp.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/unit/internal/nfagraph_comp.cpp b/unit/internal/nfagraph_comp.cpp index 451871f9..41af3f0c 100644 --- a/unit/internal/nfagraph_comp.cpp +++ b/unit/internal/nfagraph_comp.cpp @@ -32,6 +32,7 @@ #include "config.h" #include "gtest/gtest.h" +#include "nfagraph_common.h" #include "grey.h" #include "hs.h" #include "compiler/compiler.h" @@ -43,17 +44,8 @@ using namespace std; using namespace ue2; -// Helper: build us an NFA graph from a regex -static -unique_ptr constructGraph(const string &expr) { - CompileContext cc(false, false, get_current_target(), Grey()); - ParsedExpression parsed(0, expr.c_str(), 0, 0); - ReportManager rm(cc.grey); - return buildWrapper(rm, cc, parsed); -} - TEST(NFAGraph, CalcComp1) { - auto graph = constructGraph("abc|def|ghi"); + auto graph = constructGraph("abc|def|ghi", 0); ASSERT_TRUE(graph != nullptr); deque> comps = calcComponents(*graph); @@ -61,7 +53,7 @@ TEST(NFAGraph, CalcComp1) { } TEST(NFAGraph, CalcComp2) { - auto graph = constructGraph("a|b|c|d|e|f|g|h|i"); + auto graph = constructGraph("a|b|c|d|e|f|g|h|i", 0); ASSERT_TRUE(graph != nullptr); deque> comps = calcComponents(*graph); @@ -72,7 +64,7 @@ TEST(NFAGraph, CalcComp2) { TEST(NFAGraph, RecalcComp1) { deque> comps; - comps.push_back(constructGraph("abc|def|ghi")); + comps.push_back(constructGraph("abc|def|ghi", 0)); ASSERT_TRUE(comps.back() != nullptr); recalcComponents(comps); From 9578ae6ee254ee807fda6c3d3276cd257b72ec11 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 13 Oct 2015 10:49:45 +1100 Subject: [PATCH 02/17] nfagraph_find_matches: simplify/cleanup --- unit/internal/nfagraph_find_matches.cpp | 259 ++++++++++-------------- 1 file changed, 105 insertions(+), 154 deletions(-) diff --git a/unit/internal/nfagraph_find_matches.cpp b/unit/internal/nfagraph_find_matches.cpp index 3e430aff..41d28c52 100644 --- a/unit/internal/nfagraph_find_matches.cpp +++ b/unit/internal/nfagraph_find_matches.cpp @@ -29,6 +29,8 @@ #include "config.h" #include "gtest/gtest.h" +#include "nfagraph_common.h" + #include "compiler/compiler.h" #include "grey.h" #include "nfagraph/ng_builder.h" @@ -42,15 +44,10 @@ using namespace std; using namespace testing; using namespace ue2; -#define NUM_MATCHES 4U -#define P(x,y) pair(x, y) -#define NO_MATCH P(~0U, ~0U) - struct MatchesTestParams { string pattern; string input; - // max 4 matches per pattern, P(-1,-1) is "no match" - pair matches[NUM_MATCHES]; + vector> matches; unsigned flags; bool notEod; bool som; @@ -58,18 +55,11 @@ struct MatchesTestParams { // teach google-test how to print a param void PrintTo(const MatchesTestParams &p, ::std::ostream *os) { - pair *matches = const_cast *>(p.matches); - *os << "( \"" << p.pattern << "\", " << "\"" << p.input << "\", " << "{"; - for (int i = 0; i < 4; i++) { - if (matches[i] == NO_MATCH) { - *os << "NO_MATCH,"; - break; - } else { - *os << "P(" << matches[i].first << ',' << matches[i].second << "),"; - } + for (const auto &match : p.matches) { + *os << "P(" << match.first << ',' << match.second << "),"; } *os << "}, "; *os << "flags(" << p.flags << "), " @@ -81,192 +71,153 @@ void PrintTo(const MatchesTestParams &p, ::std::ostream *os) { class MatchesTest: public TestWithParam { }; +#define P(x, y) pair((x), (y)) + static const MatchesTestParams matchesTests[] = { // EOD and anchored patterns // these should produce no matches - { "^foobar", "foolish", {NO_MATCH}, 0, false, true}, - { "^foobar$", "ze foobar", {NO_MATCH}, 0, false, true}, - { "^foobar$", "foobar ", {NO_MATCH}, 0, false, true}, - { "^abc\\b", "abcde", {NO_MATCH}, 0, false, true}, - { "^a\\b", "aa", {NO_MATCH}, 0, false, true}, - { "^foobar\\b", "foobarz", {NO_MATCH}, 0, false, true}, - { "^foobar", "fooq", {NO_MATCH}, 0, false, true}, - { "^foobar", "foo", {NO_MATCH}, 0, false, true}, - { "^foobar", "fooba", {NO_MATCH}, 0, false, true}, - { "^foo *bar", "foolishness bar none ", {NO_MATCH}, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc p", {NO_MATCH}, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc dez", {NO_MATCH}, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc ghi", {NO_MATCH}, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc hij", {NO_MATCH}, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc klm", {NO_MATCH}, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abcklmn", {NO_MATCH}, 0, false, true}, - { "^.*foobar", "foobaz", {NO_MATCH}, 0, false, true}, - { "^.*foobar", "foobaz\n", {NO_MATCH}, 0, false, true}, - { "^(foo)|(bar)", "fo baz", {NO_MATCH}, 0, false, true}, - { "^((foo)|(bar))", "fo baz", {NO_MATCH}, 0, false, true}, - { "aaaaaaaa$", "AAaaAAaa", {NO_MATCH}, 0, false, true}, - { "^foo\\z", "foo\n", {NO_MATCH}, 0, false, true}, - { "^(foo){2,}", "foo", {NO_MATCH}, 0, false, true}, + { "^foobar", "foolish", {}, 0, false, true}, + { "^foobar$", "ze foobar", {}, 0, false, true}, + { "^foobar$", "foobar ", {}, 0, false, true}, + { "^abc\\b", "abcde", {}, 0, false, true}, + { "^a\\b", "aa", {}, 0, false, true}, + { "^foobar\\b", "foobarz", {}, 0, false, true}, + { "^foobar", "fooq", {}, 0, false, true}, + { "^foobar", "foo", {}, 0, false, true}, + { "^foobar", "fooba", {}, 0, false, true}, + { "^foo *bar", "foolishness bar none ", {}, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc p", {}, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc dez", {}, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc ghi", {}, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc hij", {}, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc klm", {}, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abcklmn", {}, 0, false, true}, + { "^.*foobar", "foobaz", {}, 0, false, true}, + { "^.*foobar", "foobaz\n", {}, 0, false, true}, + { "^(foo)|(bar)", "fo baz", {}, 0, false, true}, + { "^((foo)|(bar))", "fo baz", {}, 0, false, true}, + { "aaaaaaaa$", "AAaaAAaa", {}, 0, false, true}, + { "^foo\\z", "foo\n", {}, 0, false, true}, + { "^(foo){2,}", "foo", {}, 0, false, true}, // these should match - { "^abc\\B", "abcde", { P(0,3), NO_MATCH }, 0, false, true}, - { "^abc\\b", "abc de", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^foobar", "foobar", { P(0, 6), NO_MATCH }, 0, false, true}, - { "^foobar$", "foobar", { P(0, 6), NO_MATCH }, 0, false, true}, - { "^foobar", "foobarq", { P(0, 6), NO_MATCH }, 0, false, true}, - { "^foobar\\B", "foobarz", { P(0, 6), NO_MATCH }, 0, false, true}, - { "^foo.*bar", "foobar none ", { P(0, 6), NO_MATCH }, 0, false, true}, - { "^foo.*bar", "foo bar none ", { P(0, 7), NO_MATCH }, 0, false, true}, - { "^foo.*bar", "foo bar none ", { P(0, 10), NO_MATCH }, 0, false, true}, - { "^foo.*bar", "foolishness bar none ", { P(0, 15), NO_MATCH }, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "klmny", { P(0, 5), NO_MATCH }, 0, false, true}, - { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc dexyklmnxy", { P(0, 17), NO_MATCH }, 0, false, true}, - { "^.*foobar", "abcfoobar", { P(0, 9), NO_MATCH }, 0, false, true}, - { "^((foo)|(bar))", "foobar", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^((foo)|(bar))", "foo bar", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^(foo)|(bar)", "foobaz", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^(foo)|(bar)", "foo baz", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^(f[o0]+o)|(bar)", "fo0o baz", { P(0, 4), NO_MATCH }, 0, false, true}, - { "aaaaaaaa$", "AAaaAAaa", { P(0, 8), NO_MATCH }, HS_FLAG_CASELESS, false, true}, - { "^foo\\z", "foo", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^foo\\Z", "foo", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^foo\\Z", "foo\n", { P(0, 3), NO_MATCH }, 0, false, true}, - { "^(foo){2,}", "foofoofoo", { P(0, 6), P(0, 9), NO_MATCH}, 0, false, true}, + { "^abc\\B", "abcde", { P(0,3) }, 0, false, true}, + { "^abc\\b", "abc de", { P(0, 3) }, 0, false, true}, + { "^foobar", "foobar", { P(0, 6) }, 0, false, true}, + { "^foobar$", "foobar", { P(0, 6) }, 0, false, true}, + { "^foobar", "foobarq", { P(0, 6) }, 0, false, true}, + { "^foobar\\B", "foobarz", { P(0, 6) }, 0, false, true}, + { "^foo.*bar", "foobar none ", { P(0, 6) }, 0, false, true}, + { "^foo.*bar", "foo bar none ", { P(0, 7) }, 0, false, true}, + { "^foo.*bar", "foo bar none ", { P(0, 10) }, 0, false, true}, + { "^foo.*bar", "foolishness bar none ", { P(0, 15) }, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "klmny", { P(0, 5) }, 0, false, true}, + { "^(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc dexyklmnxy", { P(0, 17) }, 0, false, true}, + { "^.*foobar", "abcfoobar", { P(0, 9) }, 0, false, true}, + { "^((foo)|(bar))", "foobar", { P(0, 3) }, 0, false, true}, + { "^((foo)|(bar))", "foo bar", { P(0, 3) }, 0, false, true}, + { "^(foo)|(bar)", "foobaz", { P(0, 3) }, 0, false, true}, + { "^(foo)|(bar)", "foo baz", { P(0, 3) }, 0, false, true}, + { "^(f[o0]+o)|(bar)", "fo0o baz", { P(0, 4) }, 0, false, true}, + { "aaaaaaaa$", "AAaaAAaa", { P(0, 8) }, HS_FLAG_CASELESS, false, true}, + { "^foo\\z", "foo", { P(0, 3) }, 0, false, true}, + { "^foo\\Z", "foo", { P(0, 3) }, 0, false, true}, + { "^foo\\Z", "foo\n", { P(0, 3) }, 0, false, true}, + { "^(foo){2,}", "foofoofoo", { P(0, 6), P(0, 9)}, 0, false, true}, // try multiple matches per pattern - { "^(foo)|(bar)", "foo bar", { P(0, 3), P(4, 7), NO_MATCH }, 0, false, true}, - { "^(foo)|(bar)", "foobar", { P(0, 3), P(3, 6), NO_MATCH }, 0, false, true}, - { "^(foo)+|(bar)", "foo foobar", { P(0, 3), P(7, 10), NO_MATCH }, 0, false, true}, - { "^(foo)+|(bar)", "foofoo bar", { P(0, 3), P(0, 6), P(7, 10), NO_MATCH }, 0, false, true}, - { "^(foo)|(bar)", "foobarbaz", { P(0, 3), P(3, 6), NO_MATCH }, 0, false, true}, - { "^(f[o0]+o)", "foo0obar", { P(0, 3), P(0, 5), NO_MATCH }, 0, false, true}, + { "^(foo)|(bar)", "foo bar", { P(0, 3), P(4, 7) }, 0, false, true}, + { "^(foo)|(bar)", "foobar", { P(0, 3), P(3, 6) }, 0, false, true}, + { "^(foo)+|(bar)", "foo foobar", { P(0, 3), P(7, 10) }, 0, false, true}, + { "^(foo)+|(bar)", "foofoo bar", { P(0, 3), P(0, 6), P(7, 10) }, 0, false, true}, + { "^(foo)|(bar)", "foobarbaz", { P(0, 3), P(3, 6) }, 0, false, true}, + { "^(f[o0]+o)", "foo0obar", { P(0, 3), P(0, 5) }, 0, false, true}, { "^(f[o0]+)", "foo0obar", { P(0, 2), P(0, 3), P(0, 4), P(0, 5) }, 0, false, true}, // unanchored patterns - { "\\b4\\B", "444", { P(0, 1), NO_MATCH }, 0, false, true}, - { "\\b\\w+\\b", "444 555", { P(0, 3), P(4, 7), NO_MATCH }, 0, false, true}, - { "foobar", "veryfoolish", {NO_MATCH}, 0, false, true}, - { "foo.*bar", "extreme foolishness bar none ", { P(8, 23), NO_MATCH }, 0, false, true}, - { "(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc deyghijfy", { P(0, 13), NO_MATCH }, 0, false, true}, - { "(abc\\s+dex?y)?(ghij|klmn).*?x?y", "wegf5tgghij34xy", { P(7, 15), NO_MATCH }, 0, false, true}, - { ".*foobar", "verylongfoobaz", {NO_MATCH}, 0, false, true}, - { ".*foobar", "foobaz\n", {NO_MATCH}, 0, false, true}, - { "(foo)|(bar)", "verylongfo baz", {NO_MATCH}, 0, false, true}, - { "(foo)?bar", "foobar", { P(0, 6), NO_MATCH }, 0, false, true}, - { "foo?bar", "foobar", { P(0, 6), NO_MATCH }, 0, false, true}, - { "(abc)|(bcd)", "abcd", { P(0, 3), P(1, 4), NO_MATCH }, 0, false, true}, - { "(abcd)|(bc)", "abcd", { P(0, 4), P(1, 3), NO_MATCH }, 0, false, true}, - { "(ab|cd)ef", "abcdef cdabef", { P(2, 6), P(9, 13), NO_MATCH }, 0, false, true}, - { "(foo)|(bar)", "verylongfoobbarbaz", { P(8, 11), P(12, 15), NO_MATCH }, 0, false, true}, - { "(a[aaaa]aa?((\\B)|[aa])){1,9}", "aaaaa", { P(0, 3), P(0, 4), P(0, 5), NO_MATCH }, 0, false, true}, - { "bar\\Z", "foobar\n", { P(3, 6), NO_MATCH }, 0, false, true}, + { "\\b4\\B", "444", { P(0, 1) }, 0, false, true}, + { "\\b\\w+\\b", "444 555", { P(0, 3), P(4, 7) }, 0, false, true}, + { "foobar", "veryfoolish", {}, 0, false, true}, + { "foo.*bar", "extreme foolishness bar none ", { P(8, 23) }, 0, false, true}, + { "(abc\\s+dex?y)?(ghij|klmn).*?x?y", "abc deyghijfy", { P(0, 13) }, 0, false, true}, + { "(abc\\s+dex?y)?(ghij|klmn).*?x?y", "wegf5tgghij34xy", { P(7, 15) }, 0, false, true}, + { ".*foobar", "verylongfoobaz", {}, 0, false, true}, + { ".*foobar", "foobaz\n", {}, 0, false, true}, + { "(foo)|(bar)", "verylongfo baz", {}, 0, false, true}, + { "(foo)?bar", "foobar", { P(0, 6) }, 0, false, true}, + { "foo?bar", "foobar", { P(0, 6) }, 0, false, true}, + { "(abc)|(bcd)", "abcd", { P(0, 3), P(1, 4) }, 0, false, true}, + { "(abcd)|(bc)", "abcd", { P(0, 4), P(1, 3) }, 0, false, true}, + { "(ab|cd)ef", "abcdef cdabef", { P(2, 6), P(9, 13) }, 0, false, true}, + { "(foo)|(bar)", "verylongfoobbarbaz", { P(8, 11), P(12, 15) }, 0, false, true}, + { "(a[aaaa]aa?((\\B)|[aa])){1,9}", "aaaaa", { P(0, 3), P(0, 4), P(0, 5) }, 0, false, true}, + { "bar\\Z", "foobar\n", { P(3, 6) }, 0, false, true}, // multi-line patterns - { "^foo$", "foo\nbar", { P(0, 3), NO_MATCH }, HS_FLAG_MULTILINE, false, true}, - { "^bar$", "foo\nbar", { P(4, 7), NO_MATCH }, HS_FLAG_MULTILINE, false, true}, - { "^foo$", "big foo\nbar", {NO_MATCH}, HS_FLAG_MULTILINE, false, true}, - { "^foo$", "foo\nfoo", { P(0, 3), P(4, 7), NO_MATCH }, HS_FLAG_MULTILINE, false, true}, - { "\\bfoo$", "big foo\nbar", { P(4, 7), NO_MATCH }, HS_FLAG_MULTILINE, false, true}, - { "\\Bfoo$", "bigfoo\nbar", { P(3, 6), NO_MATCH }, HS_FLAG_MULTILINE, false, true}, - { "^foo\\z", "big\nfoo", { P(4, 7), NO_MATCH }, HS_FLAG_MULTILINE, false, true}, - { "^foo\\Z", "big\nfoo\n", { P(4, 7), NO_MATCH }, HS_FLAG_MULTILINE, false, true}, + { "^foo$", "foo\nbar", { P(0, 3) }, HS_FLAG_MULTILINE, false, true}, + { "^bar$", "foo\nbar", { P(4, 7) }, HS_FLAG_MULTILINE, false, true}, + { "^foo$", "big foo\nbar", {}, HS_FLAG_MULTILINE, false, true}, + { "^foo$", "foo\nfoo", { P(0, 3), P(4, 7) }, HS_FLAG_MULTILINE, false, true}, + { "\\bfoo$", "big foo\nbar", { P(4, 7) }, HS_FLAG_MULTILINE, false, true}, + { "\\Bfoo$", "bigfoo\nbar", { P(3, 6) }, HS_FLAG_MULTILINE, false, true}, + { "^foo\\z", "big\nfoo", { P(4, 7) }, HS_FLAG_MULTILINE, false, true}, + { "^foo\\Z", "big\nfoo\n", { P(4, 7) }, HS_FLAG_MULTILINE, false, true}, // utf8 patterns - { "ab+", "\x61\x62", { P(0, 2), NO_MATCH }, HS_FLAG_UTF8, false, true}, - { "ab.+d", "\x61\x62\xf0\xa4\xad\xa2\x64", { P(0, 7), NO_MATCH }, HS_FLAG_UTF8, false, true}, + { "ab+", "\x61\x62", { P(0, 2) }, HS_FLAG_UTF8, false, true}, + { "ab.+d", "\x61\x62\xf0\xa4\xad\xa2\x64", { P(0, 7) }, HS_FLAG_UTF8, false, true}, // noteod patterns - { "^foobar$", "foobar", { NO_MATCH }, 0, true, true}, - { "aaaaaaaa$", "AAaaAAaa", { NO_MATCH }, HS_FLAG_CASELESS, true, true}, - { "^foo\\z", "foo", { NO_MATCH }, 0, true, true}, - { "^foo\\Z", "foo", { NO_MATCH }, 0, true, true}, - { "^foo\\Z", "foo\n", { NO_MATCH }, 0, true, true}, + { "^foobar$", "foobar", {}, 0, true, true}, + { "aaaaaaaa$", "AAaaAAaa", {}, HS_FLAG_CASELESS, true, true}, + { "^foo\\z", "foo", {}, 0, true, true}, + { "^foo\\Z", "foo", {}, 0, true, true}, + { "^foo\\Z", "foo\n", {}, 0, true, true}, // vacuous patterns (with multiline, utf8 and caseless flags) // vacuous patterns have SOM turned off, so all SOM are zero { "b*", "abc", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, 0, false, false}, - { "b*", "", { P(0, 0), NO_MATCH }, 0, false, false}, - { "(aa|b*)", "a", { P(0, 0), P(0, 1), NO_MATCH }, 0, false, false}, + { "b*", "", { P(0, 0) }, 0, false, false}, + { "(aa|b*)", "a", { P(0, 0), P(0, 1) }, 0, false, false}, { "b*", "bBb", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, HS_FLAG_CASELESS, false, false}, { "b*", "abc", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, HS_FLAG_MULTILINE, false, false}, - { "b*", "", { P(0, 0), NO_MATCH }, HS_FLAG_MULTILINE, false, false}, - { "(aa|b*)", "a", { P(0, 0), P(0, 1), NO_MATCH }, HS_FLAG_MULTILINE, false, false}, + { "b*", "", { P(0, 0) }, HS_FLAG_MULTILINE, false, false}, + { "(aa|b*)", "a", { P(0, 0), P(0, 1) }, HS_FLAG_MULTILINE, false, false}, { "b*", "bBb", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, HS_FLAG_MULTILINE | HS_FLAG_CASELESS, false, false}, { "b*", "abc", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, HS_FLAG_UTF8, false, false}, - { "b*", "", { P(0, 0), NO_MATCH }, HS_FLAG_UTF8, false, false}, - { "(aa|b*)", "a", { P(0, 0), P(0, 1), NO_MATCH }, HS_FLAG_UTF8, false, false}, + { "b*", "", { P(0, 0) }, HS_FLAG_UTF8, false, false}, + { "(aa|b*)", "a", { P(0, 0), P(0, 1) }, HS_FLAG_UTF8, false, false}, { "b*", "bBb", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, HS_FLAG_UTF8 | HS_FLAG_CASELESS, false, false}, { "b*", "bBb", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, HS_FLAG_UTF8 | HS_FLAG_CASELESS, false, false}, { ".*", "\x61\xf0\xa4\xad\xa2\x64", { P(0, 0), P(0, 1), P(0, 5), P(0, 6) }, HS_FLAG_UTF8, false, false}, { ".*", "\xf0\xa4\xad\xa2\xf0\xa4\xad\xa3\x64", { P(0, 0), P(0, 4), P(0, 8), P(0, 9) }, HS_FLAG_UTF8, false, false}, // special patterns for detecting various bugs - { "(\\B.|a)*a", "\xf0\xa4\xad\xa2\x61", { P(0, 5), NO_MATCH }, HS_FLAG_UTF8, false, true}, + { "(\\B.|a)*a", "\xf0\xa4\xad\xa2\x61", { P(0, 5) }, HS_FLAG_UTF8, false, true}, { ".*", "\xf0\xa4\xad", { P(0, 0), P(0, 1), P(0, 2), P(0, 3) }, 0, false, true}, - { "\\Bfoo", "foo", {NO_MATCH}, 0, false, true}, - { "fo\\B", "fo_", { P(0, 2), NO_MATCH}, 0, false, true}, + { "\\Bfoo", "foo", {}, 0, false, true}, + { "fo\\B", "fo_", { P(0, 2)}, 0, false, true}, { "^.*", "\xee\x80\x80\n\xee\x80\x80", { P(0, 0), P(0, 3), P(0, 4), P(0, 7)}, HS_FLAG_UTF8 | HS_FLAG_MULTILINE, false, false}, // ignore highlander patterns as they can't be easily checked }; -// by default, all matches initialize to zeroes. this makes it impossible to -// test vacuous patterns, among other things, unless we specify every single -// match, which is tedious. instead, we set "no match" to NO_MATCH. if the matches -// start with NO_MATCH, everything else is set to NO_MATCH. if matches start -// with something else, look for the next NO_MATCH and replace everything after -// it with NO_MATCH's. -static -void fixMatches(const pair in_matches[], - pair out_matches[], unsigned size) { - bool end_matches = false; - for (unsigned i = 0; i < size; i++) { - if (in_matches[i] == NO_MATCH) { - end_matches = true; - } - if (end_matches) { - out_matches[i] = NO_MATCH; - } - else { - out_matches[i] = in_matches[i]; - } - } -} - TEST_P(MatchesTest, Check) { const MatchesTestParams &t = GetParam(); CompileContext cc(false, false, get_current_target(), Grey()); ReportManager rm(cc.grey); ParsedExpression parsed(0, t.pattern.c_str(), t.flags, 0); - unique_ptr g = buildWrapper(rm, cc, parsed); - - set > results, tmp, matches; + auto g = buildWrapper(rm, cc, parsed); bool utf8 = (t.flags & HS_FLAG_UTF8) > 0; + set> matches; findMatches(*g, rm, t.input, matches, t.notEod, t.som, utf8); - // fix matches and make a set out of them - pair tmp_matches[NUM_MATCHES]; - fixMatches(t.matches, tmp_matches, NUM_MATCHES); - tmp = set >(tmp_matches, tmp_matches + NUM_MATCHES); - tmp.erase(NO_MATCH); + set> expected(begin(t.matches), end(t.matches)); - // create a superset of pattern results and matches to pick up unexpected - // matches as well as missed matches. - results.insert(tmp.begin(), tmp.end()); - results.insert(matches.begin(), matches.end()); - - // check if we have the same number of matches as in expected results - ASSERT_EQ(results.size(), tmp.size())<< "Pattern '" << t.pattern - << "' against input '" << t.input << "': wrong results count"; - - // we already know that size of two sets is the same, so now check matches. - for (set >::const_iterator it = results.begin(); - it != results.end(); ++it) { - ASSERT_EQ(1, matches.count(*it))<< "Pattern '" << t.pattern - << "' against input '" << t.input << "'"; - } + ASSERT_EQ(expected, matches) << "Pattern '" << t.pattern + << "' against input '" << t.input << "'"; } INSTANTIATE_TEST_CASE_P(ng_find_matches, MatchesTest, ValuesIn(matchesTests)); From 4a98c664b4caa32f747d6b9311b393a295eb5c4d Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Wed, 14 Oct 2015 12:38:13 +1100 Subject: [PATCH 03/17] HyperscanScanGigabytesMatch: use a vector --- unit/hyperscan/behaviour.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/unit/hyperscan/behaviour.cpp b/unit/hyperscan/behaviour.cpp index 2ec8a9c9..98371c86 100644 --- a/unit/hyperscan/behaviour.cpp +++ b/unit/hyperscan/behaviour.cpp @@ -86,8 +86,7 @@ TEST(HyperscanTestBehaviour, ScanSeveralGigabytesNoMatch) { hs_error_t err; const size_t datalen = 1024 * 1024; size_t megabytes = 5 * 1024; - char * data = new char[datalen]; - memset(data, 'X', datalen); + vector data(datalen, 'X'); // build a database hs_database_t *db = nullptr; @@ -110,8 +109,8 @@ TEST(HyperscanTestBehaviour, ScanSeveralGigabytesNoMatch) { ASSERT_TRUE(stream != nullptr); while (megabytes-- > 0) { - err = hs_scan_stream(stream, data, datalen, 0, scratch, dummyHandler, - nullptr); + err = hs_scan_stream(stream, data.data(), data.size(), 0, scratch, + dummyHandler, nullptr); ASSERT_EQ(HS_SUCCESS, err); } @@ -121,7 +120,6 @@ TEST(HyperscanTestBehaviour, ScanSeveralGigabytesNoMatch) { // teardown hs_free_scratch(scratch); hs_free_database(db); - delete [] data; } struct HugeScanMatchingData { @@ -141,8 +139,7 @@ TEST_P(HyperscanScanGigabytesMatch, StreamingMatch) { hs_error_t err; const size_t datalen = 1024*1024; - char * data = new char[datalen]; - memset(data, 'X', datalen); + vector data(datalen, 'X'); // build a database hs_database_t *db = nullptr; @@ -178,7 +175,7 @@ TEST_P(HyperscanScanGigabytesMatch, StreamingMatch) { // streaming mode scan of our megabyte of data gb*1024 times unsigned long remaining = gb * 1024; while (remaining-- > 0) { - err = hs_scan_stream(stream, data, datalen, 0, scratch, + err = hs_scan_stream(stream, data.data(), data.size(), 0, scratch, singleHandler, nullptr); ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(0ULL, lastMatchTo); @@ -202,7 +199,6 @@ TEST_P(HyperscanScanGigabytesMatch, StreamingMatch) { // teardown hs_free_scratch(scratch); hs_free_database(db); - delete[] data; } // Helper function to actually perform scans for BlockMatch test below From 441329f15a469aecc60c0207304b490e40f503f7 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 15 Oct 2015 16:08:51 +1100 Subject: [PATCH 04/17] FDRp tests: less raw malloc/free --- unit/internal/fdr.cpp | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/unit/internal/fdr.cpp b/unit/internal/fdr.cpp index 90268812..3aada867 100644 --- a/unit/internal/fdr.cpp +++ b/unit/internal/fdr.cpp @@ -207,7 +207,6 @@ TEST_P(FDRp, SimpleSingle) { TEST_P(FDRp, MultiLocation) { const u32 hint = GetParam(); SCOPED_TRACE(hint); - u8 * data; vector lits; lits.push_back(hwlmLiteral("abc", 0, 1)); @@ -216,24 +215,23 @@ TEST_P(FDRp, MultiLocation) { CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); const u32 testSize = 128; - data = (u8 *)malloc(testSize); - memset(data, 0, testSize); + + vector data(testSize, 0); + for (u32 i = 0; i < testSize - 3; i++) { - memcpy(data + i, "abc", 3); + memcpy(data.data() + i, "abc", 3); vector matches; - fdrExec(fdr.get(), (const u8 *)data, testSize, 0, decentCallback, - &matches, HWLM_ALL_GROUPS); + fdrExec(fdr.get(), data.data(), testSize, 0, decentCallback, &matches, + HWLM_ALL_GROUPS); ASSERT_EQ(1U, matches.size()); EXPECT_EQ(match(i, i+2, 1), matches[0]); - memset(data + i, 0, 3); + memset(data.data() + i, 0, 3); } - free(data); } TEST_P(FDRp, Flood) { const u32 hint = GetParam(); SCOPED_TRACE(hint); - u8 * data; vector lits; lits.push_back(hwlmLiteral("aaaa", 0, 1)); @@ -245,11 +243,10 @@ TEST_P(FDRp, Flood) { CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); const u32 testSize = 1024; - data = (u8 *)malloc(testSize); - memset(data, 'a', testSize); + vector data(testSize, 'a'); vector matches; - fdrExec(fdr.get(), (const u8 *)data, testSize, 0, decentCallback, &matches, + fdrExec(fdr.get(), data.data(), testSize, 0, decentCallback, &matches, HWLM_ALL_GROUPS); ASSERT_EQ(testSize - 3 + testSize - 7, matches.size()); EXPECT_EQ(match(0, 3, 1), matches[0]); @@ -266,8 +263,6 @@ TEST_P(FDRp, Flood) { match(i - 3, i, 1) == matches[currentMatch]) ); } - - free(data); } TEST_P(FDRp, NoRepeat1) { @@ -483,10 +478,10 @@ TEST_P(FDRp, moveByteStream) { size_t size = fdrSize(fdrTable0.get()); - FDR *fdrTable = (FDR *)aligned_zmalloc(size); - EXPECT_TRUE(fdrTable); + auto fdrTable = aligned_zmalloc_unique(size); + EXPECT_NE(nullptr, fdrTable); - memcpy(fdrTable, fdrTable0.get(), size); + memcpy(fdrTable.get(), fdrTable0.get(), size); // bugger up original for (size_t i = 0 ; i < size; i++) { @@ -496,14 +491,13 @@ TEST_P(FDRp, moveByteStream) { // check matches vector matches; - hwlm_error_t fdrStatus = fdrExec(fdrTable, (const u8 *)data, data_len, 0, - decentCallback, &matches, HWLM_ALL_GROUPS); + hwlm_error_t fdrStatus = fdrExec(fdrTable.get(), (const u8 *)data, + data_len, 0, decentCallback, &matches, + HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); ASSERT_EQ(1U, matches.size()); EXPECT_EQ(match(12, 17, 0), matches[0]); - - aligned_free(fdrTable); } TEST_P(FDRp, Stream1) { From a2f2e11e608a29616f154da51b74adf2fc4d1a3a Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 20 Oct 2015 14:37:15 +1100 Subject: [PATCH 05/17] sidecar: use aligned_zmalloc_unique --- unit/internal/sidecar.cpp | 85 ++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/unit/internal/sidecar.cpp b/unit/internal/sidecar.cpp index b087e46b..bd517926 100644 --- a/unit/internal/sidecar.cpp +++ b/unit/internal/sidecar.cpp @@ -71,37 +71,35 @@ TEST(Sidecar, ns1) { ASSERT_TRUE(ns != nullptr); ASSERT_LT(0U, sidecarSize(ns.get())); - struct sidecar_enabled *enabled - = (struct sidecar_enabled *)aligned_zmalloc(sidecarEnabledSize(ns.get())); - ASSERT_TRUE(enabled); - sidecarEnabledInit(ns.get(), enabled); - struct sidecar_scratch *scratch - = (struct sidecar_scratch *)aligned_zmalloc(sidecarScratchSize(ns.get())); + auto enabled = + aligned_zmalloc_unique(sidecarEnabledSize(ns.get())); + sidecarEnabledInit(ns.get(), enabled.get()); + auto scratch = + aligned_zmalloc_unique(sidecarScratchSize(ns.get())); for (u32 i = 0; i < 256; i++) { SCOPED_TRACE(i); u32 seen = 0; memset(data, i, data_len); - sidecarExec(ns.get(), data, data_len, enabled, scratch, 0, ns_cb, &seen); + sidecarExec(ns.get(), data, data_len, enabled.get(), scratch.get(), 0, + ns_cb, &seen); ASSERT_EQ(0U, seen); } - sidecarEnabledAdd(ns.get(), enabled, 0); + sidecarEnabledAdd(ns.get(), enabled.get(), 0); for (u32 i = 0; i < 256; i++) { SCOPED_TRACE(i); u32 seen = 0; memset(data, i, data_len); - sidecarExec(ns.get(), data, data_len, enabled, scratch, 0, ns_cb, &seen); + sidecarExec(ns.get(), data, data_len, enabled.get(), scratch.get(), 0, + ns_cb, &seen); if (i == 'f') { ASSERT_EQ(1U, seen); } else { ASSERT_EQ(0U, seen); } } - - aligned_free(enabled); - aligned_free(scratch); } const char* sidecarStrings[] = { @@ -186,14 +184,13 @@ TEST_P(SidecarTest, Individual) { ASSERT_TRUE(ns != nullptr); ASSERT_LT(0U, sidecarSize(ns.get())); - struct sidecar_enabled *enabled - = (struct sidecar_enabled *)aligned_zmalloc(sidecarEnabledSize(ns.get())); - ASSERT_TRUE(enabled); - sidecarEnabledInit(ns.get(), enabled); - struct sidecar_enabled *local_enabled - = (struct sidecar_enabled *)aligned_zmalloc(sidecarEnabledSize(ns.get())); - struct sidecar_scratch *scratch - = (struct sidecar_scratch *)aligned_zmalloc(sidecarScratchSize(ns.get())); + auto enabled = + aligned_zmalloc_unique(sidecarEnabledSize(ns.get())); + sidecarEnabledInit(ns.get(), enabled.get()); + auto local_enabled = + aligned_zmalloc_unique(sidecarEnabledSize(ns.get())); + auto scratch = + aligned_zmalloc_unique(sidecarScratchSize(ns.get())); const size_t data_len = 1024; u8 data[data_len]; @@ -203,7 +200,8 @@ TEST_P(SidecarTest, Individual) { SCOPED_TRACE(i); memset(data, i, data_len); set seen; - sidecarExec(ns.get(), data, data_len, enabled, scratch, 0, set_cb, &seen); + sidecarExec(ns.get(), data, data_len, enabled.get(), scratch.get(), 0, + set_cb, &seen); ASSERT_TRUE(seen.empty()); } @@ -213,17 +211,18 @@ TEST_P(SidecarTest, Individual) { SCOPED_TRACE(c); // build a "compile time" enabled structure and add class j to it. - sidecarEnabledInit(ns.get(), local_enabled); - sidecarEnabledAdd(ns.get(), local_enabled, j); + sidecarEnabledInit(ns.get(), local_enabled.get()); + sidecarEnabledAdd(ns.get(), local_enabled.get(), j); // union class j into our runtime enabled structure. - sidecarEnabledUnion(ns.get(), enabled, local_enabled); + sidecarEnabledUnion(ns.get(), enabled.get(), local_enabled.get()); for (u32 i = 0; i < 256; i++) { SCOPED_TRACE(i); memset(data, i, data_len); set seen; - sidecarExec(ns.get(), data, data_len, enabled, scratch, 0, set_cb, &seen); + sidecarExec(ns.get(), data, data_len, enabled.get(), scratch.get(), + 0, set_cb, &seen); if (i == c) { ASSERT_EQ(1U, seen.size()); ASSERT_EQ(j, *seen.begin()); @@ -232,10 +231,6 @@ TEST_P(SidecarTest, Individual) { } } } - - aligned_free(local_enabled); - aligned_free(enabled); - aligned_free(scratch); } TEST_P(SidecarTest, Together) { @@ -253,13 +248,13 @@ TEST_P(SidecarTest, Together) { ASSERT_TRUE(ns != nullptr); ASSERT_LT(0U, sidecarSize(ns.get())); - struct sidecar_enabled *enabled - = (struct sidecar_enabled *)aligned_zmalloc(sidecarEnabledSize(ns.get())); - ASSERT_TRUE(enabled); - struct sidecar_enabled *local_enabled - = (struct sidecar_enabled *)aligned_zmalloc(sidecarEnabledSize(ns.get())); - struct sidecar_scratch *scratch - = (struct sidecar_scratch *)aligned_zmalloc(sidecarScratchSize(ns.get())); + auto enabled = + aligned_zmalloc_unique(sidecarEnabledSize(ns.get())); + sidecarEnabledInit(ns.get(), enabled.get()); + auto local_enabled = + aligned_zmalloc_unique(sidecarEnabledSize(ns.get())); + auto scratch = + aligned_zmalloc_unique(sidecarScratchSize(ns.get())); const size_t data_len = 1024; u8 data[data_len]; @@ -269,21 +264,22 @@ TEST_P(SidecarTest, Together) { SCOPED_TRACE(i); memset(data, i, data_len); set seen; - sidecarExec(ns.get(), data, data_len, enabled, scratch, 0, set_cb, &seen); + sidecarExec(ns.get(), data, data_len, enabled.get(), scratch.get(), 0, + set_cb, &seen); ASSERT_TRUE(seen.empty()); } // test that every char class fires for (u32 j = 0; j < charclasses.size(); j++) { // enable the whole lot - sidecarEnabledInit(ns.get(), enabled); + sidecarEnabledInit(ns.get(), enabled.get()); for (u32 i = 0; i < charclasses.size(); i++) { // build a "compile time" enabled structure and add class j to it. - sidecarEnabledInit(ns.get(), local_enabled); - sidecarEnabledAdd(ns.get(), local_enabled, i); + sidecarEnabledInit(ns.get(), local_enabled.get()); + sidecarEnabledAdd(ns.get(), local_enabled.get(), i); // union class j into our runtime enabled structure. - sidecarEnabledUnion(ns.get(), enabled, local_enabled); + sidecarEnabledUnion(ns.get(), enabled.get(), local_enabled.get()); } u32 c = chars[j]; @@ -293,7 +289,8 @@ TEST_P(SidecarTest, Together) { SCOPED_TRACE(i); memset(data, i, data_len); set seen; - sidecarExec(ns.get(), data, data_len, enabled, scratch, 0, set_cb, &seen); + sidecarExec(ns.get(), data, data_len, enabled.get(), scratch.get(), + 0, set_cb, &seen); if (i == c) { // seen should contain only `c' ASSERT_EQ(1U, seen.size()); @@ -306,10 +303,6 @@ TEST_P(SidecarTest, Together) { } } } - - aligned_free(local_enabled); - aligned_free(enabled); - aligned_free(scratch); } INSTANTIATE_TEST_CASE_P(Sidecar, SidecarTest, From b59491d0dbab1a8ed47898bb397d4a539382c0e7 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 23 Oct 2015 11:39:31 +1100 Subject: [PATCH 06/17] Custom NFA_API_NO_IMPL variant for zombie_status Silences ICC warning #188: enumerated type mixed with another type. --- src/nfa/castle.h | 2 +- src/nfa/gough.h | 4 ++-- src/nfa/lbr.h | 10 +++++----- src/nfa/mcclellan.h | 4 ++-- src/nfa/mpv.h | 2 +- src/nfa/nfa_internal.h | 22 +++++++++++++++++++--- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/nfa/castle.h b/src/nfa/castle.h index 6a7fe70c..8fc3514b 100644 --- a/src/nfa/castle.h +++ b/src/nfa/castle.h @@ -54,7 +54,7 @@ char nfaExecCastle0_expandState(const struct NFA *nfa, void *dest, #define nfaExecCastle0_testEOD NFA_API_NO_IMPL #define nfaExecCastle0_B_Reverse NFA_API_NO_IMPL -#define nfaExecCastle0_zombie_status NFA_API_NO_IMPL +#define nfaExecCastle0_zombie_status NFA_API_ZOMBIE_NO_IMPL #ifdef __cplusplus } diff --git a/src/nfa/gough.h b/src/nfa/gough.h index 7729c653..41d4cb5a 100644 --- a/src/nfa/gough.h +++ b/src/nfa/gough.h @@ -55,7 +55,7 @@ char nfaExecGough8_expandState(const struct NFA *nfa, void *dest, const void *src, u64a offset, u8 key); #define nfaExecGough8_B_Reverse NFA_API_NO_IMPL -#define nfaExecGough8_zombie_status NFA_API_NO_IMPL +#define nfaExecGough8_zombie_status NFA_API_ZOMBIE_NO_IMPL // 16-bit Gough @@ -77,6 +77,6 @@ char nfaExecGough16_expandState(const struct NFA *nfa, void *dest, const void *src, u64a offset, u8 key); #define nfaExecGough16_B_Reverse NFA_API_NO_IMPL -#define nfaExecGough16_zombie_status NFA_API_NO_IMPL +#define nfaExecGough16_zombie_status NFA_API_ZOMBIE_NO_IMPL #endif diff --git a/src/nfa/lbr.h b/src/nfa/lbr.h index af7d983d..b770477d 100644 --- a/src/nfa/lbr.h +++ b/src/nfa/lbr.h @@ -56,7 +56,7 @@ char nfaExecLbrDot_expandState(const struct NFA *nfa, void *dest, #define nfaExecLbrDot_testEOD NFA_API_NO_IMPL #define nfaExecLbrDot_B_Reverse NFA_API_NO_IMPL -#define nfaExecLbrDot_zombie_status NFA_API_NO_IMPL +#define nfaExecLbrDot_zombie_status NFA_API_ZOMBIE_NO_IMPL // LBR Verm @@ -76,7 +76,7 @@ char nfaExecLbrVerm_expandState(const struct NFA *nfa, void *dest, #define nfaExecLbrVerm_testEOD NFA_API_NO_IMPL #define nfaExecLbrVerm_B_Reverse NFA_API_NO_IMPL -#define nfaExecLbrVerm_zombie_status NFA_API_NO_IMPL +#define nfaExecLbrVerm_zombie_status NFA_API_ZOMBIE_NO_IMPL // LBR Negated Verm @@ -96,7 +96,7 @@ char nfaExecLbrNVerm_expandState(const struct NFA *nfa, void *dest, #define nfaExecLbrNVerm_testEOD NFA_API_NO_IMPL #define nfaExecLbrNVerm_B_Reverse NFA_API_NO_IMPL -#define nfaExecLbrNVerm_zombie_status NFA_API_NO_IMPL +#define nfaExecLbrNVerm_zombie_status NFA_API_ZOMBIE_NO_IMPL // LBR Shuf @@ -116,7 +116,7 @@ char nfaExecLbrShuf_expandState(const struct NFA *nfa, void *dest, #define nfaExecLbrShuf_testEOD NFA_API_NO_IMPL #define nfaExecLbrShuf_B_Reverse NFA_API_NO_IMPL -#define nfaExecLbrShuf_zombie_status NFA_API_NO_IMPL +#define nfaExecLbrShuf_zombie_status NFA_API_ZOMBIE_NO_IMPL // LBR Truffle @@ -136,7 +136,7 @@ char nfaExecLbrTruf_expandState(const struct NFA *nfa, void *dest, #define nfaExecLbrTruf_testEOD NFA_API_NO_IMPL #define nfaExecLbrTruf_B_Reverse NFA_API_NO_IMPL -#define nfaExecLbrTruf_zombie_status NFA_API_NO_IMPL +#define nfaExecLbrTruf_zombie_status NFA_API_ZOMBIE_NO_IMPL #ifdef __cplusplus } diff --git a/src/nfa/mcclellan.h b/src/nfa/mcclellan.h index 856740b1..6b4ec2d5 100644 --- a/src/nfa/mcclellan.h +++ b/src/nfa/mcclellan.h @@ -56,7 +56,7 @@ char nfaExecMcClellan8_expandState(const struct NFA *nfa, void *dest, const void *src, u64a offset, u8 key); #define nfaExecMcClellan8_B_Reverse NFA_API_NO_IMPL -#define nfaExecMcClellan8_zombie_status NFA_API_NO_IMPL +#define nfaExecMcClellan8_zombie_status NFA_API_ZOMBIE_NO_IMPL // 16-bit McClellan @@ -79,7 +79,7 @@ char nfaExecMcClellan16_expandState(const struct NFA *nfa, void *dest, const void *src, u64a offset, u8 key); #define nfaExecMcClellan16_B_Reverse NFA_API_NO_IMPL -#define nfaExecMcClellan16_zombie_status NFA_API_NO_IMPL +#define nfaExecMcClellan16_zombie_status NFA_API_ZOMBIE_NO_IMPL /** * Simple streaming mode calls: diff --git a/src/nfa/mpv.h b/src/nfa/mpv.h index 18c52e99..dc5dad6f 100644 --- a/src/nfa/mpv.h +++ b/src/nfa/mpv.h @@ -50,7 +50,7 @@ char nfaExecMpv0_expandState(const struct NFA *nfa, void *dest, const void *src, #define nfaExecMpv0_QR NFA_API_NO_IMPL #define nfaExecMpv0_Q2 NFA_API_NO_IMPL /* for non-chained suffixes. */ #define nfaExecMpv0_B_Reverse NFA_API_NO_IMPL -#define nfaExecMpv0_zombie_status NFA_API_NO_IMPL +#define nfaExecMpv0_zombie_status NFA_API_ZOMBIE_NO_IMPL /** * return 0 if the mpv dies, otherwise returns the location of the next possible diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index 14678c2f..e13482b5 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -237,16 +237,32 @@ static really_inline int isMultiTopType(u8 t) { return !isDfaType(t) && !isLbrType(t); } -/** Macro used in place of unimplemented NFA API functions for a given + +/** Macros used in place of unimplemented NFA API functions for a given * engine. */ #if !defined(_WIN32) -#define NFA_API_NO_IMPL(...) \ + +/* Use for functions that return an integer. */ +#define NFA_API_NO_IMPL(...) \ ({ \ assert("not implemented for this engine!"); \ 0; /* return value, for places that need it */ \ }) + +/* Use for _zombie_status functions. */ +#define NFA_API_ZOMBIE_NO_IMPL(...) \ + ({ \ + assert("not implemented for this engine!"); \ + NFA_ZOMBIE_NO; \ + }) + #else -#define NFA_API_NO_IMPL(...) 0 + +/* Simpler implementation for compilers that don't like the GCC extension used + * above. */ +#define NFA_API_NO_IMPL(...) 0 +#define NFA_API_ZOMBIE_NO_IMPL(...) NFA_ZOMBIE_NO + #endif #ifdef __cplusplus From 9ff1303cd8adf1010fce16e36a803f53ec3a7ff2 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 23 Oct 2015 15:32:55 +1100 Subject: [PATCH 07/17] Allow no scratch for stream reset API calls Bring hs_reset_stream(), hs_reset_and_copy_stream()'s functionality into line with hs_close_stream() by accepting a NULL scratch if and only if the match callback is also NULL, indicating that no matches should be delivered. --- src/hs_runtime.h | 6 +++-- src/runtime.c | 13 ++++++----- unit/hyperscan/arg_checks.cpp | 41 ++++++++++++++++++++++++++++++++--- unit/hyperscan/stream_op.cpp | 18 +++++++++------ 4 files changed, 60 insertions(+), 18 deletions(-) diff --git a/src/hs_runtime.h b/src/hs_runtime.h index 27de00a3..db52f4f5 100644 --- a/src/hs_runtime.h +++ b/src/hs_runtime.h @@ -250,7 +250,8 @@ hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, * for future use and is unused at present. * * @param scratch - * A per-thread scratch space allocated by @ref hs_alloc_scratch(). + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @a onEvent callback is also NULL. * * @param onEvent * Pointer to a match event callback function. If a NULL pointer is given, @@ -299,7 +300,8 @@ hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id); * The stream (as created by @ref hs_open_stream()) to be copied. * * @param scratch - * A per-thread scratch space allocated by @ref hs_alloc_scratch(). + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @a onEvent callback is also NULL. * * @param onEvent * Pointer to a match event callback function. If a NULL pointer is given, diff --git a/src/runtime.c b/src/runtime.c index 66323789..335a83bc 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -1163,11 +1163,10 @@ hs_error_t hs_reset_and_copy_stream(hs_stream_t *to_id, return HS_INVALID; } - if (!scratch || !validScratch(to_id->rose, scratch)) { - return HS_INVALID; - } - if (onEvent) { + if (!scratch || !validScratch(to_id->rose, scratch)) { + return HS_INVALID; + } report_eod_matches(to_id, scratch, onEvent, context); } @@ -1406,12 +1405,14 @@ HS_PUBLIC_API hs_error_t hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags, hs_scratch_t *scratch, match_event_handler onEvent, void *context) { - if (!id || !scratch || !validScratch(id->rose, scratch)) { + if (!id) { return HS_INVALID; } - /* user wants eod matches */ if (onEvent) { + if (!scratch || !validScratch(id->rose, scratch)) { + return HS_INVALID; + } report_eod_matches(id, scratch, onEvent, context); } diff --git a/unit/hyperscan/arg_checks.cpp b/unit/hyperscan/arg_checks.cpp index f717f0dc..dbc692c5 100644 --- a/unit/hyperscan/arg_checks.cpp +++ b/unit/hyperscan/arg_checks.cpp @@ -740,7 +740,9 @@ TEST(HyperscanArgChecks, ResetAndCopyStreamSameToId) { hs_free_database(db); } -TEST(HyperscanArgChecks, ResetAndCopyStreamNoScratch) { +// hs_reset_and_copy_stream: You're allowed to reset and copy a stream with no +// scratch and no callback. +TEST(HyperscanArgChecks, ResetAndCopyStreamNoCallbackOrScratch) { hs_stream_t *stream = nullptr; hs_stream_t *stream_to = nullptr; hs_database_t *db = nullptr; @@ -760,6 +762,37 @@ TEST(HyperscanArgChecks, ResetAndCopyStreamNoScratch) { ASSERT_EQ(HS_SUCCESS, err); err = hs_reset_and_copy_stream(stream_to, stream, nullptr, nullptr, nullptr); + ASSERT_EQ(HS_SUCCESS, err); + + hs_close_stream(stream_to, scratch, nullptr, nullptr); + hs_close_stream(stream, scratch, nullptr, nullptr); + hs_free_scratch(scratch); + hs_free_database(db); +} + +// hs_reset_and_copy_stream: If you specify a callback, you must provide +// scratch. +TEST(HyperscanArgChecks, ResetAndCopyStreamNoScratch) { + hs_stream_t *stream = nullptr; + hs_stream_t *stream_to = nullptr; + hs_database_t *db = nullptr; + hs_compile_error_t *compile_err = nullptr; + hs_error_t err = hs_compile("foobar", 0, HS_MODE_STREAM, nullptr, &db, + &compile_err); + ASSERT_EQ(HS_SUCCESS, err); + + hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_open_stream(db, 0, &stream); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_open_stream(db, 0, &stream_to); + ASSERT_EQ(HS_SUCCESS, err); + + err = hs_reset_and_copy_stream(stream_to, stream, nullptr, dummy_cb, + nullptr); ASSERT_EQ(HS_INVALID, err); hs_close_stream(stream_to, scratch, nullptr, nullptr); @@ -793,7 +826,8 @@ TEST(HyperscanArgChecks, ResetAndCopyStreamDiffDb) { err = hs_open_stream(db2, 0, &stream_to); ASSERT_EQ(HS_SUCCESS, err); - err = hs_reset_and_copy_stream(stream_to, stream, scratch, nullptr, nullptr); + err = hs_reset_and_copy_stream(stream_to, stream, scratch, nullptr, + nullptr); ASSERT_EQ(HS_INVALID, err); hs_close_stream(stream_to, scratch, nullptr, nullptr); @@ -2009,6 +2043,7 @@ TEST(HyperscanArgChecks, ScanStreamBadScratch) { free(local_garbage); } +// hs_reset_stream: bad scratch arg TEST(HyperscanArgChecks, ResetStreamBadScratch) { hs_database_t *db = nullptr; hs_compile_error_t *compile_err = nullptr; @@ -2025,7 +2060,7 @@ TEST(HyperscanArgChecks, ResetStreamBadScratch) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(stream != nullptr); - err = hs_reset_stream(stream, 0, scratch, nullptr, nullptr); + err = hs_reset_stream(stream, 0, scratch, dummy_cb, nullptr); EXPECT_NE(HS_SUCCESS, err); EXPECT_NE(HS_SCAN_TERMINATED, err); diff --git a/unit/hyperscan/stream_op.cpp b/unit/hyperscan/stream_op.cpp index fbec6b98..a78e08a1 100644 --- a/unit/hyperscan/stream_op.cpp +++ b/unit/hyperscan/stream_op.cpp @@ -69,7 +69,9 @@ TEST(StreamUtil, reset1) { c.matches.clear(); - err = hs_reset_stream(stream, 0, scratch, nullptr, nullptr); + // Note: we do not need matches from this reset operation, so we do not + // need to supply a callback or scratch space. + err = hs_reset_stream(stream, 0, nullptr, nullptr, nullptr); ASSERT_EQ(HS_SUCCESS, err); err = hs_scan_stream(stream, data1, sizeof(data1), 0, scratch, record_cb2, @@ -107,7 +109,9 @@ TEST(StreamUtil, reset2) { c.matches.clear(); - err = hs_reset_stream(stream, 0, scratch, nullptr, nullptr); + // Note: we do not need matches from this reset operation, so we do not + // need to supply a callback or scratch space. + err = hs_reset_stream(stream, 0, nullptr, nullptr, nullptr); ASSERT_EQ(HS_SUCCESS, err); err = hs_scan_stream(stream, data1, sizeof(data1), 0, scratch, record_cb, @@ -268,7 +272,7 @@ TEST(StreamUtil, copy_reset1) { c.matches.clear(); - err = hs_reset_and_copy_stream(stream, stream2, scratch, nullptr, nullptr); + err = hs_reset_and_copy_stream(stream, stream2, nullptr, nullptr, nullptr); ASSERT_EQ(HS_SUCCESS, err); err = hs_scan_stream(stream, data1, sizeof(data1), 0, scratch, record_cb2, @@ -312,7 +316,7 @@ TEST(StreamUtil, copy_reset2) { c.matches.clear(); - err = hs_reset_and_copy_stream(stream, stream2, scratch, nullptr, nullptr); + err = hs_reset_and_copy_stream(stream, stream2, nullptr, nullptr, nullptr); ASSERT_EQ(HS_SUCCESS, err); err = hs_scan_stream(stream, data1, sizeof(data1), 0, scratch, record_cb, @@ -355,7 +359,7 @@ TEST(StreamUtil, copy_reset3) { c.matches.clear(); - err = hs_reset_and_copy_stream(stream2, stream, scratch, nullptr, nullptr); + err = hs_reset_and_copy_stream(stream2, stream, nullptr, nullptr, nullptr); ASSERT_EQ(HS_SUCCESS, err); err = hs_scan_stream(stream, data1, sizeof(data1), 0, scratch, record_cb, @@ -408,7 +412,7 @@ TEST(StreamUtil, copy_reset4) { c.matches.clear(); - err = hs_reset_and_copy_stream(stream2, stream, scratch, nullptr, nullptr); + err = hs_reset_and_copy_stream(stream2, stream, nullptr, nullptr, nullptr); ASSERT_EQ(HS_SUCCESS, err); err = hs_scan_stream(stream, data1, sizeof(data1), 0, scratch, record_cb, @@ -458,7 +462,7 @@ TEST(StreamUtil, copy_reset5) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(0U, c.matches.size()); - err = hs_reset_and_copy_stream(stream2, stream, scratch, nullptr, nullptr); + err = hs_reset_and_copy_stream(stream2, stream, nullptr, nullptr, nullptr); ASSERT_EQ(HS_SUCCESS, err); err = hs_scan_stream(stream, data1, sizeof(data1), 0, scratch, record_cb, From 55b357f7d1276cd4935a9034343726211a873d20 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Fri, 23 Oct 2015 10:59:48 +1100 Subject: [PATCH 08/17] Remove enum mqe_event and use u32 for queue events We were using intermediate values int he enum and casting back and forth with a u32; it is cleaner to just use a u32 and define some special values. Silences ICC warning #188: enumerated type mixed with another type. --- src/nfa/lbr_common_impl.h | 6 +++--- src/nfa/mpvcompile.cpp | 6 +++--- src/nfa/nfa_api_queue.h | 30 +++++++++++++++++------------- src/rose/match.c | 6 +++--- src/rose/rose_build_bytecode.cpp | 2 +- src/rose/rose_internal.h | 4 ++-- 6 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/nfa/lbr_common_impl.h b/src/nfa/lbr_common_impl.h index ef12f193..917a8e91 100644 --- a/src/nfa/lbr_common_impl.h +++ b/src/nfa/lbr_common_impl.h @@ -151,12 +151,12 @@ char JOIN(ENGINE_EXEC_NAME, _TopScan)(const struct NFA *nfa, struct mq *q, while (1) { // Find the next top with location >= the last escape we saw. for (; q->cur < q->end && q_cur_loc(q) <= end; q->cur++) { - enum mqe_event t = q_cur_type(q); - if ((t == MQE_TOP || t == MQE_TOP_FIRST) && + u32 event = q_cur_type(q); + if ((event == MQE_TOP || event == MQE_TOP_FIRST) && q_cur_offset(q) >= lstate->lastEscape) { goto found_top; } - DEBUG_PRINTF("skip event type=%d offset=%lld\n", t, q_cur_offset(q)); + DEBUG_PRINTF("skip event type=%u offset=%lld\n", event, q_cur_offset(q)); } // No more tops, we're done. diff --git a/src/nfa/mpvcompile.cpp b/src/nfa/mpvcompile.cpp index ce0ac473..e4741ef1 100644 --- a/src/nfa/mpvcompile.cpp +++ b/src/nfa/mpvcompile.cpp @@ -250,7 +250,7 @@ void fillCounterInfos(vector *out, u32 *curr_decomp_offset, const map> &kilopuffs) { /* first the triggered puffs */ map>::const_iterator it = kilopuffs.begin(); - while (it != kilopuffs.end() && it->first.trigger_event != ~0U) { + while (it != kilopuffs.end() && it->first.trigger_event != MQE_INVALID) { assert(!it->first.auto_restart); assert(it->first.trigger_event == MQE_TOP_FIRST + distance(kilopuffs.begin(), it)); @@ -268,7 +268,7 @@ void fillCounterInfos(vector *out, u32 *curr_decomp_offset, */ map>::const_iterator trig_ite = it; while (it != kilopuffs.end() && !it->first.auto_restart) { - assert(it->first.trigger_event == ~0U); + assert(it->first.trigger_event == MQE_INVALID); ++it; } @@ -278,7 +278,7 @@ void fillCounterInfos(vector *out, u32 *curr_decomp_offset, kilopuffs, kilopuffs.begin(), it); } while (it != kilopuffs.end() && it->first.auto_restart) { - assert(it->first.trigger_event == ~0U); + assert(it->first.trigger_event == MQE_INVALID); out->push_back(mpv_counter_info()); map>::const_iterator it_o = it; diff --git a/src/nfa/nfa_api_queue.h b/src/nfa/nfa_api_queue.h index 123ce3e4..d8079292 100644 --- a/src/nfa/nfa_api_queue.h +++ b/src/nfa/nfa_api_queue.h @@ -41,24 +41,28 @@ extern "C" #define MAX_MQE_LEN 10 /** Queue events */ -enum mqe_event { - MQE_START = 0, /**< and begin! Note: stateless engines will start from - * this location */ - MQE_END = 1, /**< stop scanning */ - MQE_TOP = 2, /**< enable start + start dot star */ - MQE_TOP_FIRST = 4, /**< first event corresponding to a TOP _N_ */ - /* - * Additional tops (in multi-top engines) use the event values from - * MQE_TOP_FIRST to something. - */ +/** Queue event: begin scanning. Note: stateless engines will start from this + * location. */ +#define MQE_START 0U - MQE_INVALID = ~0U -}; +/** Queue event: stop scanning. */ +#define MQE_END 1U + +/** Queue event: enable start and start-dot-star. */ +#define MQE_TOP 2U + +/** Queue event: first event corresponding to a numbered TOP. Additional tops + * (in multi-top engines) use the event values from MQE_TOP_FIRST to + * MQE_INVALID - 1. */ +#define MQE_TOP_FIRST 4U + +/** Invalid queue event */ +#define MQE_INVALID (~0U) /** Queue item */ struct mq_item { - u32 type; /**< event; from mqe_event */ + u32 type; /**< event type, from MQE_* */ s64a location; /**< relative to the start of the current buffer */ u64a som; /**< pattern start-of-match corresponding to a top, only used * by som engines. */ diff --git a/src/rose/match.c b/src/rose/match.c index 6903baa5..32224c24 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -391,7 +391,7 @@ hwlmcb_rv_t roseHandleSuffixTrigger(const struct RoseEngine *t, } } - enum mqe_event top = tr->suffixEvent; + u32 top = tr->suffixEvent; assert(top == MQE_TOP || (top >= MQE_TOP_FIRST && top < MQE_INVALID)); pushQueueSom(q, top, loc, som); @@ -977,14 +977,14 @@ void roseTriggerInfixes(const struct RoseEngine *t, const struct RoseRole *tr, do { u32 qi = curr_r->queue; u32 ri = queueToLeftIndex(t, qi); - enum mqe_event topEvent = curr_r->event; + u32 topEvent = curr_r->event; u8 cancel = curr_r->cancel_prev_top; assert(topEvent < MQE_INVALID); const struct LeftNfaInfo *left = getLeftInfoByQueue(t, qi); assert(!left->transient); - DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, (u32)topEvent); + DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, topEvent); struct mq *q = tctxtToScratch(tctxt)->queues + qi; const struct NfaInfo *info = getNfaInfoByQueue(t, qi); diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 9cc1484f..bbc8644e 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -2433,7 +2433,7 @@ vector buildRoseTriggerList(const RoseGraph &g, RoseVertex u, assert(num_tops(g[v].left) == 1); top = MQE_TOP; } else { - top = (enum mqe_event)((u32)MQE_TOP_FIRST + g[e].rose_top); + top = MQE_TOP_FIRST + g[e].rose_top; assert(top < MQE_INVALID); } diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h index 441680ca..1f4fc93a 100644 --- a/src/rose/rose_internal.h +++ b/src/rose/rose_internal.h @@ -236,7 +236,7 @@ struct LeftNfaInfo { // A list of these is used to trigger prefix/infix roses. struct RoseTrigger { u32 queue; // queue index of leftfix - u32 event; // from enum mqe_event + u32 event; // queue event, from MQE_* u8 cancel_prev_top; }; @@ -309,7 +309,7 @@ struct RoseRole { ReportID reportId; // report ID, or MO_INVALID_IDX u32 stateIndex; /**< index into state multibit, or MMB_INVALID. Roles do not * require a state bit if they are terminal */ - u32 suffixEvent; // from enum mqe_event + u32 suffixEvent; // queue event, from MQE_ u8 depth; /**< depth of this vertex from root in the tree, or 255 if greater. */ u32 suffixOffset; /**< suffix nfa: 0 if no suffix associated with the role, From e67b8032f77ebe337c87aeb3f550089dccfdd990 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Tue, 20 Oct 2015 13:24:23 +1100 Subject: [PATCH 09/17] Unbreak unit-internal for builds w/o dump support Use printable, rather than escapeString. --- unit/internal/utf8_validate.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unit/internal/utf8_validate.cpp b/unit/internal/utf8_validate.cpp index 83358361..6649e6fe 100644 --- a/unit/internal/utf8_validate.cpp +++ b/unit/internal/utf8_validate.cpp @@ -31,7 +31,7 @@ #include "parser/utf8_validate.h" #include "ue2common.h" -#include "util/ue2string.h" +#include "util/string_util.h" #include "gtest/gtest.h" @@ -46,7 +46,7 @@ struct ValidUtf8TestInfo { // Helper for gtest. static void PrintTo(const ValidUtf8TestInfo &t, ::std::ostream *os) { - *os << "(" << t.str << ", " << t.is_valid << ")"; + *os << "(\"" << printable(t.str) << "\", " << t.is_valid << ")"; } static ValidUtf8TestInfo valid_utf8_tests[] = { @@ -118,5 +118,5 @@ INSTANTIATE_TEST_CASE_P(ValidUtf8, ValidUtf8Test, ValuesIn(valid_utf8_tests)); TEST_P(ValidUtf8Test, check) { const auto &info = GetParam(); ASSERT_EQ(info.is_valid, isValidUtf8(info.str.c_str())) - << "String is: " << escapeString(info.str) << std::endl; + << "String is: " << printable(info.str) << std::endl; } From b77613802d257162e826e2e3bfeca5d25b5c946f Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Mon, 26 Oct 2015 15:53:55 +1100 Subject: [PATCH 10/17] Update CMake required min version to 2.8.11 RedHat/CentOS 7 ship with 2.8.11 so this is a sane minimum. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0dd02942..9760d36a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 2.8) +cmake_minimum_required (VERSION 2.8.11) project (Hyperscan C CXX) set (HS_MAJOR_VERSION 4) From 1afc591c30b8171df82bdcdcc613e280558604d6 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 29 Oct 2015 09:08:40 +1100 Subject: [PATCH 11/17] Check for (and throw on) large min repeat We were only checking for large maximum bounds, which meant that we would attempt to compile A{N,} where N is huge. --- src/parser/ComponentRepeat.cpp | 12 ++++++++---- unit/hyperscan/bad_patterns.txt | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/parser/ComponentRepeat.cpp b/src/parser/ComponentRepeat.cpp index 325114d8..8cd88372 100644 --- a/src/parser/ComponentRepeat.cpp +++ b/src/parser/ComponentRepeat.cpp @@ -51,11 +51,11 @@ using namespace std; namespace ue2 { /** \brief Hard limit on the maximum repeat for bounded repeats. */ -static const u32 MAX_MAX_BOUND = 32767; +static constexpr u32 MAX_REPEAT = 32767; /** \brief If expanding a repeat would lead to this many positions being * generated, we fail the pattern. */ -static const u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge +static constexpr u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge /* no edge priorities means that if our subcomponent can be empty, our min * extent is effectively zero. */ @@ -67,7 +67,11 @@ ComponentRepeat::ComponentRepeat(unique_ptr sub_comp_in, u32 min, assert(sub_comp); assert(max > 0); assert(m_min <= m_max); - if (m_max < NoLimit && m_max > MAX_MAX_BOUND) { + + if (m_min > MAX_REPEAT) { + throw ParseError("Bounded repeat is too large."); + } + if (m_max != NoLimit && m_max > MAX_REPEAT) { throw ParseError("Bounded repeat is too large."); } } @@ -119,7 +123,7 @@ void checkPositions(vector &v, const GlushkovBuildState &bs) { void ComponentRepeat::notePositions(GlushkovBuildState &bs) { assert(m_max > 0); - assert(m_max == NoLimit || m_max < MAX_MAX_BOUND); + assert(m_max == NoLimit || m_max < MAX_REPEAT); /* Note: We can construct smaller subgraphs if we're not maintaining edge * priorities. */ diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 53a4dcc0..1ad445b3 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -127,3 +127,4 @@ 127:/^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ρρρρρρρρρρρρρρρρρρρρρρρρρρρ0}l.{1,60}Car*k|npanomnax+8Wnah/8 #Expression is not valid UTF-8. 128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/ρρρρρρρρρρρρρρρρρρρρρρρρρρρ0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8. 129:/bignum \1111111111111111111/ #Number is too big at index 7. +130:/foo|&{5555555,}/ #Bounded repeat is too large. From 629be086834197bc4d704730f07072ad6ee85e16 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Thu, 29 Oct 2015 14:35:02 +1100 Subject: [PATCH 12/17] reduce memory use in ng_small_literal_set/ng_literal_decorated These passes kept temporary strings/paths alive longer than was needed which lead to high memory usage during these passes in pathological cases. --- src/nfagraph/ng_literal_decorated.cpp | 16 ++++++++++++++++ src/nfagraph/ng_small_literal_set.cpp | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/nfagraph/ng_literal_decorated.cpp b/src/nfagraph/ng_literal_decorated.cpp index 652fd14a..02b25a73 100644 --- a/src/nfagraph/ng_literal_decorated.cpp +++ b/src/nfagraph/ng_literal_decorated.cpp @@ -67,6 +67,7 @@ static bool findPaths(const NGHolder &g, vector &paths) { vector order = getTopoOrdering(g); + vector read_count(num_vertices(g)); vector> built(num_vertices(g)); for (auto it = order.rbegin(); it != order.rend(); ++it) { @@ -74,6 +75,11 @@ bool findPaths(const NGHolder &g, vector &paths) { auto &out = built[g[v].index]; assert(out.empty()); + read_count[g[v].index] = out_degree(v, g); + + DEBUG_PRINTF("setting read_count to %zu for %u\n", + read_count[g[v].index], g[v].index); + if (v == g.start || v == g.startDs) { out.push_back({v}); continue; @@ -94,6 +100,9 @@ bool findPaths(const NGHolder &g, vector &paths) { continue; } + assert(!built[g[u].index].empty()); + assert(read_count[g[u].index]); + for (const auto &p : built[g[u].index]) { out.push_back(p); out.back().push_back(v); @@ -105,6 +114,13 @@ bool findPaths(const NGHolder &g, vector &paths) { return false; } } + + read_count[g[u].index]--; + if (!read_count[g[u].index]) { + DEBUG_PRINTF("clearing %u as finished reading\n", g[u].index); + built[g[u].index].clear(); + built[g[u].index].shrink_to_fit(); + } } } diff --git a/src/nfagraph/ng_small_literal_set.cpp b/src/nfagraph/ng_small_literal_set.cpp index 89cb0ff8..b5867bb9 100644 --- a/src/nfagraph/ng_small_literal_set.cpp +++ b/src/nfagraph/ng_small_literal_set.cpp @@ -118,10 +118,15 @@ bool findLiterals(const NGHolder &g, vector order = getTopoOrdering(g); vector> built(num_vertices(g)); + vector read_count(num_vertices(g)); for (auto it = order.rbegin(); it != order.rend(); ++it) { NFAVertex v = *it; set &out = built[g[v].index]; + read_count[g[v].index] = out_degree(v, g); + + DEBUG_PRINTF("setting read_count to %zu for %u\n", + read_count[g[v].index], g[v].index); assert(out.empty()); if (v == g.start) { @@ -149,7 +154,10 @@ bool findLiterals(const NGHolder &g, } set &in = built[g[u].index]; + DEBUG_PRINTF("getting from %u (%zu reads to go)\n", + g[u].index, read_count[g[u].index]); assert(!in.empty()); + assert(read_count[g[u].index]); for (const sls_literal &lit : in) { if (accept) { @@ -171,10 +179,18 @@ bool findLiterals(const NGHolder &g, out.insert(lit.append((u8)c, nocase)); if (out.size() + literals->size() > MAX_LITERAL_SET_SIZE) { + DEBUG_PRINTF("too big %zu + %zu\n", out.size(), + literals->size()); return false; } } } + + read_count[g[u].index]--; + if (!read_count[g[u].index]) { + DEBUG_PRINTF("clearing %u as finished reading\n", g[u].index); + in.clear(); + } } } @@ -206,6 +222,8 @@ bool handleSmallLiteralSets(RoseBuild &rose, const NGHolder &g, return false; } + DEBUG_PRINTF("looking for literals\n"); + map> literals; if (!findLiterals(g, &literals)) { DEBUG_PRINTF(":(\n"); From ba0b2b788bcf11c64b5503e1a4c5287327e92cc0 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Thu, 29 Oct 2015 17:29:06 +1100 Subject: [PATCH 13/17] cmake: collection of fixes --- CMakeLists.txt | 29 +++++++++++++++++------------ src/fdr/CMakeLists.txt | 6 +++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9760d36a..77e04a5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,19 +36,19 @@ else() set(RELEASE_BUILD FALSE) endif() -set(BINDIR ${PROJECT_BINARY_DIR}/bin) -set(LIBDIR ${PROJECT_BINARY_DIR}/lib) +set(BINDIR "${PROJECT_BINARY_DIR}/bin") +set(LIBDIR "${PROJECT_BINARY_DIR}/lib") # First for the generic no-config case -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${BINDIR}) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBDIR}) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBDIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${BINDIR}") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${LIBDIR}") +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${LIBDIR}") # Second, for multi-config builds (e.g. msvc) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string (TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${BINDIR}) - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${LIBDIR}) - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${LIBDIR}) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} "${BINDIR}") + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} "${LIBDIR}") + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} "${LIBDIR}") endforeach (OUTPUTCONFIG CMAKE_CONFIGURATION_TYPES) @@ -71,13 +71,14 @@ find_package(Boost ${BOOST_MINVERSION}) if(NOT Boost_FOUND) # we might have boost in tree, so provide a hint and try again message(STATUS "trying include dir for boost") - set(BOOST_INCLUDEDIR ${CMAKE_SOURCE_DIR}/include) + set(BOOST_INCLUDEDIR "${CMAKE_SOURCE_DIR}/include") find_package(Boost ${BOOST_MINVERSION}) if(NOT Boost_FOUND) - message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system pacakges if available or extract Boost headers to ${CMAKE_SOURCE_DIR}/include") + message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system pacakges if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.") endif() endif() + # -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6) find_package(PythonInterp) find_program(RAGEL ragel) @@ -88,6 +89,10 @@ else() message(FATAL_ERROR "No python interpreter found") endif() +if(${RAGEL} STREQUAL "RAGEL-NOTFOUND") + message(FATAL_ERROR "Ragel state machine compiler not found") +endif() + option(OPTIMISE "Turns off compiler optimizations (on by default unless debug output enabled or coverage testing)" TRUE) option(DEBUG_OUTPUT "Enable debug output (warning: very verbose)" FALSE) @@ -290,10 +295,10 @@ CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE) endif() if (NOT XCODE) - include_directories(SYSTEM ${Boost_INCLUDE_DIR}) + include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) else() # cmake doesn't think Xcode supports isystem - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${Boost_INCLUDE_DIR}") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${Boost_INCLUDE_DIRS}") endif() diff --git a/src/fdr/CMakeLists.txt b/src/fdr/CMakeLists.txt index 4790c527..25396689 100644 --- a/src/fdr/CMakeLists.txt +++ b/src/fdr/CMakeLists.txt @@ -13,11 +13,11 @@ set(AUTOGEN_PY_FILES function(fdr_autogen type out) add_custom_command ( COMMENT "AUTOGEN ${out}" - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${out} - COMMAND ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/autogen.py ${type} > ${CMAKE_CURRENT_BINARY_DIR}/${out} + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${out}" + COMMAND ${PYTHON} "${CMAKE_CURRENT_SOURCE_DIR}/autogen.py" ${type} > "${CMAKE_CURRENT_BINARY_DIR}/${out}" DEPENDS ${AUTOGEN_PY_FILES} ) - add_custom_target(autogen_${type} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${out}) + add_custom_target(autogen_${type} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${out}") endfunction(fdr_autogen) #now build the functions From aafbd96a1dc6863aefb85e94737b8f6dddaf0d79 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Thu, 29 Oct 2015 17:29:24 +1100 Subject: [PATCH 14/17] docs: describe BOOST_ROOT cmake variable --- doc/dev-reference/getting_started.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst index 7e43d10a..826349a7 100644 --- a/doc/dev-reference/getting_started.rst +++ b/doc/dev-reference/getting_started.rst @@ -125,8 +125,12 @@ Boost Headers Compiling Hyperscan depends on a recent version of the Boost C++ header library. If the Boost libraries are installed on the build machine in the -usual paths, CMake will find them. An alternative is to put a copy of (or a -symlink to) the boost subdirectory in ``/include/boost``. +usual paths, CMake will find them. If the Boost libraries are not installed, +the location of the Boost source tree can be specified during the CMake +configuration step using the ``BOOST_ROOT`` variable (described below). + +Another alternative is to put a copy of (or a symlink to) the boost +subdirectory in ``/include/boost``. For example: for the Boost-1.59.0 release: :: @@ -161,6 +165,8 @@ Common options for CMake include: | BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs. | | | Default off. | +------------------------+----------------------------------------------------+ +| BOOST_ROOT | Location of Boost source tree. | ++------------------------+----------------------------------------------------+ | DEBUG_OUTPUT | Enable very verbose debug output. Default off. | +------------------------+----------------------------------------------------+ From 1f47b8210635c6d3a76b99c1b09ce4f5c4744f45 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Fri, 30 Oct 2015 10:43:43 +1100 Subject: [PATCH 15/17] Remove unneeded code at preproc stage If we know we have BMI2 we shouldn't produce the fallback code. --- src/util/bitutils.h | 12 ++++++++---- src/util/shuffle.h | 6 ++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 6fdafa71..979a2c04 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -254,7 +254,7 @@ u32 compress32(u32 x, u32 m) { #if defined(__BMI2__) // BMI2 has a single instruction for this operation. return _pext_u32(x, m); -#endif +#else // Return zero quickly on trivial cases if ((x & m) == 0) { @@ -281,6 +281,7 @@ u32 compress32(u32 x, u32 m) { } return x; +#endif } static really_inline @@ -288,7 +289,7 @@ u64a compress64(u64a x, u64a m) { #if defined(ARCH_X86_64) && defined(__BMI2__) // BMI2 has a single instruction for this operation. return _pext_u64(x, m); -#endif +#else // Return zero quickly on trivial cases if ((x & m) == 0) { @@ -316,6 +317,7 @@ u64a compress64(u64a x, u64a m) { } return x; +#endif } static really_inline @@ -323,7 +325,7 @@ u32 expand32(u32 x, u32 m) { #if defined(__BMI2__) // BMI2 has a single instruction for this operation. return _pdep_u32(x, m); -#endif +#else // Return zero quickly on trivial cases if (!x || !m) { @@ -355,6 +357,7 @@ u32 expand32(u32 x, u32 m) { } return x & m0; // clear out extraneous bits +#endif } static really_inline @@ -362,7 +365,7 @@ u64a expand64(u64a x, u64a m) { #if defined(ARCH_X86_64) && defined(__BMI2__) // BMI2 has a single instruction for this operation. return _pdep_u64(x, m); -#endif +#else // Return zero quickly on trivial cases if (!x || !m) { @@ -395,6 +398,7 @@ u64a expand64(u64a x, u64a m) { } return x & m0; // clear out extraneous bits +#endif } diff --git a/src/util/shuffle.h b/src/util/shuffle.h index 847d07a0..ba85fb5d 100644 --- a/src/util/shuffle.h +++ b/src/util/shuffle.h @@ -51,7 +51,7 @@ u32 shuffleDynamic32(u32 x, u32 mask) { #if defined(HAVE_PEXT) // Intel BMI2 can do this operation in one instruction. return _pext_u32(x, mask); -#endif +#else u32 result = 0, num = 1; while (mask != 0) { @@ -63,6 +63,7 @@ u32 shuffleDynamic32(u32 x, u32 mask) { num <<= 1; } return result; +#endif } static really_inline @@ -70,7 +71,7 @@ u32 shuffleDynamic64(u64a x, u64a mask) { #if defined(HAVE_PEXT) && defined(ARCH_64_BIT) // Intel BMI2 can do this operation in one instruction. return _pext_u64(x, mask); -#endif +#else u32 result = 0, num = 1; while (mask != 0) { @@ -82,6 +83,7 @@ u32 shuffleDynamic64(u64a x, u64a mask) { num <<= 1; } return result; +#endif } #undef HAVE_PEXT From aa674e4e4724e917b822935e1b64a07d5fe7fe06 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Fri, 30 Oct 2015 11:14:32 +1100 Subject: [PATCH 16/17] unit: Don't run unit-internal in release build --- unit/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 21416a25..9bc74e23 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -112,6 +112,7 @@ endif() # # build target to run unit tests # +if (NOT RELEASE_BUILD) add_custom_target( unit COMMAND bin/unit-internal @@ -119,3 +120,11 @@ add_custom_target( WORKING_DIRECTORY ${CMAKE_BINARY_DIR} DEPENDS unit-internal unit-hyperscan ) +else () +add_custom_target( + unit + COMMAND bin/unit-hyperscan + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-hyperscan +) +endif() From 91343b00e94dbc4a3997324e281b06a5846cca9b Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Fri, 30 Oct 2015 11:20:28 +1100 Subject: [PATCH 17/17] Bump version number --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77e04a5e..67885c93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project (Hyperscan C CXX) set (HS_MAJOR_VERSION 4) set (HS_MINOR_VERSION 0) -set (HS_PATCH_VERSION 0) +set (HS_PATCH_VERSION 1) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) string (TIMESTAMP BUILD_DATE "%Y-%m-%d")