From 6c8e33e59712b4a2b46447b529af9afa2ea93a4d Mon Sep 17 00:00:00 2001 From: gtsoul-tech <56584633+gtsoul-tech@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:49:25 +0300 Subject: [PATCH] Bug fix/rebar tests (#307) * fixed paths and utf8-lossy=true * revert to maskz (its the bug) * cppcheck fix --------- Co-authored-by: gtsoul-tech --- src/nfa/vermicelli_simd.cpp | 4 +- unit/hyperscan/rebar_tests.cpp | 84 +++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 19 deletions(-) diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp index 67ac1dac..6aaa679c 100644 --- a/src/nfa/vermicelli_simd.cpp +++ b/src/nfa/vermicelli_simd.cpp @@ -124,8 +124,8 @@ static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector c // finish off tail if (d != buf_end) { - SuperVector data = SuperVector::loadu(buf_end - S); - rv = vermicelliBlock(data, chars, casemask, buf_end - S, buf_end - d); + SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); + rv = vermicelliBlock(data, chars, casemask, d, buf_end - d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } diff --git a/unit/hyperscan/rebar_tests.cpp b/unit/hyperscan/rebar_tests.cpp index be23d677..26e72055 100644 --- a/unit/hyperscan/rebar_tests.cpp +++ b/unit/hyperscan/rebar_tests.cpp @@ -43,13 +43,18 @@ using namespace std; +#define xstr(s) to_string_literal(s) +#define to_string_literal(s) #s + +#define SRCDIR_PREFIX xstr(SRCDIR) + + TEST(rebar, leipzig_math_symbols_count) { hs_database_t *db = nullptr; hs_compile_error_t *compile_err = nullptr; CallBackContext c; const char *expr = "\\p{Sm}"; const unsigned flag = HS_FLAG_UCP | HS_FLAG_UTF8; - const unsigned id= 1; hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); @@ -60,8 +65,8 @@ TEST(rebar, leipzig_math_symbols_count) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(scratch != nullptr); - - std::ifstream file("../source/unit/hyperscan/datafiles/leipzig-3200.txt"); + string filename = "unit/hyperscan/datafiles/leipzig-3200.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string @@ -77,14 +82,54 @@ TEST(rebar, leipzig_math_symbols_count) { ASSERT_EQ(HS_SUCCESS, err); } +// Function to replace invalid UTF-8 sequences with the replacement character +std::string utf8_lossy_decode(const std::string &input) { + std::string output; + for (size_t i = 0; i < input.size(); ++i) { + unsigned char c = input[i]; + if (c < 0x80) { + output += c; + } else if (c < 0xC0) { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } else if (c < 0xE0) { + if (i + 1 < input.size() && (input[i + 1] & 0xC0) == 0x80) { + output += c; + output += input[i + 1]; + ++i; + } else { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } + } else if (c < 0xF0) { + if (i + 2 < input.size() && (input[i + 1] & 0xC0) == 0x80 && (input[i + 2] & 0xC0) == 0x80) { + output += c; + output += input[i + 1]; + output += input[i + 2]; + i += 2; + } else { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } + } else { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } + } + return output; +} + TEST(rebar, lh3lh3_reb_uri_or_email_grep) { hs_database_t *db = nullptr; hs_compile_error_t *compile_err = nullptr; CallBackContext c; const char *expr = "([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)"; const unsigned flag = 0; - const unsigned id= 1; - hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); + hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -94,14 +139,17 @@ TEST(rebar, lh3lh3_reb_uri_or_email_grep) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(scratch != nullptr); - - std::ifstream file("../source/unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"); + string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string + // Decode the data using UTF-8 lossy decoding + std::string decoded_data = utf8_lossy_decode(data); + c.halt = 0; - err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb, reinterpret_cast(&c)); ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(888987, c.matches.size()); @@ -117,8 +165,7 @@ TEST(rebar, lh3lh3_reb_email_grep) { CallBackContext c; const char *expr = "([^ @]+)@([^ @]+)"; const unsigned flag = 0; - const unsigned id= 1; - hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); + hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -128,14 +175,17 @@ TEST(rebar, lh3lh3_reb_email_grep) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(scratch != nullptr); - - std::ifstream file("../source/unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"); + string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string + // Decode the data using UTF-8 lossy decoding + std::string decoded_data = utf8_lossy_decode(data); + c.halt = 0; - err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb, reinterpret_cast(&c)); ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(232354, c.matches.size()); @@ -152,7 +202,6 @@ TEST(rebar, lh3lh3_reb_date_grep) { CallBackContext c; const char *expr = "([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)"; const unsigned flag = 0; - const unsigned id= 1; hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); @@ -164,13 +213,14 @@ TEST(rebar, lh3lh3_reb_date_grep) { ASSERT_TRUE(scratch != nullptr); - std::ifstream file("../source/unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"); + string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string - + std::string decoded_data = utf8_lossy_decode(data); c.halt = 0; - err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb, reinterpret_cast(&c)); ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(819, c.matches.size());