From 04f9cc9449be111955e18d7a034a791673ddff80 Mon Sep 17 00:00:00 2001
From: gtsoul-tech
Date: Tue, 12 Nov 2024 16:42:29 +0200
Subject: [PATCH] rename test to regression

---
 unit/CMakeLists.txt            |   2 +-
 unit/hyperscan/regressions.cpp | 282 +++++++++++++++++++++++++++++++++
 2 files changed, 283 insertions(+), 1 deletion(-)
 create mode 100644 unit/hyperscan/regressions.cpp

diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt
index 0a118b24..7e16f333 100644
--- a/unit/CMakeLists.txt
+++ b/unit/CMakeLists.txt
@@ -50,7 +50,7 @@ set(unit_hyperscan_SOURCES
     hyperscan/main.cpp
     hyperscan/multi.cpp
     hyperscan/order.cpp
-    hyperscan/rebar_tests.cpp
+    hyperscan/regressions.cpp
     hyperscan/scratch_op.cpp
     hyperscan/scratch_in_use.cpp
     hyperscan/serialize.cpp
diff --git a/unit/hyperscan/regressions.cpp b/unit/hyperscan/regressions.cpp
new file mode 100644
index 00000000..320b0d06
--- /dev/null
+++ b/unit/hyperscan/regressions.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2018-2019, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "hs.h"
+#include "config.h"
+#include "test_util.h"
+
+using namespace std;
+
+// Two-level expansion so that SRCDIR is macro-expanded before it is stringified.
+#define xstr(s) to_string_literal(s)
+#define to_string_literal(s) #s
+
+#define SRCDIR_PREFIX xstr(SRCDIR)
+
+// Scans datafiles/leipzig-3200.txt for \p{Sm} with the UCP and UTF8 flags set
+// and expects c.matches to hold 69 entries afterwards.
+TEST(rebar, leipzig_math_symbols_count) {
+    hs_database_t *db = nullptr;
+    hs_compile_error_t *compile_err = nullptr;
+    CallBackContext c;
+    const char *expr = "\\p{Sm}";
+    const unsigned flag = HS_FLAG_UCP | HS_FLAG_UTF8;
+    hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err);
+
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(db != nullptr);
+
+    hs_scratch_t *scratch = nullptr;
+    err = hs_alloc_scratch(db, &scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(scratch != nullptr);
+
+    string filename = "unit/hyperscan/datafiles/leipzig-3200.txt";
+    std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str());
+    // Fail here rather than at the match count when the data file is missing.
+    ASSERT_TRUE(file.is_open());
+    std::stringstream buffer;
+    buffer << file.rdbuf(); // Read the file into the buffer
+    std::string data = buffer.str(); // Convert the buffer into a std::string
+
+    c.halt = 0;
+    err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb,
+                  reinterpret_cast<void *>(&c));
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_EQ(69, c.matches.size());
+
+    hs_free_database(db);
+    err = hs_free_scratch(scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+}
+
+// Returns a copy of `input` in which every byte that does not begin a
+// well-formed one-, two- or three-byte UTF-8 sequence is replaced by U+FFFD
+// (bytes EF BF BD). Only the lead byte and the 10xxxxxx shape of the
+// continuation bytes are checked, so overlong forms and surrogates pass
+// through unchanged. Four-byte sequences are not recognised: their lead byte
+// and each continuation byte are replaced individually.
+std::string utf8_lossy_decode(const std::string &input) {
+    static const char replacement[] = "\xEF\xBF\xBD";
+    // True when `pos` is in range and holds a 10xxxxxx continuation byte.
+    const auto is_continuation = [&input](size_t pos) {
+        return pos < input.size() && (input[pos] & 0xC0) == 0x80;
+    };
+
+    std::string output;
+    for (size_t i = 0; i < input.size(); ++i) {
+        const unsigned char c = input[i];
+        if (c < 0x80) {
+            output += static_cast<char>(c);
+        } else if (c >= 0xC0 && c < 0xE0 && is_continuation(i + 1)) {
+            output.append(input, i, 2);
+            ++i;
+        } else if (c >= 0xE0 && c < 0xF0 && is_continuation(i + 1) &&
+                   is_continuation(i + 2)) {
+            output.append(input, i, 3);
+            i += 2;
+        } else {
+            output += replacement;
+        }
+    }
+    return output;
+}
+
+// Scans the lossily decoded datafiles/lh3lh3-reb-howto.txt with the URI-or-email
+// expression and expects c.matches to hold 888987 entries afterwards.
+TEST(rebar, lh3lh3_reb_uri_or_email_grep) {
+    hs_database_t *db = nullptr;
+    hs_compile_error_t *compile_err = nullptr;
+    CallBackContext c;
+    const char *expr = "([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)";
+    const unsigned flag = 0;
+    hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err);
+
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(db != nullptr);
+
+    hs_scratch_t *scratch = nullptr;
+    err = hs_alloc_scratch(db, &scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(scratch != nullptr);
+
+    string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt";
+    std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str());
+    // Fail here rather than at the match count when the data file is missing.
+    ASSERT_TRUE(file.is_open());
+    std::stringstream buffer;
+    buffer << file.rdbuf(); // Read the file into the buffer
+    std::string data = buffer.str(); // Convert the buffer into a std::string
+
+    // Decode the data using UTF-8 lossy decoding
+    std::string decoded_data = utf8_lossy_decode(data);
+
+    c.halt = 0;
+    err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb,
+                  reinterpret_cast<void *>(&c));
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_EQ(888987, c.matches.size());
+
+    hs_free_database(db);
+    err = hs_free_scratch(scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+}
+
+// Scans the lossily decoded datafiles/lh3lh3-reb-howto.txt with the email
+// expression and expects c.matches to hold 232354 entries afterwards.
+TEST(rebar, lh3lh3_reb_email_grep) {
+    hs_database_t *db = nullptr;
+    hs_compile_error_t *compile_err = nullptr;
+    CallBackContext c;
+    const char *expr = "([^ @]+)@([^ @]+)";
+    const unsigned flag = 0;
+    hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err);
+
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(db != nullptr);
+
+    hs_scratch_t *scratch = nullptr;
+    err = hs_alloc_scratch(db, &scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(scratch != nullptr);
+
+    string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt";
+    std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str());
+    // Fail here rather than at the match count when the data file is missing.
+    ASSERT_TRUE(file.is_open());
+    std::stringstream buffer;
+    buffer << file.rdbuf(); // Read the file into the buffer
+    std::string data = buffer.str(); // Convert the buffer into a std::string
+
+    // Decode the data using UTF-8 lossy decoding
+    std::string decoded_data = utf8_lossy_decode(data);
+
+    c.halt = 0;
+    err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb,
+                  reinterpret_cast<void *>(&c));
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_EQ(232354, c.matches.size());
+
+    hs_free_database(db);
+    err = hs_free_scratch(scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+}
+
+// Scans the lossily decoded datafiles/lh3lh3-reb-howto.txt with the n/n/n date
+// expression and expects c.matches to hold 819 entries afterwards.
+TEST(rebar, lh3lh3_reb_date_grep) {
+    hs_database_t *db = nullptr;
+    hs_compile_error_t *compile_err = nullptr;
+    CallBackContext c;
+    const char *expr = "([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)";
+    const unsigned flag = 0;
+    hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err);
+
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(db != nullptr);
+
+    hs_scratch_t *scratch = nullptr;
+    err = hs_alloc_scratch(db, &scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(scratch != nullptr);
+
+    string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt";
+    std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str());
+    // Fail here rather than at the match count when the data file is missing.
+    ASSERT_TRUE(file.is_open());
+    std::stringstream buffer;
+    buffer << file.rdbuf(); // Read the file into the buffer
+    std::string data = buffer.str(); // Convert the buffer into a std::string
+
+    // Decode the data using UTF-8 lossy decoding
+    std::string decoded_data = utf8_lossy_decode(data);
+
+    c.halt = 0;
+    err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb,
+                  reinterpret_cast<void *>(&c));
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_EQ(819, c.matches.size());
+
+    hs_free_database(db);
+    err = hs_free_scratch(scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+}
+
+// Patterns compiled together by the bug317 test below; ids follow array order.
+const char *patterns[] = {
+    "^muvoy-nyemcynjywynamlahi/nyzye/khjdrehko-(qjhn|lyol)-.*/0$",
+    "^cop/devel/workflows-(prod|test)-.*/[0-9]+$", // Regex pattern that will match our fixture
+};
+
+// Regression test for bug 317: with both patterns compiled with the prefilter
+// flag set, scanning the fixture input must invoke the match callback.
+TEST(bug317, regressionOnx86Bug317) {
+    hs_database_t *database = nullptr;
+    hs_compile_error_t *compile_err = nullptr;
+
+    unsigned ids[2] = {0, 1};
+
+    const unsigned flag = HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 | HS_FLAG_PREFILTER;
+    const std::vector<unsigned> flags(2, flag);
+    hs_error_t err = hs_compile_multi(patterns, flags.data(), ids, 2, HS_MODE_BLOCK, nullptr, &database, &compile_err);
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(database != nullptr);
+
+    // Allocate scratch space
+    hs_scratch_t *scratch = nullptr;
+    err = hs_alloc_scratch(database, &scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(scratch != nullptr);
+
+    // This input should match
+    const char *input = "cop/devel/workflows-prod-build-cop-cop-ingestor/0";
+
+    // Scan the input
+    bool matchFound = false;
+    auto matchHandler = [](unsigned int, unsigned long long, unsigned long long, unsigned int, void *ctx) -> int {
+        bool *found = static_cast<bool *>(ctx);
+        *found = true;
+        return 0;
+    };
+
+    err = hs_scan(database, input, strlen(input), 0, scratch, matchHandler, reinterpret_cast<void *>(&matchFound));
+    ASSERT_EQ(HS_SUCCESS, err);
+    ASSERT_TRUE(matchFound);
+
+    // Clean up
+    hs_free_database(database);
+    err = hs_free_scratch(scratch);
+    ASSERT_EQ(HS_SUCCESS, err);
+}