From 1330265cede3f6da51e023cb3975125a3e816d2f Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Tue, 12 Dec 2017 09:29:20 +1100 Subject: [PATCH] hscollider: tool for testing Hyperscan match behaviour against PCRE --- cmake/pcre.cmake | 62 + tools/hscollider/BoundedQueue.h | 291 +++ tools/hscollider/CMakeLists.txt | 79 + tools/hscollider/ColliderCorporaParser.h | 39 + tools/hscollider/ColliderCorporaParser.rl | 150 ++ tools/hscollider/Corpora.cpp | 31 + tools/hscollider/Corpora.h | 68 + tools/hscollider/DatabaseProxy.h | 88 + tools/hscollider/FileCorpora.cpp | 99 + tools/hscollider/FileCorpora.h | 57 + tools/hscollider/GraphTruth.cpp | 308 ++++ tools/hscollider/GraphTruth.h | 144 ++ tools/hscollider/GroundTruth.cpp | 513 ++++++ tools/hscollider/GroundTruth.h | 126 ++ tools/hscollider/NfaGeneratedCorpora.cpp | 146 ++ tools/hscollider/NfaGeneratedCorpora.h | 61 + tools/hscollider/ResultSet.h | 139 ++ tools/hscollider/Thread.cpp | 95 + tools/hscollider/Thread.h | 60 + tools/hscollider/UltimateTruth.cpp | 1026 +++++++++++ tools/hscollider/UltimateTruth.h | 142 ++ tools/hscollider/args.cpp | 570 ++++++ tools/hscollider/args.h | 46 + tools/hscollider/common.h | 92 + tools/hscollider/limit.cpp | 63 + tools/hscollider/limit.h | 36 + tools/hscollider/main.cpp | 2002 +++++++++++++++++++++ tools/hscollider/pcre_util.cpp | 90 + tools/hscollider/pcre_util.h | 41 + tools/hscollider/sig.cpp | 185 ++ tools/hscollider/sig.h | 57 + tools/hscollider/simple_timer.h | 54 + 32 files changed, 6960 insertions(+) create mode 100644 cmake/pcre.cmake create mode 100644 tools/hscollider/BoundedQueue.h create mode 100644 tools/hscollider/CMakeLists.txt create mode 100644 tools/hscollider/ColliderCorporaParser.h create mode 100644 tools/hscollider/ColliderCorporaParser.rl create mode 100644 tools/hscollider/Corpora.cpp create mode 100644 tools/hscollider/Corpora.h create mode 100644 tools/hscollider/DatabaseProxy.h create mode 100644 tools/hscollider/FileCorpora.cpp create mode 100644 
tools/hscollider/FileCorpora.h create mode 100644 tools/hscollider/GraphTruth.cpp create mode 100644 tools/hscollider/GraphTruth.h create mode 100644 tools/hscollider/GroundTruth.cpp create mode 100644 tools/hscollider/GroundTruth.h create mode 100644 tools/hscollider/NfaGeneratedCorpora.cpp create mode 100644 tools/hscollider/NfaGeneratedCorpora.h create mode 100644 tools/hscollider/ResultSet.h create mode 100644 tools/hscollider/Thread.cpp create mode 100644 tools/hscollider/Thread.h create mode 100644 tools/hscollider/UltimateTruth.cpp create mode 100644 tools/hscollider/UltimateTruth.h create mode 100644 tools/hscollider/args.cpp create mode 100644 tools/hscollider/args.h create mode 100644 tools/hscollider/common.h create mode 100644 tools/hscollider/limit.cpp create mode 100644 tools/hscollider/limit.h create mode 100644 tools/hscollider/main.cpp create mode 100644 tools/hscollider/pcre_util.cpp create mode 100644 tools/hscollider/pcre_util.h create mode 100644 tools/hscollider/sig.cpp create mode 100644 tools/hscollider/sig.h create mode 100644 tools/hscollider/simple_timer.h diff --git a/cmake/pcre.cmake b/cmake/pcre.cmake new file mode 100644 index 00000000..30b33b88 --- /dev/null +++ b/cmake/pcre.cmake @@ -0,0 +1,62 @@ +# first look in pcre-$version or pcre subdirs +if (PCRE_SOURCE) + # either provided on cmdline or we've seen it already + set (PCRE_BUILD_SOURCE TRUE) +elseif (EXISTS ${PROJECT_SOURCE_DIR}/pcre-${PCRE_REQUIRED_VERSION}) + set (PCRE_SOURCE ${PROJECT_SOURCE_DIR}/pcre-${PCRE_REQUIRED_VERSION}) + set (PCRE_BUILD_SOURCE TRUE) +elseif (EXISTS ${PROJECT_SOURCE_DIR}/pcre) + set (PCRE_SOURCE ${PROJECT_SOURCE_DIR}/pcre) + set (PCRE_BUILD_SOURCE TRUE) +endif() + +if (PCRE_BUILD_SOURCE) + if (NOT IS_ABSOLUTE ${PCRE_SOURCE}) + set(PCRE_SOURCE "${CMAKE_BINARY_DIR}/${PCRE_SOURCE}") + endif () + set (saved_INCLUDES "${CMAKE_REQUIRED_INCLUDES}") + set (CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES} ${PCRE_SOURCE}") + + if (PCRE_CHECKED) + 
set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre) + set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre) + + # already processed this file and set up pcre building + return() + endif () + + # first, check version number + CHECK_C_SOURCE_COMPILES("#include + #if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR != ${PCRE_REQUIRED_MINOR_VERSION} + #error Incorrect pcre version + #endif + main() {}" CORRECT_PCRE_VERSION) + set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}") + + if (NOT CORRECT_PCRE_VERSION) + unset(CORRECT_PCRE_VERSION CACHE) + message(FATAL_ERROR "Incorrect version of pcre - version ${PCRE_REQUIRED_VERSION} is required") + else() + message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION} - building from source.") + endif() + + # PCRE compile options + option(PCRE_BUILD_PCRECPP OFF) + option(PCRE_BUILD_PCREGREP OFF) + option(PCRE_SHOW_REPORT OFF) + set(PCRE_SUPPORT_UNICODE_PROPERTIES ON CACHE BOOL "Build pcre with unicode") + add_subdirectory(${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre EXCLUDE_FROM_ALL) + set(PCRE_INCLUDE_DIRS ${PCRE_SOURCE} ${PROJECT_BINARY_DIR}/pcre) + set(PCRE_LDFLAGS -L"${LIBDIR}" -lpcre) +else () + # pkgconf should save us + find_package(PkgConfig) + pkg_check_modules(PCRE libpcre=${PCRE_REQUIRED_VERSION}) + if (PCRE_FOUND) + message(STATUS "PCRE version ${PCRE_REQUIRED_VERSION}") + else () + message(FATAL_ERROR "PCRE version ${PCRE_REQUIRED_VERSION} not found") + endif () +endif (PCRE_BUILD_SOURCE) + +set (PCRE_CHECKED TRUE PARENT_SCOPE) diff --git a/tools/hscollider/BoundedQueue.h b/tools/hscollider/BoundedQueue.h new file mode 100644 index 00000000..ff7d013b --- /dev/null +++ b/tools/hscollider/BoundedQueue.h @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of 
conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef BOUNDEDQUEUE_H +#define BOUNDEDQUEUE_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +//#define QUEUE_STATS 1 + +#ifdef QUEUE_STATS + +#include + +class BoundedQueueStats { +public: + size_t pop = 0; //!< Number of pop operations. + size_t pop_block = 0; //!< Number of pop operations that had to block. + size_t push = 0; //!< Number of push operations. + size_t push_elements = 0; //!< Number of elements pushed. + size_t push_block = 0; //!< Number of push operations that had to block. + size_t refill = 0; //!< Number of refills done. + size_t stolen_from = 0; //!< Number of times we were stolen from. 
+ + void dump() const { + std::cout << "pop : " << pop << std::endl; + std::cout << "pop_block : " << pop_block << std::endl; + std::cout << "push : " << push << std::endl; + std::cout << "push_elements : " << push_elements << std::endl; + std::cout << "push_block : " << push_block << std::endl; + std::cout << "refill : " << refill << std::endl; + std::cout << "stolen_from : " << stolen_from << std::endl; + } +}; +#endif + +template +class BoundedQueue : boost::noncopyable { +private: + // Encapsulates a queue and the mutex used to protect access to it. + class MutexQueue { + public: + // Forwarded queue operations. + void push(std::unique_ptr elem) { q.push(std::move(elem)); } + void pop() { q.pop(); } + std::unique_ptr &front() { return q.front(); } + bool empty() const { return q.empty(); } + size_t size() const { return q.size(); } + + // Acquire the mutex lock. + std::unique_lock lock() { + return std::unique_lock(mutex); + } + +#ifdef QUEUE_STATS + BoundedQueueStats stats; +#endif + + private: + std::mutex mutex; + std::queue> q; + }; + +public: + BoundedQueue(size_t consumers, size_t size) + : max_elements(size), consumer_q(consumers) { + assert(consumers > 0); + assert(size > 0); + } + +#ifdef QUEUE_STATS + ~BoundedQueue() { + std::cout << "Global queue stats:" << std::endl; + global_q.stats.dump(); + std::cout << std::endl; + for (size_t i = 0; i < consumer_q.size(); i++) { + std::cout << "Consumer queue " << i << ":" << std::endl; + consumer_q[i].stats.dump(); + std::cout << std::endl; + } + } +#endif // QUEUE_STATS + + void push(std::unique_ptr elem) { + auto lock = global_q.lock(); + +#ifdef QUEUE_STATS + global_q.stats.push++; + global_q.stats.push_elements++; + if (global_q.size() >= max_elements) { + global_q.stats.push_block++; + } +#endif // QUEUE_STATS + + // Block until queue is able to accept new elements. 
+ cond_can_accept.wait(lock, + [&] { return global_q.size() < max_elements; }); + assert(global_q.size() < max_elements); + + global_q.push(std::move(elem)); + cond_can_consume.notify_all(); + } + + template + void push(Iter begin, Iter end) { + using ElemType = typename std::remove_reference::type; + static_assert(std::is_same>::value, + "Iterator must be over unique_ptr"); + + if (begin == end) { + return; + } + + auto lock = global_q.lock(); + +#ifdef QUEUE_STATS + global_q.stats.push++; + global_q.stats.push_elements += std::distance(begin, end); + if (global_q.size() >= max_elements) { + global_q.stats.push_block++; + } +#endif // QUEUE_STATS + + // Block until queue is able to accept new elements. + cond_can_accept.wait(lock, + [&] { return global_q.size() < max_elements; }); + assert(global_q.size() < max_elements); + + for (auto it = begin; it != end; ++it) { + global_q.push(std::move(*it)); + } + cond_can_consume.notify_all(); + } + + std::unique_ptr pop(size_t consumer_id) { + assert(consumer_id < consumer_q.size()); + auto &q = consumer_q[consumer_id]; + + // Try and satisfy the request from our per-consumer queue. + { + auto consumer_lock = q.lock(); + if (!q.empty()) { + return pop_from_queue(q); + } + } + + // Try and satisfy the request with a refill from the global queue. + { + auto lock = global_q.lock(); + if (!global_q.empty()) { + auto consumer_lock = q.lock(); + return refill_and_pop(q); + } + } + + // Try and satisfy the request by stealing it from another queue. + for (size_t i = 1; i < consumer_q.size(); i++) { + size_t victim_id = (consumer_id + i) % consumer_q.size(); + auto &victim_q = consumer_q[victim_id]; + auto victim_lock = victim_q.lock(); + // Note: we don't steal sentinel elements. + if (!victim_q.empty() && victim_q.front() != nullptr) { +#ifdef QUEUE_STATS + victim_q.stats.stolen_from++; +#endif + return pop_from_queue(victim_q); + } + } + + // All avenues exhausted, we must block until we've received a new + // element. 
+ auto lock = global_q.lock(); +#ifdef QUEUE_STATS + global_q.stats.pop_block++; +#endif + cond_can_consume.wait(lock, [&]{ return !global_q.empty(); }); + assert(!global_q.empty()); + auto consumer_lock = q.lock(); + return refill_and_pop(q); + } + +private: + std::unique_ptr pop_from_queue(MutexQueue &q) { + assert(!q.empty()); + auto elem = std::move(q.front()); + q.pop(); +#ifdef QUEUE_STATS + q.stats.pop++; +#endif + return elem; + } + + std::unique_ptr refill_and_pop(MutexQueue &q) { + assert(!global_q.empty()); + +#ifdef QUEUE_STATS + q.stats.refill++; +#endif + + auto elem = pop_from_queue(global_q); + if (elem == nullptr) { + return elem; // Sentinel. + } + + // Grab all subsequent elements that share the same ID. + const auto &id = elem->id; + while (!global_q.empty()) { + auto &first = global_q.front(); + if (first == nullptr) { +#ifdef QUEUE_STATS + q.stats.push++; + q.stats.push_elements++; +#endif + // Sentinel element. We can grab one, but no more. + q.push(pop_from_queue(global_q)); + break; + } + if (first->id != id) { + break; + } +#ifdef QUEUE_STATS + q.stats.push++; + q.stats.push_elements++; +#endif + q.push(pop_from_queue(global_q)); + } + + if (global_q.size() < max_elements) { + cond_can_accept.notify_all(); + } + + return elem; + } + + // Maximum number of elements in the global queue (subsequent push + // operations will block). Note that we may overshoot this value when + // handling bulk push operations. + const size_t max_elements; + + // Global queue. + MutexQueue global_q; + + // Per-consumer queues. + std::vector consumer_q; + + // Condition variable for producers to wait on when the queue is full. + std::condition_variable cond_can_accept; + + // Condition variable for consumers to wait on when the queue is empty. 
+ std::condition_variable cond_can_consume; +}; + +#ifdef QUEUE_STATS +#undef QUEUE_STATS +#endif + +#endif // BOUNDEDQUEUE_H diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt new file mode 100644 index 00000000..2816b76d --- /dev/null +++ b/tools/hscollider/CMakeLists.txt @@ -0,0 +1,79 @@ +# we have a fixed requirement for PCRE +set(PCRE_REQUIRED_MAJOR_VERSION 8) +set(PCRE_REQUIRED_MINOR_VERSION 41) +set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) + +include (${CMAKE_MODULE_PATH}/pcre.cmake) + +include_directories(${PCRE_INCLUDE_DIRS}) + +include(${CMAKE_MODULE_PATH}/backtrace.cmake) + +# we need static libs - too much deep magic for shared libs +if (NOT BUILD_STATIC_LIBS) + return () +endif () + +CHECK_FUNCTION_EXISTS(sigaltstack HAVE_SIGALTSTACK) +CHECK_FUNCTION_EXISTS(sigaction HAVE_SIGACTION) +CHECK_FUNCTION_EXISTS(setrlimit HAVE_SETRLIMIT) + +set_source_files_properties( + ${CMAKE_CURRENT_BINARY_DIR}/ColliderCorporaParser.cpp + PROPERTIES + COMPILE_FLAGS "${RAGEL_C_FLAGS} -I${CMAKE_CURRENT_SOURCE_DIR}") + +ragelmaker(ColliderCorporaParser.rl) + +# only set these after all tests are done +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") + +SET(hscollider_SOURCES + common.h + BoundedQueue.h + Corpora.cpp + FileCorpora.h + FileCorpora.cpp + ColliderCorporaParser.h + ColliderCorporaParser.cpp + NfaGeneratedCorpora.h + NfaGeneratedCorpora.cpp + GraphTruth.h + GraphTruth.cpp + GroundTruth.h + GroundTruth.cpp + UltimateTruth.h + UltimateTruth.cpp + ResultSet.h + args.cpp + args.h + limit.cpp + pcre_util.cpp + sig.cpp + sig.h + DatabaseProxy.h + Thread.h + Thread.cpp + main.cpp +) + +set_source_files_properties(${hscollider_SOURCES} PROPERTIES + INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}) +add_executable(hscollider ${hscollider_SOURCES}) +add_dependencies(hscollider ragel_ColliderCorporaParser) +add_dependencies(hscollider 
pcre) + +if(NOT WIN32) + target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil + expressionutil corpusomatic crosscompileutil pthread + "${BACKTRACE_LDFLAGS}") + +if(HAVE_BACKTRACE) # NOTE(review): the call below names the literal "hscollider_SOURCES" (no ${...} expansion) and omits the PROPERTIES keyword, unlike the INCLUDE_DIRECTORIES call earlier in this file, so BACKTRACE_CFLAGS is probably not applied to the collider sources - confirm intent before changing + set_source_files_properties(hscollider_SOURCES COMPILE_FLAGS + "${BACKTRACE_CFLAGS}") +endif() +else() # WIN32 + target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil + expressionutil corpusomatic crosscompileutil) +endif() diff --git a/tools/hscollider/ColliderCorporaParser.h b/tools/hscollider/ColliderCorporaParser.h new file mode 100644 index 00000000..385e4ec8 --- /dev/null +++ b/tools/hscollider/ColliderCorporaParser.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FILECORPORAPARSER_H +#define FILECORPORAPARSER_H + +#include + +struct Corpus; + +// parse an escaped string into a real data buffer +bool parseCorpus(const std::string &line, Corpus &c, unsigned int &id); + +#endif diff --git a/tools/hscollider/ColliderCorporaParser.rl b/tools/hscollider/ColliderCorporaParser.rl new file mode 100644 index 00000000..ab40b2ba --- /dev/null +++ b/tools/hscollider/ColliderCorporaParser.rl @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "ColliderCorporaParser.h" +#include "Corpora.h" + +#include "ue2common.h" + +#include +#include +#include +#include + +using namespace std; + +namespace /* anonymous */ { + +// Take a four-character escape like '\xFF' (backslash, 'x', two hex digits) and convert it to the character it represents; [start, end) must span exactly that escape +char unhex(const char *start, UNUSED const char *end) { + assert(start + 4 == end); + assert(start[0] == '\\'); + assert(start[1] == 'x'); + assert(isxdigit(start[2])); // first hex digit + assert(isxdigit(start[3])); // second hex digit + + char temp[3] = {start[2], start[3], 0}; // NUL-terminated copy of the two hex digits for strtol + + return strtol(temp, nullptr, 16); +} + +%%{ + machine FileCorporaParser; + + action accumulateNum { + num = (num * 10) + (fc - '0'); + } + + action handleHexEscaped { + sout.push_back(unhex(ts, te)); + } + + action handleSpecial { + switch (*(ts+1)) { + case '0': sout.push_back('\x00'); break; + case 'a': sout.push_back('\x07'); break; + case 'e': sout.push_back('\x1b'); break; + case 'f': sout.push_back('\x0c'); break; + case 'n': sout.push_back('\x0a'); break; + case 'v': sout.push_back('\x0b'); break; + case 'r': sout.push_back('\x0d'); break; + case 't': sout.push_back('\x09'); break; + default: fbreak; + } + } + + action handleMatch { + 
c.matches.insert(num); + } + + write data; +}%% + +} // namespace + +bool parseCorpus(const string &line, Corpus &c, unsigned int &id) { + const char *p = line.c_str(); + const char *pe = p + line.size(); + const char *eof = pe; + const char *ts; + const char *te; + int cs; + UNUSED int act; + + // For storing integers as they're scanned + unsigned int num = 0; + + string &sout = c.data; + + %%{ + id = ( digit @accumulateNum)+ >{num = 0;} @{id = num;}; + + backslashed = '\\' ^alnum; + specials = '\\' [0aefnvrt]; + hexescaped = '\\x' xdigit{2}; + + corpus_old := |* + hexescaped => handleHexEscaped; + specials => handleSpecial; + backslashed => { sout.push_back(*(ts + 1)); }; + any => { sout.push_back(*ts); }; + *|; + + corpus_new := |* + hexescaped => handleHexEscaped; + specials => handleSpecial; + backslashed => { sout.push_back(*(ts + 1)); }; + any - '"' => { sout.push_back(*ts); }; + '"' => { fgoto colon_sep; }; + *|; + + colon_sep := |* + ':' => {fgoto match_list; }; + *|; + + match_list := |* + (' '* (digit @accumulateNum)+ ' '* ','?) >{num = 0;} => handleMatch; + *|; + + # Old simple line format + line_old = id ':' @{ fgoto corpus_old; }; + + # New line format with matches + line_new = id "=\"" @{ c.hasMatches = true; fgoto corpus_new; }; + + main := ( line_new | line_old ); + + # Initialize and execute + write init; + write exec; + }%% + + return (cs != FileCorporaParser_error) && (p == pe); +} diff --git a/tools/hscollider/Corpora.cpp b/tools/hscollider/Corpora.cpp new file mode 100644 index 00000000..7345393d --- /dev/null +++ b/tools/hscollider/Corpora.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "Corpora.h" + +CorporaSource::~CorporaSource() { } diff --git a/tools/hscollider/Corpora.h b/tools/hscollider/Corpora.h new file mode 100644 index 00000000..65fb5836 --- /dev/null +++ b/tools/hscollider/Corpora.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CORPORA_H +#define CORPORA_H + +#include +#include +#include + +#include + +struct Corpus { + Corpus() : hasMatches(false) {} + explicit Corpus(const std::string &s) : data(s), hasMatches(false) {} + + std::string data; // Corpus itself + bool hasMatches; // Have the matches been pre-calculated? + std::set matches; // end-offsets of matches +}; + +struct CorpusFailure { + explicit CorpusFailure(const std::string &s) : message(s) {} + std::string message; +}; + +// Abstract class for a corpora source: new ways to load or generate corpora +// can be written by subclassing this class and providing its generate +// method. 
+class CorporaSource : boost::noncopyable { +public: + // destructor + virtual ~CorporaSource(); + + // Make a copy of this corpora source. + virtual CorporaSource *clone() const = 0; + + // Generate corpora for the given signature ID, adding them to the + // vector of Corpus objects provided. + virtual void generate(unsigned id, std::vector &data) = 0; +}; + +#endif // CORPORA_H diff --git a/tools/hscollider/DatabaseProxy.h b/tools/hscollider/DatabaseProxy.h new file mode 100644 index 00000000..13b6f680 --- /dev/null +++ b/tools/hscollider/DatabaseProxy.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UE2COLLIDER_DATABASEPROXY_H +#define UE2COLLIDER_DATABASEPROXY_H + +#include "UltimateTruth.h" + +#include +#include +#include +#include + +#include + +/** + * When a compile fails for the first time, we throw this exception so that a + * compilation error can be reported to the user. Subsequent failures will + * simply return nullptr rather than throwing this exception. + */ +struct CompileFailed { +public: + explicit CompileFailed(const std::string &err) : error(err) {} + std::string error; +}; + +class DatabaseProxy : boost::noncopyable { +public: + explicit DatabaseProxy(const std::set &expr_ids) + : ids(expr_ids) {} + + explicit DatabaseProxy(std::shared_ptr built_db) + : db(built_db) {} + + std::shared_ptr get(const UltimateTruth &ultimate) { + std::lock_guard lock(mutex); + if (failed) { + // We have previously failed to compile this database. + return nullptr; + } + if (db) { + return db; + } + + // Database hasn't been compiled yet. + std::string error; + db = ultimate.compile(ids, error); + if (!db) { + failed = true; + throw CompileFailed(error); + } + + return db; + } + +private: + std::mutex mutex; + std::shared_ptr db; + std::set ids; + bool failed = false; // Database failed compilation. 
+}; + +#endif // UE2COLLIDER_DATABASEPROXY_H diff --git a/tools/hscollider/FileCorpora.cpp b/tools/hscollider/FileCorpora.cpp new file mode 100644 index 00000000..82488569 --- /dev/null +++ b/tools/hscollider/FileCorpora.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ColliderCorporaParser.h" +#include "FileCorpora.h" +#include "common.h" +#include "util/expression_path.h" + +#include +#include + +#include + +using namespace std; + +// Returns true if this line is empty or a comment and should be skipped +static +bool emptyLine(const string& line) { + return line.empty() || line[0] == '#'; +} + +FileCorpora *FileCorpora::clone() const { + FileCorpora *copy = new FileCorpora(); + copy->corpora_by_pat = corpora_by_pat; + return copy; +} + +bool FileCorpora::readLine(const string &line) { + unsigned id = 0; + Corpus c; + bool rv = parseCorpus(line, c, id); + if (rv) { + corpora_by_pat[id].push_back(c); + return true; + } else { + return false; + } +} + +bool FileCorpora::readFile(const string &filename) { + ifstream f(filename.c_str()); + if (!f.good()) { + return false; + } + + unsigned lineNum = 0; + string line; + while (getline(f, line)) { + lineNum++; + + boost::trim(line); + + if (emptyLine(line)) { + continue; + } + if (!readLine(line)) { + cerr << "Error in corpora file parsing line " << lineNum << endl; + return false; + } + } + return !corpora_by_pat.empty(); +} + +void FileCorpora::generate(unsigned id, + vector &data) { + auto i = corpora_by_pat.find(id); + if (i == corpora_by_pat.end() || i->second.empty()) { + throw CorpusFailure("no corpora found for pattern."); + } + + data.insert(data.end(), i->second.begin(), i->second.end()); +} diff --git a/tools/hscollider/FileCorpora.h b/tools/hscollider/FileCorpora.h new file mode 100644 index 00000000..c34c72ff --- /dev/null +++ b/tools/hscollider/FileCorpora.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FILECORPORA_H +#define FILECORPORA_H + +#include "Corpora.h" + +#include +#include +#include +#include + +class FileCorpora : public CorporaSource { +public: + // copy + FileCorpora *clone() const override; + + // read corpora in from a file + bool readFile(const std::string &filename); + + // generator + void generate(unsigned id, std::vector &data) override; + +private: + // read in a line from our file + bool readLine(const std::string &line); + + std::map> corpora_by_pat; +}; + +#endif diff --git a/tools/hscollider/GraphTruth.cpp b/tools/hscollider/GraphTruth.cpp new file mode 100644 index 00000000..5c4cd8e7 --- /dev/null +++ b/tools/hscollider/GraphTruth.cpp @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "GraphTruth.h" + +#include "common.h" +#include "expressions.h" +#include "ExpressionParser.h" +#include "ng_find_matches.h" +#include "pcre_util.h" + +#include "grey.h" +#include "hs_compile.h" +#include "ue2common.h" +#include "compiler/compiler.h" +#include "nfagraph/ng.h" +#include "nfagraph/ng_depth.h" +#include "nfagraph/ng_dump.h" +#include "nfagraph/ng_fuzzy.h" +#include "nfagraph/ng_holder.h" +#include "nfagraph/ng_util.h" +#include "parser/Parser.h" +#include "parser/unsupported.h" +#include "util/compile_context.h" +#include "util/make_unique.h" +#include "util/report_manager.h" + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace ue2; + +// Struct to store the actual compiled NFA graph. +class CompiledNG : boost::noncopyable { +public: + CompiledNG(unique_ptr g_in, + unique_ptr rm_in) + : g(std::move(g_in)), rm(std::move(rm_in)) {} + unique_ptr g; + unique_ptr rm; +}; + +static +void populateMatchSet(ResultSet &rs, const set> &matches, + const CNGInfo &cngi) { + for (const auto &m : matches) { + u64a from = m.first; + u64a to = m.second; + if (g_streamOffset) { + // Subtract stream offset imposed by offset test. 
+ u64a offset = min(100ull, g_streamOffset); + assert(to >= offset); + from -= min(offset, from); + to -= offset; + } + u64a len = to - from; + + if (to < cngi.min_offset || to > cngi.max_offset || + len < cngi.min_length) { + // this match does not satisfy extparams constraints + DEBUG_PRINTF("skipping NFA Match @ (%llu,%llu)\n", from, to); + continue; + } + if (!cngi.som) { + from = 0; + } + rs.addMatch(from, to); + } +} + +CNGInfo::CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in) + : id(id_in), m_expr(m_expr_in) {} + +CNGInfo::~CNGInfo() = default; + +void CNGInfo::compile() { + auto i = m_expr.find(id); + if (i == m_expr.end()) { + throw NGCompileFailure("ID not found in expression map."); + } + + string re; + unsigned hs_flags; + hs_expr_ext ext; + + // read the flags for NFA compiler + if (!readExpression(i->second, re, &hs_flags, &ext)) { + throw NGCompileFailure("Cannot parse expression flags."); + } + // make sure we respect collider's UTF-8 setting + if (force_utf8) { + hs_flags |= HS_FLAG_UTF8; + } + + try { + bool isStreaming = colliderMode == MODE_STREAMING; + bool isVectored = colliderMode == MODE_VECTORED; + CompileContext cc(isStreaming, isVectored, get_current_target(), + Grey()); + ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext); + + // UE-2850: ParsedExpression may have updated the utf8 flag if the + // original expression starts with (*UTF8) + utf8 |= pe.expr.utf8; + + auto rm = ue2::make_unique(cc.grey); + + // Expressions containing zero-width assertions and other extended pcre + // types aren't supported yet. This call will throw a ParseError + // exception if the component tree contains such a construct. 
+ checkUnsupported(*pe.component); + + pe.component->checkEmbeddedStartAnchor(true); + pe.component->checkEmbeddedEndAnchor(true); + + // edit distance may be set globally + if (force_edit_distance) { + pe.expr.edit_distance = edit_distance; + } + + // validate_fuzzy_compile checks this, but we don't need to build the + // graph to know it will fail + if (pe.expr.edit_distance && utf8) { + throw NGCompileFailure("UTF-8 patterns cannot be " + "approximately matched"); + } + + auto built_expr = buildGraph(*rm, cc, pe); + auto &expr = built_expr.expr; + auto &g = built_expr.g; + + if (expr.edit_distance || expr.hamm_distance) { + // check if this pattern can be approximately matched, throws + // CompileError on failure + bool hamming = expr.hamm_distance > 0; + u32 e_dist = hamming ? expr.hamm_distance : expr.edit_distance; + validate_fuzzy_compile(*g, e_dist, hamming, utf8, cc.grey); + } + + if (isVacuous(*g)) { + if (som) { + throw NGUnsupportedFailure("Vacuous patterns are not supported " + "in SOM mode"); + } + if (expr.min_length > 0) { + throw NGUnsupportedFailure("Vacuous patterns are not supported " + "in combination with min_length"); + } + } + + cng = make_unique(move(g), move(rm)); + } catch (CompileError &e) { + throw NGCompileFailure(e.reason); + } catch (NGUnsupportedFailure &e) { + throw NGCompileFailure(e.msg); + } catch (...) 
{ + throw NGCompileFailure("NFA graph construction failed"); + } +} + +GraphTruth::GraphTruth(ostream &os, const ExpressionMap &expr) + : out(os), m_expr(expr) {} + +unique_ptr GraphTruth::preprocess(unsigned id, + bool ignoreUnsupported) { + bool highlander = false; + bool prefilter = false; + bool som = false; + + auto i = m_expr.find(id); + if (i == m_expr.end()) { + throw NGCompileFailure("ID not found in expression map."); + } + + string re; + unsigned flags, hs_flags; + hs_expr_ext ext; + + // read the flags for NFA compiler + if (!readExpression(i->second, re, &hs_flags, &ext)) { + throw NGCompileFailure("Cannot parse expression flags."); + } + // read PCRE flags + if (!getPcreFlags(hs_flags, &flags, &highlander, &prefilter, &som)) { + throw NGCompileFailure("Cannot get PCRE flags."); + } + if (force_utf8) { + hs_flags |= HS_FLAG_UTF8; + } + + // edit distance might be set globally + if (force_edit_distance) { + ext.edit_distance = edit_distance; + } + + // SOM flags might be set globally. 
+ som |= !!somFlags; + + if (force_prefilter) { + prefilter = true; + } + + u64a supported_flags = HS_EXT_FLAG_HAMMING_DISTANCE | + HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET | + HS_EXT_FLAG_MAX_OFFSET | HS_EXT_FLAG_MIN_LENGTH; + if (ext.flags & ~supported_flags) { + if (!ignoreUnsupported) { + throw NGUnsupportedFailure("Unsupported extended flags specified."); + } + } + + auto cngi = make_unique(id, m_expr); + cngi->utf8 = hs_flags & HS_FLAG_UTF8; + cngi->highlander = highlander; + cngi->prefilter = prefilter; + cngi->som = som; + cngi->min_offset = ext.min_offset; + cngi->max_offset = ext.max_offset; + cngi->min_length = ext.min_length; + cngi->max_edit_distance = ext.edit_distance; + cngi->max_hamm_distance = ext.hamming_distance; + + return cngi; +} + +bool GraphTruth::run(unsigned, const CompiledNG &cng, const CNGInfo &cngi, + const string &buffer, ResultSet &rs, string &) { + set> matches; + + if (g_streamOffset) { + size_t offset = MIN(100, g_streamOffset); + assert(offset > 0); + const string preamble(string(offset, '\0')); + + set> pre_matches; + + // First, scan an empty buffer size of the preamble so that we can + // discard any matches therein after the real scan, later. We use + // notEod so that end-anchors in our expression don't match at the + // end of the buffer. + if (!findMatches(*cng.g, *cng.rm, preamble, pre_matches, + cngi.max_edit_distance, cngi.max_hamm_distance, true, + cngi.utf8)) { + return false; + } + + // Real scan. + if (!findMatches(*cng.g, *cng.rm, preamble + buffer, matches, + cngi.max_edit_distance, cngi.max_hamm_distance, false, + cngi.utf8)) { + return false; + } + + // Erase any matches due entirely to the preamble. 
+ for (const auto &m : pre_matches) { + matches.erase(m); + } + } else { + if (!findMatches(*cng.g, *cng.rm, buffer, matches, + cngi.max_edit_distance, cngi.max_hamm_distance, false, + cngi.utf8)) { + return false; + } + } + + populateMatchSet(rs, matches, cngi); + + if (echo_matches) { + for (const auto &m : rs.matches) { + out << "NFA Match @ (" << m.from << "," << m.to << ")" << endl; + } + } + + return true; +} diff --git a/tools/hscollider/GraphTruth.h b/tools/hscollider/GraphTruth.h new file mode 100644 index 00000000..5f53899c --- /dev/null +++ b/tools/hscollider/GraphTruth.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef GRAPHTRUTH_H +#define GRAPHTRUTH_H + +#include "expressions.h" +#include "ResultSet.h" + +#include "hs_compile.h" // for hs_expr_ext +#include "ue2common.h" + +#include +#include +#include + +#include + +namespace ue2 { + +class ReportManager; +struct BoundaryReports; + +} // namespace ue2 + +struct NGCompileFailure { + explicit NGCompileFailure(const std::string &msg_s) : msg(msg_s) {} + std::string msg; +}; + +struct NGUnsupportedFailure { + explicit NGUnsupportedFailure(const std::string &msg_s) : msg(msg_s) {} + std::string msg; +}; + +// Struct to store the actual compiled NFA graph. +class CompiledNG; + +// Struct to store the precompile information about the graph. +class CNGInfo : boost::noncopyable { +public: + CNGInfo(unsigned id_in, const ExpressionMap &m_expr_in); + ~CNGInfo(); + + bool is_bad() { + std::lock_guard lock(bad_mutex); + bool val = bad; + return val; + } + + void mark_bad() { + std::lock_guard lock(bad_mutex); + bad = true; + } + + const CompiledNG *get() { + std::lock_guard lock(cng_mutex); + + if (cng) { + return cng.get(); + } + + // NFA graph hasn't been compiled yet. 
+ try { + compile(); + } catch (NGCompileFailure &e) { + throw NGCompileFailure(e); + } catch (NGUnsupportedFailure &e) { + throw NGCompileFailure(e.msg); + } + + return cng.get(); + } + + u64a min_offset = 0; + u64a max_offset = 0; + u64a min_length = 0; + u32 max_edit_distance = 0; + u32 max_hamm_distance = 0; + bool utf8 = false; + bool highlander = false; + bool prefilter = false; + bool som = false; +private: + void compile(); + // If NFA graph scan failed for some reason, we mark it as bad and skip + // the remaining tests for it for performance reasons. + bool bad = false; + std::mutex bad_mutex; // serialised accesses to bad flag. + + std::unique_ptr cng; // compiled NFA graph + std::mutex cng_mutex; // serialised accesses to NFA graph + + unsigned id; + + // Our expression map + const ExpressionMap &m_expr; +}; + + +class GraphTruth : boost::noncopyable { +public: + GraphTruth(std::ostream &os, const ExpressionMap &expr); + + bool run(unsigned id, const CompiledNG &cng, const CNGInfo &cngi, + const std::string &buffer, ResultSet &rs, std::string &error); + + std::unique_ptr preprocess(unsigned id, + bool ignoreUnsupported = false); + +private: + // Output stream. + std::ostream &out; + + // Our expression map + const ExpressionMap &m_expr; +}; + +#endif diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp new file mode 100644 index 00000000..b0fe384d --- /dev/null +++ b/tools/hscollider/GroundTruth.cpp @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "common.h" +#include "ExpressionParser.h" +#include "expressions.h" +#include "GroundTruth.h" +#include "pcre_util.h" + +#include "hs_compile.h" // for hs_expr_ext +#include "ue2common.h" +#include "parser/control_verbs.h" +#include "parser/Parser.h" +#include "parser/parse_error.h" +#include "util/make_unique.h" +#include "util/unicode_def.h" +#include "util/unordered.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* -X, -Y support + * as PCRE performance is `non-linear' and these options add a large amount of + * scanning, the following short cuts are used: + * 1: the suffix is not scanned - we are more interested in the matches from + * the original corpora. 
+ * 2: only the last 50 bytes of the prefix is scanned. This may lead to some + * minor correctness issues for a few patterns. + */ + +using namespace std; +using namespace ue2; + +// We store matches in a hash table as we're likely to see lots of them. These +// are moved into a ResultSet at the end. +using PcreMatchSet = ue2::ue2_unordered_set>; + +namespace { +struct CalloutContext { + explicit CalloutContext(ostream &os) : out(os) {} + ostream &out; + PcreMatchSet matches; +}; +} + +static +int pcreCallOut(pcre_callout_block *block) { + assert(block); + assert(block->callout_data); + CalloutContext *ctx = static_cast(block->callout_data); + + if (echo_matches) { + ctx->out << "PCRE Match @ (" << block->start_match << "," + << block->current_position << ")" << endl; + } + + unsigned int from = block->start_match; + unsigned int to = block->current_position; + assert(from <= to); + + ctx->matches.insert(make_pair(from, to)); + return 1; +} + +static +bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander, + bool *prefilter, bool *som, hs_expr_ext *ext) { + string regex; + unsigned int hs_flags = 0; + if (!readExpression(expr, regex, &hs_flags, ext)) { + return false; + } + + expr.swap(regex); + + if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) { + return false; + } + + if (force_utf8) { + *flags |= PCRE_UTF8; + } + + if (force_prefilter) { + *prefilter = true; + } + + return true; +} + +static +string pcreErrStr(int err) { + switch (err) { + case PCRE_ERROR_NOMATCH: + return "PCRE_ERROR_NOMATCH"; + case PCRE_ERROR_NULL: + return "PCRE_ERROR_NULL"; + case PCRE_ERROR_BADOPTION: + return "PCRE_ERROR_BADOPTION"; + case PCRE_ERROR_BADMAGIC: + return "PCRE_ERROR_BADMAGIC"; +#if defined(PCRE_ERROR_UNKNOWN_OPCODE) + case PCRE_ERROR_UNKNOWN_OPCODE: + return "PCRE_ERROR_UNKNOWN_OPCODE"; +#else + case PCRE_ERROR_UNKNOWN_NODE: + return "PCRE_ERROR_UNKNOWN_NODE"; +#endif + case PCRE_ERROR_NOMEMORY: + return "PCRE_ERROR_NOMEMORY"; + case 
PCRE_ERROR_NOSUBSTRING: + return "PCRE_ERROR_NOSUBSTRING"; + case PCRE_ERROR_MATCHLIMIT: + return "PCRE_ERROR_MATCHLIMIT"; + case PCRE_ERROR_CALLOUT: + return "PCRE_ERROR_CALLOUT"; + case PCRE_ERROR_BADUTF8: + return "PCRE_ERROR_BADUTF8"; + case PCRE_ERROR_BADUTF8_OFFSET: + return "PCRE_ERROR_BADUTF8_OFFSET"; + case PCRE_ERROR_PARTIAL: + return "PCRE_ERROR_PARTIAL"; + case PCRE_ERROR_BADPARTIAL: + return "PCRE_ERROR_BADPARTIAL"; + case PCRE_ERROR_INTERNAL: + return "PCRE_ERROR_INTERNAL"; + case PCRE_ERROR_BADCOUNT: + return "PCRE_ERROR_BADCOUNT"; +#if defined(PCRE_ERROR_RECURSIONLIMIT) + case PCRE_ERROR_RECURSIONLIMIT: + return "PCRE_ERROR_RECURSIONLIMIT"; +#endif + case PCRE_ERROR_DFA_UITEM: + return "PCRE_ERROR_DFA_UITEM"; + case PCRE_ERROR_DFA_UCOND: + return "PCRE_ERROR_DFA_UCOND"; + case PCRE_ERROR_DFA_UMLIMIT: + return "PCRE_ERROR_DFA_UMLIMIT"; + case PCRE_ERROR_DFA_WSSIZE: + return "PCRE_ERROR_DFA_WSSIZE"; + case PCRE_ERROR_DFA_RECURSE: + return "PCRE_ERROR_DFA_RECURSE"; + default: + { + ostringstream oss; + oss << "Unknown PCRE error (value: " << err << ")"; + return oss.str(); + } + } +} + +GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr, + unsigned long int limit, + unsigned long int limit_recursion) + : out(os), m_expr(expr), matchLimit(limit), + matchLimitRecursion(limit_recursion) {} + +void GroundTruth::global_prep() { + // We're using pcre callouts + pcre_callout = &pcreCallOut; +} + +static +void addCallout(string &re) { + // If the string begins with "(*UTF8)" or "(*UTF8)(*UCP)", we want to keep + // it at the front. We reuse the control verbs mini-parser for this. + size_t startpos = 0; + try { + ue2::ParseMode mode; + const char *ptr = ue2::read_control_verbs( + re.c_str(), re.c_str() + re.size(), 0, mode); + startpos = ptr - re.c_str(); + } catch (const ue2::ParseError &err) { + // fall through + } + assert(startpos <= re.length()); + re.insert(startpos, "(?:"); + // We include a \E to close any open \Q quoted block. 
If there isn't + // one, pcre will ignore the \E. + re.append("\\E)(?C)"); +} + +unique_ptr +GroundTruth::compile(unsigned id, bool no_callouts) { + bool highlander = false; + bool prefilter = false; + bool som = false; + + // we can still match approximate matching patterns with PCRE if edit + // distance 0 is requested + if (force_edit_distance && edit_distance) { + throw SoftPcreCompileFailure("Edit distance not supported by PCRE."); + } + + ExpressionMap::const_iterator i = m_expr.find(id); + if (i == m_expr.end()) { + throw PcreCompileFailure("ID not found in expression map."); + } + + string re(i->second); + unsigned flags; + hs_expr_ext ext; + + // Decode the flags + if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) { + throw PcreCompileFailure("Unable to decode flags."); + } + + // filter out flags not supported by PCRE + u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET | + HS_EXT_FLAG_MIN_LENGTH; + if (ext.flags & ~supported) { + // edit distance is a known unsupported flag, so just throw a soft error + if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) { + throw SoftPcreCompileFailure("Edit distance not supported by PCRE."); + } + if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) { + throw SoftPcreCompileFailure( + "Hamming distance not supported by PCRE."); + } + throw PcreCompileFailure("Unsupported extended flags."); + } + + // SOM flags might be set globally. + som |= !!somFlags; + + // For traditional Hyperscan, add global callout to pattern. 
+ if (!no_callouts) { + addCallout(re); + } + + // Compile the pattern + const char *errptr = nullptr; + int errloc = 0; + int errcode = 0; + + unique_ptr compiled = make_unique(); + compiled->utf8 = flags & PCRE_UTF8; + compiled->highlander = highlander; + compiled->prefilter = prefilter; + compiled->som = som; + compiled->min_offset = ext.min_offset; + compiled->max_offset = ext.max_offset; + compiled->min_length = ext.min_length; + compiled->expression = i->second; // original PCRE + flags |= PCRE_NO_AUTO_POSSESS; + + compiled->bytecode = + pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr); + + if (!compiled->bytecode || errptr) { + assert(errcode); + ostringstream oss; + oss << "Failed to compile expression '" << re << '\''; + oss << " (" << errptr << " at " << errloc << ")."; + if (errcode == 20) { // "regular expression is too large" + throw SoftPcreCompileFailure(oss.str()); + } else if (errcode == 25) { // "lookbehind assertion is not fixed length" + throw SoftPcreCompileFailure(oss.str()); + } else { + throw PcreCompileFailure(oss.str()); + } + } + + // Study the pattern + shared_ptr extra(pcre_study(compiled->bytecode, 0, &errptr), + free); + if (errptr) { + ostringstream oss; + oss << "Error studying pattern (" << errptr << ")."; + throw PcreCompileFailure(oss.str()); + } + + int infoRes = + pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT, + &compiled->captureCount); + if (infoRes < PCRE_ERROR_NOMATCH) { + ostringstream oss; + oss << "Error determining number of capturing subpatterns (" + << pcreErrStr(infoRes) << ")."; + throw PcreCompileFailure(oss.str()); + } + + return compiled; +} + +static +void filterLeftmostSom(ResultSet &rs) { + if (rs.matches.size() <= 1) { + return; + } + + set seen; // End offsets. + set::iterator it = rs.matches.begin(); + while (it != rs.matches.end()) { + if (seen.insert(it->to).second) { + ++it; // First time we've seen this end-offset. 
+ } else { + rs.matches.erase(it++); // Dupe with a "righter" SOM. + } + } +} + +static +void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) { + set::iterator it = rs.matches.begin(); + while (it != rs.matches.end()) { + unsigned int from = it->from, to = it->to; + unsigned int len = to - from; + if (to < compiled.min_offset || to > compiled.max_offset || + len < compiled.min_length) { + rs.matches.erase(it++); + } else { + ++it; + } + } +} + +static +int scanBasic(const CompiledPcre &compiled, const string &buffer, + const pcre_extra &extra, vector &ovector, + CalloutContext &ctx) { + const size_t prefix_len = g_corpora_prefix.size(); + const size_t suffix_len = g_corpora_suffix.size(); + + size_t begin_offset = prefix_len - MIN(50, prefix_len); + size_t real_len = buffer.size(); + + if (suffix_len > 2) { + real_len -= suffix_len - 2; + } + + int flags = suffix_len ? PCRE_NOTEOL : 0; + int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len, + begin_offset, flags, &ovector[0], ovector.size()); + + if (!g_corpora_prefix.empty()) { + PcreMatchSet tmp; + tmp.swap(ctx.matches); + + for (const auto &m : tmp) { + unsigned from = m.first; + unsigned to = m.second; + if (to >= prefix_len && to <= buffer.size() - suffix_len) { + from = from < prefix_len ? 0 : from - prefix_len; + to -= prefix_len; + ctx.matches.insert(make_pair(from, to)); + } + } + } + + return ret; +} + +static +int scanOffset(const CompiledPcre &compiled, const string &buffer, + const pcre_extra &extra, vector &ovector, + CalloutContext &ctx) { + size_t offset = MIN(100, g_streamOffset); + assert(offset > 0); + + const string buf(string(offset, '\0') + buffer); + + // First, scan our preamble so that we can discard any matches therein + // after the real scan, later. We use PCRE_NOTEOL so that end-anchors in + // our expression don't match at the end of the preamble. 
+ int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0, + PCRE_NOTEOL, &ovector[0], ovector.size()); + if (ret < PCRE_ERROR_NOMATCH) { + return ret; + } + + PcreMatchSet pre_matches; + pre_matches.swap(ctx.matches); + + // Real scan. + ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0, + &ovector[0], ovector.size()); + if (ret < PCRE_ERROR_NOMATCH) { + return ret; + } + + // Erase any matches due entirely to the preamble. + for (const auto &m : pre_matches) { + ctx.matches.erase(m); + } + + return ret; +} + +bool GroundTruth::run(unsigned, const CompiledPcre &compiled, + const string &buffer, ResultSet &rs, string &error) { + CalloutContext ctx(out); + + pcre_extra extra; + extra.flags = 0; + + // Switch on callouts. + extra.flags |= PCRE_EXTRA_CALLOUT_DATA; + extra.callout_data = &ctx; + + // Set the match_limit (in order to bound execution time on very complex + // patterns) + extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION); + extra.match_limit = matchLimit; + extra.match_limit_recursion = matchLimitRecursion; + +#ifdef PCRE_NO_START_OPTIMIZE + // Switch off optimizations that may result in callouts not occurring. + extra.flags |= PCRE_NO_START_OPTIMIZE; +#endif + + // Ensure there's enough room in the ovector for the capture groups in this + // pattern. + int ovecsize = (compiled.captureCount + 1) * 3; + ovector.resize(ovecsize); + + int ret; + switch (colliderMode) { + case MODE_BLOCK: + case MODE_STREAMING: + case MODE_VECTORED: + if (g_streamOffset) { + ret = scanOffset(compiled, buffer, extra, ovector, ctx); + } else { + ret = scanBasic(compiled, buffer, extra, ovector, ctx); + } + break; + default: + assert(0); + ret = PCRE_ERROR_NULL; + break; + } + + if (ret < PCRE_ERROR_NOMATCH) { + error = pcreErrStr(ret); + return false; + } + + // Move matches into a ResultSet. 
+ for (const auto &m : ctx.matches) { + unsigned long long from = m.first; + unsigned long long to = m.second; + + if (g_streamOffset) { + // Subtract stream offset imposed by offset test. + unsigned long long offset = min(100ull, g_streamOffset); + assert(to >= offset); + from -= min(offset, from); + to -= offset; + } + + rs.addMatch(from, to); + } + + // If we have no matches, there's no further work to do. + if (rs.matches.empty()) { + return true; + } + + if (compiled.som) { + filterLeftmostSom(rs); + } + + filterExtParams(rs, compiled); + + // If we haven't been asked for SOM, strip the from offsets. + if (!compiled.som) { + set endonly; + for (const auto &m : rs.matches) { + endonly.insert(MatchResult(0, m.to)); + } + rs.matches.swap(endonly); + } + + return true; +} diff --git a/tools/hscollider/GroundTruth.h b/tools/hscollider/GroundTruth.h new file mode 100644 index 00000000..bcab5599 --- /dev/null +++ b/tools/hscollider/GroundTruth.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef GROUNDTRUTH_H +#define GROUNDTRUTH_H + +#include "expressions.h" +#include "ResultSet.h" + +#include +#include +#include +#include + +#include + +#include + +// Thrown by GroundTruth::compile in the event of a PCRE compile failure. +struct PcreCompileFailure { + PcreCompileFailure(const std::string &msg_s) : msg(msg_s) {} + std::string msg; +}; + +// Thrown in the event of a "soft" PCRE compile failure, one that we don't want +// to consider a ue2collider failure (e.g. "regular expression too large"). +struct SoftPcreCompileFailure : PcreCompileFailure { + SoftPcreCompileFailure(const std::string &msg_s) + : PcreCompileFailure(msg_s) {} +}; + +// Struct to store everything about a PCRE. Note that the code assumes that +// once populated, the data in this structure will remain constant while tests +// are running, except for the bad flag (which is protected by a mutex). 
+class CompiledPcre : boost::noncopyable { +public: + CompiledPcre() {} + ~CompiledPcre() { + free(bytecode); + } + + bool is_bad() { + std::lock_guard lock(bad_mutex); + bool val = bad; + return val; + } + + void mark_bad() { + std::lock_guard lock(bad_mutex); + bad = true; + } + + std::string expression; + pcre *bytecode = nullptr; + unsigned long long min_offset = 0; + unsigned long long max_offset = ~0ULL; + unsigned long long min_length = 0; + int captureCount = 0; + bool utf8 = false; + bool highlander = false; + bool prefilter = false; + bool som = false; + +private: + // If a PCRE has hit its match recursion limit when scanning a corpus, we + // mark it as bad and skip the remaining tests for it for performance + // reasons. + bool bad = false; + std::mutex bad_mutex; // serialised accesses to bad flag. +}; + +// Wrapper around libpcre to generate results for an expression and corpus. +class GroundTruth : boost::noncopyable { +public: + GroundTruth(std::ostream &os, const ExpressionMap &expr, + unsigned long limit, unsigned long limit_recursion); + + static void global_prep(); + + std::unique_ptr compile(unsigned id, + bool no_callouts = false); + + bool run(unsigned id, const CompiledPcre &compiled, + const std::string &buffer, ResultSet &rs, std::string &error); + +private: + // Output stream. + std::ostream &out; + + // Our expression map + const ExpressionMap &m_expr; + + // PCRE match limit + const unsigned long int matchLimit; + const unsigned long int matchLimitRecursion; + + // Persistent ovector used to run tests. 
+ std::vector ovector; +}; + +#endif diff --git a/tools/hscollider/NfaGeneratedCorpora.cpp b/tools/hscollider/NfaGeneratedCorpora.cpp new file mode 100644 index 00000000..32933be4 --- /dev/null +++ b/tools/hscollider/NfaGeneratedCorpora.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ng_corpus_properties.h" +#include "ng_corpus_generator.h" +#include "NfaGeneratedCorpora.h" +#include "ExpressionParser.h" + +#include "grey.h" +#include "hs_compile.h" +#include "compiler/compiler.h" +#include "nfagraph/ng.h" +#include "parser/parse_error.h" +#include "parser/Parser.h" +#include "parser/prefilter.h" +#include "parser/unsupported.h" +#include "util/compile_context.h" +#include "util/compile_error.h" +#include "util/report_manager.h" +#include "util/target_info.h" + +#include +#include +#include + +using namespace std; +using namespace ue2; + +NfaGeneratedCorpora::NfaGeneratedCorpora(const ExpressionMap &expr, + const CorpusProperties &props, + bool force_utf8_mode_in, + bool force_prefilter_mode_in) + : m_expr(expr), m_props(props), force_utf8_mode(force_utf8_mode_in), + force_prefilter_mode(force_prefilter_mode_in) { + // empty +} + +NfaGeneratedCorpora *NfaGeneratedCorpora::clone() const { + return new NfaGeneratedCorpora(m_expr, m_props, force_utf8_mode, + force_prefilter_mode); +} + +void NfaGeneratedCorpora::generate(unsigned id, vector &data) { + ExpressionMap::const_iterator i = m_expr.find(id); + if (i == m_expr.end()) { + throw CorpusFailure("Expression not found."); + } + + string re; + u32 hs_flags; + hs_expr_ext ext; + if (!readExpression(i->second, re, &hs_flags, &ext)) { + throw CorpusFailure("Expression could not be read: " + i->second); + } + + if (force_utf8_mode) { + hs_flags |= HS_FLAG_UTF8; + } + + if (force_prefilter_mode) { + hs_flags |= HS_FLAG_PREFILTER; + } + + // Wrap the UE2 parser and compiler functionality and use it to generate + // corpora for us. + vector c; + + try { + ParsedExpression pe(0, re.c_str(), hs_flags, 0, &ext); + + // Apply prefiltering transformations if desired. + if (pe.expr.prefilter) { + prefilterTree(pe.component, ParseMode(hs_flags)); + } + + // Bail on patterns with unsupported constructs. 
+ checkUnsupported(*pe.component); + pe.component->checkEmbeddedStartAnchor(true); + pe.component->checkEmbeddedEndAnchor(true); + + CompileContext cc(false, false, get_current_target(), Grey()); + ReportManager rm(cc.grey); + auto built_expr = buildGraph(rm, cc, pe); + if (!built_expr.g) { + // A more specific error should probably have been thrown by + // buildGraph. + throw CorpusFailure("could not build graph."); + } + + const auto cg = + makeCorpusGenerator(*built_expr.g, built_expr.expr, m_props); + cg->generateCorpus(c); + } + catch (const ParseError &e) { + throw CorpusFailure("compilation failed, " + e.reason); + } + catch (const CompileError &e) { + throw CorpusFailure("compilation failed, " + e.reason); + } + catch (const std::bad_alloc &) { + throw CorpusFailure("out of memory."); + } + catch (const CorpusGenerationFailure &e) { + // if corpus generation failed, just pass up the error message + throw CorpusFailure("corpus generation failed: " + e.message); + } + catch (...) { + throw CorpusFailure("unknown error."); + } + + if (c.empty()) { + throw CorpusFailure("no corpora generated."); + } + + data.reserve(data.size() + c.size()); + for (const auto &e : c) { + data.push_back(Corpus(e)); + } +} diff --git a/tools/hscollider/NfaGeneratedCorpora.h b/tools/hscollider/NfaGeneratedCorpora.h new file mode 100644 index 00000000..08572de6 --- /dev/null +++ b/tools/hscollider/NfaGeneratedCorpora.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef NFAGENERATEDCORPORA_H +#define NFAGENERATEDCORPORA_H + +#include "Corpora.h" +#include "ng_corpus_properties.h" +#include "expressions.h" + +#include +#include + +// Corpora associated with a pattern set +class NfaGeneratedCorpora : public CorporaSource { +public: + NfaGeneratedCorpora(const ExpressionMap &expr, + const CorpusProperties &props, bool force_utf8_mode_in, + bool force_prefilter_mode_in); + + NfaGeneratedCorpora *clone() const override; + + void generate(unsigned id, std::vector &data) override; + +private: + // Expressions handled by this corpora object + const ExpressionMap &m_expr; + + // CorpusProperties policy object + CorpusProperties m_props; + + bool force_utf8_mode; + bool force_prefilter_mode; +}; + +#endif diff --git a/tools/hscollider/ResultSet.h b/tools/hscollider/ResultSet.h new file mode 100644 index 00000000..23c628ec --- /dev/null +++ b/tools/hscollider/ResultSet.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/* IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef RESULTSET_H
#define RESULTSET_H

// NOTE(review): the angle-bracket header names were lost in extraction;
// reconstructed from usage. Confirm against the upstream file.
#include <iostream>
#include <map>
#include <set>
#include <utility>
#include <vector>

// Class representing a single match, encapsulating to/from offsets.
class MatchResult {
public:
    MatchResult(unsigned long long start, unsigned long long end)
        : from(start), to(end) {}

    // Orders by start offset first, then by end offset.
    bool operator<(const MatchResult &a) const {
        if (from != a.from) {
            return from < a.from;
        }
        return to < a.to;
    }

    bool operator==(const MatchResult &a) const {
        return from == a.from && to == a.to;
    }

    unsigned long long from;
    unsigned long long to;
};

enum ResultSource {
    RESULT_FROM_UE2,
    RESULT_FROM_PCRE,
    RESULT_FROM_GRAPH,
};

// Writes a short name for the source ("UE2", "Graph" or "PCRE"); a value
// outside the enum writes nothing.
inline
std::ostream &operator<<(std::ostream &out, ResultSource src) {
    switch (src) {
    case RESULT_FROM_UE2:
        out << "UE2";
        break;
    case RESULT_FROM_GRAPH:
        out << "Graph";
        break;
    case RESULT_FROM_PCRE:
        out << "PCRE";
        break;
    }
    return out;
}

class ResultSet {
public:
    // Constructor.
    explicit ResultSet(ResultSource s) : src(s) {}

    // Can be constructed with a set of end-offsets; each becomes a match with
    // a start offset of zero. matches_by_block is left empty.
    // NOTE(review): element type reconstructed as unsigned int after the
    // template argument was lost in extraction.
    ResultSet(const std::set<unsigned int> &m, ResultSource s) : src(s) {
        for (const auto &offset : m) {
            matches.emplace(0, offset);
        }
    }

    // Equality. Compares the three error flags and the match set only; the
    // per-block bookkeeping, dupe set and source are ignored.
    bool operator==(const ResultSet &other) const {
        return uoom == other.uoom &&
               match_after_halt == other.match_after_halt &&
               invalid_id == other.invalid_id &&
               matches == other.matches;
    }

    // Inequality.
    bool operator!=(const ResultSet &other) const { return !(*this == other); }

    // Add a match. The match always goes into `matches`; if the same
    // (from, to) pair was already recorded for `block` it is also added to
    // `dupe_matches`, otherwise it is recorded for that block.
    void addMatch(unsigned long long from, unsigned long long to,
                  int block = 0) {
        const MatchResult m(from, to);
        matches.insert(m);

        // insert() reports whether the element was new, so a single lookup
        // both records the match and detects a repeat within the block.
        if (!matches_by_block[block].insert(m).second) {
            dupe_matches.insert(m);
        }
    }

    // Unexpected out of order match seen.
    bool uoom = false;

    // A match was received after termination was requested.
    bool match_after_halt = false;

    // A match from an invalid ID was seen.
    bool invalid_id = false;

    // Ordered set of matches.
    std::set<MatchResult> matches;

    // Matches grouped by stream write/block that we see them in.
    std::map<int, std::set<MatchResult>> matches_by_block;

    // Dupe matches that we have seen.
    std::set<MatchResult> dupe_matches;

    /* Where these results came from (does not take part in comparisons) */
    ResultSource src;
};

#endif

// diff --git a/tools/hscollider/Thread.cpp b/tools/hscollider/Thread.cpp
// new file mode 100644
// index 00000000..537fa0dd
// --- /dev/null
// +++ b/tools/hscollider/Thread.cpp
// @@ -0,0 +1,95 @@
/*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 */
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "Thread.h" +#include "common.h" +#include "sig.h" + +#include +#include + +#include + +static const size_t COLLIDER_THREAD_STACK_SIZE = 8192 * 1024; + +void Thread::start() { + // Some systems, notably Mac OS X, use a default stack size that is + // smaller than what we want (particularly given that we're planning on + // running PCRE, which recurses inside pcre_exec). We attempt to + // increase it to 8MB. + int ret; + pthread_attr_t attr; + ret = pthread_attr_init(&attr); + if (ret) { + std::cerr << "pthread_attr_init failed" << std::endl; + exit(1); + } + + size_t stacksize = 0; + ret = pthread_attr_getstacksize(&attr, &stacksize); + if (ret) { + std::cerr << "Warning: can't query stack size with " + "pthread_attr_getstacksize" << std::endl; + goto create_thread; + } + + if (stacksize < COLLIDER_THREAD_STACK_SIZE) { + ret = pthread_attr_setstacksize(&attr, COLLIDER_THREAD_STACK_SIZE); + if (ret) { + std::cerr << "Warning: pthread_attr_setstacksize failed, " + "unable to set stack size to " + << COLLIDER_THREAD_STACK_SIZE << " bytes." << std::endl; + // Fall through: this isn't necessarily fatal (yet!) 
+ } + } + +create_thread: + ret = pthread_create(&thread, &attr, &runThread, this); + if (ret) { + std::cerr << "pthread_create failed for thread id " << thread_id + << std::endl; + exit(1); + } +} + +// Dispatch +void *Thread::runThread(void *thr) { + if (!no_signal_handler) { + setSignalStack(); + } + ((Thread *)thr)->run(); + return nullptr; +} + +void Thread::join() { pthread_join(thread, nullptr); } + +Thread::Thread(size_t num) : thread_id(num) {} + +Thread::~Thread() {} diff --git a/tools/hscollider/Thread.h b/tools/hscollider/Thread.h new file mode 100644 index 00000000..2ca50e38 --- /dev/null +++ b/tools/hscollider/Thread.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UE2COLLIDER_THREAD_H +#define UE2COLLIDER_THREAD_H + +#include + +#include + +#include + +class Thread : boost::noncopyable { +public: + explicit Thread(size_t num); + virtual ~Thread(); + + virtual void start(); + + // Dispatch + static void *runThread(void *thr); + + virtual void join(); + + // Implemented by subclasses. + virtual void run() = 0; + +protected: + const size_t thread_id; + +private: + pthread_t thread; +}; + +#endif // UE2COLLIDER_THREAD_H diff --git a/tools/hscollider/UltimateTruth.cpp b/tools/hscollider/UltimateTruth.cpp new file mode 100644 index 00000000..19c597be --- /dev/null +++ b/tools/hscollider/UltimateTruth.cpp @@ -0,0 +1,1026 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "ResultSet.h" +#include "UltimateTruth.h" +#include "util/database_util.h" +#include "util/ExpressionParser.h" +#include "util/string_util.h" + +#include "ue2common.h" +#include "common.h" +#include "crc32.h" +#include "hs.h" +#include "hs_internal.h" +#include "util/make_unique.h" + +#include "scratch.h" +#include "nfa/nfa_api_queue.h" +#include "rose/rose_internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace std; +using namespace ue2; +using boost::ptr_vector; + +#ifndef RELEASE_BUILD + +#include "database.h" +#include "state.h" + +static +hs_error_t open_magic_stream(const hs_database_t *db, unsigned flags, + hs_stream_t **stream, hs_scratch_t *scratch, + unsigned long long start_offset) { + hs_error_t ret = hs_open_stream(db, flags, stream); + if (ret != HS_SUCCESS) { + return ret; + } + + const char dummy_data[100] = { 0 }; + UNUSED const struct RoseEngine *rose + = (const struct RoseEngine *)hs_get_bytecode(db); + assert(sizeof(dummy_data) >= rose->historyRequired); + hs_scan_stream(*stream, dummy_data, MIN(start_offset, 
sizeof(dummy_data)), 0, + scratch, nullptr, nullptr); + (*stream)->offset = start_offset; + return ret; +} + +#endif // RELEASE_BUILD + +class HyperscanDB : boost::noncopyable { +public: + // Constructor takes iterators over a container of pattern IDs. + template + HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end) + : db(db_in), ids(ids_begin, ids_end) {} + + ~HyperscanDB() { + hs_free_database(db); + } + + // Underlying Hyperscan database pointer. + hs_database_t *db; + + // The set of expression IDs that must return their matches in order. + unordered_set ordered; + + // The complete set of expression IDs associated with this database. + unordered_set ids; +}; + +// Used to track the ID and result set. +namespace { +struct MultiContext { + MultiContext(unsigned int id_in, const HyperscanDB &db_in, ResultSet *rs_in, + bool single_in, ostream &os) + : id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {} + unsigned int id; + int block = 0; + const HyperscanDB &db; + ResultSet *rs; + u64a lastRawMatch = 0; /* store last known unadjusted match location */ + u64a lastOrderMatch = 0; + bool single; + bool use_max_offset = false; + unsigned long long max_offset = 0; /* don't record matches beyond this */ + bool terminated = false; //!< user has instructed us to stop + bool in_scan_call = false; + ostream &out; +}; +} + +// Callback used for all (both single and multi-mode) scans. +static +int callbackMulti(unsigned int id, unsigned long long from, + unsigned long long to, UNUSED unsigned int flags, void *ctx) { + MultiContext *mctx = static_cast(ctx); + assert(mctx); + assert(mctx->rs); + assert(mctx->in_scan_call); + + ostream &out = mctx->out; + + // Sanity check: in single mode, we'd better not be getting matches for the + // wrong ID! 
+ if (mctx->single && id != mctx->id) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " which is not the id we're looking for" << endl; + mctx->rs->invalid_id = true; + return 1; + } + + // In any mode, we should NEVER get a match from an ID outside our known set. + if (mctx->db.ids.find(id) == mctx->db.ids.end()) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " which is not in the pattern set" << endl; + mctx->rs->invalid_id = true; + return 1; + } + + if (mctx->terminated) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " after termination" << endl; + mctx->rs->match_after_halt = true; + } + +#ifndef RELEASE_BUILD + unsigned int adjustment = flags & HS_MATCH_FLAG_ADJUSTED ? 1 : 0; + if (mctx->lastRawMatch > to + adjustment) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " unordered" << endl; + mctx->rs->uoom = true; + } + mctx->lastRawMatch = to + adjustment; +#endif + + if (mctx->db.ordered.find(id) != mctx->db.ordered.end()) { + if (mctx->lastOrderMatch > to) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " unordered" << endl; + mctx->rs->uoom = true; + } + mctx->lastOrderMatch = to; + } + + if (mctx->use_max_offset && to > mctx->max_offset) { + if (echo_matches) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " ignored" << endl; + } + return 0; + } + + if (to - g_streamOffset < g_corpora_prefix.size()) { + if (echo_matches) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id + << " too early" << endl; + } + return 0; + } + + u64a offsetDelta = g_corpora_prefix.size() + g_streamOffset; + + if (from) { + // from only set in SOM mode, otherwise zero. If we wanted to be REALLY + // principled about this, we'd probably want to stash the flags + // somewhere at compile time. + from -= (from > offsetDelta ? 
offsetDelta : from); + } + + to -= offsetDelta; + + if (echo_matches) { + out << "UE2 Match @ (" << from << "," << to << ") for " << id << endl; + } + + if (mctx->single || id == mctx->id) { + mctx->rs->addMatch(from, to, mctx->block); + if (limit_matches && mctx->rs->matches.size() == limit_matches) { + if (echo_matches) { + out << "Terminating matching (hit match limit)" << endl; + } + mctx->terminated = true; + return 1; // terminate matching. + } + } + + return 0; +} + +static +void filterLeftmostSom(ResultSet &rs) { + if (rs.matches.size() <= 1) { + return; + } + + set seen; // End offsets. + auto it = rs.matches.begin(); + while (it != rs.matches.end()) { + if (seen.insert(it->to).second) { + ++it; // First time we've seen this end-offset. + } else { + rs.matches.erase(it++); + } + } +} + +UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr, + const hs_platform_info_t *plat, + const Grey &grey_in, unsigned int streamBlocks) + : grey(grey_in), out(os), m_expr(expr), m_xcompile(false), + m_streamBlocks(streamBlocks), scratch(nullptr), + platform(plat) { + // Build our mode flags. + + switch (colliderMode) { + case MODE_STREAMING: + m_mode = HS_MODE_STREAM; + break; + case MODE_BLOCK: + m_mode = HS_MODE_BLOCK; + break; + case MODE_VECTORED: + m_mode = HS_MODE_VECTORED; + break; + } + + // Set desired SOM precision, if we're in streaming mode. 
+ if (colliderMode == MODE_STREAMING) { + m_mode |= somPrecisionMode; + } +} + +UltimateTruth::~UltimateTruth() { + hs_free_scratch(scratch); +} + +static +void mangle_scratch(hs_scratch_t *scratch) { + /* Use our knowledge of the internals of scratch to make a mess */ + + memset(&scratch->tctxt, 0xc0, sizeof(scratch->tctxt)); + memset(scratch->bstate, 0xd0, scratch->bStateSize); + memset(scratch->tstate, 0xe0, scratch->tStateSize); + memset(scratch->fullState, 0xf0, scratch->fullStateSize); + + for (u32 i = 0; i < scratch->queueCount; i++) { + struct mq *q = &scratch->queues[i]; + memset(q, 0x01, sizeof(*q)); + q->scratch = scratch; + } + + memset(scratch->aqa, 0xb0, scratch->activeQueueArraySize); + for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) { + memset(scratch->delay_slots[i], 0x05, scratch->delay_fatbit_size); + } + + memset(scratch->catchup_pq.qm, 0x06, + scratch->queueCount * sizeof(struct queue_match)); + scratch->catchup_pq.qm_size = 45; + memset(&scratch->core_info, 0x07, sizeof(scratch->core_info)); + memset(scratch->deduper.som_start_log[0], 0x90, + sizeof(u64a) * scratch->deduper.dkey_count); + memset(scratch->deduper.som_start_log[1], 0x09, + sizeof(u64a) * scratch->deduper.dkey_count); + memset(scratch->deduper.log[0], 0xa0, scratch->deduper.log_size); + memset(scratch->deduper.log[1], 0x0a, scratch->deduper.log_size); + memset(scratch->deduper.som_log[0], 0xd0, scratch->deduper.log_size); + memset(scratch->deduper.som_log[1], 0x0d, scratch->deduper.log_size); + + for (u32 i = 0; i < scratch->anchored_literal_region_len; i++) { + memset(scratch->al_log[i], 0xa0, scratch->anchored_literal_fatbit_size); + } + scratch->al_log_sum=0xf0f; + + memset(scratch->handled_roles, 0x05, scratch->handledKeyFatbitSize); + memset(scratch->som_store, 0x06, + scratch->som_store_count * sizeof(u64a)); + memset(scratch->som_attempted_store, 0x06, + scratch->som_store_count * sizeof(u64a)); + memset(scratch->som_set_now, 0x03, scratch->som_fatbit_size); + 
memset(scratch->som_attempted_set, 0x04, scratch->som_fatbit_size); + scratch->som_set_now_offset = 45; + memset(&scratch->fdr_conf, 0x0d, sizeof(scratch->fdr_conf)); + scratch->fdr_conf_offset = 0xe4; +} + +bool UltimateTruth::blockScan(const HyperscanDB &hdb, const string &buffer, + size_t align, match_event_handler callback, + void *ctx_in, ResultSet *) { + assert(colliderMode == MODE_BLOCK); + assert(!m_xcompile); + + const hs_database_t *db = hdb.db; + assert(db); + MultiContext *ctx = (MultiContext *)ctx_in; + + char *realigned = setupScanBuffer(buffer.c_str(), buffer.size(), align); + if (!realigned) { + return false; + } + + if (use_copy_scratch && !cloneScratch()) { + return false; + } + + ctx->in_scan_call = true; + hs_error_t ret = + hs_scan(db, realigned, buffer.size(), 0, scratch, callback, ctx); + ctx->in_scan_call = false; + + if (g_verbose) { + out << "Scan call returned " << ret << endl; + } + + if (ctx->terminated) { + if (g_verbose && ret != HS_SCAN_TERMINATED) { + out << "Scan should have returned HS_SCAN_TERMINATED, returned " + << ret << " instead." << endl; + } + return ret == HS_SCAN_TERMINATED; + } + + if (g_verbose && ret != HS_SUCCESS) { + out << "Scan should have returned HS_SUCCESS, returned " << ret + << " instead." 
<< endl;
+    }
+
+    if (use_mangle_scratch) {
+        mangle_scratch(scratch);
+    }
+
+    return ret == HS_SUCCESS;
+}
+
+/**
+ * Serialise \p stream with hs_compress_stream() and then close it.
+ *
+ * The first hs_compress_stream() call passes a null buffer to learn the
+ * required size; the second call fills a buffer of that size.
+ *
+ * \return the compressed stream state, or an empty vector if sizing,
+ *         compression or closing the stream failed.
+ */
+static
+vector<char> compressAndCloseStream(hs_stream_t *stream) {
+    size_t needed;
+    hs_error_t err = hs_compress_stream(stream, nullptr, 0, &needed);
+    if (err != HS_INSUFFICIENT_SPACE) {
+        return {};
+    }
+
+    vector<char> buf(needed);
+    err = hs_compress_stream(stream, buf.data(), needed, &needed);
+    if (err != HS_SUCCESS) {
+        return {};
+    }
+    assert(needed == buf.size());
+
+    err = hs_close_stream(stream, nullptr, nullptr, nullptr);
+    if (err != HS_SUCCESS) {
+        return {};
+    }
+
+    return buf;
+}
+
+
+/**
+ * Compress and close \p stream, then expand the saved state into a brand new
+ * stream via hs_expand_stream().
+ *
+ * \return the new stream, or nullptr if any step failed.
+ */
+static
+hs_stream_t *compressAndExpandStream(const hs_database_t *db,
+                                     hs_stream_t *stream) {
+    vector<char> buf = compressAndCloseStream(stream);
+    if (buf.empty()) {
+        // Compression failed: do not hand an empty buffer to the expander
+        // (same guard as compressAndResetExpandStream()).
+        return nullptr;
+    }
+
+    hs_stream_t *out = nullptr;
+    hs_error_t err = hs_expand_stream(db, &out, buf.data(), buf.size());
+
+    if (err != HS_SUCCESS) {
+        return nullptr;
+    }
+
+    return out;
+}
+
+/**
+ * Compress and close \p stream, open a fresh stream on \p db and load the
+ * saved state into it with hs_reset_and_expand_stream().
+ *
+ * \return the new stream, or nullptr if any step failed.
+ */
+static
+hs_stream_t *compressAndResetExpandStream(const hs_database_t *db,
+                                          hs_stream_t *stream) {
+    vector<char> buf = compressAndCloseStream(stream);
+    if (buf.empty()) {
+        return nullptr;
+    }
+
+    hs_stream_t *out = nullptr;
+
+    hs_error_t err = hs_open_stream(db, 0, &out);
+
+    if (err != HS_SUCCESS) {
+        return nullptr;
+    }
+
+    err = hs_reset_and_expand_stream(out, buf.data(), buf.size(), nullptr,
+                                     nullptr, nullptr);
+    if (err != HS_SUCCESS) {
+        // Do not leak the stream we have just opened.
+        hs_close_stream(out, nullptr, nullptr, nullptr);
+        return nullptr;
+    }
+
+    return out;
+}
+
+/**
+ * Scan \p buffer in streaming mode.
+ *
+ * The buffer is cut into blocks of buffer.size() / m_streamBlocks bytes (at
+ * least one byte each); every block is copied to a buffer with the requested
+ * alignment and written to the stream with hs_scan_stream(). Depending on the
+ * use_copy_scratch, use_copy_stream, use_mangle_scratch, use_compress_expand
+ * and use_compress_reset_expand switches, the scratch and the stream are
+ * cloned, mangled or round-tripped through compression after each write.
+ *
+ * \return true if every call returned the expected status (HS_SUCCESS, or
+ *         HS_SCAN_TERMINATED once limit_matches matches have been recorded).
+ */
+bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer,
+                                  size_t align, match_event_handler callback,
+                                  void *ctx_in, ResultSet *rs) {
+    assert(colliderMode == MODE_STREAMING);
+    assert(!m_xcompile);
+
+    const hs_database_t *db = hdb.db;
+    assert(db);
+    MultiContext *ctx = (MultiContext *)ctx_in;
+
+    // open a stream
+    hs_stream_t *stream;
+    size_t stream_size;
+    int ret;
+
+    ret = hs_stream_size(db, &stream_size);
+    if (ret != HS_SUCCESS) {
+        out << "Unable to size stream." << endl;
+        return false;
+    }
+
+    if (!g_streamOffset) {
+        ret = hs_open_stream(db, 0, &stream);
+    } else {
+#ifndef RELEASE_BUILD
+        ret = open_magic_stream(db, 0, &stream, scratch, g_streamOffset);
+#else
+        ret = HS_INVALID;
+#endif
+    }
+
+    if (ret != HS_SUCCESS) {
+        out << "Unable to open stream." << endl;
+        return false;
+    }
+
+    // scan our data, split into blocks and copied into a temporary buffer
+    // aligned as requested (out of paranoia)
+    unsigned blockSize = buffer.size() / m_streamBlocks;
+    if (blockSize == 0) {
+        blockSize = 1;
+    }
+    const char *ptr = buffer.c_str();
+    const char *end = ptr + buffer.size();
+    ctx->block = 0;
+
+    // We use a do-while loop here so that zero-byte cases still generate at
+    // least one hs_scan_stream call, since it's something users might try.
+    do {
+        if (ptr + blockSize > end) {
+            // last write is a runt
+            blockSize = end - ptr;
+        }
+        char *realigned = setupScanBuffer(ptr, blockSize, align);
+        if (!realigned) {
+            return false;
+        }
+        ctx->in_scan_call = true;
+        DEBUG_PRINTF("scan stream write %u\n", ctx->block);
+        ret = hs_scan_stream(stream, realigned, blockSize, 0, scratch,
+                             callback, ctx);
+        DEBUG_PRINTF("scan %u done\n", ctx->block);
+        ctx->in_scan_call = false;
+
+        if (limit_matches && rs->matches.size() == limit_matches) {
+            if (ret != HS_SCAN_TERMINATED) {
+                DEBUG_PRINTF("failure to scan %d\n", ret);
+                return false;
+            }
+        } else if (ret != HS_SUCCESS) {
+            DEBUG_PRINTF("failure to scan %d\n", ret);
+            return false;
+        }
+
+        if (use_copy_scratch && !cloneScratch()) {
+            return false;
+        }
+
+        if (use_copy_stream) {
+            hs_stream_t *s2;
+            ret = hs_copy_stream(&s2, stream);
+            if (ret != HS_SUCCESS) {
+                DEBUG_PRINTF("failure to copy %d\n", ret);
+                return false;
+            }
+            /* do a short write to the old stream so that it is in the wrong
+             * state. */
+            char temp[2] = {0, 0};
+            ret = hs_scan_stream(stream, temp, sizeof(temp), 0, scratch,
+                                 nullptr, nullptr);
+
+            hs_error_t expected = HS_SUCCESS;
+            if (limit_matches && rs->matches.size() == limit_matches) {
+                expected = HS_SCAN_TERMINATED;
+            }
+            if (ret != expected) {
+                DEBUG_PRINTF("failure to scan %d\n", ret);
+                return false;
+            }
+            ret = hs_close_stream(stream, nullptr, nullptr, nullptr);
+            if (ret != HS_SUCCESS) {
+                DEBUG_PRINTF("failure to close %d\n", ret);
+                return false;
+            }
+            // Carry on with the copy; the original has been closed.
+            stream = s2;
+        }
+        if (use_mangle_scratch) {
+            mangle_scratch(scratch);
+        }
+
+        if (use_compress_expand) {
+            auto rv = compressAndExpandStream(db, stream);
+            if (!rv) {
+                if (g_verbose) {
+                    out << "Compress/Expand failed." << endl;
+                }
+                return false;
+            } else {
+                stream = rv;
+            }
+        }
+
+        if (use_compress_reset_expand) {
+            auto rv = compressAndResetExpandStream(db, stream);
+            if (!rv) {
+                if (g_verbose) {
+                    out << "Compress/Expand failed." << endl;
+                }
+                return false;
+            } else {
+                stream = rv;
+            }
+        }
+
+        ptr += blockSize;
+        ctx->block++;
+    } while (ptr < end);
+
+    // close the stream
+    ctx->in_scan_call = true;
+    DEBUG_PRINTF("close stream %u\n", ctx->block);
+    ret = hs_close_stream(stream, scratch, callback, ctx);
+    DEBUG_PRINTF("close stream done\n");
+    ctx->in_scan_call = false;
+
+    if (ret != HS_SUCCESS) {
+        return false;
+    }
+
+    // UE2 cannot dedupe SOM matches across stream boundaries, so we must
+    // filter them out.
+    filterLeftmostSom(*rs);
+
+    // ret was checked above, so we can only get here on success.
+    return true;
+}
+
+/**
+ * Scan \p buffer in vectored mode.
+ *
+ * The buffer is cut into blocks of buffer.size() / m_streamBlocks bytes (at
+ * least one byte each); each block is copied into its own aligned buffer and
+ * all blocks are passed to a single hs_scan_vector() call.
+ *
+ * \return true if the scan returned the expected status (HS_SUCCESS, or
+ *         HS_SCAN_TERMINATED once limit_matches matches have been recorded).
+ */
+bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer,
+                                 size_t align, match_event_handler callback,
+                                 void *ctx_in, ResultSet *rs) {
+    assert(colliderMode == MODE_VECTORED);
+    assert(!m_xcompile);
+
+    const hs_database_t *db = hdb.db;
+    assert(db);
+    MultiContext *ctx = (MultiContext *)ctx_in;
+
+    int ret;
+
+    assert(!g_streamOffset);
+
+    // scan our data, split into blocks and copied into a temporary buffer
+    // aligned as requested (out of paranoia)
+    unsigned blockSize = buffer.size() / m_streamBlocks;
+    if (blockSize == 0) {
+        blockSize = 1;
+    }
+    const char *ptr = buffer.c_str();
+    const char *end = ptr + buffer.size();
+    ctx->block = 0;
+
+    vector<const char *> data;
+    vector<unsigned int> length;
+
+    u32 block_count = (buffer.size() + blockSize - 1) / blockSize;
+    block_count = MAX(block_count, 1);
+
+    if (block_count > raw_blocks.size()) {
+        raw_blocks.resize(block_count);
+    }
+
+    // We use a do-while loop here so that zero-byte cases still hand at
+    // least one block to hs_scan_vector, since it's something users might
+    // try.
+    do {
+        if (ptr + blockSize > end) {
+            // last write is a runt
+            blockSize = end - ptr;
+        }
+        char *realigned = setupVecScanBuffer(ptr, blockSize, align, ctx->block);
+        if (!realigned) {
+            return false;
+        }
+
+        data.push_back(realigned);
+        length.push_back(blockSize);
+
+        ptr += blockSize;
+        ctx->block++;
+
+    } while (ptr < end);
+
+    if (use_copy_scratch && !cloneScratch()) {
+        return false;
+    }
+
+    DEBUG_PRINTF("scan vectored write %u\n", ctx->block);
+    ctx->in_scan_call = true;
+    ret = hs_scan_vector(db, &data[0], &length[0], ctx->block, 0, scratch,
+                         callback, ctx);
+    ctx->in_scan_call = false;
+    DEBUG_PRINTF("scan %u done\n", ctx->block);
+    if (use_mangle_scratch) {
+        mangle_scratch(scratch);
+    }
+
+    rs->dupe_matches.clear(); /* TODO: dedupe across vectored blocks */
+
+    if (limit_matches && rs->matches.size() == limit_matches) {
+        if (ret != HS_SCAN_TERMINATED) {
+
DEBUG_PRINTF("failure to scan %d\n", ret);
+            return false;
+        }
+    } else if (ret != HS_SUCCESS) {
+        DEBUG_PRINTF("failure to scan %d\n", ret);
+        return false;
+    }
+
+    // UE2 cannot dedupe SOM matches across vector block boundaries, so we must
+    // filter them out.
+    filterLeftmostSom(*rs);
+
+    return true;
+}
+
+/**
+ * Scan \p buffer against \p hdb in the current colliderMode, collecting
+ * matches for pattern \p id into \p rs.
+ *
+ * Scratch is (re)allocated for the database first. When a corpora suffix is
+ * configured, the match context is told to cap offsets at the buffer length
+ * minus the suffix length.
+ *
+ * \return false if scratch allocation or the scan itself failed.
+ */
+bool UltimateTruth::run(unsigned int id, shared_ptr<const HyperscanDB> hdb,
+                        const string &buffer, bool single_pattern,
+                        unsigned int align, ResultSet &rs) {
+    assert(!m_xcompile);
+    assert(hdb);
+
+    // Ensure that scratch is appropriate for this database.
+    if (!allocScratch(hdb)) {
+        out << "Scratch alloc failed." << endl;
+        return false;
+    }
+
+    MultiContext ctx(id, *hdb, &rs, single_pattern, out);
+    if (!g_corpora_suffix.empty()) {
+        ctx.use_max_offset = true;
+        ctx.max_offset = buffer.size() - g_corpora_suffix.size();
+    }
+
+    switch (colliderMode) {
+    case MODE_BLOCK:
+        return blockScan(*hdb, buffer, align, callbackMulti, &ctx, &rs);
+    case MODE_STREAMING:
+        return streamingScan(*hdb, buffer, align, callbackMulti, &ctx, &rs);
+    case MODE_VECTORED:
+        return vectoredScan(*hdb, buffer, align, callbackMulti, &ctx, &rs);
+    }
+
+    assert(0);
+    return false;
+}
+
+/**
+ * Ask hs_expression_info() whether \p expr (with \p flags) reports its
+ * matches in order.
+ *
+ * \return false for SOM_LEFTMOST patterns, for expressions that
+ *         hs_expression_info() rejects, and for expressions flagged with
+ *         unordered_matches; true otherwise.
+ */
+static
+bool isOrdered(const string &expr, unsigned int flags) {
+    // SOM doesn't produce ordered matches?
+    if (flags & HS_FLAG_SOM_LEFTMOST) {
+        return false;
+    }
+
+    hs_expr_info_t *info = nullptr;
+    hs_compile_error_t *error = nullptr;
+    hs_error_t err = hs_expression_info(expr.c_str(), flags, &info, &error);
+    if (err != HS_SUCCESS) {
+        // Expression will fail compilation and report error elsewhere.
+        free(info);
+        hs_free_compile_error(error);
+        return false;
+    }
+
+    assert(info);
+
+    // Any pattern that does not require offset adjustment should produce
+    // matches in order.
+    bool ordered = !info->unordered_matches;
+    free(info);
+    return ordered;
+}
+
+/**
+ * Compile the given parallel arrays of patterns, flags, ids and extended
+ * parameters with hs_compile_multi_int().
+ *
+ * \param error receives the compiler's message on failure.
+ * \return a HyperscanDB wrapping the database, or nullptr on failure.
+ */
+static unique_ptr<HyperscanDB>
+compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
+                 vector<unsigned> &idsvec, ptr_vector<hs_expr_ext> &ext,
+                 unsigned mode, const hs_platform_info *platform, string &error,
+                 const Grey &grey) {
+    const unsigned count = patterns.size();
+    hs_database_t *db = nullptr;
+    // Initialised so that a failure which does not populate the error object
+    // cannot lead to a wild pointer dereference below.
+    hs_compile_error_t *compile_err = nullptr;
+
+    hs_error_t err = hs_compile_multi_int(&patterns[0], &flags[0],
+                                          &idsvec[0], ext.c_array(), count,
+                                          mode, platform, &db,
+                                          &compile_err, grey);
+
+    if (err != HS_SUCCESS) {
+        if (compile_err) {
+            error = compile_err->message;
+            hs_free_compile_error(compile_err);
+        } else {
+            error = "Compile failed without an error message.";
+        }
+        return nullptr;
+    }
+
+    return ue2::make_unique<HyperscanDB>(db, idsvec.begin(), idsvec.end());
+}
+
+/**
+ * Build a single database from the expressions named by \p ids.
+ *
+ * Each expression is looked up in m_expr and parsed with readExpression();
+ * the force_utf8, force_prefilter, somFlags and force_edit_distance overrides
+ * are then applied. After a successful compile, the ids of patterns that
+ * isOrdered() reports as ordered are recorded in the database's 'ordered'
+ * set.
+ *
+ * \param error receives a description of the failure.
+ * \return the compiled database, or nullptr on failure.
+ */
+shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
+                                               string &error) const {
+    // Build our vectors for compilation
+    const size_t count = ids.size();
+    vector<string> expressions(count);
+    vector<unsigned> idsvec(ids.begin(), ids.end());
+    vector<unsigned> flags(count);
+    vector<bool> check_ordered(count, false);
+    ptr_vector<hs_expr_ext> ext;
+    ext.reserve(count);
+
+    size_t n = 0;
+    for (const auto &id : ids) {
+        auto j = m_expr.find(id);
+        if (j == m_expr.end()) {
+            error = "Unable to find ID.";
+            return nullptr;
+        }
+
+        ext.push_back(new hs_expr_ext);
+        bool must_be_ordered;
+        if (!readExpression(j->second, expressions[n], &flags[n], &ext[n],
+                            &must_be_ordered)) {
+            ostringstream oss;
+            oss << "Unable to decode flags: '" << j->first << ":"
+                << j->second << "'.";
+            error = oss.str();
+            return nullptr;
+        }
+
+        check_ordered[n] = must_be_ordered;
+
+        if (force_utf8) {
+            flags[n] |= HS_FLAG_UTF8;
+        }
+
+        if (force_prefilter) {
+            flags[n] |= HS_FLAG_PREFILTER;
+        }
+
+        if (somFlags) {
+            flags[n] |= somFlags;
+        }
+
+        if (force_edit_distance) {
+            ext[n].flags |= HS_EXT_FLAG_EDIT_DISTANCE;
+            ext[n].edit_distance = edit_distance;
+        }
+
+        n++;
+    }
+
+    // Our compiler takes an array of plain ol' C strings.
+    vector<const char *> patterns(count);
+    for (unsigned int i = 0; i < count; i++) {
+        patterns[i] = expressions[i].c_str();
+    }
+
+    // Compile
+    if (!count) { /* slight hack to allow us to compile empty sets cleanly */
+        patterns.push_back(nullptr);
+        flags.push_back(0);
+        idsvec.push_back(0);
+    }
+
+    auto db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, platform,
+                               error, grey);
+    if (!db) {
+        return nullptr;
+    }
+
+    // Track IDs of patterns that require ordering for validation at match
+    // time.
+    for (unsigned int i = 0; i < count; i++) {
+        bool is_ordered = isOrdered(expressions[i], flags[i]);
+        if (check_ordered[i] && !is_ordered) {
+            error = "Ordering required, but hs_expression_info suggests "
+                    "that ordering is not guaranteed.";
+            return nullptr;
+        }
+        if (is_ordered) {
+            db->ordered.insert(idsvec[i]);
+        }
+    }
+
+    return move(db);
+}
+
+bool UltimateTruth::allocScratch(shared_ptr<const HyperscanDB> db) {
+    assert(db);
+
+    // We explicitly avoid running scratch allocators for the same HyperscanDB
+    // over and over again by retaining a shared_ptr to the last one we saw.
+    if (db == last_db) {
+        return true;
+    }
+
+    hs_error_t err = hs_alloc_scratch(db.get()->db, &scratch);
+    if (err != HS_SUCCESS) {
+        return false;
+    }
+
+    // Only remember the database once scratch has been set up for it.
+    last_db = db;
+    return true;
+}
+
+// Replace our scratch with a clone of itself (hs_clone_scratch) and free the
+// original. Returns false if either the clone or the free fails.
+bool UltimateTruth::cloneScratch(void) {
+    hs_scratch_t *old_scratch = scratch;
+    hs_scratch_t *new_scratch;
+    hs_error_t ret = hs_clone_scratch(scratch, &new_scratch);
+    if (ret != HS_SUCCESS) {
+        DEBUG_PRINTF("failure to clone %d\n", ret);
+        return false;
+    }
+    scratch = new_scratch;
+    ret = hs_free_scratch(old_scratch);
+    if (ret != HS_SUCCESS) {
+        DEBUG_PRINTF("failure to free %d\n", ret);
+        return false;
+    }
+    DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch);
+    return true;
+}
+
+// Return an appropriately aligned (modulo max align) copy of the given buffer.
+// The copy lives in m_scanBuf, so it is overwritten by the next call. Returns
+// nullptr if align is not below MAX_MAX_UE2_ALIGN.
+char * UltimateTruth::setupScanBuffer(const char *begin, size_t len,
+                                      size_t align) {
+    if (align >= MAX_MAX_UE2_ALIGN) {
+        return nullptr;
+    }
+
+    // Realloc if necessary
+    size_t maxBufSize = len + MAX_MAX_UE2_ALIGN;
+    if (maxBufSize > m_scanBuf.size()) {
+        m_scanBuf.resize(maxBufSize);
+    }
+
+    uintptr_t currentAlign = (uintptr_t)(m_scanBuf.data()) % MAX_MAX_UE2_ALIGN;
+    char *ptr;
+
+    // Step forward to the first address whose offset within a
+    // MAX_MAX_UE2_ALIGN-sized window equals the requested alignment.
+    ptrdiff_t diff = align - currentAlign;
+    if (diff >= 0) {
+        ptr = (m_scanBuf.data() + diff);
+    } else {
+        ptr = (m_scanBuf.data() + (MAX_MAX_UE2_ALIGN + diff));
+    }
+    assert((uintptr_t)(ptr) % MAX_MAX_UE2_ALIGN == align);
+
+    // copy the buffer
+    memcpy(ptr, begin, len);
+    return ptr;
+}
+
+// As setupScanBuffer(), but the copy is placed in raw_blocks[block_id] so
+// that the buffers for all blocks of a vectored scan exist at the same time.
+char *UltimateTruth::setupVecScanBuffer(const char *begin, size_t len,
+                                        size_t align, u32 block_id) {
+    if (align >= MAX_MAX_UE2_ALIGN) {
+        return nullptr;
+    }
+
+    assert(block_id < raw_blocks.size());
+    vector<char> &raw = raw_blocks[block_id];
+
+    // Realloc if necessary
+    size_t maxBufSize = len + MAX_MAX_UE2_ALIGN;
+    if (maxBufSize > raw.size()) {
+        raw.resize(maxBufSize);
+    }
+    assert(maxBufSize <= raw.size());
+
+    uintptr_t currentAlign = (uintptr_t)(&raw[0]) % MAX_MAX_UE2_ALIGN;
+    char *ptr;
+
+    ptrdiff_t diff = align - currentAlign;
+    if (diff >= 0) {
+        ptr = (&raw[0] + diff);
+    } else {
+        ptr = (&raw[0] + (MAX_MAX_UE2_ALIGN + diff));
+    }
+    assert((uintptr_t)(ptr) % MAX_MAX_UE2_ALIGN == align);
+
+    // copy the buffer
+    memcpy(ptr, begin, len);
+    return ptr;
+}
+
+// Write the database to the given file using the global ::saveDatabase()
+// helper.
+bool UltimateTruth::saveDatabase(const HyperscanDB &hdb,
+                                 const string &filename) const {
+    return ::saveDatabase(hdb.db, filename.c_str(), g_verbose);
+}
+
+// Load a database from the given file and rebuild its set of ordered pattern
+// ids from the expressions named by 'ids'. Returns nullptr if the file cannot
+// be loaded, or if an id is unknown or its expression cannot be parsed.
+shared_ptr<HyperscanDB>
+UltimateTruth::loadDatabase(const string &filename,
+                            const std::set<unsigned> &ids) const {
+    hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose);
+    if (!hs_db) {
+        return nullptr;
+    }
+
+    auto db = make_shared<HyperscanDB>(hs_db, ids.begin(), ids.end());
+    assert(db);
+
+    // Fill db::ordered with the expressions that require the ordered flag.
+    for (const auto &id : ids) {
+        auto j = m_expr.find(id);
+        if (j == m_expr.end()) {
+            cerr << "Can't find expression with ID " << id << endl;
+            assert(0);
+            db.reset();
+            return db;
+        }
+        string expr;
+        hs_expr_ext ext;
+        unsigned int flags;
+        if (!readExpression(j->second, expr, &flags, &ext)) {
+            cerr << "Can't parse expression with ID " << id << ": "
+                 << j->second << endl;
+            assert(0);
+            db.reset();
+            return db;
+        }
+        if (isOrdered(expr, flags)) {
+            db->ordered.insert(id);
+        }
+    }
+
+    return db;
+}
+
+// Returns the mode flags (m_mode) that databases are compiled with.
+unsigned int UltimateTruth::describe() const {
+    return m_mode;
+}
+
+// Hash the settings used to compile a database, returning a string that can be
+// used as a filename.
+string UltimateTruth::dbSettingsHash(const set<unsigned> &ids) const {
+    // create a single string to contain a description of the db
+    ostringstream info_oss;
+
+    // settings from UltimateTruth::describe()
+    info_oss << ' ' << describe() << ' ';
+
+    // our set
+    for (unsigned int id : ids) {
+        info_oss << id << ' ';
+    }
+
+    string info = info_oss.str();
+
+    u32 crc = Crc32c_ComputeBuf(0, info.data(), info.size());
+
+    // return STL string with printable version of digest: the CRC as eight
+    // zero-padded hex digits
+    ostringstream oss;
+    oss << hex << setw(8) << setfill('0') << crc << dec;
+
+    return oss.str();
+}
+
+// Path of the serialized database for the given id set:
+// "<serializePath>/<settings hash>.db".
+string UltimateTruth::dbFilename(const set<unsigned> &ids) const {
+    ostringstream oss;
+    oss << serializePath << '/' << dbSettingsHash(ids) << ".db";
+    return oss.str();
+}
diff --git a/tools/hscollider/UltimateTruth.h b/tools/hscollider/UltimateTruth.h
new file mode 100644
index 00000000..c8de8642
--- /dev/null
+++ b/tools/hscollider/UltimateTruth.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ULTIMATETRUTH_H
+#define ULTIMATETRUTH_H
+
+#include "expressions.h"
+
+#include "hs.h"
+
+#include <memory>
+#include <ostream>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+namespace ue2 {
+
+struct Grey;
+
+} // namespace ue2
+
+class HyperscanDB;
+class ResultSet;
+
+// Wrapper around ue2 to generate results for an expression and corpus.
+class UltimateTruth : boost::noncopyable {
+public:
+    UltimateTruth(std::ostream &os, const ExpressionMap &expr,
+                  const hs_platform_info *plat, const ue2::Grey &grey,
+                  unsigned streamBlocks = 0);
+
+    ~UltimateTruth();
+
+    // Compile the expressions with the given ids into one database. Returns
+    // nullptr and fills in 'error' on failure.
+    std::shared_ptr<HyperscanDB> compile(const std::set<unsigned> &ids,
+                                         std::string &error) const;
+
+    // Write the database to the given file.
+    bool saveDatabase(const HyperscanDB &db,
+                      const std::string &filename) const;
+
+    // Load a database for the given ids from a file. Returns nullptr on
+    // failure.
+    std::shared_ptr<HyperscanDB>
+    loadDatabase(const std::string &filename,
+                 const std::set<unsigned> &ids) const;
+
+    // Are we runnable? (i.e. not xcompiling)
+    bool runnable() const {
+        return !m_xcompile;
+    }
+
+    // Scan 'buffer' with 'db' at the given alignment, collecting the matches
+    // for pattern 'id' in 'rs'. Returns false if the scan failed.
+    bool run(unsigned id, std::shared_ptr<const HyperscanDB> db,
+             const std::string &buffer, bool single_pattern, unsigned align,
+             ResultSet &rs);
+
+    // Returns a value completely representing this object's compile options.
+    unsigned int describe() const;
+
+    // Name of the file a database for the given ids is serialized to.
+    std::string dbFilename(const std::set<unsigned> &ids) const;
+
+private:
+    // One scan routine per collider mode; run() dispatches to these.
+    bool blockScan(const HyperscanDB &db, const std::string &buffer,
+                   size_t align, match_event_handler callback, void *ctx,
+                   ResultSet *rs);
+    bool streamingScan(const HyperscanDB &db, const std::string &buffer,
+                       size_t align, match_event_handler callback, void *ctx,
+                       ResultSet *rs);
+    bool vectoredScan(const HyperscanDB &db, const std::string &buffer,
+                      size_t align, match_event_handler callback, void *ctx,
+                      ResultSet *rs);
+
+    // Copy 'buf' into m_scanBuf at the requested alignment.
+    char *setupScanBuffer(const char *buf, size_t len, size_t align);
+
+    // Copy 'buf' into raw_blocks[block_id] at the requested alignment.
+    char *setupVecScanBuffer(const char *buf, size_t len, size_t align,
+                             unsigned int block_id);
+
+    // Make sure 'scratch' is suitable for the given database.
+    bool allocScratch(std::shared_ptr<const HyperscanDB> db);
+
+    // Replace 'scratch' with a clone of itself.
+    bool cloneScratch(void);
+
+    std::string dbSettingsHash(const std::set<unsigned> &ids) const;
+
+    const ue2::Grey &grey;
+
+    // Output stream.
+    std::ostream &out;
+
+    // Our expression map
+    const ExpressionMap &m_expr;
+
+    // Are we cross-compiling, and therefore unable to scan at all?
+    bool m_xcompile;
+
+    // Our mode flags to pass into the compiler: calculated from streaming,
+    // etc.
+    unsigned m_mode;
+
+    // In streaming mode, what is the number of blocks to chop data into?
+    // NOTE(review): the streaming and vectored scan routines divide the
+    // buffer size by this value; confirm it is never left at the constructor
+    // default of 0 in those modes.
+    unsigned m_streamBlocks;
+
+    // Scratch space for Hyperscan.
+    hs_scratch_t *scratch;
+
+    // Temporary scan buffer used for realigned scanning
+    std::vector<char> m_scanBuf;
+
+    std::vector<std::vector<char> > raw_blocks; /* temp scan buffers used by
+                                                 * vectored mode */
+
+    // Last database we successfully allocated scratch for, so that we can
+    // avoid unnecessarily reallocating for it.
+    std::shared_ptr<const HyperscanDB> last_db;
+
+    const hs_platform_info *platform;
+};
+
+#endif
diff --git a/tools/hscollider/args.cpp b/tools/hscollider/args.cpp
new file mode 100644
index 00000000..a15977f9
--- /dev/null
+++ b/tools/hscollider/args.cpp
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ng_corpus_properties.h"
+#include "args.h"
+#include "common.h"
+#include "cross_compile.h"
+#include "util/expression_path.h"
+#include "util/string_util.h"
+
+#include "grey.h"
+#include "ue2common.h"
+#include "hs_compile.h" // for HS_MODE_*
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+#include <getopt.h>
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+using namespace ue2;
+using namespace std;
+
+// display usage information, with an optional error
+static
+void usage(const char *name, const char *error) {
+    printf("Usage: %s [OPTIONS...]\n\n", name);
+    printf("General Options:\n\n");
+    printf(" -h Display help and exit.\n");
+    printf(" -G OVERRIDES Overrides for the grey box.\n");
+    printf(" -e PATH Path to expression directory or file.\n");
+    printf(" -s FILE Signature file to use.\n");
+    printf(" -z NUM Signature ID to use.\n");
+    printf(" -c FILE Load corpora from FILE rather than using "
+           "generator.\n");
+    printf(" -w FILE After running, save corpora (with matches) to "
+           "FILE.\n");
+    printf(" -a [BAND] Compile all expressions in UE2 (but still match "
+           "singly).\n");
+    printf(" If BAND, compile patterns in groups of size "
+           "BAND.\n");
+    printf(" -t NUM Use streaming mode, split data into ~NUM "
+           "blocks.\n");
+    printf(" -V NUM Use vectored mode, split data into ~NUM "
+           "blocks.\n");
+    printf(" -Z {R or 0-%d} Only test one alignment, either as given or "
+           "'R' for random.\n", MAX_MAX_UE2_ALIGN - 1);
+    printf(" -q Quiet; display only match differences, no other "
+           "failures.\n");
+    printf(" -v Verbose; display successes as well as "
+           "failures.\n");
+    printf("\n");
+    printf("Pattern flags:\n");
+    printf("\n");
+    printf(" -8 Force UTF8 mode on all patterns.\n");
+    printf(" -L Apply HS_FLAG_SOM_LEFTMOST to all patterns.\n");
+    printf(" -E DISTANCE Match all patterns within edit distance"
+           " DISTANCE.\n");
+    printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n");
+    printf("\n");
+    printf("Testing mode options:\n");
+    printf("\n");
+    printf(" -d NUM Set SOM precision mode (default: 8 (large)).\n");
+    printf(" -O NUM In streaming mode, set initial offset to NUM.\n");
+    printf(" -k NUM Terminate callback after NUM matches per "
+           "pattern.\n");
+    printf(" --copy-scratch Copy scratch after each scan call.\n");
+    printf(" --copy-stream Copy stream state after each scan call.\n");
+    printf(" --compress-expand Compress and expand stream state after each "
+           "scan call.\n");
+    printf(" --compress-reset-expand Compress, reset and expand stream state "
+           "after each scan call.\n");
+    printf(" --mangle-scratch Mangle scratch space after each scan call.\n");
+    printf(" --no-nfa Disable NFA graph execution engine.\n");
+    printf(" --no-pcre Disable PCRE engine.\n");
+    printf(" --test-nfa Disable UE2 engine (test NFA against PCRE).\n");
+    printf(" --abort-on-fail Abort, rather than exit, on failure.\n");
+    printf(" --no-signal-handler Do not handle signals (to generate "
+           "backtraces).\n");
+    printf("\n");
+    printf("Memory and resource control options:\n");
+    printf("\n");
+    printf(" -T NUM Run with NUM threads.\n");
+    printf(" -M NUM Set maximum memory allocated to NUM megabytes per"
+           " thread.\n");
+    printf(" (0 means no limit, default is 1000 MB).\n");
+    printf(" -m NUM Set PCRE_MATCH_LIMIT (default: %lu).\n",
+           DEFAULT_PCRE_MATCH_LIMIT);
+    printf(" -r NUM Set PCRE_MATCH_LIMIT_RECURSION (default: %lu).\n",
+           DEFAULT_PCRE_MATCH_RECURSION_LIMIT);
+    printf("\n");
+    printf("Cross-compiling:\n");
+    printf("\n");
+    printf(" -x NAME Cross-compile for arch NAME.\n");
+    printf(" -i DIR Don't compile, load from files in DIR "
+           "instead.\n");
+    printf(" -o DIR After compiling, save to files in DIR.\n");
+    printf("\n");
+    printf("Corpus generation options:\n");
+    printf("\n");
+    printf(" -n NUM Max corpora to generate for a given signature "
+           "(default: %u).\n", DEFAULT_CORPUS_GENERATOR_LIMIT);
+    printf(" -R NUM Random seed to use (default: seeded from "
+           "time()).\n");
+    printf(" -p NUM,NUM,NUM Percentage probabilities of "
+           "(match,unmatch,random) char.\n");
+    printf(" -C NUM,NUM Follow cycles (min,max) times.\n");
+    printf(" -P NUM,NUM Add a random prefix of length between "
+           "(min,max).\n");
+    printf(" -S NUM,NUM Add a random suffix of length between "
+           "(min,max).\n");
+    printf(" -D NUM Apply an edit distance (default: 0) to each "
+           "corpus.\n");
+    printf(" -b NUM Limit alphabet to NUM characters, starting at "
+           "lower-case 'a'.\n");
+    printf("\n");
+
+    if (error) {
+        printf("Error: %s\n", error);
+    }
+}
+
+/**
+ * Parse the command line into the tool's global settings.
+ *
+ * Corpus generator options are written to \p corpus_gen_prop, corpus file
+ * names (the arguments following -c) are appended to \p corpora, grey box
+ * overrides (-G, non-release builds only) are applied to \p grey and a
+ * cross-compile target (-x) is stored in \p plat_out. On any invalid option
+ * or inconsistent combination, the usage text and an error are printed and
+ * the process exits with status 1.
+ */
+void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
+                 vector<string> *corpora, UNUSED Grey *grey,
+                 unique_ptr<hs_platform_info> *plat_out) {
+    // The leading '-' makes getopt_long report non-option arguments as
+    // option code 1 instead of permuting them to the end.
+    static const char options[]
+        = "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8";
+    // While non-zero, a following non-option argument belongs to -a / -c.
+    s32 in_multi = 0;
+    s32 in_corpora = 0;
+    int pcreFlag = 1;
+    int nfaFlag = 1;
+    int ue2Flag = 1;
+    int copyScratch = 0;
+    int copyStream = 0;
+    int mangleScratch = 0;
+    int compressFlag = 0;
+    int compressResetFlag = 0;
+    static const struct option longopts[] = {
+        {"copy-scratch", 0, &copyScratch, 1},
+        {"copy-stream", 0, &copyStream, 1},
+        {"mangle-scratch", 0, &mangleScratch, 1},
+        {"prefilter", 0, &force_prefilter, 1},
+        {"no-pcre", 0, &pcreFlag, 0},
+        {"no-nfa", 0, &nfaFlag, 0},
+        {"test-nfa", 0, &ue2Flag, 0},
+        {"abort-on-fail", 0, &abort_on_failure, 1},
+        {"no-signal-handler", 0, &no_signal_handler, 1},
+        {"compress-expand", 0, &compressFlag, 1},
+        {"compress-reset-expand", 0, &compressResetFlag, 1},
+        {nullptr, 0, nullptr, 0}};
+
+    for (;;) {
+        int c = getopt_long(argc, argv, options, longopts, nullptr);
+        if (c < 0) {
+            break;
+        }
+
+        switch (c) {
+        case 'a':
+            g_ue2CompileAll = true;
+            in_multi = 2;
+            break;
+        case 'b': {
+            unsigned sz;
+            if (!fromString(optarg, sz) || sz > 256) {
+                usage(argv[0], "Must provide an integer argument <= 256 "
+                               "to '-b' flag");
+                exit(1);
+            }
+            corpus_gen_prop.alphabetSize = sz;
+            break;
+        }
+        case 'c':
+            in_corpora = 2;
+            break;
+        case 'C': {
+            vector<unsigned> nums;
+            if (!strToList(optarg, nums) || nums.size() != 2
+                || nums[0] > nums[1]) {
+                usage(argv[0], "Cycle limit '-C' argument takes a list of "
+                               "integers: MIN,MAX");
+                exit(1);
+            }
+            corpus_gen_prop.setCycleLimit(nums[0], nums[1]);
+            break;
+        }
+        case 'd': {
+            unsigned dist;
+            if (!fromString(optarg, dist)) {
+                usage(argv[0],
+                      "Must provide an integer argument to '-d' flag");
+                exit(1);
+            }
+            switch (dist) {
+            case 2:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL;
+                break;
+            case 4:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM;
+                break;
+            case 8:
+                somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE;
+                break;
+            default:
+                usage(argv[0], "SOM precision must be 2, 4 or 8");
+                exit(1);
+            }
+            break;
+        }
+        case 'D': {
+            unsigned dist;
+            if (!fromString(optarg, dist)) {
+                usage(argv[0],
+                      "Must provide an integer argument to '-D' flag");
+                exit(1);
+            }
+            corpus_gen_prop.editDistance = dist;
+            break;
+        }
+        case 'e':
+            g_exprPath.assign(optarg);
+            break;
+        case 'E': {
+            u32 dist;
+            if (!fromString(optarg, dist)) {
+                usage(argv[0], "Argument to '-E' flag must be an integer");
+                exit(1);
+            }
+            force_edit_distance = true;
+            edit_distance = dist;
+            break;
+        }
+#ifndef RELEASE_BUILD
+        case 'G':
+            applyGreyOverrides(grey, string(optarg));
+            break;
+#endif
+        case 'h':
+            usage(argv[0], nullptr);
+            exit(0);
+        case 'i':
+            loadDatabases = true;
+            serializePath = optarg;
+            break;
+        case 'k':
+            if (!fromString(optarg, limit_matches) || limit_matches < 1) {
+                usage(argv[0],
+                      "Must provide a positive integer argument to '-k' "
+                      "flag");
+                exit(1);
+            }
+            break;
+        case 'L':
+            somFlags = HS_FLAG_SOM_LEFTMOST;
+            break;
+        case 'm':
+            if (!fromString(optarg, g_matchLimit) || g_matchLimit < 1) {
+                usage(argv[0],
+                      "Must provide a positive integer argument to '-m' "
+                      "flag");
+                exit(1);
+            }
+            break;
+        case 'M':
+            if (!fromString(optarg, g_memoryLimit)) {
+                usage(argv[0],
+                      "Must provide a positive (or zero) integer argument "
+                      "to '-M' flag");
+                exit(1);
+            }
+            break;
+        case 'n': {
+            unsigned int count;
+            if (!fromString(optarg, count)) {
+                usage(argv[0], "Argument to '-n' flag must be an integer");
+                exit(1);
+            }
+            corpus_gen_prop.corpusLimit = count;
+            break;
+        }
+        case 'o':
+            saveDatabases = true;
+            serializePath = optarg;
+            break;
+        case 'O':
+            if (!fromString(optarg, g_streamOffset)) {
+                usage(argv[0],
+                      "Argument to '-O' flag must be a positive integer");
+                exit(1);
+            }
+            break;
+        case 'p': {
+            vector<unsigned> prob;
+            if (!strToList(optarg, prob) || prob.size() != 3) {
+                usage(argv[0], "Probabilities '-p' argument takes a list "
+                               "of three integers: MATCH,UNMATCH,RANDOM");
+                exit(1);
+            }
+            if (!corpus_gen_prop.setPercentages(prob[0], prob[1],
+                                                prob[2])) {
+                usage(argv[0],
+                      "Unable to set corpus generator probabilities.");
+                exit(1);
+            }
+            break;
+        }
+        case 'P': {
+            vector<unsigned> nums;
+            if (!strToList(optarg, nums) || nums.size() != 2
+                || nums[0] > nums[1]) {
+                usage(argv[0], "Prefix '-P' argument takes a list of two"
+                               " integers: MIN,MAX");
+                exit(1);
+            }
+            corpus_gen_prop.prefixRange = min_max(nums[0], nums[1]);
+            break;
+        }
+        case 'q':
+            g_quiet++;
+            break;
+        case 'r':
+            if (!fromString(optarg, g_matchLimitRecursion)
+                || g_matchLimitRecursion < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-r' flag");
+                exit(1);
+            }
+            break;
+        case 'R': {
+            if (!fromString(optarg, randomSeed)) {
+                usage(argv[0], "Argument to '-R' flag must be an integer");
+                exit(1);
+            }
+            corpus_gen_prop.seed(randomSeed);
+            break;
+        }
+        case 's':
+            g_signatureFiles.push_back(optarg);
+            break;
+        case 'S': {
+            vector<unsigned> nums;
+            if (!strToList(optarg, nums) || nums.size() != 2 ||
+                nums[0] > nums[1]) {
+                usage(argv[0], "Suffix '-S' argument takes a list of two"
+                               " integers: MIN,MAX");
+                exit(1);
+            }
+            corpus_gen_prop.suffixRange = min_max(nums[0], nums[1]);
+            break;
+        }
+        case 't':
+            if (colliderMode != MODE_BLOCK) {
+                usage(argv[0], "You can only use one mode at a time!");
+                exit(1);
+            }
+            colliderMode = MODE_STREAMING;
+            if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-t' flag");
+                exit(1);
+            }
+            break;
+        case 'T':
+            if (!fromString(optarg, numThreads) || numThreads < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-T' flag");
+                exit(1);
+            }
+            break;
+        case 'v':
+            // A second -v additionally echoes matches.
+            if (g_verbose) {
+                echo_matches = true;
+            }
+            g_verbose = true;
+            break;
+        case 'V':
+            if (colliderMode != MODE_BLOCK) {
+                usage(argv[0], "You can only use one mode at a time!");
+                exit(1);
+            }
+            colliderMode = MODE_VECTORED;
+            if (!fromString(optarg, g_streamBlocks) || g_streamBlocks < 1) {
+                usage(argv[0], "Must provide a positive integer argument "
+                               "to '-V' flag");
+                exit(1);
+            }
+            break;
+        case 'w':
+            saveCorpora = true;
+            saveCorporaFile = optarg;
+            break;
+        case 'x':
+            *plat_out = xcompileReadMode(optarg);
+            if (!*plat_out) {
+                usage(argv[0], xcompileUsage().c_str());
+                exit(1);
+            }
+            break;
+        case 'X': {
+            u32 count;
+            if (!fromString(optarg, count)) {
+                usage(argv[0], "Argument to '-X' flag must be an integer");
+                exit(1);
+            }
+            g_corpora_prefix.insert(g_corpora_prefix.end(), count, '~');
+            break;
+        }
+        case 'Y':
+        {
+            u32 count;
+            if (!fromString(optarg, count)) {
+                usage(argv[0], "Argument to '-Y' flag must be an integer");
+                exit(1);
+            }
+            g_corpora_suffix.insert(g_corpora_suffix.end(), count, '~');
+            break;
+        }
+        case 'z':
+            if (!strToList(optarg, g_signatures)) {
+                usage(argv[0],
+                      "Argument to '-z' flag must be a list of integers");
+                exit(1);
+            }
+            break;
+        case 'Z':
+            static constexpr unsigned ALIGN_LIMIT = MAX_MAX_UE2_ALIGN - 1;
+            if (optarg == string("R")) {
+                // Random min alignment selected.
+                use_random_alignment = true;
+                break;
+            } else if (!fromString(optarg, min_ue2_align)
+                       || min_ue2_align > ALIGN_LIMIT) {
+                usage(argv[0], "Argument must be 'R' or numeric < "
+                               xstr(MAX_MAX_UE2_ALIGN) " to '-Z'");
+                exit(1);
+            }
+            max_ue2_align = min_ue2_align + 1;
+            break;
+        case '8':
+            force_utf8 = true;
+            break;
+        case 1:
+            // Non-option argument: the optional BAND of a preceding -a, or a
+            // corpus file following -c.
+            if (in_multi) {
+                if (!fromString(optarg, multicompile_bands)) {
+                    usage(argv[0],
+                          "Argument to '-a' flag must be an integer");
+                    exit(1);
+                }
+                break;
+            } else if (in_corpora) {
+                corpora->push_back(optarg);
+                in_corpora = 2;
+                break;
+            }
+            // Any other stray argument is ignored.
+            // fall through
+        case 0:
+            break;
+        default:
+            usage(argv[0], "Unrecognised command line argument.");
+            exit(1);
+        }
+
+        in_multi = MAX(0, in_multi - 1);
+        in_corpora = MAX(0, in_corpora - 1);
+    }
+
+    if (g_streamOffset && !g_streamBlocks) {
+        usage(argv[0], "stream offset requires streams");
+        exit(1);
+    }
+
+    if (g_exprPath.empty() && !g_signatureFiles.empty()) {
+        /* attempt to infer an expression directory */
+        for (const auto &fname : g_signatureFiles) {
+            string exprPath = inferExpressionPath(fname);
+            if (!g_exprPath.empty() && exprPath != g_exprPath) {
+                usage(argv[0], "Only one expression path is allowed.");
+                exit(1);
+            }
+            g_exprPath.assign(exprPath);
+        }
+    }
+
+    // Must have a valid expression path
+    if (g_exprPath.empty()) {
+        usage(argv[0], "Must specify an expression path with the -e option.");
+        exit(1);
+    }
+
+    // If we've been handed an expr file and no restrictions, use 'em all!
+    if (!isDir(g_exprPath) && isFile(g_exprPath) && g_signatureFiles.empty()
+        && g_signatures.empty()) {
+        g_allSignatures = true;
+    }
+
+    // Must have a valid signature file
+    if (g_signatureFiles.empty() && g_signatures.empty() && !g_allSignatures) {
+        usage(argv[0], "Must specify a signature file with the -s option.");
+        exit(1);
+    }
+
+    // Cannot ask for both loading and saving
+    if (loadDatabases && saveDatabases) {
+        usage(argv[0], "You cannot both load and save databases.");
+        exit(1);
+    }
+
+    // Cannot ask for cross-compile and loading
+    if (loadDatabases && *plat_out) {
+        usage(argv[0], "You cannot both load and xcompile of databases.");
+        exit(1);
+    }
+
+    // need at least two pattern engines active
+    if (nfaFlag + pcreFlag + ue2Flag < 2) {
+        usage(argv[0], "At least two pattern engines should be active.");
+        exit(1);
+    }
+
+    if (copyStream && !g_streamBlocks) {
+        usage(argv[0], "Copying streams only makes sense in streaming mode.");
+        exit(1);
+    }
+    if (compressFlag && compressResetFlag) {
+        usage(argv[0],
+              "Only use one of --compress-expand and --compress-reset-expand.");
+        exit(1);
+    }
+
+    // set booleans appropriately
+    use_NFA = (bool) nfaFlag;
+    use_PCRE = (bool) pcreFlag;
+    use_UE2 = (bool) ue2Flag;
+    use_copy_scratch = (bool) copyScratch;
+    use_copy_stream = (bool) copyStream;
+    use_mangle_scratch = (bool) mangleScratch;
+    use_compress_expand = (bool)compressFlag;
+    use_compress_reset_expand = (bool)compressResetFlag;
+}
diff --git a/tools/hscollider/args.h b/tools/hscollider/args.h
new file mode 100644
index 00000000..382eff03
--- /dev/null
+++ b/tools/hscollider/args.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef ARGS_H +#define ARGS_H + +#include +#include +#include + +namespace ue2 { +struct Grey; +} +struct hs_platform_info; +class CorpusProperties; + +void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, + std::vector *corpora, ue2::Grey *grey, + std::unique_ptr *plat_out); + +#endif diff --git a/tools/hscollider/common.h b/tools/hscollider/common.h new file mode 100644 index 00000000..da85790c --- /dev/null +++ b/tools/hscollider/common.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include + +enum ColliderMode { + MODE_BLOCK, + MODE_STREAMING, + MODE_VECTORED +}; + +extern unsigned numThreads; +extern enum ColliderMode colliderMode; +extern unsigned int somFlags; +extern bool loadDatabases; +extern bool saveDatabases; +extern bool saveCorpora; +extern std::string saveCorporaFile; +extern std::string serializePath; +extern bool echo_matches; +extern int g_quiet; +extern bool g_verbose; +extern std::string g_exprPath; +extern std::vector g_signatureFiles; +extern bool g_allSignatures; +extern bool g_ue2CompileAll; +extern unsigned g_streamBlocks; +extern unsigned long long g_streamOffset; +extern std::string g_corpora_prefix; +extern std::string g_corpora_suffix; +extern unsigned multicompile_bands; +extern std::string g_corporaFile; +extern std::vector g_signatures; +extern unsigned long int g_matchLimit; +extern unsigned long int g_matchLimitRecursion; +extern unsigned min_ue2_align; +extern unsigned max_ue2_align; +extern size_t g_memoryLimit; +extern bool force_utf8; +extern int force_prefilter; +extern unsigned somPrecisionMode; +extern unsigned limit_matches; +extern unsigned randomSeed; +extern bool use_random_alignment; +extern bool use_PCRE; +extern bool use_NFA; +extern bool use_UE2; +extern bool use_copy_scratch; +extern bool use_copy_stream; +extern bool use_mangle_scratch; +extern bool use_compress_expand; +extern bool 
use_compress_reset_expand; +extern int abort_on_failure; +extern int no_signal_handler; +extern bool force_edit_distance; +extern unsigned edit_distance; + +// Constants +static const unsigned long int DEFAULT_PCRE_MATCH_LIMIT = 10*1000*1000; +static const unsigned long int DEFAULT_PCRE_MATCH_RECURSION_LIMIT = 10000; +#define MAX_MAX_UE2_ALIGN 64 +#endif diff --git a/tools/hscollider/limit.cpp b/tools/hscollider/limit.cpp new file mode 100644 index 00000000..716a19e3 --- /dev/null +++ b/tools/hscollider/limit.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "limit.h" + +#include + +#if defined(HAVE_SETRLIMIT) +#include +#include +#include +#include +#include + +void setMemoryLimit(size_t mbytes) { + size_t bytes = mbytes * 1024 * 1024; + + struct rlimit r; + r.rlim_cur = bytes; + r.rlim_max = bytes; + + int rv = setrlimit(RLIMIT_DATA, &r); + if (rv != 0) { + std::cerr << "setrlimit(RLIMIT_DATA, ...) failed: " << + strerror(errno) << std::endl; + } + + rv = setrlimit(RLIMIT_AS, &r); + if (rv != 0) { + std::cerr << "setrlimit(RLIMIT_AS, ...) failed: " << + strerror(errno) << std::endl; + } +} +#else // no setrlimit +void setMemoryLimit(size_t) {} +#endif diff --git a/tools/hscollider/limit.h b/tools/hscollider/limit.h new file mode 100644 index 00000000..64d25abe --- /dev/null +++ b/tools/hscollider/limit.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LIMIT_H +#define LIMIT_H + +#include + +void setMemoryLimit(size_t mbytes); + +#endif // LIMIT_H diff --git a/tools/hscollider/main.cpp b/tools/hscollider/main.cpp new file mode 100644 index 00000000..b289135b --- /dev/null +++ b/tools/hscollider/main.cpp @@ -0,0 +1,2002 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "BoundedQueue.h" +#include "DatabaseProxy.h" +#include "FileCorpora.h" +#include "GraphTruth.h" +#include "GroundTruth.h" +#include "NfaGeneratedCorpora.h" +#include "Thread.h" +#include "UltimateTruth.h" +#include "args.h" +#include "common.h" +#include "cross_compile.h" +#include "expressions.h" +#include "limit.h" +#include "ng_corpus_properties.h" +#include "sig.h" +#include "simple_timer.h" +#include "util/expression_path.h" +#include "util/string_util.h" + +#include "grey.h" +#include "hs.h" +#include "parser/utf8_validate.h" +#include "ue2common.h" +#include "util/container.h" +#include "util/make_unique.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; +using namespace ue2; + +unsigned int numThreads = 1; +unsigned int numScannerThreads = 1; +unsigned int numGeneratorThreads = 1; +enum ColliderMode colliderMode = MODE_BLOCK; +bool echo_matches = false; +int g_quiet = 0; +bool g_verbose = false; +bool g_allSignatures = false; +string g_exprPath; +vector g_signatureFiles; +string g_cmdline; +bool g_ue2CompileAll = false; +unsigned g_streamBlocks = 0; +unsigned long long g_streamOffset = 0; +unsigned multicompile_bands = 0; +vector g_signatures; +unsigned long int g_matchLimit = DEFAULT_PCRE_MATCH_LIMIT; +unsigned long int g_matchLimitRecursion = DEFAULT_PCRE_MATCH_RECURSION_LIMIT; +string g_corpora_prefix; +string g_corpora_suffix; +size_t g_memoryLimit = 1000; // megabytes per thread +unsigned int somFlags = 0; +bool loadDatabases = false; +bool saveDatabases = false; +bool saveCorpora = false; +string saveCorporaFile; +string serializePath; +bool force_utf8 = false; +int force_prefilter = 0; +int no_groups = 0; +unsigned somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE; +unsigned limit_matches = 0; +unsigned randomSeed = 0; +bool use_random_alignment = false; +bool use_PCRE = true; 
+bool use_NFA = true; +bool use_UE2 = true; +bool use_copy_scratch = false; +bool use_copy_stream = false; +bool use_mangle_scratch = false; +bool use_compress_expand = false; +bool use_compress_reset_expand = false; +int abort_on_failure = 0; +int no_signal_handler = 0; +size_t max_scan_queue_len = 25000; +size_t max_generator_queue_len = 25000; +bool force_edit_distance = false; +unsigned edit_distance = 0; +CorpusProperties corpus_gen_prop; + +// Semi constants +unsigned min_ue2_align = 0; +unsigned max_ue2_align = MAX_MAX_UE2_ALIGN; + +#define DEDUPE_MATCHES + +static +unsigned countCores() { + unsigned count = std::thread::hardware_concurrency(); + return count ? count : 1; +} + +// Detect the Address Sanitizer with either GCC or Clang. +#if defined(__SANITIZE_ADDRESS__) +# define BUILT_WITH_ASAN +#elif defined(__has_feature) +# if __has_feature(address_sanitizer) +# define BUILT_WITH_ASAN +# endif +#endif + +// Set the default params that can be overridden with commandline args +static +void setDefaults() { + // Seed random number generator for corpora + randomSeed = time(nullptr); + // Overcommit since we have generators and scanners running. + numThreads = countCores() * 2; + +#ifdef BUILT_WITH_ASAN + cout << "NOTE: Built with AddressSanitizer.\n" + << "Defaulting to no memory limit and no signal handler.\n" + << endl; + g_memoryLimit = 0; + no_signal_handler = 1; +#endif +} + +static +void exit_with_fail(void) { + cout << "Failing cmdline was:\n " << g_cmdline << endl; + if (abort_on_failure) { + cout << "Calling abort()" << endl; + abort(); + } + exit(1); +} + +namespace /* anonymous */ { + +// For saving corpora out if the -w flag is specified. Note that we need a +// mutex to serialise writes from different threads. 
+class CorpusWriter { +public: + explicit CorpusWriter(const string &filename) + : out(filename.c_str(), ios_base::trunc) {} + + void write(const string &str) { + std::lock_guard lock(mutex); + out << str << flush; + } + +private: + ofstream out; + std::mutex mutex; +}; + +unique_ptr corporaOut = nullptr; + +// Encapsulates all of the data reported from a test +struct TestSummary { + unsigned totalCorpora = 0; + unsigned totalExpressions = 0; + unsigned failCorpora = 0; + unsigned failPcreCompile = 0; + unsigned failNGCompile = 0; + unsigned failUe2Compile = 0; + unsigned failCompileDifference = 0; // failed in pcre but not ue2 + unsigned failPcreScan = 0; + unsigned failNGScan = 0; + unsigned failUe2Scan = 0; + unsigned failDiff = 0; + unsigned failNoGroundTruth = 0; + set failIds; + set nogtIds; + + // true if we've got a failure + bool hasFailure() const { + return failDiff != 0 || !failIds.empty() || failCompileDifference != 0; + } + + void merge(const TestSummary &a) { + totalCorpora += a.totalCorpora; + totalExpressions += a.totalExpressions; + failCorpora += a.failCorpora; + failPcreCompile += a.failPcreCompile; + failNGCompile += a.failNGCompile; + failUe2Compile += a.failUe2Compile; + failCompileDifference += a.failCompileDifference; + failPcreScan += a.failPcreScan; + failNGScan += a.failNGScan; + failUe2Scan += a.failUe2Scan; + failDiff += a.failDiff; + failNoGroundTruth += a.failNoGroundTruth; + failIds.insert(begin(a.failIds), end(a.failIds)); + nogtIds.insert(begin(a.nogtIds), end(a.nogtIds)); + } +}; + +enum TestResult { + TEST_NO_GROUND_TRUTH, + TEST_PASSED, + TEST_SKIPPED, + TEST_FAILED_COMPILE, + TEST_FAILED +}; + +struct TestUnit { + shared_ptr pcre; // libpcre bytecode + shared_ptr cngi; // NFA graph info (compilation is deferred) + shared_ptr ue2; // ue2 bytecode + Corpus corpus; // a local copy, as we may modify it + + unsigned id; // expression id + unsigned corpus_id; // corpus id + bool highlander; // single match flag + bool prefilter; // 
prefilter flag + bool som; // start of match flag + bool multi; // if false, we're in single mode. + bool utf8; // at least one of our patterns is utf8 + + enum TestResult result; + + TestUnit(unsigned sig_id, unsigned c_id, const Corpus &c, + shared_ptr pcre_in, shared_ptr cngi_in, + shared_ptr ue2_in, bool multi_in, bool utf8_in, + bool highlander_in, bool prefilter_in, bool som_in) + : pcre(pcre_in), cngi(cngi_in), ue2(ue2_in), corpus(c), id(sig_id), + corpus_id(c_id), highlander(highlander_in), prefilter(prefilter_in), + som(som_in), multi(multi_in), utf8(utf8_in), + result(TEST_NO_GROUND_TRUTH) {} +}; + +} // namespace + +// For ease of printing match sets +static +std::ostream &operator<<(std::ostream &os, const set &v) { + auto vi = v.begin(), ve = v.end(); + while (vi != ve) { + // match offsets + os << '(' << vi->from << ',' << vi->to << ')'; + if (++vi != ve) { + os << ", "; + } + } + return os; +} + +static +void printCorpus(ostream &out, const Corpus &corpus) { + // Print the offending corpus + string corpus_data(corpus.data.begin() + g_corpora_prefix.size(), + corpus.data.end() - g_corpora_suffix.size()); + bool trimmed = false; + if (corpus_data.size() > 1000) { + corpus_data.resize(1000); + trimmed = true; + } + out << " Corpus data: '" << printable(corpus_data) << "'"; + if (trimmed) { + out << " ..."; + } + out << "\n"; +} + +static +void printGroundTruthDifference(ostream &out, const ExpressionMap &exprMap, + const TestUnit &unit, + const ResultSet &pcre_results, + const ResultSet &ngw_results) { + assert(contains(exprMap, unit.id)); + // Print the expression itself + out << " Expression: '" << exprMap.at(unit.id) << "'\n"; + printCorpus(out, unit.corpus); + out << " PCRE matches: " << pcre_results.matches << "\n"; + out << " NFA matches: " << ngw_results.matches << "\n"; + + vector diff; + + set_difference(pcre_results.matches.begin(), pcre_results.matches.end(), + ngw_results.matches.begin(), ngw_results.matches.end(), + back_inserter(diff)); + 
+ for (const auto &match : diff) { + out << " PCRE only: match (" << match.from << "," << match.to << ")\n"; + } + + diff.clear(); + + set_difference(ngw_results.matches.begin(), ngw_results.matches.end(), + pcre_results.matches.begin(), pcre_results.matches.end(), + back_inserter(diff)); + + for (const auto &match : diff) { + out << " NFA only: match (" << match.from << "," << match.to << ")\n"; + } + out.flush(); +} + +// Report the difference information when a pattern causes different matches in +// our engines. +static +void printDifference(ostream &out, const ExpressionMap &exprMap, + const TestUnit &unit, const ResultSet >_results, + const vector &ue2_results, + const vector &pass) { + assert(contains(exprMap, unit.id)); + // Print the expression itself + out << " Expression: '" << exprMap.at(unit.id) << "'\n"; + printCorpus(out, unit.corpus); + out << " " << gt_results.src << " matches: " << gt_results.matches << endl; + + for (u32 align = min_ue2_align; align < max_ue2_align; align++) { + if (pass[align]) { + continue; + } + + u32 align_in = align; + out << " UE2 (" << align; + while (align + 1 < max_ue2_align) { + if (pass[align + 1] || + ue2_results[align] != ue2_results[align + 1]) { + break; + } + align++; + } + + if (align != align_in) { + out << " - " << align; + } + + out << ") matches: " << ue2_results[align].matches; + out << endl; + + vector only; + + // Print matches only returned by ground truth + set_difference(gt_results.matches.begin(), + gt_results.matches.end(), + ue2_results[align].matches.begin(), + ue2_results[align].matches.end(), + back_inserter(only)); + for (const auto &match : only) { + out << " " << gt_results.src << " only: match (" + << match.from << "," << match.to << ')' << endl; + } + + // Print matches only returned by UE2 + only.clear(); + + set_difference(ue2_results[align].matches.begin(), + ue2_results[align].matches.end(), + gt_results.matches.begin(), + gt_results.matches.end(), + back_inserter(only)); + + for (const 
auto &match : only) { + out << " UE2 only: match (" << match.from << "," << match.to << ')' + << endl; + } + +#ifdef DEDUPE_MATCHES + for (const auto &match : ue2_results[align].dupe_matches) { + out << " UE2 dupe: match (" << match.from << "," << match.to + << ')' << endl; + } +#endif + + if (ue2_results[align].uoom) { + out << " *** UE2 produced matches out of order" << endl; + } + if (ue2_results[align].match_after_halt) { + out << " *** UE2 produced matches after termination" << endl; + } + if (ue2_results[align].invalid_id) { + out << " *** UE2 produced matches for invalid ids" << endl; + } + } +} + +static +void printMode(void) { + if (!g_ue2CompileAll) { + cout << "Single/"; + } else if (!multicompile_bands) { + cout << "Multi/"; + } else { + cout << "Multi-" << multicompile_bands << "/"; + } + + switch (colliderMode) { + case MODE_BLOCK: + cout << "Block"; + break; + case MODE_STREAMING: + cout << "Streaming-" << g_streamBlocks; + if (g_streamOffset) { + cout << " offset " << g_streamOffset; + } + if (use_copy_stream) { + cout << " [copy stream]"; + } + if (use_compress_expand) { + cout << " [compress]"; + } + if (use_compress_reset_expand) { + cout << " [compress+reset]"; + } + break; + case MODE_VECTORED: + cout << "Vectored-" << g_streamBlocks; + break; + } + + if (use_copy_scratch) { + cout << " [copy scratch]"; + } + if (use_mangle_scratch) { + cout << " [mangle]"; + } + cout << endl; +} + +static +void printSummaryV(const TestSummary &sum) { + cout << endl; + cout << "Summary:" << endl; + cout << "Mode: "; + printMode(); + cout << "=========" << endl; + cout << "Expressions processed: " << sum.totalExpressions << endl; + cout << "Corpora processed: " << sum.totalCorpora << endl; + cout << "Expressions with failures: " << sum.failIds.size() << endl; + cout << " Corpora generation failures: " << sum.failCorpora << endl; + cout << " Compilation failures: "; + cout << "pcre:" << sum.failPcreCompile << ", "; + cout << "ng:" << sum.failNGCompile << ", "; + 
cout << "ue2:" << sum.failUe2Compile << endl; + + cout << " Matching failures: "; + cout << "pcre:" << sum.failPcreScan << ", "; + cout << "ng:" << sum.failNGScan << ", "; + cout << "ue2:" << sum.failUe2Scan << endl; + cout << " Match differences: " << sum.failIds.size() << endl; + cout << " No ground truth: " << sum.nogtIds.size() << endl; + cout << "Total match differences: " << sum.failDiff << endl; +} + +static +void printSummaryQ(const TestSummary &sum) { + cout << "Summary: "; + printMode(); + + cout << "Processed: " << sum.totalExpressions << " expressions, " + << sum.totalCorpora << " corpora" << endl; + cout << "Failures: " << sum.failIds.size() + << " (corpora: " << sum.failCorpora << "; compile: "; + cout << "pcre:" << sum.failPcreCompile << ", "; + cout << "ng:" << sum.failNGCompile << ", "; + cout << "ue2:" << sum.failUe2Compile << "; match: "; + + cout << "pcre:" << sum.failPcreScan << ", "; + cout << "ng:" << sum.failNGScan << ", "; + cout << "ue2:" << sum.failUe2Scan << ")" << endl; + cout << "Differences: " << sum.failIds.size() << " expressions, " + << sum.failDiff << " total" << endl; + cout << "No ground truth: " << sum.nogtIds.size() << " expressions" << endl; +} + +static +void printSummary(const TestSummary &sum) { + if (g_quiet > 1) { + printSummaryQ(sum); + } else { + printSummaryV(sum); + } +} + +// Returns true if this Highlander mode test succeeded. +static +bool checkSingleMatch(const ResultSet &ground_truth, const ResultSet &ue2) { + // In Highlander (single-match) mode, UE2 must return only one of the + // matches returned by PCRE/GraphTruth. It need not be the earliest one. + if (ground_truth.matches.empty()) { + return ue2.matches.empty(); + } else if (ue2.matches.size() != 1) { + return false; + } else { + return contains(ground_truth.matches, *ue2.matches.begin()); + } +} + +// Returns true if this prefiltering mode test succeeded. 
+static +bool checkPrefilterMatch(const ResultSet &ground_truth, const ResultSet &ue2, + bool highlander) { + if (highlander) { + // Highlander + prefilter is tricky. Best we can do is say that if PCRE + // returns matches, UE2 must return a match, though it may not be one + // of the ones returned by PCRE (it may be an earlier match). + if (!ground_truth.matches.empty()) { + return ue2.matches.size() == 1; + } + // We can't verify anything more. + return true; + } else if (!limit_matches || ue2.matches.size() < limit_matches) { + // In prefilter mode, every match found by PCRE must be found by UE2, + // but the UE2 set may be a superset of the PCRE match set. + return std::includes(ue2.matches.begin(), ue2.matches.end(), + ground_truth.matches.begin(), ground_truth.matches.end()); + } + + // Otherwise, we've hit our match limit. Prefilter mode is quite difficult + // to verify in this case, so we just verify that "something happened". + return true; +} + +static +ResultSet makeEndOfMatchOnly(const ResultSet &rs) { + ResultSet out(rs.src); + for (const auto &match : rs.matches) { + out.addMatch(0, match.to); + } + return out; +} + +static +bool checkMultiMatch(const ResultSet &ground_truth, const ResultSet &ue2) { + // If we had out-of-order matches or matches after termination, we have a + // bug! + if (ue2.uoom || ue2.match_after_halt || ue2.invalid_id) { + return false; + } + + // If we have more UE2 matches than our limit, we have a bug! + if (limit_matches && ue2.matches.size() > limit_matches) { + return false; + } + + // If we have more UE2 matches than PCRE matches, we have a bug! + if (ue2.matches.size() > ground_truth.matches.size()) { + return false; + } + + // If we've got fewer matches than our limit to test, then the match sets + // must be identical. + if (!limit_matches || ground_truth.matches.size() < limit_matches) { + return ground_truth == ue2; + } + + // We're in limit_matches mode _and_ we have hit the limit. 
Every match in + // 'ue2' must be in 'pcre'. (We can't just trim pcre and do an equality + // test as matches may come out of UE2 a little out of order.) + + // In streaming mode, the limit may mean that we get a different SOM from + // the leftmost one. So we compare only end offsets. + if (colliderMode == MODE_STREAMING || colliderMode == MODE_VECTORED) { + ResultSet gt_eom = makeEndOfMatchOnly(ground_truth); + ResultSet ue2_eom = makeEndOfMatchOnly(ue2); + return std::includes(gt_eom.matches.begin(), gt_eom.matches.end(), + ue2_eom.matches.begin(), ue2_eom.matches.end()); + } + + return std::includes(ground_truth.matches.begin(), + ground_truth.matches.end(), + ue2.matches.begin(), ue2.matches.end()); +} + +// Check results, returns true if there has any failure. +static +bool checkTestResults(ostream &out, TestSummary &summary, + const ExpressionMap &exprMap, TestUnit &unit, + const ResultSet >_results, + const vector &ue2_results) { + bool failed = false; + bool any_fail = false; + vector pass(max_ue2_align, false); + + for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) { + if (unit.prefilter) { + failed = !checkPrefilterMatch(gt_results, ue2_results[align], + unit.highlander); + } else if (unit.highlander) { + failed = !checkSingleMatch(gt_results, ue2_results[align]); + } else { + // In non-Highlander mode, the two result sets MUST be equal + // don't check PCRE if the scan didn't succeed + failed = !checkMultiMatch(gt_results, ue2_results[align]); + } + +#ifdef DEDUPE_MATCHES + if (!failed) { + failed |= !ue2_results[align].dupe_matches.empty(); + } +#endif + + pass[align] = !failed; + + any_fail |= failed; + + summary.failDiff += failed ? 
1 : 0; + + if (g_verbose) { + if (failed) { + out << "FAILED: id " << unit.id << ", alignment " << align + << ", corpus " << unit.corpus_id << ", results differ" + << endl; + } else { + out << "PASSED: id " << unit.id << ", alignment " << align + << ", corpus " << unit.corpus_id + << " (matched "<< gt_results.src << ":" + << gt_results.matches.size() + << ", ue2:" << ue2_results[align].matches.size() << ")" + << endl; + } + } + } + + if (!any_fail) { + return false; + } + + if (!g_verbose) { + out << "FAILED: id " << unit.id << ", alignment"; + for (unsigned align = min_ue2_align; align != max_ue2_align; ++align) { + if (!pass[align]) { + out << " " << align; + + if (align + 1 < max_ue2_align && !pass[align + 1]) { + while (align + 1 < max_ue2_align && !pass[align + 1]) { + align++; + } + + out << "-" << align; + } + } + } + + out << ", corpus " << unit.corpus_id << ", results differ" << endl; + } + printDifference(out, exprMap, unit, gt_results, ue2_results, pass); + + return true; +} + +// Construct a UE2 database, taking care of loading/saving to disk when +// appropriate +static +shared_ptr constructDatabase(const set &ids, + const UltimateTruth &ultimate) { + assert(!ids.empty()); + + if (loadDatabases) { + string filename = ultimate.dbFilename(ids); + shared_ptr db = ultimate.loadDatabase(filename, ids); + if (!db) { + if (!g_quiet) { + cout << "FAILED: could not load database " << filename << endl; + } + return nullptr; + } + return make_shared(db); + } + + shared_ptr ue2 = make_shared(ids); + + try { + // If we're not runnable (i.e. we're cross-compiling), let's at least + // try to build the database. + if (!ultimate.runnable()) { + shared_ptr db = ue2->get(ultimate); + assert(db); // throws otherwise + } + + // Compile and save if we've been told to. 
+ if (saveDatabases) { + string filename = ultimate.dbFilename(ids); + if (!ultimate.saveDatabase(*(ue2->get(ultimate)), + filename.c_str())) { + cout << "FAILED: could not save database to file: " << filename + << endl; + } + } + } catch (const CompileFailed &fail) { + if (!g_quiet) { + cout << "FAILED: ue2 compile failed for " << *ids.begin() << ": " + << fail.error << endl; + } + // Return null database to indicate failure. + ue2 = nullptr; + } + + return ue2; +} + +static +bool getGraphTruth(ostream &out, CNGInfo &cngi, GraphTruth &graph, + TestUnit &unit, ResultSet &ngw_results, + TestSummary &summary, const string &expression) { + debug_stage = STAGE_GRAPH_RUN; + + // Skip patterns we've previously marked as bad. + if (cngi.is_bad()) { + summary.failNGScan++; + return false; + } + + // If we already have match information for this corpus, we don't need to + // run PCRE at all. At the moment our on-disk format for corpora with match + // information only includes the end-of-match offset, so we only use these + // in non-som modes. If edit distance is forced, all bets are off so we + // ignore this as well. + if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cngi.som && + !force_edit_distance) { + if (g_verbose) { + out << "Using corpus match set rather than NFA graph" << endl; + } + ngw_results = ResultSet(unit.corpus.matches, RESULT_FROM_GRAPH); + } else { + // compile the actual graph + const CompiledNG *cng; + try { + debug_stage = STAGE_GRAPH_COMPILE; + cng = cngi.get(); + debug_stage = STAGE_UNDEFINED; + } + catch (const NGCompileFailure &err) { + debug_stage = STAGE_UNDEFINED; + summary.failNGCompile++; + summary.failNGScan++; + cngi.mark_bad(); + if (!g_quiet) { + cout << "FAILED: id " << unit.id + << ", NFA graph compile failed (" << err.msg << ")" + << endl; + } + return false; + } + debug_stage = STAGE_GRAPH_RUN; + + // Run NFA graph and collect match information. 
+ string error; + assert(cng); + if (!graph.run(unit.id, *cng, cngi, unit.corpus.data, ngw_results, + error)) { + if (!g_quiet) { + out << "FAILED: id " << unit.id + << ", NFA graph scan failed: " << error << "\n" + << " Expression: '" << expression << "'\n" + << " Corpus data: '" << printable(unit.corpus.data) + << "'\n" + << " (note: marking bad, skipping subsequent tests)" + << endl; + } + summary.failNGScan++; + cngi.mark_bad(); + return false; + } + } + + return true; +} + +static +bool getGroundTruth(ostream &out, CompiledPcre &cpcre, GroundTruth &ground, + TestUnit &unit, ResultSet &pcre_results, + TestSummary &summary) { + debug_stage = STAGE_PCRE_RUN; + + // Skip patterns we've previously marked as bad. + if (cpcre.is_bad()) { + summary.failPcreScan++; + return false; + } + + // If we already have match information for this corpus, we don't need to + // run PCRE at all. At the moment our on-disk format for corpora with match + // information only includes the end-of-match offset, so we only use these + // in non-som modes. Also, we can't trust corpus matches if there was an + // edit distance requested for all patterns. + if (!g_streamOffset && unit.corpus.hasMatches && !force_utf8 && !cpcre.som + && !force_edit_distance) { + if (g_verbose) { + out << "Using corpus match set rather than PCRE" << endl; + } + pcre_results = ResultSet(unit.corpus.matches, RESULT_FROM_PCRE); + } else { + // Run PCRE and collect match information. 
+ string error; + if (!ground.run(unit.id, cpcre, unit.corpus.data, pcre_results, + error)) { + if (!g_quiet) { + out << "FAILED: id " << unit.id + << ", libpcre scan failed: " << error << "\n" + << " Expression: '" << cpcre.expression << "'\n" + << " Corpus data: '" << printable(unit.corpus.data) + << "'\n" + << " (note: marking PCRE bad, skipping subsequent tests)" + << endl; + } + summary.failPcreScan++; + cpcre.mark_bad(); + return false; + } + } + + return true; +} + +static +void writeCorpus(unsigned id, const Corpus &corpus, const ResultSet &results) { + assert(corporaOut); + ostringstream oss; + oss << id << "=\"" << printable(corpus.data) << "\": "; + + auto vi = results.matches.begin(); + auto ve = results.matches.end(); + + // Print match end offsets only. + while (vi != ve) { + oss << vi->to; + if (++vi != ve) { + oss << ","; + } + } + oss << "\n"; + corporaOut->write(oss.str()); +} + +static +void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph, + UltimateTruth &ultimate, TestUnit &unit, TestSummary &summary, + const ExpressionMap &exprMap) { + assert(use_UE2); + Corpus &corpus = unit.corpus; + + shared_ptr db; + if (use_UE2) { + // Acquire UE2 database. + debug_stage = STAGE_UE2_COMPILE; + try { + db = unit.ue2->get(ultimate); + } catch (const CompileFailed &fail) { + summary.failUe2Compile++; + if (!g_quiet) { + out << "FAILED: ue2 compile failed for " << unit.id << ": " + << fail.error << endl; + unit.result = TEST_FAILED_COMPILE; + debug_stage = STAGE_UNDEFINED; + return; + } + } + debug_stage = STAGE_UNDEFINED; + + if (!db) { + // Database previously failed compilation. + unit.result = TEST_SKIPPED; + return; + } + } + + // If the user has specified that they want prefix/suffix data added to + // their corpora, we do it here; this is as local as possible to the + // test, so we don't keep piles of HUGE corpora hanging around. 
+ if (!g_corpora_prefix.empty()) { + corpus.data.insert(0, g_corpora_prefix); + corpus.hasMatches = false; + } + if (!g_corpora_suffix.empty()) { + corpus.data.append(g_corpora_suffix); + corpus.hasMatches = false; + } + + ResultSet gt_results(RESULT_FROM_PCRE); + vector ue2_results(max_ue2_align, ResultSet(RESULT_FROM_UE2)); + + bool gt_done = false; + + // run PCRE test if enabled and if compile succeeded + if (unit.pcre && use_PCRE) { + gt_done = getGroundTruth(out, *unit.pcre, ground, unit, gt_results, + summary); + } + + // run the NFA graph if it is enabled and PCRE failed (or wasn't run) + if (unit.cngi && (use_NFA && !gt_done)) { + gt_done = getGraphTruth(out, *unit.cngi, graph, unit, gt_results, + summary, exprMap.find(unit.id)->second); + } + + // both ground truth methods either failed or didn't run + if (!gt_done) { + unit.result = TEST_NO_GROUND_TRUTH; + return; + } + + // Write out corpora if we've been told to + if (saveCorpora) { + writeCorpus(unit.id, unit.corpus, gt_results); + } + + debug_stage = STAGE_UE2_RUN; + for (unsigned int align = min_ue2_align; align != max_ue2_align; ++align) { + bool ok = ultimate.run(unit.id, db, corpus.data, !unit.multi, align, + ue2_results[align]); + + if (!ok) { + if (!g_quiet) { + out << "FAILED: id " << unit.id << ", ue2 scan at alignment " + << align << " failed" << endl; + } + unit.result = TEST_FAILED; + debug_stage = STAGE_UNDEFINED; + return; + } + } + + // if we're using UE2, check all the different results modes + if (checkTestResults(out, summary, exprMap, unit, gt_results, + ue2_results)) { + unit.result = TEST_FAILED; + } else { + unit.result = TEST_PASSED; + } + + debug_stage = STAGE_UNDEFINED; +} + +/* Used for testing the graph truth against PCRE */ +static +void runGroundCompTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph, + TestUnit &unit, TestSummary &summary, + const ExpressionMap &exprMap) { + assert(!use_UE2); + assert(use_PCRE); + assert(use_NFA); + Corpus &corpus = unit.corpus; + +
// If the user has specified that they want prefix/suffix data added to + // their corpora, we do it here; this is as local as possible to the + // test, so we don't keep piles of HUGE corpora hanging around. + if (!g_corpora_prefix.empty()) { + corpus.data.insert(0, g_corpora_prefix); + corpus.hasMatches = false; + } + if (!g_corpora_suffix.empty()) { + corpus.data.append(g_corpora_suffix); + corpus.hasMatches = false; + } + + ResultSet pcre_results(RESULT_FROM_PCRE); + ResultSet ngw_results(RESULT_FROM_GRAPH); + + bool pcreResult = false; + bool graphResult = false; + + if (unit.pcre) { + pcreResult = getGroundTruth(out, *unit.pcre, ground, unit, pcre_results, + summary); + } + + if (unit.cngi) { + graphResult = getGraphTruth(out, *unit.cngi, graph, unit, ngw_results, + summary, exprMap.find(unit.id)->second); + } + + // no ground truth found either NFA or PCRE failed + if (!pcreResult || !graphResult) { + unit.result = TEST_NO_GROUND_TRUTH; + return; + } + + // Write out corpora if we've been told to + if (saveCorpora) { + writeCorpus(unit.id, unit.corpus, pcre_results); + } + + if (pcre_results.matches != ngw_results.matches) { + unit.result = TEST_FAILED; + out << "FAILED: id " << unit.id << ", corpus " << unit.corpus_id + << ", results differ" << endl; + + printGroundTruthDifference(out, exprMap, unit, pcre_results, + ngw_results); + } else { + unit.result = TEST_PASSED; + if (g_verbose) { + out << "PASSED: id " << unit.id << ", corpus " << unit.corpus_id + << " (matched pcre:" << pcre_results.matches.size() + << ", matched ng:" << ngw_results.matches.size() << ")" << endl; + } + } + + debug_stage = STAGE_UNDEFINED; +} + +static +void addCorporaToQueue(ostream &out, BoundedQueue &testq, unsigned id, + CorporaSource &corpora, TestSummary &summary, + shared_ptr cpcre, shared_ptr cngi, + shared_ptr ue2, bool multi, bool utf8) { + // build corpora + vector c; + try { + corpora.generate(id, c); + } + catch (CorpusFailure &err) { + if (!g_quiet) { + out << "FAILED: 
id " << id << ", corpora failure: " << err.message + << endl; + } + summary.failCorpora++; + return; + } + + const bool som = cpcre ? cpcre->som : cngi->som; + const bool prefilter = cpcre ? cpcre->prefilter : cngi->prefilter; + const bool highlander = cpcre ? cpcre->highlander : cngi->highlander; + + // If we're in UTF-8 mode and the corpus isn't valid UTF-8, skip it: + // Hyperscan's behaviour when scanning invalid UTF-8 data in UTF-8 mode + // is undefined. + if (utf8) { + auto is_invalid_utf8 = [](const Corpus &corpus) { + return !isValidUtf8(corpus.data.c_str()); + }; + c.erase(remove_if(begin(c), end(c), is_invalid_utf8), end(c)); + } + + // Collect together corpora units in a container so that we don't have to + // repeatedly lock the queue. + vector> tests; + tests.reserve(c.size()); + + size_t corpus_id = 0; + for (const Corpus &corpus : c) { + tests.push_back(ue2::make_unique(id, corpus_id, corpus, cpcre, + cngi, ue2, multi, utf8, + highlander, prefilter, som)); + corpus_id++; + } + + testq.push(begin(tests), end(tests)); +} + +namespace /* anonymous */ { + +// A subclass of Thread that stores its own output in a stringstream, flushing +// it to cout when necessary. +class OutputThread : public Thread { +public: + OutputThread(size_t id) : Thread(id) {} + ~OutputThread() override { + flush_output(); + } + +protected: + void flush_output() { + const string &s = out.str(); + if (!s.empty()) { + cout << s; + out.str(""); // make empty + } + } + + // Output stream, flushed to cout after every test unit. 
+ stringstream out; +}; + +class ScanThread : public OutputThread { +public: + ScanThread(size_t id, BoundedQueue &testq, const ExpressionMap &e, + const hs_platform_info *plat, const Grey &grey) + : OutputThread(id), q(testq), + ground(out, e, g_matchLimit, g_matchLimitRecursion), graph(out, e), + ultimate(out, e, plat, grey, g_streamBlocks), exprMap(e) {} + + void run() override { + DEBUG_PRINTF("thread %zu running\n", thread_id); + for (;;) { + const auto unit = q.pop(thread_id); + if (!unit) { + // Sentinel value, indicates that we have run out of units to + // process. + DEBUG_PRINTF("thread %zu stopped\n", thread_id); + break; + } + + assert(unit); + assert(exprMap.find(unit->id) != exprMap.end()); + + // Debug information is stored in TLS and (hopefully) printed out in + // the event of a crash. + debug_expr = unit->id; + debug_corpus = unit->corpus_id; + debug_corpus_ptr = unit->corpus.data.c_str(); + debug_corpus_len = unit->corpus.data.size(); + debug_expr_ptr = exprMap.find(unit->id)->second.c_str(); + + if (use_UE2) { + runTestUnit(out, ground, graph, ultimate, *unit, summary, + exprMap); + } else { + runGroundCompTestUnit(out, ground, graph, *unit, summary, + exprMap); + } + + if (unit->result == TEST_NO_GROUND_TRUTH) { + summary.nogtIds.insert(unit->id); + // this is fine, continue + } else if (unit->result == TEST_FAILED) { + summary.failIds.insert(unit->id); + } + + count++; + summary.totalCorpora++; + flush_output(); + } + } + + const TestSummary &getSummary() const { return summary; } + +public: + size_t count = 0; // number of units processed + +private: + // Shared queue. + BoundedQueue &q; + + // Thread-local data. + GroundTruth ground; // independent copy + GraphTruth graph; // independent copy + UltimateTruth ultimate; // independent copy + TestSummary summary; + + // Constant shared data. + const ExpressionMap &exprMap; +}; + +/** Represent a work item for the corpus generation threads. 
This contains + * all information relating to an expression. The corpus generator will + * generate corpora for this expression and enqueue work items representing + * complete test cases for the scanning threads. + */ +struct CorpusGenUnit { + CorpusGenUnit(unique_ptr cngi_in, unique_ptr pcre_in, + shared_ptr ue2_in, unsigned expr_id, + bool multi_in, bool utf8_in) + : cngi(move(cngi_in)), pcre(move(pcre_in)), ue2(ue2_in), id(expr_id), + multi(multi_in), utf8(utf8_in) {} + + unique_ptr cngi; + unique_ptr pcre; + + /* ue2 shared_ptr as in multicompile and banded compile it is shared amongst + * various corpus units (with differing expression ids). */ + shared_ptr ue2; + + unsigned id; // expression id + bool multi; // ue2 contains more than one expression + bool utf8; // ue2 can be run against utf8 corpora +}; + +class CorpusGenThread : public OutputThread { +public: + CorpusGenThread(size_t id, BoundedQueue &testq_in, + BoundedQueue &corpq_in, + const CorporaSource &corpora_in) + : OutputThread(id), testq(testq_in), corpq(corpq_in), + corpora(corpora_in.clone()) {} + + void run() override { + DEBUG_PRINTF("thread %zu running\n", thread_id); + for (;;) { + auto c = corpq.pop(thread_id); + if (!c) { + break; + } + + addCorporaToQueue(out, testq, c->id, *corpora, summary, + move(c->pcre), move(c->cngi), c->ue2, c->multi, + c->utf8); + + count++; + flush_output(); + } + } + + const TestSummary &getSummary() const { return summary; } + +public: + size_t count = 0; // number of units processed + +private: + // Output queue, shared between threads. + BoundedQueue &testq; + + // Input queue, shared between corpus generator threads. + BoundedQueue &corpq; + + // Thread-local data. 
+ const unique_ptr corpora; // independent copy + TestSummary summary; +}; + +} // namespace + +static +unique_ptr makeNGInfo(const unsigned id, TestSummary &summary, + GraphTruth &graph, UltimateTruth &ultimate, + shared_ptr ue2) { + string nfaErr; + + try { + debug_stage = STAGE_GRAPH_PREPROCESS; + auto cngi = graph.preprocess(id); + debug_stage = STAGE_UNDEFINED; + return cngi; + } + catch (const NGCompileFailure &err) { + nfaErr = err.msg; + debug_stage = STAGE_UNDEFINED; + // fall through + } + catch (const NGUnsupportedFailure &err) { + // unsupported error happens when the pattern appears to be valid, but + // there are things that we don't yet support (e.g. SOM). + // in this case, try again, suppressing the errors + debug_stage = STAGE_UNDEFINED; + summary.failNGCompile++; + + // try again and suppress unsupported errors + try { + debug_stage = STAGE_GRAPH_PREPROCESS; + auto cngi = graph.preprocess(id, true); + debug_stage = STAGE_UNDEFINED; + + // preprocess succeeded - that means the pattern itself is valid. + // however, we can't use it, so we have to mark it as bad + // only print the error in the following cases: + // 1) if verbose is specified + // 2) if we are not using UE2 and quiet is NOT specified + if ((!use_UE2 && !g_quiet) || g_verbose) { + cout << "FAILED: id " << id << ", NFA graph preprocess failed (" + << err.msg << ")" << endl; + } + cngi->mark_bad(); + return cngi; + } + catch (const NGCompileFailure &e) { + // compile failed + nfaErr = e.msg; + debug_stage = STAGE_UNDEFINED; + // fall through + } + } + + // We should ensure that we also fail compilation with UE2, otherwise we + // likely have a pattern support bug. + try { + auto db = ue2->get(ultimate); + if (db) { + // if we made it this far, that means UE2 compile succeeded while + // NFA compile failed. + cout << "FAILED: id " << id << ", NFA graph preprocess failed (" + << nfaErr << ") but UE2 compile succeeded." 
<< endl; + summary.failNGCompile++; + summary.failCompileDifference++; + return nullptr; + } + // If db is nullptr, we have previously failed compilation of this + // database. + } + catch (const CompileFailed &) { + // Everything's OK: compilation failed in Hyperscan as well. Fall + // through. + } + summary.failNGCompile++; + if (!g_quiet) { + cout << "FAILED: id " << id << ", NFA graph preprocess failed (" + << nfaErr << ")" << endl; + } + return nullptr; +} + +static +unique_ptr makePcre(const unsigned id, TestSummary &summary, + GroundTruth &ground, UltimateTruth &ultimate, + shared_ptr ue2) { + string pcreErr; + + try { + debug_stage = STAGE_PCRE_COMPILE; + auto cpcre = ground.compile(id); + debug_stage = STAGE_UNDEFINED; + return cpcre; + } + catch (const SoftPcreCompileFailure &err) { + debug_stage = STAGE_UNDEFINED; + summary.failPcreCompile++; + if (g_verbose) { + cout << "FAILED: id " << id + << ", libpcre compile failed with soft error: " << err.msg + << endl; + } + return nullptr; + } + catch (const PcreCompileFailure &err) { + debug_stage = STAGE_UNDEFINED; + pcreErr = err.msg; + // fall through + } + + // We should ensure that we also fail compilation with UE2, otherwise we + // likely have a pattern support bug. + try { + auto db = ue2->get(ultimate); + if (db) { + // OK, so now we have a situation: PCRE failed but UE2 succeeded. + // There is one situation where this is legal: patterns beginning + // with (*UTF8), which will throw an error due to the callback + // wrapping we do for PCRE. We can check these by trying to compile + // an "unwrapped" PCRE. + ground.compile(id, true); + // If we didn't throw, PCRE failed above but succeeded when not + // wrapped in a callback, and UE2 succeeded. Not worth reporting, + // fall through. + } + } + catch (const CompileFailed &) { + // Everything's OK: compilation failed in Hyperscan as well. Fall + // through. 
+ } + catch (const PcreCompileFailure &) { + cout << "FAILED: id " << id << ", libpcre compile failed (" << pcreErr + << ") but UE2 compile succeeded." << endl; + summary.failPcreCompile++; + summary.failCompileDifference++; + return nullptr; + } + + if (!g_quiet) { + cout << "FAILED: id " << id << ", libpcre compile failed: " << pcreErr + << endl; + } + + summary.failPcreCompile++; + return nullptr; +} + +static +void drainGenerators(BoundedQueue &corpq, + vector> &generators, + TestSummary &summary) { + // Push a sentinel per thread. + for (size_t i = 0; i < generators.size(); i++) { + corpq.push(nullptr); + } + + // Wait for workers to end and retrieve their results. + for (auto &c : generators) { + c->join(); + summary.merge(c->getSummary()); + } +} + +// Note: In multi-pattern cases, utf8 is true if any pattern to be run against +// this corpus is in UTF-8 mode. +static +unique_ptr makeCorpusGenUnit(unsigned id, TestSummary &summary, + GroundTruth &ground, + GraphTruth &graph, + UltimateTruth &ultimate, + shared_ptr ue2, + bool multi, bool utf8) { + unique_ptr cpcre; + unique_ptr cngi; + + // compile PCRE bytecode + if (use_PCRE) { + cpcre = makePcre(id, summary, ground, ultimate, ue2); + } + if (use_NFA) { + cngi = makeNGInfo(id, summary, graph, ultimate, ue2); + } + + // if both compiles failed, skip the test + if (!cpcre && !cngi) { + return nullptr; + } + + // Caller may already have set the UTF-8 property (in multi cases) + utf8 |= cpcre ? cpcre->utf8 : cngi->utf8; + + return ue2::make_unique(move(cngi), move(cpcre), ue2, id, + multi, utf8); +} + +static +bool hasUTF8Pattern(GroundTruth &ground, ExpressionMap::const_iterator it, + ExpressionMap::const_iterator end) { + /* note: we cannot just check the flags as utf8 can be enabled in the + * pattern itself with (*UTF) */ + debug_stage = STAGE_PCRE_COMPILE; + for (; it != end; ++it) { + try { + auto cpcre = ground.compile(it->first); + assert(cpcre); // Would have thrown PcreCompileFailure otherwise. 
+ if (cpcre->utf8) { + DEBUG_PRINTF("UTF8 mode\n"); + debug_stage = STAGE_UNDEFINED; + return true; + } + } + catch (const PcreCompileFailure &) { + continue; + } + } + debug_stage = STAGE_UNDEFINED; + return false; +} + +// Fill a test queue with single-pattern tests. +static +void buildSingle(BoundedQueue &corpq, TestSummary &summary, + GroundTruth &ground, GraphTruth &graph, + UltimateTruth &ultimate, const ExpressionMap &exprMap) { + for (const auto &m : exprMap) { + unsigned id = m.first; + debug_expr = id; + debug_expr_ptr = m.second.c_str(); + + shared_ptr ue2 = constructDatabase({id}, ultimate); + if (!ue2) { + summary.failUe2Compile++; + continue; + } + + // if we're cross-compiling, then we don't bother building PCRE and + // running scans, we're just going to output the database bytecode. + if (!ultimate.runnable()) { + continue; + } + + bool multi = false; + bool utf8 = false; + auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2, + multi, utf8); + if (u) { + corpq.push(move(u)); + } + } +} + +// Fill a test queue with multi-pattern tests of size N, where N is the band +// size specified on the command line. 
+static +void buildBanded(BoundedQueue &corpq, TestSummary &summary, + GroundTruth &ground, GraphTruth &graph, + UltimateTruth &ultimate, const ExpressionMap &exprMap) { + for (auto i = exprMap.begin(), e = exprMap.end(); i != e;) { + debug_expr = i->first; + debug_expr_ptr = i->second.c_str(); + + // Build a set of IDs in this band from the expression map + set bandIds; + + if (g_verbose) { + cout << "Building set:"; + } + + ExpressionMap::const_iterator band_end = i; + for (u32 j = 0; j < multicompile_bands && band_end != e; + j++, ++band_end) { + bandIds.insert(bandIds.end(), band_end->first); + if (g_verbose) { + cout << " " << band_end->first; + } + } + + if (g_verbose) { + cout << endl; + } + + // compile UE2 bytecode + shared_ptr ue2 = constructDatabase(bandIds, ultimate); + if (!ue2) { + summary.failUe2Compile++; + i = band_end; + continue; + } + + // if we're cross-compiling, then we don't bother building PCRE and + // running scans, we're just going to output the database bytecode. + if (!ultimate.runnable()) { + i = band_end; + continue; + } + + bool utf8 = hasUTF8Pattern(ground, i, band_end); + + for (; i != band_end; ++i) { + unsigned id = i->first; + bool multi = true; + auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, + ue2, multi, utf8); + if (u) { + corpq.push(move(u)); + } + } + } +} + +// Fill a test queue with multi-pattern tests. +static +void buildMulti(BoundedQueue &corpq, TestSummary &summary, + GroundTruth &ground, GraphTruth &graph, UltimateTruth &ultimate, + const ExpressionMap &exprMap) { + // Build a set of all IDs from the expression map + set idsAll; + for (const auto &e : exprMap) { + idsAll.insert(e.first); + } + + // Compile in UE2 + shared_ptr ue2 = constructDatabase(idsAll, ultimate); + if (!ue2) { + summary.failUe2Compile++; + return; + } + + // if we're cross-compiling, then we don't bother building PCRE and + // running scans, we're just going to output the database bytecode. 
+ if (!ultimate.runnable()) { + return; + } + + bool utf8 = hasUTF8Pattern(ground, exprMap.begin(), exprMap.end()); + + for (const auto &m : exprMap) { + unsigned id = m.first; + debug_expr = id; + debug_expr_ptr = m.second.c_str(); + bool multi = true; + auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2, + multi, utf8); + if (u) { + corpq.push(move(u)); + } + } +} + +static +void generateTests(CorporaSource &corpora_src, const ExpressionMap &exprMap, + TestSummary &summary, const hs_platform_info *plat, + const Grey &grey, BoundedQueue &testq) { + GraphTruth graph(cout, exprMap); + GroundTruth ground(cout, exprMap, g_matchLimit, g_matchLimitRecursion); + UltimateTruth ultimate(cout, exprMap, plat, grey, g_streamBlocks); + + // Construct corpus generator queue and threads. + BoundedQueue corpq(numGeneratorThreads, + max_generator_queue_len); + vector> generators; + for (size_t i = 0; i < numGeneratorThreads; i++) { + auto c = make_unique(i, testq, corpq, corpora_src); + c->start(); + generators.push_back(move(c)); + } + + if (g_ue2CompileAll && multicompile_bands) { + printf("Running single-pattern/banded-multi-compile test for %zu " + "expressions.\n\n", exprMap.size()); + buildBanded(corpq, summary, ground, graph, ultimate, exprMap); + } else if (g_ue2CompileAll) { + printf("Running single-pattern/multi-compile test for %zu " + "expressions.\n\n", exprMap.size()); + buildMulti(corpq, summary, ground, graph, ultimate, exprMap); + } else { + printf("Running single-pattern/single-compile test for %zu " + "expressions.\n\n", exprMap.size()); + buildSingle(corpq, summary, ground, graph, ultimate, exprMap); + } + + drainGenerators(corpq, generators, summary); +} + +static +void printSettingsV(const vector &corporaFiles, + const hs_platform_info *platform) { + cout << "hscollider: The Pattern Collider Mark II\n\n" + << "Number of threads: " << numThreads << " (" << numScannerThreads + << " scanner, " << numGeneratorThreads << " generator)\n" + << 
"Expression path: " << g_exprPath << "\n" + << "Signature files: "; + if (g_signatureFiles.empty()) { + cout << "none" << endl; + } else { + for (unsigned i = 0; i < g_signatureFiles.size(); i++) { + string &fname = g_signatureFiles[i]; + if (i > 0) { + cout << string(20, ' '); + } + cout << fname << endl; + } + } + cout << "Mode of operation: "; + + switch (colliderMode) { + case MODE_BLOCK: cout << "block mode"; break; + case MODE_STREAMING: cout << "streaming mode"; break; + case MODE_VECTORED: cout << "vectored mode"; break; + } + cout << endl; + + if (limit_matches) { + cout << "Terminate scanning after " << limit_matches << " matches." + << endl; + } + + if (platform) { + cout << "Cross-compile for: " << to_string(*platform) << endl; + } + + if (loadDatabases) { + cout << "Loading DBs from: " << serializePath << endl; + } + if (saveDatabases) { + cout << "Saving DBs to: " << serializePath << endl; + } + if (colliderMode == MODE_STREAMING) { + cout << "Stream block count: " << g_streamBlocks << endl; + } + if (colliderMode == MODE_VECTORED) { + cout << "Vectored block count: " << g_streamBlocks << endl; + } + + if (use_UE2) { + if (max_ue2_align == min_ue2_align + 1) { + cout << "UE2 scan alignment: " << min_ue2_align << endl; + } else { + cout << "UE2 scan alignment: [" << min_ue2_align << ", " + << max_ue2_align << ")" << endl; + } + } + + if (!corporaFiles.empty()) { + for (const auto &file : corporaFiles) { + cout << "Corpora read from file: " << file << endl; + } + } else { + cout << "Corpora properties: \n" + << " random seed: " << corpus_gen_prop.getSeed() << "\n" + << " percentages: " << corpus_gen_prop.percentMatch() + << "% match, " + << corpus_gen_prop.percentUnmatch() << "% unmatch, " + << corpus_gen_prop.percentRandom() << "% random" << endl; + + // prefix and suffix info + const min_max &prefixSpan = corpus_gen_prop.prefixRange; + const min_max &suffixSpan = corpus_gen_prop.suffixRange; + if (prefixSpan.max) { + cout << " random prefix: " << 
prefixSpan.min << " to " + << prefixSpan.max << endl; + } else { + cout << " random prefix: none" << endl; + } + if (suffixSpan.max) { + cout << " random suffix: " << suffixSpan.min + << " to " << suffixSpan.max << endl; + } else { + cout << " random suffix: none" << endl; + } + + // cycle info + pair cycleSpan = corpus_gen_prop.getCycleLimit(); + cout << " follow cycles: " << cycleSpan.first << " to " + << cycleSpan.second << " times" << endl; + } + + if (saveCorpora) { + cout << "Saving corpora to: " << saveCorporaFile << endl; + } + + cout << endl; +} + +static +void printSettingsQ(const vector &corporaFiles, + const hs_platform_info *platform) { + cout << "Number of threads: " << numThreads << endl + << "Expression path: " << g_exprPath << endl + << "Signature files: "; + if (g_signatureFiles.empty()) { + cout << "none" << endl; + } else { + for (unsigned i = 0; i < g_signatureFiles.size(); i++) { + string &fname = g_signatureFiles[i]; + if (i > 0) { + cout << string(20, ' '); + } + cout << fname << endl; + } + } + cout << "Mode of operation: "; + + switch (colliderMode) { + case MODE_BLOCK: cout << "block mode"; break; + case MODE_STREAMING: cout << "streaming mode"; break; + case MODE_VECTORED: cout << "vectored mode"; break; + } + cout << endl; + + if (limit_matches) { + cout << "Terminate scanning after " << limit_matches << " matches." 
+ << endl; + } + + if (platform) { + cout << "Cross-compile for: " << to_string(*platform) << endl; + } + + if (colliderMode == MODE_STREAMING) { + cout << "Stream block count: " << g_streamBlocks << endl; + } + if (colliderMode == MODE_VECTORED) { + cout << "Vectored block count: " << g_streamBlocks << endl; + } + + if (max_ue2_align == min_ue2_align + 1) { + cout << "UE2 scan alignment: " << min_ue2_align << endl; + } else { + cout << "UE2 scan alignment: [" << min_ue2_align << ", " + << max_ue2_align << ")" << endl; + } + + if (!g_corpora_prefix.empty()) { + cout << "Prefix of " << g_corpora_prefix.size() << "bytes" << endl; + } + if (!g_corpora_suffix.empty()) { + cout << "Suffix of " << g_corpora_suffix.size() << "bytes" << endl; + } + + if (!corporaFiles.empty()) { + cout << "Corpora: from file" << endl; + } else { + cout << "Corpora: -R " << corpus_gen_prop.getSeed() << " -p " + << corpus_gen_prop.percentMatch() << "," + << corpus_gen_prop.percentUnmatch() << "," + << corpus_gen_prop.percentRandom(); + + // prefix and suffix info + const min_max &prefixSpan = corpus_gen_prop.prefixRange; + const min_max &suffixSpan = corpus_gen_prop.suffixRange; + if (prefixSpan.max) { + cout << " -P " << prefixSpan.min << "," << prefixSpan.max; + } + if (suffixSpan.max) { + cout << " -S " << suffixSpan.min << "," << suffixSpan.max; + } + + // cycle info + pair cycleSpan = corpus_gen_prop.getCycleLimit(); + cout << " -C " << cycleSpan.first << "," << cycleSpan.second; + cout << endl; + } +} + +static +void printSettings(const vector &c, const hs_platform_info *plat) { + if (g_quiet > 1) { + printSettingsQ(c, plat); + } else { + printSettingsV(c, plat); + } +} + +static +unique_ptr buildCorpora(const vector &corporaFiles, + const ExpressionMap &exprMap) { + if (!corporaFiles.empty()) { + auto c = ue2::make_unique(); + for (const auto &file : corporaFiles) { + if (!c->readFile(file)) { + cout << "Error reading corpora from file: " << file << endl; + exit_with_fail(); + } + } + 
return c;
+    } else {
+        auto c = ue2::make_unique(
+            exprMap, corpus_gen_prop, force_utf8, force_prefilter);
+        return c;
+    }
+}
+
+/**
+ * Returns true if the argument \p s must be wrapped in quotes when the
+ * command line is echoed back: that is, if it is empty or contains a blank
+ * (space or tab) character.
+ */
+static
+bool needsQuotes(const char *s) {
+    size_t len = strlen(s);
+    if (len == 0) {
+        return true;
+    }
+
+    // The <cctype> isblank() is undefined for negative values other than
+    // EOF, so every char is converted to unsigned char first (argv may hold
+    // non-ASCII bytes). Calling the one-argument form also picks the <cctype>
+    // function rather than the two-argument template in <locale>.
+    auto blank = [](unsigned char c) { return std::isblank(c) != 0; };
+    return find_if(s, s + len, blank) != s + len;
+}
+
+/**
+ * Rebuilds the command line from \p argv into the global g_cmdline, with
+ * arguments separated by single spaces. Arguments for which needsQuotes()
+ * is true are wrapped in double quotes; quotes embedded in an argument are
+ * not escaped.
+ */
+static
+void storeCmdline(int argc, char **argv) {
+    for (int i = 0; i < argc; i++) {
+        const char *s = argv[i];
+        if (needsQuotes(s)) {
+            g_cmdline += '"';
+            g_cmdline += s;
+            g_cmdline += '"';
+        } else {
+            g_cmdline += s;
+        }
+        if (i != argc - 1) {
+            g_cmdline += " ";
+        }
+    }
+}
+
+/**
+ * Runs the whole test set: starts numScannerThreads scanner threads that
+ * consume from a bounded queue, calls generateTests() to fill that queue,
+ * then shuts the scanners down and merges their per-thread summaries.
+ *
+ * Returns true if the merged summary reports no failure.
+ */
+static
+bool runTests(CorporaSource &corpora_source, const ExpressionMap &exprMap,
+              const hs_platform_info *plat, const Grey &grey) {
+    TestSummary summary;
+    summary.totalExpressions = exprMap.size();
+    BoundedQueue testq(numScannerThreads, max_scan_queue_len);
+
+    // Start scanning threads.
+    vector> scanners;
+    for (size_t i = 0; i < numScannerThreads; i++) {
+        auto s = ue2::make_unique(i, testq, exprMap, plat, grey);
+        s->start();
+        scanners.push_back(move(s));
+    }
+
+    generateTests(corpora_source, exprMap, summary, plat, grey, testq);
+
+    // Push a sentinel per scanning thread to ensure that everyone finishes
+    // work.
+    for (size_t i = 0; i < scanners.size(); i++) {
+        testq.push(nullptr);
+    }
+
+    // Wait for consumers to end and retrieve their results.
+    for (size_t i = 0; i < scanners.size(); i++) {
+        const auto &s = scanners[i];
+        s->join();
+
+        if (g_verbose) {
+            cout << "Thread " << i << " processed " << s->count << " units."
+                 << endl;
+        }
+
+        summary.merge(s->getSummary());
+    }
+
+    printSummary(summary);
+    return !summary.hasFailure();
+}
+
+/**
+ * Entry point: parses the command line, loads the expressions (optionally
+ * restricted to the requested signatures), builds the corpora source and
+ * runs the tests. Returns 0 on success; exit_with_fail() is called if any
+ * test failed or the serialize path cannot be stat()ed.
+ */
+int main(int argc, char *argv[]) {
+    Grey grey;
+    vector corporaFiles;
+
+    // Echo any "-G <value>" pairs up front, before the arguments are
+    // processed properly.
+    for (int i = 1; i < argc - 1; i++) {
+        if (!strcmp(argv[i], "-G")) {
+            cout << "Override: " << argv[i + 1] << endl;
+        }
+    }
+
+    setDefaults();
+    // Keep a copy of the command line; the crash handler prints g_cmdline.
+    storeCmdline(argc, argv);
+    unique_ptr plat;
+    corpus_gen_prop.seed(randomSeed);
+
+    processArgs(argc, argv, corpus_gen_prop, &corporaFiles, &grey, &plat);
+
+    // If the user has asked for a random alignment, we select it here (after
+    // random number seed applied).
+    if (use_random_alignment) {
+        min_ue2_align = corpus_gen_prop.rand(0, 15);
+        max_ue2_align = min_ue2_align + 1;
+    }
+
+    // Limit memory usage, unless the user has specified zero on the command
+    // line or in a config file.
+    if (g_memoryLimit) {
+        setMemoryLimit(g_memoryLimit * numThreads);
+    }
+
+    // Split threads available up amongst scanner and generator threads.
+    numGeneratorThreads = max(1u, static_cast(numThreads * 0.5));
+    numScannerThreads = max(1u, numThreads - numGeneratorThreads);
+
+    ExpressionMap exprMap;
+    loadExpressions(g_exprPath, exprMap);
+
+    // Unless every signature was requested, narrow the expression map to the
+    // signatures named in files or, failing that, given directly.
+    if (!g_allSignatures) {
+        SignatureSet signatures;
+        if (!g_signatureFiles.empty()) {
+            for (string &fname : g_signatureFiles) {
+                loadSignatureList(fname, signatures);
+            }
+        } else {
+            signatures.insert(signatures.end(), g_signatures.begin(),
+                              g_signatures.end());
+        }
+
+        exprMap = limitToSignatures(exprMap, signatures);
+    }
+
+    printSettings(corporaFiles, plat.get());
+
+    if (exprMap.empty()) {
+        cout << "Warning: no signatures to scan. Exiting." << endl;
+        exit(0);
+    }
+
+    if (!no_signal_handler) {
+        installSignalHandler();
+    }
+
+    // Fail early if databases are to be saved or loaded but the serialize
+    // path cannot be stat()ed.
+    if (saveDatabases || loadDatabases) {
+        struct stat st;
+        if (stat(serializePath.c_str(), &st) < 0) {
+            cout << "Unable to stat serialize path '" << serializePath
+                 << "': " << strerror(errno) << endl;
+            exit_with_fail();
+        }
+    }
+
+    // If we're saving corpora out, truncate the output file.
+    if (saveCorpora) {
+        corporaOut = ue2::make_unique(saveCorporaFile);
+    }
+
+    GroundTruth::global_prep();
+
+    auto corpora_source = buildCorpora(corporaFiles, exprMap);
+
+    if (!g_verbose && g_quiet < 2) {
+        cout << "Only failed tests are displayed." << endl;
+    }
+
+    SimpleTimer timer;
+    bool success = runTests(*corpora_source, exprMap, plat.get(), grey);
+    cout << "\nTotal elapsed time: " << timer.elapsed() << " secs." << endl;
+    exprMap.clear();
+
+    if (!success) {
+        exit_with_fail();
+    }
+
+    return 0;
+}
diff --git a/tools/hscollider/pcre_util.cpp b/tools/hscollider/pcre_util.cpp
new file mode 100644
index 00000000..0e1aa0ec
--- /dev/null
+++ b/tools/hscollider/pcre_util.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pcre_util.h"
+
+#include "hs.h"
+
+#include
+#include /* for pcre flags */
+
+/**
+ * Splits a Hyperscan flag word into PCRE compile flags plus three booleans.
+ *
+ * CASELESS, DOTALL, MULTILINE, UCP and UTF8 are OR-ed into \p flags as their
+ * PCRE_* counterparts. SINGLEMATCH, PREFILTER and SOM_LEFTMOST have no PCRE
+ * counterpart here and are reported through \p highlander, \p prefilter and
+ * \p som respectively. ALLOWEMPTY is ignored.
+ *
+ * All four output pointers must be non-null (asserted) and are reset before
+ * any flag is examined.
+ *
+ * Returns true if every bit of \p hs_flags was recognised. Otherwise it
+ * asserts, or returns false when assertions are compiled out.
+ */
+bool getPcreFlags(unsigned int hs_flags, unsigned int *flags,
+                  bool *highlander, bool *prefilter, bool *som) {
+    assert(flags);
+    assert(highlander);
+    assert(prefilter);
+    assert(som);
+    *flags = 0;
+    *highlander = false;
+    *prefilter = false;
+    *som = false;
+
+    // Each recognised flag is cleared from hs_flags once handled, so that
+    // whatever is left at the end is by definition unknown.
+    if (hs_flags & HS_FLAG_CASELESS) {
+        *flags |= PCRE_CASELESS;
+        hs_flags &= ~HS_FLAG_CASELESS;
+    }
+    if (hs_flags & HS_FLAG_DOTALL) {
+        *flags |= PCRE_DOTALL;
+        hs_flags &= ~HS_FLAG_DOTALL;
+    }
+    if (hs_flags & HS_FLAG_MULTILINE) {
+        *flags |= PCRE_MULTILINE;
+        hs_flags &= ~HS_FLAG_MULTILINE;
+    }
+    if (hs_flags & HS_FLAG_UCP) {
+        *flags |= PCRE_UCP;
+        hs_flags &= ~HS_FLAG_UCP;
+    }
+    if (hs_flags & HS_FLAG_UTF8) {
+        *flags |= PCRE_UTF8;
+        hs_flags &= ~HS_FLAG_UTF8;
+    }
+    if (hs_flags & HS_FLAG_SINGLEMATCH) {
+        *highlander = true;
+        hs_flags &= ~HS_FLAG_SINGLEMATCH;
+    }
+    if (hs_flags & HS_FLAG_PREFILTER) {
+        *prefilter = true;
+        hs_flags &= ~HS_FLAG_PREFILTER;
+    }
+    if (hs_flags & HS_FLAG_SOM_LEFTMOST) {
+        *som = true;
+        hs_flags &= ~HS_FLAG_SOM_LEFTMOST;
+    }
+
+    // Flags that are irrelevant to PCRE.
+    hs_flags &= ~HS_FLAG_ALLOWEMPTY;
+
+    if (hs_flags) {
+        // You've added new flags, haven't you?
+        assert(0);
+        return false;
+    }
+
+    return true;
+}
diff --git a/tools/hscollider/pcre_util.h b/tools/hscollider/pcre_util.h
new file mode 100644
index 00000000..87758873
--- /dev/null
+++ b/tools/hscollider/pcre_util.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PCRE_UTIL_H
+#define PCRE_UTIL_H
+
+/** Translates the given hyperscan flags into pcre flags (where appropriate)
+ * and other bools (for flags which are not directly translatable).
+ *
+ * \param hs_flags   Hyperscan HS_FLAG_* bits to translate.
+ * \param pcre_flags Receives the equivalent PCRE_* compile flags.
+ * \param highlander Set to true if HS_FLAG_SINGLEMATCH was present.
+ * \param prefilter  Set to true if HS_FLAG_PREFILTER was present.
+ * \param som        Set to true if HS_FLAG_SOM_LEFTMOST was present.
+ *
+ * None of the output pointers may be null.
+ *
+ * Returns false if an unknown hyperscan flag is encountered.
+ */
+bool getPcreFlags(unsigned int hs_flags, unsigned int *pcre_flags,
+                  bool *highlander, bool *prefilter, bool *som);
+
+#endif /* PCRE_UTIL_H */
+
diff --git a/tools/hscollider/sig.cpp b/tools/hscollider/sig.cpp
new file mode 100644
index 00000000..b48be98a
--- /dev/null
+++ b/tools/hscollider/sig.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "sig.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef HAVE_SIGACTION
+#include
+#endif
+
+#ifdef HAVE_BACKTRACE
+#include
+#include
+#endif
+
+#define BACKTRACE_BUFFER_SIZE 200
+
+// Per-thread record of what the thread is currently working on. Written by
+// the code under test elsewhere; read by sighandler() below when a fatal
+// signal arrives.
+TLS_VARIABLE volatile int debug_stage = STAGE_UNDEFINED;
+TLS_VARIABLE volatile int debug_expr = 0;
+TLS_VARIABLE const char * volatile debug_expr_ptr = nullptr;
+TLS_VARIABLE volatile int debug_corpus = 0;
+TLS_VARIABLE const char * volatile debug_corpus_ptr = nullptr;
+TLS_VARIABLE volatile size_t debug_corpus_len = 0;
+
+extern std::string g_cmdline;
+
+#ifdef HAVE_SIGACTION
+/**
+ * Fatal-signal handler: prints the signal number, the stored command line,
+ * the current expression and stage, the corpus being scanned (for the PCRE
+ * and UE2 run stages, with non-printable bytes escaped) and a backtrace
+ * where available, then terminates the process with _exit(signum).
+ */
+static void sighandler(int signum) {
+    /* NOTE: This signal handler is designed solely to provide more information
+     * when a crash occurs in ue2collider -- it makes calls to signal-unsafe
+     * functions like printf() and backtrace() by design, since we're already
+     * in deep trouble and are going to exit anyway. */
+
+    fflush(stdout);
+    printf("signal %d\n", signum);
+    printf("\nFailing cmdline was:\n%s\n\n", g_cmdline.c_str());
+    printf("expression %d ", debug_expr);
+    switch(debug_stage) {
+    case STAGE_UE2_COMPILE:
+        printf("ue2 compile\n");
+        break;
+    case STAGE_UE2_RUN:
+        printf("corpus %d ue2 scan\n", debug_corpus);
+        break;
+    case STAGE_PCRE_COMPILE:
+        printf("pcre compile\n");
+        break;
+    case STAGE_PCRE_RUN:
+        printf("corpus %d pcre scan\n", debug_corpus);
+        break;
+    case STAGE_GRAPH_PREPROCESS:
+        printf("graph preprocess\n");
+        break;
+    case STAGE_GRAPH_COMPILE:
+        printf("graph compile\n");
+        break;
+    case STAGE_GRAPH_RUN:
+        printf("corpus %d graph scan\n", debug_corpus);
+        break;
+    default:
+    case STAGE_UNDEFINED:
+        printf("unknown stage\n");
+        break;
+    }
+    printf("\n");
+
+    if (debug_expr_ptr) {
+        printf("expression %p\n", debug_expr_ptr);
+        printf("%d:%s\n\n", debug_expr, debug_expr_ptr);
+    }
+
+    if (debug_stage == STAGE_PCRE_RUN || debug_stage == STAGE_UE2_RUN) {
+        printf("corpus %p len %zu\n", debug_corpus_ptr, debug_corpus_len);
+
+        // Dump the corpus as "<expr>:<data>", escaping anything that is not
+        // printable ASCII (and the backslash itself).
+        printf("%d:", debug_expr);
+        for (size_t i = 0; i < debug_corpus_len && debug_corpus_ptr; i++) {
+            unsigned char c = debug_corpus_ptr[i];
+            if (c == '\n') {
+                printf("\\n");
+            } else if (c == '\t') {
+                printf("\\t");
+            } else if (c == '\r') {
+                printf("\\r");
+            } else if (0x20 <= c && c <= 0x7e && c != '\\') {
+                printf("%c", c);
+            } else {
+                printf("\\x%02hhx", c);
+            }
+        }
+        printf("\n\n");
+    }
+
+    fflush(stdout);
+
+#ifdef HAVE_BACKTRACE
+    static void *bt[BACKTRACE_BUFFER_SIZE];
+    int count = backtrace(bt, BACKTRACE_BUFFER_SIZE);
+    if (count) {
+        backtrace_symbols_fd(bt, count, STDOUT_FILENO);
+    } else {
+        printf("(Call to backtrace() returns zero count.)\n");
+    }
+#else
+    printf("(Backtrace unavailable on this platform.)\n");
+#endif
+
+    _exit(signum);
+}
+
+/**
+ * Fills \p act so that it dispatches to sighandler() with the other fatal
+ * signals blocked while the handler runs. Shared by installSignalHandler()
+ * and setSignalStack() so both register SIGSEGV with the same mask.
+ */
+static void initFatalSigaction(struct sigaction *act) {
+    memset(act, 0, sizeof(*act));
+    act->sa_handler = sighandler;
+    act->sa_flags = 0;
+    sigemptyset(&act->sa_mask);
+    sigaddset(&act->sa_mask, SIGSEGV);
+    sigaddset(&act->sa_mask, SIGBUS);
+    sigaddset(&act->sa_mask, SIGFPE);
+    sigaddset(&act->sa_mask, SIGILL);
+    sigaddset(&act->sa_mask, SIGABRT);
+}
+#endif // HAVE_SIGACTION
+
+/**
+ * Installs sighandler() for SIGBUS, SIGFPE, SIGILL, SIGABRT and SIGSEGV and
+ * sets up the calling thread's alternate signal stack. Does nothing when
+ * sigaction() is unavailable.
+ */
+void installSignalHandler(void) {
+#ifdef HAVE_SIGACTION
+    struct sigaction act;
+    initFatalSigaction(&act);
+    sigaction(SIGBUS, &act, nullptr);
+    sigaction(SIGFPE, &act, nullptr);
+    sigaction(SIGILL, &act, nullptr);
+    sigaction(SIGABRT, &act, nullptr);
+    sigaction(SIGSEGV, &act, nullptr);
+    setSignalStack();
+#endif // HAVE_SIGACTION
+}
+
+// The alternate stack needs sigaltstack() and, because the handler is
+// re-registered with SA_ONSTACK, sigaction() as well.
+#if defined(HAVE_SIGACTION) && defined(HAVE_SIGALTSTACK)
+// SIGSTKSZ is not a compile-time constant on glibc 2.34 and later, so it
+// cannot size a static array; use a fixed, generous size instead.
+#define ALT_STACK_SIZE (64 * 1024)
+static TLS_VARIABLE char alt_stack_loc[ALT_STACK_SIZE];
+#endif
+
+/**
+ * Gives the calling thread its own alternate signal stack and re-registers
+ * the SIGSEGV handler, adding SA_ONSTACK if the stack was set up. Must be
+ * called by every thread. Does nothing when sigaltstack() or sigaction() is
+ * unavailable.
+ */
+void setSignalStack(void) {
+#if defined(HAVE_SIGACTION) && defined(HAVE_SIGALTSTACK)
+    struct sigaction act;
+    initFatalSigaction(&act);
+    stack_t alt_stack;
+    memset(&alt_stack, 0, sizeof(alt_stack));
+    alt_stack.ss_flags = 0;
+    alt_stack.ss_size = sizeof(alt_stack_loc);
+    alt_stack.ss_sp = alt_stack_loc;
+    // Only ask for delivery on the alternate stack if it was registered.
+    if (!sigaltstack(&alt_stack, nullptr)) {
+        act.sa_flags |= SA_ONSTACK;
+    }
+    sigaction(SIGSEGV, &act, nullptr);
+#endif
+}
+
diff --git a/tools/hscollider/sig.h b/tools/hscollider/sig.h
new file mode 100644
index 00000000..fc643826
--- /dev/null
+++ b/tools/hscollider/sig.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SIG_H
+#define SIG_H
+
+#include // for size_t
+
+// Values for debug_stage: the phase of work the current thread is in. The
+// crash handler in sig.cpp prints the matching description.
+#define STAGE_UNDEFINED 0
+#define STAGE_UE2_COMPILE 1
+#define STAGE_UE2_RUN 2
+#define STAGE_PCRE_COMPILE 3
+#define STAGE_PCRE_RUN 4
+#define STAGE_GRAPH_PREPROCESS 5
+#define STAGE_GRAPH_COMPILE 6
+#define STAGE_GRAPH_RUN 7
+
+// Storage-class specifier that gives each thread its own copy of a variable.
+#define TLS_VARIABLE __thread
+
+// Per-thread description of the work in progress, reported by the crash
+// handler: current stage, expression id and text, and corpus id, data and
+// length.
+extern TLS_VARIABLE volatile int debug_stage;
+extern TLS_VARIABLE volatile int debug_expr;
+extern TLS_VARIABLE const char * volatile debug_expr_ptr;
+extern TLS_VARIABLE volatile int debug_corpus;
+extern TLS_VARIABLE const char * volatile debug_corpus_ptr;
+extern TLS_VARIABLE volatile size_t debug_corpus_len;
+
+// Installs the crash handler for the fatal signals and sets up the calling
+// thread's signal stack.
+void installSignalHandler(void);
+
+// Must be called by every thread.
+void setSignalStack(void);
+
+#endif
diff --git a/tools/hscollider/simple_timer.h b/tools/hscollider/simple_timer.h
new file mode 100644
index 00000000..a310de15
--- /dev/null
+++ b/tools/hscollider/simple_timer.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SIMPLE_TIMER_H
+#define SIMPLE_TIMER_H
+
+#include <chrono>
+
+/**
+ * Minimal stopwatch: starts timing on construction and reports the time
+ * elapsed since then on request.
+ */
+class SimpleTimer {
+public:
+    /** Starts the timer. */
+    SimpleTimer();
+
+    /** Returns the number of seconds elapsed since construction. */
+    double elapsed() const;
+private:
+    // A monotonic clock is used so that wall-clock adjustments made while
+    // the timer is running cannot distort (or make negative) the result.
+    std::chrono::steady_clock::time_point start;
+};
+
+// Both members are defined in this header, so they must be inline to allow
+// the header to be included from more than one translation unit.
+inline SimpleTimer::SimpleTimer() : start(std::chrono::steady_clock::now()) {}
+
+inline double SimpleTimer::elapsed() const {
+    const std::chrono::duration<double> delta =
+        std::chrono::steady_clock::now() - start;
+    return delta.count();
+}
+
+#endif // SIMPLE_TIMER_H