From f626276271fbc0ea209ada0e8c499c53a23284c8 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Wed, 14 Dec 2016 15:26:01 +1100 Subject: [PATCH] hsbench: add Hyperscan benchmarker The hsbench tool provides an easy way to measure Hyperscan's performance for a particular set of patterns and corpus of data to be scanned. --- cmake/sqlite3.cmake | 53 ++ tools/CMakeLists.txt | 19 + tools/hsbench/CMakeLists.txt | 36 ++ tools/hsbench/README.md | 8 + tools/hsbench/common.h | 42 ++ tools/hsbench/data_corpus.cpp | 133 ++++ tools/hsbench/data_corpus.h | 63 ++ tools/hsbench/engine_hyperscan.cpp | 411 ++++++++++++ tools/hsbench/engine_hyperscan.h | 97 +++ tools/hsbench/heapstats.cpp | 146 +++++ tools/hsbench/heapstats.h | 36 ++ tools/hsbench/huge.cpp | 201 ++++++ tools/hsbench/huge.h | 37 ++ tools/hsbench/main.cpp | 780 +++++++++++++++++++++++ tools/hsbench/scripts/CorpusBuilder.py | 58 ++ tools/hsbench/scripts/gutenbergCorpus.py | 68 ++ tools/hsbench/scripts/linebasedCorpus.py | 53 ++ tools/hsbench/scripts/pcapCorpus.py | 301 +++++++++ tools/hsbench/thread_barrier.h | 71 +++ tools/hsbench/timer.h | 59 ++ util/CMakeLists.txt | 16 +- util/cross_compile.cpp | 115 ++++ util/cross_compile.h | 42 ++ util/database_util.cpp | 155 +++++ util/database_util.h | 39 ++ util/expression_path.h | 107 ++++ 26 files changed, 3145 insertions(+), 1 deletion(-) create mode 100644 cmake/sqlite3.cmake create mode 100644 tools/CMakeLists.txt create mode 100644 tools/hsbench/CMakeLists.txt create mode 100644 tools/hsbench/README.md create mode 100644 tools/hsbench/common.h create mode 100644 tools/hsbench/data_corpus.cpp create mode 100644 tools/hsbench/data_corpus.h create mode 100644 tools/hsbench/engine_hyperscan.cpp create mode 100644 tools/hsbench/engine_hyperscan.h create mode 100644 tools/hsbench/heapstats.cpp create mode 100644 tools/hsbench/heapstats.h create mode 100644 tools/hsbench/huge.cpp create mode 100644 tools/hsbench/huge.h create mode 100644 tools/hsbench/main.cpp create mode 100755 
tools/hsbench/scripts/CorpusBuilder.py
 create mode 100755 tools/hsbench/scripts/gutenbergCorpus.py
 create mode 100755 tools/hsbench/scripts/linebasedCorpus.py
 create mode 100755 tools/hsbench/scripts/pcapCorpus.py
 create mode 100644 tools/hsbench/thread_barrier.h
 create mode 100644 tools/hsbench/timer.h
 create mode 100644 util/cross_compile.cpp
 create mode 100644 util/cross_compile.h
 create mode 100644 util/database_util.cpp
 create mode 100644 util/database_util.h
 create mode 100644 util/expression_path.h

diff --git a/cmake/sqlite3.cmake b/cmake/sqlite3.cmake
new file mode 100644
index 00000000..c07f1161
--- /dev/null
+++ b/cmake/sqlite3.cmake
@@ -0,0 +1,57 @@
+#
+# a lot of noise to find sqlite
+#
+
+option(SQLITE_PREFER_STATIC "Build sqlite3 statically instead of using an installed lib" OFF)
+
+if(NOT WIN32 AND NOT SQLITE_PREFER_STATIC)
+    find_package(PkgConfig QUIET)
+
+    # first check for sqlite on the system; pkg_check_modules() is only
+    # defined when pkg-config itself was found
+    if(PKG_CONFIG_FOUND)
+        pkg_check_modules(SQLITE3 sqlite3)
+    endif()
+endif()
+
+if (NOT SQLITE3_FOUND)
+    message(STATUS "looking for sqlite3 in source tree")
+    # look in the source tree
+    if (EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.h" AND
+        EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
+        message(STATUS " found sqlite3 in source tree")
+        set(SQLITE3_FOUND TRUE)
+        set(SQLITE3_BUILD_SOURCE TRUE)
+        set(SQLITE3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/sqlite3")
+        set(SQLITE3_LDFLAGS sqlite3_static)
+    else()
+        message(FATAL_ERROR " no sqlite3 in source tree")
+    endif()
+endif()
+
+# now do version checks
+if (SQLITE3_FOUND)
+    # temporarily expose the sqlite headers to the configure-time checks
+    list(INSERT CMAKE_REQUIRED_INCLUDES 0 "${SQLITE3_INCLUDE_DIRS}")
+    CHECK_C_SOURCE_COMPILES("#include <sqlite3.h>\n#if SQLITE_VERSION_NUMBER >= 3008007 && SQLITE_VERSION_NUMBER < 3008010\n#error broken sqlite\n#endif\nint main() {return 0;}" SQLITE_VERSION_OK)
+    if (NOT SQLITE_VERSION_OK)
+        message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version")
+    endif()
+    if (NOT SQLITE3_BUILD_SOURCE)
+        list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS})
+        CHECK_SYMBOL_EXISTS(sqlite3_open_v2 sqlite3.h HAVE_SQLITE3_OPEN_V2)
+        list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${SQLITE3_LDFLAGS})
+    else()
+        if (NOT TARGET sqlite3_static)
+            # build sqlite as a static lib to compile into our test programs
+            add_library(sqlite3_static STATIC "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
+            if (NOT WIN32)
+                set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION")
+            endif()
+        endif()
+    endif()
+    # drop the temporary include path again so later checks are unaffected
+    list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "${SQLITE3_INCLUDE_DIRS}")
+endif()
+
+# that's enough about sqlite
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
new file mode 100644
index 00000000..049fd368
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,19 @@
+find_package(Threads)
+
+# remove some warnings
+if(CMAKE_CXX_FLAGS MATCHES "-Wmissing-declarations" )
+    string(REPLACE "-Wmissing-declarations" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endif()
+
+include_directories(${PROJECT_SOURCE_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
+include_directories(${PROJECT_SOURCE_DIR}/util)
+
+# add any subdir with a cmake file
+file(GLOB dirents RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *)
+foreach(e ${dirents})
+    if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${e} AND
+            EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${e}/CMakeLists.txt)
+        add_subdirectory(${e})
+    endif ()
+endforeach ()
diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt
new file mode 100644
index 00000000..25a833d0
--- /dev/null
+++ b/tools/hsbench/CMakeLists.txt
@@ -0,0 +1,36 @@
+include (${CMAKE_MODULE_PATH}/sqlite3.cmake)
+
+if (NOT XCODE)
+    include_directories(SYSTEM ${SQLITE3_INCLUDE_DIRS})
+else()
+    # cmake doesn't think Xcode supports isystem
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${SQLITE3_INCLUDE_DIRS}")
+endif()
+
+CHECK_FUNCTION_EXISTS(malloc_info HAVE_MALLOC_INFO)
+CHECK_FUNCTION_EXISTS(shmget HAVE_SHMGET)
+set(HAVE_SHMGET ${HAVE_SHMGET} CACHE BOOL "shmget()")
+
+# only set these after all tests are done
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
+
+
+SET(hsbench_SOURCES
+    common.h
+    data_corpus.cpp
+    data_corpus.h
+    engine_hyperscan.cpp
+    engine_hyperscan.h
+    heapstats.cpp
+    heapstats.h
+    huge.cpp
+    huge.h
+    main.cpp
+    thread_barrier.h
+    timer.h
+)
+
+add_executable(hsbench ${hsbench_SOURCES})
+target_link_libraries(hsbench hs databaseutil expressionutil ${SQLITE3_LDFLAGS}
+    ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/hsbench/README.md b/tools/hsbench/README.md
new file mode 100644
index 00000000..344a6c00
--- /dev/null
+++ b/tools/hsbench/README.md
@@ -0,0 +1,8 @@
+Hyperscan Benchmarker: hsbench
+==============================
+
+The `hsbench` tool provides an easy way to measure Hyperscan's performance
+for a particular set of patterns and corpus of data to be scanned.
+
+Documentation describing its operation is available in the Tools section of the
+[Developer Reference Guide](http://01org.github.io/hyperscan/dev-reference/).
diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h
new file mode 100644
index 00000000..a4d60021
--- /dev/null
+++ b/tools/hsbench/common.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMMON_H +#define COMMON_H + +#include + +enum class ScanMode { BLOCK, STREAMING, VECTORED }; + +extern bool echo_matches; +extern bool saveDatabases; +extern bool loadDatabases; +extern std::string serializePath; +extern unsigned int somPrecisionMode; + +#endif // COMMON_H diff --git a/tools/hsbench/data_corpus.cpp b/tools/hsbench/data_corpus.cpp new file mode 100644 index 00000000..55bfe93a --- /dev/null +++ b/tools/hsbench/data_corpus.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "data_corpus.h" + +#include "util/container.h" +#include "ue2common.h" + +#include +#include +#include +#include +#include + +#include + +using namespace std; +using namespace ue2; + +static +void readRow(sqlite3_stmt *statement, vector &blocks, + map &stream_indices) { + unsigned int id = sqlite3_column_int(statement, 0); + unsigned int stream_id = sqlite3_column_int(statement, 1); + const char *blob = (const char *)sqlite3_column_blob(statement, 2); + unsigned int bytes = sqlite3_column_bytes(statement, 2); + + if (!contains(stream_indices, stream_id)) { + unsigned int internal_stream_index = stream_indices.size(); + stream_indices[stream_id] = internal_stream_index; + } + auto internal_stream_index = stream_indices[stream_id]; + + assert(blob || bytes > 0); + blocks.emplace_back(id, stream_id, internal_stream_index, + string(blob, blob + bytes)); +} + +vector readCorpus(const string &filename) { + int status; + sqlite3 *db = nullptr; + + status = sqlite3_open_v2(filename.c_str(), &db, SQLITE_OPEN_READONLY, + nullptr); + + assert(db); + if (status != SQLITE_OK) { + ostringstream err; + err << "Unable to open database '" << filename << "': " + << sqlite3_errmsg(db); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + throw DataCorpusError(err.str()); + } + + static const string query("SELECT id, stream_id, data " + "FROM chunk ORDER BY id;"); + + sqlite3_stmt *statement = nullptr; + + status = sqlite3_prepare_v2(db, query.c_str(), query.size(), &statement, + nullptr); + if (status != SQLITE_OK) { + status = sqlite3_finalize(statement); + assert(status == SQLITE_OK); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + + ostringstream oss; + oss << "Query failed: " << query; + throw DataCorpusError(oss.str()); + } + + vector blocks; + map stream_indices; + + status = sqlite3_step(statement); + while (status == SQLITE_ROW) { + readRow(statement, blocks, stream_indices); + status = sqlite3_step(statement); 
+ } + + if (status != SQLITE_DONE) { + ostringstream oss; + oss << "Error retrieving blocks from corpus: " + << sqlite3_errstr(status); + + status = sqlite3_finalize(statement); + assert(status == SQLITE_OK); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + + throw DataCorpusError(oss.str()); + } + + status = sqlite3_finalize(statement); + assert(status == SQLITE_OK); + status = sqlite3_close(db); + assert(status == SQLITE_OK); + + if (blocks.empty()) { + throw DataCorpusError("Database contains no blocks."); + } + + return blocks; +} diff --git a/tools/hsbench/data_corpus.h b/tools/hsbench/data_corpus.h new file mode 100644 index 00000000..91a87acc --- /dev/null +++ b/tools/hsbench/data_corpus.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DATACORPUS_H +#define DATACORPUS_H + +#include +#include + +class DataBlock { +public: + DataBlock(unsigned int in_id, unsigned int in_stream, + unsigned int int_stream_index_in, std::string in_data) + : id(in_id), stream_id(in_stream), + internal_stream_index(int_stream_index_in), + payload(std::move(in_data)) {} + + unsigned int id; // unique block identifier + unsigned int stream_id; // unique stream identifier (from corpus file) + unsigned int internal_stream_index; /* dense index for this stream + * (allocated by hsbench) */ + std::string payload; // actual block payload +}; + +/** Exception thrown if an error occurs. */ +class DataCorpusError { +public: + explicit DataCorpusError(std::string msg_in) : msg(std::move(msg_in)) {} + std::string msg; +}; + +/** + * Interface to a corpus database. Any error will produce a DataCorpusError + * and should be considered fatal. 
+ */ +std::vector readCorpus(const std::string &filename); + +#endif // DATACORPUS_H diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp new file mode 100644 index 00000000..f5abb9fa --- /dev/null +++ b/tools/hsbench/engine_hyperscan.cpp @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ExpressionParser.h" +#include "common.h" +#include "engine_hyperscan.h" +#include "expressions.h" +#include "heapstats.h" +#include "huge.h" +#include "timer.h" + +#include "crc32.h" +#include "database.h" +#include "hs_compile.h" +#include "hs_internal.h" +#include "hs_runtime.h" +#include "util/database_util.h" +#include "util/make_unique.h" + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +EngineContext::EngineContext(const hs_database_t *db) { + hs_alloc_scratch(db, &scratch); + assert(scratch); +} + +EngineContext::~EngineContext() { + hs_free_scratch(scratch); +} + +namespace /* anonymous */ { + +/** Scan context structure passed to the onMatch callback function. */ +struct ScanContext { + ScanContext(unsigned id_in, ResultEntry &result_in, + const EngineStream *stream_in) + : id(id_in), result(result_in), stream(stream_in) {} + unsigned id; + ResultEntry &result; + const EngineStream *stream; // nullptr except in streaming mode. +}; + +} // namespace + +/** + * Callback function called for every match that Hyperscan produces, used when + * "echo matches" is off. + */ +static +int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, + void *ctx) { + ScanContext *sc = static_cast(ctx); + assert(sc); + sc->result.matches++; + + return 0; +} + +/** + * Callback function called for every match that Hyperscan produces when "echo + * matches" is enabled. 
+ */ +static +int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, + unsigned int, void *ctx) { + ScanContext *sc = static_cast(ctx); + assert(sc); + sc->result.matches++; + + if (sc->stream) { + printf("Match @%u:%u:%llu for %u\n", sc->stream->sn, sc->id, to, id); + } else { + printf("Match @%u:%llu for %u\n", sc->id, to, id); + } + + return 0; +} + +EngineHyperscan::EngineHyperscan(hs_database_t *db_in) : db(db_in) { + assert(db); +} + +EngineHyperscan::~EngineHyperscan() { + release_huge(db); +} + +unique_ptr EngineHyperscan::makeContext() const { + return ue2::make_unique(db); +} + +void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ctx) const { + assert(data); + + ScanContext sc(id, result, nullptr); + auto callback = echo_matches ? onMatchEcho : onMatch; + hs_error_t rv = hs_scan(db, data, len, 0, ctx.scratch, callback, &sc); + + if (rv != HS_SUCCESS) { + printf("Fatal error: hs_scan returned error %d\n", rv); + abort(); + } +} + +void EngineHyperscan::scan_vectored(const char *const *data, + const unsigned int *len, unsigned int count, + unsigned streamId, ResultEntry &result, + EngineContext &ctx) const { + assert(data); + assert(len); + + ScanContext sc(streamId, result, nullptr); + auto callback = echo_matches ? 
onMatchEcho : onMatch; + hs_error_t rv = + hs_scan_vector(db, data, len, count, 0, ctx.scratch, callback, &sc); + + if (rv != HS_SUCCESS) { + printf("Fatal error: hs_scan_vector returned error %d\n", rv); + abort(); + } +} + +unique_ptr EngineHyperscan::streamOpen(EngineContext &ctx, + unsigned streamId) const { + auto stream = ue2::make_unique(); + stream->ctx = &ctx; + + hs_open_stream(db, 0, &stream->id); + if (!stream->id) { + // an error occurred, propagate to caller + return nullptr; + } + stream->sn = streamId; + return stream; +} + +void EngineHyperscan::streamClose(unique_ptr stream, + ResultEntry &result) const { + assert(stream); + + auto &s = static_cast(*stream); + EngineContext &ctx = *s.ctx; + + ScanContext sc(0, result, &s); + auto callback = echo_matches ? onMatchEcho : onMatch; + + assert(s.id); + hs_close_stream(s.id, ctx.scratch, callback, &sc); + s.id = nullptr; +} + +void EngineHyperscan::streamScan(EngineStream &stream, const char *data, + unsigned len, unsigned id, + ResultEntry &result) const { + assert(data); + + auto &s = static_cast(stream); + EngineContext &ctx = *s.ctx; + + ScanContext sc(id, result, &s); + auto callback = echo_matches ? onMatchEcho : onMatch; + hs_error_t rv = + hs_scan_stream(s.id, data, len, 0, ctx.scratch, callback, &sc); + + if (rv != HS_SUCCESS) { + printf("Fatal error: hs_scan_stream returned error %d\n", rv); + abort(); + } +} + +static +unsigned makeModeFlags(ScanMode scan_mode) { + switch (scan_mode) { + case ScanMode::BLOCK: + return HS_MODE_BLOCK; + case ScanMode::STREAMING: + return HS_MODE_STREAM; + case ScanMode::VECTORED: + return HS_MODE_VECTORED; + } + assert(0); + return HS_MODE_STREAM; +} + +/** + * Hash the settings used to compile a database, returning a string that can be + * used as a filename. 
+ */ +static +string dbSettingsHash(const string &filename, u32 mode) { + ostringstream info_oss; + + info_oss << filename.c_str() << ' '; + info_oss << mode << ' '; + + string info = info_oss.str(); + + u32 crc = Crc32c_ComputeBuf(0, info.data(), info.size()); + + // return STL string with printable version of digest + ostringstream oss; + oss << hex << setw(8) << setfill('0') << crc << dec; + + return oss.str(); +} + +static +string dbFilename(const std::string &name, unsigned mode) { + ostringstream oss; + oss << serializePath << '/' << dbSettingsHash(name, mode) << ".db"; + return oss.str(); +} + +std::unique_ptr +buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, + const std::string &name, UNUSED const ue2::Grey &grey) { + if (expressions.empty()) { + assert(0); + return nullptr; + } + + long double compileSecs = 0.0; + size_t compiledSize = 0.0; + size_t streamSize = 0; + size_t scratchSize = 0; + unsigned int peakMemorySize = 0; + unsigned int crc = 0; + std::string db_info; + + unsigned int mode = makeModeFlags(scan_mode); + + hs_database_t *db; + hs_error_t err; + + if (loadDatabases) { + db = loadDatabase(dbFilename(name, mode).c_str()); + if (!db) { + return nullptr; + } + } else { + const unsigned int count = expressions.size(); + + vector exprs; + vector flags, ids; + vector ext; + + for (const auto &m : expressions) { + string expr; + unsigned int f = 0; + hs_expr_ext extparam; + extparam.flags = 0; + if (!readExpression(m.second, expr, &f, &extparam)) { + printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(), + m.first); + return nullptr; + } + + exprs.push_back(expr); + ids.push_back(m.first); + flags.push_back(f); + ext.push_back(extparam); + } + + unsigned full_mode = mode; + if (mode == HS_MODE_STREAM) { + full_mode |= somPrecisionMode; + } + + // Our compiler takes an array of plain ol' C strings. 
+ vector patterns(count); + for (unsigned int i = 0; i < count; i++) { + patterns[i] = exprs[i].c_str(); + } + + // Extended parameters are passed as pointers to hs_expr_ext structures. + vector ext_ptr(count); + for (unsigned int i = 0; i < count; i++) { + ext_ptr[i] = &ext[i]; + } + + Timer timer; + timer.start(); + + hs_compile_error_t *compile_err; + +#ifndef RELEASE_BUILD + err = hs_compile_multi_int(patterns.data(), flags.data(), ids.data(), + ext_ptr.data(), count, full_mode, nullptr, + &db, &compile_err, grey); +#else + err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.data(), + ext_ptr.data(), count, full_mode, nullptr, + &db, &compile_err); +#endif + + timer.complete(); + compileSecs = timer.seconds(); + peakMemorySize = getPeakHeap(); + + if (err == HS_COMPILER_ERROR) { + if (compile_err->expression >= 0) { + printf("Compile error for signature #%u: %s\n", + compile_err->expression, compile_err->message); + } else { + printf("Compile error: %s\n", compile_err->message); + } + hs_free_compile_error(compile_err); + return nullptr; + } + } + + // copy the db into huge pages (where available) to reduce TLB pressure + db = get_huge(db); + if (!db) { + return nullptr; + } + + err = hs_database_size(db, &compiledSize); + if (err != HS_SUCCESS) { + return nullptr; + } + assert(compiledSize > 0); + + crc = db->crc32; + + if (saveDatabases) { + saveDatabase(db, dbFilename(name, mode).c_str()); + } + + if (mode & HS_MODE_STREAM) { + err = hs_stream_size(db, &streamSize); + if (err != HS_SUCCESS) { + return nullptr; + } + } else { + streamSize = 0; + } + + char *info; + err = hs_database_info(db, &info); + if (err != HS_SUCCESS) { + return nullptr; + } else { + db_info = string(info); + free(info); + } + + // Allocate scratch temporarily to find its size: this is a good test + // anyway. 
+ hs_scratch_t *scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + if (err != HS_SUCCESS) { + return nullptr; + } + + err = hs_scratch_size(scratch, &scratchSize); + if (err != HS_SUCCESS) { + return nullptr; + } + hs_free_scratch(scratch); + + // Output summary information. + printf("Signatures: %s\n", name.c_str()); + printf("Hyperscan info: %s\n", db_info.c_str()); + printf("Expression count: %'zu\n", expressions.size()); + printf("Bytecode size: %'zu bytes\n", compiledSize); + printf("Database CRC: 0x%x\n", crc); + if (mode & HS_MODE_STREAM) { + printf("Stream state size: %'zu bytes\n", streamSize); + } + printf("Scratch size: %'zu bytes\n", scratchSize); + printf("Compile time: %'0.3Lf seconds\n", compileSecs); + printf("Peak heap usage: %'u bytes\n", peakMemorySize); + + return ue2::make_unique(db); +} diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h new file mode 100644 index 00000000..7875decc --- /dev/null +++ b/tools/hsbench/engine_hyperscan.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ENGINEHYPERSCAN_H +#define ENGINEHYPERSCAN_H + +#include "expressions.h" +#include "common.h" +#include "hs_runtime.h" + +#include + +/** Structure for the result of a single complete scan. */ +struct ResultEntry { + double seconds = 0; //!< Time taken for scan. + unsigned int matches = 0; //!< Count of matches found. +}; + +/** Engine context which is allocated on a per-thread basis. */ +class EngineContext { +public: + explicit EngineContext(const hs_database_t *db); + ~EngineContext(); + + hs_scratch_t *scratch = nullptr; +}; + +/** Streaming mode scans have persistent stream state associated with them. */ +class EngineStream { +public: + hs_stream_t *id; + unsigned int sn; + EngineContext *ctx; +}; + +/** Hyperscan Engine for scanning data. 
*/ +class EngineHyperscan { +public: + explicit EngineHyperscan(hs_database_t *db); + ~EngineHyperscan(); + + std::unique_ptr makeContext() const; + + void scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ctx) const; + + void scan_vectored(const char *const *data, const unsigned int *len, + unsigned int count, unsigned int streamId, + ResultEntry &result, EngineContext &ctx) const; + + std::unique_ptr streamOpen(EngineContext &ctx, + unsigned id) const; + + void streamClose(std::unique_ptr stream, + ResultEntry &result) const; + + void streamScan(EngineStream &stream, const char *data, unsigned int len, + unsigned int id, ResultEntry &result) const; + +private: + hs_database_t *db; +}; + +namespace ue2 { +struct Grey; +} + +std::unique_ptr +buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, + const std::string &name, const ue2::Grey &grey); + +#endif // ENGINEHYPERSCAN_H diff --git a/tools/hsbench/heapstats.cpp b/tools/hsbench/heapstats.cpp new file mode 100644 index 00000000..d0dffdb3 --- /dev/null +++ b/tools/hsbench/heapstats.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Peak heap usage code. + * + * At present, we only have an implementation for modern glibc systems, using + * the malloc_info() call. We return zero elsewhere. + */ + +#include "config.h" + +#include "heapstats.h" + +#if defined HAVE_MALLOC_INFO + +#include +#include +#include +#include + +#include + +size_t getPeakHeap(void) { + FILE *tmpf = tmpfile(); + if (!tmpf) { + return 0; + } + + int rv = malloc_info(0, tmpf); + if (rv != 0) { + fclose(tmpf); + return 0; + } + + rewind(tmpf); + + // We don't want to depend on a real XML parser. This is ugly and brittle + // and hopefully good enough for the time being. We look for the last + // system tag with type max, which should be the malloc-wide one. + + static const char begin[] = " +#include +#include +#include + +#include +#include + +using namespace std; + +size_t getPeakHeap(void) { + // Modern Linux kernels write a 'VmPeak' value into /proc/$PID/status. This + // is a reasonable approximation, though it likely includes shared libs and + // the like as well... 
+ ostringstream path; + path << "/proc/" << getpid() << "/status"; + + ifstream f(path.str().c_str()); + if (!f.good()) { + return 0; + } + + const string vmpeak("VmPeak:"); + + string line; + while (getline(f, line)) { + istringstream iss(line, istringstream::in); + string word; + iss >> word; + if (word != vmpeak) { + continue; + } + + // Skip spaces + while (iss.good() && !isdigit(iss.peek())) { + iss.ignore(); + } + + size_t num = 0; + iss >> num; + return num * 1024; + } + + f.close(); + return 0; +} + +#else + +// Stub. +size_t getPeakHeap(void) { + return 0; +} + +#endif diff --git a/tools/hsbench/heapstats.h b/tools/hsbench/heapstats.h new file mode 100644 index 00000000..c2c37998 --- /dev/null +++ b/tools/hsbench/heapstats.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HEAPSTATS_H +#define HEAPSTATS_H + +#include // for size_t + +size_t getPeakHeap(void); + +#endif diff --git a/tools/hsbench/huge.cpp b/tools/hsbench/huge.cpp new file mode 100644 index 00000000..dbb453b2 --- /dev/null +++ b/tools/hsbench/huge.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "hs.h" +#include "ue2common.h" + +#include "common.h" +#include "huge.h" + +#ifndef _WIN32 +#include +#include +#include +#include +#include +#include +#include +#if defined(HAVE_SHMGET) +#include +#include +#endif + +UNUSED static int hsdb_shmid; + +using namespace std; + +long gethugepagesize(void); + +hs_database_t *get_huge(hs_database_t *db) { +#if defined(HAVE_SHMGET) && defined(SHM_HUGETLB) + /* move the database to huge pages where possible, but fail politely */ + hs_error_t err; + size_t len; + char *bytes; + + long hpage_size = gethugepagesize(); + if (hpage_size < 0) { + printf("Couldn't determine huge page size\n"); + hsdb_shmid = -1; + return db; + } + + err = hs_serialize_database(db, &bytes, &len); + if (err != HS_SUCCESS) { + printf("Failed to serialize database for copy: %d\n", err); + // this is weird - don't fail gracefully this time + return nullptr; + } + + size_t size; + err = hs_serialized_database_size(bytes, len, &size); + if (err != HS_SUCCESS) { + printf("Failed to get database size: %d\n", err); + // this is weird - don't fail gracefully this time + return nullptr; + } + + void *shmaddr; + if ((hsdb_shmid = shmget(IPC_PRIVATE, ROUNDUP_N(size, gethugepagesize()), + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { + // This could fail if the user doesn't have permission to shmget(), + // which is OK. 
+ goto fini; + } + + shmaddr = shmat(hsdb_shmid, nullptr, SHM_RND); + if (shmaddr == (char *)-1) { + perror("Shared memory attach failure"); + goto fini; + } + + // Mark this segment to be destroyed after this process detaches. + shmctl(hsdb_shmid, IPC_RMID, nullptr); + + err = hs_deserialize_database_at(bytes, len, (hs_database_t *)shmaddr); + if (err != HS_SUCCESS) { + printf("Failed to deserialize database into shm: %d\n", err); + shmdt((const void *)shmaddr); + goto fini; + } + + free(bytes); + hs_free_database(db); + return (hs_database_t *)shmaddr; + +fini: + free(bytes); + hsdb_shmid = -1; + return db; +#else + return db; +#endif +} + +void release_huge(hs_database_t *db) { +#if defined(HAVE_SHMGET) && defined(SHM_HUGETLB) + if (hsdb_shmid != -1) { + if (shmdt((const void *)db) != 0) { + perror("Detach failure"); + } + } else { + // fallback + hs_free_database(db); + } +#else + hs_free_database(db); +#endif +} + +#define BUF_SIZE 4096 +static long read_meminfo(const char *tag) { + int fd; + char buf[BUF_SIZE]; + int len; + char *p, *q; + long val; + + fd = open("/proc/meminfo", O_RDONLY); + if (fd < 0) { + perror("Couldn't open /proc/meminfo"); + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + perror("Error reading /proc/meminfo"); + return -1; + } + if (len == sizeof(buf)) { + printf("/proc/meminfo is too large\n"); + return -1; + } + buf[len] = '\0'; + + p = strstr(buf, tag); + if (!p) { + return -1; + } + + p += strlen(tag); + val = strtol(p, &q, 0); + if (!isspace(*q)) { + printf("Couldn't parse /proc/meminfo value\n"); + return -1; + } + + return val; +} + +long gethugepagesize(void) { + long hpage_size; + int hpage_kb; + + hpage_kb = read_meminfo("Hugepagesize:"); + if (hpage_kb < 0) { + hpage_size = -1; + } else { + /* convert from kb to bytes */ + hpage_size = 1024 * hpage_kb; + } + + return hpage_size; +} + +#else + +/* No huge page support on WIN32. 
*/ + +hs_database_t *get_huge(hs_database_t *db) { return db; } + +void release_huge(hs_database_t *db) { hs_free_database(db); } + +#endif diff --git a/tools/hsbench/huge.h b/tools/hsbench/huge.h new file mode 100644 index 00000000..da539bd6 --- /dev/null +++ b/tools/hsbench/huge.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef HUGE_H +#define HUGE_H + +#include "hs.h" + +hs_database_t *get_huge(hs_database_t *db); +void release_huge(hs_database_t *db); + +#endif /* HUGE_H */ diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp new file mode 100644 index 00000000..4298963b --- /dev/null +++ b/tools/hsbench/main.cpp @@ -0,0 +1,780 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "common.h" +#include "data_corpus.h" +#include "engine_hyperscan.h" +#include "expressions.h" +#include "thread_barrier.h" +#include "timer.h" +#include "util/expression_path.h" +#include "util/string_util.h" + +#include "grey.h" +#include "hs.h" +#include "ue2common.h" +#include "util/make_unique.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef _WIN32 +#include +#include +#endif + +#include +#include + +using namespace std; +using namespace ue2; +using boost::adaptors::map_keys; + +// Globals common to all files. +bool echo_matches = false; +bool saveDatabases = false; +bool loadDatabases = false; +string serializePath(""); +unsigned int somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE; + +namespace /* anonymous */ { + +// Globals local to this file. +bool display_per_scan = false; +ScanMode scan_mode = ScanMode::STREAMING; +unsigned repeats = 20; +string exprPath(""); +string corpusFile(""); +vector threadCores; +Timer totalTimer; +double totalSecs = 0; + +typedef void (*thread_func_t)(void *context); + +class ThreadContext : boost::noncopyable { +public: + ThreadContext(unsigned num_in, const EngineHyperscan &db_in, + thread_barrier &tb_in, thread_func_t function_in, + vector corpus_data_in) + : num(num_in), results(repeats), engine(db_in), + enginectx(db_in.makeContext()), corpus_data(move(corpus_data_in)), + tb(tb_in), function(function_in) {} + + // Start the thread. + bool start(int cpu) { + thr = thread(function, this); + + // affine if it's asked for + if (cpu >= 0) { + return affine(cpu); + } + return true; + } + + // Wait for the thread to exit. + void join() { + thr.join(); + } + + // Serialise all threads on a global barrier. + void barrier() { + tb.wait(); + } + + // Apply processor affinity (if available) to this thread. 
+ bool affine(UNUSED int cpu) { +#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + assert(cpu >= 0 && cpu < CPU_SETSIZE); + + // The 'clang' compiler complains about an unused result here, so we + // silence it. + (void)CPU_SET(cpu, &cpuset); + + int rv = pthread_setaffinity_np(thr.native_handle(), sizeof(cpuset), + &cpuset); + return (rv == 0); +#endif + return false; // not available + } + + unsigned num; + Timer timer; + vector results; + const EngineHyperscan &engine; + unique_ptr enginectx; + vector corpus_data; + +protected: + thread_barrier &tb; // shared barrier for time sync + thread_func_t function; + thread thr; +}; + +/** Display usage information, with an optional error. */ +static +void usage(const char *error) { + printf("Usage: hsbench [OPTIONS...]\n\n"); + printf("Options:\n\n"); + printf(" -h Display help and exit.\n"); + printf(" -G OVERRIDES Overrides for the grey box.\n"); + printf(" -e PATH Path to expression directory.\n"); + printf(" -s FILE Signature file to use.\n"); + printf(" -z NUM Signature ID to use.\n"); + printf(" -c FILE File to use as corpus.\n"); + printf(" -n NUMBER Repeat scan NUMBER times (default 20).\n"); + printf(" -N Benchmark in block mode" + " (default: streaming).\n"); + printf(" -V Benchmark in vectored mode" + " (default: streaming).\n"); + printf(" -T CPU,CPU,... Benchmark with threads on these CPUs.\n"); + printf(" -i DIR Don't compile, load from files in DIR" + " instead.\n"); + printf(" -w DIR After compiling, save to files in DIR.\n"); + printf(" -d NUMBER Set SOM precision mode (default: 8 (large)).\n"); + printf("\n"); + printf(" --per-scan Display per-scan Mbit/sec results.\n"); + printf(" --echo-matches Display all matches that occur during scan.\n"); + printf("\n\n"); + + if (error) { + printf("Error: %s\n", error); + } +} + +/** Wraps up a name and the set of signature IDs it refers to. 
*/ +struct BenchmarkSigs { + BenchmarkSigs(string name_in, SignatureSet sigs_in) + : name(move(name_in)), sigs(move(sigs_in)) {} + string name; + SignatureSet sigs; +}; + +/** Process command-line arguments. Prints usage and exits on error. */ +static +void processArgs(int argc, char *argv[], vector &sigSets, + UNUSED Grey &grey) { + const char options[] = "-b:c:Cd:e:G:hi:n:No:p:sT:Vw:z:"; + int in_sigfile = 0; + int do_per_scan = 0; + int do_echo_matches = 0; + vector sigFiles; + + static struct option longopts[] = { + {"per-scan", 0, &do_per_scan, 1}, + {"echo-matches", 0, &do_echo_matches, 1}, + {nullptr, 0, nullptr, 0} + }; + + for (;;) { + int c = getopt_long(argc, argv, options, longopts, nullptr); + if (c < 0) { + break; + } + switch (c) { + case 'c': + corpusFile.assign(optarg); + break; + case 'd': { + unsigned dist; + if (!fromString(optarg, dist)) { + usage("Must provide an integer argument to '-d' flag"); + exit(1); + } + switch (dist) { + case 2: + somPrecisionMode = HS_MODE_SOM_HORIZON_SMALL; + break; + case 4: + somPrecisionMode = HS_MODE_SOM_HORIZON_MEDIUM; + break; + case 8: + somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE; + break; + default: + usage("SOM precision must be 2, 4 or 8"); + exit(1); + } + break; + } + case 'e': + exprPath.assign(optarg); + break; +#ifndef RELEASE_BUILD + case 'G': + applyGreyOverrides(&grey, string(optarg)); + break; +#endif + case 'h': + usage(nullptr); + exit(0); + break; + case 'n': + if (!fromString(optarg, repeats) || repeats == 0) { + usage("Couldn't parse argument to -n flag, should be" + " a positive integer."); + exit(1); + } + break; + case 's': + in_sigfile = 2; + break; + case 'N': + scan_mode = ScanMode::BLOCK; + break; + case 'V': + scan_mode = ScanMode::VECTORED; + break; + case 'T': + if (!strToList(optarg, threadCores)) { + usage("Couldn't parse argument to -T flag, should be" + " a list of positive integers."); + exit(1); + } + break; + case 'z': { + unsigned int sinumber; + if (!fromString(optarg, 
sinumber)) { + usage("Argument to '-z' flag must be an integer"); + exit(1); + } + SignatureSet sigs = {sinumber}; + sigSets.emplace_back(string("-z ") + optarg, sigs); + break; + } + case 'i': + loadDatabases = true; + serializePath = optarg; + break; + case 'w': + saveDatabases = true; + serializePath = optarg; + break; + case 1: + if (in_sigfile) { + sigFiles.push_back(optarg); + in_sigfile = 2; + break; + } + case 0: + break; + default: + usage("Unrecognised command line argument."); + exit(1); + } + + if (in_sigfile) { + in_sigfile--; + } + } + + if (do_echo_matches) { + echo_matches = true; + } + if (do_per_scan) { + display_per_scan = true; + } + + if (exprPath.empty() && !sigFiles.empty()) { + /* attempt to infer an expression directory */ + auto si = sigFiles.begin(); + exprPath = inferExpressionPath(*si); + for (++si; si != sigFiles.end(); ++si) { + if (exprPath != inferExpressionPath(*si)) { + usage("Unable to infer consistent expression directory"); + exit(1); + } + } + } + + // Must have a valid expression path + if (exprPath.empty()) { + usage("Must specify an expression path with the -e option."); + exit(1); + } + + // Must have valid database to scan + if (corpusFile.empty()) { + usage("Must specify a corpus file with the -c option."); + exit(1); + } + + // Cannot ask for both loading and saving + if (loadDatabases && saveDatabases) { + usage("You cannot both load and save databases."); + exit(1); + } + + // Read in any -s signature sets. + for (const auto &file : sigFiles) { + SignatureSet sigs; + loadSignatureList(file, sigs); + sigSets.emplace_back(file, move(sigs)); + } +} + +/** Start the global timer. */ +static +void startTotalTimer(ThreadContext *ctx) { + if (ctx->num != 0) { + return; // only runs in the first thread + } + totalTimer.start(); +} + +/** Stop the global timer and calculate totals. 
*/ +static +void stopTotalTimer(ThreadContext *ctx) { + if (ctx->num != 0) { + return; // only runs in the first thread + } + totalTimer.complete(); + totalSecs = totalTimer.seconds(); +} + +/** Run a benchmark over a given engine and corpus in block mode. */ +static +void benchBlock(void *context) { + ThreadContext *ctx = (ThreadContext *)context; + + // Synchronization point + ctx->barrier(); + + startTotalTimer(ctx); + + for (ResultEntry &r : ctx->results) { + ctx->timer.start(); + + for (const DataBlock &block : ctx->corpus_data) { + ctx->engine.scan(block.payload.c_str(), block.payload.size(), + block.id, r, *ctx->enginectx); + } + + ctx->timer.complete(); + r.seconds = ctx->timer.seconds(); + } + + // Synchronization point + ctx->barrier(); + + // Now that all threads are finished, we can stop the clock. + stopTotalTimer(ctx); +} + +/** Structure used to represent a stream. */ +struct StreamInfo { + unsigned int stream_id = ~0U; + unsigned int first_block_id = ~0U; + unsigned int last_block_id = 0; + unique_ptr eng_handle; +}; + +static +u64a count_streams(const vector &corpus_blocks) { + set streams; + for (const DataBlock &block : corpus_blocks) { + streams.insert(block.stream_id); + } + + return (u64a)streams.size(); +} + +/** + * Take a ThreadContext and prepare a vector for streaming mode + * scanning from it. 
+ */ +static +vector prepStreamingData(const ThreadContext *ctx) { + vector info(count_streams(ctx->corpus_data)); + for (const DataBlock &block : ctx->corpus_data) { + assert(block.internal_stream_index < info.size()); + StreamInfo &si = info[block.internal_stream_index]; + + /* check if this is the first time we have encountered this stream */ + if (si.first_block_id > si.last_block_id) { + si.stream_id = block.stream_id; + si.first_block_id = block.id; + si.last_block_id = block.id; + } else { + assert(block.stream_id == si.stream_id); + assert(block.id > si.last_block_id); + assert(block.id > si.first_block_id); + si.last_block_id = block.id; + } + } + return info; +} + +static +void benchStreamingInternal(ThreadContext *ctx, vector &streams) { + assert(ctx); + const EngineHyperscan &e = ctx->engine; + const vector &blocks = ctx->corpus_data; + + for (ResultEntry &r : ctx->results) { + ctx->timer.start(); + + for (const auto &b : blocks) { + StreamInfo &stream = streams[b.internal_stream_index]; + assert(stream.stream_id == b.stream_id); + + // If this is the first block in the stream, open the stream + // handle. + if (b.id == stream.first_block_id) { + assert(!stream.eng_handle); + stream.eng_handle = e.streamOpen(*ctx->enginectx, b.stream_id); + if (!stream.eng_handle) { + printf("Fatal error: stream open failed!\n"); + exit(1); + } + } + + assert(stream.eng_handle); + + e.streamScan(*stream.eng_handle, b.payload.c_str(), + b.payload.size(), b.id, r); + + // if this was the last block in the stream, close the stream handle + if (b.id == stream.last_block_id) { + e.streamClose(move(stream.eng_handle), r); + stream.eng_handle = nullptr; + } + } + + ctx->timer.complete(); + r.seconds = ctx->timer.seconds(); + } +} + +/** Run a benchmark over a given engine and corpus in streaming mode. 
*/ +static +void benchStreaming(void *context) { + ThreadContext *ctx = (ThreadContext *)context; + vector streams = prepStreamingData(ctx); + + // Synchronization point + ctx->barrier(); + + startTotalTimer(ctx); + + benchStreamingInternal(ctx, streams); + + // Synchronization point + ctx->barrier(); + + // Now that all threads are finished, we can stop the clock. + stopTotalTimer(ctx); +} + +/** In-memory structure for a data block to be scanned in vectored mode. */ +struct VectoredInfo { + vector data; + vector len; + unsigned int stream_id; +}; + +/** + * Take a ThreadContext and prepare a vector for vectored mode + * scanning from it. + */ +static +vector prepVectorData(const ThreadContext *ctx) { + vector out(count_streams(ctx->corpus_data)); + for (const DataBlock &block : ctx->corpus_data) { + VectoredInfo &vi = out[block.internal_stream_index]; + if (vi.data.empty()) { + vi.stream_id = block.stream_id; + } else { + assert(vi.stream_id == block.stream_id); + } + vi.data.push_back(block.payload.c_str()); + vi.len.push_back(block.payload.size()); + } + + return out; +} + +/** Run a benchmark over a given engine and corpus in vectored mode. */ +static +void benchVectored(void *context) { + ThreadContext *ctx = (ThreadContext *)context; + + vector v_plans = prepVectorData(ctx); + + // Synchronization point + ctx->barrier(); + + startTotalTimer(ctx); + + for (ResultEntry &r : ctx->results) { + ctx->timer.start(); + + for (const VectoredInfo &v_plan : v_plans) { + ctx->engine.scan_vectored(&v_plan.data[0], &v_plan.len[0], + v_plan.data.size(), v_plan.stream_id, r, + *ctx->enginectx); + } + + ctx->timer.complete(); + r.seconds = ctx->timer.seconds(); + } + + // Synchronization point + ctx->barrier(); + + // Now that all threads are finished, we can stop the clock. + stopTotalTimer(ctx); +} + +/** Given a time and a size, compute the throughput in megabits/sec. 
*/ +static +long double calc_mbps(double seconds, u64a bytes) { + assert(seconds > 0); + return (long double)bytes / ((long double)seconds * 125000); +} + +/** Dump per-scan throughput data to screen. */ +static +void displayPerScanResults(const vector> &threads, + u64a bytesPerRun) { + for (const auto &t : threads) { + const auto &results = t->results; + for (size_t j = 0; j != results.size(); j++) { + const auto &r = results[j]; + double mbps = calc_mbps(r.seconds, bytesPerRun); + printf("T %2u Scan %2zu: %'0.2f Mbit/sec\n", t->num, j, mbps); + } + } + printf("\n"); +} + +static +u64a byte_size(const vector &corpus_blocks) { + u64a total = 0; + for (const DataBlock &block : corpus_blocks) { + total += block.payload.size(); + } + + return total; +} + +/** Dump benchmark results to screen. */ +static +void displayResults(const vector> &threads, + const vector &corpus_blocks) { + u64a bytesPerRun = byte_size(corpus_blocks); + u64a matchesPerRun = threads[0]->results[0].matches; + + // Sanity check: all of our results should have the same match count. 
+ for (const auto &t : threads) { + if (!all_of(begin(t->results), end(t->results), + [&matchesPerRun](const ResultEntry &e) { + return e.matches == matchesPerRun; + })) { + printf("\nWARNING: PER-SCAN MATCH COUNTS ARE INCONSISTENT!\n\n"); + break; + } + } + + printf("Time spent scanning: %'0.3f seconds\n", totalSecs); + printf("Corpus size: %'llu bytes ", bytesPerRun); + switch (scan_mode) { + case ScanMode::STREAMING: + printf("(%'zu blocks in %'llu streams)\n", corpus_blocks.size(), + count_streams(corpus_blocks)); + break; + case ScanMode::VECTORED: + printf("(%'zu blocks in %'llu vectors)\n", corpus_blocks.size(), + count_streams(corpus_blocks)); + break; + case ScanMode::BLOCK: + printf("(%'zu blocks)\n", corpus_blocks.size()); + break; + } + + u64a totalBytes = bytesPerRun * repeats * threads.size(); + u64a totalBlocks = corpus_blocks.size() * repeats * threads.size(); + + double matchRate = ((double)matchesPerRun * 1024) / bytesPerRun; + printf("Matches per iteration: %'llu (%'0.3f matches/kilobyte)\n", + matchesPerRun, matchRate); + + double blockRate = (double)totalBlocks / (double)totalSecs; + printf("Overall block rate: %'0.2f blocks/sec\n", blockRate); + printf("Overall throughput: %'0.2Lf Mbit/sec\n", + calc_mbps(totalSecs, totalBytes)); + printf("\n"); + + if (display_per_scan) { + displayPerScanResults(threads, bytesPerRun); + } +} + +/** + * Construct a thread context for this scanning mode. + * + * Note: does not take blocks by reference. This is to give every thread their + * own copy of the data. It would be unrealistic for every thread to be scanning + * the same copy of the data. 
+ */ +static +unique_ptr makeThreadContext(const EngineHyperscan &db, + const vector &blocks, + unsigned id, + thread_barrier &sync_barrier) { + thread_func_t fn = nullptr; + switch (scan_mode) { + case ScanMode::STREAMING: + fn = benchStreaming; + break; + case ScanMode::VECTORED: + fn = benchVectored; + break; + case ScanMode::BLOCK: + fn = benchBlock; + break; + } + assert(fn); + + return ue2::make_unique(id, db, sync_barrier, fn, blocks); +} + +/** Run the given benchmark. */ +static +void runBenchmark(const EngineHyperscan &db, + const vector &corpus_blocks) { + size_t numThreads; + bool useAffinity = false; + + if (threadCores.empty()) { + numThreads = 1; + } else { + numThreads = threadCores.size(); +#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP + useAffinity = true; +#else + useAffinity = false; +#endif + } + + // Initialise a barrier that will let us sync threads before/after scanning + // for timer measurements. + thread_barrier sync_barrier(numThreads); + + vector> threads; + + for (unsigned i = 0; i < numThreads; i++) { + auto t = makeThreadContext(db, corpus_blocks, i, sync_barrier); + int core = useAffinity ? (int)threadCores[i] : -1; + if (!t->start(core)) { + printf("Unable to start processing thread %u\n", i); + exit(1); + } + threads.push_back(move(t)); + } + + // Reap threads. + for (auto &t : threads) { + t->join(); + } + + // Display global results. + displayResults(threads, corpus_blocks); +} + +} // namespace + +/** Main driver. */ +int main(int argc, char *argv[]) { + Grey grey; + + setlocale(LC_ALL, ""); // use the user's locale + +#ifndef NDEBUG + printf("\nWARNING: DO NOT BENCHMARK A HYPERSCAN BUILD WITH ASSERTIONS\n\n"); +#endif + + vector sigSets; + processArgs(argc, argv, sigSets, grey); + + // read in and process our expressions + ExpressionMap exprMapTemplate; + loadExpressions(exprPath, exprMapTemplate); + + // If we have no signature sets, the user wants us to benchmark all the + // known expressions together. 
+ if (sigSets.empty()) { + SignatureSet sigs; + for (auto i : exprMapTemplate | map_keys) { + sigs.push_back(i); + } + sigSets.emplace_back(exprPath, move(sigs)); + } + + // read in and process our corpus + vector corpus_blocks; + try { + corpus_blocks = readCorpus(corpusFile); + } catch (const DataCorpusError &e) { + printf("Corpus data error: %s\n", e.msg.c_str()); + return 1; + } + + for (const auto &s : sigSets) { + ExpressionMap exprMap = exprMapTemplate; // copy + + limitBySignature(exprMap, s.sigs); + if (exprMap.empty()) { + continue; + } + + auto engine = buildEngineHyperscan(exprMap, scan_mode, s.name, grey); + if (!engine) { + printf("Error: expressions failed to compile.\n"); + exit(1); + } + + printf("\n"); + + runBenchmark(*engine, corpus_blocks); + } + + return 0; +} diff --git a/tools/hsbench/scripts/CorpusBuilder.py b/tools/hsbench/scripts/CorpusBuilder.py new file mode 100755 index 00000000..5baed2bd --- /dev/null +++ b/tools/hsbench/scripts/CorpusBuilder.py @@ -0,0 +1,58 @@ +#!/usr/bin/python + +''' +A module to construct corpora databases for the Hyperscan benchmarker +(hsbench). + +After construction, simply add blocks with the add_chunk() method, then call +finish() when you're done. 
+''' + +import os.path + +try: + from sqlite3 import dbapi2 as sqlite +except: + from pysqlite2 import dbapi2 as sqlite + +class CorpusBuilder: + SCHEMA = ''' +CREATE TABLE chunk ( + id integer primary key, + stream_id integer not null, + data blob +); +''' + + def __init__(self, outfile): + if os.path.exists(outfile): + raise RuntimeError("Database '%s' already exists" % outfile) + self.outfile = outfile + self.db = sqlite.connect(self.outfile) + self.db.executescript(CorpusBuilder.SCHEMA) + self.current_chunk_id = 0; + + def add_chunk(self, stream_id, data): + chunk_id = self.current_chunk_id; + c = self.db.cursor() + q = 'insert into chunk (id, stream_id, data) values (?, ?, ?)' + c.execute(q, (chunk_id, stream_id, sqlite.Binary(data))) + self.current_chunk_id += 1 + return chunk_id + + def finish(self): + self.db.commit() + + c = self.db.cursor() + q = 'create index chunk_stream_id_idx on chunk(stream_id)' + c.execute(q) + + c = self.db.cursor() + q = 'vacuum' + c.execute(q) + + c = self.db.cursor() + q = 'analyze' + c.execute(q) + + self.db.commit() diff --git a/tools/hsbench/scripts/gutenbergCorpus.py b/tools/hsbench/scripts/gutenbergCorpus.py new file mode 100755 index 00000000..fa1b1570 --- /dev/null +++ b/tools/hsbench/scripts/gutenbergCorpus.py @@ -0,0 +1,68 @@ +#!/usr/bin/python + +''' +This script creates a Hyperscan benchmarking corpus database from a supplied +group of Project Gutenberg texts. 
+''' + +import sys, getopt, os.path +import gutenberg.acquire, gutenberg.cleanup, gutenberg.query +from CorpusBuilder import CorpusBuilder + +stream_id = 0 +stream_bytes = 0 + +def addBlocks(builder, block_size, stream_size, text_id, text): + global stream_id + global stream_bytes + + print "text", text_id, "len", len(text) + i = 0 + while i < len(text): + chunk = text[i:min(len(text), i + block_size)] + builder.add_chunk(stream_id, chunk) + i += block_size + stream_bytes += len(chunk) + if stream_bytes >= stream_size: + stream_id += 1 + stream_bytes = 0 + print "Text", text_id, ": added", i/block_size, "blocks of", block_size, "bytes." + +def buildCorpus(outFN, block_size, stream_size, text_ids): + if len(text_ids) == 0: + print >>sys.stderr, "Must provide at least one input ID" + sys.exit(0) + + builder = CorpusBuilder(outFN) + + total_bytes = 0 + stream_id = 0 + stream_bytes = 0 + + for text_id in text_ids: + text_id = int(text_id) + text = gutenberg.acquire.load_etext(text_id) + text = gutenberg.cleanup.strip_headers(text).strip() + addBlocks(builder, block_size, stream_size, text_id, text) + total_bytes += len(text) + + builder.finish() + + print "Total:", total_bytes, "bytes." + +def usage(exeName): + errmsg = "Usage: %s -o -b -s ..." 
def lineCorpus(inFN, outFN):
    '''
    Read lines from file name @inFN and write them as blocks to a new db with
    name @outFN.

    Each line becomes one block, with trailing whitespace (including the line
    terminator) removed; all blocks go into a single stream. Exits with status
    -1 if @inFN does not exist, and with status 0 if it contains no lines.
    '''

    if not os.path.exists(inFN):
        # Report the input path: that is the file that is missing.
        sys.stderr.write("Input file '%s' does not exist. Exiting.\n" % inFN)
        sys.exit(-1)

    # Close the input promptly rather than leaving it to the garbage collector.
    with open(inFN) as infile:
        lines = infile.readlines()

    if len(lines) == 0:
        sys.stderr.write("Input file contained no lines. Exiting.\n")
        sys.exit(0)

    builder = CorpusBuilder(outFN)

    # write a single stream to contain everything
    streamId = 0

    for l in lines:
        builder.add_chunk(streamId, l.rstrip())

    builder.finish()
class TcpSegment:
    """Definition of a TCP segment

    Holds the 5-tuple, the raw TCP header and the payload of one segment, and
    decodes the sequence and acknowledgement numbers from header bytes 4..11
    (network byte order). Segments order by sequence number.
    """
    def __init__(self, five_tuple, header, payload):
        self.five_tuple = five_tuple
        self.tcp_header = header
        self.tcp_payload = payload
        self.tcp_sequence_number, self.tcp_acknowledgement_number = struct.unpack('!LL', header[4:12])

    def _flag_bits(self):
        # Header byte 13 masked with 0x3F. Going through bytearray yields an
        # int whether the header is a Python 2 str or a Python 3 bytes object.
        return bytearray(self.tcp_header[13:14])[0] & 0x3F

    def opt_isset_FIN(self):
        """Return a non-zero value if the FIN bit (0x01) is set, else 0."""
        return (self._flag_bits() & 0x01)

    def opt_isset_SYN(self):
        """Return a non-zero value if the SYN bit (0x02) is set, else 0."""
        return (self._flag_bits() & 0x02)

    def get_sequence_number(self):
        return self.tcp_sequence_number

    def __cmp__(self, other):
        # Three-way comparison consulted by Python 2; written without the
        # cmp() builtin, which Python 3 does not provide.
        mine, theirs = self.tcp_sequence_number, other.tcp_sequence_number
        return (mine > theirs) - (mine < theirs)

    def __lt__(self, other):
        # Rich comparison used by sorted(); Python 3 ignores __cmp__.
        return self.tcp_sequence_number < other.tcp_sequence_number
def process_tcp_segment(builder, segment):
    """Add a TCP segment to the stream for its 5-tuple, creating the stream
    if this is the first segment seen for that 5-tuple.

    A segment with SYN set discards everything accumulated for the stream so
    far, including the SYN segment itself. A segment with FIN set writes the
    stream's segments to the database through @builder and forgets the
    stream, so that the same 5-tuple can be used again later.
    """
    segment_id = str(segment.five_tuple)

    m_tcp_stream = tcp_streams.get(segment_id)
    if m_tcp_stream is None:
        m_tcp_stream = TcpStream(segment.five_tuple)
        tcp_streams[segment_id] = m_tcp_stream
    m_tcp_stream.append_segment(segment)

    if segment.opt_isset_SYN():
        # Use the stream's own reset helper rather than clearing its segment
        # list by hand, so all per-stream state is reset together.
        m_tcp_stream.reset_stream()

    if segment.opt_isset_FIN():
        #
        # Finished with the stream - add the segments in the
        # stream to db allowing the stream to be reused.
        #
        db_add_tcp_stream_segments(builder, m_tcp_stream)
        del tcp_streams[segment_id]
def db_add_tcp_stream_segments(builder, tcp_stream):
    """Add the contents of a tcp stream to the database.

    Segments are visited in sequence-number order. A segment is written as a
    chunk only if it carries a payload and its sequence number is greater
    than that of the last segment written, so repeated sequence numbers are
    written once. A stream with no payload at all creates no stream id.
    """
    tcp_segments = tcp_stream.get_segments_sorted()
    # None rather than 0: zero is a legitimate sequence number, and a numeric
    # sentinel would cause a first segment numbered 0 to be skipped.
    last_sequence_num = None
    streamID = None

    for tcp_segment in tcp_segments:
        if len(tcp_segment.tcp_payload) == 0:
            continue
        if (last_sequence_num is not None) and (tcp_segment.tcp_sequence_number <= last_sequence_num):
            continue
        #
        # Segment with an actual payload - add it to the stream's
        # list of chunks.
        #
        # Note: delay creating the stream until we have a valid chunk to
        # commit to it
        #
        if streamID is None:
            streamID = newStream(tcp_stream.five_tuple)
        builder.add_chunk(streamID, tcp_segment.tcp_payload)
        last_sequence_num = tcp_segment.tcp_sequence_number


def db_add_udp_stream_segments(builder, udp_stream):
    """Add the contents of a UDP stream to the database. Since UDP is
    connection-less, a UDP stream object is really just an accumulation
    of all the packets associated with a given 5-tuple.

    Segments are written in the order they were accumulated; segments with
    an empty payload are skipped, and no stream id is created unless at
    least one chunk is written.
    """
    streamID = None
    for udp_segment in udp_stream.segments:
        if len(udp_segment.udp_payload) > 0:
            if streamID is None:
                streamID = newStream(udp_stream.five_tuple)
            builder.add_chunk(streamID, udp_segment.udp_payload)
% pcapFN + sys.exit(-1) + + builder = CorpusBuilder(sqliteFN) + + # + # Read in the contents of the pcap file, adding stream segments as found + # + pkt_cnt = 0; + ip_pkt_cnt = 0; + unsupported_ip_protocol_cnt = 0 + pcap_ref = pcap.pcap(pcapFN) + done = False + + while not done: + try: + ts, packet = pcap_ref.next() + except: + break + + pkt_cnt += 1 + + linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff - 2):pcap_ref.dloff])[0] + if linkLayerType != ETHERTYPE_IP: + # + # We're only interested in IP packets + # + continue + + ip_pkt_cnt += 1 + + ip_pkt_total_len = struct.unpack('!H', packet[pcap_ref.dloff + 2: pcap_ref.dloff + 4])[0] + ip_pkt = packet[pcap_ref.dloff:pcap_ref.dloff + ip_pkt_total_len] + pkt_protocol = struct.unpack('B', ip_pkt[9])[0] + + if (pkt_protocol != IPPROTO_UDP) and (pkt_protocol != IPPROTO_TCP): + # + # we're only interested in UDP and TCP packets at the moment + # + continue + + pkt_src_addr = inet_ntoa(ip_pkt[12:16]) + pkt_dst_addr = inet_ntoa(ip_pkt[16:20]) + + ip_hdr_len_offset = (ord(ip_pkt[0]) & 0x0f) * 4 + ip_payload = ip_pkt[ip_hdr_len_offset:len(ip_pkt)] + + pkt_src_port, pkt_dst_port = struct.unpack('!HH', ip_payload[0:4]) + five_tuple = FiveTuple(pkt_protocol, pkt_src_addr, pkt_src_port, pkt_dst_addr, pkt_dst_port) + five_tuple_id = str(five_tuple) + + if pkt_protocol == IPPROTO_UDP: + udp_payload_len = struct.unpack('!H', ip_payload[4:6])[0] - 8 + udp_header = ip_payload[0:8] + udp_payload = ip_payload[8:len(ip_payload)] + udp_segment = UdpSegment(five_tuple, udp_header, udp_payload) + process_udp_segment(builder, udp_segment) + elif pkt_protocol == IPPROTO_TCP: + tcp_hdr_len = (ord(ip_payload[12]) >> 4) * 4 + tcp_header = ip_payload[0:tcp_hdr_len] + tcp_payload = ip_payload[tcp_hdr_len:len(ip_payload)] + segment = TcpSegment(five_tuple, tcp_header, tcp_payload) + process_tcp_segment(builder, segment) + + # + # Having read the contents of the pcap, we fill the database with any + # remaining TCP and UDP segments + # + 
for tcp_stream in tcp_streams.itervalues(): + db_add_tcp_stream_segments(builder, tcp_stream) + + for udp_stream in udp_streams.itervalues(): + db_add_udp_stream_segments(builder, udp_stream) + + # + # We've finished with the database + # + builder.finish() + +if __name__ == '__main__' : + + args = getopt.getopt(sys.argv[1:], 'i:o:') + args = dict(args[0]) + + requiredKeys = [ '-i', '-o'] + for k in requiredKeys : + if not args.has_key(k) : + usage(os.path.basename(sys.argv[0])) + + fnArgs = tuple([ args[k] for k in requiredKeys ]) + enchunk_pcap(*fnArgs) diff --git a/tools/hsbench/thread_barrier.h b/tools/hsbench/thread_barrier.h new file mode 100644 index 00000000..1c3a53e7 --- /dev/null +++ b/tools/hsbench/thread_barrier.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/**
 * \brief Simple thread barrier class.
 *
 * Blocks until wait() has been called N times. Once the Nth caller arrives,
 * all waiters are released and the barrier is ready to be used again.
 */
class thread_barrier {
public:
    /**
     * \brief Construct a barrier for \a n participating threads.
     *
     * Throws std::runtime_error if \a n is zero.
     */
    explicit thread_barrier(unsigned int n) : max(n) {
        if (max == 0) {
            throw std::runtime_error("invalid barrier");
        }
    }

    /**
     * \brief Block until \a n threads (including this one) have called
     * wait() for the current round.
     */
    void wait() {
        std::unique_lock<std::mutex> lock(mtx);
        // Remember which round this caller belongs to. The last arrival
        // advances the round, which is what releases everybody else.
        const unsigned int my_generation = generation;
        count++;
        if (count >= max) {
            count = 0;
            generation++;
            condvar.notify_all();
        } else {
            // Condition variables may wake without being notified; waiting
            // on the round number keeps such a wakeup from letting a thread
            // through before every participant has arrived.
            condvar.wait(lock,
                         [&] { return generation != my_generation; });
        }
    }

private:
    std::mutex mtx;
    std::condition_variable condvar;
    unsigned int count = 0;      //!< Arrivals so far in the current round.
    unsigned int generation = 0; //!< Number of completed rounds.
    unsigned int max;            //!< Arrivals needed to complete a round.
};
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
/**
 * \brief Minimal interval timer built on std::chrono::steady_clock.
 *
 * Call start(), then complete(); seconds() reports the time between the two
 * calls. Before either call both time points are value-initialised and equal,
 * so seconds() reports zero.
 */
class Timer {
public:
    Timer() = default;

    /** \brief Record the start of the interval. */
    void start() {
        clock_start = Clock::now();
    }

    /** \brief Record the end of the interval. */
    void complete() {
        clock_end = Clock::now();
    }

    /** \brief Length of the recorded interval, in (fractional) seconds. */
    double seconds() const {
        // A duration with a floating-point representation and the default
        // period counts seconds, so count() needs no further scaling.
        const std::chrono::duration<double> secs = clock_end - clock_start;
        return secs.count();
    }

protected:
    using Clock = std::chrono::steady_clock;
    std::chrono::time_point<Clock> clock_start;
    std::chrono::time_point<Clock> clock_end;
};
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "cross_compile.h" +#include "src/ue2common.h" +#include "src/hs_compile.h" +#include "src/util/make_unique.h" + +#include +#include + +using namespace std; + +struct XcompileMode { + const char *name; + unsigned long long cpu_features; +}; + +static const XcompileMode xcompile_options[] = { + { "avx2", HS_CPU_FEATURES_AVX2 }, + { "base", 0 }, +}; + +unique_ptr xcompileReadMode(const char *s) { + hs_platform_info rv; + UNUSED hs_error_t err; + err = hs_populate_platform(&rv); + assert(!err); + + string str(s); + string mode = str.substr(0, str.find(":")); + string opt = str.substr(str.find(":")+1, str.npos); + bool found_mode = false; + + if (!opt.empty()) { + const size_t numOpts = ARRAY_LENGTH(xcompile_options); + for (size_t i = 0; i < numOpts; i++) { + if (opt.compare(xcompile_options[i].name) == 0) { + DEBUG_PRINTF("found opt %zu:%llu\n", i, + xcompile_options[i].cpu_features); + rv.cpu_features = xcompile_options[i].cpu_features; + found_mode = true; + break; + } + } + } + + if (!found_mode) { + return nullptr; + } else { + DEBUG_PRINTF("cpu_features %llx\n", rv.cpu_features); + return ue2::make_unique(rv); + } +} + +string to_string(const hs_platform_info &p) { + ostringstream out; + if (p.tune) { + out << p.tune; + } + + if (p.cpu_features) { + u64a features = p.cpu_features; + if (features & HS_CPU_FEATURES_AVX2) { + out << " avx2"; + features &= ~HS_CPU_FEATURES_AVX2; + } + + if (features) { + out << " " << "?cpu_features?:" << features; + } + } + + return out.str(); +} + +string xcompileUsage(void) { + string variants = "Instruction set options: "; + const size_t numOpts = ARRAY_LENGTH(xcompile_options); + for (size_t i = 0; i < numOpts; i++) { + variants += xcompile_options[i].name; + if (i + 1 != numOpts) { + variants += ", "; + } + } + + return variants; +} diff --git a/util/cross_compile.h b/util/cross_compile.h new file mode 100644 index 00000000..ddfc7b10 --- /dev/null +++ b/util/cross_compile.h @@ -0,0 +1,42 
@@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef CROSS_COMPILE_H +#define CROSS_COMPILE_H + +#include +#include + +struct hs_platform_info; + +std::unique_ptr xcompileReadMode(const char *s); +std::string xcompileUsage(void); + +std::string to_string(const hs_platform_info &p); + +#endif /* CROSS_COMPILE_H */ diff --git a/util/database_util.cpp b/util/database_util.cpp new file mode 100644 index 00000000..3df75e2a --- /dev/null +++ b/util/database_util.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "database_util.h" + +#include "hs_common.h" + +#include +#include +#include +#include +#include + +#if defined(HAVE_MMAP) +#include // for mmap +#include // for close +#include +#include +#endif + +using namespace std; + +bool saveDatabase(const hs_database_t *db, const char *filename, bool verbose) { + assert(db); + assert(filename); + + if (verbose) { + cout << "Saving database to: " << filename << endl; + } + + char *bytes = nullptr; + size_t length = 0; + hs_error_t err = hs_serialize_database(db, &bytes, &length); + if (err != HS_SUCCESS) { + return false; + } + + assert(bytes); + assert(length > 0); + + ofstream out(filename, ios::binary); + out.write(bytes, length); + out.close(); + + ::free(bytes); + + return true; +} + +hs_database_t * loadDatabase(const char *filename, bool verbose) { + assert(filename); + + if (verbose) { + cout << "Loading database from: " << filename << endl; + } + + char *bytes = nullptr; + +#if defined(HAVE_MMAP) + // Use mmap to read the file + int fd = open(filename, O_RDONLY); + if (fd < 0) { + return nullptr; + } + struct stat st; + if (fstat(fd, &st) < 0) { + close(fd); + return nullptr; + } + size_t len = st.st_size; + + bytes = (char *)mmap(nullptr, len, PROT_READ, MAP_SHARED, fd, 0); + if (bytes == MAP_FAILED) { + cout << "mmap failed" << endl; + close(fd); + return nullptr; + } +#else + // Fall back on stream IO + ifstream is; + is.open(filename, ios::in | ios::binary); + if 
(!is.is_open()) { + return nullptr; + } + is.seekg(0, ios::end); + size_t len = is.tellg(); + if (verbose) { + cout << "Reading " << len << " bytes" << endl; + } + is.seekg(0, ios::beg); + bytes = new char[len]; + is.read(bytes, len); + is.close(); +#endif + + assert(bytes); + + if (verbose) { + char *info = nullptr; + hs_error_t err = hs_serialized_database_info(bytes, len, &info); + if (err) { + cout << "Unable to decode serialized database info: " << err + << endl; + } else if (info) { + cout << "Serialized database info: " << info << endl; + std::free(info); + } else { + cout << "Unable to decode serialized database info." << endl; + } + } + + hs_database_t *db = nullptr; + hs_error_t err = hs_deserialize_database(bytes, len, &db); + +#if defined(HAVE_MMAP) + munmap(bytes, len); + close(fd); +#else + delete [] bytes; +#endif + + if (err != HS_SUCCESS) { + cout << "hs_deserialize_database call failed: " << err << endl; + return nullptr; + } + + assert(db); + + return db; +} diff --git a/util/database_util.h b/util/database_util.h new file mode 100644 index 00000000..badd036d --- /dev/null +++ b/util/database_util.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DATABASE_UTIL_H +#define DATABASE_UTIL_H + +struct hs_database; + +bool saveDatabase(const hs_database *db, const char *filename, + bool verbose = false); + +hs_database *loadDatabase(const char *filename, bool verbose = false); + +#endif /* DATABASE_UTIL_H */ diff --git a/util/expression_path.h b/util/expression_path.h new file mode 100644 index 00000000..3075b4d4 --- /dev/null +++ b/util/expression_path.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef EXPRESSION_PATH_H +#define EXPRESSION_PATH_H + +#include "ue2common.h" + +#include +#include +#include +#include +#include + +#include +#if !defined(_WIN32) +#include +#include +#endif + +// +// Utility functions +// + +/** + * Given a path to a signature file, infer the path of the pcre directory. + */ +static inline +std::string inferExpressionPath(const std::string &sigFile) { +#ifndef _WIN32 + // POSIX variant. + + // dirname() may modify its argument, so we must make a copy. + std::vector path(sigFile.size() + 1); + memcpy(path.data(), sigFile.c_str(), sigFile.size()); + path[sigFile.size()] = 0; // ensure null termination. + + std::string rv = dirname(path.data()); +#else + // Windows variant. 
#if defined(_WIN32) && !defined(S_IFMT)
#define S_IFMT _S_IFMT
#endif

/**
 * Returns true if \a filename names a directory. If stat() fails, the error
 * is reported on stderr and false is returned.
 */
static inline
bool isDir(const std::string &filename) {
    struct stat s;

    if (stat(filename.c_str(), &s) == -1) {
        std::cerr << "stat: " << strerror(errno) << std::endl;
        return false;
    }

    // The file type is a multi-bit field: mask it out and compare the whole
    // value. Testing a single bit also matches other types that share it.
    return (s.st_mode & S_IFMT) == S_IFDIR;
}

/**
 * Returns true if \a filename names a regular file. If stat() fails, the
 * error is reported on stderr and false is returned.
 */
static inline
bool isFile(const std::string &filename) {
    struct stat s;

    if (stat(filename.c_str(), &s) == -1) {
        std::cerr << "stat: " << strerror(errno) << std::endl;
        return false;
    }

    return (s.st_mode & S_IFMT) == S_IFREG;
}