diff --git a/CMakeLists.txt b/CMakeLists.txt index e33655be..3a8cef0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,16 @@ include_directories(SYSTEM include) include (${CMAKE_MODULE_PATH}/boost.cmake) +# PCRE check, we have a fixed requirement for PCRE to use Chimera +# and hscollider +set(PCRE_REQUIRED_MAJOR_VERSION 8) +set(PCRE_REQUIRED_MINOR_VERSION 41) +set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) +include (${CMAKE_MODULE_PATH}/pcre.cmake) +if (NOT CORRECT_PCRE_VERSION) + message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found") +endif() + # -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6) find_package(PythonInterp) find_program(RAGEL ragel) @@ -154,7 +164,7 @@ if(MSVC OR MSVC_IDE) # todo: change these as required set(ARCH_C_FLAGS "/arch:AVX2") set(ARCH_CXX_FLAGS "/arch:AVX2") - set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 -D_CRT_SECURE_NO_WARNINGS") + set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 /wd4996 -D_CRT_SECURE_NO_WARNINGS") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD") endif() @@ -445,12 +455,20 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() +# we need static libs for Chimera - too much deep magic for shared libs +if (CORRECT_PCRE_VERSION AND BUILD_STATIC_LIBS) + set(BUILD_CHIMERA TRUE) +endif() + add_subdirectory(util) add_subdirectory(unit) add_subdirectory(doc/dev-reference) if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt) add_subdirectory(tools) endif() +if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA) + add_subdirectory(chimera) +endif() # do substitutions configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h) diff --git a/chimera/CMakeLists.txt b/chimera/CMakeLists.txt new file mode 100644 index 00000000..f2a4203a --- 
/dev/null +++ b/chimera/CMakeLists.txt @@ -0,0 +1,32 @@ +# Chimera lib + +include_directories(${PCRE_INCLUDE_DIRS}) + +# only set these after all tests are done +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") + +SET(chimera_HEADERS + ch.h + ch_common.h + ch_compile.h + ch_runtime.h +) +install(FILES ${chimera_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs") + +SET(chimera_SRCS + ${chimera_HEADERS} + ch_alloc.c + ch_alloc.h + ch_compile.cpp + ch_database.c + ch_database.h + ch_internal.h + ch_runtime.c + ch_scratch.h + ch_scratch.c +) + +add_library(chimera STATIC ${chimera_SRCS}) +add_dependencies(chimera hs pcre) +target_link_libraries(chimera hs pcre) diff --git a/chimera/ch.h b/chimera/ch.h new file mode 100644 index 00000000..9838f0da --- /dev/null +++ b/chimera/ch.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CH_H_ +#define CH_H_ + +/** + * @file + * @brief The complete Chimera API definition. + * + * Chimera is a hybrid solution of Hyperscan and PCRE. + * + * This header includes both the Chimera compiler and runtime components. See + * the individual component headers for documentation. + */ + +#include "ch_compile.h" +#include "ch_runtime.h" + +#endif /* CH_H_ */ diff --git a/chimera/ch_alloc.c b/chimera/ch_alloc.c new file mode 100644 index 00000000..047f1238 --- /dev/null +++ b/chimera/ch_alloc.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions for setting custom allocators. + */ + +#include "ch.h" +#include "ch_common.h" +#include "ch_internal.h" +#include "hs.h" +#include "ue2common.h" + +#define default_malloc malloc +#define default_free free + +ch_alloc_t ch_database_alloc = default_malloc; +ch_alloc_t ch_misc_alloc = default_malloc; +ch_alloc_t ch_scratch_alloc = default_malloc; + +ch_free_t ch_database_free = default_free; +ch_free_t ch_misc_free = default_free; +ch_free_t ch_scratch_free = default_free; + +static +ch_alloc_t normalise_alloc(ch_alloc_t a) { + if (!a) { + return default_malloc; + } else { + return a; + } +} + +static +ch_free_t normalise_free(ch_free_t f) { + if (!f) { + return default_free; + } else { + return f; + } +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_set_database_allocator(allocfunc, freefunc); + ch_set_misc_allocator(allocfunc, freefunc); + ch_set_scratch_allocator(allocfunc, freefunc); + + // Set core Hyperscan alloc/free. 
+ hs_error_t ret = hs_set_allocator(allocfunc, freefunc); + + return ret; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_database_alloc = normalise_alloc(allocfunc); + ch_database_free = normalise_free(freefunc); + + // Set Hyperscan database alloc/free. + return hs_set_database_allocator(allocfunc, freefunc); +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_misc_alloc = normalise_alloc(allocfunc); + ch_misc_free = normalise_free(freefunc); + + // Set Hyperscan misc alloc/free. + return hs_set_misc_allocator(allocfunc, freefunc); +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t allocfunc, + ch_free_t freefunc) { + ch_scratch_alloc = normalise_alloc(allocfunc); + ch_scratch_free = normalise_free(freefunc); + + // Set Hyperscan scratch alloc/free. + return hs_set_scratch_allocator(allocfunc, freefunc); +} diff --git a/chimera/ch_alloc.h b/chimera/ch_alloc.h new file mode 100644 index 00000000..243df00b --- /dev/null +++ b/chimera/ch_alloc.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CH_ALLOC_H +#define CH_ALLOC_H + +#include "hs_common.h" +#include "ue2common.h" +#include "ch_common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif +extern hs_alloc_t ch_database_alloc; +extern hs_alloc_t ch_misc_alloc; +extern hs_alloc_t ch_scratch_alloc; + +extern hs_free_t ch_database_free; +extern hs_free_t ch_misc_free; +extern hs_free_t ch_scratch_free; +#ifdef __cplusplus +} /* extern C */ +#endif +/** \brief Check the results of an alloc done with hs_alloc for alignment. + * + * If we have incorrect alignment, return an error. Caller should free the + * offending block. 
*/ +static really_inline +ch_error_t ch_check_alloc(const void *mem) { + ch_error_t ret = CH_SUCCESS; + if (!mem) { + ret = CH_NOMEM; + } else if (!ISALIGNED_N(mem, alignof(unsigned long long))) { + ret = CH_BAD_ALLOC; + } + return ret; +} + +#endif diff --git a/chimera/ch_common.h b/chimera/ch_common.h new file mode 100644 index 00000000..8caa4440 --- /dev/null +++ b/chimera/ch_common.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef CH_COMMON_H_ +#define CH_COMMON_H_ + +#include "hs_common.h" + +#include + +/** + * @file + * @brief The Chimera common API definition. + * + * Chimera is a hybrid of Hyperscan and PCRE. + * + * This header contains functions available to both the Chimera compiler and + * runtime. + */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct ch_database; + +/** + * A Chimera pattern database. + * + * Generated by one of the Chimera compiler functions: + * - @ref ch_compile() + * - @ref ch_compile_multi() + * - @ref ch_compile_ext_multi() + */ +typedef struct ch_database ch_database_t; + +/** + * A type for errors returned by Chimera functions. + */ +typedef int ch_error_t; + +/** + * Free a compiled pattern database. + * + * The free callback set by @ref ch_set_allocator() will be used by this + * function. + * + * @param db + * A compiled pattern database. NULL may also be safely provided, in which + * case the function does nothing. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_free_database(ch_database_t *db); + +/** + * Utility function for identifying this release version. + * + * @return + * A string containing the version number of this release build and the + * date of the build. It is allocated statically, so it does not need to + * be freed by the caller. + */ +const char * HS_CDECL ch_version(void); + +/** + * Returns the size of the given database. + * + * @param database + * Pointer to compiled expression database. + * + * @param database_size + * On success, the size of the compiled database in bytes is placed in this + * parameter. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_database_size(const ch_database_t *database, + size_t *database_size); + +/** + * Utility function providing information about a database. + * + * @param database + * Pointer to a compiled database. 
+ * + * @param info + * On success, a string containing the version and platform information for + * the supplied database is placed in the parameter. The string is + * allocated using the allocator supplied in @ref hs_set_allocator() + * (or malloc() if no allocator was set) and should be freed by the caller. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_database_info(const ch_database_t *database, + char **info); + +/** + * The type of the callback function that will be used by Chimera to allocate + * more memory at runtime as required. + * + * If Chimera is to be used in a multi-threaded, or similarly concurrent + * environment, the allocation function will need to be re-entrant, or + * similarly safe for concurrent use. + * + * @param size + * The number of bytes to allocate. + * @return + * A pointer to the region of memory allocated, or NULL on error. + */ +typedef void *(HS_CDECL *ch_alloc_t)(size_t size); + +/** + * The type of the callback function that will be used by Chimera to free + * memory regions previously allocated using the @ref ch_alloc_t function. + * + * @param ptr + * The region of memory to be freed. + */ +typedef void (HS_CDECL *ch_free_t)(void *ptr); + +/** + * Set the allocate and free functions used by Chimera for allocating + * memory at runtime for stream state, scratch space, database bytecode, + * and various other data structure returned by the Chimera API. + * + * The function is equivalent to calling @ref ch_set_scratch_allocator(), + * @ref ch_set_database_allocator() and + * @ref ch_set_misc_allocator() with the provided parameters. + * + * This call will override any previous allocators that have been set. + * + * Note: there is no way to change the allocator used for temporary objects + * created during the various compile calls (@ref ch_compile() and @ref + * ch_compile_multi()). + * + * @param alloc_func + * A callback function pointer that allocates memory. 
This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t alloc_func, + ch_free_t free_func); + +/** + * Set the allocate and free functions used by Chimera for allocating memory + * for database bytecode produced by the compile calls (@ref ch_compile() and @ref + * ch_compile_multi()). + * + * If no database allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous database allocators that have been set. + * + * Note: the database allocator may also be set by calling @ref + * ch_set_allocator(). + * + * Note: there is no way to change how temporary objects created during the + * various compile calls (@ref ch_compile() and @ref ch_compile_multi()) are + * allocated. + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t alloc_func, + ch_free_t free_func); + +/** + * Set the allocate and free functions used by Chimera for allocating memory + * for items returned by the Chimera API such as @ref ch_compile_error_t. + * + * If no misc allocation functions are set, or if NULL is used in place of both + * parameters, then memory allocation will default to standard methods (such as + * the system malloc() and free() calls). 
+ * + * This call will override any previous misc allocators that have been set. + * + * Note: the misc allocator may also be set by calling @ref ch_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t alloc_func, + ch_free_t free_func); + +/** + * Set the allocate and free functions used by Chimera for allocating memory + * for scratch space by @ref ch_alloc_scratch() and @ref ch_clone_scratch(). + * + * If no scratch allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous scratch allocators that have been set. + * + * Note: the scratch allocator may also be set by calling @ref + * ch_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func, + ch_free_t free_func); + +/** + * @defgroup CH_ERROR ch_error_t values + * + * @{ + */ + +/** + * The engine completed normally. + */ +#define CH_SUCCESS 0 + +/** + * A parameter passed to this function was invalid. + */ +#define CH_INVALID (-1) + +/** + * A memory allocation failed. + */ +#define CH_NOMEM (-2) + +/** + * The engine was terminated by callback. 
+ * + * This return value indicates that the target buffer was partially scanned, + * but that the callback function requested that scanning cease after a match + * was located. + */ +#define CH_SCAN_TERMINATED (-3) + +/** + * The pattern compiler failed, and the @ref ch_compile_error_t should be + * inspected for more detail. + */ +#define CH_COMPILER_ERROR (-4) + +/** + * The given database was built for a different version of the Chimera matcher. + */ +#define CH_DB_VERSION_ERROR (-5) + +/** + * The given database was built for a different platform (i.e., CPU type). + */ +#define CH_DB_PLATFORM_ERROR (-6) + +/** + * The given database was built for a different mode of operation. This error + * is returned when streaming calls are used with a non-streaming database and + * vice versa. + */ +#define CH_DB_MODE_ERROR (-7) + +/** + * A parameter passed to this function was not correctly aligned. + */ +#define CH_BAD_ALIGN (-8) + +/** + * The memory allocator did not correctly return memory suitably aligned for + * the largest representable data type on this platform. + */ +#define CH_BAD_ALLOC (-9) + +/** + * The scratch region was already in use. + * + * This error is returned when Chimera is able to detect that the scratch + * region given is already in use by another Chimera API call. + * + * A separate scratch region, allocated with @ref ch_alloc_scratch() or @ref + * ch_clone_scratch(), is required for every concurrent caller of the Chimera + * API. + * + * For example, this error might be returned when @ref ch_scan() has been + * called inside a callback delivered by a currently-executing @ref ch_scan() + * call using the same scratch region. + * + * Note: Not all concurrent uses of scratch regions may be detected. This error + * is intended as a best-effort debugging tool, not a guarantee. + */ +#define CH_SCRATCH_IN_USE (-10) + +/** + * Returned when pcre_exec (called for some expressions internally from @ref + * ch_scan) failed due to a fatal error. 
+ */ +#define CH_FAIL_INTERNAL (-32) + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_COMMON_H_ */ diff --git a/chimera/ch_compile.cpp b/chimera/ch_compile.cpp new file mode 100644 index 00000000..c71e26e0 --- /dev/null +++ b/chimera/ch_compile.cpp @@ -0,0 +1,888 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Compiler front-end, including public API calls for compilation. 
+ */ + +#include "ch_compile.h" +#include "ch_alloc.h" +#include "ch_internal.h" +#include "ch_database.h" +#include "grey.h" +#include "hs_common.h" +#include "hs_internal.h" +#include "ue2common.h" +#include "util/compile_error.h" +#include "util/make_unique.h" +#include "util/multibit_build.h" +#include "util/target_info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define PCRE_ERROR_MSG "Internal error building PCRE pattern." + +using namespace std; +using namespace ue2; + +static const char failureNoMemory[] = "Unable to allocate memory."; +static const char failureInternal[] = "Internal error."; +static const char failureBadAlloc[] = "Allocator returned misaligned memory."; + +static const ch_compile_error_t ch_enomem + = { const_cast(failureNoMemory), 0 }; +static const ch_compile_error_t ch_einternal + = { const_cast(failureInternal), 0 }; +static const ch_compile_error_t ch_badalloc + = { const_cast(failureBadAlloc), 0 }; + +/** + * \brief Allocate a compile error holding a copy of \p err and \p expression. + * + * The struct and its message are both obtained from ch_misc_alloc. If either + * allocation fails or is misaligned, whatever was already allocated is freed + * and a pointer to the static ch_enomem or ch_badalloc object is returned + * instead; those objects must not be passed to ch_misc_free. + */ +static +ch_compile_error_t *generateChimeraCompileError(const string &err, + int expression) { + ch_compile_error_t *ret = + (struct ch_compile_error *)ch_misc_alloc(sizeof(ch_compile_error_t)); + if (ret) { + ch_error_t e = ch_check_alloc(ret); + if (e != CH_SUCCESS) { + ch_misc_free(ret); + return const_cast(&ch_badalloc); + } + char *msg = (char *)ch_misc_alloc(err.size() + 1); + if (msg) { + e = ch_check_alloc(msg); + if (e != CH_SUCCESS) { + // Release the error struct too, not just the message buffer. 
+ ch_misc_free(msg); + ch_misc_free(ret); + return const_cast(&ch_badalloc); + } + memcpy(msg, err.c_str(), err.size() + 1); + ret->message = msg; + } else { + ch_misc_free(ret); + ret = nullptr; + } + } + + if (!ret || !ret->message) { + return const_cast(&ch_enomem); + } + + ret->expression = expression; + + return ret; +} + +static +void freeChimeraCompileError(ch_compile_error_t *error) { + if (!error) { + return; + } + if (error == &ch_enomem || error == &ch_einternal || + error == &ch_badalloc) { + // These are not allocated. 
+ return; + } + + ch_misc_free(error->message); + ch_misc_free(error); +} + +static +bool checkMode(unsigned int mode, ch_compile_error_t **comp_error) { + static const unsigned int supported = CH_MODE_GROUPS; + + if (mode & ~supported) { + *comp_error = + generateChimeraCompileError("Invalid mode flag supplied.", -1); + return false; + } + return true; +} + +/** \brief Throw a compile error if we're passed some unsupported flags. */ +static +void checkFlags(const unsigned int flags) { + static const unsigned int supported = HS_FLAG_DOTALL + | HS_FLAG_MULTILINE + | HS_FLAG_CASELESS + | HS_FLAG_SINGLEMATCH + | HS_FLAG_UCP + | HS_FLAG_UTF8; + + if (flags & ~supported) { + throw CompileError("Unrecognized flag used."); + } +} + +static +bool isHyperscanSupported(const char *expression, unsigned int flags, + const hs_platform_info *platform) { + hs_database_t *db = nullptr; + hs_compile_error *comp_error = nullptr; + + unsigned int id = 0; + hs_error_t err = hs_compile_multi(&expression, &flags, &id, + 1, HS_MODE_BLOCK, platform, &db, + &comp_error); + if (err != HS_SUCCESS) { + assert(!db); + assert(comp_error); + DEBUG_PRINTF("unsupported: %s\n", comp_error->message); + hs_free_compile_error(comp_error); + + return false; + } + + assert(db); + assert(!comp_error); + hs_free_database(db); + return true; +} + +static +bool writeHyperscanDatabase(char *ptr, hs_database_t *db) { + // Note: we must use our serialization calls to re-home the database. + char *serialized = nullptr; + size_t slen = 0; + hs_error_t err = hs_serialize_database(db, &serialized, &slen); + if (err != HS_SUCCESS) { + DEBUG_PRINTF("hs_serialize_database returned %d\n", err); + assert(0); + return false; + } + + DEBUG_PRINTF("writing database to ptr %p\n", ptr); + + // deserialize_at without the platform tests. 
+    err = hs_deserialize_database_at(serialized, slen, (hs_database_t *)ptr);
+    if (err != HS_SUCCESS) {
+        DEBUG_PRINTF("hs_deserialize_database_at returned %d\n", err);
+        assert(0);
+        ch_misc_free(serialized);
+        return false;
+    }
+
+    ch_misc_free(serialized);
+    return true;
+}
+
+/**
+ * \brief Write the Hyperscan database into the bytecode, right after the
+ * ROUNDUP_CL-aligned ch_bytecode header.
+ *
+ * Records the chosen position in db->databaseOffset and returns whatever the
+ * pointer-based writeHyperscanDatabase() overload returns.
+ */
+static
+bool writeHyperscanDatabase(ch_bytecode *db, hs_database_t *hs_db) {
+    db->databaseOffset = ROUNDUP_CL(sizeof(*db));
+    char *ptr = (char *)db + db->databaseOffset;
+    return writeHyperscanDatabase(ptr, hs_db);
+}
+
+/**
+ * \brief Translate Hyperscan pattern flags into pcre_compile() option bits.
+ *
+ * Only CASELESS, DOTALL, MULTILINE, UTF8 and UCP are mapped; every other flag
+ * is ignored.
+ */
+static
+int convertFlagsToPcreOptions(unsigned int flags) {
+    int options = 0;
+    if (flags & HS_FLAG_CASELESS) {
+        options |= PCRE_CASELESS;
+    }
+    if (flags & HS_FLAG_DOTALL) {
+        options |= PCRE_DOTALL;
+    }
+    if (flags & HS_FLAG_MULTILINE) {
+        options |= PCRE_MULTILINE;
+    }
+    if (flags & HS_FLAG_UTF8) {
+        options |= PCRE_UTF8;
+    }
+    if (flags & HS_FLAG_UCP) {
+        options |= PCRE_UCP;
+    }
+
+    // All other flags are meaningless to PCRE.
+
+    return options;
+}
+
+namespace {
+
+/**
+ * \brief Data about a single pattern.
+ *
+ * Owns the pcre_compile()/pcre_study() results for one expression and knows
+ * how to size and serialise them into a ch_pattern record.
+ */
+struct PatternData : boost::noncopyable {
+    PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
+                unsigned mode, unsigned long int match_limit,
+                unsigned long int match_limit_recursion,
+                const hs_platform_info *platform);
+    ~PatternData() {
+        releasePcre();
+    }
+
+    void buildPcre(const char *pattern, u32 flags);
+
+    size_t patternSize() const;
+
+    void writePattern(ch_pattern *pattern) const;
+
+    pcre *compiled; //!< pcre_compile output
+    pcre_extra *extra; //!< pcre_study output
+    size_t compiled_size; //!< PCRE_INFO_SIZE of \ref compiled
+    // PCRE_INFO_STUDYSIZE stores a size_t through the pointer it is given, so
+    // this must not be narrower than size_t.
+    size_t study_size; //!< PCRE_INFO_STUDYSIZE, zero if there is no study data
+    int capture_cnt; //!< PCRE_INFO_CAPTURECOUNT
+    bool utf8; //!< pattern ended up in UTF-8 mode (flag or (*UTF8) verb)
+    u32 id; //!< ID from the user
+    u32 expr_index; //!< index in the expression array
+    bool singlematch; //!< pattern is in highlander mode
+    bool guard; //!< this pattern should be guarded by the multimatcher
+    u32 minWidth; //!< min match width
+    u32 maxWidth; //!< max match width
+    u32 fixedWidth; //!< fixed pattern width
+    unsigned long int matchLimit; //! pcre match limit
+    unsigned long int matchLimitRecursion; //! pcre match_limit_recursion
+
+private:
+    /** \brief Free the PCRE objects; safe to call with either one unset. */
+    void releasePcre() {
+        if (extra) {
+            // Study data built with PCRE_STUDY_JIT_COMPILE carries JIT code
+            // that only pcre_free_study() releases.
+            pcre_free_study(extra);
+            extra = nullptr;
+        }
+        if (compiled) {
+            pcre_free(compiled);
+            compiled = nullptr;
+        }
+    }
+};
+
+/**
+ * \brief Compile \a pattern with PCRE and gather the Hyperscan expression
+ * info used to decide whether it can be guarded and/or left unconfirmed.
+ *
+ * Throws CompileError (from buildPcre) if PCRE rejects the pattern.
+ */
+PatternData::PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
+                         unsigned mode, unsigned long int match_limit,
+                         unsigned long int match_limit_recursion,
+                         const hs_platform_info *platform)
+    : compiled(nullptr), extra(nullptr), compiled_size(0), study_size(0),
+      capture_cnt(0), utf8(false), id(id_in), expr_index(idx),
+      singlematch((flags & HS_FLAG_SINGLEMATCH) != 0), guard(false),
+      minWidth(0), maxWidth(UINT_MAX), fixedWidth(UINT_MAX),
+      matchLimit(match_limit), matchLimitRecursion(match_limit_recursion) {
+    assert(pattern);
+
+    flags |= HS_FLAG_ALLOWEMPTY; /* don't hand things off to pcre for no
+                                    reason */
+
+    // The destructor does not run when a constructor throws, so anything
+    // buildPcre() allocated before failing has to be released here.
+    try {
+        buildPcre(pattern, flags);
+    } catch (...) {
+        releasePcre();
+        throw;
+    }
+
+    // Fetch the expression info for a prefiltering, non-singlematch version of
+    // this pattern, if possible.
+    hs_expr_info *info = nullptr;
+    hs_compile_error_t *error = nullptr;
+    u32 infoflags = (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH;
+    u32 rawflags = (flags | HS_FLAG_SOM_LEFTMOST) & ~HS_FLAG_SINGLEMATCH;
+    hs_error_t err = hs_expression_info(pattern, infoflags, &info, &error);
+    if (err == HS_SUCCESS) {
+        assert(info);
+        minWidth = info->min_width;
+        maxWidth = info->max_width;
+        bool ordered = !info->unordered_matches;
+
+        // Only enable capturing if required
+        u32 captureCnt = 0;
+        if (mode & CH_MODE_GROUPS) {
+            captureCnt = capture_cnt;
+        }
+
+        // No need to confirm with PCRE if:
+        // 1) pattern is fixed width
+        // 2) pattern isn't vacuous as it can't combine with start of match
+        // 3) no capturing in this pattern
+        // 4) no offset adjust in this pattern as hyperscan match callback
+        //    will arrive without order, i.e. [^a]\z has offset adjust
+        // 5) hyperscan compile succeeds without prefiltering
+        if (minWidth == maxWidth && minWidth && maxWidth != UINT_MAX &&
+            !captureCnt && ordered &&
+            isHyperscanSupported(pattern, rawflags, platform)) {
+            fixedWidth = maxWidth;
+        }
+
+        DEBUG_PRINTF("gathered info: widths=[%u,%u]\n", minWidth, maxWidth);
+
+        ch_misc_free(info);
+
+        // The guard is compiled with the same prefiltering, non-singlematch
+        // flags that were used for the expression info query.
+        guard = isHyperscanSupported(pattern, infoflags, platform);
+    } else {
+        // We can't even prefilter this pattern, so we're dependent on Big Dumb
+        // Pcre Scans.
+        DEBUG_PRINTF("hs_expression_info failed, falling back to pcre\n");
+        hs_free_compile_error(error);
+    }
+}
+
+/**
+ * \brief Run pcre_compile() and pcre_study() on \a pattern and cache the
+ * sizes, capture count and UTF-8 mode reported by pcre_fullinfo().
+ *
+ * Throws CompileError on any PCRE failure; on a throw, \ref compiled and
+ * \ref extra may already be set and must be released by the caller.
+ */
+void PatternData::buildPcre(const char *pattern, u32 flags) {
+    int options = convertFlagsToPcreOptions(flags);
+    const char *errptr = nullptr;
+    int erroffset = 0;
+
+    compiled = pcre_compile(pattern, options, &errptr, &erroffset, nullptr);
+    if (!compiled) {
+        DEBUG_PRINTF("PCRE failed to compile: %s\n", pattern);
+        string err("PCRE compilation failed: ");
+        err += string(errptr);
+        err += ".";
+        throw CompileError(expr_index, err);
+    }
+
+    extra = pcre_study(compiled, PCRE_STUDY_JIT_COMPILE, &errptr);
+    // Note that it's OK for pcre_study to return NULL if there's nothing
+    // to be found, but a non-NULL error is always bad.
+    if (errptr) {
+        DEBUG_PRINTF("PCRE could not be studied: %s\n", errptr);
+        string err("PCRE compilation failed: ");
+        err += string(errptr);
+        err += ".";
+        throw CompileError(expr_index, err);
+    }
+
+    if (pcre_fullinfo(compiled, extra, PCRE_INFO_SIZE, &compiled_size)) {
+        throw CompileError(PCRE_ERROR_MSG);
+    }
+
+    if (!extra) {
+        study_size = 0;
+    } else {
+        if (pcre_fullinfo(compiled, extra, PCRE_INFO_STUDYSIZE, &study_size)) {
+            throw CompileError(PCRE_ERROR_MSG);
+        }
+    }
+
+    if (pcre_fullinfo(compiled, extra, PCRE_INFO_CAPTURECOUNT, &capture_cnt)) {
+        throw CompileError(PCRE_ERROR_MSG);
+    }
+
+    /* We use the pcre rather than hs to get this information as we may need it
+     * even in the pure unguarded pcre mode where there is no hs available. We
+     * can not use the compile flags due to (*UTF8) verb */
+    unsigned long int opts = 0; // PCRE_INFO_OPTIONS demands an unsigned long
+    if (pcre_fullinfo(compiled, extra, PCRE_INFO_OPTIONS, &opts)) {
+        throw CompileError(PCRE_ERROR_MSG);
+    }
+    utf8 = (opts & PCRE_UTF8) != 0;
+}
+
+/**
+ * \brief Number of bytes writePattern() needs: the ch_pattern header, the
+ * compiled PCRE and (if present) the study data, each starting on an 8-byte
+ * boundary.
+ */
+size_t PatternData::patternSize() const {
+    size_t len = 0;
+
+    // ch_pattern header.
+    len += sizeof(ch_pattern);
+
+    len = ROUNDUP_N(len, 8);
+    DEBUG_PRINTF("compiled pcre at %zu\n", len);
+    len += compiled_size;
+
+    // PCRE study data, which may be zero.
+    if (study_size) {
+        len = ROUNDUP_N(len, 8);
+        DEBUG_PRINTF("study at %zu\n", len);
+        len += study_size;
+    }
+
+    DEBUG_PRINTF("pattern size %zu\n", len);
+    return len;
+}
+
+/** \brief Write out an ch_pattern structure, which should already be sized
+ * correctly according to PatternData::patternSize. */
+void PatternData::writePattern(ch_pattern *pattern) const {
+    assert(pattern);
+    assert(ISALIGNED_CL(pattern));
+
+    pattern->id = id;
+
+    u32 flags = 0;
+    if (singlematch) {
+        flags |= CHIMERA_PATTERN_FLAG_SINGLEMATCH;
+    }
+    if (utf8) {
+        flags |= CHIMERA_PATTERN_FLAG_UTF8;
+    }
+
+    pattern->flags = flags;
+    pattern->maxWidth = maxWidth;
+    pattern->minWidth = minWidth == UINT_MAX ? 0 : minWidth;
+    pattern->fixedWidth = fixedWidth;
+
+    // Compiled PCRE pattern.
+    char *ptr = (char *)pattern;
+    ptr += ROUNDUP_N(sizeof(*pattern), 8);
+    DEBUG_PRINTF("compiled pcre at %zu\n", (size_t)(ptr - (char *)pattern));
+    memcpy(ptr, compiled, compiled_size);
+    ptr += compiled_size;
+
+    // PCRE match limits; a zero limit from the caller selects the default.
+    pattern->extra.flags = PCRE_EXTRA_MATCH_LIMIT |
+                           PCRE_EXTRA_MATCH_LIMIT_RECURSION;
+    pattern->extra.match_limit = matchLimit ? matchLimit : 10000000;
+    // Set to avoid segment fault
+    pattern->extra.match_limit_recursion =
+        matchLimitRecursion ? matchLimitRecursion : 1500;
+
+    // PCRE study_data.
+    u32 studyOffset = 0;
+    if (extra) {
+        assert(extra->study_data);
+        ptr = ROUNDUP_PTR(ptr, 8);
+        DEBUG_PRINTF("study at %zu\n", (size_t)(ptr - (char *)pattern));
+        memcpy(ptr, extra->study_data, study_size);
+        studyOffset = (size_t)(ptr - (char *)pattern);
+
+        pattern->extra.flags |= PCRE_EXTRA_STUDY_DATA;
+        pattern->extra.study_data = ptr;
+
+        ptr += study_size;
+    } else {
+        pattern->extra.flags &= ~PCRE_EXTRA_STUDY_DATA;
+    }
+    pattern->studyOffset = studyOffset;
+
+    size_t pcreLen = (ptr - (char *)pattern);
+    assert(pcreLen <= patternSize());
+    pattern->length = (u32)pcreLen;
+
+    // We shouldn't overrun the space we've allocated for this pattern.
+    assert(patternSize() >= (size_t)(ptr - (char *)pattern));
+}
+
+} // namespace
+
+namespace ch {
+
+/**
+ * \brief Build a Chimera database from an array of expressions.
+ *
+ * Every expression is compiled with PCRE; those Hyperscan can guard are also
+ * compiled into one block-mode Hyperscan database. The result is laid out as
+ * ch_database header, ch_bytecode, Hyperscan database, unguarded index list,
+ * pattern offset table and the per-pattern ch_pattern records.
+ *
+ * \param flags  per-expression flags, or NULL for all-zero flags
+ * \param ids    per-expression IDs, or NULL for all-zero IDs
+ * \param out    receives the newly allocated database on success
+ *
+ * Throws CompileError on a bad expression or internal failure and
+ * std::bad_alloc if the database allocation fails.
+ */
+static
+void ch_compile_multi_int(const char *const *expressions, const unsigned *flags,
+                          const unsigned *ids, unsigned elements,
+                          unsigned mode, unsigned long int match_limit,
+                          unsigned long int match_limit_recursion,
+                          const hs_platform_info_t *platform,
+                          ch_database_t **out) {
+    vector<unique_ptr<PatternData>> pcres;
+    pcres.reserve(elements);
+    vector<u32> unguarded; // indices of unguarded PCREs.
+    vector<const char *> multiExpr;
+    vector<unsigned int> multiFlags;
+    vector<unsigned int> multiIds;
+    bool allConfirm = true;
+    bool allSingleMatch = true;
+    for (unsigned int i = 0; i < elements; i++) {
+        const char *myExpr = expressions[i];
+        unsigned int myFlags = flags ? flags[i] : 0;
+        unsigned int myId = ids ? ids[i] : 0;
+
+        // Report a NULL entry as a compile error instead of relying on the
+        // assert in PatternData, which vanishes in release builds.
+        if (!myExpr) {
+            throw CompileError(i, "Invalid parameter: expression is NULL");
+        }
+
+        checkFlags(myFlags);
+
+        // First, build with libpcre. A build failure from libpcre will throw
+        // an exception up to the caller.
+        auto patternData =
+            ue2::make_unique<PatternData>(myExpr, myFlags, i, myId, mode,
+                                          match_limit, match_limit_recursion,
+                                          platform);
+        pcres.push_back(move(patternData));
+        PatternData &curr = *pcres.back();
+
+        if (!(myFlags & HS_FLAG_SINGLEMATCH)) {
+            allSingleMatch = false;
+        }
+
+        // in the multimatch, we always run in prefilter mode and accept vacuous
+        // patterns.
+        myFlags |=
+            HS_FLAG_ALLOWEMPTY | HS_FLAG_PREFILTER;
+
+        if (curr.fixedWidth != UINT_MAX) {
+            myFlags |= HS_FLAG_SOM_LEFTMOST;
+            DEBUG_PRINTF("fixed width, turn off prefiltering\n");
+            myFlags &= ~HS_FLAG_PREFILTER;
+            allConfirm = false;
+
+            // Single match can't coexist with SOM.
+            myFlags &= ~HS_FLAG_SINGLEMATCH;
+        }
+
+        if (curr.guard) {
+            // We use the index into the PCREs array as the Hyperscan idx.
+            multiExpr.push_back(myExpr);
+            multiFlags.push_back(myFlags);
+            multiIds.push_back(i);
+        } else {
+            // No Hyperscan support, PCRE is unguarded.
+            unguarded.push_back(i);
+        }
+    }
+
+    DEBUG_PRINTF("built %zu PCREs, %zu of which are unguarded\n",
+                 pcres.size(), unguarded.size());
+
+    // Work out our sizing for the output database.
+    size_t patternSize = 0;
+    for (unsigned int i = 0; i < elements; i++) {
+        size_t len = pcres[i]->patternSize();
+        patternSize += ROUNDUP_CL(len);
+    }
+    DEBUG_PRINTF("pcre bytecode takes %zu bytes\n", patternSize);
+
+    bool noMulti = multiExpr.empty();
+    size_t multiSize = 0;
+    hs_database *multidb = nullptr;
+    // Owns multidb so that every throw below releases it.
+    unique_ptr<hs_database, decltype(&hs_free_database)>
+        multiGuard(nullptr, &hs_free_database);
+    if (!noMulti) {
+        hs_compile_error_t *hs_comp_error = nullptr;
+        hs_error_t err = hs_compile_multi(&multiExpr[0], &multiFlags[0],
+                                          &multiIds[0],
+                                          (unsigned)multiExpr.size(),
+                                          HS_MODE_BLOCK, platform, &multidb,
+                                          &hs_comp_error);
+
+        if (err != HS_SUCCESS) {
+            assert(hs_comp_error);
+            DEBUG_PRINTF("hs_compile_multi returned error: %s\n",
+                         hs_comp_error->message);
+            assert(0);
+            hs_free_compile_error(hs_comp_error);
+            throw CompileError("Internal error.");
+        }
+
+        assert(multidb);
+        multiGuard.reset(multidb);
+        err = hs_database_size(multidb, &multiSize);
+        if (err != HS_SUCCESS) {
+            assert(0);
+            throw CompileError("Internal error.");
+        }
+        DEBUG_PRINTF("built hyperscan database with len %zu bytes\n", multiSize);
+    }
+
+    size_t bytecodeLen = sizeof(ch_bytecode) +
+                         multiSize + alignof(u32) +
+                         (sizeof(u32) * unguarded.size()) +
+                         (sizeof(u32) * elements) +
+                         patternSize +
+                         128; // padding for alignment
+    size_t totalSize = sizeof(ch_database) + bytecodeLen;
+
+    DEBUG_PRINTF("allocating %zu bytes for database\n", totalSize);
+    char *ptr = (char *)ch_database_alloc(totalSize);
+    if (ch_check_alloc(ptr) != CH_SUCCESS) {
+        ch_database_free(ptr);
+        throw std::bad_alloc();
+    }
+
+    memset(ptr, 0, totalSize);
+
+    // First, the header.
+    ch_database *hydb = (ch_database *)ptr;
+    hydb->magic = CH_DB_MAGIC;
+    hydb->version = HS_VERSION_32BIT;
+    hydb->length = bytecodeLen;
+
+    // Then, the bytecode.
+    size_t shift = (size_t)hydb->bytes & 0x3f;
+    hydb->bytecode = offsetof(struct ch_database, bytes) - shift;
+    ch_bytecode *db = (ch_bytecode *)((char *)hydb + hydb->bytecode);
+    db->patternCount = elements;
+    db->activeSize = mmbit_size(elements);
+    db->flags = 0;
+    db->length = bytecodeLen;
+
+    if (noMulti) {
+        db->flags |= CHIMERA_FLAG_NO_MULTIMATCH;
+    }
+    if (mode & CH_MODE_GROUPS) {
+        db->flags |= CHIMERA_FLAG_GROUPS;
+    }
+    if (allConfirm) {
+        db->flags |= CHIMERA_FLAG_ALL_CONFIRM;
+    }
+    if (allSingleMatch) {
+        db->flags |= CHIMERA_FLAG_ALL_SINGLE;
+    }
+
+
+    // Find and set the max ovector size by looking at the capture count for
+    // each pcre.
+    u32 maxCaptureGroups = 0;
+    for (unsigned int i = 0; i < elements; i++) {
+        maxCaptureGroups = max(maxCaptureGroups, (u32)pcres[i]->capture_cnt);
+    }
+    db->maxCaptureGroups = maxCaptureGroups;
+    DEBUG_PRINTF("max capture groups is %u\n", maxCaptureGroups);
+
+    if (!noMulti) {
+        DEBUG_PRINTF("write hyperscan database\n");
+        // Write Hyperscan database directly after the header struct, then free it.
+        if (!writeHyperscanDatabase(db, multidb)) {
+            ch_database_free(hydb);
+            throw CompileError("Internal error.");
+        }
+        multiGuard.reset();
+    } else {
+        db->databaseOffset = ROUNDUP_CL(sizeof(*db));
+    }
+
+    // Then, write our unguarded PCRE list.
+    db->unguardedCount = unguarded.size();
+    db->unguardedOffset = ROUNDUP_N(db->databaseOffset + multiSize, 4);
+    ptr = (char *)db + db->unguardedOffset;
+    copy(unguarded.begin(), unguarded.end(), (u32 *)ptr);
+
+    // Then, write all our compiled PCRE patterns and the lookup table for
+    // them.
+    db->patternOffset = db->unguardedOffset + unguarded.size() * sizeof(u32);
+    u32 *patternOffset = (u32 *)((char *)db + db->patternOffset);
+    u32 offset = ROUNDUP_CL(db->patternOffset + elements * sizeof(u32));
+    for (unsigned int i = 0; i < elements; i++) {
+        *patternOffset = offset;
+        size_t len = pcres[i]->patternSize();
+        ptr = (char *)db + offset;
+        struct ch_pattern *pattern = (struct ch_pattern *)ptr;
+        pcres[i]->writePattern(pattern);
+        DEBUG_PRINTF("wrote pcre %u into offset %u, len %zu\n", i, offset, len);
+        offset += ROUNDUP_CL(len);
+        patternOffset++;
+    }
+
+    assert(offset <= totalSize);
+    assert(hydb->magic == CH_DB_MAGIC);
+    DEBUG_PRINTF("built hybrid database, size %zu bytes\n", totalSize);
+    DEBUG_PRINTF("offset=%u\n", offset);
+    *out = hydb;
+}
+
+} // namespace ch
+
+/**
+ * \brief Validate the two output parameters shared by all compile calls.
+ *
+ * Returns false if the call must fail with CH_COMPILER_ERROR. In that case
+ * whichever output pointer is usable has already been filled in: *db is
+ * cleared when there is nowhere to report an error, and *comp_error is set
+ * when db itself is NULL.
+ */
+static
+bool checkOutputArgs(ch_database_t **db, ch_compile_error_t **comp_error) {
+    if (!comp_error) {
+        if (db) {
+            *db = nullptr;
+        }
+        // nowhere to write the string, but we can still report an error code
+        return false;
+    }
+    if (!db) {
+        *comp_error =
+            generateChimeraCompileError("Invalid parameter: db is NULL", -1);
+        return false;
+    }
+    return true;
+}
+
+/** \brief Fail a compile call with a message not tied to any expression. */
+static
+ch_error_t rejectCompile(ch_database_t **db, ch_compile_error_t **comp_error,
+                         const char *message) {
+    *db = nullptr;
+    *comp_error = generateChimeraCompileError(message, -1);
+    return CH_COMPILER_ERROR;
+}
+
+/**
+ * \brief Check \a mode, run ch::ch_compile_multi_int() and translate any
+ * exception into a ch_compile_error_t.
+ *
+ * \a db and \a comp_error must already be known to be non-NULL.
+ */
+static
+ch_error_t compileChecked(const char *const *expressions,
+                          const unsigned *flags, const unsigned *ids,
+                          unsigned elements, unsigned mode,
+                          unsigned long int match_limit,
+                          unsigned long int match_limit_recursion,
+                          const hs_platform_info_t *platform,
+                          ch_database_t **db,
+                          ch_compile_error_t **comp_error) {
+    if (!checkMode(mode, comp_error)) {
+        *db = nullptr;
+        assert(*comp_error); // set by checkMode
+        return CH_COMPILER_ERROR;
+    }
+
+    try {
+        // Internal function to do all the work, now that we've handled all the
+        // argument checking.
+        ch::ch_compile_multi_int(expressions, flags, ids, elements, mode,
+                                 match_limit, match_limit_recursion, platform,
+                                 db);
+    }
+    catch (const CompileError &e) {
+        // Compiler error occurred
+        *db = nullptr;
+        *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ?
+                                                  (int)e.index : -1);
+        return CH_COMPILER_ERROR;
+    }
+    catch (const std::bad_alloc &) {
+        *db = nullptr;
+        *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
+        return CH_COMPILER_ERROR;
+    }
+    catch (...) {
+        assert(!"Internal error, unexpected exception");
+        *db = nullptr;
+        *comp_error = const_cast<ch_compile_error_t *>(&ch_einternal);
+        return CH_COMPILER_ERROR;
+    }
+
+    DEBUG_PRINTF("success!\n");
+    return CH_SUCCESS;
+}
+
+/** \brief Compile a single expression; it is given ID zero. */
+extern "C" HS_PUBLIC_API
+ch_error_t HS_CDECL ch_compile(const char *expression, unsigned flags,
+                               unsigned mode,
+                               const hs_platform_info_t *platform,
+                               ch_database_t **db,
+                               ch_compile_error_t **comp_error) {
+    if (!checkOutputArgs(db, comp_error)) {
+        return CH_COMPILER_ERROR;
+    }
+    if (!expression) {
+        return rejectCompile(db, comp_error,
+                             "Invalid parameter: expression is NULL");
+    }
+
+    unsigned id = 0; // single expressions get zero as an ID
+    return compileChecked(&expression, &flags, &id, 1, mode, 0, 0, platform,
+                          db, comp_error);
+}
+
+/** \brief Compile a set of expressions with the default PCRE match limits. */
+extern "C" HS_PUBLIC_API
+ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
+                                     const unsigned *flags, const unsigned *ids,
+                                     unsigned elements, unsigned mode,
+                                     const hs_platform_info_t *platform,
+                                     ch_database_t **db,
+                                     ch_compile_error_t **comp_error) {
+    if (!checkOutputArgs(db, comp_error)) {
+        return CH_COMPILER_ERROR;
+    }
+    if (!expressions) {
+        return rejectCompile(db, comp_error,
+                             "Invalid parameter: expressions is NULL");
+    }
+    if (!elements) {
+        return rejectCompile(db, comp_error,
+                             "Invalid parameter: elements is zero");
+    }
+
+    return compileChecked(expressions, flags, ids, elements, mode, 0, 0,
+                          platform, db, comp_error);
+}
+
+/**
+ * \brief Compile a set of expressions with caller-supplied PCRE match limits.
+ * A limit of zero selects the built-in default.
+ */
+extern "C" HS_PUBLIC_API
+ch_error_t HS_CDECL ch_compile_ext_multi(
+                                     const char *const *expressions,
+                                     const unsigned *flags,
+                                     const unsigned *ids,
+                                     unsigned elements, unsigned mode,
+                                     unsigned long int match_limit,
+                                     unsigned long int match_limit_recursion,
+                                     const hs_platform_info_t *platform,
+                                     ch_database_t **db,
+                                     ch_compile_error_t **comp_error) {
+    if (!checkOutputArgs(db, comp_error)) {
+        return CH_COMPILER_ERROR;
+    }
+    if (!expressions) {
+        return rejectCompile(db, comp_error,
+                             "Invalid parameter: expressions is NULL");
+    }
+    if (!elements) {
+        return rejectCompile(db, comp_error,
+                             "Invalid parameter: elements is zero");
+    }
+
+    return compileChecked(expressions, flags, ids, elements, mode,
+                          match_limit, match_limit_recursion, platform, db,
+                          comp_error);
+}
+
+/** \brief Release a compile error returned by one of the compile calls. */
+extern "C" HS_PUBLIC_API
+ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error) {
+    freeChimeraCompileError(error);
+    return CH_SUCCESS;
+}
diff --git a/chimera/ch_compile.h b/chimera/ch_compile.h
new file mode 100644
index 00000000..03c750eb
--- /dev/null
+++ b/chimera/ch_compile.h
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CH_COMPILE_H_ +#define CH_COMPILE_H_ + +/** + * @file + * @brief The Chimera compiler API definition. + * + * Chimera is a hybrid solution of Hyperscan and PCRE. + * + * This header contains functions for compiling regular expressions into + * Chimera databases that can be used by the Chimera runtime. + */ + +#include "ch_common.h" +#include "hs_compile.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * A type containing error details that is returned by the compile calls (@ref + * ch_compile() and @ref ch_compile_multi()) on failure. The caller may inspect + * the values returned in this type to determine the cause of failure. + */ +typedef struct ch_compile_error { + /** + * A human-readable error message describing the error. + */ + char *message; + + /** + * The zero-based number of the expression that caused the error (if this + * can be determined). If the error is not specific to an expression, then + * this value will be less than zero. + */ + int expression; +} ch_compile_error_t; + +/** + * The basic regular expression compiler. + * + * This is the function call with which an expression is compiled into a + * Chimera database which can be passed to the runtime function ( + * @ref ch_scan()) + * + * @param expression + * The NULL-terminated expression to parse. 
Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @a flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @a expression, and @ref CH_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - CH_FLAG_CASELESS - Matching will be performed case-insensitively. + * - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - CH_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - CH_FLAG_UCP - Use Unicode properties for character classes. + * + * @param mode + * Compiler mode flag that affect the database as a whole for capturing + * groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied. + * See @ref CH_MODE_FLAG for more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref ch_free_database() function. + * + * @param compile_error + * If the compile fails, a pointer to a @ref ch_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * ch_free_compile_error() function. 
+ * + * @return + * @ref CH_SUCCESS is returned on successful compilation; @ref + * CH_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +ch_error_t HS_CDECL ch_compile(const char *expression, unsigned int flags, + unsigned int mode, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **compile_error); + +/** + * The multiple regular expression compiler. + * + * This is the function call with which a set of expressions is compiled into a + * database which can be passed to the runtime function (@ref ch_scan()). + * Each expression can be labelled with a unique integer which is passed into + * the match callback to identify the pattern that has matched. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * ch_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @a expressions array, and @ref CH_FLAG_CASELESS as the + * first value in the @a flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - CH_FLAG_CASELESS - Matching will be performed case-insensitively. + * - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - CH_FLAG_UCP - Use Unicode properties for character classes. 
+ * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flag that affect the database as a whole for capturing + * groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied. + * See @ref CH_MODE_FLAG for more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref ch_free_database() function. + * + * @param compile_error + * If the compile fails, a pointer to a @ref ch_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * ch_free_compile_error() function. + * + * @return + * @ref CH_SUCCESS is returned on successful compilation; @ref + * CH_COMPILER_ERROR on failure, with details provided in the @a error + * parameter. + * + */ +ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + unsigned int elements, unsigned int mode, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **compile_error); + +/** + * The multiple regular expression compiler with extended match limits support. 
+ * + * This is the function call with which a set of expressions is compiled into a + * database in the same way as @ref ch_compile_multi(), but allows additional + * parameters to be specified via match_limit and match_limit_recursion to + * define match limits for PCRE runtime. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * ch_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @a expressions array, and @ref CH_FLAG_CASELESS as the + * first value in the @a flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - CH_FLAG_CASELESS - Matching will be performed case-insensitively. + * - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - CH_FLAG_UCP - Use Unicode properties for character classes. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flag that affect the database as a whole for capturing + * groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied. + * See @ref CH_MODE_FLAG for more details. 
+ * + * @param match_limit + * A limit from pcre_extra on the amount of match function called in PCRE + * to limit backtracking that can take place. + * + * @param match_limit_recursion + * A limit from pcre_extra on the recursion depth of match function + * in PCRE. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref ch_free_database() function. + * + * @param compile_error + * If the compile fails, a pointer to a @ref ch_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * ch_free_compile_error() function. + * + * @return + * @ref CH_SUCCESS is returned on successful compilation; @ref + * CH_COMPILER_ERROR on failure, with details provided in the @a error + * parameter. + * + */ +ch_error_t HS_CDECL ch_compile_ext_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + unsigned int elements, + unsigned int mode, + unsigned long int match_limit, + unsigned long int match_limit_recursion, + const hs_platform_info_t *platform, + ch_database_t **db, + ch_compile_error_t **compile_error); + +/** + * Free an error structure generated by @ref ch_compile(), @ref + * ch_compile_multi(). + * + * @param error + * The @ref ch_compile_error_t to be freed. NULL may also be safely + * provided. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error); + +/** + * @defgroup CH_PATTERN_FLAG Pattern flags + * + * @{ + */ + +/** + * Compile flag: Set case-insensitive matching. 
+ * + * This flag sets the expression to be matched case-insensitively by default. + * The expression may still use PCRE tokens (notably `(?i)` and + * `(?-i)`) to switch case-insensitive matching on and off. + */ +#define CH_FLAG_CASELESS 1 + +/** + * Compile flag: Matching a `.` will not exclude newlines. + * + * This flag sets any instances of the `.` token to match newline characters as + * well as all other characters. The PCRE specification states that the `.` + * token does not match newline characters by default, so without this flag the + * `.` token will not cross line boundaries. + */ +#define CH_FLAG_DOTALL 2 + +/** + * Compile flag: Set multi-line anchoring. + * + * This flag instructs the expression to make the `^` and `$` tokens match + * newline characters as well as the start and end of the stream. If this flag + * is not specified, the `^` token will only ever match at the start of a + * stream, and the `$` token will only ever match at the end of a stream within + * the guidelines of the PCRE specification. + */ +#define CH_FLAG_MULTILINE 4 + +/** + * Compile flag: Set single-match only mode. + * + * This flag sets the expression's match ID to match at most once, only the + * first match for each invocation of @ref ch_scan() will be returned. + * + */ +#define CH_FLAG_SINGLEMATCH 8 + +/** + * Compile flag: Enable UTF-8 mode for this expression. + * + * This flag instructs Chimera to treat the pattern as a sequence of UTF-8 + * characters. The results of scanning invalid UTF-8 sequences with a Chimera + * library that has been compiled with one or more patterns using this flag are + * undefined. + */ +#define CH_FLAG_UTF8 32 + +/** + * Compile flag: Enable Unicode property support for this expression. + * + * This flag instructs Chimera to use Unicode properties, rather than the + * default ASCII interpretations, for character mnemonics like `\w` and `\s` as + * well as the POSIX character classes. 
It is only meaningful in conjunction + * with @ref CH_FLAG_UTF8. + */ +#define CH_FLAG_UCP 64 + +/** @} */ + +/** + * @defgroup CH_MODE_FLAG Compile mode flags + * + * The mode flags are used as values for the mode parameter of the various + * compile calls (@ref ch_compile(), @ref ch_compile_multi(). + * + * By default, the matcher will only supply the start and end offsets of the + * match when the match callback is called. Using mode flag @ref CH_MODE_GROUPS + * will also fill the `captured' array with the start and end offsets of all + * the capturing groups specified by the pattern that has matched. + * + * @{ + */ + +/** + * Compiler mode flag: Disable capturing groups. + */ +#define CH_MODE_NOGROUPS 0 + +/** + * Compiler mode flag: Enable capturing groups. + */ +#define CH_MODE_GROUPS 1048576 + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_COMPILE_H_ */ diff --git a/chimera/ch_database.c b/chimera/ch_database.c new file mode 100644 index 00000000..387d076e --- /dev/null +++ b/chimera/ch_database.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Chimera: database construction, etc. + */ + +#include +#include +#include +#include + +#include "allocator.h" +#include "database.h" +#include "hs.h" +#include "ch.h" +#include "hs_internal.h" +#include "ch_common.h" +#include "ch_alloc.h" +#include "ch_database.h" +#include "ch_internal.h" + +static really_inline +int db_correctly_aligned(const void *db) { + return ISALIGNED_N(db, alignof(unsigned long long)); +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_free_database(ch_database_t *hydb) { + if (hydb && hydb->magic != CH_DB_MAGIC) { + return CH_INVALID; + } + ch_database_free(hydb); + + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_database_size(const ch_database_t *hydb, size_t *size) { + if (!size) { + return CH_INVALID; + } + + ch_error_t ret = hydbIsValid(hydb); + if (unlikely(ret != CH_SUCCESS)) { + return ret; + } + + *size = sizeof(struct ch_database) + hydb->length; + return CH_SUCCESS; +} + +/** \brief Identifier prepended to database info. 
*/ +static const char CHIMERA_IDENT[] = "Chimera "; + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_database_info(const ch_database_t *hydb, char **info) { + if (!info) { + return CH_INVALID; + } + *info = NULL; + + if (!hydb || !db_correctly_aligned(hydb) || hydb->magic != CH_DB_MAGIC) { + return HS_INVALID; + } + + const struct ch_bytecode *bytecode = ch_get_bytecode(hydb); + char noMulti = (bytecode->flags & CHIMERA_FLAG_NO_MULTIMATCH); + if (noMulti) { + size_t len = strlen(CHIMERA_IDENT); + *info = ch_misc_alloc(len + 1); + if (!(*info)) { + return CH_INVALID; + } + memcpy((*info), CHIMERA_IDENT, len); + (*info)[len] = '\0'; + return CH_SUCCESS; + } + + char *hsinfo = NULL; + hs_error_t ret = hs_database_info(getHyperscanDatabase(bytecode), &hsinfo); + if (ret != HS_SUCCESS) { + assert(!hsinfo); + return ret; + } + + size_t hybridlen = strlen(CHIMERA_IDENT); + size_t hslen = strlen(hsinfo); + *info = ch_misc_alloc(hybridlen + hslen + 1); + if (!(*info)) { + ch_misc_free(hsinfo); + return CH_INVALID; + } + + memcpy((*info), CHIMERA_IDENT, hybridlen); + memcpy((*info) + hybridlen, hsinfo, hslen); + (*info)[hybridlen + hslen] = '\0'; + ch_misc_free(hsinfo); + + return CH_SUCCESS; +} diff --git a/chimera/ch_database.h b/chimera/ch_database.h new file mode 100644 index 00000000..28bde86e --- /dev/null +++ b/chimera/ch_database.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime code for ch_database manipulation. + */ + +#ifndef CH_DATABASE_H_ +#define CH_DATABASE_H_ + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define PCRE_STATIC +#include + +#include "ch_compile.h" // for CH_MODE_ flags +#include "ue2common.h" +#include "hs_version.h" +#include "hs.h" + +#define CH_DB_MAGIC 0xdedededeU //!< Magic number stored in \ref ch_database + +/** \brief Main Chimera database header. */ +struct ch_database { + u32 magic; //!< must be \ref CH_DB_MAGIC + u32 version; //!< release version + u32 length; //!< total allocated length in bytes + u32 reserved0; //!< unused + u32 reserved1; //!< unused + u32 bytecode; //!< offset relative to db start + u32 padding[16]; //!< padding for alignment of rest of bytecode + char bytes[]; +}; + +/** \brief Chimera bytecode header, which follows the \ref ch_database and is + * always 64-byte aligned. 
*/ +struct ch_bytecode { + u32 length; //!< length of bytecode including this header struct + u32 flags; //!< whole-database flags (CHIMERA_FLAG_NO_MULTIMATCH, + // CHIMERA_FLAG_GROUPS) + u32 patternCount; //!< total number of patterns + u32 activeSize; //!< size of mmbit to store active pattern ids + u32 databaseOffset; //!< offset for database following \ref ch_bytecode + // header + u32 patternOffset; //!< points to an array of u32 offsets, each pointing to + // a \ref ch_pattern + u32 unguardedOffset; //!< pointer to a list of unguarded pattern indices + u32 unguardedCount; //!< number of unguarded patterns + u32 maxCaptureGroups; //!< max number of capture groups used by any pattern +}; + +/** \brief Per-pattern header. + * + * struct is followed in bytecode by: + * 1. pcre bytecode (always present) + * 2. pcre study data (sometimes) + */ +struct ch_pattern { + u32 id; //!< pattern ID to report to the user + u32 flags; //!< per-pattern flags (e.g. \ref CHIMERA_PATTERN_FLAG_UTF8) + u32 maxWidth; //!< maximum width of a match, or UINT_MAX for inf. + u32 minWidth; //!< minimum width of a match. + u32 fixedWidth;//!< pattern has fixed width. + u32 studyOffset; //!< offset relative to struct start of study data, + // or zero if there is none + u32 length; //!< length of struct plus pcre bytecode and study data + pcre_extra extra; //!< pcre_extra struct, used to store study data ptr for + // the currently-running pcre at runtime. 
+}; + +static really_inline +const void *ch_get_bytecode(const struct ch_database *db) { + assert(db); + const void *bytecode = (const char *)db + db->bytecode; + assert(ISALIGNED_16(bytecode)); + return bytecode; +} + +struct hs_database; + +static really_inline +const struct hs_database *getHyperscanDatabase(const struct ch_bytecode *db) { + assert(db); + const char *ptr = (const char *)db; + const struct hs_database *hs_db; + hs_db = (const struct hs_database *)(ptr + db->databaseOffset); + assert(ISALIGNED_CL(hs_db)); + return hs_db; +} + +static really_inline +const u32 *getUnguarded(const struct ch_bytecode *db) { + assert(db); + const char *ptr = (const char *)db; + const u32 *unguarded = (const u32 *)(ptr + db->unguardedOffset); + assert(ISALIGNED_N(unguarded, sizeof(u32))); + return unguarded; +} + +static really_inline +const struct ch_pattern *getPattern(const struct ch_bytecode *db, u32 i) { + assert(db); + assert(i < db->patternCount); + const char *ptr = (const char *)db; + const u32 *patternOffset = (const u32 *)(ptr + db->patternOffset); + assert(patternOffset[i] < db->length); + return (const struct ch_pattern *)(ptr + patternOffset[i]); +} + +static really_inline +ch_error_t hydbIsValid(const struct ch_database *hydb) { + if (!hydb || hydb->magic != CH_DB_MAGIC) { + DEBUG_PRINTF("bad magic (%u != %u)\n", hydb->magic, CH_DB_MAGIC); + return CH_INVALID; + } + + if (hydb->version != HS_VERSION_32BIT) { + DEBUG_PRINTF("bad version\n"); + return CH_DB_VERSION_ERROR; + } + + return CH_SUCCESS; +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_DATABASE_H_ */ + diff --git a/chimera/ch_internal.h b/chimera/ch_internal.h new file mode 100644 index 00000000..a54d1392 --- /dev/null +++ b/chimera/ch_internal.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * 
Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Chimera: data structures and internals.
+ */
+
+#ifndef CH_INTERNAL_H
+#define CH_INTERNAL_H
+
+/* Whole-database flags: the runtime tests these against the `flags` field of
+ * the bytecode header (struct ch_bytecode). */
+#define CHIMERA_FLAG_NO_MULTIMATCH 1 //!< Don't run a multimatch scan
+#define CHIMERA_FLAG_GROUPS 2 //!< Return capturing groups
+#define CHIMERA_FLAG_ALL_CONFIRM 4 //!< All patterns need confirm
+#define CHIMERA_FLAG_ALL_SINGLE 8 //!< All patterns need only one match
+
+/* Per-pattern flags: the runtime tests these against the `flags` field of the
+ * per-pattern header (struct ch_pattern). */
+#define CHIMERA_PATTERN_FLAG_SINGLEMATCH 1 //!< only report the first match
+#define CHIMERA_PATTERN_FLAG_UTF8 2 //!< pattern is in UTF-8 mode
+
+#endif /* CH_INTERNAL_H */
diff --git a/chimera/ch_runtime.c b/chimera/ch_runtime.c
new file mode 100644
index 00000000..4685192b
--- /dev/null
+++ b/chimera/ch_runtime.c
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Chimera: main runtime.
+ */
+
+/* NOTE(review): the four system #include directives below have no header name
+ * in this copy of the patch. This file uses assert() and INT_MAX, so
+ * <assert.h> and <limits.h> are presumably among them -- confirm against the
+ * upstream file before applying. */
+#include
+#include
+#include
+#include
+
+#include "ch.h"
+#include "hs.h"
+#include "hs_internal.h"
+#include "ue2common.h"
+#include "ch_database.h"
+#include "ch_internal.h"
+#include "ch_scratch.h"
+#include "util/multibit.h"
+#include "util/unicode_def.h"
+
+typedef struct queue_item PQ_T;
+
+/** \brief Queue ordering: returns non-zero if item \a a sorts before item
+ * \a b, comparing end offset (`to`) first, then start offset (`from`), then
+ * pattern `id`. Defined before util/pqueue.h is included. */
+static
+char PQ_COMP(PQ_T *pqc_items, int a, int b) {
+    if ((pqc_items)[a].to != (pqc_items)[b].to) {
+        return (pqc_items)[a].to < (pqc_items)[b].to;
+    } else if ((pqc_items)[a].from != (pqc_items)[b].from) {
+        return (pqc_items)[a].from < (pqc_items)[b].from;
+    } else {
+        return (pqc_items)[a].id < (pqc_items)[b].id;
+    }
+}
+
+/** \brief Same ordering as PQ_COMP, but compares item \a a against a fixed
+ * item passed by value rather than a second index. */
+static
+char PQ_COMP_B(PQ_T *pqc_items, int a, PQ_T b_fixed) {
+    if ((pqc_items)[a].to != (b_fixed).to) {
+        return (pqc_items)[a].to < (b_fixed).to;
+    } else if ((pqc_items)[a].from != (b_fixed).from) {
+        return (pqc_items)[a].from < (b_fixed).from;
+    } else {
+        return (pqc_items)[a].id < b_fixed.id;
+    }
+}
+
+#include "util/pqueue.h"
+
+/** \brief Build a queue item from (\a from, \a to, \a id), insert it with
+ * pq_insert() and bump the queue size. */
+static really_inline
+void pq_insert_with(struct match_pq *pq, int from, int to, u32 id) {
+    DEBUG_PRINTF("inserting pattern%u in pq at %u\n", id, to);
+    struct queue_item temp = {
+        .from = from,
+        .to = to,
+        .id = id,
+    };
+
+    pq_insert(pq->item, pq->size, temp);
+    ++pq->size;
+}
+
+/** \brief Remove the top item with pq_pop() and decrement the queue size. */
+static really_inline
+void pq_pop_nice(struct match_pq *pq) {
+    pq_pop(pq->item, pq->size);
+    pq->size--;
+}
+
+/** dummy event handler for use when user does not provide one */
+static
+int null_onEvent(UNUSED unsigned id, UNUSED unsigned long long from,
+                 UNUSED unsigned long long to, UNUSED unsigned flags,
+                 UNUSED unsigned size, UNUSED const ch_capture_t *captured,
+                 UNUSED void *ctxt) {
+    return 0;
+}
+
+/** \brief Chimera runtime context. */
+struct HybridContext {
+    const char *data; //!< buffer being scanned
+    u32 length; //!< length of data buffer
+    u32 valid_utf8_highwater; //!< UTF-8 has been validated up to here.
+    const struct ch_bytecode *db;
+    struct ch_scratch *scratch;
+    struct match_pq *pq;
+    /** \brief user-supplied match callback */
+    int (*match_callback)(unsigned int id, unsigned long long from,
+                          unsigned long long to, unsigned int flags,
+                          unsigned int size, const ch_capture_t *capture,
+                          void *ctx);
+    /** \brief user-supplied error callback */
+    int (*error_callback)(ch_error_event_t error_type, unsigned int id,
+                          void *info, void *ctx);
+    /** \brief user-supplied context */
+    void *context;
+};
+
+// Internal PCRE func.
+extern int _pcre_valid_utf(const unsigned char *, int, int *);
+
+/** UTF-8 validity check. Returns >0 if the given region of the data is valid
+ * UTF-8, 0 otherwise. */
+static
+char isValidUTF8(struct HybridContext *hyctx, u32 end) {
+    assert(hyctx);
+
+    if (hyctx->valid_utf8_highwater >= end) {
+        return 1; // Already validated.
+    }
+
+    // Only the not-yet-validated tail, from the highwater mark to end, is
+    // checked.
+    const unsigned char *data =
+        (const unsigned char *)hyctx->data + hyctx->valid_utf8_highwater;
+    int validate_len = end - hyctx->valid_utf8_highwater;
+
+    DEBUG_PRINTF("validating %d bytes\n", validate_len);
+
+    int erroroffset = 0;
+    if (_pcre_valid_utf(data, validate_len, &erroroffset)) {
+        DEBUG_PRINTF("UTF8 invalid at offset %d\n", erroroffset);
+        return 0;
+    }
+
+    hyctx->valid_utf8_highwater = end;
+    return 1;
+}
+
+/** \brief Returns the compiled pcre for \a pattern, which sits after the
+ * ch_pattern header rounded up to a multiple of 8 bytes. */
+static
+const pcre *getPcre(const struct ch_pattern *pattern) {
+    const char *ptr = (const char *)pattern;
+    const pcre *p = (const pcre *)(ptr + ROUNDUP_N(sizeof(*pattern), 8));
+    assert(ISALIGNED_N(p, 8));
+    return p;
+}
+
+/** \brief Fill the Chimera groups array from a pcre_exec ovector. */
+static
+void fillGroupsFromOvector(ch_capture_t *groups, int numPairs, int *ovector) {
+    assert(groups);
+    assert(ISALIGNED_N(groups, alignof(ch_capture_t)));
+
+    DEBUG_PRINTF("filling %d groups (@ %p) from pcre ovector\n",
+                 numPairs, groups);
+
+    // Each group is an (start, end) pair; a start of -1 marks the group as
+    // inactive and leaves its offsets untouched.
+    for (int i = 0; i < numPairs * 2; i += 2) {
+        if (ovector[i] == -1) {
+            groups->flags = CH_CAPTURE_FLAG_INACTIVE;
+        } else {
+            groups->flags = CH_CAPTURE_FLAG_ACTIVE;
+            assert(ovector[i] <= ovector[i + 1]);
+            groups->from = ovector[i];
+            groups->to = ovector[i + 1];
+        }
+        ++groups;
+    }
+}
+
+/** \brief Translate a negative pcre_exec() return value.
+ *
+ * PCRE_ERROR_NOMATCH yields CH_SUCCESS. For PCRE_ERROR_MATCHLIMIT and
+ * PCRE_ERROR_RECURSIONLIMIT the user's error callback, if any, is invoked and
+ * its return value is passed back unchanged; without a callback the result is
+ * CH_SUCCESS. Every other value yields CH_FAIL_INTERNAL. */
+static
+ch_error_t handlePcreNonMatch(const struct ch_pattern *pattern, int rv,
+                              ch_error_event_handler onError,
+                              void *userContext) {
+    assert(rv < 0);
+
+    if (rv == PCRE_ERROR_NOMATCH) {
+        DEBUG_PRINTF("no match found by libpcre\n");
+        return CH_SUCCESS;
+    } else if (rv == PCRE_ERROR_MATCHLIMIT) {
+        DEBUG_PRINTF("pcre hit match limit\n");
+        if (onError) {
+            return onError(CH_ERROR_MATCHLIMIT, pattern->id, NULL,
+                           userContext);
+        }
+        return CH_SUCCESS;
+    } else if (rv == PCRE_ERROR_RECURSIONLIMIT) {
+        DEBUG_PRINTF("pcre hit recursion limit\n");
+        if (onError) {
+            return onError(CH_ERROR_RECURSIONLIMIT, pattern->id, NULL,
+                           userContext);
+        }
+        return CH_SUCCESS;
+    }
+
+    // All other errors not handled above are fatal.
+    return CH_FAIL_INTERNAL;
+}
+
+/** \brief Run pcre_exec() for pattern \a id over the whole buffer, starting
+ * at \a offset.
+ *
+ * On a match, the capture data and group count are stored in the pattern's
+ * scratch data, the match is pushed onto the priority queue and the pattern's
+ * scanStart is advanced past the match. On a non-match or error the result of
+ * handlePcreNonMatch() is returned. \a length is only used for debug output. */
+static
+ch_error_t scanPcre(struct HybridContext *hyctx, UNUSED unsigned int length,
+                    unsigned int offset, u32 id) {
+    const char *data = hyctx->data;
+    unsigned int full_length = hyctx->length;
+    ch_error_event_handler onError = hyctx->error_callback;
+    void *userContext = hyctx->context;
+
+    const struct ch_pattern *pattern = getPattern(hyctx->db, id);
+    const pcre *p = getPcre(pattern);
+
+    // Set up the PCRE extra block.
+    const pcre_extra *extra = &pattern->extra;
+
+    int startoffset = offset;
+
+    int *ovector = hyctx->scratch->ovector;
+    int ovectorSize = (hyctx->scratch->maxCaptureGroups + 1) * 3;
+    assert(ovectorSize >= 2);
+
+    DEBUG_PRINTF("scanning %u bytes, pattern %u, startoffset %d\n",
+                 length, id, startoffset);
+
+    int options = 0;
+    if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) {
+        // We do our own UTF-8 validation.
+        options |= PCRE_NO_UTF8_CHECK;
+        if (!isValidUTF8(hyctx, full_length)) {
+            return handlePcreNonMatch(pattern, PCRE_ERROR_BADUTF8, onError,
+                                      userContext);
+        }
+    }
+
+    int rv = pcre_exec(p, extra, data, full_length, startoffset, options,
+                       ovector, ovectorSize);
+
+    DEBUG_PRINTF("pcre return code is %d\n", rv);
+
+    // Handle all non-match or error cases, all of which involve us
+    // terminating the loop.
+    if (rv < 0) {
+        return handlePcreNonMatch(pattern, rv, onError, userContext);
+    }
+
+    // We've found a match, and we should always have room for at least the
+    // start and end offsets in our ovector. Pass this info to the user.
+    assert(rv >= 1);
+    assert(rv < ovectorSize);
+    int from = ovector[0];
+    int to = ovector[1];
+    DEBUG_PRINTF("match %d -> %d\n", from, to);
+
+    struct ch_patterndata *pd = hyctx->scratch->patternData + id;
+
+    // Without CHIMERA_FLAG_GROUPS no capture data is kept and the reported
+    // group count is zero.
+    if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) {
+        fillGroupsFromOvector(pd->match, rv, ovector);
+    } else {
+        rv = 0;
+    }
+    pd->groupCount = (u32)rv;
+
+    // Insert new matched item to the queue
+    pq_insert_with(hyctx->pq, from, to, id);
+
+    // Next scan starts at the first codepoint after the match. It's
+    // possible that we have a vacuous match, in which case we must step
+    // past it to ensure that we always progress.
+    if (from != to) {
+        startoffset = to;
+    } else if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) {
+        startoffset = to + 1;
+        while (startoffset < (int)full_length &&
+               ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
+            ++startoffset;
+        }
+    } else {
+        startoffset = to + 1;
+    }
+
+    pd->scanStart = startoffset;
+    DEBUG_PRINTF("new offset %u\n", pd->scanStart);
+
+    return CH_SUCCESS;
+}
+
+/** \brief Report queued matches whose end offset does not exceed \a to.
+ *
+ * Matches are popped in queue order and passed to the user's match callback.
+ * After each report (unless the popped item belongs to pattern \a id, which
+ * ends the loop) the pattern is rescanned with scanPcre() to queue its next
+ * match. If the top of the queue ends beyond \a to, the incoming match
+ * (\a from, \a to, \a id) is queued instead and the loop stops.
+ *
+ * \return CH_SCAN_TERMINATED if a callback asked to terminate,
+ * CH_FAIL_INTERNAL if scanPcre() reported it, CH_SUCCESS otherwise. */
+static
+ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id,
+                       unsigned long long from, unsigned long long to) {
+    ch_match_event_handler onEvent = hyctx->match_callback;
+    void *userContext = hyctx->context;
+    DEBUG_PRINTF("priority queue size %u\n", hyctx->pq->size);
+    while (hyctx->pq->size) {
+        u32 num_item = hyctx->pq->size;
+        struct queue_item *item = pq_top(hyctx->pq->item);
+        size_t top_from = item->from;
+        size_t top_to = item->to;
+        u32 top_id = item->id;
+
+        if (top_to > to) {
+            pq_insert_with(hyctx->pq, from, to, id);
+            break;
+        }
+        pq_pop_nice(hyctx->pq);
+
+        const struct ch_pattern *pattern = getPattern(hyctx->db, top_id);
+        struct ch_patterndata *pd = hyctx->scratch->patternData + top_id;
+
+        // Report match for pattern
+        DEBUG_PRINTF("trigger match@%zu\n", top_to);
+        ch_callback_t cbrv =
+            onEvent(pattern->id, top_from, top_to, 0 /* flags */,
+                    pd->groupCount, pd->match, userContext);
+
+        if (cbrv == CH_CALLBACK_TERMINATE) {
+            DEBUG_PRINTF("user callback told us to terminate scanning\n");
+            return CH_SCAN_TERMINATED;
+        } else if (cbrv == CH_CALLBACK_SKIP_PATTERN) {
+            DEBUG_PRINTF("user callback told us to skip this pattern\n");
+            pd->scanStart = hyctx->length;
+        }
+
+        if (top_id == id) {
+            break;
+        }
+
+        // Push a new match to replace the old one
+        unsigned int start = pd->scanStart;
+        unsigned int len = hyctx->length - pd->scanStart;
+        if (hyctx->length >= pd->scanStart &&
+            !(pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH)) {
+            DEBUG_PRINTF("get a new match item\n");
+            int ret = scanPcre(hyctx, len, start, top_id);
+
+            // scanPcre() may hand back the error callback's return value, so
+            // it is compared against the CH_CALLBACK_* values here.
+            if (ret == CH_CALLBACK_TERMINATE) {
+                DEBUG_PRINTF("user callback told us to terminate scanning\n");
+                return CH_SCAN_TERMINATED;
+            } else if (ret == CH_CALLBACK_SKIP_PATTERN) {
+                DEBUG_PRINTF("user callback told us to skip this pattern\n");
+                pd->scanStart = hyctx->length;
+                ret = CH_SUCCESS;
+            } else if (ret == CH_FAIL_INTERNAL) {
+                return ret;
+            }
+
+            // No further match is found
+            if (hyctx->pq->size == num_item - 1) {
+                pd->scanStart = hyctx->length;
+            }
+        }
+    }
+
+    return CH_SUCCESS;
+}
+
+/** \brief Callback used for internal Hyperscan multi-matcher. */
+static
+int multiCallback(unsigned int id, unsigned long long from,
+                  unsigned long long to, UNUSED unsigned int flags,
+                  void *ctx) {
+    assert(ctx);
+    struct HybridContext *hyctx = ctx;
+
+    DEBUG_PRINTF("match for ID %u at offset %llu\n", id, to);
+    assert(id < hyctx->db->patternCount);
+
+    const struct ch_pattern *pattern = getPattern(hyctx->db, id);
+    struct ch_patterndata *pd = hyctx->scratch->patternData + id;
+    // A fixedWidth of ~0U selects the PCRE confirm path below; any other
+    // value takes the path that queues the Hyperscan match directly.
+    char needConfirm = pattern->fixedWidth == ~0U;
+
+    if (needConfirm &&
+        mmbit_isset(hyctx->scratch->active, hyctx->db->patternCount, id)) {
+        if ((hyctx->db->flags & CHIMERA_FLAG_ALL_CONFIRM) &&
+            mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) {
+            return 1;
+        }
+        return 0;
+    }
+    // Store the fact that we've seen this bit.
+    char already = mmbit_set(hyctx->scratch->active,
+                             hyctx->db->patternCount, id);
+    DEBUG_PRINTF("match from %u to %llu\n", pd->scanStart, to);
+
+    if (!already) {
+        pd->scanStart = 0;
+    } else if (to < pd->scanStart + pattern->minWidth) {
+        return 0;
+    } else if (pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH) {
+        if ((hyctx->db->flags & CHIMERA_FLAG_ALL_SINGLE) &&
+            mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) {
+            return 1;
+        }
+        // Note: we may have unordered match from Hyperscan,
+        // thus possibly get to < pd->scanStart.
+        return 0;
+    }
+
+    int ret = HS_SUCCESS;
+    unsigned int start = pd->scanStart;
+    unsigned int len = hyctx->length - pd->scanStart;
+    assert(hyctx->length >= pd->scanStart);
+    const char *data = hyctx->data;
+    if (needConfirm) {
+        DEBUG_PRINTF("run confirm for the first time\n");
+        ret = scanPcre(hyctx, len, start, id);
+        hyctx->scratch->ret = ret;
+        if (ret == CH_CALLBACK_TERMINATE) {
+            DEBUG_PRINTF("user callback told us to terminate scanning\n");
+            return HS_SCAN_TERMINATED;
+        } else if (ret == CH_CALLBACK_SKIP_PATTERN) {
+            DEBUG_PRINTF("user callback told us to skip this pattern\n");
+            pd->scanStart = hyctx->length;
+            ret = HS_SUCCESS;
+        } else if (ret == CH_FAIL_INTERNAL) {
+            return ret;
+        }
+    } else {
+        if (already) {
+            DEBUG_PRINTF("catch up with new matches\n");
+            ret = catchupPcre(hyctx, id, from, to);
+
+            hyctx->scratch->ret = ret;
+            if (pd->scanStart >= hyctx->length) {
+                return ret;
+            }
+        }
+        int startoffset = 0;
+        // Next scan starts at the first codepoint after the match. It's
+        // possible that we have a vacuous match, in which case we must step
+        // past it to ensure that we always progress.
+        if (from != to) {
+            startoffset = to;
+        } else if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) {
+            startoffset = to + 1;
+            while (startoffset < (int)hyctx->length &&
+                   ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
+                ++startoffset;
+            }
+        } else {
+            startoffset = to + 1;
+        }
+        pd->scanStart = startoffset;
+        int rv = 0;
+        // With groups enabled, the whole match is recorded as the single
+        // active capture entry.
+        if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) {
+            ch_capture_t *groups = pd->match;
+            groups->flags = CH_CAPTURE_FLAG_ACTIVE;
+            groups->from = from;
+            groups->to = to;
+            rv = 1;
+        }
+        pd->groupCount = (u32)rv;
+        pq_insert_with(hyctx->pq, from, to, id);
+    }
+
+    return ret;
+}
+
+/** \brief Run hs_scan() over \a data with the embedded Hyperscan database,
+ * delivering matches to multiCallback() with \a hyctx as its context. */
+static
+hs_error_t scanHyperscan(struct HybridContext *hyctx, const char *data,
+                         unsigned int length) {
+    DEBUG_PRINTF("scanning %u bytes with Hyperscan\n", length);
+    const struct ch_bytecode *hydb = hyctx->db;
+    const hs_database_t *db = getHyperscanDatabase(hydb);
+    hs_scratch_t *scratch = hyctx->scratch->multi_scratch;
+
+    hs_error_t err = hs_scan(db, data, length, 0, scratch, multiCallback,
+                             hyctx);
+
+    return err;
+}
+
+/** \brief Init match priority queue.
+ *
+ * Add a first match offset for each pattern that is not supported by Hyperscan
+ * with prefiltering.
+ */
+static really_inline
+ch_error_t initQueue(struct HybridContext *hyctx, struct match_pq *pq) {
+    const struct ch_bytecode *db = hyctx->db;
+
+    u8 *active = hyctx->scratch->active;
+    mmbit_clear(active, db->patternCount);
+
+    // Init match queue size
+    pq->size = 0;
+
+    unsigned int length = hyctx->length;
+    const u32 *unguarded = getUnguarded(db);
+    for (u32 i = 0; i < db->unguardedCount; i++) {
+        u32 patternId = unguarded[i];
+        DEBUG_PRINTF("switch on unguarded pcre %u\n", patternId);
+        mmbit_set(active, db->patternCount, patternId);
+
+        DEBUG_PRINTF("get a new match item\n");
+        int ret = scanPcre(hyctx, length, 0, patternId);
+
+        struct ch_patterndata *pd = hyctx->scratch->patternData + patternId;
+        if (ret == CH_CALLBACK_TERMINATE) {
+            DEBUG_PRINTF("user callback told us to terminate scanning\n");
+            return CH_SCAN_TERMINATED;
+        } else if (ret == CH_CALLBACK_SKIP_PATTERN) {
+            DEBUG_PRINTF("user callback told us to skip this pattern\n");
+            pd->scanStart = length;
+            ret = CH_SUCCESS;
+        } else if (ret == CH_FAIL_INTERNAL) {
+            return ret;
+        }
+    }
+
+    return CH_SUCCESS;
+}
+
+/** \brief Implementation of ch_scan().
+ *
+ * Validates the arguments, database and scratch, marks the scratch in use,
+ * then: seeds the queue with the unguarded patterns (initQueue), runs the
+ * Hyperscan scan unless CHIMERA_FLAG_NO_MULTIMATCH is set, and finally
+ * flushes the remaining queued matches (catchupPcre). The scratch is unmarked
+ * on every path taken after it was marked. \a flags is unused. */
+static really_inline
+ch_error_t ch_scan_i(const ch_database_t *hydb,
+                     const char *data, unsigned int length,
+                     UNUSED unsigned int flags,
+                     ch_scratch_t *scratch,
+                     ch_match_event_handler onEvent,
+                     ch_error_event_handler onError,
+                     void *userContext) {
+    if (unlikely(!hydb || !scratch || !data)) {
+        DEBUG_PRINTF("args invalid\n");
+        return CH_INVALID;
+    }
+    ch_error_t ret = hydbIsValid(hydb);
+    if (ret != CH_SUCCESS) {
+        DEBUG_PRINTF("database invalid\n");
+        return ret;
+    }
+
+    if (!ISALIGNED_CL(scratch)) {
+        DEBUG_PRINTF("bad alignment %p\n", scratch);
+        return CH_INVALID;
+    }
+
+    if (scratch->magic != CH_SCRATCH_MAGIC) {
+        DEBUG_PRINTF("scratch invalid\n");
+        return CH_INVALID;
+    }
+
+    if (unlikely(markScratchInUse(scratch))) {
+        return CH_SCRATCH_IN_USE;
+    }
+
+    // Hyperscan underlying scratch and database validity will be checked by
+    // the hs_scan() call, so no need to do it here.
+
+    // PCRE takes the data region length in as an int, so this limits our block
+    // size to INT_MAX.
+    if (length > INT_MAX) {
+        DEBUG_PRINTF("length invalid\n");
+        unmarkScratchInUse(scratch);
+        return CH_INVALID;
+    }
+
+    const struct ch_bytecode *db = ch_get_bytecode(hydb);
+
+    scratch->pq.size = 0;
+    scratch->ret = CH_SUCCESS;
+
+    // Firstly, we run Hyperscan in block mode and add its matches into the
+    // active list for subsequent confirmation with pcre.
+    struct HybridContext hyctx = {
+        .data = data,
+        .length = length,
+        .valid_utf8_highwater = 0,
+        .db = db,
+        .scratch = scratch,
+        .pq = &scratch->pq,
+        .match_callback = onEvent ? onEvent : null_onEvent,
+        .error_callback = onError,
+        .context = userContext
+    };
+
+    // Init priority queue.
+    ret = initQueue(&hyctx, &scratch->pq);
+    if (ret != CH_SUCCESS) {
+        DEBUG_PRINTF("Chimera returned error %d\n", ret);
+        unmarkScratchInUse(scratch);
+        return ret;
+    }
+
+    if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) {
+        ret = scanHyperscan(&hyctx, data, length);
+        // Only the status recorded in the scratch by the match callback is
+        // propagated from here.
+        if (ret != HS_SUCCESS && scratch->ret != CH_SUCCESS) {
+            DEBUG_PRINTF("Hyperscan returned error %d\n", scratch->ret);
+            unmarkScratchInUse(scratch);
+            return scratch->ret;
+        }
+    }
+
+    DEBUG_PRINTF("Flush priority queue\n");
+    // Catch up with PCRE and make up id and offsets as we don't really care
+    // about their values
+    ret = catchupPcre(&hyctx, ~0U, length, length);
+    if (ret != CH_SUCCESS) {
+        DEBUG_PRINTF("PCRE catch up returned error %d\n", ret);
+        unmarkScratchInUse(scratch);
+        return ret;
+    }
+
+    unmarkScratchInUse(scratch);
+    return CH_SUCCESS;
+}
+
+/** \brief Public block-mode scan entry point; forwards every argument to
+ * ch_scan_i() and returns its result. */
+HS_PUBLIC_API
+ch_error_t HS_CDECL ch_scan(const ch_database_t *hydb, const char *data,
+                            unsigned int length, unsigned int flags,
+                            ch_scratch_t *scratch,
+                            ch_match_event_handler onEvent,
+                            ch_error_event_handler onError, void *userContext) {
+    ch_error_t ret = ch_scan_i(hydb, data, length, flags, scratch, onEvent,
+                               onError, userContext);
+
+    return ret;
+}
+
+/** \brief Returns the version string, HS_VERSION_STRING. */
+HS_PUBLIC_API
+const char * HS_CDECL ch_version(void) {
+    return HS_VERSION_STRING;
+}
diff --git a/chimera/ch_runtime.h b/chimera/ch_runtime.h
new file mode 100644
index 00000000..79593f1d
--- /dev/null
+++ b/chimera/ch_runtime.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CH_RUNTIME_H_
+#define CH_RUNTIME_H_
+
+#include
+
+/**
+ * @file
+ * @brief The Chimera runtime API definition.
+ *
+ * Chimera is a hybrid of the Hyperscan and PCRE regular expression engines.
+ *
+ * This header contains functions for using compiled Chimera databases for
+ * scanning data at runtime.
+ */
+
+#include "hs_common.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct ch_scratch;
+
+/**
+ * A Chimera scratch space.
+ */
+typedef struct ch_scratch ch_scratch_t;
+
+/**
+ * Callback return value used to tell the Chimera matcher what to do after
+ * processing this match.
+ */
+typedef int ch_callback_t;
+
+/**
+ * @defgroup CH_CALLBACK ch_callback_t values
+ *
+ * @{
+ */
+
+/**
+ * Continue matching.
+ */
+#define CH_CALLBACK_CONTINUE 0
+
+/**
+ * Terminate matching.
+ */
+#define CH_CALLBACK_TERMINATE 1
+
+/**
+ * Skip remaining matches for this ID and continue.
+ */
+#define CH_CALLBACK_SKIP_PATTERN 2
+
+
+/** @} */
+
+
+/**
+ * Type used to differentiate the errors raised with the @ref
+ * ch_error_event_handler callback.
+ */
+typedef int ch_error_event_t;
+
+/**
+ * @defgroup CH_ERROR_EVENT ch_error_event_t values
+ *
+ * @{
+ */
+
+/**
+ * PCRE hits its match limit and reports PCRE_ERROR_MATCHLIMIT.
+ */
+#define CH_ERROR_MATCHLIMIT 1
+
+/**
+ * PCRE hits its recursion limit and reports PCRE_ERROR_RECURSIONLIMIT.
+ */
+#define CH_ERROR_RECURSIONLIMIT 2
+
+/** @} */
+
+/**
+ * Structure representing a captured subexpression within a match. An array of
+ * these structures corresponding to capture groups in order is passed to the
+ * callback on match, with active structures identified by the
+ * CH_CAPTURE_FLAG_ACTIVE flag.
+ */
+typedef struct ch_capture {
+    /**
+     * The flags indicating if this structure is active.
+     */
+    unsigned int flags;
+
+    /**
+     * Offset at which this capture group begins.
+     */
+    unsigned long long from;
+
+    /**
+     * Offset at which this capture group ends.
+     */
+    unsigned long long to;
+} ch_capture_t;
+
+/**
+ * @defgroup CH_CAPTURE ch_capture_t flags
+ *
+ * These flags are used in @ref ch_capture_t::flags to indicate if this
+ * structure is active.
+ *
+ * @{
+ */
+
+/**
+ * Flag indicating that a particular capture group is inactive, used in @ref
+ * ch_capture_t::flags.
+ */
+#define CH_CAPTURE_FLAG_INACTIVE 0
+
+/**
+ * Flag indicating that a particular capture group is active, used in @ref
+ * ch_capture_t::flags.
+ */
+#define CH_CAPTURE_FLAG_ACTIVE 1
+
+/** @} */
+
+/**
+ * Definition of the match event callback function type.
+ *
+ * A callback function matching the defined type must be provided by the
+ * application calling the @ref ch_scan() function.
+ *
+ * This callback function will be invoked whenever a match is located in the
+ * target data during the execution of a scan. The details of the match are
+ * passed in as parameters to the callback function, and the callback function
+ * should return a value indicating whether or not matching should continue on
+ * the target data. If no callbacks are desired from a scan call, NULL may be
+ * provided in order to suppress match production.
+ *
+ * @param id
+ *      The ID number of the expression that matched. If the expression was a
+ *      single expression compiled with @ref ch_compile(), this value will be
+ *      zero.
+ *
+ * @param from
+ *      The offset of the first byte that matches the expression.
+ *
+ * @param to
+ *      The offset after the last byte that matches the expression.
+ *
+ * @param flags
+ *      This is provided for future use and is unused at present.
+ *
+ * @param size
+ *      The number of valid entries pointed to by the captured parameter.
+ *
+ * @param captured
+ *      A pointer to an array of @ref ch_capture_t structures that
+ *      contain the start and end offsets of entire pattern match and
+ *      each captured subexpression.
+ *
+ * @param ctx
+ *      The pointer supplied by the user to the @ref ch_scan() function.
+ *
+ * @return
+ *      The callback can return @ref CH_CALLBACK_TERMINATE to stop matching.
+ *      Otherwise, a return value of @ref CH_CALLBACK_CONTINUE will continue,
+ *      with the current pattern if configured to produce multiple matches per
+ *      pattern, while a return value of @ref CH_CALLBACK_SKIP_PATTERN will
+ *      cease matching this pattern but continue matching the next pattern.
+ */
+typedef ch_callback_t (*ch_match_event_handler)(unsigned int id,
+                                                unsigned long long from,
+                                                unsigned long long to,
+                                                unsigned int flags,
+                                                unsigned int size,
+                                                const ch_capture_t *captured,
+                                                void *ctx);
+
+/**
+ * Definition of the Chimera error event callback function type.
+ *
+ * A callback function matching the defined type may be provided by the
+ * application calling the @ref ch_scan function. This callback function
+ * will be invoked when an error event occurs during matching; this indicates
+ * that some matches for a given expression may not be reported.
+ *
+ * @param error_type
+ *      The type of error event that occurred. Currently these errors
+ *      correspond to resource limits on PCRE backtracking
+ *      @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT.
+ *
+ * @param id
+ *      The ID number of the expression for which the error event occurred.
+ *
+ * @param info
+ *      Event-specific data, for future use. Currently unused.
+ *
+ * @param ctx
+ *      The context pointer supplied by the user to the @ref ch_scan
+ *      function.
+ *
+ * @return
+ *      The callback can return @ref CH_CALLBACK_SKIP_PATTERN to cease matching this
+ *      pattern but continue matching the next pattern. Otherwise, we stop
+ *      matching for all patterns with @ref CH_CALLBACK_TERMINATE.
+ */
+ typedef ch_callback_t (*ch_error_event_handler)(ch_error_event_t error_type,
+                                                 unsigned int id, void *info,
+                                                 void *ctx);
+
+/**
+ * The block regular expression scanner.
+ *
+ * This is the function call in which the actual pattern matching takes place
+ * for block-mode pattern databases.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param data
+ *      Pointer to the data to be scanned.
 + * + * @param length + * The number of bytes to scan. + * + * @param flags + * Flags modifying the behaviour of this function. This parameter is + * provided for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref ch_alloc_scratch() for this + * database. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param onError + * Pointer to an error event callback function. If a NULL pointer is given, + * @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT errors will + * be ignored and matching will continue. + * + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref CH_SUCCESS on success; @ref CH_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +ch_error_t HS_CDECL ch_scan(const ch_database_t *db, const char *data, + unsigned int length, unsigned int flags, + ch_scratch_t *scratch, + ch_match_event_handler onEvent, + ch_error_event_handler onError, + void *context); + +/** + * Allocate a "scratch" space for use by Chimera. + * + * This is required for runtime use, and one scratch space per thread, or + * concurrent caller, is required. Any allocator callback set by @ref + * ch_set_scratch_allocator() or @ref ch_set_allocator() will be used by this + * function. + * + * @param db + * The database, as produced by @ref ch_compile(). + * + * @param scratch + * On first allocation, a pointer to NULL should be provided so a new + * scratch can be allocated. If a scratch block has been previously + * allocated, then a pointer to it should be passed back in to see if it + * is valid for this database block. If a new scratch block is required, + * the original will be freed and the new one returned, otherwise the + * previous scratch block will be returned.
On success, the scratch block + * will be suitable for use with the provided database in addition to any + * databases that original scratch space was suitable for. + * + * @return + * @ref CH_SUCCESS on successful allocation; @ref CH_NOMEM if the + * allocation fails. Other errors may be returned if invalid parameters + * are specified. + */ +ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *db, + ch_scratch_t **scratch); + +/** + * Allocate a scratch space that is a clone of an existing scratch space. + * + * This is useful when multiple concurrent threads will be using the same set + * of compiled databases, and another scratch space is required. Any allocator + * callback set by @ref ch_set_scratch_allocator() or @ref ch_set_allocator() + * will be used by this function. + * + * @param src + * The existing @ref ch_scratch_t to be cloned. + * + * @param dest + * A pointer to the new scratch space will be returned here. + * + * @return + * @ref CH_SUCCESS on success; @ref CH_NOMEM if the allocation fails. + * Other errors may be returned if invalid parameters are specified. + */ +ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src, + ch_scratch_t **dest); + +/** + * Provides the size of the given scratch space. + * + * @param scratch + * A per-thread scratch space allocated by @ref ch_alloc_scratch() or @ref + * ch_clone_scratch(). + * + * @param scratch_size + * On success, the size of the scratch space in bytes is placed in this + * parameter. + * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch, + size_t *scratch_size); + +/** + * Free a scratch block previously allocated by @ref ch_alloc_scratch() or @ref + * ch_clone_scratch(). + * + * The free callback set by @ref ch_set_scratch_allocator() or @ref + * ch_set_allocator() will be used by this function. + * + * @param scratch + * The scratch block to be freed. NULL may also be safely provided. 
+ * + * @return + * @ref CH_SUCCESS on success, other values on failure. + */ +ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_RUNTIME_H_ */ diff --git a/chimera/ch_scratch.c b/chimera/ch_scratch.c new file mode 100644 index 00000000..af49c34d --- /dev/null +++ b/chimera/ch_scratch.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Chimera: scratch space alloc. 
+ */ + +#include + +#include "allocator.h" +#include "ch.h" +#include "hs.h" +#include "hs_internal.h" +#include "ue2common.h" +#include "ch_alloc.h" +#include "ch_internal.h" +#include "ch_scratch.h" +#include "ch_database.h" + +static +size_t getPatternDataSize(const ch_scratch_t *s) { + size_t numCapturingStructs = + s->patternCount * (s->maxCaptureGroups + 1); + return (sizeof(struct ch_patterndata) * s->patternCount) + + alignof(struct ch_capture) + // padding + (sizeof(struct ch_capture) * numCapturingStructs); +} + +static +void initPatternData(const ch_scratch_t *s) { + // ch_capture array is aligned, directly after the patterndata array. + char *ptr = (char *)s->patternData + + (sizeof(struct ch_patterndata) * s->patternCount); + struct ch_capture *cap = (struct ch_capture *) + (ROUNDUP_PTR(ptr, alignof(struct ch_capture))); + + for (u32 i = 0; i < s->patternCount; i++) { + struct ch_patterndata *pd = &s->patternData[i]; + pd->match = cap; + DEBUG_PRINTF("pattern %u: pd=%p, match=%p\n", i, pd, pd->match); + cap += (s->maxCaptureGroups + 1); + } +} + +static +ch_error_t alloc_scratch(const ch_scratch_t *proto, ch_scratch_t **scratch) { + size_t ovectorSize = (proto->maxCaptureGroups + 1) * sizeof(int) * 3; + size_t capturedSize = + sizeof(struct ch_capture) * (proto->maxCaptureGroups + 1); + size_t patternDataSize = getPatternDataSize(proto); + size_t activeSize = proto->activeSize; + size_t queueSize = proto->patternCount * sizeof(struct queue_item); + + // max padding for alignment below. 
+ size_t padding = alignof(int) + alignof(struct ch_capture) + + alignof(struct ch_patterndata) + + alignof(struct queue_item); + + size_t allocSize = sizeof(ch_scratch_t) + ovectorSize + capturedSize + + patternDataSize + activeSize + queueSize + padding + + 256; /* padding for cacheline alignment */ + ch_scratch_t *s; + ch_scratch_t *s_tmp = ch_scratch_alloc(allocSize); + ch_error_t err = ch_check_alloc(s_tmp); + if (err != CH_SUCCESS) { + ch_scratch_free(s_tmp); + *scratch = NULL; + return err; + } + + memset(s_tmp, 0, allocSize); + s = ROUNDUP_PTR(s_tmp, 64); + // Set ordinary members. + *s = *proto; + + s->magic = CH_SCRATCH_MAGIC; + s->in_use = 0; + s->scratch_alloc = (char *)s_tmp; + + // Set pointers internal to allocation. + + char *ptr = (char *)s + sizeof(*s); + ptr = ROUNDUP_PTR(ptr, alignof(int)); + s->ovector = (int *)ptr; + ptr += ovectorSize; + + ptr = ROUNDUP_PTR(ptr, alignof(struct ch_capture)); + s->captured = (struct ch_capture *)ptr; + ptr += capturedSize; + + ptr = ROUNDUP_PTR(ptr, alignof(struct ch_patterndata)); + s->patternData = (struct ch_patterndata *)ptr; + ptr += patternDataSize; + + // Pre-fill pattern data, setting captureOffsets + initPatternData(s); + + ptr = ROUNDUP_PTR(ptr, alignof(struct queue_item)); + s->pq.item = (struct queue_item *)ptr; + ptr += queueSize; + + s->active = (u8 *)ptr; + + // Store size. + s->scratchSize = allocSize; + + // We should never overrun our allocation. 
+ assert((ptr + activeSize) - (char *)s <= (ptrdiff_t)allocSize); + + *scratch = s; + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *hydb, + ch_scratch_t **scratch) { + if (!hydb || !scratch) { + DEBUG_PRINTF("invalid args\n"); + return CH_INVALID; + } + + DEBUG_PRINTF("hydb=%p, &scratch=%p\n", hydb, scratch); + ch_error_t rv = hydbIsValid(hydb); + if (rv != CH_SUCCESS) { + DEBUG_PRINTF("invalid database\n"); + return rv; + } + + if (*scratch != NULL) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(*scratch)) { + return CH_INVALID; + } + if ((*scratch)->magic != CH_SCRATCH_MAGIC) { + return CH_INVALID; + } + if (markScratchInUse(*scratch)) { + return CH_SCRATCH_IN_USE; + } + } + + // We allocate a prototype of the scratch header to do our sizing with. + ch_scratch_t *proto; + ch_scratch_t *proto_tmp = ch_scratch_alloc(sizeof(ch_scratch_t) + 256); + ch_error_t proto_ret = ch_check_alloc(proto_tmp); + if (proto_ret != CH_SUCCESS) { + ch_scratch_free(proto_tmp); + ch_scratch_free(*scratch); + *scratch = NULL; + return proto_ret; + } + + proto = ROUNDUP_PTR(proto_tmp, 64); + + int resize = 0; + if (*scratch) { + *proto = **scratch; + } else { + memset(proto, 0, sizeof(*proto)); + resize = 1; + } + proto->scratch_alloc = (char *)proto_tmp; + + const struct ch_bytecode *db = ch_get_bytecode(hydb); + + if (db->maxCaptureGroups > proto->maxCaptureGroups) { + proto->maxCaptureGroups = db->maxCaptureGroups; + resize = 1; + } + + if (db->patternCount > proto->patternCount) { + proto->patternCount = db->patternCount; + proto->activeSize = db->activeSize; + resize = 1; + } + + if (resize) { + if (*scratch) { + ch_scratch_free((*scratch)->scratch_alloc); + } + + ch_error_t alloc_ret = alloc_scratch(proto, scratch); + ch_scratch_free(proto_tmp); + if (alloc_ret != CH_SUCCESS) { + *scratch = NULL; + return alloc_ret; + } + } else { + ch_scratch_free(proto_tmp); + unmarkScratchInUse(*scratch); + } 
+ + if (db->flags & CHIMERA_FLAG_NO_MULTIMATCH) { + (*scratch)->multi_scratch = NULL; + return CH_SUCCESS; + } + + // We may still have to realloc the underlying Hyperscan scratch. + rv = hs_alloc_scratch(getHyperscanDatabase(db), + &(*scratch)->multi_scratch); + if (rv != HS_SUCCESS) { + DEBUG_PRINTF("hs_alloc_scratch for multi_scratch failed\n"); + hs_free_scratch((*scratch)->multi_scratch); + ch_scratch_free((*scratch)->scratch_alloc); + *scratch = NULL; + return rv; + } + + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src, + ch_scratch_t **dest) { + if (!dest || !src || !ISALIGNED_CL(src) || + src->magic != CH_SCRATCH_MAGIC) { + DEBUG_PRINTF("scratch invalid\n"); + return CH_INVALID; + } + + ch_error_t ret = alloc_scratch(src, dest); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("alloc_scratch failed\n"); + *dest = NULL; + return ret; + } + + if (src->multi_scratch) { + (*dest)->multi_scratch = NULL; + ret = hs_clone_scratch(src->multi_scratch, &(*dest)->multi_scratch); + if (ret != HS_SUCCESS) { + DEBUG_PRINTF("hs_clone_scratch(multi_scratch,...) failed\n"); + ch_scratch_free(*dest); + return ret; + } + } + + return CH_SUCCESS; +} + +HS_PUBLIC_API +ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch) { + ch_error_t ret = CH_SUCCESS; + if (scratch) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(scratch)) { + return CH_INVALID; + } + if (scratch->magic != CH_SCRATCH_MAGIC) { + return CH_INVALID; + } + if (markScratchInUse(scratch)) { + return CH_SCRATCH_IN_USE; + } + + if (scratch->multi_scratch) { + ret = hs_free_scratch(scratch->multi_scratch); + } + + scratch->magic = 0; + assert(scratch->scratch_alloc); + DEBUG_PRINTF("scratch %p is really at %p : freeing\n", scratch, + scratch->scratch_alloc); + ch_scratch_free(scratch->scratch_alloc); + } + + return ret; +} + +/** Not public, but used for info from our internal tools. 
Note that in the + * hybrid matcher the scratch is definitely not a contiguous memory region. */ +HS_PUBLIC_API +ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch, size_t *size) { + ch_error_t ret = CH_SUCCESS; + if (!size || !scratch || !ISALIGNED_CL(scratch) || + scratch->magic != CH_SCRATCH_MAGIC) { + return CH_INVALID; + } else { + size_t multi_size = 0; + + if (scratch->multi_scratch) { + ret = hs_scratch_size(scratch->multi_scratch, &multi_size); + } + if (ret) { + multi_size = 0; + } + + *size = scratch->scratchSize + multi_size; + } + + return ret; +} diff --git a/chimera/ch_scratch.h b/chimera/ch_scratch.h new file mode 100644 index 00000000..47d9101e --- /dev/null +++ b/chimera/ch_scratch.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
 IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Scratch and associated data structures. + * + * This header gets pulled into many places (many deep, slow to compile + * places). Try to keep the included headers under control. + */ + +#ifndef CH_SCRATCH_H_ +#define CH_SCRATCH_H_ + +#include "ch_common.h" +#include "ch_runtime.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define CH_SCRATCH_MAGIC 0x554F4259 //!< Magic number stored in \ref ch_scratch + +struct queue_item { + int from; /**< \brief used to store the start location. */ + int to; /**< \brief used to store the current location. */ + u32 id; /**< pattern index. */ +}; + +struct match_pq { + struct queue_item *item; + u32 size; /**< current size of the priority queue */ +}; + +/** \brief Information about a pattern stored at runtime when a match is + * encountered. */ +struct ch_patterndata { + struct ch_capture *match; //!< buffered group info + u32 groupCount; //!< number of capturing groups + u32 scanStart; //!< start of match window (still to be single-scanned). +}; + +/** \brief Scratch space header for Chimera. */ +struct ch_scratch { + u32 magic; //!< must be \ref CH_SCRATCH_MAGIC + u8 in_use; /**< non-zero when being used by an API call. */ + struct hs_scratch *multi_scratch; //!< for Hyperscan scratch. + int *ovector; //!< maximally-sized ovector for PCRE usage. + struct ch_capture *captured; //!< max-sized capture group struct. + u8 *active; //!< active multibit.
 + struct ch_patterndata *patternData; //!< per-pattern match data, indexed by + //!< pattern ID. + struct match_pq pq; //!< priority queue to ensure matching ordering + u32 patternCount; //!< number of patterns, used to size active multibit + u32 activeSize; //!< size of active multibit + u32 maxCaptureGroups; //!< largest num of capturing groups required + u32 scratchSize; //!< size of allocation + int ret; //!< return value in Hyperscan callback + char *scratch_alloc; /**< base address of the underlying allocation; this (not the aligned header) is what gets freed */ +}; + +/** + * \brief Mark scratch as in use. + * + * Returns non-zero if it was already in use, zero otherwise. + */ +static really_inline +char markScratchInUse(struct ch_scratch *scratch) { + DEBUG_PRINTF("marking scratch as in use\n"); + assert(scratch && scratch->magic == CH_SCRATCH_MAGIC); + if (scratch->in_use) { + DEBUG_PRINTF("scratch already in use!\n"); + return 1; + } + scratch->in_use = 1; + return 0; +} + +/** + * \brief Mark scratch as no longer in use. + */ +static really_inline +void unmarkScratchInUse(struct ch_scratch *scratch) { + DEBUG_PRINTF("marking scratch as not in use\n"); + assert(scratch && scratch->magic == CH_SCRATCH_MAGIC); + assert(scratch->in_use == 1); + scratch->in_use = 0; +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CH_SCRATCH_H_ */ diff --git a/cmake/pcre.cmake b/cmake/pcre.cmake index 63527990..2b0d23c7 100644 --- a/cmake/pcre.cmake +++ b/cmake/pcre.cmake @@ -61,5 +61,3 @@ else () return () endif () endif (PCRE_BUILD_SOURCE) - -set (PCRE_CHECKED TRUE PARENT_SCOPE) diff --git a/src/util/multibit_build.cpp b/src/util/multibit_build.cpp index ad6a0d6a..fd7b4e80 100644 --- a/src/util/multibit_build.cpp +++ b/src/util/multibit_build.cpp @@ -46,7 +46,7 @@ using namespace std; namespace ue2 { -u32 mmbit_size(u32 total_bits) { +u32 HS_CDECL mmbit_size(u32 total_bits) { if (total_bits > MMB_MAX_BITS) { throw ResourceLimitError(); } diff --git a/src/util/multibit_build.h b/src/util/multibit_build.h index
ba5c8dfa..60c07995 100644 --- a/src/util/multibit_build.h +++ b/src/util/multibit_build.h @@ -33,6 +33,7 @@ #ifndef MULTIBIT_BUILD_H #define MULTIBIT_BUILD_H +#include "hs_common.h" #include "multibit_internal.h" #include "hash.h" @@ -62,8 +63,10 @@ namespace ue2 { * * This will throw a resource limit assertion if the requested mmbit is too * large. + * + * TODO:add temporary HS_CDECL for chimera on Windows, need improve this. */ -u32 mmbit_size(u32 total_bits); +u32 HS_CDECL mmbit_size(u32 total_bits); /** \brief Construct a sparse iterator over the values in \a bits for a * multibit of size \a total_bits. */ diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt index f0e76da1..c580a7b9 100644 --- a/tools/hsbench/CMakeLists.txt +++ b/tools/hsbench/CMakeLists.txt @@ -31,6 +31,8 @@ SET(hsbench_SOURCES common.h data_corpus.cpp data_corpus.h + engine.cpp + engine.h engine_hyperscan.cpp engine_hyperscan.h heapstats.cpp @@ -45,6 +47,23 @@ SET(hsbench_SOURCES timer.h ) +if (BUILD_CHIMERA) + add_definitions(-DHS_HYBRID) + SET(hsbench_SOURCES + ${hsbench_SOURCES} + engine_chimera.cpp + engine_chimera.h + engine_pcre.cpp + engine_pcre.h + ) +endif() + add_executable(hsbench ${hsbench_SOURCES}) -target_link_libraries(hsbench hs databaseutil expressionutil ${SQLITE3_LDFLAGS} - ${CMAKE_THREAD_LIBS_INIT}) +if (BUILD_CHIMERA) + include_directories(${PCRE_INCLUDE_DIRS}) + target_link_libraries(hsbench hs chimera ${PCRE_LDFLAGS} databaseutil + expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) +else() + target_link_libraries(hsbench hs databaseutil expressionutil + ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h index d7bce73a..820cad7c 100644 --- a/tools/hsbench/common.h +++ b/tools/hsbench/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * 
modification, are permitted provided that the following conditions are met: @@ -42,6 +42,12 @@ extern bool forceEditDistance; extern unsigned editDistance; extern bool printCompressSize; +/** Structure for the result of a single complete scan. */ +struct ResultEntry { + double seconds = 0; //!< Time taken for scan. + unsigned int matches = 0; //!< Count of matches found. +}; + struct SqlFailure { explicit SqlFailure(const std::string &s) : message(s) {} std::string message; diff --git a/tools/hsbench/engine.cpp b/tools/hsbench/engine.cpp new file mode 100644 index 00000000..f447a0bc --- /dev/null +++ b/tools/hsbench/engine.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "engine.h" + +EngineContext::~EngineContext() { } + +EngineStream::~EngineStream() { } + +Engine::~Engine() { } diff --git a/tools/hsbench/engine.h b/tools/hsbench/engine.h new file mode 100644 index 00000000..e41f9948 --- /dev/null +++ b/tools/hsbench/engine.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ENGINE_H +#define ENGINE_H + +#include "common.h" +#include "sqldb.h" + +#include +#include +#include + +#include + +// Engines have an engine context which is allocated on a per-thread basis. +class EngineContext : boost::noncopyable { +public: + virtual ~EngineContext(); +}; + +/** Streaming mode scans have persistent stream state associated with them. */ +class EngineStream : boost::noncopyable { +public: + virtual ~EngineStream(); + unsigned int sn; +}; + +// Benchmarking engine +class Engine : boost::noncopyable { +public: + virtual ~Engine(); + + // allocate an EngineContext + virtual std::unique_ptr makeContext() const = 0; + + // non-streaming scan + virtual void scan(const char *data, unsigned len, unsigned blockId, + ResultEntry &results, EngineContext &ectx) const = 0; + + // vectoring scan + virtual void scan_vectored(const char *const *data, + const unsigned int *len, unsigned int count, + unsigned int streamId, ResultEntry &result, + EngineContext &ectx) const = 0; + + // stream open + virtual std::unique_ptr streamOpen(EngineContext &ectx, + unsigned id) const = 0; + + // stream close + virtual void streamClose(std::unique_ptr stream, + ResultEntry &result) const = 0; + + // stream compress and expand + virtual void streamCompressExpand(EngineStream &stream, + std::vector &temp) const = 0; + + // streaming scan + virtual void streamScan(EngineStream &stream, const char *data, + unsigned int len, 
unsigned int id, + ResultEntry &result) const = 0; + + virtual void printStats() const = 0; + + virtual void sqlStats(SqlDB &db) const = 0; +}; + +#endif // ENGINE_H diff --git a/tools/hsbench/engine_chimera.cpp b/tools/hsbench/engine_chimera.cpp new file mode 100644 index 00000000..16374f36 --- /dev/null +++ b/tools/hsbench/engine_chimera.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "ExpressionParser.h" +#include "common.h" +#include "engine_chimera.h" +#include "expressions.h" +#include "heapstats.h" +#include "sqldb.h" +#include "timer.h" + +#include "chimera/ch_database.h" + +#include "util/make_unique.h" + +using namespace std; + +EngineCHContext::EngineCHContext(const ch_database_t *db) { + ch_alloc_scratch(db, &scratch); + assert(scratch); +} + +EngineCHContext::~EngineCHContext() { + ch_free_scratch(scratch); +} + +namespace /* anonymous */ { + +/** Scan context structure passed to the onMatch callback function. */ +struct ScanCHContext { + ScanCHContext(unsigned id_in, ResultEntry &result_in) + : id(id_in), result(result_in) {} + unsigned id; + ResultEntry &result; +}; + +} // namespace + +/** + * Callback function called for every match that Chimera produces, used when + * "echo matches" is off. + */ +static +int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, + unsigned int, const ch_capture_t *, void *ctx) { + ScanCHContext *sc = static_cast(ctx); + assert(sc); + sc->result.matches++; + + return 0; +} + +/** + * Callback function called for every match that Chimera produces when "echo + * matches" is enabled. 
+ */
+static
+int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to,
+                unsigned int, unsigned int, const ch_capture_t *, void *ctx) {
+    ScanCHContext *sc = static_cast<ScanCHContext *>(ctx);
+    assert(sc);
+    sc->result.matches++;
+
+    printf("Match @%u:%llu for %u\n", sc->id, to, id);
+
+    return 0;
+}
+
+/** Takes ownership of \a db_in; it is released in the destructor. */
+EngineChimera::EngineChimera(ch_database_t *db_in, CompileCHStats cs)
+    : db(db_in), compile_stats(move(cs)) {
+    assert(db);
+}
+
+EngineChimera::~EngineChimera() {
+    ch_free_database(db);
+}
+
+/** Builds a per-thread context holding Chimera scratch for this database. */
+unique_ptr<EngineContext> EngineChimera::makeContext() const {
+    return ue2::make_unique<EngineCHContext>(db);
+}
+
+/**
+ * Block-mode scan of \a data. Matches are counted into \a result (and echoed
+ * when echo_matches is set); any ch_scan failure aborts the process.
+ */
+void EngineChimera::scan(const char *data, unsigned int len, unsigned int id,
+                         ResultEntry &result, EngineContext &ectx) const {
+    assert(data);
+
+    auto &ctx = static_cast<EngineCHContext &>(ectx);
+    ScanCHContext sc(id, result);
+    auto callback = echo_matches ? onMatchEcho : onMatch;
+    ch_error_t rv = ch_scan(db, data, len, 0, ctx.scratch, callback, nullptr,
+                            &sc);
+
+    if (rv != CH_SUCCESS) {
+        printf("Fatal error: ch_scan returned error %d\n", rv);
+        abort();
+    }
+}
+
+// vectoring scan
+void EngineChimera::scan_vectored(UNUSED const char *const *data,
+                                  UNUSED const unsigned int *len,
+                                  UNUSED unsigned int count,
+                                  UNUSED unsigned int streamId,
+                                  UNUSED ResultEntry &result,
+                                  UNUSED EngineContext &ectx) const {
+    printf("Hybrid matcher can't support vectored mode.\n");
+    abort();
+}
+
+// The streaming entry points below are unsupported: each one reports the
+// problem and aborts.
+unique_ptr<EngineStream> EngineChimera::streamOpen(UNUSED EngineContext &ectx,
+                                                   UNUSED unsigned id) const {
+    printf("Hybrid matcher can't stream.\n");
+    abort();
+}
+
+void EngineChimera::streamClose(UNUSED unique_ptr<EngineStream> stream,
+                                UNUSED ResultEntry &result) const {
+    printf("Hybrid matcher can't stream.\n");
+    abort();
+}
+
+void EngineChimera::streamScan(UNUSED EngineStream &stream,
+                               UNUSED const char *data,
+                               UNUSED unsigned len, UNUSED unsigned id,
+                               UNUSED ResultEntry &result) const {
+    printf("Hybrid matcher can't stream.\n");
+    abort();
+}
+
+void EngineChimera::streamCompressExpand(UNUSED EngineStream &stream,
+                                         UNUSED vector<char> &temp) const {
+    printf("Hybrid matcher can't stream.\n");
+    abort();
+}
+
+void EngineChimera::printStats() const {
+    // Output summary information.
+    if (!compile_stats.sigs_name.empty()) {
+        printf("Signature set: %s\n", compile_stats.sigs_name.c_str());
+    }
+    printf("Signatures: %s\n", compile_stats.signatures.c_str());
+    printf("Chimera info: %s\n", compile_stats.db_info.c_str());
+    printf("Expression count: %'zu\n", compile_stats.expressionCount);
+    printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize);
+    printf("Database CRC: 0x%x\n", compile_stats.crc32);
+    printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize);
+    printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs);
+    printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize);
+}
+
+/** Records the compile statistics as one row of the Compile table. */
+void EngineChimera::sqlStats(SqlDB &sqldb) const {
+    ostringstream crc;
+    crc << "0x" << hex << compile_stats.crc32;
+
+    static const string Q =
+        "INSERT INTO Compile ("
+            "sigsName, signatures, dbInfo, exprCount, dbSize, crc,"
+            "scratchSize, compileSecs, peakMemory) "
+        "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)";
+
+    sqldb.insert_all(Q, compile_stats.sigs_name, compile_stats.signatures,
+                     compile_stats.db_info, compile_stats.expressionCount,
+                     compile_stats.compiledSize, crc.str(),
+                     compile_stats.scratchSize, compile_stats.compileSecs,
+                     compile_stats.peakMemorySize);
+}
+
+/**
+ * Compiles \a expressions into a Chimera database and wraps it in an engine.
+ *
+ * Returns nullptr (after printing a diagnostic where one is available) if an
+ * expression cannot be parsed, carries extended parameters, fails to compile,
+ * or if the database/scratch queries fail. The compiled database is released
+ * on every failure path; on success its ownership passes to the engine.
+ */
+unique_ptr<EngineChimera>
+buildEngineChimera(const ExpressionMap &expressions, const string &name,
+                   const string &sigs_name) {
+    if (expressions.empty()) {
+        assert(0);
+        return nullptr;
+    }
+
+    long double compileSecs = 0.0;
+    size_t compiledSize = 0;
+    size_t scratchSize = 0;
+    unsigned int peakMemorySize = 0;
+    string db_info;
+
+    ch_database_t *db = nullptr;
+    ch_error_t err;
+
+    const unsigned int count = expressions.size();
+
+    vector<string> exprs;
+    vector<unsigned int> flags, ids;
+
+    for (const auto &m : expressions) {
+        string expr;
+        unsigned int f = 0;
+        hs_expr_ext extparam; // only inspected to reject extended parameters
+        extparam.flags = 0;
+        if (!readExpression(m.second, expr, &f, &extparam)) {
+            printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(),
+                   m.first);
+            return nullptr;
+        }
+
+        if (extparam.flags) {
+            printf("Error parsing PCRE with extended flags: %s (id %u)\n",
+                   m.second.c_str(), m.first);
+            return nullptr;
+        }
+        exprs.push_back(expr);
+        ids.push_back(m.first);
+        flags.push_back(f);
+    }
+
+    // Our compiler takes an array of plain ol' C strings.
+    vector<const char *> patterns(count);
+    for (unsigned int i = 0; i < count; i++) {
+        patterns[i] = exprs[i].c_str();
+    }
+
+    Timer timer;
+    timer.start();
+
+    // Capture groups by default
+    unsigned int mode = CH_MODE_GROUPS;
+    ch_compile_error_t *compile_err = nullptr;
+    err = ch_compile_multi(patterns.data(), flags.data(), ids.data(),
+                           count, mode, nullptr, &db, &compile_err);
+
+    timer.complete();
+    compileSecs = timer.seconds();
+    peakMemorySize = getPeakHeap();
+
+    if (err == CH_COMPILER_ERROR) {
+        if (compile_err->expression >= 0) {
+            printf("Compile error for signature #%u: %s\n",
+                   compile_err->expression, compile_err->message);
+        } else {
+            printf("Compile error: %s\n", compile_err->message);
+        }
+        ch_free_compile_error(compile_err);
+        return nullptr;
+    }
+    if (err != CH_SUCCESS) {
+        // Any other failure leaves no usable database behind.
+        printf("Compile error: ch_compile_multi returned error %d\n", err);
+        return nullptr;
+    }
+
+    err = ch_database_size(db, &compiledSize);
+    if (err != CH_SUCCESS) {
+        ch_free_database(db);
+        return nullptr;
+    }
+    assert(compiledSize > 0);
+
+    char *info = nullptr;
+    err = ch_database_info(db, &info);
+    if (err != CH_SUCCESS) {
+        ch_free_database(db);
+        return nullptr;
+    }
+    db_info = string(info);
+    free(info);
+
+    // Allocate scratch temporarily to find its size: this is a good test
+    // anyway.
+    ch_scratch_t *scratch = nullptr;
+    err = ch_alloc_scratch(db, &scratch);
+    if (err != CH_SUCCESS) { // Chimera call: compare with the CH_* code
+        ch_free_database(db);
+        return nullptr;
+    }
+
+    err = ch_scratch_size(scratch, &scratchSize);
+    ch_free_scratch(scratch); // no longer needed whether or not the query worked
+    if (err != CH_SUCCESS) {
+        ch_free_database(db);
+        return nullptr;
+    }
+
+    // Collect summary information.
+    CompileCHStats cs;
+    cs.sigs_name = sigs_name;
+    if (!sigs_name.empty()) {
+        const auto pos = name.find_last_of('/');
+        cs.signatures = name.substr(pos + 1);
+    } else {
+        cs.signatures = name;
+    }
+    cs.db_info = db_info;
+    cs.expressionCount = expressions.size();
+    cs.compiledSize = compiledSize;
+    cs.scratchSize = scratchSize;
+    cs.compileSecs = compileSecs;
+    cs.peakMemorySize = peakMemorySize;
+
+    return ue2::make_unique<EngineChimera>(db, move(cs));
+}
diff --git a/tools/hsbench/engine_chimera.h b/tools/hsbench/engine_chimera.h
new file mode 100644
index 00000000..8e2cd0f6
--- /dev/null
+++ b/tools/hsbench/engine_chimera.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ENGINECHIMERA_H +#define ENGINECHIMERA_H + +#include "expressions.h" +#include "engine.h" + +#include "chimera/ch.h" + +#include +#include +#include + +/** Infomation about the database compile */ +struct CompileCHStats { + std::string sigs_name; + std::string signatures; + std::string db_info; + size_t expressionCount = 0; + size_t compiledSize = 0; + uint32_t crc32 = 0; + size_t scratchSize = 0; + long double compileSecs = 0; + unsigned int peakMemorySize = 0; +}; + +/** Engine context which is allocated on a per-thread basis. */ +class EngineCHContext : public EngineContext{ +public: + explicit EngineCHContext(const ch_database_t *db); + ~EngineCHContext(); + + ch_scratch_t *scratch = nullptr; +}; + +/** Chimera Engine for scanning data. 
*/ +class EngineChimera : public Engine { +public: + explicit EngineChimera(ch_database_t *db, CompileCHStats cs); + ~EngineChimera(); + + std::unique_ptr makeContext() const; + + void scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ectx) const; + + void scan_vectored(const char *const *data, const unsigned int *len, + unsigned int count, unsigned int streamId, + ResultEntry &result, EngineContext &ectx) const; + + std::unique_ptr streamOpen(EngineContext &ectx, + unsigned id) const; + + void streamClose(std::unique_ptr stream, + ResultEntry &result) const; + + void streamCompressExpand(EngineStream &stream, + std::vector &temp) const; + + void streamScan(EngineStream &stream, const char *data, unsigned int len, + unsigned int id, ResultEntry &result) const; + + void printStats() const; + + void sqlStats(SqlDB &db) const; + +private: + ch_database_t *db; + CompileCHStats compile_stats; +}; + +std::unique_ptr +buildEngineChimera(const ExpressionMap &expressions, const std::string &name, + const std::string &sigs_name); + +#endif // ENGINECHIMERA_H diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp index d98b3a40..685c1076 100644 --- a/tools/hsbench/engine_hyperscan.cpp +++ b/tools/hsbench/engine_hyperscan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -57,20 +57,22 @@ using namespace std; -EngineContext::EngineContext(const hs_database_t *db) { +EngineHSContext::EngineHSContext(const hs_database_t *db) { hs_alloc_scratch(db, &scratch); assert(scratch); } -EngineContext::~EngineContext() { +EngineHSContext::~EngineHSContext() { hs_free_scratch(scratch); } +EngineHSStream::~EngineHSStream() { } + namespace /* anonymous */ { /** Scan context structure passed to the 
onMatch callback function. */ -struct ScanContext { - ScanContext(unsigned id_in, ResultEntry &result_in, +struct ScanHSContext { + ScanHSContext(unsigned id_in, ResultEntry &result_in, const EngineStream *stream_in) : id(id_in), result(result_in), stream(stream_in) {} unsigned id; @@ -87,7 +89,7 @@ struct ScanContext { static int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, void *ctx) { - ScanContext *sc = static_cast(ctx); + ScanHSContext *sc = static_cast(ctx); assert(sc); sc->result.matches++; @@ -101,7 +103,7 @@ int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, static int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, unsigned int, void *ctx) { - ScanContext *sc = static_cast(ctx); + ScanHSContext *sc = static_cast(ctx); assert(sc); sc->result.matches++; @@ -114,7 +116,7 @@ int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, return 0; } -EngineHyperscan::EngineHyperscan(hs_database_t *db_in, CompileStats cs) +EngineHyperscan::EngineHyperscan(hs_database_t *db_in, CompileHSStats cs) : db(db_in), compile_stats(std::move(cs)) { assert(db); } @@ -124,14 +126,15 @@ EngineHyperscan::~EngineHyperscan() { } unique_ptr EngineHyperscan::makeContext() const { - return ue2::make_unique(db); + return ue2::make_unique(db); } void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, - ResultEntry &result, EngineContext &ctx) const { + ResultEntry &result, EngineContext &ectx) const { assert(data); - ScanContext sc(id, result, nullptr); + EngineHSContext &ctx = static_cast(ectx); + ScanHSContext sc(id, result, nullptr); auto callback = echo_matches ? 
onMatchEcho : onMatch; hs_error_t rv = hs_scan(db, data, len, 0, ctx.scratch, callback, &sc); @@ -144,11 +147,12 @@ void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, void EngineHyperscan::scan_vectored(const char *const *data, const unsigned int *len, unsigned int count, unsigned streamId, ResultEntry &result, - EngineContext &ctx) const { + EngineContext &ectx) const { assert(data); assert(len); - ScanContext sc(streamId, result, nullptr); + EngineHSContext &ctx = static_cast(ectx); + ScanHSContext sc(streamId, result, nullptr); auto callback = echo_matches ? onMatchEcho : onMatch; hs_error_t rv = hs_scan_vector(db, data, len, count, 0, ctx.scratch, callback, &sc); @@ -159,9 +163,10 @@ void EngineHyperscan::scan_vectored(const char *const *data, } } -unique_ptr EngineHyperscan::streamOpen(EngineContext &ctx, +unique_ptr EngineHyperscan::streamOpen(EngineContext &ectx, unsigned streamId) const { - auto stream = ue2::make_unique(); + EngineHSContext &ctx = static_cast(ectx); + auto stream = ue2::make_unique(); stream->ctx = &ctx; hs_open_stream(db, 0, &stream->id); @@ -170,17 +175,18 @@ unique_ptr EngineHyperscan::streamOpen(EngineContext &ctx, return nullptr; } stream->sn = streamId; - return stream; + return move(stream); } void EngineHyperscan::streamClose(unique_ptr stream, ResultEntry &result) const { assert(stream); - auto &s = static_cast(*stream); - EngineContext &ctx = *s.ctx; + auto &s = static_cast(*stream); + EngineContext &ectx = *s.ctx; + EngineHSContext &ctx = static_cast(ectx); - ScanContext sc(0, result, &s); + ScanHSContext sc(0, result, &s); auto callback = echo_matches ? 
onMatchEcho : onMatch; assert(s.id); @@ -193,10 +199,10 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data, ResultEntry &result) const { assert(data); - auto &s = static_cast(stream); - EngineContext &ctx = *s.ctx; + auto &s = static_cast(stream); + EngineHSContext &ctx = *s.ctx; - ScanContext sc(id, result, &s); + ScanHSContext sc(id, result, &s); auto callback = echo_matches ? onMatchEcho : onMatch; hs_error_t rv = hs_scan_stream(s.id, data, len, 0, ctx.scratch, callback, &sc); @@ -210,11 +216,12 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data, void EngineHyperscan::streamCompressExpand(EngineStream &stream, vector &temp) const { size_t used = 0; - hs_error_t err = hs_compress_stream(stream.id, temp.data(), temp.size(), + auto &s = static_cast(stream); + hs_error_t err = hs_compress_stream(s.id, temp.data(), temp.size(), &used); if (err == HS_INSUFFICIENT_SPACE) { temp.resize(used); - err = hs_compress_stream(stream.id, temp.data(), temp.size(), &used); + err = hs_compress_stream(s.id, temp.data(), temp.size(), &used); } if (err != HS_SUCCESS) { @@ -223,10 +230,10 @@ void EngineHyperscan::streamCompressExpand(EngineStream &stream, } if (printCompressSize) { - printf("stream %u: compressed to %zu\n", stream.sn, used); + printf("stream %u: compressed to %zu\n", s.sn, used); } - err = hs_reset_and_expand_stream(stream.id, temp.data(), temp.size(), + err = hs_reset_and_expand_stream(s.id, temp.data(), temp.size(), nullptr, nullptr, nullptr); if (err != HS_SUCCESS) { @@ -469,7 +476,7 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, hs_free_scratch(scratch); // Collect summary information. 
- CompileStats cs; + CompileHSStats cs; cs.sigs_name = sigs_name; if (!sigs_name.empty()) { const auto pos = name.find_last_of('/'); diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h index d27aab75..a8105d75 100644 --- a/tools/hsbench/engine_hyperscan.h +++ b/tools/hsbench/engine_hyperscan.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,22 +30,15 @@ #define ENGINEHYPERSCAN_H #include "expressions.h" -#include "common.h" -#include "sqldb.h" +#include "engine.h" #include "hs_runtime.h" #include #include #include -/** Structure for the result of a single complete scan. */ -struct ResultEntry { - double seconds = 0; //!< Time taken for scan. - unsigned int matches = 0; //!< Count of matches found. -}; - /** Infomation about the database compile */ -struct CompileStats { +struct CompileHSStats { std::string sigs_name; std::string signatures; std::string db_info; @@ -60,38 +53,38 @@ struct CompileStats { }; /** Engine context which is allocated on a per-thread basis. */ -class EngineContext { +class EngineHSContext : public EngineContext { public: - explicit EngineContext(const hs_database_t *db); - ~EngineContext(); + explicit EngineHSContext(const hs_database_t *db); + ~EngineHSContext(); hs_scratch_t *scratch = nullptr; }; /** Streaming mode scans have persistent stream state associated with them. */ -class EngineStream { +class EngineHSStream : public EngineStream { public: + ~EngineHSStream(); hs_stream_t *id; - unsigned int sn; - EngineContext *ctx; + EngineHSContext *ctx; }; /** Hyperscan Engine for scanning data. 
*/ -class EngineHyperscan { +class EngineHyperscan : public Engine { public: - explicit EngineHyperscan(hs_database_t *db, CompileStats cs); + explicit EngineHyperscan(hs_database_t *db, CompileHSStats cs); ~EngineHyperscan(); std::unique_ptr makeContext() const; void scan(const char *data, unsigned int len, unsigned int id, - ResultEntry &result, EngineContext &ctx) const; + ResultEntry &result, EngineContext &ectx) const; void scan_vectored(const char *const *data, const unsigned int *len, unsigned int count, unsigned int streamId, - ResultEntry &result, EngineContext &ctx) const; + ResultEntry &result, EngineContext &ectx) const; - std::unique_ptr streamOpen(EngineContext &ctx, + std::unique_ptr streamOpen(EngineContext &ectx, unsigned id) const; void streamClose(std::unique_ptr stream, @@ -109,7 +102,7 @@ public: private: hs_database_t *db; - CompileStats compile_stats; + CompileHSStats compile_stats; }; namespace ue2 { diff --git a/tools/hsbench/engine_pcre.cpp b/tools/hsbench/engine_pcre.cpp new file mode 100644 index 00000000..b24ba4b5 --- /dev/null +++ b/tools/hsbench/engine_pcre.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "common.h" +#include "engine_pcre.h" +#include "heapstats.h" +#include "huge.h" +#include "sqldb.h" +#include "timer.h" + +#include "util/make_unique.h" +#include "util/unicode_def.h" + +using namespace std; + +EnginePCREContext::EnginePCREContext(int capture_cnt) { + ovec = (int *)malloc((capture_cnt + 1)* sizeof(int) * 3); +} + +EnginePCREContext::~EnginePCREContext() { + free(ovec); +} + +namespace /* anonymous */ { + +/** Scan context structure passed to the onMatch callback function. */ +struct ScanPCREContext { + ScanPCREContext(unsigned id_in, ResultEntry &result_in) + : id(id_in), result(result_in) {} + unsigned id; + ResultEntry &result; +}; + +} // namespace + +/** + * Function called for every match that PCRE produces, used when + * "echo matches" is off. + */ +static +int onMatch(ScanPCREContext *sc) { + assert(sc); + sc->result.matches++; + + return 0; +} + +/** + * Function called for every match that PCRE produces when "echo + * matches" is enabled. 
+ */
+static
+int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to,
+                ScanPCREContext *sc) {
+    assert(sc);
+    sc->result.matches++;
+
+    printf("Match @%u:%llu for %u\n", sc->id, to, id);
+
+    return 0;
+}
+
+/** Takes ownership of the compiled patterns in \a dbs_in. */
+EnginePCRE::EnginePCRE(vector<unique_ptr<PcreDB>> dbs_in, CompilePCREStats cs,
+                       int capture_cnt_in)
+    : dbs(move(dbs_in)), compile_stats(move(cs)),
+      capture_cnt(capture_cnt_in) {}
+
+EnginePCRE::~EnginePCRE() {
+    for (auto &pcreDB : dbs) {
+        free(pcreDB->extra);
+        free(pcreDB->db);
+    }
+}
+
+/** Builds a per-thread context holding an ovector sized for capture_cnt. */
+unique_ptr<EngineContext> EnginePCRE::makeContext() const {
+    return ue2::make_unique<EnginePCREContext>(capture_cnt);
+}
+
+/**
+ * Runs every compiled pattern over \a data in turn, restarting pcre_exec after
+ * each match until the buffer is exhausted (or after the first match for
+ * highlander patterns). Matches are counted into \a result; a pcre_exec error
+ * other than "no match" aborts the process.
+ */
+void EnginePCRE::scan(const char *data, unsigned int len, unsigned int id,
+                      ResultEntry &result, EngineContext &ectx) const {
+    assert(data);
+
+    ScanPCREContext sc(id, result);
+    auto &ctx = static_cast<EnginePCREContext &>(ectx);
+    int *ovec = ctx.ovec;
+    int ovec_size = (capture_cnt + 1) * 3;
+    for (const auto &pcreDB : dbs) {
+        int startoffset = 0;
+        bool utf8 = pcreDB->utf8;
+        bool highlander = pcreDB->highlander;
+
+        int flags = 0;
+        int ret;
+        do {
+            ret = pcre_exec(pcreDB->db, pcreDB->extra, data, len,
+                            startoffset, flags, ovec, ovec_size);
+            if (ret <= PCRE_ERROR_NOMATCH) {
+                break;
+            }
+
+            int from = ovec[0];
+            int to = ovec[1];
+            assert(from <= to);
+
+            if (echo_matches) {
+                onMatchEcho(pcreDB->id, from, to, &sc);
+            } else {
+                onMatch(&sc);
+            }
+
+            // If we only wanted a single match, we're done.
+            if (highlander) {
+                break;
+            }
+
+            // Next scan starts at the first codepoint after the match. It's
+            // possible that we have a vacuous match, in which case we must step
+            // past it to ensure that we always progress.
+            if (from != to) {
+                startoffset = to;
+            } else if (utf8) {
+                startoffset = to + 1;
+                while (startoffset < (int)len &&
+                       ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
+                    ++startoffset;
+                }
+            } else {
+                startoffset = to + 1;
+            }
+        } while (startoffset <= (int)len);
+
+        if (ret < PCRE_ERROR_NOMATCH) {
+            printf("Fatal error: pcre returned error %d\n", ret);
+            abort();
+        }
+    }
+}
+
+// vectoring scan
+void EnginePCRE::scan_vectored(UNUSED const char *const *data,
+                               UNUSED const unsigned int *len,
+                               UNUSED unsigned int count,
+                               UNUSED unsigned int streamId,
+                               UNUSED ResultEntry &result,
+                               UNUSED EngineContext &ectx) const {
+    printf("PCRE matcher can't support vectored mode.\n");
+    abort();
+}
+
+// The streaming entry points below are unsupported: each one reports the
+// problem and aborts.
+unique_ptr<EngineStream> EnginePCRE::streamOpen(UNUSED EngineContext &ectx,
+                                                UNUSED unsigned id) const {
+    printf("PCRE matcher can't stream.\n");
+    abort();
+}
+
+void EnginePCRE::streamClose(UNUSED unique_ptr<EngineStream> stream,
+                             UNUSED ResultEntry &result) const {
+    printf("PCRE matcher can't stream.\n");
+    abort();
+}
+
+void EnginePCRE::streamScan(UNUSED EngineStream &stream,
+                            UNUSED const char *data,
+                            UNUSED unsigned len, UNUSED unsigned id,
+                            UNUSED ResultEntry &result) const {
+    printf("PCRE matcher can't stream.\n");
+    abort();
+}
+
+void EnginePCRE::streamCompressExpand(UNUSED EngineStream &stream,
+                                      UNUSED vector<char> &temp) const {
+    printf("PCRE matcher can't stream.\n");
+    abort();
+}
+
+void EnginePCRE::printStats() const {
+    // Output summary information.
+    if (!compile_stats.sigs_name.empty()) {
+        printf("Signature set: %s\n", compile_stats.sigs_name.c_str());
+    }
+    printf("Signatures: %s\n", compile_stats.signatures.c_str());
+    printf("PCRE info: %s\n", compile_stats.db_info.c_str());
+    printf("Expression count: %'zu\n", compile_stats.expressionCount);
+    printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize);
+    printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize);
+    printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs);
+    printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize);
+}
+
+/** Records the compile statistics as one row of the Compile table. */
+void EnginePCRE::sqlStats(SqlDB &sqldb) const {
+    // No CRC is computed for PCRE, so the crc column receives an empty string.
+    ostringstream crc;
+
+    static const string Q =
+        "INSERT INTO Compile ("
+            "sigsName, signatures, dbInfo, exprCount, dbSize, crc,"
+            "scratchSize, compileSecs, peakMemory) "
+        "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)";
+
+    sqldb.insert_all(Q, compile_stats.sigs_name, compile_stats.signatures,
+                     compile_stats.db_info, compile_stats.expressionCount,
+                     compile_stats.compiledSize, crc.str(),
+                     compile_stats.scratchSize, compile_stats.compileSecs,
+                     compile_stats.peakMemorySize);
+}
+
+/**
+ * Splits "/pattern/flags" in place: on success \a expr holds the bare pattern,
+ * \a flags the PCRE compile options, and \a db the utf8/highlander settings.
+ *
+ * Returns false if the expression is not delimited by two slashes or carries
+ * an unknown flag character; \a expr may already have been modified then.
+ */
+static
+bool decodeExprPCRE(string &expr, unsigned *flags, struct PcreDB &db) {
+    if (expr.empty() || expr[0] != '/') {
+        return false;
+    }
+
+    // The closing delimiter must be a second slash: when the last '/' is the
+    // opening one, the expression is unterminated.
+    size_t end = expr.find_last_of('/');
+    if (end == string::npos || end == 0) {
+        return false;
+    }
+    string strFlags = expr.substr(end + 1, expr.length() - end - 1);
+
+    // strip starting and trailing slashes and the flags
+    expr.erase(end, expr.length() - end);
+    expr.erase(0, 1);
+
+    // decode the flags
+    *flags = 0;
+    for (size_t i = 0; i != strFlags.length(); ++i) {
+        switch (strFlags[i]) {
+        case 's':
+            *flags |= PCRE_DOTALL;
+            break;
+        case 'm':
+            *flags |= PCRE_MULTILINE;
+            break;
+        case 'i':
+            *flags |= PCRE_CASELESS;
+            break;
+        case '8':
+            *flags |= PCRE_UTF8;
+            db.utf8 = true;
+            break;
+        case 'W':
+            *flags |= PCRE_UCP;
+            break;
+        case 'H':
+            db.highlander = true;
+            break;
+        default:
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Compiles and studies every expression with libpcre and wraps the results
+ * in an engine.
+ *
+ * Returns nullptr (after printing a diagnostic) if an expression cannot be
+ * decoded, compiled, studied or queried; everything compiled up to that point
+ * is released first.
+ */
+unique_ptr<EnginePCRE>
+buildEnginePcre(const ExpressionMap &expressions, const string &name,
+                const string &sigs_name) {
+    if (expressions.empty()) {
+        assert(0);
+        return nullptr;
+    }
+
+    long double compileSecs = 0.0;
+    size_t compiledSize = 0;
+    unsigned int peakMemorySize = 0;
+    string db_info("Version: ");
+    db_info += string(pcre_version());
+
+    vector<unique_ptr<PcreDB>> dbs;
+    int capture_cnt = 0;
+
+    // Failure exit: releases the pattern being built (either pointer may be
+    // null) and every pattern already stored in dbs, then yields nullptr.
+    auto fail = [&dbs](pcre *db, pcre_extra *extra) -> unique_ptr<EnginePCRE> {
+        free(extra);
+        free(db);
+        for (auto &done : dbs) {
+            free(done->extra);
+            free(done->db);
+        }
+        return nullptr;
+    };
+
+    Timer timer;
+    timer.start();
+
+    for (const auto &m : expressions) {
+        string expr(m.second);
+        unsigned int flags = 0;
+        auto pcreDB = ue2::make_unique<PcreDB>();
+        if (!decodeExprPCRE(expr, &flags, *pcreDB)) {
+            printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(),
+                   m.first);
+            return fail(nullptr, nullptr);
+        }
+
+        const char *errp = nullptr;
+        int erro;
+        pcre *db = pcre_compile(expr.c_str(), flags, &errp, &erro, NULL);
+
+        if (!db) {
+            printf("Compile error %s\n", errp);
+            return fail(nullptr, nullptr);
+        }
+
+        pcre_extra *extra = pcre_study(db, PCRE_STUDY_JIT_COMPILE, &errp);
+        if (errp) {
+            printf("PCRE could not be studied: %s\n", errp);
+            return fail(db, extra);
+        }
+        if (!extra) {
+            // Nothing useful came out of the study. Supply a zeroed block so
+            // that pcre_fullinfo/pcre_exec never read indeterminate flags.
+            extra = (pcre_extra *)calloc(1, sizeof(pcre_extra));
+            if (!extra) {
+                printf("PCRE could not allocate extra data\n");
+                return fail(db, nullptr);
+            }
+        }
+        int cap = 0; // PCRE_INFO_CAPTURECOUNT demands an int
+        if (pcre_fullinfo(db, extra, PCRE_INFO_CAPTURECOUNT, &cap)) {
+            printf("PCRE fullinfo error\n");
+            return fail(db, extra);
+        }
+        assert(cap >= 0);
+        capture_cnt = max(capture_cnt, cap);
+
+        size_t db_size = 0;
+        if (pcre_fullinfo(db, extra, PCRE_INFO_SIZE, &db_size)) {
+            printf("PCRE fullinfo error\n");
+            return fail(db, extra);
+        }
+
+        size_t study_size = 0;
+        if (pcre_fullinfo(db, extra, PCRE_INFO_STUDYSIZE,
+                          &study_size)) {
+            printf("PCRE fullinfo error\n");
+            return fail(db, extra);
+        }
+        compiledSize += db_size + study_size;
+
+        pcreDB->id = m.first;
+        pcreDB->db = db;
+
+        // OR the limits in: plain assignment would clear the flags set by
+        // pcre_study and with them the study/JIT data requested above.
+        extra->flags |=
+            PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
+        extra->match_limit = 10000000;
+        extra->match_limit_recursion = 1500;
+
+        pcreDB->extra = extra;
+        dbs.push_back(move(pcreDB));
+    }
+
+    timer.complete();
+    compileSecs = timer.seconds();
+    peakMemorySize = getPeakHeap();
+
+    // Collect summary information.
+    CompilePCREStats cs;
+    cs.sigs_name = sigs_name;
+    if (!sigs_name.empty()) {
+        const auto pos = name.find_last_of('/');
+        cs.signatures = name.substr(pos + 1);
+    } else {
+        cs.signatures = name;
+    }
+    cs.db_info = db_info;
+    cs.expressionCount = expressions.size();
+    cs.compiledSize = compiledSize;
+    cs.scratchSize = (capture_cnt + 1) * sizeof(int) * 3;
+    cs.compileSecs = compileSecs;
+    cs.peakMemorySize = peakMemorySize;
+
+    return ue2::make_unique<EnginePCRE>(move(dbs), move(cs), capture_cnt);
+}
diff --git a/tools/hsbench/engine_pcre.h b/tools/hsbench/engine_pcre.h
new file mode 100644
index 00000000..2e7dad9c
--- /dev/null
+++ b/tools/hsbench/engine_pcre.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ENGINEPCRE_H +#define ENGINEPCRE_H + +#include "expressions.h" +#include "engine.h" + +#include + +#include +#include +#include + +/** Infomation about the database compile */ +struct CompilePCREStats { + std::string sigs_name; + std::string signatures; + std::string db_info; + size_t expressionCount = 0; + size_t compiledSize = 0; + size_t scratchSize = 0; + long double compileSecs = 0; + unsigned int peakMemorySize = 0; +}; + +/** Engine context which is allocated on a per-thread basis. */ +class EnginePCREContext : public EngineContext{ +public: + explicit EnginePCREContext(int capture_cnt); + ~EnginePCREContext(); + + int *ovec = nullptr; +}; + +struct PcreDB { + bool highlander = false; + bool utf8 = false; + u32 id; + pcre *db = nullptr; + pcre_extra *extra = nullptr; +}; + +/** PCRE Engine for scanning data. 
*/ +class EnginePCRE : public Engine { +public: + explicit EnginePCRE(std::vector> dbs_in, + CompilePCREStats cs, int capture_cnt_in); + ~EnginePCRE(); + + std::unique_ptr makeContext() const; + + void scan(const char *data, unsigned int len, unsigned int id, + ResultEntry &result, EngineContext &ectx) const; + + void scan_vectored(const char *const *data, const unsigned int *len, + unsigned int count, unsigned int streamId, + ResultEntry &result, EngineContext &ectx) const; + + std::unique_ptr streamOpen(EngineContext &ectx, + unsigned id) const; + + void streamClose(std::unique_ptr stream, + ResultEntry &result) const; + + void streamCompressExpand(EngineStream &stream, + std::vector &temp) const; + + void streamScan(EngineStream &stream, const char *data, unsigned int len, + unsigned int id, ResultEntry &result) const; + + void printStats() const; + + void sqlStats(SqlDB &db) const; + +private: + std::vector> dbs; + + CompilePCREStats compile_stats; + + int capture_cnt; +}; + +std::unique_ptr +buildEnginePcre(const ExpressionMap &expressions, const std::string &name, + const std::string &sigs_name); + +#endif // ENGINEPCRE_H diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index ae46de77..e99b052e 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +31,10 @@ #include "common.h" #include "data_corpus.h" #include "engine_hyperscan.h" +#if defined(HS_HYBRID) +#include "engine_chimera.h" +#include "engine_pcre.h" +#endif #include "expressions.h" #include "sqldb.h" #include "thread_barrier.h" @@ -87,6 +91,8 @@ namespace /* anonymous */ { bool display_per_scan = false; ScanMode scan_mode = ScanMode::STREAMING; +bool useHybrid = false; +bool usePcre = false; unsigned repeats = 
20; string exprPath(""); string corpusFile(""); @@ -102,7 +108,7 @@ typedef void (*thread_func_t)(void *context); class ThreadContext : boost::noncopyable { public: - ThreadContext(unsigned num_in, const EngineHyperscan &db_in, + ThreadContext(unsigned num_in, const Engine &db_in, thread_barrier &tb_in, thread_func_t function_in, vector corpus_data_in) : num(num_in), results(repeats), engine(db_in), @@ -155,7 +161,7 @@ public: unsigned num; Timer timer; vector results; - const EngineHyperscan &engine; + const Engine &engine; unique_ptr enginectx; vector corpus_data; @@ -181,6 +187,10 @@ void usage(const char *error) { " (default: streaming).\n"); printf(" -V Benchmark in vectored mode" " (default: streaming).\n"); +#if defined(HS_HYBRID) + printf(" -H Benchmark using Chimera (if supported).\n"); + printf(" -P Benchmark using PCRE (if supported).\n"); +#endif #ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP printf(" -T CPU,CPU,... Benchmark with threads on these CPUs.\n"); #endif @@ -214,7 +224,7 @@ struct BenchmarkSigs { static void processArgs(int argc, char *argv[], vector &sigSets, UNUSED unique_ptr &grey) { - const char options[] = "-b:c:Cd:e:E:G:hi:n:No:p:sS:Vw:z:" + const char options[] = "-b:c:Cd:e:E:G:hHi:n:No:p:PsS:Vw:z:" #ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP "T:" // add the thread flag #endif @@ -287,6 +297,14 @@ void processArgs(int argc, char *argv[], vector &sigSets, usage(nullptr); exit(0); break; + case 'H': +#if defined(HS_HYBRID) + useHybrid = true; +#else + usage("Hybrid matcher not enabled in this build"); + exit(1); +#endif + break; case 'n': if (!fromString(optarg, repeats) || repeats == 0) { usage("Couldn't parse argument to -n flag, should be" @@ -294,6 +312,14 @@ void processArgs(int argc, char *argv[], vector &sigSets, exit(1); } break; + case 'P': +#if defined(HS_HYBRID) + usePcre = true; +#else + usage("PCRE matcher not enabled in this build"); + exit(1); +#endif + break; case 's': in_sigfile = 2; break; @@ -399,6 +425,24 @@ void processArgs(int 
argc, char *argv[], vector &sigSets, exit(1); } + // Constraints on Chimera and PCRE engines + if (useHybrid || usePcre) { + if (useHybrid && usePcre) { + usage("Can't run both Chimera and PCRE."); + exit(1); + } + if (scan_mode != ScanMode::BLOCK) { + usage("Must specify block mode in Chimera or PCRE with " + "the -N option."); + exit(1); + } + + if (forceEditDistance || loadDatabases || saveDatabases) { + usage("No extended options are supported in Chimera or PCRE."); + exit(1); + } + } + // Read in any -s signature sets. for (const auto &file : sigFiles) { SignatureSet sigs; @@ -503,7 +547,7 @@ static void benchStreamingInternal(ThreadContext *ctx, vector &streams, bool do_compress) { assert(ctx); - const EngineHyperscan &e = ctx->engine; + const Engine &e = ctx->engine; const vector &blocks = ctx->corpus_data; vector compress_buf(do_compress ? 1000 : 0); @@ -812,7 +856,7 @@ void sqlResults(const vector> &threads, * the same copy of the data. */ static -unique_ptr makeThreadContext(const EngineHyperscan &db, +unique_ptr makeThreadContext(const Engine &db, const vector &blocks, unsigned id, thread_barrier &sync_barrier) { @@ -839,7 +883,7 @@ unique_ptr makeThreadContext(const EngineHyperscan &db, /** Run the given benchmark. 
*/ static -void runBenchmark(const EngineHyperscan &db, +void runBenchmark(const Engine &db, const vector &corpus_blocks) { size_t numThreads; bool useAffinity = false; @@ -936,8 +980,18 @@ int main(int argc, char *argv[]) { continue; } - auto engine = buildEngineHyperscan(exprMap, scan_mode, s.name, - sigName, *grey); + unique_ptr engine; + if (useHybrid) { +#if defined(HS_HYBRID) + engine = buildEngineChimera(exprMap, s.name, sigName); + } else if (usePcre) { + engine = buildEnginePcre(exprMap, s.name, sigName); +#endif + } else { + engine = buildEngineHyperscan(exprMap, scan_mode, s.name, + sigName, *grey); + } + if (!engine) { printf("Error: expressions failed to compile.\n"); exit(1); diff --git a/tools/hscheck/CMakeLists.txt b/tools/hscheck/CMakeLists.txt index 065d4c04..781d6b39 100644 --- a/tools/hscheck/CMakeLists.txt +++ b/tools/hscheck/CMakeLists.txt @@ -5,6 +5,14 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") SET(hscheck_SOURCES main.cpp ) -add_executable(hscheck ${hscheck_SOURCES}) -target_link_libraries(hscheck hs expressionutil pthread) + +if (BUILD_CHIMERA) + include_directories(${PCRE_INCLUDE_DIRS}) + add_definitions(-DHS_HYBRID) + add_executable(hscheck ${hscheck_SOURCES}) + target_link_libraries(hscheck hs chimera ${PCRE_LDFLAGS} expressionutil pthread) +else() + add_executable(hscheck ${hscheck_SOURCES}) + target_link_libraries(hscheck hs expressionutil pthread) +endif() diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp index 73687e2a..27416028 100644 --- a/tools/hscheck/main.cpp +++ b/tools/hscheck/main.cpp @@ -59,6 +59,11 @@ #include "hs_internal.h" #include "ue2common.h" +#ifdef HS_HYBRID +#include +#include "chimera/ch.h" +#endif + #include #include #include @@ -77,6 +82,7 @@ namespace /* anonymous */ { // are we in streaming mode? 
(default: yes) bool g_streaming = true; bool g_vectored = false; +bool g_hybrid = false; string g_exprPath(""); string g_signatureFile(""); bool g_allSignatures = false; @@ -282,34 +288,57 @@ void checkExpression(UNUSED void *threadarg) { // Try and compile a database. const char *regexp = regex.c_str(); - const hs_expr_ext *extp = &ext; hs_error_t err; - hs_compile_error_t *compile_err; - hs_database_t *db = nullptr; + + if (g_hybrid) { +#ifdef HS_HYBRID + ch_compile_error_t *ch_compile_err; + ch_database_t *hybrid_db = nullptr; + err = ch_compile_multi(®exp, &flags, nullptr, 1, CH_MODE_GROUPS, + nullptr, &hybrid_db, &ch_compile_err); + if (err == HS_SUCCESS) { + assert(hybrid_db); + recordSuccess(g_exprMap, it->first); + ch_free_database(hybrid_db); + } else { + assert(!hybrid_db); + assert(ch_compile_err); + recordFailure(g_exprMap, it->first, ch_compile_err->message); + ch_free_compile_error(ch_compile_err); + } +#else + cerr << "Hybrid mode not available in this build." << endl; + exit(1); +#endif // HS_HYBRID + } else { + const hs_expr_ext *extp = &ext; + hs_compile_error_t *compile_err; + hs_database_t *db = nullptr; #if !defined(RELEASE_BUILD) - // This variant is available in non-release builds and allows us to - // modify greybox settings. - err = hs_compile_multi_int(®exp, &flags, nullptr, &extp, 1, mode, - nullptr, &db, &compile_err, *g_grey); + // This variant is available in non-release builds and allows us to + // modify greybox settings. 
+ err = hs_compile_multi_int(®exp, &flags, nullptr, &extp, 1, mode, + nullptr, &db, &compile_err, *g_grey); #else - err = hs_compile_ext_multi(®exp, &flags, nullptr, &extp, 1, mode, - nullptr, &db, &compile_err); + err = hs_compile_ext_multi(®exp, &flags, nullptr, &extp, 1, mode, + nullptr, &db, &compile_err); #endif - if (err == HS_SUCCESS) { - assert(db); - recordSuccess(g_exprMap, it->first); - hs_free_database(db); - if (check_logical) { - cacheSubExpr(it->first, regex, flags, ext); + if (err == HS_SUCCESS) { + assert(db); + recordSuccess(g_exprMap, it->first); + hs_free_database(db); + if (check_logical) { + cacheSubExpr(it->first, regex, flags, ext); + } + } else { + assert(!db); + assert(compile_err); + recordFailure(g_exprMap, it->first, compile_err->message); + hs_free_compile_error(compile_err); } - } else { - assert(!db); - assert(compile_err); - recordFailure(g_exprMap, it->first, compile_err->message); - hs_free_compile_error(compile_err); } } } @@ -429,6 +458,9 @@ void usage() { #endif << " -V Operate in vectored mode." << endl << " -N Operate in block mode (default: streaming)." << endl +#ifdef HS_HYBRID + << " -H Operate in hybrid mode." << endl +#endif << " -L Pass HS_FLAG_SOM_LEFTMOST for all expressions (default: off)." << endl << " -8 Force UTF8 mode on all patterns." << endl << " -T NUM Run with NUM threads." 
<< endl @@ -440,7 +472,7 @@ void usage() { static void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { - const char options[] = "e:E:s:z:hLNV8G:T:BC"; + const char options[] = "e:E:s:z:hHLNV8G:T:BC"; bool signatureSet = false; for (;;) { @@ -492,6 +524,9 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr &grey) { g_streaming = false; g_vectored = true; break; + case 'H': + g_hybrid = true; + break; case 'T': num_of_threads = atoi(optarg); break; diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt index f05b444f..7b2891fc 100644 --- a/tools/hscollider/CMakeLists.txt +++ b/tools/hscollider/CMakeLists.txt @@ -1,9 +1,3 @@ -# we have a fixed requirement for PCRE -set(PCRE_REQUIRED_MAJOR_VERSION 8) -set(PCRE_REQUIRED_MINOR_VERSION 41) -set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) - -include (${CMAKE_MODULE_PATH}/pcre.cmake) if (NOT CORRECT_PCRE_VERSION) message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found, not building hscollider") return() @@ -29,6 +23,8 @@ set_source_files_properties( ragelmaker(ColliderCorporaParser.rl) +add_definitions(-DHS_HYBRID) + # only set these after all tests are done set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") @@ -69,7 +65,7 @@ add_dependencies(hscollider ragel_ColliderCorporaParser) add_dependencies(hscollider pcre) if(NOT WIN32) - target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil + target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil expressionutil corpusomatic crosscompileutil pthread "${BACKTRACE_LDFLAGS}") @@ -78,7 +74,7 @@ if(HAVE_BACKTRACE) "${BACKTRACE_CFLAGS}") endif() else() # WIN32 - target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil + target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil expressionutil corpusomatic crosscompileutil) endif() diff --git a/tools/hscollider/DatabaseProxy.h 
b/tools/hscollider/DatabaseProxy.h index 13b6f680..831ab148 100644 --- a/tools/hscollider/DatabaseProxy.h +++ b/tools/hscollider/DatabaseProxy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -54,10 +54,10 @@ public: explicit DatabaseProxy(const std::set &expr_ids) : ids(expr_ids) {} - explicit DatabaseProxy(std::shared_ptr built_db) + explicit DatabaseProxy(std::shared_ptr built_db) : db(built_db) {} - std::shared_ptr get(const UltimateTruth &ultimate) { + std::shared_ptr get(const UltimateTruth &ultimate) { std::lock_guard lock(mutex); if (failed) { // We have previously failed to compile this database. @@ -80,7 +80,7 @@ public: private: std::mutex mutex; - std::shared_ptr db; + std::shared_ptr db; std::set ids; bool failed = false; // Database failed compilation. }; diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index abd54778..fcb47685 100644 --- a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -187,6 +187,14 @@ string pcreErrStr(int err) { } } +/* that is, a mode provided by native hyperscan */ +static +bool isStandardMode(unsigned int mode) { + return mode == MODE_BLOCK + || mode == MODE_STREAMING + || mode == MODE_VECTORED; +} + GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr, unsigned long int limit, unsigned long int limit_recursion) @@ -194,8 +202,10 @@ GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr, matchLimitRecursion(limit_recursion) {} void GroundTruth::global_prep() { - // We're using pcre callouts - pcre_callout = &pcreCallOut; + if (isStandardMode(colliderMode)) { + // We're using pcre callouts + pcre_callout = &pcreCallOut; + } } static @@ -262,11 +272,17 @@ GroundTruth::compile(unsigned id, bool no_callouts) { throw 
PcreCompileFailure("Unsupported extended flags."); } + // Hybrid mode implies SOM. + if (colliderMode == MODE_HYBRID) { + assert(!use_NFA); + som = true; + } + // SOM flags might be set globally. som |= !!somFlags; // For traditional Hyperscan, add global callout to pattern. - if (!combination && !no_callouts) { + if (!combination && !no_callouts && isStandardMode(colliderMode)) { addCallout(re); } @@ -403,6 +419,79 @@ int scanBasic(const CompiledPcre &compiled, const string &buffer, return ret; } +static +bool isUtf8(const CompiledPcre &compiled) { + unsigned long int options = 0; + pcre_fullinfo(compiled.bytecode, NULL, PCRE_INFO_OPTIONS, &options); + return options & PCRE_UTF8; +} + +static +CaptureVec makeCaptureVec(const vector &ovector, int ret) { + assert(ret > 0); + + CaptureVec cap; + + if (no_groups) { + return cap; // No group info requested. + } + + cap.reserve(ret * 2); + for (int i = 0; i < ret * 2; i += 2) { + int from = ovector[i], to = ovector[i + 1]; + cap.push_back(make_pair(from, to)); + } + return cap; +} + +static +int scanHybrid(const CompiledPcre &compiled, const string &buffer, + const pcre_extra &extra, vector &ovector, + ResultSet &rs, ostream &out) { + int len = (int)buffer.length(); + int startoffset = 0; + bool utf8 = isUtf8(compiled); + + int flags = 0; + int ret; + do { + ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), len, + startoffset, flags, &ovector[0], ovector.size()); + + if (ret <= PCRE_ERROR_NOMATCH) { + return ret; + } + + int from = ovector.at(0); + int to = ovector.at(1); + rs.addMatch(from, to, makeCaptureVec(ovector, ret)); + + if (echo_matches) { + out << "PCRE Match @ (" << from << "," << to << ")" << endl; + } + + // If we only wanted a single match, we're done. + if (compiled.highlander) break; + + // Next scan starts at the first codepoint after the match. It's + // possible that we have a vacuous match, in which case we must step + // past it to ensure that we always progress. 
+ if (from != to) { + startoffset = to; + } else if (utf8) { + startoffset = to + 1; + while (startoffset < len + && ((buffer[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) { + ++startoffset; + } + } else { + startoffset = to + 1; + } + } while (startoffset <= len); + + return ret; +} + static int scanOffset(const CompiledPcre &compiled, const string &buffer, const pcre_extra &extra, vector &ovector, @@ -532,15 +621,24 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, pcre_extra extra; extra.flags = 0; - // Switch on callouts. - extra.flags |= PCRE_EXTRA_CALLOUT_DATA; - extra.callout_data = &ctx; + // If running in traditional HyperScan mode, switch on callouts. + bool usingCallouts = isStandardMode(colliderMode); + if (usingCallouts) { + // Switch on callouts. + extra.flags |= PCRE_EXTRA_CALLOUT_DATA; + extra.callout_data = &ctx; + } // Set the match_limit (in order to bound execution time on very complex // patterns) extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION); - extra.match_limit = matchLimit; - extra.match_limit_recursion = matchLimitRecursion; + if (colliderMode == MODE_HYBRID) { + extra.match_limit = 10000000; + extra.match_limit_recursion = 1500; + } else { + extra.match_limit = matchLimit; + extra.match_limit_recursion = matchLimitRecursion; + } #ifdef PCRE_NO_START_OPTIMIZE // Switch off optimizations that may result in callouts not occurring. 
@@ -553,6 +651,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, ovector.resize(ovecsize); int ret; + bool hybrid = false; switch (colliderMode) { case MODE_BLOCK: case MODE_STREAMING: @@ -563,6 +662,10 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, ret = scanBasic(compiled, buffer, extra, ovector, ctx); } break; + case MODE_HYBRID: + ret = scanHybrid(compiled, buffer, extra, ovector, rs, out); + hybrid = true; + break; default: assert(0); ret = PCRE_ERROR_NULL; @@ -595,7 +698,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled, return true; } - if (compiled.som) { + if (compiled.som && !hybrid) { filterLeftmostSom(rs); } diff --git a/tools/hscollider/ResultSet.h b/tools/hscollider/ResultSet.h index b7736d52..067055ca 100644 --- a/tools/hscollider/ResultSet.h +++ b/tools/hscollider/ResultSet.h @@ -35,25 +35,36 @@ #include #include +// Type for capturing groups: a vector of (from, to) offsets, with both set to +// -1 for inactive groups (like pcre's ovector). Used by hybrid modes. +typedef std::vector > CaptureVec; + // Class representing a single match, encapsulating to/from offsets. 
class MatchResult { public: MatchResult(unsigned long long start, unsigned long long end) : from(start), to(end) {} + MatchResult(unsigned long long start, unsigned long long end, + const CaptureVec &cap) + : from(start), to(end), captured(cap) {} bool operator<(const MatchResult &a) const { if (from != a.from) { return from < a.from; } - return to < a.to; + if (to != a.to) { + return to < a.to; + } + return captured < a.captured; } bool operator==(const MatchResult &a) const { - return from == a.from && to == a.to; + return from == a.from && to == a.to && captured == a.captured; } unsigned long long from; unsigned long long to; + CaptureVec captured; }; enum ResultSource { @@ -114,6 +125,19 @@ public: } } + // Add a match (with capturing vector) + void addMatch(unsigned long long from, unsigned long long to, + const CaptureVec &cap, int block = 0) { + MatchResult m(from, to, cap); + matches.insert(m); + + if (matches_by_block[block].find(m) != matches_by_block[block].end()) { + dupe_matches.insert(m); + } else { + matches_by_block[block].insert(m); + } + } + // Clear all matches. void clear() { matches.clear(); diff --git a/tools/hscollider/UltimateTruth.cpp b/tools/hscollider/UltimateTruth.cpp index 19c597be..4b8724e8 100644 --- a/tools/hscollider/UltimateTruth.cpp +++ b/tools/hscollider/UltimateTruth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -90,19 +90,14 @@ hs_error_t open_magic_stream(const hs_database_t *db, unsigned flags, #endif // RELEASE_BUILD -class HyperscanDB : boost::noncopyable { +class BaseDB : boost::noncopyable { public: // Constructor takes iterators over a container of pattern IDs. 
template - HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end) - : db(db_in), ids(ids_begin, ids_end) {} + BaseDB(Iter ids_begin, Iter ids_end) + : ids(ids_begin, ids_end) {} - ~HyperscanDB() { - hs_free_database(db); - } - - // Underlying Hyperscan database pointer. - hs_database_t *db; + virtual ~BaseDB(); // The set of expression IDs that must return their matches in order. unordered_set ordered; @@ -111,15 +106,55 @@ public: unordered_set ids; }; +BaseDB::~BaseDB() { } + +class HyperscanDB : public BaseDB { +public: + // Constructor takes iterators over a container of pattern IDs. + template + HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end) + : BaseDB(ids_begin, ids_end), db(db_in) {} + + ~HyperscanDB(); + + // Underlying Hyperscan database pointer. + hs_database_t *db; +}; + +HyperscanDB::~HyperscanDB() { + hs_free_database(db); +} + +#ifdef HS_HYBRID + +class HybridDB : public BaseDB { +public: + // Constructor takes iterators over a container of pattern IDs. + template + HybridDB(ch_database_t *db_in, Iter ids_begin, Iter ids_end) + : BaseDB(ids_begin, ids_end), db(db_in) {} + + ~HybridDB(); + + // Underlying Hyperscan database pointer. + ch_database_t *db; +}; + +HybridDB::~HybridDB() { + ch_free_database(db); +} + +#endif // HS_HYBRID + // Used to track the ID and result set. namespace { struct MultiContext { - MultiContext(unsigned int id_in, const HyperscanDB &db_in, ResultSet *rs_in, + MultiContext(unsigned int id_in, const BaseDB &db_in, ResultSet *rs_in, bool single_in, ostream &os) : id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {} unsigned int id; int block = 0; - const HyperscanDB &db; + const BaseDB &db; ResultSet *rs; u64a lastRawMatch = 0; /* store last known unadjusted match location */ u64a lastOrderMatch = 0; @@ -230,6 +265,75 @@ int callbackMulti(unsigned int id, unsigned long long from, return 0; } +#ifdef HS_HYBRID + +// Hybrid matcher callback. 
+// Called for each match reported during a hybrid scan. ctx must point at the
+// MultiContext of the scan in progress. A match is recorded, together with
+// its capture groups, when the context is in single-pattern mode or the id is
+// the one under test; inactive groups are stored as (-1, -1). Returns
+// CH_CALLBACK_TERMINATE once limit_matches results have been collected and
+// CH_CALLBACK_CONTINUE otherwise.
+static
+ch_callback_t callbackHybrid(unsigned id, unsigned long long from,
+                             unsigned long long to, unsigned, unsigned size,
+                             const ch_capture_t *captured, void *ctx) {
+    MultiContext *mctx = static_cast<MultiContext *>(ctx);
+    assert(mctx);
+    assert(mctx->rs);
+    assert(mctx->in_scan_call);
+
+    ostream &out = mctx->out;
+
+    // NOTE(review): only the end offset is shifted by the corpus prefix
+    // length; `from` and the capture offsets are used as reported. Confirm
+    // this is intended when a prefix is configured.
+    to -= g_corpora_prefix.size();
+
+    if (mctx->terminated) {
+        // A match arriving after we asked for termination is flagged on the
+        // result set; processing then continues as for any other match.
+        out << "UE2 Match @ (" << from << "," << to << ") for " << id
+            << " after termination" << endl;
+        mctx->rs->match_after_halt = true;
+    }
+
+    if (mctx->single || id == mctx->id) {
+        CaptureVec cap;
+        cap.reserve(size);
+        for (unsigned int i = 0; i < size; i++) {
+            if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) {
+                cap.push_back(make_pair(-1, -1));
+            } else {
+                cap.push_back(make_pair(captured[i].from, captured[i].to));
+            }
+        }
+        mctx->rs->addMatch(from, to, cap);
+    }
+
+    if (echo_matches) {
+        out << "Match @ [" << from << "," << to << "] for " << id << endl;
+        out << " Captured " << size << " groups: ";
+        for (unsigned int i = 0; i < size; i++) {
+            if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) {
+                out << "{} ";
+            } else {
+                out << "{" << captured[i].from << "," << captured[i].to << "} ";
+            }
+        }
+        out << endl;
+    }
+
+    if (limit_matches && mctx->rs->matches.size() == limit_matches) {
+        mctx->terminated = true;
+        return CH_CALLBACK_TERMINATE;
+    }
+
+    return CH_CALLBACK_CONTINUE;
+}
+
+// Hybrid matcher error callback.
+static +ch_callback_t errorCallback(UNUSED ch_error_event_t errorType, UNUSED unsigned int id, void *, + void *ctx) { + UNUSED MultiContext *mctx = static_cast(ctx); + assert(mctx); + assert(mctx->rs); + assert(mctx->in_scan_call); + + return CH_CALLBACK_SKIP_PATTERN; +} + +#endif // HS_HYBRID + static void filterLeftmostSom(ResultSet &rs) { if (rs.matches.size() <= 1) { @@ -252,6 +356,9 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr, const Grey &grey_in, unsigned int streamBlocks) : grey(grey_in), out(os), m_expr(expr), m_xcompile(false), m_streamBlocks(streamBlocks), scratch(nullptr), +#ifdef HS_HYBRID + chimeraScratch(nullptr), +#endif platform(plat) { // Build our mode flags. @@ -265,15 +372,27 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr, case MODE_VECTORED: m_mode = HS_MODE_VECTORED; break; + case MODE_HYBRID: + m_mode = 0; + break; } // Set desired SOM precision, if we're in streaming mode. if (colliderMode == MODE_STREAMING) { m_mode |= somPrecisionMode; } + +#ifdef HS_HYBRID + if (colliderMode == MODE_HYBRID && !no_groups) { + m_mode |= CH_MODE_GROUPS; + } +#endif } UltimateTruth::~UltimateTruth() { +#ifdef HS_HYBRID + ch_free_scratch(chimeraScratch); +#endif hs_free_scratch(scratch); } @@ -327,13 +446,13 @@ void mangle_scratch(hs_scratch_t *scratch) { scratch->fdr_conf_offset = 0xe4; } -bool UltimateTruth::blockScan(const HyperscanDB &hdb, const string &buffer, +bool UltimateTruth::blockScan(const BaseDB &bdb, const string &buffer, size_t align, match_event_handler callback, void *ctx_in, ResultSet *) { assert(colliderMode == MODE_BLOCK); assert(!m_xcompile); - const hs_database_t *db = hdb.db; + const hs_database_t *db = reinterpret_cast(bdb).db; assert(db); MultiContext *ctx = (MultiContext *)ctx_in; @@ -438,13 +557,13 @@ hs_stream_t *compressAndResetExpandStream(const hs_database_t *db, return out; } -bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer, +bool 
UltimateTruth::streamingScan(const BaseDB &bdb, const string &buffer, size_t align, match_event_handler callback, void *ctx_in, ResultSet *rs) { assert(colliderMode == MODE_STREAMING); assert(!m_xcompile); - const hs_database_t *db = hdb.db; + const hs_database_t *db = reinterpret_cast(bdb).db; assert(db); MultiContext *ctx = (MultiContext *)ctx_in; @@ -594,13 +713,13 @@ bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer, return ret == HS_SUCCESS; } -bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer, +bool UltimateTruth::vectoredScan(const BaseDB &bdb, const string &buffer, size_t align, match_event_handler callback, void *ctx_in, ResultSet *rs) { assert(colliderMode == MODE_VECTORED); assert(!m_xcompile); - const hs_database_t *db = hdb.db; + const hs_database_t *db = reinterpret_cast(bdb).db; assert(db); MultiContext *ctx = (MultiContext *)ctx_in; @@ -682,19 +801,67 @@ bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer, return true; } -bool UltimateTruth::run(unsigned int id, shared_ptr hdb, +#ifdef HS_HYBRID +bool UltimateTruth::hybridScan(const BaseDB &bdb, const string &buffer, + size_t align, ch_match_event_handler callback, + ch_error_event_handler error_callback, + void *ctx_in, ResultSet *) { + assert(colliderMode == MODE_HYBRID); + assert(!m_xcompile); + + const ch_database_t *db = reinterpret_cast(bdb).db; + assert(db); + MultiContext *ctx = (MultiContext *)ctx_in; + + char *realigned = setupScanBuffer(buffer.c_str(), buffer.size(), align); + if (!realigned) { + return false; + } + + if (use_copy_scratch && !cloneScratch()) { + return false; + } + + ctx->in_scan_call = true; + ch_error_t ret = + ch_scan(db, realigned, buffer.size(), 0, chimeraScratch, callback, + error_callback, ctx); + ctx->in_scan_call = false; + + if (g_verbose) { + out << "Scan call returned " << ret << endl; + } + + if (ctx->terminated) { + if (g_verbose && ret != CH_SCAN_TERMINATED) { + out << "Scan 
should have returned CH_SCAN_TERMINATED, returned " + << ret << " instead." << endl; + } + return ret == CH_SCAN_TERMINATED; + } + + if (g_verbose && ret != CH_SUCCESS) { + out << "Scan should have returned CH_SUCCESS, returned " << ret + << " instead." << endl; + } + + return ret == CH_SUCCESS; +} +#endif + +bool UltimateTruth::run(unsigned int id, shared_ptr bdb, const string &buffer, bool single_pattern, unsigned int align, ResultSet &rs) { assert(!m_xcompile); - assert(hdb); + assert(bdb); // Ensure that scratch is appropriate for this database. - if (!allocScratch(hdb)) { + if (!allocScratch(bdb)) { out << "Scratch alloc failed." << endl; return false; } - MultiContext ctx(id, *hdb, &rs, single_pattern, out); + MultiContext ctx(id, *bdb, &rs, single_pattern, out); if (!g_corpora_suffix.empty()) { ctx.use_max_offset = true; ctx.max_offset = buffer.size() - g_corpora_suffix.size(); @@ -702,11 +869,20 @@ bool UltimateTruth::run(unsigned int id, shared_ptr hdb, switch (colliderMode) { case MODE_BLOCK: - return blockScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); + return blockScan(*bdb, buffer, align, callbackMulti, &ctx, &rs); case MODE_STREAMING: - return streamingScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); + return streamingScan(*bdb, buffer, align, callbackMulti, &ctx, &rs); case MODE_VECTORED: - return vectoredScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); + return vectoredScan(*bdb, buffer, align, callbackMulti, &ctx, &rs); + case MODE_HYBRID: +#ifdef HS_HYBRID + return hybridScan(*bdb, buffer, align, callbackHybrid, errorCallback, + &ctx, &rs); +#else + cerr << "Hybrid mode not available in this build." 
<< endl; + abort(); +#endif + break; } assert(0); @@ -739,7 +915,7 @@ bool isOrdered(const string &expr, unsigned int flags) { return ordered; } -static unique_ptr +static unique_ptr compileHyperscan(vector &patterns, vector &flags, vector &idsvec, ptr_vector &ext, unsigned mode, const hs_platform_info *platform, string &error, @@ -762,7 +938,30 @@ compileHyperscan(vector &patterns, vector &flags, return ue2::make_unique(db, idsvec.begin(), idsvec.end()); } -shared_ptr UltimateTruth::compile(const set &ids, +#ifdef HS_HYBRID +static unique_ptr +compileHybrid(vector &patterns, + vector &flags, vector &idsvec, + unsigned mode, const hs_platform_info *platform, string &error) { + const unsigned count = patterns.size(); + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err; + + ch_error_t err = ch_compile_multi(&patterns[0], &flags[0], + &idsvec[0], count, mode, platform, &db, + &compile_err); + + if (err != HS_SUCCESS) { + error = compile_err->message; + ch_free_compile_error(compile_err); + return nullptr; + } + + return ue2::make_unique(db, idsvec.begin(), idsvec.end()); +} +#endif + +shared_ptr UltimateTruth::compile(const set &ids, string &error) const { // Build our vectors for compilation const size_t count = ids.size(); @@ -811,6 +1010,17 @@ shared_ptr UltimateTruth::compile(const set &ids, ext[n].edit_distance = edit_distance; } + if (colliderMode == MODE_HYBRID) { + if (ext[n].flags) { + error = "Hybrid does not support extended parameters."; + return nullptr; + } + // We can also strip some other flags in the hybrid matcher. 
+ flags[n] &= ~HS_FLAG_PREFILTER; // prefilter always used + flags[n] &= ~HS_FLAG_ALLOWEMPTY; // empty always allowed + flags[n] &= ~HS_FLAG_SOM_LEFTMOST; // SOM always on + } + n++; } @@ -827,8 +1037,18 @@ shared_ptr UltimateTruth::compile(const set &ids, idsvec.push_back(0); } - auto db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, platform, - error, grey); + unique_ptr db; + if (colliderMode == MODE_HYBRID) { +#ifdef HS_HYBRID + db = compileHybrid(patterns, flags, idsvec, m_mode, platform, error); +#else + error = "Hybrid mode not available in this build."; +#endif + } else { + db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, + platform, error, grey); + } + if (!db) { return nullptr; } @@ -850,18 +1070,29 @@ shared_ptr UltimateTruth::compile(const set &ids, return move(db); } -bool UltimateTruth::allocScratch(shared_ptr db) { +bool UltimateTruth::allocScratch(shared_ptr db) { assert(db); - // We explicitly avoid running scratch allocators for the same HyperscanDB + // We explicitly avoid running scratch allocators for the same BaseDB // over and over again by retaining a shared_ptr to the last one we saw. 
if (db == last_db) { return true; } - hs_error_t err = hs_alloc_scratch(db.get()->db, &scratch); - if (err != HS_SUCCESS) { - return false; + if (colliderMode == MODE_HYBRID) { +#ifdef HS_HYBRID + ch_error_t err = ch_alloc_scratch( + reinterpret_cast(db.get())->db, &chimeraScratch); + if (err != HS_SUCCESS) { + return false; + } +#endif // HS_HYBRID + } else { + hs_error_t err = hs_alloc_scratch( + reinterpret_cast(db.get())->db, &scratch); + if (err != HS_SUCCESS) { + return false; + } } last_db = db; @@ -869,20 +1100,40 @@ bool UltimateTruth::allocScratch(shared_ptr db) { } bool UltimateTruth::cloneScratch(void) { - hs_scratch_t *old_scratch = scratch; - hs_scratch_t *new_scratch; - hs_error_t ret = hs_clone_scratch(scratch, &new_scratch); - if (ret != HS_SUCCESS) { - DEBUG_PRINTF("failure to clone %d\n", ret); - return false; + if (colliderMode == MODE_HYBRID) { +#ifdef HS_HYBRID + ch_scratch_t *old_scratch = chimeraScratch; + ch_scratch_t *new_scratch; + ch_error_t ret = ch_clone_scratch(chimeraScratch, &new_scratch); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("failure to clone %d\n", ret); + return false; + } + chimeraScratch = new_scratch; + ret = ch_free_scratch(old_scratch); + if (ret != CH_SUCCESS) { + DEBUG_PRINTF("failure to free %d\n", ret); + return false; + } + DEBUG_PRINTF("hybrid scratch cloned from %p to %p\n", + old_scratch, chimeraScratch); +#endif // HS_HYBRID + } else { + hs_scratch_t *old_scratch = scratch; + hs_scratch_t *new_scratch; + hs_error_t ret = hs_clone_scratch(scratch, &new_scratch); + if (ret != HS_SUCCESS) { + DEBUG_PRINTF("failure to clone %d\n", ret); + return false; + } + scratch = new_scratch; + ret = hs_free_scratch(old_scratch); + if (ret != HS_SUCCESS) { + DEBUG_PRINTF("failure to free %d\n", ret); + return false; + } + DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch); } - scratch = new_scratch; - ret = hs_free_scratch(old_scratch); - if (ret != HS_SUCCESS) { - DEBUG_PRINTF("failure to free %d\n", ret); - 
return false; - } - DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch); return true; } @@ -947,20 +1198,35 @@ char *UltimateTruth::setupVecScanBuffer(const char *begin, size_t len, return ptr; } -bool UltimateTruth::saveDatabase(const HyperscanDB &hdb, +bool UltimateTruth::saveDatabase(const BaseDB &bdb, const string &filename) const { - return ::saveDatabase(hdb.db, filename.c_str(), g_verbose); + if (colliderMode == MODE_HYBRID) { + cerr << "Hybrid mode doesn't support serialization." << endl; + abort(); + } else { + return ::saveDatabase(reinterpret_cast(&bdb)->db, + filename.c_str(), g_verbose); + } + return false; } -shared_ptr +shared_ptr UltimateTruth::loadDatabase(const string &filename, const std::set &ids) const { - hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose); - if (!hs_db) { - return nullptr; + shared_ptr db; + + if (colliderMode == MODE_HYBRID) { + cerr << "Hybrid mode doesn't support deserialization." << endl; + abort(); + } else { + hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose); + if (!hs_db) { + return nullptr; + } + + db = make_shared(hs_db, ids.begin(), ids.end()); } - auto db = make_shared(hs_db, ids.begin(), ids.end()); assert(db); // Fill db::ordered with the expressions that require the ordered flag. 
diff --git a/tools/hscollider/UltimateTruth.h b/tools/hscollider/UltimateTruth.h index c8de8642..645cf297 100644 --- a/tools/hscollider/UltimateTruth.h +++ b/tools/hscollider/UltimateTruth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,10 @@ #include "hs.h" +#ifdef HS_HYBRID +#include "chimera/ch.h" +#endif + #include #include #include @@ -47,7 +51,7 @@ struct Grey; } // namespace ue2 -class HyperscanDB; +class BaseDB; class ResultSet; // Wrapper around ue2 to generate results for an expression and corpus. @@ -59,13 +63,13 @@ public: ~UltimateTruth(); - std::shared_ptr compile(const std::set &ids, + std::shared_ptr compile(const std::set &ids, std::string &error) const; - bool saveDatabase(const HyperscanDB &db, + bool saveDatabase(const BaseDB &db, const std::string &filename) const; - std::shared_ptr + std::shared_ptr loadDatabase(const std::string &filename, const std::set &ids) const; @@ -74,7 +78,7 @@ public: return !m_xcompile; } - bool run(unsigned id, std::shared_ptr db, + bool run(unsigned id, std::shared_ptr db, const std::string &buffer, bool single_pattern, unsigned align, ResultSet &rs); @@ -84,22 +88,28 @@ public: std::string dbFilename(const std::set &ids) const; private: - bool blockScan(const HyperscanDB &db, const std::string &buffer, + bool blockScan(const BaseDB &db, const std::string &buffer, size_t align, match_event_handler callback, void *ctx, ResultSet *rs); - bool streamingScan(const HyperscanDB &db, const std::string &buffer, + bool streamingScan(const BaseDB &db, const std::string &buffer, size_t align, match_event_handler callback, void *ctx, ResultSet *rs); - bool vectoredScan(const HyperscanDB &db, const std::string &buffer, + bool vectoredScan(const BaseDB &db, const std::string &buffer, size_t align, 
match_event_handler callback, void *ctx, ResultSet *rs); +#ifdef HS_HYBRID + bool hybridScan(const BaseDB &db, const std::string &buffer, + size_t align, ch_match_event_handler callback, + ch_error_event_handler error_callback, + void *ctx, ResultSet *rs); +#endif // HS_HYBRID char *setupScanBuffer(const char *buf, size_t len, size_t align); char *setupVecScanBuffer(const char *buf, size_t len, size_t align, unsigned int block_id); - bool allocScratch(std::shared_ptr db); + bool allocScratch(std::shared_ptr db); bool cloneScratch(void); @@ -126,6 +136,11 @@ private: // Scratch space for Hyperscan. hs_scratch_t *scratch; +#ifdef HS_HYBRID + // Scratch space for Chimera. + ch_scratch_t *chimeraScratch; +#endif // HS_HYBRID + // Temporary scan buffer used for realigned scanning std::vector m_scanBuf; @@ -134,7 +149,7 @@ private: // Last database we successfully allocated scratch for, so that we can // avoid unnecessarily reallocating for it. - std::shared_ptr last_db; + std::shared_ptr last_db; const hs_platform_info *platform; }; diff --git a/tools/hscollider/args.cpp b/tools/hscollider/args.cpp index a15977f9..3f53f8bc 100644 --- a/tools/hscollider/args.cpp +++ b/tools/hscollider/args.cpp @@ -76,6 +76,7 @@ void usage(const char *name, const char *error) { "blocks.\n"); printf(" -V NUM Use vectored mode, split data into ~NUM " "blocks.\n"); + printf(" -H Use hybrid mode.\n"); printf(" -Z {R or 0-%d} Only test one alignment, either as given or " "'R' for random.\n", MAX_MAX_UE2_ALIGN - 1); printf(" -q Quiet; display only match differences, no other " @@ -90,6 +91,7 @@ void usage(const char *name, const char *error) { printf(" -E DISTANCE Match all patterns within edit distance" " DISTANCE.\n"); printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n"); + printf(" --no-groups Disable capturing in Hybrid mode.\n"); printf("\n"); printf("Testing mode options:\n"); printf("\n"); @@ -157,7 +159,7 @@ void processArgs(int argc, char *argv[], CorpusProperties 
&corpus_gen_prop, vector *corpora, UNUSED Grey *grey, unique_ptr *plat_out) { static const char options[] - = "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8"; + = "-ab:cC:d:D:e:E:G:hHi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8"; s32 in_multi = 0; s32 in_corpora = 0; int pcreFlag = 1; @@ -180,6 +182,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, {"no-signal-handler", 0, &no_signal_handler, 1}, {"compress-expand", 0, &compressFlag, 1}, {"compress-reset-expand", 0, &compressResetFlag, 1}, + {"no-groups", 0, &no_groups, 1}, {nullptr, 0, nullptr, 0}}; for (;;) { @@ -271,6 +274,15 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, case 'h': usage(argv[0], nullptr); exit(0); + case 'H': + if (colliderMode != MODE_BLOCK) { + usage(argv[0], "You can only use one mode at a time!"); + exit(1); + } + colliderMode = MODE_HYBRID; + // Disable graph truth in hybrid mode + nfaFlag = 0; + break; case 'i': loadDatabases = true; serializePath = optarg; @@ -542,6 +554,11 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, exit(1); } + if (colliderMode == MODE_HYBRID && !ue2Flag) { + usage(argv[0], "You cannot disable UE2 engine in Hybrid mode."); + exit(1); + } + // need at least two pattern engines active if (nfaFlag + pcreFlag + ue2Flag < 2) { usage(argv[0], "At least two pattern engines should be active."); diff --git a/tools/hscollider/common.h b/tools/hscollider/common.h index da85790c..d9a0144c 100644 --- a/tools/hscollider/common.h +++ b/tools/hscollider/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,7 +36,8 @@ enum ColliderMode { MODE_BLOCK, MODE_STREAMING, - MODE_VECTORED + MODE_VECTORED, + MODE_HYBRID }; extern unsigned numThreads; @@ 
-68,6 +69,7 @@ extern unsigned max_ue2_align; extern size_t g_memoryLimit; extern bool force_utf8; extern int force_prefilter; +extern int no_groups; extern unsigned somPrecisionMode; extern unsigned limit_matches; extern unsigned randomSeed; diff --git a/tools/hscollider/main.cpp b/tools/hscollider/main.cpp index 9877b6ae..4eaa3962 100644 --- a/tools/hscollider/main.cpp +++ b/tools/hscollider/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2018, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -448,6 +448,9 @@ void printMode(void) { case MODE_VECTORED: cout << "Vectored-" << g_streamBlocks; break; + case MODE_HYBRID: + cout << "Hybrid"; + break; } if (use_copy_scratch) { @@ -690,7 +693,7 @@ shared_ptr constructDatabase(const set &ids, if (loadDatabases) { string filename = ultimate.dbFilename(ids); - shared_ptr db = ultimate.loadDatabase(filename, ids); + shared_ptr db = ultimate.loadDatabase(filename, ids); if (!db) { if (!g_quiet) { cout << "FAILED: could not load database " << filename << endl; @@ -706,7 +709,7 @@ shared_ptr constructDatabase(const set &ids, // If we're not runnable (i.e. we're cross-compiling), let's at least // try to build the database. if (!ultimate.runnable()) { - shared_ptr db = ue2->get(ultimate); + shared_ptr db = ue2->get(ultimate); assert(db); // throws otherwise } @@ -872,7 +875,7 @@ void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph, assert(use_UE2); Corpus &corpus = unit.corpus; - shared_ptr db; + shared_ptr db; if (use_UE2) { // Acquire UE2 database. 
debug_stage = STAGE_UE2_COMPILE; @@ -1648,6 +1651,7 @@ void printSettingsV(const vector &corporaFiles, case MODE_BLOCK: cout << "block mode"; break; case MODE_STREAMING: cout << "streaming mode"; break; case MODE_VECTORED: cout << "vectored mode"; break; + case MODE_HYBRID: cout << "hybrid mode"; break; } cout << endl; @@ -1746,6 +1750,7 @@ void printSettingsQ(const vector &corporaFiles, case MODE_BLOCK: cout << "block mode"; break; case MODE_STREAMING: cout << "streaming mode"; break; case MODE_VECTORED: cout << "vectored mode"; break; + case MODE_HYBRID: cout << "hybrid mode"; break; } cout << endl; diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 61418510..47b0ae9b 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -123,22 +123,58 @@ set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}") target_link_libraries(unit-internal hs corpusomatic) endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) -# -# build target to run unit tests -# -if (NOT RELEASE_BUILD) -add_custom_target( - unit - COMMAND bin/unit-internal - COMMAND bin/unit-hyperscan - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - DEPENDS unit-internal unit-hyperscan -) -else () -add_custom_target( - unit - COMMAND bin/unit-hyperscan - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - DEPENDS unit-hyperscan -) +if (BUILD_CHIMERA) + # enable Chimera unit tests + set(unit_chimera_SOURCES + ${gtest_SOURCES} + chimera/allocators.cpp + chimera/arg_checks.cpp + chimera/bad_patterns.cpp + chimera/compat.cpp + chimera/main.cpp + chimera/scan.cpp + ) + add_executable(unit-chimera ${unit_chimera_SOURCES}) + target_link_libraries(unit-chimera chimera hs pcre) + # + # build target to run unit tests + # + if (NOT RELEASE_BUILD) + add_custom_target( + unit + COMMAND bin/unit-internal + COMMAND bin/unit-hyperscan + COMMAND bin/unit-chimera + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-internal unit-hyperscan unit-chimera + ) + else () + add_custom_target( + unit + COMMAND bin/unit-hyperscan + 
COMMAND bin/unit-chimera + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-hyperscan unit-chimera + ) + endif() +else() + # + # build target to run unit tests + # + if (NOT RELEASE_BUILD) + add_custom_target( + unit + COMMAND bin/unit-internal + COMMAND bin/unit-hyperscan + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-internal unit-hyperscan + ) + else () + add_custom_target( + unit + COMMAND bin/unit-hyperscan + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS unit-hyperscan + ) + endif() endif() diff --git a/unit/chimera/allocators.cpp b/unit/chimera/allocators.cpp new file mode 100644 index 00000000..bfceba76 --- /dev/null +++ b/unit/chimera/allocators.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +#include +#include + +using std::string; + +static void *null_malloc(size_t) { return nullptr; } + +// Helper: correctly construct a simple database. +static +void makeDatabase(ch_database_t **hydb) { + static const char *expr[] = { "foobar" }; + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + err = ch_compile_multi(expr, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + *hydb = db; +} + +TEST(HybridAllocator, DatabaseInfoBadAlloc) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ASSERT_TRUE(db != nullptr); + + ch_set_allocator(null_malloc, nullptr); + + char *info = nullptr; + ch_error_t err = ch_database_info(db, &info); + ASSERT_EQ(CH_NOMEM, err); + + ch_set_allocator(nullptr, nullptr); + ch_free_database(db); +} + +static +void * two_aligned_malloc(size_t len) { + void *mem = malloc(len + 2); + if (!mem) { + return nullptr; + } + return (char *)mem + 2; +} + +static +void two_aligned_free(void *mem) { + if (!mem) { + return; + } + // Allocated with two_aligned_malloc above. 
+ free((char *)mem - 2); +} + +TEST(HybridAllocator, TwoAlignedCompile) { + ch_set_database_allocator(two_aligned_malloc, two_aligned_free); + + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const hs_platform_info_t *platform = nullptr; + ch_error_t err = + ch_compile("foobar", 0, CH_MODE_GROUPS, platform, &db, &compile_err); + ASSERT_EQ(CH_COMPILER_ERROR, err); + ASSERT_EQ(nullptr, db); + ASSERT_NE(nullptr, compile_err); + ch_free_compile_error(compile_err); + ch_set_database_allocator(nullptr, nullptr); +} + +TEST(HybridAllocator, TwoAlignedCompileError) { + ch_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const hs_platform_info_t *platform = nullptr; + ch_error_t err = + ch_compile("\\1", 0, CH_MODE_GROUPS, platform, &db, &compile_err); + ASSERT_EQ(CH_COMPILER_ERROR, err); + ASSERT_EQ(nullptr, db); + ASSERT_NE(nullptr, compile_err); + EXPECT_STREQ("Allocator returned misaligned memory.", compile_err->message); + ch_free_compile_error(compile_err); + ch_set_database_allocator(nullptr, nullptr); + ch_set_misc_allocator(nullptr, nullptr); +} + +TEST(HybridAllocator, TwoAlignedDatabaseInfo) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_set_misc_allocator(two_aligned_malloc, two_aligned_free); + + char *info = nullptr; + ch_error_t err = ch_database_info(db, &info); + ASSERT_EQ(CH_BAD_ALLOC, err); + + ch_set_misc_allocator(nullptr, nullptr); + ch_free_database(db); +} + +TEST(HybridAllocator, TwoAlignedAllocScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_set_scratch_allocator(two_aligned_malloc, two_aligned_free); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_BAD_ALLOC, err); + + ch_set_scratch_allocator(nullptr, nullptr); + ch_free_database(db); +} diff --git a/unit/chimera/arg_checks.cpp b/unit/chimera/arg_checks.cpp new file mode 100644 index 
00000000..ea1cda15 --- /dev/null +++ b/unit/chimera/arg_checks.cpp @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +static char garbage[] = "TEST(HybridArgChecks, DatabaseSizeNoDatabase) {" \ + " size_t sz = ch_database_size(0);" \ + " ASSERT_EQ(0, sz);"; + +namespace /* anonymous */ { + +// Dummy callback: does nothing, returns 0 (keep matching) +ch_callback_t dummyHandler(unsigned, unsigned long long, + unsigned long long, unsigned, unsigned, + const ch_capture_t *, void *) { + // empty + return CH_CALLBACK_CONTINUE; +} + +// Helper: correctly construct a simple database. +static +void makeDatabase(ch_database_t **hydb) { + static const char *expr[] = { "foo.*bar" }; + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + err = ch_compile_multi(expr, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(db != nullptr); + + *hydb = db; +} + +// Helper: given a database, build me some scratch. +static +void makeScratch(const ch_database_t *db, + ch_scratch_t **scratch) { + ch_error_t err = ch_alloc_scratch(db, scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(*scratch != nullptr); +} + +// Break the magic number of the given database. +void breakDatabaseMagic(ch_database *db) { + // database magic should be 0xdbdb at the start + ASSERT_TRUE(memcmp("\xde\xde", db, 2) == 0); + *(char *)db = 0xdc; +} + +// Break the version number of the given database. +void breakDatabaseVersion(ch_database *db) { + // database version is the second u32 + *((char *)db + 4) += 1; +} + +// Check that CH_version gives us a reasonable string back +TEST(HybridArgChecks, Version) { + const char *version = ch_version(); + ASSERT_TRUE(version != nullptr); + ASSERT_TRUE(version[0] >= '0' && version[0] <= '9') + << "First byte should be a digit."; + ASSERT_EQ('.', version[1]) << "Second byte should be a dot."; +} + +// ch_compile: Hand the compiler a bogus flag. 
+TEST(HybridArgChecks, SingleBogusFlags) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badflags[] = { + 0xffffffff, + 16, + 128, + 256, + 512, + }; + + for (size_t i = 0; i < sizeof(badflags)/sizeof(badflags[0]); i++) { + const char expr[] = "foobar"; + err = ch_compile(expr, badflags[i], 0, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Unrecognized flag used.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile: Hand the compiler a bogus mode. +TEST(HybridArgChecks, SingleBogusMode) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badModes[] = { + 0xffffffff, + 1, + 2, + CH_MODE_GROUPS << 1, // this was our largest mode flag + }; + + for (size_t i = 0; i < sizeof(badModes)/sizeof(badModes[0]); i++) { + const char expr[] = "foobar"; + err = ch_compile(expr, 0, badModes[i], nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile: Compile a nullptr pattern set) +TEST(HybridArgChecks, SingleCompileBlockNoPattern) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + err = ch_compile(nullptr, 0, 0, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile: Compile a pattern to a nullptr database ptr +TEST(HybridArgChecks, SingleCompileBlockNoDatabase) { + ch_compile_error_t *compile_err = nullptr; + const char expr[] = "foobar"; + ch_error_t err; + err = ch_compile(expr, 0, 0, nullptr, nullptr, 
&compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_multi: Hand the compiler a bogus flag. +TEST(HybridArgChecks, MultiBogusFlags) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badflags[] = { + 0xffffffff, + 16, // HS_FLAG_ERROREOD + 128, + 256, + 512, + }; + + for (size_t i = 0; i < sizeof(badflags)/sizeof(badflags[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_multi(expr, &badflags[i], nullptr, 1, 0, nullptr, &db, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Unrecognized flag used.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_multi: Hand the ch_compile_multi a bogus mode. +TEST(HybridArgChecks, MultiBogusMode) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badModes[] = { + 0xffffffff, + 1, + 2, + CH_MODE_GROUPS << 1, // this was our largest mode flag + }; + + for (size_t i = 0; i < sizeof(badModes)/sizeof(badModes[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_multi(expr, nullptr, nullptr, 1, badModes[i], nullptr, + &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_multi: Compile a nullptr pattern set (block mode) +TEST(HybridArgChecks, MultiCompileBlockNoPattern) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + err = ch_compile_multi(nullptr, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + 
EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_multi: Compile a set of zero patterns +TEST(HybridArgChecks, MultiCompileZeroPatterns) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_multi(expr, nullptr, nullptr, 0, 0, nullptr, &db, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_multi: Compile a pattern to a nullptr database ptr +TEST(HybridArgChecks, MultiCompileBlockNoDatabase) { + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_multi(expr, nullptr, nullptr, 1, 0, nullptr, nullptr, + &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_ext_multi: Hand the compiler a bogus flag. +TEST(HybridArgChecks, ExtMultiBogusFlags) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badflags[] = { + 0xffffffff, + 16, // HS_FLAG_ERROREOD + 128, + 256, + 512, + }; + + for (size_t i = 0; i < sizeof(badflags)/sizeof(badflags[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_ext_multi(expr, &badflags[i], nullptr, 1, 0, + 10000000, 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Unrecognized flag used.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_ext_multi: Hand the ch_compile_multi a bogus mode. 
+TEST(HybridArgChecks, ExtMultiBogusMode) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + + static const unsigned int badModes[] = { + 0xffffffff, + 1, + 2, + CH_MODE_GROUPS << 1, // this was our largest mode flag + }; + + for (size_t i = 0; i < sizeof(badModes)/sizeof(badModes[0]); i++) { + const char *expr[] = { "foobar" }; + err = ch_compile_ext_multi(expr, nullptr, nullptr, 1, badModes[i], + 10000000, 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message); + ch_free_compile_error(compile_err); + } +} + +// ch_compile_ext_multi: Compile a nullptr pattern set (block mode) +TEST(HybridArgChecks, ExtMultiCompileBlockNoPattern) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err; + err = ch_compile_ext_multi(nullptr, nullptr, nullptr, 1, 0, 10000000, + 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_ext_multi: Compile a set of zero patterns +TEST(HybridArgChecks, ExtMultiCompileZeroPatterns) { + ch_database_t *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_ext_multi(expr, nullptr, nullptr, 0, 0, 10000000, + 8000, nullptr, &db, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(db == nullptr); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_compile_ext_multi: Compile a pattern to a nullptr database ptr +TEST(HybridArgChecks, ExtMultiCompileBlockNoDatabase) { + ch_compile_error_t *compile_err = nullptr; + const char *expr[] = {"foobar"}; + ch_error_t err; + err = ch_compile_ext_multi(expr, nullptr, nullptr, 1, 0, 10000000, + 8000, nullptr, 
nullptr, &compile_err); + EXPECT_EQ(CH_COMPILER_ERROR, err); + EXPECT_TRUE(compile_err != nullptr); + ch_free_compile_error(compile_err); +} + +// ch_scan: Call with no database +TEST(HybridArgChecks, ScanBlockNoDatabase) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + ch_error_t err = ch_scan(nullptr, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + ASSERT_NE(CH_SUCCESS, err); + EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_scan: Call with a database with broken magic +TEST(HybridArgChecks, ScanBlockBrokenDatabaseMagic) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + // break the database here, after scratch alloc + breakDatabaseMagic(db); + + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + ASSERT_EQ(CH_INVALID, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + free(db); +} + +// ch_scan: Call with a database with broken version +TEST(HybridArgChecks, ScanBlockBrokenDatabaseVersion) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + // break the database here, after scratch alloc + breakDatabaseVersion(db); + + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + ASSERT_EQ(CH_DB_VERSION_ERROR, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_scan: Call with no data +TEST(HybridArgChecks, ScanBlockNoData) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + ch_error_t err = ch_scan(db, nullptr, 4, 0, scratch, dummyHandler, + nullptr, nullptr); + ASSERT_NE(CH_SUCCESS, err); + 
EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_scan: Call with no scratch +TEST(HybridArgChecks, ScanBlockNoScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_error_t err = ch_scan(db, "data", 4, 0, nullptr, dummyHandler, + nullptr, nullptr); + ASSERT_NE(CH_SUCCESS, err); + EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + ch_free_database(db); +} + +// ch_scan: Call with no event handler +TEST(HybridArgChecks, ScanBlockNoHandler) { + ch_database_t *db = nullptr; + makeDatabase(&db); + ch_scratch_t *scratch = nullptr; + makeScratch(db, &scratch); + + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, nullptr, nullptr, + nullptr); + ASSERT_EQ(CH_SUCCESS, err); + EXPECT_NE(CH_SCAN_TERMINATED, err); + + // teardown + err = ch_free_scratch(scratch); + ASSERT_EQ(CH_SUCCESS, err); + ch_free_database(db); +} + +// ch_alloc_scratch: Call with no database +TEST(HybridArgChecks, AllocScratchNoDatabase) { + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(nullptr, &scratch); + EXPECT_NE(CH_SUCCESS, err); + EXPECT_TRUE(scratch == nullptr); +} + +// ch_alloc_scratch: Call with nullptr ptr-to-scratch +TEST(HybridArgChecks, AllocScratchNullScratchPtr) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_error_t err = ch_alloc_scratch(db, nullptr); + ASSERT_EQ(CH_INVALID, err); + + // teardown + ch_free_database(db); +} + +// ch_alloc_scratch: Call with bogus scratch +TEST(HybridArgChecks, AllocScratchBogusScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + ch_scratch_t *blah = (ch_scratch_t *)malloc(100); + memset(blah, 0xf0, 100); + ch_error_t err = ch_alloc_scratch(db, &blah); + ASSERT_EQ(CH_INVALID, err); + + // teardown + free(blah); + ch_free_database(db); +} + +// ch_alloc_scratch: Call with broken database magic +TEST(HybridArgChecks, AllocScratchBadDatabaseMagic) { + ch_database_t *db = nullptr; + 
makeDatabase(&db); + + breakDatabaseMagic(db); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_INVALID, err); + + // teardown + free(db); +} + +// ch_alloc_scratch: Call with broken database version +TEST(HybridArgChecks, AllocScratchBadDatabaseVersion) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + breakDatabaseVersion(db); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_DB_VERSION_ERROR, err); + + // teardown + ch_free_database(db); +} + +// ch_clone_scratch: Call with no source scratch +TEST(HybridArgChecks, CloneScratchNoSource) { + ch_scratch_t *scratch = nullptr, *scratch2 = nullptr; + ch_error_t err = ch_clone_scratch(scratch, &scratch2); + EXPECT_NE(CH_SUCCESS, err); + EXPECT_TRUE(scratch2 == nullptr); +} + +// ch_database_size: Call with no database +TEST(HybridArgChecks, DatabaseSizeNoDatabase) { + size_t sz = 0; + ch_error_t err = ch_database_size(0, &sz); + ASSERT_EQ(CH_INVALID, err); + ASSERT_EQ(0U, sz); +} + +// ch_clone_scratch: bad scratch arg +TEST(HybridArgChecks, CloneBadScratch) { + // Try cloning the scratch + void *local_garbage = malloc(sizeof(garbage)); + memcpy(local_garbage, garbage, sizeof(garbage)); + ch_scratch_t *cloned = nullptr; + ch_scratch_t *scratch = (ch_scratch_t *)local_garbage; + ch_error_t err = ch_clone_scratch(scratch, &cloned); + free(local_garbage); + ASSERT_EQ(CH_INVALID, err); +} + +// ch_scan: bad scratch arg +TEST(HybridArgChecks, ScanBadScratch) { + ch_database_t *db = nullptr; + makeDatabase(&db); + + void *local_garbage = malloc(sizeof(garbage)); + memcpy(local_garbage, garbage, sizeof(garbage)); + + ch_scratch_t *scratch = (ch_scratch_t *)local_garbage; + ch_error_t err = ch_scan(db, "data", 4, 0, scratch, + dummyHandler, nullptr, nullptr); + free(local_garbage); + ASSERT_EQ(CH_INVALID, err); + + // teardown + ch_free_database(db); +} + +TEST(HybridArgChecks, ch_free_database_null) { + 
ch_error_t err = ch_free_database(nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST(HybridArgChecks, ch_free_database_garbage) { + ch_error_t err = ch_free_database((ch_database_t *)garbage); + ASSERT_EQ(CH_INVALID, err); +} + +TEST(HybridArgChecks, ch_free_scratch_null) { + ch_error_t err = ch_free_scratch(nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST(HybridArgChecks, ch_free_scratch_garbage) { + ch_error_t err = ch_free_scratch((ch_scratch_t *)garbage); + ASSERT_EQ(CH_INVALID, err); +} + +TEST(HybridArgChecks, ch_free_compile_error_null) { + ch_error_t err = ch_free_compile_error(nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +} // namespace + diff --git a/unit/chimera/bad_patterns.cpp b/unit/chimera/bad_patterns.cpp new file mode 100644 index 00000000..0e6ce5d9 --- /dev/null +++ b/unit/chimera/bad_patterns.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +using namespace testing; + +class HybridCompile : public TestWithParam { + // empty +}; + +TEST_P(HybridCompile, BadPattern) { + ch_error_t err; + ch_compile_error_t *compile_err = nullptr; + const char *pattern = GetParam(); + ch_database_t *db = nullptr; + + err = ch_compile_multi(&pattern, nullptr, nullptr, 1, 0, nullptr, &db, + &compile_err); + ASSERT_NE(CH_SUCCESS, err) << "Compile should have failed for expr: " + << pattern; + ASSERT_TRUE(db == nullptr); + ASSERT_TRUE(compile_err != nullptr); + + ch_free_compile_error(compile_err); +} + +static +const char * BAD_PATTERNS[] = { + // unmatched parens + "(foo", + "foo)", + "((foo)", + "(foo))", + // nothing to repeat + "a+++", + "a+?+", + "a???", + "a??+", + "?qa", + "*abc", + "+abc", + // repeating boundaries is not allowed (UE-1007) + "^?0", + "^*0", + "^+0", + "^{1,3}0", + "0$?", + "0$*", + "0$+", + "0${1,3}", + // char classes + "[]", + "[]foobar", + "[`-\\80", + // bad named classes + "[[:foo:]]", + "[[:1234:]]", + "[[:f\\oo:]]", + "[[: :]]", + "[[:...:]]", + "[[:l\\ower:]]", + "[[:abc\\:]]", + "[abc[:x\\]pqr:]]", + "[[:a\\dz:]]", + "foobar\\", // trailing unescaped backslash +}; + +INSTANTIATE_TEST_CASE_P(Compile, HybridCompile, ValuesIn(BAD_PATTERNS)); diff --git a/unit/chimera/compat.cpp b/unit/chimera/compat.cpp new file mode 100644 index 00000000..2599656a --- /dev/null +++ b/unit/chimera/compat.cpp @@ -0,0 
+1,56 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gtest/gtest.h" +#include "chimera/ch.h" +#include "hs.h" + +// We currently depend on our common (meaning) hash defines having the same +// values. 
+TEST(HybridCompat, Defines) { + // flags + EXPECT_EQ(HS_FLAG_CASELESS, CH_FLAG_CASELESS); + EXPECT_EQ(HS_FLAG_DOTALL, CH_FLAG_DOTALL); + EXPECT_EQ(HS_FLAG_MULTILINE, CH_FLAG_MULTILINE); + EXPECT_EQ(HS_FLAG_SINGLEMATCH, CH_FLAG_SINGLEMATCH); + EXPECT_EQ(HS_FLAG_UTF8, CH_FLAG_UTF8); + EXPECT_EQ(HS_FLAG_UCP, CH_FLAG_UCP); + + // errors + EXPECT_EQ(HS_SUCCESS, CH_SUCCESS); + EXPECT_EQ(HS_INVALID, CH_INVALID); + EXPECT_EQ(HS_NOMEM, CH_NOMEM); + EXPECT_EQ(HS_SCAN_TERMINATED, CH_SCAN_TERMINATED); + EXPECT_EQ(HS_COMPILER_ERROR, CH_COMPILER_ERROR); + EXPECT_EQ(HS_DB_VERSION_ERROR, CH_DB_VERSION_ERROR); + EXPECT_EQ(HS_DB_PLATFORM_ERROR, CH_DB_PLATFORM_ERROR); + EXPECT_EQ(HS_DB_MODE_ERROR, CH_DB_MODE_ERROR); + EXPECT_EQ(HS_BAD_ALIGN, CH_BAD_ALIGN); + EXPECT_EQ(HS_BAD_ALLOC, CH_BAD_ALLOC); + EXPECT_EQ(HS_SCRATCH_IN_USE, CH_SCRATCH_IN_USE); +} diff --git a/unit/chimera/main.cpp b/unit/chimera/main.cpp new file mode 100644 index 00000000..9ab663c3 --- /dev/null +++ b/unit/chimera/main.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gtest/gtest.h" + +// Driver: run all the tests (defined in other source files in this directory) +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/unit/chimera/scan.cpp b/unit/chimera/scan.cpp new file mode 100644 index 00000000..b1dd73b2 --- /dev/null +++ b/unit/chimera/scan.cpp @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "gtest/gtest.h" +#include "chimera/ch.h" + +using namespace std; +using namespace testing; + +namespace { + +class HybridScanParams { +public: + HybridScanParams() {} + HybridScanParams(const char *s, unsigned int f) + : patterns(1, s), flags(1, f) {} + + void add(const char *pattern, unsigned int myflags) { + patterns.push_back(pattern); + flags.push_back(myflags); + } + + size_t size() const { + return patterns.size(); + } + + const char * const * getPatterns() const { + return &patterns[0]; + } + + const unsigned int * getFlags() const { + return &flags[0]; + } + +private: + vector patterns; + vector flags; +}; + +static +vector paramFactory() { + vector hsp; + + // Some simple single-pattern cases. + hsp.push_back(HybridScanParams(".", CH_FLAG_DOTALL)); + hsp.push_back(HybridScanParams("foobar", 0)); + hsp.push_back(HybridScanParams("foo.*bar", 0)); + hsp.push_back(HybridScanParams("fred.*bill", CH_FLAG_DOTALL)); + hsp.push_back(HybridScanParams(".*", 0)); // vacuosity! 
+ hsp.push_back(HybridScanParams("\\A(.?.{7,27}jf[tmqq]l(f|t|hgmr.+.fg|abks)){3,7}", 0)); + hsp.push_back(HybridScanParams("^begin", CH_FLAG_MULTILINE)); + hsp.push_back(HybridScanParams("match", CH_FLAG_SINGLEMATCH)); + + // Single-pattern cases where the pattern isn't supported by hyperscan but + // can be prefiltered. + hsp.push_back(HybridScanParams("foo(?!bar)", 0)); + hsp.push_back(HybridScanParams("(sens|respons)e and \\1ibility", 0)); + + // A case that can't be prefiltered (as of this writing) because it's too + // gosh-darned big. This tests that the hybrid matcher can run without the + // multi-matcher (or with a "fake" one). + hsp.push_back(HybridScanParams("((c(p|p)h{2,}bh.|p|((((cq|j|c|(\\b)|.[^nbgn]|(\\B)[qfh]a)){10,12}|ih|a|mnde[pa].|.g)){5,8})){3}", 0)); + + // Simple multi-pattern literal case. + hsp.push_back(HybridScanParams()); + hsp.back().add("hatstand", 0); + hsp.back().add("teakettle", 0); + hsp.back().add("badgerbrush", 0); + hsp.back().add("mnemosyne", 0); + + // More complex multi-pattern case. + hsp.push_back(HybridScanParams()); + hsp.back().add("foo.{3,7}bar", 0); + hsp.back().add("foo.{30,70}bar", 0); + hsp.back().add("foobar.*foobar", 0); + hsp.back().add("^blingwrapper.*foo", 0); + hsp.back().add("[0-9a-f]{70,}\\n", 0); + + // A couple of trivial Unicode patterns, mostly to make sure we accept + // the flags. + hsp.push_back(HybridScanParams()); + hsp.back().add("foo.*bar", CH_FLAG_UTF8); + hsp.back().add("today", CH_FLAG_UTF8|CH_FLAG_UCP); + + // PCRE exotica. + hsp.push_back(HybridScanParams()); + hsp.back().add("benign literal", 0); + hsp.back().add("(?|(abc)|(def))\\1", 0); + hsp.back().add("(?|(abc)|(def))(?1)", 0); + hsp.back().add("(sens|respons)e and \\1ibility", 0); + hsp.back().add("\\w+(?=;)", 0); + hsp.back().add("foo(?!bar)", 0); + hsp.back().add("(?<=bullock|donkey)", 0); + + return hsp; +} + +// Dummy callback. 
+static +ch_callback_t dummyHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned,const ch_capture_t *, void *) { + // empty + return CH_CALLBACK_CONTINUE; +} + +static +void checkGroups(unsigned int num, const ch_capture_t *captured) { + // We should have _some_ group info. + ASSERT_LT(0U, num); + ASSERT_TRUE(captured != nullptr); + + // Group 0 is always active. + ASSERT_TRUE(captured[0].flags & CH_CAPTURE_FLAG_ACTIVE); + + // Sanity-checking. + for (unsigned int i = 0; i < num; i++) { + if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) { + continue; + } + ASSERT_LE(captured[i].from, captured[i].to) << "Group " << i + << "not sane."; + } +} + +// Dummy callback that checks that we had some groups set. +static +ch_callback_t dummyGroupHandler(unsigned, unsigned long long, + unsigned long long, unsigned, unsigned num, + const ch_capture_t *captured, void *) { + checkGroups(num, captured); + return CH_CALLBACK_CONTINUE; +} + +class HybridScan : public TestWithParam> { +protected: + virtual void SetUp() { + ch_error_t err; + ch_compile_error_t *compile_err = nullptr; + const HybridScanParams &hsp = get<0>(GetParam()); + groups = get<1>(GetParam()); + + err = ch_compile_ext_multi(hsp.getPatterns(), hsp.getFlags(), nullptr, + hsp.size(), groups ? 
CH_MODE_GROUPS : + CH_MODE_NOGROUPS, 10000000, 8000, + nullptr, &db, &compile_err); + ASSERT_EQ(err, CH_SUCCESS); + ASSERT_TRUE(db != nullptr); + + err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(err, CH_SUCCESS); + ASSERT_TRUE(scratch != nullptr); + } + + virtual void TearDown() { + ch_free_database(db); + ch_free_scratch(scratch); + } + + ch_database_t *db = nullptr; + ch_scratch_t *scratch = nullptr; + bool groups; +}; + +static const string SCAN_DATA( + "Beware the Jabberwock, my son!\n" + "The jaws that bite, the claws that catch!\n" + "Beware the Jubjub bird, and shun\n" + "The frumious Bandersnatch!\n"); + +TEST_P(HybridScan, BuildAndScan) { + ASSERT_TRUE(db != nullptr); + + size_t sz; + ch_error_t err = ch_database_size(db, &sz); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_LT(16U, sz); + + ch_match_event_handler cb = groups ? dummyGroupHandler : dummyHandler; + + err = ch_scan(db, SCAN_DATA.c_str(), SCAN_DATA.length(), 0, + scratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST_P(HybridScan, ScanNearly4KData) { + ASSERT_TRUE(db != nullptr); + + string data(4000, '*'); // it's full of stars! + + // Insert some strings that will match a few patterns. + data.insert(278, "foo"); + data.insert(285, "bar"); + data.insert(1178, "foobar"); + data.insert(1894, "bar"); + data.insert(3000, "foobar"); + + ch_match_event_handler cb = groups ? dummyGroupHandler : dummyHandler; + + ch_error_t err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST_P(HybridScan, ScanBigData) { + ASSERT_TRUE(db != nullptr); + + // More than 4MB, as that pushes us into using PCRE for non-Pawn cases. + string data(5*1024*1024, '*'); // it's full of stars! + + // Insert some strings that will match a few patterns. + data.insert(278, "foo"); + data.insert(285, "bar"); + data.insert(1178, "foobar"); + data.insert(1894, "bar"); + data.insert(3000, "foobar"); + + ch_match_event_handler cb = groups ? 
dummyGroupHandler : dummyHandler; + + ch_error_t err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); +} + +TEST_P(HybridScan, ScanClonedScratch) { + ASSERT_TRUE(db != nullptr); + + ch_error_t err; + ch_scratch_t *clonedScratch = nullptr; + err = ch_clone_scratch(scratch, &clonedScratch); + ASSERT_EQ(CH_SUCCESS, err); + + ch_match_event_handler cb = groups ? dummyGroupHandler : dummyHandler; + + err = ch_scan(db, SCAN_DATA.c_str(), SCAN_DATA.length(), 0, + clonedScratch, cb, nullptr, nullptr); + ASSERT_EQ(CH_SUCCESS, err); + + ch_free_scratch(clonedScratch); +} + +TEST_P(HybridScan, DatabaseInfo) { + ASSERT_TRUE(db != nullptr); + + char *info = nullptr; + ch_error_t err = ch_database_info(db, &info); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(info != nullptr); + + const string strinfo(info); + const string prefix("Chimera "); + ASSERT_GE(strinfo.size(), prefix.size()); + ASSERT_EQ(prefix, strinfo.substr(0, prefix.size())); + + free(info); +} + +TEST_P(HybridScan, NonZeroScratchSize) { + ASSERT_TRUE(db != nullptr); + size_t curr_size; + ch_error_t err = ch_scratch_size(scratch, &curr_size); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_LT(0, curr_size); +} + +INSTANTIATE_TEST_CASE_P(Scan, HybridScan, + Combine(ValuesIn(paramFactory()), Bool())); + +// Counting callback that returns CH_CALLBACK_CONTINUE. +static +ch_callback_t countHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned, const ch_capture_t *, + void *ctx) { + unsigned int *count = (unsigned int *)ctx; + ++(*count); + return CH_CALLBACK_CONTINUE; +} + +// Counting callback that returns CH_CALLBACK_SKIP_PATTERN. +static +ch_callback_t skipHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned, const ch_capture_t *, + void *ctx) { + unsigned int *count = (unsigned int *)ctx; + ++(*count); + return CH_CALLBACK_SKIP_PATTERN; +} + +// Counting callback that returns CH_CALLBACK_TERMINATE. 
+static +ch_callback_t terminateHandler(unsigned, unsigned long long, unsigned long long, + unsigned, unsigned, const ch_capture_t *, + void *ctx) { + unsigned int *count = (unsigned int *)ctx; + ++(*count); + return CH_CALLBACK_TERMINATE; +} + +static +void makeDatabase(ch_database_t **db, const char * const expr[], size_t num) { + *db = nullptr; + ch_compile_error_t *compile_err = nullptr; + ch_error_t err = ch_compile_ext_multi(expr, nullptr, nullptr, num, 0, + 10000000, 8000, nullptr, db, + &compile_err); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(*db != nullptr); +} + +struct RescanContext { + RescanContext(const ch_database_t *db_in, ch_scratch_t *scratch_in) + : db(db_in), scratch(scratch_in) {} + const ch_database_t *db; + ch_scratch_t *scratch; + size_t matches = 0; +}; + +static +int rescan_block_cb(unsigned, unsigned long long, unsigned long long, unsigned, + unsigned, const ch_capture_t *, void *ctx) { + RescanContext *rctx = (RescanContext *)ctx; + rctx->matches++; + + const string data = "___foo___bar_"; + + hs_error_t err = ch_scan(rctx->db, data.c_str(), data.length(), 0, + rctx->scratch, nullptr, nullptr, nullptr); + EXPECT_EQ(CH_SCRATCH_IN_USE, err); + return 0; +} + +TEST(Scan, ScratchInUse) { + static const char * const expr[] = { "foo.*bar" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 1); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + RescanContext rc(db, scratch); + + const string data("___foo___bar_"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, rescan_block_cb, 0, &rc); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(1U, rc.matches); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackSkip1) { + static const char * const expr[] = { "." 
}; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 1); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("qwertyuiop"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, skipHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackSkip2) { + static const char * const expr[] = { "[a-z]+", "[0-9]" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foo 0123 0 bar 39483 n34jfhlqekrcoi3q4"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, skipHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(2U, count); // both patterns should match once + + ch_free_scratch(scratch); + ch_free_database(db); +} + +// This case includes a pattern that we use libpcre for. 
+TEST(Scan, CallbackSkip3) { + static const char * const expr[] = { "[a-z]+", "foo(?!bar)" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foobaz foobing foobar"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, skipHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(2U, count); // both patterns should match once + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackNoSkip1) { + static const char * const expr[] = { "foo|bar", "[0-9]{3}" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foo 012 bar 345 foobar 678"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, countHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(7U, count); // seven matches in total + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackNoSkip2) { + static const char * const expr[] = { "foo(?!bar)", "[0-9]{3}" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foo 012 bar 345 foobar 678"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, countHandler, 0, &count); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_EQ(4U, count); // four matches in total + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackTerm1) { + static const char * const expr[] = { "." 
}; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 1); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("qwertyuiop"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, terminateHandler, 0, &count); + ASSERT_EQ(CH_SCAN_TERMINATED, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +TEST(Scan, CallbackTerm2) { + static const char * const expr[] = { "[a-z]+", "[0-9]" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != 0); + + unsigned int count = 0; + const string data("foo 0123 0 bar 39483 n34jfhlqekrcoi3q4"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, terminateHandler, 0, &count); + ASSERT_EQ(CH_SCAN_TERMINATED, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +// This case includes a pattern that we use libpcre for. +TEST(Scan, CallbackTerm3) { + static const char * const expr[] = { "[a-z]+", "foo(?!bar)" }; + ch_database_t *db = nullptr; + makeDatabase(&db, expr, 2); + + ch_scratch_t *scratch = nullptr; + ch_error_t err = ch_alloc_scratch(db, &scratch); + ASSERT_EQ(CH_SUCCESS, err); + ASSERT_TRUE(scratch != nullptr); + + unsigned int count = 0; + const string data("foobaz foobing foobar"); + err = ch_scan(db, data.c_str(), data.length(), 0, + scratch, terminateHandler, 0, &count); + ASSERT_EQ(CH_SCAN_TERMINATED, err); + ASSERT_EQ(1U, count); + + ch_free_scratch(scratch); + ch_free_database(db); +} + +} // namespace