chimera: hybrid of Hyperscan and PCRE

This commit is contained in:
Wang, Xiang W 2018-03-09 03:52:12 -05:00
parent 8a1c497f44
commit bf87f8c003
47 changed files with 6985 additions and 202 deletions

View File

@ -70,6 +70,16 @@ include_directories(SYSTEM include)
include (${CMAKE_MODULE_PATH}/boost.cmake) include (${CMAKE_MODULE_PATH}/boost.cmake)
# PCRE check, we have a fixed requirement for PCRE to use Chimera
# and hscollider
# The required version is assembled from its major/minor parts so that
# pcre.cmake (included below) can read the individual components.
set(PCRE_REQUIRED_MAJOR_VERSION 8)
set(PCRE_REQUIRED_MINOR_VERSION 41)
set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})
# NOTE(review): pcre.cmake is presumably what sets CORRECT_PCRE_VERSION - confirm.
include (${CMAKE_MODULE_PATH}/pcre.cmake)
# A missing/mismatched PCRE is reported but is not a configure error here.
if (NOT CORRECT_PCRE_VERSION)
message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found")
endif()
# -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6) # -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6)
find_package(PythonInterp) find_package(PythonInterp)
find_program(RAGEL ragel) find_program(RAGEL ragel)
@ -154,7 +164,7 @@ if(MSVC OR MSVC_IDE)
# todo: change these as required # todo: change these as required
set(ARCH_C_FLAGS "/arch:AVX2") set(ARCH_C_FLAGS "/arch:AVX2")
set(ARCH_CXX_FLAGS "/arch:AVX2") set(ARCH_CXX_FLAGS "/arch:AVX2")
set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 -D_CRT_SECURE_NO_WARNINGS") set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 /wd4996 -D_CRT_SECURE_NO_WARNINGS")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD")
endif() endif()
@ -445,12 +455,20 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif() endif()
# we need static libs for Chimera - too much deep magic for shared libs
# BUILD_CHIMERA is only turned on when the PCRE version check succeeded and
# static libraries are being built; otherwise it is left unset.
if (CORRECT_PCRE_VERSION AND BUILD_STATIC_LIBS)
set(BUILD_CHIMERA TRUE)
endif()
add_subdirectory(util) add_subdirectory(util)
add_subdirectory(unit) add_subdirectory(unit)
add_subdirectory(doc/dev-reference) add_subdirectory(doc/dev-reference)
if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt) if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt)
add_subdirectory(tools) add_subdirectory(tools)
endif() endif()
# Build Chimera only if its sources are present in the tree and
# BUILD_CHIMERA is set.
if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA)
add_subdirectory(chimera)
endif()
# do substitutions # do substitutions
configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h) configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)

32
chimera/CMakeLists.txt Normal file
View File

@ -0,0 +1,32 @@
# Build script for the Chimera static library (Hyperscan + PCRE hybrid).

# PCRE headers are needed by the Chimera sources.
include_directories(${PCRE_INCLUDE_DIRS})

# Append the extra compiler flags here, once all configure-time tests are done.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")

# Public headers, installed alongside the Hyperscan headers.
set(chimera_HEADERS
    ch.h
    ch_common.h
    ch_compile.h
    ch_runtime.h
)
install(FILES ${chimera_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs")

# Public headers plus the implementation files and private headers.
set(chimera_SRCS
    ${chimera_HEADERS}
    ch_alloc.c
    ch_alloc.h
    ch_compile.cpp
    ch_database.c
    ch_database.h
    ch_internal.h
    ch_runtime.c
    ch_scratch.h
    ch_scratch.c
)

# Chimera is always a static library; it is ordered after, and linked
# against, both hs and pcre.
add_library(chimera STATIC ${chimera_SRCS})
add_dependencies(chimera hs pcre)
target_link_libraries(chimera hs pcre)

45
chimera/ch.h Normal file
View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_H_
#define CH_H_
/**
* @file
* @brief The complete Chimera API definition.
*
* Chimera is a hybrid solution of Hyperscan and PCRE.
*
* This header includes both the Chimera compiler and runtime components. See
* the individual component headers for documentation.
*/
#include "ch_compile.h"
#include "ch_runtime.h"
#endif /* CH_H_ */

109
chimera/ch_alloc.c Normal file
View File

@ -0,0 +1,109 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Runtime functions for setting custom allocators.
*/
#include "ch.h"
#include "ch_common.h"
#include "ch_internal.h"
#include "hs.h"
#include "ue2common.h"
/* Allocation routines used whenever no custom callback has been installed. */
#define default_malloc malloc
#define default_free free
/* Currently installed allocation callbacks, one alloc/free pair per category
 * (database, misc, scratch). They start out as the defaults and are replaced
 * by the ch_set_*_allocator() functions in this file. */
ch_alloc_t ch_database_alloc = default_malloc;
ch_alloc_t ch_misc_alloc = default_malloc;
ch_alloc_t ch_scratch_alloc = default_malloc;
ch_free_t ch_database_free = default_free;
ch_free_t ch_misc_free = default_free;
ch_free_t ch_scratch_free = default_free;
/** \brief Substitute the default allocator for a NULL allocation callback.
 *
 * \param a  Caller-supplied allocation callback, may be NULL.
 * \return \p a if it is non-NULL, otherwise default_malloc.
 */
static
ch_alloc_t normalise_alloc(ch_alloc_t a) {
    return a ? a : default_malloc;
}
/** \brief Substitute the default free routine for a NULL free callback.
 *
 * \param f  Caller-supplied free callback, may be NULL.
 * \return \p f if it is non-NULL, otherwise default_free.
 */
static
ch_free_t normalise_free(ch_free_t f) {
    return f ? f : default_free;
}
/** \brief Install one alloc/free pair for every Chimera allocation category.
 *
 * Applies the pair to the database, misc and scratch allocators (whose
 * results are not inspected), then hands the same pair to
 * hs_set_allocator() and returns that call's result.
 *
 * \param allocfunc  Allocation callback; NULL selects the default.
 * \param freefunc   Free callback; NULL selects the default.
 * \return The value returned by hs_set_allocator().
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t allocfunc,
                                     ch_free_t freefunc) {
    (void)ch_set_database_allocator(allocfunc, freefunc);
    (void)ch_set_misc_allocator(allocfunc, freefunc);
    (void)ch_set_scratch_allocator(allocfunc, freefunc);

    /* Finally set the core Hyperscan alloc/free pair. */
    return hs_set_allocator(allocfunc, freefunc);
}
/** \brief Install the alloc/free pair used for database allocations.
 *
 * NULL callbacks are replaced by the defaults for Chimera's own globals;
 * Hyperscan receives the callbacks exactly as passed in.
 *
 * \return The value returned by hs_set_database_allocator().
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t allocfunc,
                                              ch_free_t freefunc) {
    ch_database_free = normalise_free(freefunc);
    ch_database_alloc = normalise_alloc(allocfunc);

    /* Keep Hyperscan's database allocator in step with Chimera's. */
    const hs_error_t hs_ret = hs_set_database_allocator(allocfunc, freefunc);
    return hs_ret;
}
/** \brief Install the alloc/free pair used for miscellaneous allocations.
 *
 * NULL callbacks are replaced by the defaults for Chimera's own globals;
 * Hyperscan receives the callbacks exactly as passed in.
 *
 * \return The value returned by hs_set_misc_allocator().
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t allocfunc,
                                          ch_free_t freefunc) {
    ch_misc_free = normalise_free(freefunc);
    ch_misc_alloc = normalise_alloc(allocfunc);

    /* Keep Hyperscan's misc allocator in step with Chimera's. */
    const hs_error_t hs_ret = hs_set_misc_allocator(allocfunc, freefunc);
    return hs_ret;
}
/** \brief Install the alloc/free pair used for scratch allocations.
 *
 * NULL callbacks are replaced by the defaults for Chimera's own globals;
 * Hyperscan receives the callbacks exactly as passed in.
 *
 * \return The value returned by hs_set_scratch_allocator().
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t allocfunc,
                                             ch_free_t freefunc) {
    ch_scratch_free = normalise_free(freefunc);
    ch_scratch_alloc = normalise_alloc(allocfunc);

    /* Keep Hyperscan's scratch allocator in step with Chimera's. */
    const hs_error_t hs_ret = hs_set_scratch_allocator(allocfunc, freefunc);
    return hs_ret;
}

65
chimera/ch_alloc.h Normal file
View File

@ -0,0 +1,65 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_ALLOC_H
#define CH_ALLOC_H
#include "hs_common.h"
#include "ue2common.h"
#include "ch_common.h"
#ifdef __cplusplus
extern "C"
{
#endif
extern hs_alloc_t ch_database_alloc;
extern hs_alloc_t ch_misc_alloc;
extern hs_alloc_t ch_scratch_alloc;
extern hs_free_t ch_database_free;
extern hs_free_t ch_misc_free;
extern hs_free_t ch_scratch_free;
#ifdef __cplusplus
} /* extern C */
#endif
/** \brief Check the result of an allocation for failure and misalignment.
 *
 * The block is never freed here; on error the caller is expected to release
 * it.
 *
 * \param mem  Pointer returned by an allocator, may be NULL.
 * \return CH_NOMEM if \p mem is NULL, CH_BAD_ALLOC if it is not aligned to
 *         alignof(unsigned long long), CH_SUCCESS otherwise.
 */
static really_inline
ch_error_t ch_check_alloc(const void *mem) {
    if (!mem) {
        return CH_NOMEM;
    }
    if (!ISALIGNED_N(mem, alignof(unsigned long long))) {
        return CH_BAD_ALLOC;
    }
    return CH_SUCCESS;
}
#endif

360
chimera/ch_common.h Normal file
View File

@ -0,0 +1,360 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_COMMON_H_
#define CH_COMMON_H_
#include "hs_common.h"
#include <stdlib.h>
/**
* @file
* @brief The Chimera common API definition.
*
* Chimera is a hybrid of Hyperscan and PCRE.
*
* This header contains functions available to both the Chimera compiler and
* runtime.
*/
#ifdef __cplusplus
extern "C"
{
#endif
struct ch_database;
/**
* A Chimera pattern database.
*
* Generated by one of the Chimera compiler functions:
* - @ref ch_compile()
* - @ref ch_compile_multi()
* - @ref ch_compile_ext_multi()
*/
typedef struct ch_database ch_database_t;
/**
* A type for errors returned by Chimera functions.
*/
typedef int ch_error_t;
/**
* Free a compiled pattern database.
*
* The free callback set by @ref ch_set_allocator() will be used by this
* function.
*
* @param db
* A compiled pattern database. NULL may also be safely provided, in which
* case the function does nothing.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_free_database(ch_database_t *db);
/**
* Utility function for identifying this release version.
*
* @return
* A string containing the version number of this release build and the
* date of the build. It is allocated statically, so it does not need to
* be freed by the caller.
*/
const char * HS_CDECL ch_version(void);
/**
* Returns the size of the given database.
*
* @param database
* Pointer to compiled expression database.
*
* @param database_size
* On success, the size of the compiled database in bytes is placed in this
* parameter.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_database_size(const ch_database_t *database,
size_t *database_size);
/**
* Utility function providing information about a database.
*
* @param database
* Pointer to a compiled database.
*
* @param info
* On success, a string containing the version and platform information for
* the supplied database is placed in the parameter. The string is
* allocated using the allocator supplied in @ref ch_set_allocator()
* (or malloc() if no allocator was set) and should be freed by the caller.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_database_info(const ch_database_t *database,
char **info);
/**
* The type of the callback function that will be used by Chimera to allocate
* more memory at runtime as required.
*
* If Chimera is to be used in a multi-threaded, or similarly concurrent
* environment, the allocation function will need to be re-entrant, or
* similarly safe for concurrent use.
*
* @param size
* The number of bytes to allocate.
* @return
* A pointer to the region of memory allocated, or NULL on error.
*/
typedef void *(HS_CDECL *ch_alloc_t)(size_t size);
/**
* The type of the callback function that will be used by Chimera to free
* memory regions previously allocated using the @ref ch_alloc_t function.
*
* @param ptr
* The region of memory to be freed.
*/
typedef void (HS_CDECL *ch_free_t)(void *ptr);
/**
* Set the allocate and free functions used by Chimera for allocating
* memory at runtime for stream state, scratch space, database bytecode,
* and various other data structures returned by the Chimera API.
*
* The function is equivalent to calling @ref ch_set_scratch_allocator(),
* @ref ch_set_database_allocator() and
* @ref ch_set_misc_allocator() with the provided parameters.
*
* This call will override any previous allocators that have been set.
*
* Note: there is no way to change the allocator used for temporary objects
* created during the various compile calls (@ref ch_compile() and @ref
* ch_compile_multi()).
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* Set the allocate and free functions used by Chimera for allocating memory
* for database bytecode produced by the compile calls (@ref ch_compile() and @ref
* ch_compile_multi()).
*
* If no database allocation functions are set, or if NULL is used in place of
* both parameters, then memory allocation will default to standard methods
* (such as the system malloc() and free() calls).
*
* This call will override any previous database allocators that have been set.
*
* Note: the database allocator may also be set by calling @ref
* ch_set_allocator().
*
* Note: there is no way to change how temporary objects created during the
* various compile calls (@ref ch_compile() and @ref ch_compile_multi()) are
* allocated.
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* Set the allocate and free functions used by Chimera for allocating memory
* for items returned by the Chimera API such as @ref ch_compile_error_t.
*
* If no misc allocation functions are set, or if NULL is used in place of both
* parameters, then memory allocation will default to standard methods (such as
* the system malloc() and free() calls).
*
* This call will override any previous misc allocators that have been set.
*
* Note: the misc allocator may also be set by calling @ref ch_set_allocator().
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* Set the allocate and free functions used by Chimera for allocating memory
* for scratch space by @ref ch_alloc_scratch() and @ref ch_clone_scratch().
*
* If no scratch allocation functions are set, or if NULL is used in place of
* both parameters, then memory allocation will default to standard methods
* (such as the system malloc() and free() calls).
*
* This call will override any previous scratch allocators that have been set.
*
* Note: the scratch allocator may also be set by calling @ref
* ch_set_allocator().
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* @defgroup CH_ERROR ch_error_t values
*
* @{
*/
/**
* The engine completed normally.
*/
#define CH_SUCCESS 0
/**
* A parameter passed to this function was invalid.
*/
#define CH_INVALID (-1)
/**
* A memory allocation failed.
*/
#define CH_NOMEM (-2)
/**
* The engine was terminated by callback.
*
* This return value indicates that the target buffer was partially scanned,
* but that the callback function requested that scanning cease after a match
* was located.
*/
#define CH_SCAN_TERMINATED (-3)
/**
* The pattern compiler failed, and the @ref ch_compile_error_t should be
* inspected for more detail.
*/
#define CH_COMPILER_ERROR (-4)
/**
* The given database was built for a different version of the Chimera matcher.
*/
#define CH_DB_VERSION_ERROR (-5)
/**
* The given database was built for a different platform (i.e., CPU type).
*/
#define CH_DB_PLATFORM_ERROR (-6)
/**
* The given database was built for a different mode of operation. This error
* is returned when streaming calls are used with a non-streaming database and
* vice versa.
*/
#define CH_DB_MODE_ERROR (-7)
/**
* A parameter passed to this function was not correctly aligned.
*/
#define CH_BAD_ALIGN (-8)
/**
* The memory allocator did not correctly return memory suitably aligned for
* the largest representable data type on this platform.
*/
#define CH_BAD_ALLOC (-9)
/**
* The scratch region was already in use.
*
* This error is returned when Chimera is able to detect that the scratch
* region given is already in use by another Chimera API call.
*
* A separate scratch region, allocated with @ref ch_alloc_scratch() or @ref
* ch_clone_scratch(), is required for every concurrent caller of the Chimera
* API.
*
* For example, this error might be returned when @ref ch_scan() has been
* called inside a callback delivered by a currently-executing @ref ch_scan()
* call using the same scratch region.
*
* Note: Not all concurrent uses of scratch regions may be detected. This error
* is intended as a best-effort debugging tool, not a guarantee.
*/
#define CH_SCRATCH_IN_USE (-10)
/**
* Returned when pcre_exec (called for some expressions internally from @ref
* ch_scan) failed due to a fatal error.
*/
#define CH_FAIL_INTERNAL (-32)
/** @} */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_COMMON_H_ */

878
chimera/ch_compile.cpp Normal file
View File

@ -0,0 +1,878 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Compiler front-end, including public API calls for compilation.
*/
#include "ch_compile.h"
#include "ch_alloc.h"
#include "ch_internal.h"
#include "ch_database.h"
#include "grey.h"
#include "hs_common.h"
#include "hs_internal.h"
#include "ue2common.h"
#include "util/compile_error.h"
#include "util/make_unique.h"
#include "util/multibit_build.h"
#include "util/target_info.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <memory>
#include <ostream>
#include <sstream>
#include <limits.h>
#include <string>
#include <vector>
#include <boost/core/noncopyable.hpp>
#define PCRE_ERROR_MSG "Internal error building PCRE pattern."
using namespace std;
using namespace ue2;
// Message text for the statically allocated fallback errors below.
static const char failureNoMemory[] = "Unable to allocate memory.";
static const char failureInternal[] = "Internal error.";
static const char failureBadAlloc[] = "Allocator returned misaligned memory.";
// Statically allocated error objects (expression index 0). ch_enomem and
// ch_badalloc are handed out by generateChimeraCompileError() when a real
// error object cannot be allocated; freeChimeraCompileError() recognises all
// three by address and never frees them.
static const ch_compile_error_t ch_enomem
= { const_cast<char *>(failureNoMemory), 0 };
static const ch_compile_error_t ch_einternal
= { const_cast<char *>(failureInternal), 0 };
static const ch_compile_error_t ch_badalloc
= { const_cast<char *>(failureBadAlloc), 0 };
/** \brief Build a compile error object carrying a copy of \p err.
 *
 * Both the error structure and its message are obtained from the misc
 * allocator. If either allocation fails or comes back misaligned, everything
 * allocated so far is released and a pointer to one of the static fallback
 * errors (ch_enomem / ch_badalloc) is returned instead.
 *
 * \param err         Error text to copy into the new object.
 * \param expression  Index of the offending expression (negative if none).
 * \return A newly allocated error, or a static fallback error.
 */
static
ch_compile_error_t *generateChimeraCompileError(const string &err,
                                                int expression) {
    ch_compile_error_t *ret =
        (ch_compile_error_t *)ch_misc_alloc(sizeof(ch_compile_error_t));
    if (!ret) {
        return const_cast<ch_compile_error_t *>(&ch_enomem);
    }
    if (ch_check_alloc(ret) != CH_SUCCESS) {
        ch_misc_free(ret);
        return const_cast<ch_compile_error_t *>(&ch_badalloc);
    }

    char *msg = (char *)ch_misc_alloc(err.size() + 1);
    if (!msg) {
        ch_misc_free(ret);
        return const_cast<ch_compile_error_t *>(&ch_enomem);
    }
    if (ch_check_alloc(msg) != CH_SUCCESS) {
        ch_misc_free(msg);
        // The error structure must be released as well, otherwise it leaks
        // when we fall back to the static error.
        ch_misc_free(ret);
        return const_cast<ch_compile_error_t *>(&ch_badalloc);
    }

    // Copy the text including its terminating NUL.
    memcpy(msg, err.c_str(), err.size() + 1);
    ret->message = msg;
    ret->expression = expression;
    return ret;
}
/** \brief Release an error produced by generateChimeraCompileError().
 *
 * NULL and the static fallback errors (ch_enomem, ch_einternal, ch_badalloc)
 * are left untouched; anything else has its message and then the object
 * itself returned to the misc allocator.
 */
static
void freeChimeraCompileError(ch_compile_error_t *error) {
    const bool is_static = error == &ch_enomem || error == &ch_einternal ||
                           error == &ch_badalloc;
    if (error && !is_static) {
        ch_misc_free(error->message);
        ch_misc_free(error);
    }
}
/** \brief Verify that \p mode contains only supported mode bits.
 *
 * \param mode        Mode flags supplied by the caller.
 * \param comp_error  Receives a newly generated compile error (expression
 *                    index -1) when an unsupported bit is present.
 * \return true if the mode is acceptable, false otherwise.
 */
static
bool checkMode(unsigned int mode, ch_compile_error_t **comp_error) {
    static const unsigned int supported = CH_MODE_GROUPS;

    const unsigned int unknown = mode & ~supported;
    if (!unknown) {
        return true;
    }

    *comp_error =
        generateChimeraCompileError("Invalid mode flag supplied.", -1);
    return false;
}
/** \brief Reject pattern flags that Chimera does not support.
 *
 * \param flags  Per-pattern flags supplied by the caller.
 * \throws CompileError if any bit outside the supported set is present.
 */
static
void checkFlags(const unsigned int flags) {
    static const unsigned int supported = HS_FLAG_DOTALL
                                        | HS_FLAG_MULTILINE
                                        | HS_FLAG_CASELESS
                                        | HS_FLAG_SINGLEMATCH
                                        | HS_FLAG_UCP
                                        | HS_FLAG_UTF8;

    const unsigned int unknown = flags & ~supported;
    if (unknown != 0) {
        throw CompileError("Unrecognized flag used.");
    }
}
static
bool isHyperscanSupported(const char *expression, unsigned int flags,
const hs_platform_info *platform) {
hs_database_t *db = nullptr;
hs_compile_error *comp_error = nullptr;
unsigned int id = 0;
hs_error_t err = hs_compile_multi(&expression, &flags, &id,
1, HS_MODE_BLOCK, platform, &db,
&comp_error);
if (err != HS_SUCCESS) {
assert(!db);
assert(comp_error);
DEBUG_PRINTF("unsupported: %s\n", comp_error->message);
hs_free_compile_error(comp_error);
return false;
}
assert(db);
assert(!comp_error);
hs_free_database(db);
return true;
}
/** \brief Copy a Hyperscan database to \p ptr via serialize/deserialize.
 *
 * The database is serialized into a temporary buffer and deserialized in
 * place at \p ptr; going through the serialization calls is what re-homes
 * the database at its new address. The temporary buffer is released with
 * ch_misc_free().
 *
 * \return true on success, false if either Hyperscan call failed.
 */
static
bool writeHyperscanDatabase(char *ptr, hs_database_t *db) {
    char *serialized = nullptr;
    size_t slen = 0;

    hs_error_t err = hs_serialize_database(db, &serialized, &slen);
    if (err != HS_SUCCESS) {
        DEBUG_PRINTF("hs_serialize_database returned %d\n", err);
        assert(0);
        return false;
    }

    DEBUG_PRINTF("writing database to ptr %p\n", ptr);

    err = hs_deserialize_database_at(serialized, slen, (hs_database_t *)ptr);
    const bool ok = (err == HS_SUCCESS);
    if (!ok) {
        DEBUG_PRINTF("hs_deserialize_database_at returned %d\n", err);
        assert(0);
    }

    // The serialized copy is no longer needed whether or not we succeeded.
    ch_misc_free(serialized);
    return ok;
}
/** \brief Write a Hyperscan database into a Chimera bytecode block.
 *
 * Records in \p db the cacheline-rounded offset just past the ch_bytecode
 * header and writes \p hs_db at that offset.
 *
 * \return true on success, false otherwise.
 */
static
bool writeHyperscanDatabase(ch_bytecode *db, hs_database_t *hs_db) {
    db->databaseOffset = ROUNDUP_CL(sizeof(*db));
    return writeHyperscanDatabase((char *)db + db->databaseOffset, hs_db);
}
/** \brief Translate Hyperscan pattern flags into pcre_compile() options.
 *
 * Only CASELESS, DOTALL, MULTILINE, UTF8 and UCP are mapped to a PCRE
 * option; every other flag is ignored.
 *
 * \param flags  Hyperscan HS_FLAG_* bits.
 * \return The corresponding PCRE_* option bits.
 */
static
int convertFlagsToPcreOptions(unsigned int flags) {
    static const struct {
        unsigned int hs_flag;
        int pcre_option;
    } mapping[] = {
        { HS_FLAG_CASELESS, PCRE_CASELESS },
        { HS_FLAG_DOTALL, PCRE_DOTALL },
        { HS_FLAG_MULTILINE, PCRE_MULTILINE },
        { HS_FLAG_UTF8, PCRE_UTF8 },
        { HS_FLAG_UCP, PCRE_UCP },
    };

    int options = 0;
    for (const auto &m : mapping) {
        if (flags & m.hs_flag) {
            options |= m.pcre_option;
        }
    }
    return options;
}
namespace {
/** \brief Data about a single pattern.
 *
 * Owns the compiled PCRE pattern and its study data; both are released by the
 * destructor. Non-copyable because of that ownership.
 */
struct PatternData : boost::noncopyable {
    PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
                unsigned mode, unsigned long int match_limit,
                unsigned long int match_limit_recursion,
                const hs_platform_info *platform);

    ~PatternData() {
        pcre_free(compiled);
        // The study data is created with PCRE_STUDY_JIT_COMPILE, so it must
        // be released with pcre_free_study(): plain pcre_free() would leak
        // any JIT-compiled code attached to it. NULL is accepted.
        pcre_free_study(extra);
    }

    /** \brief Compile and study \p pattern with PCRE, filling in the
     * PCRE-derived members. */
    void buildPcre(const char *pattern, u32 flags);

    /** \brief Number of bytes needed to write this pattern out. */
    size_t patternSize() const;

    /** \brief Write this pattern into an already-sized ch_pattern. */
    void writePattern(ch_pattern *pattern) const;

    pcre *compiled; //!< pcre_compile output
    pcre_extra *extra; //!< pcre_study output
    size_t compiled_size; //!< PCRE_INFO_SIZE of the compiled pattern
    int study_size; //!< PCRE_INFO_STUDYSIZE, zero if there is no study data
    int capture_cnt; //!< PCRE_INFO_CAPTURECOUNT of the compiled pattern
    bool utf8; //!< PCRE_UTF8 is set in the compiled pattern's options
    u32 id; //!< ID from the user
    u32 expr_index; //!< index in the expression array
    bool singlematch; //!< pattern is in highlander mode
    bool guard; //!< this pattern should be guarded by the multimatcher
    u32 minWidth; //!< min match width
    u32 maxWidth; //!< max match width
    u32 fixedWidth; //!< fixed pattern width
    unsigned long int matchLimit; //!< pcre match limit
    unsigned long int matchLimitRecursion; //!< pcre match_limit_recursion
};
/** \brief Compile one pattern with PCRE and gather Hyperscan information.
 *
 * After building the PCRE pattern, Hyperscan is asked for expression info on
 * a prefiltering, non-singlematch variant of the pattern. If that succeeds,
 * the width bounds are recorded, fixedWidth is set when PCRE confirmation can
 * be skipped, and guard records whether Hyperscan can compile the
 * prefiltering variant. If it fails, the pattern is left unguarded with its
 * default widths.
 */
PatternData::PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
                         unsigned mode, unsigned long int match_limit,
                         unsigned long int match_limit_recursion,
                         const hs_platform_info *platform)
    : compiled(nullptr), extra(nullptr), id(id_in), expr_index(idx),
      singlematch(flags & HS_FLAG_SINGLEMATCH),
      guard(false), minWidth(0), maxWidth(UINT_MAX),
      fixedWidth(UINT_MAX), matchLimit(match_limit),
      matchLimitRecursion(match_limit_recursion) {
    assert(pattern);

    // Allow empty matches so that patterns are not handed off to pcre for no
    // reason.
    flags |= HS_FLAG_ALLOWEMPTY;

    buildPcre(pattern, flags);

    // Flag sets for the two Hyperscan probes below: the prefiltering variant
    // and the exact variant with leftmost start-of-match. Neither is
    // singlematch.
    const u32 prefilter_flags =
        (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH;
    const u32 som_flags =
        (flags | HS_FLAG_SOM_LEFTMOST) & ~HS_FLAG_SINGLEMATCH;

    hs_expr_info *info = nullptr;
    hs_compile_error_t *error = nullptr;
    if (hs_expression_info(pattern, prefilter_flags, &info, &error)
        != HS_SUCCESS) {
        // Not even prefiltering is possible: this pattern relies on plain
        // PCRE scans.
        DEBUG_PRINTF("hs_expression_info failed, falling back to pcre\n");
        hs_free_compile_error(error);
        return;
    }

    assert(info);
    minWidth = info->min_width;
    maxWidth = info->max_width;
    const bool ordered = !info->unordered_matches;

    // Captures only matter when group mode was requested.
    const u32 captureCnt = (mode & CH_MODE_GROUPS) ? capture_cnt : 0;

    // PCRE confirmation is unnecessary when all of the following hold:
    //  1) the pattern has a fixed width,
    //  2) the pattern is not vacuous (it cannot combine with start of match),
    //  3) nothing is captured,
    //  4) matches need no offset adjustment, i.e. they arrive in order
    //     (e.g. [^a]\z does need adjusting),
    //  5) Hyperscan can compile the pattern without prefiltering.
    const bool confirm_free =
        minWidth == maxWidth && minWidth && maxWidth != UINT_MAX &&
        !captureCnt && ordered &&
        isHyperscanSupported(pattern, som_flags, platform);
    if (confirm_free) {
        fixedWidth = maxWidth;
    }

    DEBUG_PRINTF("gathered info: widths=[%u,%u]\n", minWidth, maxWidth);

    ch_misc_free(info);

    // Guard the pattern if Hyperscan accepts its prefiltering variant.
    guard = isHyperscanSupported(pattern, prefilter_flags, platform);
}
/** \brief Compile and study \p pattern with PCRE.
 *
 * Fills in compiled, extra, compiled_size, study_size, capture_cnt and utf8.
 *
 * \param pattern  Expression text to compile.
 * \param flags    Hyperscan flags, translated to PCRE options.
 * \throws CompileError if PCRE cannot compile or study the pattern, or if
 *         any pcre_fullinfo() query fails. On a throw, the PCRE objects
 *         created here are released and the members reset to null, because
 *         this is called from the constructor, where the destructor would
 *         not run.
 */
void PatternData::buildPcre(const char *pattern, u32 flags) {
    int options = convertFlagsToPcreOptions(flags);
    const char *errptr = nullptr;
    int erroffset = 0;

    try {
        compiled = pcre_compile(pattern, options, &errptr, &erroffset,
                                nullptr);
        if (!compiled) {
            DEBUG_PRINTF("PCRE failed to compile: %s\n", pattern);
            string err("PCRE compilation failed: ");
            err += string(errptr);
            err += ".";
            throw CompileError(expr_index, err);
        }

        extra = pcre_study(compiled, PCRE_STUDY_JIT_COMPILE, &errptr);
        // Note that it's OK for pcre_study to return NULL if there's nothing
        // to be found, but a non-NULL error is always bad.
        if (errptr) {
            DEBUG_PRINTF("PCRE could not be studied: %s\n", errptr);
            string err("PCRE compilation failed: ");
            err += string(errptr);
            err += ".";
            throw CompileError(expr_index, err);
        }

        if (pcre_fullinfo(compiled, extra, PCRE_INFO_SIZE, &compiled_size)) {
            throw CompileError(PCRE_ERROR_MSG);
        }

        study_size = 0;
        if (extra) {
            // PCRE_INFO_STUDYSIZE stores a size_t. Query into a local of the
            // right type rather than the int member, which would be
            // overrun where size_t is wider than int.
            size_t study_bytes = 0;
            if (pcre_fullinfo(compiled, extra, PCRE_INFO_STUDYSIZE,
                              &study_bytes)) {
                throw CompileError(PCRE_ERROR_MSG);
            }
            study_size = static_cast<int>(study_bytes);
        }

        if (pcre_fullinfo(compiled, extra, PCRE_INFO_CAPTURECOUNT,
                          &capture_cnt)) {
            throw CompileError(PCRE_ERROR_MSG);
        }

        /* We use the pcre rather than hs to get this information as we may
         * need it even in the pure unguarded pcre mode where there is no hs
         * available. We can not use the compile flags due to (*UTF8) verb */
        unsigned long int opts = 0; // PCRE_INFO_OPTIONS demands an unsigned long
        if (pcre_fullinfo(compiled, extra, PCRE_INFO_OPTIONS, &opts)) {
            throw CompileError(PCRE_ERROR_MSG);
        }
        utf8 = (opts & PCRE_UTF8) != 0;
    } catch (...) {
        // Release whatever PCRE handed us and null the members so that a
        // later destructor call cannot free them a second time.
        if (extra) {
            pcre_free_study(extra);
            extra = nullptr;
        }
        if (compiled) {
            pcre_free(compiled);
            compiled = nullptr;
        }
        throw;
    }
}
/** \brief Number of bytes needed to serialise this pattern: the ch_pattern
 * header, the compiled PCRE and (when present) the PCRE study data, with the
 * PCRE and study regions each starting on an 8-byte boundary. */
size_t PatternData::patternSize() const {
    // The compiled PCRE follows the header, rounded up to 8 bytes.
    const size_t pcreStart = ROUNDUP_N(sizeof(ch_pattern), 8);
    DEBUG_PRINTF("compiled pcre at %zu\n", pcreStart);

    size_t total = pcreStart + compiled_size;

    // Study data is optional; only pad for it when there is some.
    if (study_size) {
        total = ROUNDUP_N(total, 8);
        DEBUG_PRINTF("study at %zu\n", total);
        total += (size_t)study_size;
    }

    DEBUG_PRINTF("pattern size %zu\n", total);
    return total;
}
/** \brief Serialise this pattern into \a pattern.
 *
 * The destination must be cache-line aligned and at least
 * PatternData::patternSize() bytes long. The layout written is: ch_pattern
 * header, compiled PCRE (8-byte aligned), then the PCRE study data (8-byte
 * aligned) if pcre_study produced any. */
void PatternData::writePattern(ch_pattern *pattern) const {
    assert(pattern);
    assert(ISALIGNED_CL(pattern));

    char *const base = (char *)pattern;

    // Header fields.
    u32 patFlags = 0;
    if (singlematch) {
        patFlags |= CHIMERA_PATTERN_FLAG_SINGLEMATCH;
    }
    if (utf8) {
        patFlags |= CHIMERA_PATTERN_FLAG_UTF8;
    }
    pattern->id = id;
    pattern->flags = patFlags;
    pattern->maxWidth = maxWidth;
    if (minWidth == UINT_MAX) {
        pattern->minWidth = 0;
    } else {
        pattern->minWidth = minWidth;
    }
    pattern->fixedWidth = fixedWidth;

    // Compiled PCRE pattern.
    char *cursor = base + ROUNDUP_N(sizeof(*pattern), 8);
    DEBUG_PRINTF("compiled pcre at %zu\n", (size_t)(cursor - base));
    memcpy(cursor, compiled, compiled_size);
    cursor += compiled_size;

    // PCRE match limits; zero means "use the built-in default". The
    // recursion limit is always set to avoid a segmentation fault.
    pattern->extra.flags = PCRE_EXTRA_MATCH_LIMIT |
                           PCRE_EXTRA_MATCH_LIMIT_RECURSION;
    pattern->extra.match_limit = matchLimit ? matchLimit : 10000000;
    pattern->extra.match_limit_recursion =
        matchLimitRecursion ? matchLimitRecursion : 1500;

    // PCRE study data, when present.
    u32 studyAt = 0;
    if (!extra) {
        pattern->extra.flags &= ~PCRE_EXTRA_STUDY_DATA;
    } else {
        assert(extra->study_data);
        cursor = ROUNDUP_PTR(cursor, 8);
        DEBUG_PRINTF("study at %zu\n", (size_t)(cursor - base));
        memcpy(cursor, extra->study_data, study_size);
        studyAt = (size_t)(cursor - base);
        pattern->extra.flags |= PCRE_EXTRA_STUDY_DATA;
        pattern->extra.study_data = cursor;
        cursor += study_size;
    }
    pattern->studyOffset = studyAt;

    const size_t written = (cursor - base);
    assert(written <= patternSize());
    pattern->length = (u32)written;

    // We shouldn't overrun the space we've allocated for this pattern.
    assert(patternSize() >= (size_t)(cursor - base));
}
} // namespace
namespace ch {
static
void ch_compile_multi_int(const char *const *expressions, const unsigned *flags,
const unsigned *ids, unsigned elements,
unsigned mode, unsigned long int match_limit,
unsigned long int match_limit_recursion,
const hs_platform_info_t *platform,
ch_database_t **out) {
vector<unique_ptr<PatternData>> pcres;
pcres.reserve(elements);
vector<u32> unguarded; // indices of unguarded PCREs.
vector<const char *> multiExpr;
vector<unsigned int> multiFlags;
vector<unsigned int> multiIds;
bool allConfirm = true;
bool allSingleMatch = true;
for (unsigned int i = 0; i < elements; i++) {
const char *myExpr = expressions[i];
unsigned int myFlags = flags ? flags[i] : 0;
unsigned int myId = ids ? ids[i] : 0;
checkFlags(myFlags);
// First, build with libpcre. A build failure from libpcre will throw
// an exception up to the caller.
auto patternData =
ue2::make_unique<PatternData>(myExpr, myFlags, i, myId, mode, match_limit,
match_limit_recursion, platform);
pcres.push_back(move(patternData));
PatternData &curr = *pcres.back();
if (!(myFlags & HS_FLAG_SINGLEMATCH)) {
allSingleMatch = false;
}
// in the multimatch, we always run in prefilter mode and accept vacuous
// patterns.
myFlags |=
HS_FLAG_ALLOWEMPTY | HS_FLAG_PREFILTER;
if (curr.fixedWidth != UINT_MAX) {
myFlags |= HS_FLAG_SOM_LEFTMOST;
DEBUG_PRINTF("fixed width, turn off prefiltering\n");
myFlags &= ~HS_FLAG_PREFILTER;
allConfirm = false;
// Single match can't coexist with SOM.
myFlags &= ~HS_FLAG_SINGLEMATCH;
}
if (curr.guard) {
// We use the index into the PCREs array as the Hyperscan idx.
multiExpr.push_back(myExpr);
multiFlags.push_back(myFlags);
multiIds.push_back(i);
} else {
// No Hyperscan support, PCRE is unguarded.
unguarded.push_back(i);
}
}
DEBUG_PRINTF("built %zu PCREs, %zu of which are unguarded\n",
pcres.size(), unguarded.size());
// Work out our sizing for the output database.
size_t patternSize = 0;
for (unsigned int i = 0; i < elements; i++) {
size_t len = pcres[i]->patternSize();
patternSize += ROUNDUP_CL(len);
}
DEBUG_PRINTF("pcre bytecode takes %zu bytes\n", patternSize);
bool noMulti = multiExpr.empty();
size_t multiSize = 0;
hs_database *multidb = nullptr;
if (!noMulti) {
hs_compile_error_t *hs_comp_error = nullptr;
hs_error_t err = hs_compile_multi(&multiExpr[0], &multiFlags[0],
&multiIds[0], multiExpr.size(),
HS_MODE_BLOCK, platform, &multidb,
&hs_comp_error);
if (err != HS_SUCCESS) {
assert(hs_comp_error);
DEBUG_PRINTF("hs_compile_multi returned error: %s\n",
hs_comp_error->message);
assert(0);
hs_free_compile_error(hs_comp_error);
throw CompileError("Internal error.");
}
assert(multidb);
err = hs_database_size(multidb, &multiSize);
if (err != HS_SUCCESS) {
assert(0);
throw CompileError("Internal error.");
}
DEBUG_PRINTF("built hyperscan database with len %zu bytes\n", multiSize);
}
size_t bytecodeLen = sizeof(ch_bytecode) +
multiSize + alignof(u32) +
(sizeof(u32) * unguarded.size()) +
(sizeof(u32) * elements) +
patternSize +
128; // padding for alignment
size_t totalSize = sizeof(ch_database) + bytecodeLen;
DEBUG_PRINTF("allocating %zu bytes for database\n", totalSize);
char *ptr = (char *)ch_database_alloc(totalSize);
if (ch_check_alloc(ptr) != CH_SUCCESS) {
ch_database_free(ptr);
throw std::bad_alloc();
}
memset(ptr, 0, totalSize);
// First, the header.
ch_database *hydb = (ch_database *)ptr;
hydb->magic = CH_DB_MAGIC;
hydb->version = HS_VERSION_32BIT;
hydb->length = bytecodeLen;
// Then, the bytecode.
size_t shift = (size_t)hydb->bytes & 0x3f;
hydb->bytecode = offsetof(struct ch_database, bytes) - shift;
ch_bytecode *db = (ch_bytecode *)((char *)hydb + hydb->bytecode);
db->patternCount = elements;
db->activeSize = mmbit_size(elements);
db->flags = 0;
db->length = bytecodeLen;
if (noMulti) {
db->flags |= CHIMERA_FLAG_NO_MULTIMATCH;
}
if (mode & CH_MODE_GROUPS) {
db->flags |= CHIMERA_FLAG_GROUPS;
}
if (allConfirm) {
db->flags |= CHIMERA_FLAG_ALL_CONFIRM;
}
if (allSingleMatch) {
db->flags |= CHIMERA_FLAG_ALL_SINGLE;
}
// Find and set the max ovector size by looking at the capture count for
// each pcre.
u32 maxCaptureGroups = 0;
for (unsigned int i = 0; i < elements; i++) {
maxCaptureGroups = max(maxCaptureGroups, (u32)pcres[i]->capture_cnt);
}
db->maxCaptureGroups = maxCaptureGroups;
DEBUG_PRINTF("max capture groups is %u\n", maxCaptureGroups);
if (!noMulti) {
DEBUG_PRINTF("write hyperscan database\n");
// Write Hyperscan database directly after the header struct, then free it.
if (!writeHyperscanDatabase(db, multidb)) {
ch_database_free(hydb);
hs_free_database(multidb);
throw CompileError("Internal error.");
}
hs_free_database(multidb);
} else {
db->databaseOffset = ROUNDUP_CL(sizeof(*db));
}
// Then, write our unguarded PCRE list.
db->unguardedCount = unguarded.size();
db->unguardedOffset = ROUNDUP_N(db->databaseOffset + multiSize, 4);
ptr = (char *)db + db->unguardedOffset;
copy(unguarded.begin(), unguarded.end(), (u32 *)ptr);
// Then, write all our compiled PCRE patterns and the lookup table for
// them.
db->patternOffset = db->unguardedOffset + unguarded.size() * sizeof(u32);
u32 *patternOffset = (u32 *)((char *)db + db->patternOffset);
u32 offset = ROUNDUP_CL(db->patternOffset + elements * sizeof(u32));
for (unsigned int i = 0; i < elements; i++) {
*patternOffset = offset;
size_t len = pcres[i]->patternSize();
ptr = (char *)db + offset;
struct ch_pattern *pattern = (struct ch_pattern *)ptr;
pcres[i]->writePattern(pattern);
DEBUG_PRINTF("wrote pcre %u into offset %u, len %zu\n", i, offset, len);
offset += ROUNDUP_CL(len);
patternOffset++;
}
assert(offset <= totalSize);
assert(hydb->magic == CH_DB_MAGIC);
DEBUG_PRINTF("built hybrid database, size %zu bytes\n", totalSize);
DEBUG_PRINTF("offset=%u\n", offset);
*out = hydb;
}
} // namespace ch
/**
 * Public entry point: compile a single expression into a Chimera database.
 *
 * Validates the arguments, then delegates to ch::ch_compile_multi_int() with
 * one element whose id is zero and with both match limits passed as zero.
 *
 * Returns CH_SUCCESS with *db set on success. On failure returns
 * CH_COMPILER_ERROR, sets *db to NULL whenever db itself is non-NULL, and
 * sets *comp_error whenever comp_error is non-NULL.
 */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_compile(const char *expression, unsigned flags,
                               unsigned mode,
                               const hs_platform_info_t *platform,
                               ch_database_t **db,
                               ch_compile_error_t **comp_error) {
    if (!comp_error) {
        if (db) {
            // Clear the caller's handle, not our local copy of the pointer.
            *db = nullptr;
        }
        // nowhere to write the string, but we can still report an error code
        return CH_COMPILER_ERROR;
    }
    if (!db) {
        *comp_error =
            generateChimeraCompileError("Invalid parameter: db is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!expression) {
        *db = nullptr;
        // Adjacent literals are used instead of a backslash-newline splice,
        // which would corrupt the whitespace inside the message.
        *comp_error =
            generateChimeraCompileError("Invalid parameter: expression is "
                                        "NULL", -1);
        return CH_COMPILER_ERROR;
    }

    if (!checkMode(mode, comp_error)) {
        *db = nullptr;
        assert(*comp_error); // set by checkMode
        return CH_COMPILER_ERROR;
    }

    try {
        unsigned id = 0; // single expressions get zero as an ID

        // Internal function to do all the work, now that we've handled all the
        // argument checking.
        ch::ch_compile_multi_int(&expression, &flags, &id, 1, mode, 0, 0,
                                 platform, db);
    }
    catch (const CompileError &e) {
        // Compiler error occurred
        *db = nullptr;
        *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ?
                                                  (int)e.index : -1);
        return CH_COMPILER_ERROR;
    }
    catch (const std::bad_alloc &) {
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
        return CH_COMPILER_ERROR;
    }
    catch (...) {
        assert(!"Internal error, unexpected exception");
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_einternal);
        return CH_COMPILER_ERROR;
    }

    DEBUG_PRINTF("success!\n");
    return CH_SUCCESS;
}
/**
 * Public entry point: compile an array of expressions into a Chimera
 * database.
 *
 * Validates the arguments, then delegates to ch::ch_compile_multi_int() with
 * both match limits passed as zero.
 *
 * Returns CH_SUCCESS with *db set on success. On failure returns
 * CH_COMPILER_ERROR, sets *db to NULL whenever db itself is non-NULL, and
 * sets *comp_error whenever comp_error is non-NULL.
 */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
                                     const unsigned *flags, const unsigned *ids,
                                     unsigned elements, unsigned mode,
                                     const hs_platform_info_t *platform,
                                     ch_database_t **db,
                                     ch_compile_error_t **comp_error) {
    if (!comp_error) {
        if (db) {
            // Clear the caller's handle, not our local copy of the pointer.
            *db = nullptr;
        }
        // nowhere to write the string, but we can still report an error code
        return CH_COMPILER_ERROR;
    }
    if (!db) {
        *comp_error =
            generateChimeraCompileError("Invalid parameter: db is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    // Adjacent literals are used below instead of backslash-newline splices,
    // which would corrupt the whitespace inside the messages.
    if (!expressions) {
        *db = nullptr;
        *comp_error =
            generateChimeraCompileError("Invalid parameter: expressions is "
                                        "NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!elements) {
        *db = nullptr;
        *comp_error = generateChimeraCompileError("Invalid parameter: "
                                                  "elements is zero", -1);
        return CH_COMPILER_ERROR;
    }

    if (!checkMode(mode, comp_error)) {
        *db = nullptr;
        assert(*comp_error); // set by checkMode
        return CH_COMPILER_ERROR;
    }

    try {
        // Internal function to do all the work, now that we've handled all the
        // argument checking.
        ch::ch_compile_multi_int(expressions, flags, ids, elements, mode, 0, 0,
                                 platform, db);
    }
    catch (const CompileError &e) {
        // Compiler error occurred
        *db = nullptr;
        *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ?
                                                  (int)e.index : -1);
        return CH_COMPILER_ERROR;
    }
    catch (const std::bad_alloc &) {
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
        return CH_COMPILER_ERROR;
    }
    catch (...) {
        assert(!"Internal error, unexpected exception");
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_einternal);
        return CH_COMPILER_ERROR;
    }

    DEBUG_PRINTF("success!\n");
    return CH_SUCCESS;
}
/**
 * Public entry point: compile an array of expressions into a Chimera
 * database, forwarding caller-supplied PCRE match limits.
 *
 * Validates the arguments, then delegates to ch::ch_compile_multi_int(),
 * passing \a match_limit and \a match_limit_recursion through unchanged.
 *
 * Returns CH_SUCCESS with *db set on success. On failure returns
 * CH_COMPILER_ERROR, sets *db to NULL whenever db itself is non-NULL, and
 * sets *comp_error whenever comp_error is non-NULL.
 */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_compile_ext_multi(
                                     const char *const *expressions,
                                     const unsigned *flags,
                                     const unsigned *ids,
                                     unsigned elements, unsigned mode,
                                     unsigned long int match_limit,
                                     unsigned long int match_limit_recursion,
                                     const hs_platform_info_t *platform,
                                     ch_database_t **db,
                                     ch_compile_error_t **comp_error) {
    if (!comp_error) {
        if (db) {
            // Clear the caller's handle, not our local copy of the pointer.
            *db = nullptr;
        }
        // nowhere to write the string, but we can still report an error code
        return CH_COMPILER_ERROR;
    }
    if (!db) {
        *comp_error =
            generateChimeraCompileError("Invalid parameter: db is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    // Adjacent literals are used below instead of backslash-newline splices,
    // which would corrupt the whitespace inside the messages.
    if (!expressions) {
        *db = nullptr;
        *comp_error =
            generateChimeraCompileError("Invalid parameter: expressions is "
                                        "NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!elements) {
        *db = nullptr;
        *comp_error = generateChimeraCompileError("Invalid parameter: "
                                                  "elements is zero", -1);
        return CH_COMPILER_ERROR;
    }

    if (!checkMode(mode, comp_error)) {
        *db = nullptr;
        assert(*comp_error); // set by checkMode
        return CH_COMPILER_ERROR;
    }

    try {
        // Internal function to do all the work, now that we've handled all the
        // argument checking.
        ch::ch_compile_multi_int(expressions, flags, ids, elements, mode,
                                 match_limit, match_limit_recursion, platform,
                                 db);
    }
    catch (const CompileError &e) {
        // Compiler error occurred
        *db = nullptr;
        *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ?
                                                  (int)e.index : -1);
        return CH_COMPILER_ERROR;
    }
    catch (const std::bad_alloc &) {
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
        return CH_COMPILER_ERROR;
    }
    catch (...) {
        assert(!"Internal error, unexpected exception");
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_einternal);
        return CH_COMPILER_ERROR;
    }

    DEBUG_PRINTF("success!\n");
    return CH_SUCCESS;
}
/**
 * Public entry point: release a compile error object by handing it to
 * freeChimeraCompileError(). Always reports CH_SUCCESS.
 */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error) {
    const ch_error_t status = CH_SUCCESS;
    freeChimeraCompileError(error);
    return status;
}

394
chimera/ch_compile.h Normal file
View File

@ -0,0 +1,394 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_COMPILE_H_
#define CH_COMPILE_H_
/**
* @file
* @brief The Chimera compiler API definition.
*
* Chimera is a hybrid solution of Hyperscan and PCRE.
*
* This header contains functions for compiling regular expressions into
* Chimera databases that can be used by the Chimera runtime.
*/
#include "ch_common.h"
#include "hs_compile.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
 * A type containing error details that is returned by the compile calls (@ref
 * ch_compile(), @ref ch_compile_multi() and @ref ch_compile_ext_multi()) on
 * failure. The caller may inspect the values returned in this type to
 * determine the cause of failure.
 */
typedef struct ch_compile_error {
    /**
     * A human-readable error message describing the error.
     */
    char *message;
    /**
     * The zero-based number of the expression that caused the error (if this
     * can be determined). If the error is not specific to an expression, then
     * this value will be less than zero.
     */
    int expression;
} ch_compile_error_t;
/**
* The basic regular expression compiler.
*
* This is the function call with which an expression is compiled into a
* Chimera database which can be passed to the runtime function (
* @ref ch_scan())
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @a flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @a expression, and @ref CH_FLAG_CASELESS as the @a
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated for the
* expression per call to @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param mode
* Compiler mode flag that affects the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the
* @a compile_error parameter.
*/
ch_error_t HS_CDECL ch_compile(const char *expression, unsigned int flags,
unsigned int mode,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* The multiple regular expression compiler.
*
* This is the function call with which a set of expressions is compiled into a
* database which can be passed to the runtime function (@ref ch_scan()).
* Each expression can be labelled with a unique integer which is passed into
* the match callback to identify the pattern that has matched.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* ch_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
* first value in the @a flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per call to @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flag that affects the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the
* @a compile_error parameter.
*
*/
ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
unsigned int elements, unsigned int mode,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* The multiple regular expression compiler with extended match limits support.
*
* This is the function call with which a set of expressions is compiled into a
* database in the same way as @ref ch_compile_multi(), but allows additional
* parameters to be specified via match_limit and match_limit_recursion to
* define match limits for PCRE runtime.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* ch_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
* first value in the @a flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per call to @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flag that affects the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param match_limit
* A limit from pcre_extra on the number of times the match function may
* be called in PCRE, which bounds the amount of backtracking that can
* take place.
*
* @param match_limit_recursion
* A limit from pcre_extra on the recursion depth of the match function
* in PCRE.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the
* @a compile_error parameter.
*
*/
ch_error_t HS_CDECL ch_compile_ext_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
unsigned int elements,
unsigned int mode,
unsigned long int match_limit,
unsigned long int match_limit_recursion,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* Free an error structure generated by @ref ch_compile(), @ref
* ch_compile_multi() or @ref ch_compile_ext_multi().
*
* @param error
* The @ref ch_compile_error_t to be freed. NULL may also be safely
* provided.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error);
/**
* @defgroup CH_PATTERN_FLAG Pattern flags
*
* @{
*/
/**
* Compile flag: Set case-insensitive matching.
*
* This flag sets the expression to be matched case-insensitively by default.
* The expression may still use PCRE tokens (notably `(?i)` and
* `(?-i)`) to switch case-insensitive matching on and off.
*/
#define CH_FLAG_CASELESS 1
/**
* Compile flag: Matching a `.` will not exclude newlines.
*
* This flag sets any instances of the `.` token to match newline characters as
* well as all other characters. The PCRE specification states that the `.`
* token does not match newline characters by default, so without this flag the
* `.` token will not cross line boundaries.
*/
#define CH_FLAG_DOTALL 2
/**
* Compile flag: Set multi-line anchoring.
*
* This flag instructs the expression to make the `^` and `$` tokens match
* newline characters as well as the start and end of the stream. If this flag
* is not specified, the `^` token will only ever match at the start of a
* stream, and the `$` token will only ever match at the end of a stream within
* the guidelines of the PCRE specification.
*/
#define CH_FLAG_MULTILINE 4
/**
* Compile flag: Set single-match only mode.
*
* This flag sets the expression's match ID to match at most once: only the
* first match for each invocation of @ref ch_scan() will be returned.
*
*/
#define CH_FLAG_SINGLEMATCH 8
/**
* Compile flag: Enable UTF-8 mode for this expression.
*
* This flag instructs Chimera to treat the pattern as a sequence of UTF-8
* characters. The results of scanning invalid UTF-8 sequences with a Chimera
* library that has been compiled with one or more patterns using this flag are
* undefined.
*/
#define CH_FLAG_UTF8 32
/**
* Compile flag: Enable Unicode property support for this expression.
*
* This flag instructs Chimera to use Unicode properties, rather than the
* default ASCII interpretations, for character mnemonics like `\w` and `\s` as
* well as the POSIX character classes. It is only meaningful in conjunction
* with @ref CH_FLAG_UTF8.
*/
#define CH_FLAG_UCP 64
/** @} */
/**
* @defgroup CH_MODE_FLAG Compile mode flags
*
* The mode flags are used as values for the mode parameter of the various
* compile calls (@ref ch_compile(), @ref ch_compile_multi() and @ref
* ch_compile_ext_multi()).
*
* By default, the matcher will only supply the start and end offsets of the
* match when the match callback is called. Using mode flag @ref CH_MODE_GROUPS
* will also fill the `captured` array with the start and end offsets of all
* the capturing groups specified by the pattern that has matched.
*
* @{
*/
/**
* Compiler mode flag: Disable capturing groups.
*/
#define CH_MODE_NOGROUPS 0
/**
* Compiler mode flag: Enable capturing groups.
*/
#define CH_MODE_GROUPS 1048576
/** @} */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_COMPILE_H_ */

126
chimera/ch_database.c Normal file
View File

@ -0,0 +1,126 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: database construction, etc.
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "allocator.h"
#include "database.h"
#include "hs.h"
#include "ch.h"
#include "hs_internal.h"
#include "ch_common.h"
#include "ch_alloc.h"
#include "ch_database.h"
#include "ch_internal.h"
/** \brief Returns non-zero when \a db is aligned to the alignment of
 * unsigned long long, zero otherwise. */
static really_inline
int db_correctly_aligned(const void *db) {
    const int aligned = ISALIGNED_N(db, alignof(unsigned long long));
    return aligned;
}
/**
 * Free a Chimera database.
 *
 * A non-NULL pointer whose magic value is not CH_DB_MAGIC is rejected with
 * CH_INVALID and left untouched. Anything else, including NULL, is handed to
 * ch_database_free() and CH_SUCCESS is returned.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_free_database(ch_database_t *hydb) {
    if (hydb != NULL) {
        if (hydb->magic != CH_DB_MAGIC) {
            return CH_INVALID;
        }
    }

    ch_database_free(hydb);
    return CH_SUCCESS;
}
/**
 * Report the size in bytes of a Chimera database: the ch_database header plus
 * the bytecode length recorded in it.
 *
 * Returns CH_INVALID if \a size is NULL, the result of hydbIsValid() if the
 * database fails validation, and CH_SUCCESS otherwise.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_database_size(const ch_database_t *hydb, size_t *size) {
    if (size == NULL) {
        return CH_INVALID;
    }

    const ch_error_t validity = hydbIsValid(hydb);
    if (unlikely(validity != CH_SUCCESS)) {
        return validity;
    }

    *size = hydb->length + sizeof(struct ch_database);
    return CH_SUCCESS;
}
/** \brief Identifier prepended to database info. */
static const char CHIMERA_IDENT[] = "Chimera ";

/**
 * Build a human-readable description of a Chimera database.
 *
 * The result is CHIMERA_IDENT followed by the string returned by
 * hs_database_info() for the embedded Hyperscan database, or CHIMERA_IDENT
 * alone when the database was built without one
 * (CHIMERA_FLAG_NO_MULTIMATCH). The string is allocated with ch_misc_alloc().
 *
 * Returns CH_SUCCESS with *info set on success. Returns CH_INVALID if
 * \a info is NULL, if \a hydb is NULL, misaligned or has the wrong magic
 * value, or if allocation fails; an error from hs_database_info() is
 * returned as-is. *info is NULL on every failure where \a info is non-NULL.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_database_info(const ch_database_t *hydb, char **info) {
    if (!info) {
        return CH_INVALID;
    }
    *info = NULL;

    if (!hydb || !db_correctly_aligned(hydb) || hydb->magic != CH_DB_MAGIC) {
        /* This function returns ch_error_t, so report the Chimera code. */
        return CH_INVALID;
    }

    const struct ch_bytecode *bytecode = ch_get_bytecode(hydb);
    /* Compare against zero before narrowing, so the test stays correct
     * whichever bit the flag occupies. */
    const int noMulti = (bytecode->flags & CHIMERA_FLAG_NO_MULTIMATCH) != 0;
    if (noMulti) {
        size_t len = strlen(CHIMERA_IDENT);
        *info = ch_misc_alloc(len + 1);
        if (!(*info)) {
            return CH_INVALID;
        }
        memcpy((*info), CHIMERA_IDENT, len);
        (*info)[len] = '\0';
        return CH_SUCCESS;
    }

    char *hsinfo = NULL;
    hs_error_t ret = hs_database_info(getHyperscanDatabase(bytecode), &hsinfo);
    if (ret != HS_SUCCESS) {
        assert(!hsinfo);
        return ret;
    }

    size_t hybridlen = strlen(CHIMERA_IDENT);
    size_t hslen = strlen(hsinfo);
    *info = ch_misc_alloc(hybridlen + hslen + 1);
    if (!(*info)) {
        ch_misc_free(hsinfo);
        return CH_INVALID;
    }

    memcpy((*info), CHIMERA_IDENT, hybridlen);
    memcpy((*info) + hybridlen, hsinfo, hslen);
    (*info)[hybridlen + hslen] = '\0';
    ch_misc_free(hsinfo);
    return CH_SUCCESS;
}

158
chimera/ch_database.h Normal file
View File

@ -0,0 +1,158 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Runtime code for ch_database manipulation.
*/
#ifndef CH_DATABASE_H_
#define CH_DATABASE_H_
#ifdef __cplusplus
extern "C"
{
#endif
#define PCRE_STATIC
#include <pcre.h>
#include "ch_compile.h" // for CH_MODE_ flags
#include "ue2common.h"
#include "hs_version.h"
#include "hs.h"
#define CH_DB_MAGIC 0xdedededeU //!< Magic number stored in \ref ch_database
/** \brief Main Chimera database header.
 *
 * The Chimera bytecode (struct ch_bytecode) is located via the \ref bytecode
 * offset; see ch_get_bytecode(). */
struct ch_database {
u32 magic; //!< must be \ref CH_DB_MAGIC
u32 version; //!< release version; hydbIsValid() requires HS_VERSION_32BIT
u32 length; //!< total allocated length in bytes
            //!< NOTE(review): ch_database_size() adds sizeof(struct
            //!< ch_database) to this, so it presumably excludes the header
            //!< -- confirm against the compiler side.
u32 reserved0; //!< unused
u32 reserved1; //!< unused
u32 bytecode; //!< offset of the bytecode, relative to db start
u32 padding[16]; //!< padding for alignment of rest of bytecode
char bytes[]; //!< start of the variable-length payload
};
/** \brief Chimera bytecode header, which follows the \ref ch_database and is
 * always 64-byte aligned.
 *
 * All offsets in this struct are relative to the start of the ch_bytecode
 * header itself (see getHyperscanDatabase(), getUnguarded(), getPattern()).
 *
 * NOTE(review): ch_get_bytecode() only asserts 16-byte alignment; confirm
 * which of the two alignments is the real guarantee. */
struct ch_bytecode {
u32 length; //!< length of bytecode including this header struct
u32 flags; //!< whole-database flags (CHIMERA_FLAG_NO_MULTIMATCH,
           //!< CHIMERA_FLAG_GROUPS, ...)
u32 patternCount; //!< total number of patterns
u32 activeSize; //!< size of mmbit to store active pattern ids
u32 databaseOffset; //!< offset for database following \ref ch_bytecode
                    //!< header
u32 patternOffset; //!< points to an array of u32 offsets, each pointing to
                   //!< a \ref ch_pattern
u32 unguardedOffset; //!< pointer to a list of unguarded pattern indices
u32 unguardedCount; //!< number of unguarded patterns
u32 maxCaptureGroups; //!< max number of capture groups used by any pattern
};
/** \brief Per-pattern header.
 *
 * struct is followed in bytecode by:
 * 1. pcre bytecode (always present); the runtime locates it at the struct
 *    size rounded up to a multiple of 8 bytes
 * 2. pcre study data (sometimes)
 */
struct ch_pattern {
u32 id; //!< pattern ID to report to the user
u32 flags; //!< per-pattern flags (e.g. \ref CHIMERA_PATTERN_FLAG_UTF8)
u32 maxWidth; //!< maximum width of a match, or UINT_MAX for inf.
u32 minWidth; //!< minimum width of a match; the runtime ignores Hyperscan
              //!< matches ending before scanStart + minWidth.
u32 fixedWidth;//!< pattern has fixed width. The runtime treats the value ~0U
               //!< as "no fixed width": such patterns are confirmed with
               //!< PCRE rather than reported from the Hyperscan offsets.
u32 studyOffset; //!< offset relative to struct start of study data,
                 //!< or zero if there is none
u32 length; //!< length of struct plus pcre bytecode and study data
pcre_extra extra; //!< pcre_extra struct, used to store study data ptr for
                  //!< the currently-running pcre at runtime.
};
/** \brief Locate the Chimera bytecode inside a database by applying the
 * header's \c bytecode offset to the database base address. */
static really_inline
const void *ch_get_bytecode(const struct ch_database *db) {
    assert(db);
    const char *base = (const char *)db;
    const void *bytecode = base + db->bytecode;
    assert(ISALIGNED_16(bytecode));
    return bytecode;
}
struct hs_database;

/** \brief Locate the embedded Hyperscan database, stored \c databaseOffset
 * bytes past the start of the Chimera bytecode header. */
static really_inline
const struct hs_database *getHyperscanDatabase(const struct ch_bytecode *db) {
    assert(db);
    const struct hs_database *hs_db =
        (const struct hs_database *)((const char *)db + db->databaseOffset);
    assert(ISALIGNED_CL(hs_db));
    return hs_db;
}
/** \brief Locate the array of unguarded pattern indices, stored
 * \c unguardedOffset bytes past the start of the bytecode header. */
static really_inline
const u32 *getUnguarded(const struct ch_bytecode *db) {
    assert(db);
    const u32 *list = (const u32 *)((const char *)db + db->unguardedOffset);
    assert(ISALIGNED_N(list, sizeof(u32)));
    return list;
}
/** \brief Fetch the header of pattern \p i by looking its offset up in the
 * pattern offset table; both offsets are relative to the bytecode header. */
static really_inline
const struct ch_pattern *getPattern(const struct ch_bytecode *db, u32 i) {
    assert(db);
    assert(i < db->patternCount);

    const char *base = (const char *)db;
    const u32 *offsets = (const u32 *)(base + db->patternOffset);
    const u32 off = offsets[i];
    assert(off < db->length);

    return (const struct ch_pattern *)(base + off);
}
/**
 * \brief Sanity-check a Chimera database header.
 *
 * \param hydb  Database pointer supplied by the user; may be NULL.
 * \return CH_INVALID if \p hydb is NULL or its magic is wrong,
 *         CH_DB_VERSION_ERROR if it was built by a different release,
 *         CH_SUCCESS otherwise.
 */
static really_inline
ch_error_t hydbIsValid(const struct ch_database *hydb) {
    // Handle NULL separately: the bad-magic diagnostic below reads
    // hydb->magic, which must not happen for a NULL database.
    if (!hydb) {
        DEBUG_PRINTF("null database\n");
        return CH_INVALID;
    }

    if (hydb->magic != CH_DB_MAGIC) {
        DEBUG_PRINTF("bad magic (%u != %u)\n", hydb->magic, CH_DB_MAGIC);
        return CH_INVALID;
    }

    if (hydb->version != HS_VERSION_32BIT) {
        DEBUG_PRINTF("bad version\n");
        return CH_DB_VERSION_ERROR;
    }

    return CH_SUCCESS;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_DATABASE_H_ */

44
chimera/ch_internal.h Normal file
View File

@ -0,0 +1,44 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: data structures and internals.
*/
#ifndef CH_INTERNAL_H
#define CH_INTERNAL_H
/* Whole-database flags, tested against ch_bytecode::flags. Bit values. */
#define CHIMERA_FLAG_NO_MULTIMATCH 1 //!< Don't run a multimatch scan
#define CHIMERA_FLAG_GROUPS 2 //!< Return capturing groups
#define CHIMERA_FLAG_ALL_CONFIRM 4 //!< All patterns need confirm
#define CHIMERA_FLAG_ALL_SINGLE 8 //!< All patterns need only one match
/* Per-pattern flags, tested against ch_pattern::flags. Bit values. */
#define CHIMERA_PATTERN_FLAG_SINGLEMATCH 1 //!< only report the first match
#define CHIMERA_PATTERN_FLAG_UTF8 2 //!< pattern is in UTF-8 mode
#endif

629
chimera/ch_runtime.c Normal file
View File

@ -0,0 +1,629 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: main runtime.
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ch.h"
#include "hs.h"
#include "hs_internal.h"
#include "ue2common.h"
#include "ch_database.h"
#include "ch_internal.h"
#include "ch_scratch.h"
#include "util/multibit.h"
#include "util/unicode_def.h"
typedef struct queue_item PQ_T;
/** \brief Ordering predicate for two queue slots: compares by end offset,
 * then start offset, then pattern id. Returns non-zero if slot \p a orders
 * before slot \p b. */
static
char PQ_COMP(PQ_T *pqc_items, int a, int b) {
    const PQ_T *lhs = &pqc_items[a];
    const PQ_T *rhs = &pqc_items[b];

    if (lhs->to != rhs->to) {
        return lhs->to < rhs->to;
    }
    if (lhs->from != rhs->from) {
        return lhs->from < rhs->from;
    }
    return lhs->id < rhs->id;
}
/** \brief Same ordering as PQ_COMP(), but compares queue slot \p a against a
 * free-standing item passed by value. */
static
char PQ_COMP_B(PQ_T *pqc_items, int a, PQ_T b_fixed) {
    const PQ_T *lhs = &pqc_items[a];

    if (lhs->to != b_fixed.to) {
        return lhs->to < b_fixed.to;
    }
    if (lhs->from != b_fixed.from) {
        return lhs->from < b_fixed.from;
    }
    return lhs->id < b_fixed.id;
}
#include "util/pqueue.h"
/** \brief Build a queue item from (from, to, id), insert it into the match
 * queue and bump the queue's element count. */
static really_inline
void pq_insert_with(struct match_pq *pq, int from, int to, u32 id) {
    DEBUG_PRINTF("inserting pattern%u in pq at %u\n", id, to);

    struct queue_item entry = {
        .id = id,
        .to = to,
        .from = from,
    };

    pq_insert(pq->item, pq->size, entry);
    pq->size++;
}
/** \brief Remove the top item from the match queue and decrement the queue's
 * element count to match. */
static really_inline
void pq_pop_nice(struct match_pq *pq) {
    pq_pop(pq->item, pq->size);
    --pq->size;
}
/** Placeholder match handler installed when the user passes no callback:
 * it ignores every argument and asks the scan to continue. */
static
int null_onEvent(UNUSED unsigned id,
                 UNUSED unsigned long long from,
                 UNUSED unsigned long long to,
                 UNUSED unsigned flags,
                 UNUSED unsigned size,
                 UNUSED const ch_capture_t *captured,
                 UNUSED void *ctxt) {
    return CH_CALLBACK_CONTINUE;
}
/** \brief Chimera runtime context.
 *
 * One instance is built on the stack per ch_scan_i() call and handed to
 * Hyperscan as the callback context, so multiCallback() and the PCRE helpers
 * can reach the buffer, database, scratch and user callbacks. */
struct HybridContext {
const char *data; //!< buffer being scanned
u32 length; //!< length of data buffer
u32 valid_utf8_highwater; //!< UTF-8 has been validated up to here.
const struct ch_bytecode *db; //!< bytecode of the database being scanned
struct ch_scratch *scratch; //!< scratch space for this scan
struct match_pq *pq; //!< queue of pending matches awaiting report
/** \brief user-supplied match callback */
int (*match_callback)(unsigned int id, unsigned long long from,
unsigned long long to, unsigned int flags,
unsigned int size, const ch_capture_t *capture,
void *ctx);
/** \brief user-supplied error callback */
int (*error_callback)(ch_error_event_t error_type, unsigned int id,
void *info, void *ctx);
/** \brief user-supplied context */
void *context;
};
// Internal PCRE func.
extern int _pcre_valid_utf(const unsigned char *, int, int *);

/** UTF-8 validity check over the not-yet-validated part of the buffer, up to
 * offset \p end. Returns >0 if that region is valid UTF-8 (or was validated
 * by an earlier call), 0 otherwise. On success the high-water mark advances
 * to \p end so the same bytes are not checked twice. */
static
char isValidUTF8(struct HybridContext *hyctx, u32 end) {
    assert(hyctx);

    const u32 done = hyctx->valid_utf8_highwater;
    if (end <= done) {
        return 1; // Already validated.
    }

    const unsigned char *start = (const unsigned char *)hyctx->data + done;
    int validate_len = end - done;
    DEBUG_PRINTF("validating %d bytes\n", validate_len);

    int erroroffset = 0;
    const int bad = _pcre_valid_utf(start, validate_len, &erroroffset);
    if (bad) {
        DEBUG_PRINTF("UTF8 invalid at offset %d\n", erroroffset);
        return 0;
    }

    hyctx->valid_utf8_highwater = end;
    return 1;
}
/** \brief Locate the compiled PCRE that follows a pattern header: it starts
 * at the header size rounded up to a multiple of 8 bytes. */
static
const pcre *getPcre(const struct ch_pattern *pattern) {
    const size_t header_len = ROUNDUP_N(sizeof(*pattern), 8);
    const pcre *p = (const pcre *)((const char *)pattern + header_len);
    assert(ISALIGNED_N(p, 8));
    return p;
}
/** \brief Translate a pcre_exec ovector into Chimera capture records.
 *
 * Each consecutive (start, end) pair in \p ovector fills one entry of
 * \p groups. A start of -1 marks the group inactive and leaves its offsets
 * untouched; otherwise the group is marked active and its offsets copied. */
static
void fillGroupsFromOvector(ch_capture_t *groups, int numPairs, int *ovector) {
    assert(groups);
    assert(ISALIGNED_N(groups, alignof(ch_capture_t)));

    DEBUG_PRINTF("filling %d groups (@ %p) from pcre ovector\n",
                 numPairs, groups);

    for (int pair = 0; pair < numPairs; pair++) {
        ch_capture_t *group = &groups[pair];
        const int *span = &ovector[2 * pair];

        if (span[0] == -1) {
            group->flags = CH_CAPTURE_FLAG_INACTIVE;
            continue;
        }

        group->flags = CH_CAPTURE_FLAG_ACTIVE;
        assert(span[0] <= span[1]);
        group->from = span[0];
        group->to = span[1];
    }
}
/**
 * \brief Map a negative pcre_exec() return code onto a Chimera result.
 *
 * A plain non-match is success. Match-limit and recursion-limit hits are
 * forwarded to the user's error callback when there is one, and its return
 * value is passed back unchanged; without a callback they count as success.
 * Every other code is a fatal internal failure.
 */
static
ch_error_t handlePcreNonMatch(const struct ch_pattern *pattern, int rv,
                              ch_error_event_handler onError,
                              void *userContext) {
    assert(rv < 0);

    ch_error_event_t event;
    switch (rv) {
    case PCRE_ERROR_NOMATCH:
        DEBUG_PRINTF("no match found by libpcre\n");
        return CH_SUCCESS;
    case PCRE_ERROR_MATCHLIMIT:
        DEBUG_PRINTF("pcre hit match limit\n");
        event = CH_ERROR_MATCHLIMIT;
        break;
    case PCRE_ERROR_RECURSIONLIMIT:
        DEBUG_PRINTF("pcre hit recursion limit\n");
        event = CH_ERROR_RECURSIONLIMIT;
        break;
    default:
        // All other errors not handled above are fatal.
        return CH_FAIL_INTERNAL;
    }

    if (!onError) {
        return CH_SUCCESS;
    }
    return onError(event, pattern->id, NULL, userContext);
}
/**
 * \brief Run one pcre_exec() for pattern \p id and queue the match, if any.
 *
 * PCRE is always given the whole buffer (hyctx->data / hyctx->length) and
 * starts searching at \p offset; \p length is only used for debug output.
 *
 * On a match, the capture information for the pattern is stored in the
 * scratch pattern data, the (from, to, id) triple is pushed onto the match
 * queue, and the pattern's scanStart is advanced past the match.
 *
 * \return CH_SUCCESS on a match or a plain non-match. For other PCRE results
 *         the value from handlePcreNonMatch() is returned, which may be
 *         CH_FAIL_INTERNAL or the raw return value of the user's error
 *         callback (e.g. CH_CALLBACK_TERMINATE, CH_CALLBACK_SKIP_PATTERN);
 *         callers must check for those callback values explicitly.
 */
static
ch_error_t scanPcre(struct HybridContext *hyctx, UNUSED unsigned int length,
unsigned int offset, u32 id) {
const char *data = hyctx->data;
unsigned int full_length = hyctx->length;
ch_error_event_handler onError = hyctx->error_callback;
void *userContext = hyctx->context;
const struct ch_pattern *pattern = getPattern(hyctx->db, id);
const pcre *p = getPcre(pattern);
// Set up the PCRE extra block.
const pcre_extra *extra = &pattern->extra;
int startoffset = offset;
int *ovector = hyctx->scratch->ovector;
// Three ints per group (plus the whole match) are handed to pcre_exec.
int ovectorSize = (hyctx->scratch->maxCaptureGroups + 1) * 3;
assert(ovectorSize >= 2);
DEBUG_PRINTF("scanning %u bytes, pattern %u, startoffset %d\n",
length, id, startoffset);
int options = 0;
if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) {
// We do our own UTF-8 validation.
options |= PCRE_NO_UTF8_CHECK;
// The whole buffer is validated (once; see the high-water mark), and
// invalid input is reported through the generic error path.
if (!isValidUTF8(hyctx, full_length)) {
return handlePcreNonMatch(pattern, PCRE_ERROR_BADUTF8, onError,
userContext);
}
}
int rv = pcre_exec(p, extra, data, full_length, startoffset, options,
ovector, ovectorSize);
DEBUG_PRINTF("pcre return code is %d\n", rv);
// Handle all non-match or error cases, all of which involve us
// terminating the loop.
if (rv < 0) {
return handlePcreNonMatch(pattern, rv, onError, userContext);
}
// We've found a match, and we should always have room for at least the
// start and end offsets in our ovector. Pass this info to the user.
assert(rv >= 1);
assert(rv < ovectorSize);
int from = ovector[0];
int to = ovector[1];
DEBUG_PRINTF("match %d -> %d\n", from, to);
struct ch_patterndata *pd = hyctx->scratch->patternData + id;
// Capture groups are only recorded when the database asks for them;
// otherwise the callback will be told there are zero groups.
if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) {
fillGroupsFromOvector(pd->match, rv, ovector);
} else {
rv = 0;
}
pd->groupCount = (u32)rv;
// Insert new matched item to the queue
pq_insert_with(hyctx->pq, from, to, id);
// Next scan starts at the first codepoint after the match. It's
// possible that we have a vacuous match, in which case we must step
// past it to ensure that we always progress.
if (from != to) {
startoffset = to;
} else if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) {
startoffset = to + 1;
// Skip UTF-8 continuation bytes so the next scan starts on a codepoint
// boundary.
while (startoffset < (int)full_length &&
((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
++startoffset;
}
} else {
startoffset = to + 1;
}
pd->scanStart = startoffset;
DEBUG_PRINTF("new offset %u\n", pd->scanStart);
return CH_SUCCESS;
}
/**
 * \brief Report queued matches up to offset \p to.
 *
 * Repeatedly takes the top item of the match queue. If that item ends after
 * \p to, the caller's (from, to, id) triple is inserted into the queue and
 * the loop stops. Otherwise the item is popped and reported through the user
 * match callback; if it belongs to pattern \p id the loop stops, else the
 * pattern is rescanned with PCRE from its scanStart to queue its next match
 * (skipped for single-match patterns).
 *
 * \return CH_SCAN_TERMINATED if the match callback, or the error callback
 *         reached through scanPcre(), asked to stop; CH_FAIL_INTERNAL if
 *         scanPcre() failed; CH_SUCCESS otherwise.
 */
static
ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id,
unsigned long long from, unsigned long long to) {
ch_match_event_handler onEvent = hyctx->match_callback;
void *userContext = hyctx->context;
DEBUG_PRINTF("priority queue size %u\n", hyctx->pq->size);
while (hyctx->pq->size) {
// Remember the size so we can tell below whether the rescan queued a
// replacement item.
u32 num_item = hyctx->pq->size;
struct queue_item *item = pq_top(hyctx->pq->item);
size_t top_from = item->from;
size_t top_to = item->to;
u32 top_id = item->id;
if (top_to > to) {
pq_insert_with(hyctx->pq, from, to, id);
break;
}
pq_pop_nice(hyctx->pq);
const struct ch_pattern *pattern = getPattern(hyctx->db, top_id);
struct ch_patterndata *pd = hyctx->scratch->patternData + top_id;
// Report match for pattern
DEBUG_PRINTF("trigger match@%zu\n", top_to);
ch_callback_t cbrv =
onEvent(pattern->id, top_from, top_to, 0 /* flags */,
pd->groupCount, pd->match, userContext);
if (cbrv == CH_CALLBACK_TERMINATE) {
DEBUG_PRINTF("user callback told us to terminate scanning\n");
return CH_SCAN_TERMINATED;
} else if (cbrv == CH_CALLBACK_SKIP_PATTERN) {
DEBUG_PRINTF("user callback told us to skip this pattern\n");
// Moving scanStart to the end of the buffer marks the pattern as done.
pd->scanStart = hyctx->length;
}
if (top_id == id) {
break;
}
// Push a new match to replace the old one
unsigned int start = pd->scanStart;
// len may wrap if scanStart is past the end; it is only used when the
// bounds check below passes.
unsigned int len = hyctx->length - pd->scanStart;
if (hyctx->length >= pd->scanStart &&
!(pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH)) {
DEBUG_PRINTF("get a new match item\n");
int ret = scanPcre(hyctx, len, start, top_id);
// scanPcre() can hand back the error callback's return value, hence
// the comparison against CH_CALLBACK_* values here.
if (ret == CH_CALLBACK_TERMINATE) {
DEBUG_PRINTF("user callback told us to terminate scanning\n");
return CH_SCAN_TERMINATED;
} else if (ret == CH_CALLBACK_SKIP_PATTERN) {
DEBUG_PRINTF("user callback told us to skip this pattern\n");
pd->scanStart = hyctx->length;
ret = CH_SUCCESS;
} else if (ret == CH_FAIL_INTERNAL) {
return ret;
}
// No further match is found
if (hyctx->pq->size == num_item - 1) {
pd->scanStart = hyctx->length;
}
}
}
return CH_SUCCESS;
}
/**
 * \brief Callback used for internal Hyperscan multi-matcher.
 *
 * Invoked by hs_scan() for every Hyperscan match of pattern \p id. Patterns
 * whose fixedWidth is ~0U need confirmation: the first Hyperscan match
 * triggers a PCRE scan from offset 0 and later matches are ignored. For the
 * remaining patterns the Hyperscan offsets are trusted: pending queue items
 * are flushed up to \p to and the (from, to) pair is queued as a match.
 *
 * The outcome of any PCRE work is mirrored into scratch->ret so the caller of
 * hs_scan() can tell a Chimera-initiated stop from a genuine failure.
 *
 * \return 0 to let Hyperscan continue; any non-zero value makes Hyperscan
 *         stop scanning.
 */
static
int multiCallback(unsigned int id, unsigned long long from,
                  unsigned long long to, UNUSED unsigned int flags,
                  void *ctx) {
    assert(ctx);
    struct HybridContext *hyctx = ctx;

    DEBUG_PRINTF("match for ID %u at offset %llu\n", id, to);
    assert(id < hyctx->db->patternCount);

    const struct ch_pattern *pattern = getPattern(hyctx->db, id);
    struct ch_patterndata *pd = hyctx->scratch->patternData + id;

    char needConfirm = pattern->fixedWidth == ~0U;

    // A pattern that needs confirmation is handed to PCRE only once; once it
    // is active there is nothing more for Hyperscan to tell us about it.
    if (needConfirm &&
        mmbit_isset(hyctx->scratch->active, hyctx->db->patternCount, id)) {
        if ((hyctx->db->flags & CHIMERA_FLAG_ALL_CONFIRM) &&
            mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) {
            // Every pattern is already active: stop Hyperscan early.
            return 1;
        }
        return 0;
    }

    // Store the fact that we've seen this bit.
    char already = mmbit_set(hyctx->scratch->active,
                             hyctx->db->patternCount, id);

    DEBUG_PRINTF("match from %u to %llu\n", pd->scanStart, to);

    if (!already) {
        pd->scanStart = 0;
    } else if (to < pd->scanStart + pattern->minWidth) {
        return 0;
    } else if (pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH) {
        if ((hyctx->db->flags & CHIMERA_FLAG_ALL_SINGLE) &&
            mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) {
            return 1;
        }
        // Note: we may have unordered match from Hyperscan,
        // thus possibly get to < pd->scanStart.
        return 0;
    }

    int ret = HS_SUCCESS;
    unsigned int start = pd->scanStart;
    unsigned int len = hyctx->length - pd->scanStart;
    assert(hyctx->length >= pd->scanStart);
    const char *data = hyctx->data;

    if (needConfirm) {
        DEBUG_PRINTF("run confirm for the first time\n");
        ret = scanPcre(hyctx, len, start, id);
        hyctx->scratch->ret = ret;
        if (ret == CH_CALLBACK_TERMINATE) {
            DEBUG_PRINTF("user callback told us to terminate scanning\n");
            return HS_SCAN_TERMINATED;
        } else if (ret == CH_CALLBACK_SKIP_PATTERN) {
            DEBUG_PRINTF("user callback told us to skip this pattern\n");
            pd->scanStart = hyctx->length;
            ret = HS_SUCCESS;
            // The skip has been fully handled here. Clear the recorded
            // status too, otherwise a later deliberate termination would
            // make the caller report the stale callback value as an error.
            hyctx->scratch->ret = ret;
        } else if (ret == CH_FAIL_INTERNAL) {
            return ret;
        }
    } else {
        if (already) {
            DEBUG_PRINTF("catch up with new matches\n");
            ret = catchupPcre(hyctx, id, from, to);

            hyctx->scratch->ret = ret;
            if (pd->scanStart >= hyctx->length) {
                return ret;
            }
        }

        int startoffset = 0;
        // Next scan starts at the first codepoint after the match. It's
        // possible that we have a vacuous match, in which case we must step
        // past it to ensure that we always progress.
        if (from != to) {
            startoffset = to;
        } else if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) {
            startoffset = to + 1;
            while (startoffset < (int)hyctx->length &&
                   ((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
                ++startoffset;
            }
        } else {
            startoffset = to + 1;
        }
        pd->scanStart = startoffset;

        // Without PCRE there are no sub-groups; when groups are requested
        // only the whole-match entry is reported.
        int rv = 0;
        if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) {
            ch_capture_t *groups = pd->match;
            groups->flags = CH_CAPTURE_FLAG_ACTIVE;
            groups->from = from;
            groups->to = to;
            rv = 1;
        }
        pd->groupCount = (u32)rv;
        pq_insert_with(hyctx->pq, from, to, id);
    }

    return ret;
}
/** \brief Run the embedded Hyperscan database over the buffer in block mode,
 * routing every match to multiCallback() with \p hyctx as its context.
 * Returns whatever hs_scan() returns. */
static
hs_error_t scanHyperscan(struct HybridContext *hyctx, const char *data,
                         unsigned int length) {
    DEBUG_PRINTF("scanning %u bytes with Hyperscan\n", length);

    const hs_database_t *db = getHyperscanDatabase(hyctx->db);
    hs_scratch_t *scratch = hyctx->scratch->multi_scratch;

    return hs_scan(db, data, length, 0, scratch, multiCallback, hyctx);
}
/** \brief Init match priority queue.
 *
 * Clears the active-pattern set, empties the queue, then for every pattern
 * that is not supported by Hyperscan with prefiltering (the "unguarded"
 * list) marks it active and runs PCRE from offset 0 to queue its first
 * match.
 *
 * Returns CH_SCAN_TERMINATED if the error callback asked to stop,
 * CH_FAIL_INTERNAL on a fatal PCRE error, CH_SUCCESS otherwise.
 */
static really_inline
ch_error_t initQueue(struct HybridContext *hyctx, struct match_pq *pq) {
    const struct ch_bytecode *db = hyctx->db;

    u8 *active = hyctx->scratch->active;
    mmbit_clear(active, db->patternCount);

    // Init match queue size
    pq->size = 0;

    const unsigned int length = hyctx->length;
    const u32 *unguarded = getUnguarded(db);

    for (u32 i = 0; i < db->unguardedCount; ++i) {
        const u32 patternId = unguarded[i];
        DEBUG_PRINTF("switch on unguarded pcre %u\n", patternId);
        mmbit_set(active, db->patternCount, patternId);

        DEBUG_PRINTF("get a new match item\n");
        const int ret = scanPcre(hyctx, length, 0, patternId);

        switch (ret) {
        case CH_CALLBACK_TERMINATE:
            DEBUG_PRINTF("user callback told us to terminate scanning\n");
            return CH_SCAN_TERMINATED;
        case CH_CALLBACK_SKIP_PATTERN:
            DEBUG_PRINTF("user callback told us to skip this pattern\n");
            // Park the pattern at the end of the buffer so it is not rescanned.
            hyctx->scratch->patternData[patternId].scanStart = length;
            break;
        case CH_FAIL_INTERNAL:
            return ret;
        default:
            break;
        }
    }

    return CH_SUCCESS;
}
/**
 * \brief Implementation of ch_scan(): validate arguments, then run the
 * hybrid Hyperscan + PCRE scan over one block of data.
 *
 * Steps: validate database and scratch, claim the scratch, queue first
 * matches for unguarded patterns (initQueue), run Hyperscan unless the
 * database is PCRE-only, then flush the remaining queued matches to the
 * user callback.
 *
 * \param hydb        Compiled Chimera database.
 * \param data        Buffer to scan; must be non-NULL.
 * \param length      Buffer length; must not exceed INT_MAX because PCRE
 *                    takes lengths as int.
 * \param flags       Unused.
 * \param scratch     Scratch space; must be cache-line aligned, carry
 *                    CH_SCRATCH_MAGIC and not be in use.
 * \param onEvent     Match callback, or NULL to discard matches.
 * \param onError     Error callback, or NULL.
 * \param userContext Opaque pointer passed to both callbacks.
 *
 * \return CH_SUCCESS; CH_INVALID, CH_DB_VERSION_ERROR or CH_SCRATCH_IN_USE
 *         for bad arguments; CH_SCAN_TERMINATED if a callback stopped the
 *         scan; otherwise the error raised by PCRE or Hyperscan.
 */
static really_inline
ch_error_t ch_scan_i(const ch_database_t *hydb,
                     const char *data, unsigned int length,
                     UNUSED unsigned int flags,
                     ch_scratch_t *scratch,
                     ch_match_event_handler onEvent,
                     ch_error_event_handler onError,
                     void *userContext) {
    if (unlikely(!hydb || !scratch || !data)) {
        DEBUG_PRINTF("args invalid\n");
        return CH_INVALID;
    }

    ch_error_t ret = hydbIsValid(hydb);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("database invalid\n");
        return ret;
    }

    if (!ISALIGNED_CL(scratch)) {
        DEBUG_PRINTF("bad alignment %p\n", scratch);
        return CH_INVALID;
    }

    if (scratch->magic != CH_SCRATCH_MAGIC) {
        DEBUG_PRINTF("scratch invalid\n");
        return CH_INVALID;
    }

    if (unlikely(markScratchInUse(scratch))) {
        return CH_SCRATCH_IN_USE;
    }

    // Hyperscan underlying scratch and database validity will be checked by
    // the hs_scan() call, so no need to do it here.

    // PCRE takes the data region length in as an int, so this limits our block
    // size to INT_MAX.
    if (length > INT_MAX) {
        DEBUG_PRINTF("length invalid\n");
        unmarkScratchInUse(scratch);
        return CH_INVALID;
    }

    const struct ch_bytecode *db = ch_get_bytecode(hydb);

    scratch->pq.size = 0;
    scratch->ret = CH_SUCCESS;

    // Firstly, we run Hyperscan in block mode and add its matches into the
    // active list for subsequent confirmation with pcre.
    struct HybridContext hyctx = {
        .data = data,
        .length = length,
        .valid_utf8_highwater = 0,
        .db = db,
        .scratch = scratch,
        .pq = &scratch->pq,
        .match_callback = onEvent ? onEvent : null_onEvent,
        .error_callback = onError,
        .context = userContext
    };

    // Init priority queue.
    ret = initQueue(&hyctx, &scratch->pq);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("Chimera returned error %d\n", ret);
        unmarkScratchInUse(scratch);
        return ret;
    }

    if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) {
        hs_error_t hs_ret = scanHyperscan(&hyctx, data, length);

        // Outcomes recorded by the PCRE stage inside the Hyperscan callback.
        // A callback-requested stop must surface as CH_SCAN_TERMINATED, not
        // as the raw callback value.
        if (scratch->ret == CH_CALLBACK_TERMINATE) {
            DEBUG_PRINTF("callback terminated the scan\n");
            unmarkScratchInUse(scratch);
            return CH_SCAN_TERMINATED;
        }
        // A recorded "skip pattern" has already been handled in the callback
        // and is not an error.
        if (scratch->ret != CH_SUCCESS &&
            scratch->ret != CH_CALLBACK_SKIP_PATTERN) {
            DEBUG_PRINTF("PCRE stage returned error %d\n", scratch->ret);
            unmarkScratchInUse(scratch);
            return scratch->ret;
        }

        // Errors from Hyperscan itself. Chimera stops Hyperscan on purpose
        // once every pattern is active, so HS_SCAN_TERMINATED on its own is
        // not a failure; anything else must not be silently dropped.
        if (hs_ret != HS_SUCCESS && hs_ret != HS_SCAN_TERMINATED) {
            DEBUG_PRINTF("Hyperscan returned error %d\n", hs_ret);
            unmarkScratchInUse(scratch);
            return hs_ret;
        }
    }

    DEBUG_PRINTF("Flush priority queue\n");
    // Catch up with PCRE and make up id and offsets as we don't really care
    // about their values
    ret = catchupPcre(&hyctx, ~0U, length, length);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("PCRE catch up returned error %d\n", ret);
        unmarkScratchInUse(scratch);
        return ret;
    }

    unmarkScratchInUse(scratch);
    return CH_SUCCESS;
}
/** \brief Public block-mode scan entry point; forwards every argument to
 * ch_scan_i() unchanged and returns its result. */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_scan(const ch_database_t *hydb, const char *data,
                            unsigned int length, unsigned int flags,
                            ch_scratch_t *scratch,
                            ch_match_event_handler onEvent,
                            ch_error_event_handler onError, void *userContext) {
    return ch_scan_i(hydb, data, length, flags, scratch, onEvent, onError,
                     userContext);
}
/** \brief Return the library version string. Chimera reports the Hyperscan
 * version string (HS_VERSION_STRING); the pointer refers to a string literal
 * and must not be freed. */
HS_PUBLIC_API
const char * HS_CDECL ch_version(void) {
return HS_VERSION_STRING;
}

377
chimera/ch_runtime.h Normal file
View File

@ -0,0 +1,377 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_RUNTIME_H_
#define CH_RUNTIME_H_
#include <stdlib.h>
/**
* @file
* @brief The Chimera runtime API definition.
*
* Chimera is a hybrid of the Hyperscan and PCRE regular expression engines.
*
* This header contains functions for using compiled Chimera databases for
* scanning data at runtime.
*/
#include "hs_common.h"
#ifdef __cplusplus
extern "C"
{
#endif
struct ch_scratch;
/**
* A Chimera scratch space.
*/
typedef struct ch_scratch ch_scratch_t;
/**
* Callback return value used to tell the Chimera matcher what to do after
* processing this match.
*/
typedef int ch_callback_t;
/**
* @defgroup CH_CALLBACK ch_callback_t values
*
* @{
*/
/**
* Continue matching.
*/
#define CH_CALLBACK_CONTINUE 0
/**
* Terminate matching.
*/
#define CH_CALLBACK_TERMINATE 1
/**
* Skip remaining matches for this ID and continue.
*/
#define CH_CALLBACK_SKIP_PATTERN 2
/** @} */
/**
* Type used to differentiate the errors raised with the @ref
* ch_error_event_handler callback.
*/
typedef int ch_error_event_t;
/**
* @defgroup CH_ERROR_EVENT ch_error_event_t values
*
* @{
*/
/**
* PCRE hits its match limit and reports PCRE_ERROR_MATCHLIMIT.
*/
#define CH_ERROR_MATCHLIMIT 1
/**
* PCRE hits its recursion limit and reports PCRE_ERROR_RECURSIONLIMIT.
*/
#define CH_ERROR_RECURSIONLIMIT 2
/** @} */
/**
 * Structure representing a captured subexpression within a match. An array of
 * these structures corresponding to capture groups in order is passed to the
 * callback on match, with active structures identified by the
 * CH_CAPTURE_FLAG_ACTIVE flag.
 */
typedef struct ch_capture {
/**
 * The flags indicating if this structure is active. The offsets below are
 * only meaningful when this is CH_CAPTURE_FLAG_ACTIVE.
 */
unsigned int flags;
/**
 * Offset at which this capture group begins.
 */
unsigned long long from;
/**
 * Offset at which this capture group ends.
 */
unsigned long long to;
} ch_capture_t;
/**
* @defgroup CH_CAPTURE ch_capture_t flags
*
* These flags are used in @ref ch_capture_t::flags to indicate if this
* structure is active.
*
* @{
*/
/**
* Flag indicating that a particular capture group is inactive, used in @ref
* ch_capture_t::flags.
*/
#define CH_CAPTURE_FLAG_INACTIVE 0
/**
* Flag indicating that a particular capture group is active, used in @ref
* ch_capture_t::flags.
*/
#define CH_CAPTURE_FLAG_ACTIVE 1
/** @} */
/**
* Definition of the match event callback function type.
*
* A callback function matching the defined type must be provided by the
* application calling the @ref ch_scan()
*
* This callback function will be invoked whenever a match is located in the
* target data during the execution of a scan. The details of the match are
* passed in as parameters to the callback function, and the callback function
* should return a value indicating whether or not matching should continue on
* the target data. If no callbacks are desired from a scan call, NULL may be
* provided in order to suppress match production.
*
* @param id
* The ID number of the expression that matched. If the expression was a
* single expression compiled with @ref ch_compile(), this value will be
* zero.
*
* @param from
* The offset of the first byte that matches the expression.
*
* @param to
* The offset after the last byte that matches the expression.
*
* @param flags
* This is provided for future use and is unused at present.
*
* @param size
* The number of valid entries pointed to by the captured parameter.
*
* @param captured
* A pointer to an array of @ref ch_capture_t structures that
* contain the start and end offsets of entire pattern match and
* each captured subexpression.
*
* @param ctx
* The pointer supplied by the user to the @ref ch_scan() function.
*
* @return
* The callback can return @ref CH_CALLBACK_TERMINATE to stop matching.
* Otherwise, a return value of @ref CH_CALLBACK_CONTINUE will continue,
* with the current pattern if configured to produce multiple matches per
* pattern, while a return value of @ref CH_CALLBACK_SKIP_PATTERN will
* cease matching this pattern but continue matching the next pattern.
*/
typedef ch_callback_t (*ch_match_event_handler)(unsigned int id,
unsigned long long from,
unsigned long long to,
unsigned int flags,
unsigned int size,
const ch_capture_t *captured,
void *ctx);
/**
* Definition of the Chimera error event callback function type.
*
* A callback function matching the defined type may be provided by the
* application calling the @ref ch_scan function. This callback function
* will be invoked when an error event occurs during matching; this indicates
* that some matches for a given expression may not be reported.
*
* @param error_type
* The type of error event that occurred. Currently these errors
* correspond to resource limits on PCRE backtracking:
* @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT.
*
* @param id
* The ID number of the expression for which the error event was raised.
*
* @param info
* Event-specific data, for future use. Currently unused.
*
* @param ctx
* The context pointer supplied by the user to the @ref ch_scan
* function.
*
* @return
* The callback can return @ref CH_CALLBACK_SKIP_PATTERN to cease matching this
* pattern but continue matching the next pattern. Otherwise, matching is
* stopped for all patterns, as with @ref CH_CALLBACK_TERMINATE.
*/
typedef ch_callback_t (*ch_error_event_handler)(ch_error_event_t error_type,
unsigned int id, void *info,
void *ctx);
/**
* The block regular expression scanner.
*
* This is the function call in which the actual pattern matching takes place
* for block-mode pattern databases.
*
* @param db
* A compiled pattern database.
*
* @param data
* Pointer to the data to be scanned.
*
* @param length
* The number of bytes to scan.
*
* @param flags
* Flags modifying the behaviour of this function. This parameter is
* provided for future use and is unused at present.
*
* @param scratch
* A per-thread scratch space allocated by @ref ch_alloc_scratch() for this
* database.
*
* @param onEvent
* Pointer to a match event callback function. If a NULL pointer is given,
* no matches will be returned.
*
* @param onError
* Pointer to an error event callback function. If a NULL pointer is given,
* @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT errors will
* be ignored and matching will continue.
*
* @param context
* The user defined pointer which will be passed to both callback
* functions.
*
* @return
* Returns @ref CH_SUCCESS on success; @ref CH_SCAN_TERMINATED if the
* match callback indicated that scanning should stop; other values on
* error.
*/
ch_error_t HS_CDECL ch_scan(const ch_database_t *db, const char *data,
unsigned int length, unsigned int flags,
ch_scratch_t *scratch,
ch_match_event_handler onEvent,
ch_error_event_handler onError,
void *context);
/**
* Allocate a "scratch" space for use by Chimera.
*
* This is required for runtime use, and one scratch space per thread, or
* concurrent caller, is required. Any allocator callback set by @ref
* ch_set_scratch_allocator() or @ref ch_set_allocator() will be used by this
* function.
*
* @param db
* The database, as produced by @ref ch_compile().
*
* @param scratch
* On first allocation, a pointer to NULL should be provided so a new
* scratch can be allocated. If a scratch block has been previously
* allocated, then a pointer to it should be passed back in to see if it
* is valid for this database block. If a new scratch block is required,
* the original will be freed and the new one returned, otherwise the
* previous scratch block will be returned. On success, the scratch block
* will be suitable for use with the provided database in addition to any
* databases that the original scratch space was suitable for.
*
* @return
* @ref CH_SUCCESS on successful allocation; @ref CH_NOMEM if the
* allocation fails. Other errors may be returned if invalid parameters
* are specified, e.g. @ref CH_INVALID for a NULL argument or a scratch
* that fails validation, and @ref CH_SCRATCH_IN_USE if the supplied
* scratch is currently marked as in use.
*/
ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *db,
ch_scratch_t **scratch);
/**
* Allocate a scratch space that is a clone of an existing scratch space.
*
* This is useful when multiple concurrent threads will be using the same set
* of compiled databases, and another scratch space is required. Any allocator
* callback set by @ref ch_set_scratch_allocator() or @ref ch_set_allocator()
* will be used by this function.
*
* @param src
* The existing @ref ch_scratch_t to be cloned.
*
* @param dest
* A pointer to the new scratch space will be returned here. The clone
* is released with @ref ch_free_scratch(), like any other scratch.
*
* @return
* @ref CH_SUCCESS on success; @ref CH_NOMEM if the allocation fails.
* Other errors may be returned if invalid parameters are specified.
*/
ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src,
ch_scratch_t **dest);
/**
* Provides the size of the given scratch space.
*
* The reported size covers the Chimera scratch allocation plus, when one is
* attached, the size of the underlying Hyperscan scratch.
*
* @param scratch
* A per-thread scratch space allocated by @ref ch_alloc_scratch() or @ref
* ch_clone_scratch().
*
* @param scratch_size
* On success, the size of the scratch space in bytes is placed in this
* parameter.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch,
size_t *scratch_size);
/**
* Free a scratch block previously allocated by @ref ch_alloc_scratch() or @ref
* ch_clone_scratch().
*
* The free callback set by @ref ch_set_scratch_allocator() or @ref
* ch_set_allocator() will be used by this function.
*
* @param scratch
* The scratch block to be freed. NULL may also be safely provided.
*
* @return
* @ref CH_SUCCESS on success, other values on failure: @ref CH_INVALID if
* the pointer does not refer to a valid scratch block, or @ref
* CH_SCRATCH_IN_USE if the scratch is currently marked as in use.
*/
ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_RUNTIME_H_ */

317
chimera/ch_scratch.c Normal file
View File

@ -0,0 +1,317 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: scratch space alloc.
*/
#include <string.h>
#include "allocator.h"
#include "ch.h"
#include "hs.h"
#include "hs_internal.h"
#include "ue2common.h"
#include "ch_alloc.h"
#include "ch_internal.h"
#include "ch_scratch.h"
#include "ch_database.h"
/**
 * \brief Number of bytes needed for the per-pattern region of a scratch.
 *
 * The region is the ch_patterndata array (one entry per pattern), followed by
 * alignment padding, followed by (maxCaptureGroups + 1) ch_capture entries
 * for every pattern.
 */
static
size_t getPatternDataSize(const ch_scratch_t *s) {
    size_t headerBytes = sizeof(struct ch_patterndata) * s->patternCount;
    size_t captureCount = s->patternCount * (s->maxCaptureGroups + 1);
    size_t captureBytes = sizeof(struct ch_capture) * captureCount;

    // One alignof(ch_capture) of slack so the capture array can be aligned.
    return headerBytes + alignof(struct ch_capture) + captureBytes;
}
/**
 * \brief Point each pattern's \c match field at its slice of the capture
 * array.
 *
 * The capture array starts at the first suitably aligned address after the
 * ch_patterndata array; each pattern owns (maxCaptureGroups + 1) consecutive
 * ch_capture entries.
 */
static
void initPatternData(const ch_scratch_t *s) {
    char *arrayEnd = (char *)s->patternData +
                     (sizeof(struct ch_patterndata) * s->patternCount);
    struct ch_capture *slice =
        (struct ch_capture *)(ROUNDUP_PTR(arrayEnd,
                                          alignof(struct ch_capture)));

    for (u32 i = 0; i < s->patternCount; i++) {
        struct ch_patterndata *pd = &s->patternData[i];
        pd->match = slice;
        DEBUG_PRINTF("pattern %u: pd=%p, match=%p\n", i, pd, pd->match);
        slice += (s->maxCaptureGroups + 1);
    }
}
/**
 * \brief Allocate and lay out a new scratch sized according to \a proto.
 *
 * A single allocation holds, in order: the ch_scratch header, the PCRE
 * ovector, the capture buffer, the per-pattern data region, the priority
 * queue items and the active multibit. The header is copied from \a proto and
 * the internal pointers are then fixed up to point into the new allocation.
 *
 * \param proto   Header supplying patternCount, maxCaptureGroups, activeSize
 *                and the other ordinary members to copy.
 * \param scratch Receives the new (64-byte aligned) scratch, or NULL if the
 *                allocation fails.
 * \return CH_SUCCESS, or the error reported by ch_check_alloc().
 */
static
ch_error_t alloc_scratch(const ch_scratch_t *proto, ch_scratch_t **scratch) {
// Three ints per capture group (plus one for the whole match).
size_t ovectorSize = (proto->maxCaptureGroups + 1) * sizeof(int) * 3;
size_t capturedSize =
sizeof(struct ch_capture) * (proto->maxCaptureGroups + 1);
size_t patternDataSize = getPatternDataSize(proto);
size_t activeSize = proto->activeSize;
size_t queueSize = proto->patternCount * sizeof(struct queue_item);
// max padding for alignment below.
size_t padding = alignof(int) + alignof(struct ch_capture) +
alignof(struct ch_patterndata) +
alignof(struct queue_item);
size_t allocSize = sizeof(ch_scratch_t) + ovectorSize + capturedSize +
patternDataSize + activeSize + queueSize + padding
+ 256; /* padding for cacheline alignment */
ch_scratch_t *s;
ch_scratch_t *s_tmp = ch_scratch_alloc(allocSize);
ch_error_t err = ch_check_alloc(s_tmp);
if (err != CH_SUCCESS) {
ch_scratch_free(s_tmp);
*scratch = NULL;
return err;
}
memset(s_tmp, 0, allocSize);
// The usable scratch starts at the first 64-byte boundary in the allocation.
s = ROUNDUP_PTR(s_tmp, 64);
// Set ordinary members.
*s = *proto;
s->magic = CH_SCRATCH_MAGIC;
s->in_use = 0;
// Remember the raw pointer: this, not s, is what must be passed to
// ch_scratch_free().
s->scratch_alloc = (char *)s_tmp;
// Set pointers internal to allocation.
char *ptr = (char *)s + sizeof(*s);
ptr = ROUNDUP_PTR(ptr, alignof(int));
s->ovector = (int *)ptr;
ptr += ovectorSize;
ptr = ROUNDUP_PTR(ptr, alignof(struct ch_capture));
s->captured = (struct ch_capture *)ptr;
ptr += capturedSize;
ptr = ROUNDUP_PTR(ptr, alignof(struct ch_patterndata));
s->patternData = (struct ch_patterndata *)ptr;
ptr += patternDataSize;
// Pre-fill pattern data, setting captureOffsets
initPatternData(s);
ptr = ROUNDUP_PTR(ptr, alignof(struct queue_item));
s->pq.item = (struct queue_item *)ptr;
ptr += queueSize;
// The active multibit takes the final activeSize bytes.
s->active = (u8 *)ptr;
// Store size.
s->scratchSize = allocSize;
// We should never overrun our allocation.
assert((ptr + activeSize) - (char *)s <= (ptrdiff_t)allocSize);
*scratch = s;
return CH_SUCCESS;
}
/**
 * \brief Allocate a scratch for \a hydb, or grow an existing one to fit it.
 *
 * If \a *scratch is NULL a new scratch is allocated. Otherwise the existing
 * scratch is validated and reallocated only if \a hydb needs more capture
 * groups or patterns than it currently supports. Unless the database is
 * flagged CHIMERA_FLAG_NO_MULTIMATCH, the attached Hyperscan scratch is then
 * (re)allocated through hs_alloc_scratch().
 *
 * On any failure after validation the scratch is released and \a *scratch is
 * set to NULL.
 *
 * \return CH_SUCCESS; CH_INVALID for NULL arguments or a scratch failing
 *         validation; CH_SCRATCH_IN_USE if the scratch is marked in use;
 *         otherwise the error from the failing database check or allocation.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *hydb,
                                     ch_scratch_t **scratch) {
    if (!hydb || !scratch) {
        DEBUG_PRINTF("invalid args\n");
        return CH_INVALID;
    }

    DEBUG_PRINTF("hydb=%p, &scratch=%p\n", hydb, scratch);
    ch_error_t rv = hydbIsValid(hydb);
    if (rv != CH_SUCCESS) {
        DEBUG_PRINTF("invalid database\n");
        return rv;
    }

    if (*scratch != NULL) {
        /* has to be aligned before we can do anything with it */
        if (!ISALIGNED_CL(*scratch)) {
            return CH_INVALID;
        }
        if ((*scratch)->magic != CH_SCRATCH_MAGIC) {
            return CH_INVALID;
        }
        if (markScratchInUse(*scratch)) {
            return CH_SCRATCH_IN_USE;
        }
    }

    // We allocate a prototype of the scratch header to do our sizing with.
    ch_scratch_t *proto;
    ch_scratch_t *proto_tmp = ch_scratch_alloc(sizeof(ch_scratch_t) + 256);
    ch_error_t proto_ret = ch_check_alloc(proto_tmp);
    if (proto_ret != CH_SUCCESS) {
        ch_scratch_free(proto_tmp);
        if (*scratch) {
            // The caller's pointer is the aligned header; the allocator must
            // be handed the raw pointer it returned, stored in scratch_alloc.
            // The attached Hyperscan scratch would otherwise be leaked.
            hs_free_scratch((*scratch)->multi_scratch);
            ch_scratch_free((*scratch)->scratch_alloc);
        }
        *scratch = NULL;
        return proto_ret;
    }

    proto = ROUNDUP_PTR(proto_tmp, 64);

    int resize = 0;
    if (*scratch) {
        *proto = **scratch;
    } else {
        memset(proto, 0, sizeof(*proto));
        resize = 1;
    }
    proto->scratch_alloc = (char *)proto_tmp;

    const struct ch_bytecode *db = ch_get_bytecode(hydb);

    if (db->maxCaptureGroups > proto->maxCaptureGroups) {
        proto->maxCaptureGroups = db->maxCaptureGroups;
        resize = 1;
    }

    if (db->patternCount > proto->patternCount) {
        proto->patternCount = db->patternCount;
        proto->activeSize = db->activeSize;
        resize = 1;
    }

    if (resize) {
        if (*scratch) {
            ch_scratch_free((*scratch)->scratch_alloc);
        }

        ch_error_t alloc_ret = alloc_scratch(proto, scratch);
        if (alloc_ret != CH_SUCCESS) {
            // The old scratch is gone and no new one exists: proto holds the
            // last reference to any attached Hyperscan scratch.
            hs_free_scratch(proto->multi_scratch);
            ch_scratch_free(proto_tmp);
            *scratch = NULL;
            return alloc_ret;
        }
        ch_scratch_free(proto_tmp);
    } else {
        ch_scratch_free(proto_tmp);
        unmarkScratchInUse(*scratch);
    }

    if (db->flags & CHIMERA_FLAG_NO_MULTIMATCH) {
        // NOTE(review): if this scratch was previously sized for a database
        // that does use multi_scratch, this drops that pointer without
        // freeing it -- confirm whether that reuse case is supported.
        (*scratch)->multi_scratch = NULL;
        return CH_SUCCESS;
    }

    // We may still have to realloc the underlying Hyperscan scratch.
    rv = hs_alloc_scratch(getHyperscanDatabase(db),
                          &(*scratch)->multi_scratch);
    if (rv != HS_SUCCESS) {
        DEBUG_PRINTF("hs_alloc_scratch for multi_scratch failed\n");
        hs_free_scratch((*scratch)->multi_scratch);
        ch_scratch_free((*scratch)->scratch_alloc);
        *scratch = NULL;
        return rv;
    }

    return CH_SUCCESS;
}
/**
 * \brief Allocate a new scratch with the same sizing as \a src.
 *
 * The Chimera part is laid out by alloc_scratch(); if \a src has a Hyperscan
 * scratch attached, it is cloned with hs_clone_scratch().
 *
 * \param src  Valid scratch to clone.
 * \param dest Receives the clone; set to NULL on failure.
 * \return CH_SUCCESS, CH_INVALID for bad arguments, or the error from the
 *         failing allocation.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src,
                                     ch_scratch_t **dest) {
    if (!dest || !src || !ISALIGNED_CL(src) ||
        src->magic != CH_SCRATCH_MAGIC) {
        DEBUG_PRINTF("scratch invalid\n");
        return CH_INVALID;
    }

    ch_error_t ret = alloc_scratch(src, dest);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("alloc_scratch failed\n");
        *dest = NULL;
        return ret;
    }

    if (src->multi_scratch) {
        // alloc_scratch() copied src's pointer into the header; clear it so
        // hs_clone_scratch() does not see the source's scratch as our own.
        (*dest)->multi_scratch = NULL;
        ret = hs_clone_scratch(src->multi_scratch, &(*dest)->multi_scratch);
        if (ret != HS_SUCCESS) {
            DEBUG_PRINTF("hs_clone_scratch(multi_scratch,...) failed\n");
            // Free the raw allocation (not the aligned header pointer) and
            // do not leave the caller holding a dangling pointer.
            ch_scratch_free((*dest)->scratch_alloc);
            *dest = NULL;
            return ret;
        }
    }

    return CH_SUCCESS;
}
/**
 * \brief Release a scratch and any Hyperscan scratch attached to it.
 *
 * A NULL \a scratch is accepted and reported as success. The return value is
 * the result of hs_free_scratch() when a Hyperscan scratch is attached,
 * CH_SUCCESS otherwise.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch) {
    if (!scratch) {
        return CH_SUCCESS;
    }

    /* has to be aligned before we can do anything with it */
    if (!ISALIGNED_CL(scratch) || scratch->magic != CH_SCRATCH_MAGIC) {
        return CH_INVALID;
    }
    if (markScratchInUse(scratch)) {
        return CH_SCRATCH_IN_USE;
    }

    ch_error_t status = CH_SUCCESS;
    if (scratch->multi_scratch) {
        status = hs_free_scratch(scratch->multi_scratch);
    }

    // Invalidate the header before handing the memory back.
    scratch->magic = 0;
    assert(scratch->scratch_alloc);
    DEBUG_PRINTF("scratch %p is really at %p : freeing\n", scratch,
                 scratch->scratch_alloc);
    ch_scratch_free(scratch->scratch_alloc);

    return status;
}
/** Report the total size of a scratch, used for info by our internal tools.
 * Note that in the hybrid matcher the scratch is definitely not a contiguous
 * memory region: the result is the Chimera allocation plus the size of the
 * attached Hyperscan scratch, if there is one. */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch, size_t *size) {
    if (!size || !scratch || !ISALIGNED_CL(scratch) ||
        scratch->magic != CH_SCRATCH_MAGIC) {
        return CH_INVALID;
    }

    ch_error_t ret = CH_SUCCESS;
    size_t multi_size = 0;
    if (scratch->multi_scratch) {
        ret = hs_scratch_size(scratch->multi_scratch, &multi_size);
        if (ret) {
            // Do not trust the output of a failed query.
            multi_size = 0;
        }
    }

    *size = scratch->scratchSize + multi_size;
    return ret;
}

119
chimera/ch_scratch.h Normal file
View File

@ -0,0 +1,119 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Scratch and associated data structures.
*
* This header gets pulled into many places (many deep, slow to compile
* places). Try to keep the included headers under control.
*/
#ifndef CH_SCRATCH_H_
#define CH_SCRATCH_H_
#include "ch_common.h"
#include "ch_runtime.h"
#ifdef __cplusplus
extern "C"
{
#endif
#define CH_SCRATCH_MAGIC 0x554F4259 //!< Magic number stored in \ref ch_scratch
/** \brief One entry of the \ref match_pq priority queue. */
struct queue_item {
int from; /**< used to store the start location. */
int to; /**< used to store the current location. */
u32 id; /**< pattern index. */
};
/** \brief Priority queue of \ref queue_item entries, stored in the scratch. */
struct match_pq {
struct queue_item *item; /**< storage for the queue entries. */
u32 size; /**< current size of the priority queue */
};
/** \brief Information about a pattern stored at runtime when a match is
* encountered.
*
* One of these exists per pattern in the scratch's patternData array. */
struct ch_patterndata {
struct ch_capture *match; //!< buffered group info
u32 groupCount; //!< number of capturing groups
u32 scanStart; //!< start of match window (still to be single-scanned).
};
/** \brief Scratch space header for Chimera.
*
* The pointer members refer to regions that follow this header inside the
* same allocation, except for \c multi_scratch, which is a separate Hyperscan
* scratch, and \c scratch_alloc, which is the raw allocation itself. */
struct ch_scratch {
u32 magic; //!< must be \ref CH_SCRATCH_MAGIC
u8 in_use; /**< non-zero when being used by an API call. */
struct hs_scratch *multi_scratch; //!< for hyperscan scratch.
int *ovector; //!< maximally-sized ovector for PCRE usage.
struct ch_capture *captured; //!< max-sized capture group struct.
u8 *active; //!< active multibit.
struct ch_patterndata *patternData; //!< per-pattern match data, indexed by
//!< pattern ID.
struct match_pq pq; //!< priority queue to ensure matching ordering
u32 patternCount; //!< number of patterns, used to size active multibit
u32 activeSize; //!< size of active multibit
u32 maxCaptureGroups; //!< largest num of capturing groups required
u32 scratchSize; //!< size of allocation
// NOTE(review): scratchSize is filled from a size_t
// when the scratch is allocated -- confirm that
// truncation to u32 cannot occur in practice.
int ret; //!< return value in Hyperscan callback
char *scratch_alloc; /**< pointer as returned by the allocator; this is
* what must be passed to the matching free. */
};
/**
 * \brief Claim the scratch for the current API call.
 *
 * Sets the in_use flag if it was clear. Returns zero when the claim
 * succeeded and non-zero when the scratch was already in use.
 */
static really_inline
char markScratchInUse(struct ch_scratch *scratch) {
    DEBUG_PRINTF("marking scratch as in use\n");
    assert(scratch && scratch->magic == CH_SCRATCH_MAGIC);

    if (!scratch->in_use) {
        scratch->in_use = 1;
        return 0;
    }

    DEBUG_PRINTF("scratch already in use!\n");
    return 1;
}
/**
* \brief Mark scratch as no longer in use.
*
* The scratch is expected to have been claimed with markScratchInUse(); this
* is checked by assertion only.
*/
static really_inline
void unmarkScratchInUse(struct ch_scratch *scratch) {
DEBUG_PRINTF("marking scratch as not in use\n");
assert(scratch && scratch->magic == CH_SCRATCH_MAGIC);
assert(scratch->in_use == 1);
scratch->in_use = 0;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_SCRATCH_H_ */

View File

@ -61,5 +61,3 @@ else ()
return () return ()
endif () endif ()
endif (PCRE_BUILD_SOURCE) endif (PCRE_BUILD_SOURCE)
set (PCRE_CHECKED TRUE PARENT_SCOPE)

View File

@ -46,7 +46,7 @@ using namespace std;
namespace ue2 { namespace ue2 {
u32 mmbit_size(u32 total_bits) { u32 HS_CDECL mmbit_size(u32 total_bits) {
if (total_bits > MMB_MAX_BITS) { if (total_bits > MMB_MAX_BITS) {
throw ResourceLimitError(); throw ResourceLimitError();
} }

View File

@ -33,6 +33,7 @@
#ifndef MULTIBIT_BUILD_H #ifndef MULTIBIT_BUILD_H
#define MULTIBIT_BUILD_H #define MULTIBIT_BUILD_H
#include "hs_common.h"
#include "multibit_internal.h" #include "multibit_internal.h"
#include "hash.h" #include "hash.h"
@ -62,8 +63,10 @@ namespace ue2 {
* *
* This will throw a resource limit assertion if the requested mmbit is too * This will throw a resource limit assertion if the requested mmbit is too
* large. * large.
*
* TODO: temporary HS_CDECL added for Chimera on Windows; this needs improving.
*/ */
u32 mmbit_size(u32 total_bits); u32 HS_CDECL mmbit_size(u32 total_bits);
/** \brief Construct a sparse iterator over the values in \a bits for a /** \brief Construct a sparse iterator over the values in \a bits for a
* multibit of size \a total_bits. */ * multibit of size \a total_bits. */

View File

@ -31,6 +31,8 @@ SET(hsbench_SOURCES
common.h common.h
data_corpus.cpp data_corpus.cpp
data_corpus.h data_corpus.h
engine.cpp
engine.h
engine_hyperscan.cpp engine_hyperscan.cpp
engine_hyperscan.h engine_hyperscan.h
heapstats.cpp heapstats.cpp
@ -45,6 +47,23 @@ SET(hsbench_SOURCES
timer.h timer.h
) )
if (BUILD_CHIMERA)
add_definitions(-DHS_HYBRID)
SET(hsbench_SOURCES
${hsbench_SOURCES}
engine_chimera.cpp
engine_chimera.h
engine_pcre.cpp
engine_pcre.h
)
endif()
add_executable(hsbench ${hsbench_SOURCES}) add_executable(hsbench ${hsbench_SOURCES})
target_link_libraries(hsbench hs databaseutil expressionutil ${SQLITE3_LDFLAGS} if (BUILD_CHIMERA)
${CMAKE_THREAD_LIBS_INIT}) include_directories(${PCRE_INCLUDE_DIRS})
target_link_libraries(hsbench hs chimera ${PCRE_LDFLAGS} databaseutil
expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT})
else()
target_link_libraries(hsbench hs databaseutil expressionutil
${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT})
endif()

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -42,6 +42,12 @@ extern bool forceEditDistance;
extern unsigned editDistance; extern unsigned editDistance;
extern bool printCompressSize; extern bool printCompressSize;
/** Structure for the result of a single complete scan. Both fields start at
 * zero so that engines can accumulate into a freshly constructed entry. */
struct ResultEntry {
double seconds = 0; //!< Time taken for scan.
unsigned int matches = 0; //!< Count of matches found.
};
struct SqlFailure { struct SqlFailure {
explicit SqlFailure(const std::string &s) : message(s) {} explicit SqlFailure(const std::string &s) : message(s) {}
std::string message; std::string message;

35
tools/hsbench/engine.cpp Normal file
View File

@ -0,0 +1,35 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "engine.h"
// Out-of-line definitions of the virtual destructors declared in engine.h.
EngineContext::~EngineContext() = default;

EngineStream::~EngineStream() = default;

Engine::~Engine() = default;

94
tools/hsbench/engine.h Normal file
View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef ENGINE_H
#define ENGINE_H
#include "common.h"
#include "sqldb.h"
#include <memory>
#include <string>
#include <vector>
#include <boost/core/noncopyable.hpp>
/** Engines have an engine context which is allocated on a per-thread basis.
 * This base class carries no state; concrete engines derive from it. */
class EngineContext : boost::noncopyable {
public:
virtual ~EngineContext();
};
/** Streaming mode scans have persistent stream state associated with them. */
class EngineStream : boost::noncopyable {
public:
virtual ~EngineStream();
// NOTE(review): presumably the stream number/ID; it has no initializer here,
// so whoever creates the stream must set it -- confirm against callers.
unsigned int sn;
};
/** Benchmarking engine: abstract interface implemented by each matcher
 * backend. All operations are const; mutable per-thread state is held in the
 * EngineContext and per-stream state in the EngineStream. */
class Engine : boost::noncopyable {
public:
virtual ~Engine();
// allocate an EngineContext
virtual std::unique_ptr<EngineContext> makeContext() const = 0;
// non-streaming scan of a single block of len bytes; results are reported
// through the ResultEntry.
virtual void scan(const char *data, unsigned len, unsigned blockId,
ResultEntry &results, EngineContext &ectx) const = 0;
// vectoring scan over count buffers, data[i] being len[i] bytes long
virtual void scan_vectored(const char *const *data,
const unsigned int *len, unsigned int count,
unsigned int streamId, ResultEntry &result,
EngineContext &ectx) const = 0;
// stream open
virtual std::unique_ptr<EngineStream> streamOpen(EngineContext &ectx,
unsigned id) const = 0;
// stream close; takes ownership of the stream
virtual void streamClose(std::unique_ptr<EngineStream> stream,
ResultEntry &result) const = 0;
// stream compress and expand
virtual void streamCompressExpand(EngineStream &stream,
std::vector<char> &temp) const = 0;
// streaming scan
virtual void streamScan(EngineStream &stream, const char *data,
unsigned int len, unsigned int id,
ResultEntry &result) const = 0;
// print summary statistics for the engine
virtual void printStats() const = 0;
// record the engine's statistics in the given SQL database
virtual void sqlStats(SqlDB &db) const = 0;
};
#endif // ENGINE_H

View File

@ -0,0 +1,314 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "ExpressionParser.h"
#include "common.h"
#include "engine_chimera.h"
#include "expressions.h"
#include "heapstats.h"
#include "sqldb.h"
#include "timer.h"
#include "chimera/ch_database.h"
#include "util/make_unique.h"
using namespace std;
/**
 * Allocate the Chimera scratch used by this context.
 *
 * Allocation failure is fatal: a context without scratch cannot be used for
 * scanning, and the check must also hold in builds where assert() is
 * compiled out.
 */
EngineCHContext::EngineCHContext(const ch_database_t *db) {
    // ch_alloc_scratch() treats a non-NULL input as an existing scratch, so
    // make sure it starts from NULL regardless of how the member is declared.
    scratch = nullptr;
    ch_error_t rv = ch_alloc_scratch(db, &scratch);
    if (rv != CH_SUCCESS || !scratch) {
        printf("Fatal error: ch_alloc_scratch returned error %d\n", rv);
        abort();
    }
}
/** Release the scratch allocated by the constructor. */
EngineCHContext::~EngineCHContext() {
ch_free_scratch(scratch);
}
namespace /* anonymous */ {
/** Scan context structure passed to the onMatch callback function. */
struct ScanCHContext {
ScanCHContext(unsigned id_in, ResultEntry &result_in)
: id(id_in), result(result_in) {}
unsigned id; //!< ID of the block being scanned, echoed with each match.
ResultEntry &result; //!< Result entry whose match count is incremented.
};
} // namespace
/**
 * Match callback used when "echo matches" is off: counts the match in the
 * scan context's result and asks Chimera to continue (returns 0).
 */
static
int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int,
            unsigned int, const ch_capture_t *, void *ctx) {
    auto *scanCtx = static_cast<ScanCHContext *>(ctx);
    assert(scanCtx);
    ++scanCtx->result.matches;
    return 0;
}
/**
 * Match callback used when "echo matches" is enabled: counts the match, prints
 * the block ID, end offset and expression ID, then asks Chimera to continue
 * (returns 0).
 */
static
int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to,
                unsigned int, unsigned int, const ch_capture_t *, void *ctx) {
    auto *scanCtx = static_cast<ScanCHContext *>(ctx);
    assert(scanCtx);
    ++scanCtx->result.matches;
    printf("Match @%u:%llu for %u\n", scanCtx->id, to, id);
    return 0;
}
/** Take ownership of a compiled Chimera database and its compile statistics.
 * The database is freed by the destructor. */
EngineChimera::EngineChimera(ch_database_t *db_in, CompileCHStats cs)
: db(db_in), compile_stats(move(cs)) {
assert(db);
}
/** Free the database handed to the constructor. */
EngineChimera::~EngineChimera() {
ch_free_database(db);
}
/** Create a context (holding a freshly allocated scratch) for this engine's
 * database. */
unique_ptr<EngineContext> EngineChimera::makeContext() const {
return ue2::make_unique<EngineCHContext>(db);
}
void EngineChimera::scan(const char *data, unsigned int len, unsigned int id,
ResultEntry &result, EngineContext &ectx) const {
assert(data);
auto &ctx = static_cast<EngineCHContext &>(ectx);
ScanCHContext sc(id, result);
auto callback = echo_matches ? onMatchEcho : onMatch;
ch_error_t rv = ch_scan(db, data, len, 0, ctx.scratch, callback, nullptr,
&sc);
if (rv != CH_SUCCESS) {
printf("Fatal error: ch_scan returned error %d\n", rv);
abort();
}
}
// vectoring scan: not available for this engine; prints a message and aborts.
void EngineChimera::scan_vectored(UNUSED const char *const *data,
UNUSED const unsigned int *len,
UNUSED unsigned int count,
UNUSED unsigned int streamId,
UNUSED ResultEntry &result,
UNUSED EngineContext &ectx) const {
printf("Hybrid matcher can't support vectored mode.\n");
abort();
}
// stream open: streaming is not available for this engine; prints a message
// and aborts, so no stream is ever returned.
unique_ptr<EngineStream> EngineChimera::streamOpen(UNUSED EngineContext &ectx,
UNUSED unsigned id) const {
printf("Hybrid matcher can't stream.\n");
abort();
}
// stream close: streaming is not available for this engine; prints a message
// and aborts.
void EngineChimera::streamClose(UNUSED unique_ptr<EngineStream> stream,
UNUSED ResultEntry &result) const {
printf("Hybrid matcher can't stream.\n");
abort();
}
// streaming scan: streaming is not available for this engine; prints a
// message and aborts.
void EngineChimera::streamScan(UNUSED EngineStream &stream,
UNUSED const char *data,
UNUSED unsigned len, UNUSED unsigned id,
UNUSED ResultEntry &result) const {
printf("Hybrid matcher can't stream.\n");
abort();
}
// stream compress and expand: streaming is not available for this engine;
// prints a message and aborts.
void EngineChimera::streamCompressExpand(UNUSED EngineStream &stream,
UNUSED vector<char> &temp) const {
printf("Hybrid matcher can't stream.\n");
abort();
}
/** Print the compile-time statistics gathered when the engine was built. */
void EngineChimera::printStats() const {
// Output summary information.
// The signature set name is optional and only printed when present.
if (!compile_stats.sigs_name.empty()) {
printf("Signature set: %s\n", compile_stats.sigs_name.c_str());
}
printf("Signatures: %s\n", compile_stats.signatures.c_str());
printf("Chimera info: %s\n", compile_stats.db_info.c_str());
// The ' flag in the conversions below requests locale-specific digit
// grouping.
printf("Expression count: %'zu\n", compile_stats.expressionCount);
printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize);
printf("Database CRC: 0x%x\n", compile_stats.crc32);
printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize);
printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs);
printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize);
}
/** Insert the compile-time statistics as one row of the Compile table. */
void EngineChimera::sqlStats(SqlDB &sqldb) const {
// The CRC is stored as a hexadecimal string with a 0x prefix.
ostringstream crc;
crc << "0x" << hex << compile_stats.crc32;
static const string Q =
"INSERT INTO Compile ("
"sigsName, signatures, dbInfo, exprCount, dbSize, crc,"
"scratchSize, compileSecs, peakMemory) "
"VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)";
// Arguments are passed in the same order as the numbered placeholders.
sqldb.insert_all(Q, compile_stats.sigs_name, compile_stats.signatures,
compile_stats.db_info, compile_stats.expressionCount,
compile_stats.compiledSize, crc.str(),
compile_stats.scratchSize, compile_stats.compileSecs,
compile_stats.peakMemorySize);
}
/**
 * Compile the given expressions into a Chimera database and wrap it in an
 * EngineChimera together with its compile statistics.
 *
 * \param expressions Map from expression ID to expression text.
 * \param name        Signature source name; when \a sigs_name is non-empty
 *                    only the part after the last '/' is recorded.
 * \param sigs_name   Optional signature set name.
 * \return The engine, or nullptr if parsing, compilation or any of the
 *         follow-up database/scratch queries fails. Nothing is leaked on the
 *         failure paths.
 */
unique_ptr<EngineChimera>
buildEngineChimera(const ExpressionMap &expressions, const string &name,
                   const string &sigs_name) {
    if (expressions.empty()) {
        assert(0);
        return nullptr;
    }

    long double compileSecs = 0.0;
    size_t compiledSize = 0;
    size_t scratchSize = 0;
    unsigned int peakMemorySize = 0;
    string db_info;

    ch_database_t *db = nullptr;
    ch_error_t err;

    const unsigned int count = expressions.size();

    vector<string> exprs;
    vector<unsigned int> flags, ids;

    for (const auto &m : expressions) {
        string expr;
        unsigned int f = 0;
        hs_expr_ext extparam; // only used to reject extended parameters
        extparam.flags = 0;
        if (!readExpression(m.second, expr, &f, &extparam)) {
            printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(),
                   m.first);
            return nullptr;
        }

        if (extparam.flags) {
            printf("Error parsing PCRE with extended flags: %s (id %u)\n",
                   m.second.c_str(), m.first);
            return nullptr;
        }
        exprs.push_back(expr);
        ids.push_back(m.first);
        flags.push_back(f);
    }

    // Our compiler takes an array of plain ol' C strings.
    vector<const char *> patterns(count);
    for (unsigned int i = 0; i < count; i++) {
        patterns[i] = exprs[i].c_str();
    }

    Timer timer;
    timer.start();

    // Capture groups by default
    unsigned int mode = CH_MODE_GROUPS;
    ch_compile_error_t *compile_err = nullptr;
    err = ch_compile_multi(patterns.data(), flags.data(), ids.data(),
                           count, mode, nullptr, &db, &compile_err);

    timer.complete();
    compileSecs = timer.seconds();
    peakMemorySize = getPeakHeap();

    if (err == CH_COMPILER_ERROR) {
        if (compile_err->expression >= 0) {
            printf("Compile error for signature #%u: %s\n",
                   compile_err->expression, compile_err->message);
        } else {
            printf("Compile error: %s\n", compile_err->message);
        }
        ch_free_compile_error(compile_err);
        return nullptr;
    }

    // Any other failure also means there is no usable database; carrying on
    // would dereference an unset db pointer.
    if (err != CH_SUCCESS) {
        printf("Compile error: ch_compile_multi returned error %d\n", err);
        if (compile_err) {
            ch_free_compile_error(compile_err);
        }
        return nullptr;
    }

    err = ch_database_size(db, &compiledSize);
    if (err != CH_SUCCESS) {
        ch_free_database(db);
        return nullptr;
    }
    assert(compiledSize > 0);

    char *info = nullptr;
    err = ch_database_info(db, &info);
    if (err != CH_SUCCESS) {
        ch_free_database(db);
        return nullptr;
    }
    db_info = string(info);
    free(info);

    // Allocate scratch temporarily to find its size: this is a good test
    // anyway.
    ch_scratch_t *scratch = nullptr;
    err = ch_alloc_scratch(db, &scratch);
    if (err != CH_SUCCESS) {
        ch_free_database(db);
        return nullptr;
    }

    err = ch_scratch_size(scratch, &scratchSize);
    // The temporary scratch is released whether or not the query succeeded.
    ch_free_scratch(scratch);
    if (err != CH_SUCCESS) {
        ch_free_database(db);
        return nullptr;
    }

    // Collect summary information.
    CompileCHStats cs;
    cs.sigs_name = sigs_name;
    if (!sigs_name.empty()) {
        // find_last_of() returns npos when there is no '/', and npos + 1
        // wraps to 0, so the whole name is kept in that case.
        const auto pos = name.find_last_of('/');
        cs.signatures = name.substr(pos + 1);
    } else {
        cs.signatures = name;
    }
    cs.db_info = db_info;
    cs.expressionCount = expressions.size();
    cs.compiledSize = compiledSize;
    cs.scratchSize = scratchSize;
    cs.compileSecs = compileSecs;
    cs.peakMemorySize = peakMemorySize;

    return ue2::make_unique<EngineChimera>(db, move(cs));
}

View File

@ -0,0 +1,103 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef ENGINECHIMERA_H
#define ENGINECHIMERA_H
#include "expressions.h"
#include "engine.h"
#include "chimera/ch.h"
#include <memory>
#include <string>
#include <vector>
/** Information about the database compile, reported by printStats/sqlStats. */
struct CompileCHStats {
// Name of the signature set; may be empty.
std::string sigs_name;
// Signature file name (final path component when sigs_name is set).
std::string signatures;
// Description string obtained from ch_database_info().
std::string db_info;
// Number of expressions that were compiled.
size_t expressionCount = 0;
// Database size in bytes as reported by ch_database_size().
size_t compiledSize = 0;
// Printed as "Database CRC".
// NOTE(review): buildEngineChimera does not appear to assign this, so it
// would stay 0 — confirm.
uint32_t crc32 = 0;
// Scratch size in bytes as reported by ch_scratch_size().
size_t scratchSize = 0;
// Time spent in ch_compile_multi(), in seconds.
long double compileSecs = 0;
// Value of getPeakHeap() sampled right after compilation; printed as bytes.
unsigned int peakMemorySize = 0;
};
/** Engine context which is allocated on a per-thread basis. */
class EngineCHContext : public EngineContext{
public:
// Defined in the .cpp file; presumably allocates `scratch` for `db`, mirroring
// EngineHSContext — TODO confirm.
explicit EngineCHContext(const ch_database_t *db);
// Defined in the .cpp file; presumably releases `scratch` — TODO confirm.
~EngineCHContext();
// Chimera scratch space owned by this context; null until allocated.
ch_scratch_t *scratch = nullptr;
};
/** Chimera Engine for scanning data. */
class EngineChimera : public Engine {
public:
// Takes the compiled database and the statistics gathered while building it
// (see buildEngineChimera).
explicit EngineChimera(ch_database_t *db, CompileCHStats cs);
~EngineChimera();
// Creates the per-thread context that is passed back to the scan calls.
std::unique_ptr<EngineContext> makeContext() const;
// Scans a single buffer; `ectx` must come from makeContext().
// NOTE(review): implementation is outside this view — semantics of `id` and
// `result` presumably match the other engines; confirm.
void scan(const char *data, unsigned int len, unsigned int id,
ResultEntry &result, EngineContext &ectx) const;
// Vectored and streaming entry points of the Engine interface.
// NOTE(review): implementations are outside this view; whether Chimera
// supports these modes should be confirmed in the .cpp file.
void scan_vectored(const char *const *data, const unsigned int *len,
unsigned int count, unsigned int streamId,
ResultEntry &result, EngineContext &ectx) const;
std::unique_ptr<EngineStream> streamOpen(EngineContext &ectx,
unsigned id) const;
void streamClose(std::unique_ptr<EngineStream> stream,
ResultEntry &result) const;
void streamCompressExpand(EngineStream &stream,
std::vector<char> &temp) const;
void streamScan(EngineStream &stream, const char *data, unsigned int len,
unsigned int id, ResultEntry &result) const;
// Prints the compile summary with printf.
void printStats() const;
// Inserts the compile summary as a row of the "Compile" table.
void sqlStats(SqlDB &db) const;
private:
// Compiled Chimera database handed to the constructor.
ch_database_t *db;
// Statistics collected when the database was built.
CompileCHStats compile_stats;
};
/**
 * Compile `expressions` with ch_compile_multi() and return an engine wrapping
 * the resulting database, or nullptr on failure. `name` and `sigs_name` are
 * only used for the reported statistics.
 */
std::unique_ptr<EngineChimera>
buildEngineChimera(const ExpressionMap &expressions, const std::string &name,
const std::string &sigs_name);
#endif // ENGINECHIMERA_H

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -57,20 +57,22 @@
using namespace std; using namespace std;
EngineContext::EngineContext(const hs_database_t *db) { EngineHSContext::EngineHSContext(const hs_database_t *db) {
hs_alloc_scratch(db, &scratch); hs_alloc_scratch(db, &scratch);
assert(scratch); assert(scratch);
} }
EngineContext::~EngineContext() { EngineHSContext::~EngineHSContext() {
hs_free_scratch(scratch); hs_free_scratch(scratch);
} }
EngineHSStream::~EngineHSStream() { }
namespace /* anonymous */ { namespace /* anonymous */ {
/** Scan context structure passed to the onMatch callback function. */ /** Scan context structure passed to the onMatch callback function. */
struct ScanContext { struct ScanHSContext {
ScanContext(unsigned id_in, ResultEntry &result_in, ScanHSContext(unsigned id_in, ResultEntry &result_in,
const EngineStream *stream_in) const EngineStream *stream_in)
: id(id_in), result(result_in), stream(stream_in) {} : id(id_in), result(result_in), stream(stream_in) {}
unsigned id; unsigned id;
@ -87,7 +89,7 @@ struct ScanContext {
static static
int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int, int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int,
void *ctx) { void *ctx) {
ScanContext *sc = static_cast<ScanContext *>(ctx); ScanHSContext *sc = static_cast<ScanHSContext *>(ctx);
assert(sc); assert(sc);
sc->result.matches++; sc->result.matches++;
@ -101,7 +103,7 @@ int onMatch(unsigned int, unsigned long long, unsigned long long, unsigned int,
static static
int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to, int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to,
unsigned int, void *ctx) { unsigned int, void *ctx) {
ScanContext *sc = static_cast<ScanContext *>(ctx); ScanHSContext *sc = static_cast<ScanHSContext *>(ctx);
assert(sc); assert(sc);
sc->result.matches++; sc->result.matches++;
@ -114,7 +116,7 @@ int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to,
return 0; return 0;
} }
EngineHyperscan::EngineHyperscan(hs_database_t *db_in, CompileStats cs) EngineHyperscan::EngineHyperscan(hs_database_t *db_in, CompileHSStats cs)
: db(db_in), compile_stats(std::move(cs)) { : db(db_in), compile_stats(std::move(cs)) {
assert(db); assert(db);
} }
@ -124,14 +126,15 @@ EngineHyperscan::~EngineHyperscan() {
} }
unique_ptr<EngineContext> EngineHyperscan::makeContext() const { unique_ptr<EngineContext> EngineHyperscan::makeContext() const {
return ue2::make_unique<EngineContext>(db); return ue2::make_unique<EngineHSContext>(db);
} }
void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id,
ResultEntry &result, EngineContext &ctx) const { ResultEntry &result, EngineContext &ectx) const {
assert(data); assert(data);
ScanContext sc(id, result, nullptr); EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
ScanHSContext sc(id, result, nullptr);
auto callback = echo_matches ? onMatchEcho : onMatch; auto callback = echo_matches ? onMatchEcho : onMatch;
hs_error_t rv = hs_scan(db, data, len, 0, ctx.scratch, callback, &sc); hs_error_t rv = hs_scan(db, data, len, 0, ctx.scratch, callback, &sc);
@ -144,11 +147,12 @@ void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id,
void EngineHyperscan::scan_vectored(const char *const *data, void EngineHyperscan::scan_vectored(const char *const *data,
const unsigned int *len, unsigned int count, const unsigned int *len, unsigned int count,
unsigned streamId, ResultEntry &result, unsigned streamId, ResultEntry &result,
EngineContext &ctx) const { EngineContext &ectx) const {
assert(data); assert(data);
assert(len); assert(len);
ScanContext sc(streamId, result, nullptr); EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
ScanHSContext sc(streamId, result, nullptr);
auto callback = echo_matches ? onMatchEcho : onMatch; auto callback = echo_matches ? onMatchEcho : onMatch;
hs_error_t rv = hs_error_t rv =
hs_scan_vector(db, data, len, count, 0, ctx.scratch, callback, &sc); hs_scan_vector(db, data, len, count, 0, ctx.scratch, callback, &sc);
@ -159,9 +163,10 @@ void EngineHyperscan::scan_vectored(const char *const *data,
} }
} }
unique_ptr<EngineStream> EngineHyperscan::streamOpen(EngineContext &ctx, unique_ptr<EngineStream> EngineHyperscan::streamOpen(EngineContext &ectx,
unsigned streamId) const { unsigned streamId) const {
auto stream = ue2::make_unique<EngineStream>(); EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
auto stream = ue2::make_unique<EngineHSStream>();
stream->ctx = &ctx; stream->ctx = &ctx;
hs_open_stream(db, 0, &stream->id); hs_open_stream(db, 0, &stream->id);
@ -170,17 +175,18 @@ unique_ptr<EngineStream> EngineHyperscan::streamOpen(EngineContext &ctx,
return nullptr; return nullptr;
} }
stream->sn = streamId; stream->sn = streamId;
return stream; return move(stream);
} }
void EngineHyperscan::streamClose(unique_ptr<EngineStream> stream, void EngineHyperscan::streamClose(unique_ptr<EngineStream> stream,
ResultEntry &result) const { ResultEntry &result) const {
assert(stream); assert(stream);
auto &s = static_cast<EngineStream &>(*stream); auto &s = static_cast<EngineHSStream &>(*stream);
EngineContext &ctx = *s.ctx; EngineContext &ectx = *s.ctx;
EngineHSContext &ctx = static_cast<EngineHSContext &>(ectx);
ScanContext sc(0, result, &s); ScanHSContext sc(0, result, &s);
auto callback = echo_matches ? onMatchEcho : onMatch; auto callback = echo_matches ? onMatchEcho : onMatch;
assert(s.id); assert(s.id);
@ -193,10 +199,10 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data,
ResultEntry &result) const { ResultEntry &result) const {
assert(data); assert(data);
auto &s = static_cast<EngineStream &>(stream); auto &s = static_cast<EngineHSStream &>(stream);
EngineContext &ctx = *s.ctx; EngineHSContext &ctx = *s.ctx;
ScanContext sc(id, result, &s); ScanHSContext sc(id, result, &s);
auto callback = echo_matches ? onMatchEcho : onMatch; auto callback = echo_matches ? onMatchEcho : onMatch;
hs_error_t rv = hs_error_t rv =
hs_scan_stream(s.id, data, len, 0, ctx.scratch, callback, &sc); hs_scan_stream(s.id, data, len, 0, ctx.scratch, callback, &sc);
@ -210,11 +216,12 @@ void EngineHyperscan::streamScan(EngineStream &stream, const char *data,
void EngineHyperscan::streamCompressExpand(EngineStream &stream, void EngineHyperscan::streamCompressExpand(EngineStream &stream,
vector<char> &temp) const { vector<char> &temp) const {
size_t used = 0; size_t used = 0;
hs_error_t err = hs_compress_stream(stream.id, temp.data(), temp.size(), auto &s = static_cast<EngineHSStream &>(stream);
hs_error_t err = hs_compress_stream(s.id, temp.data(), temp.size(),
&used); &used);
if (err == HS_INSUFFICIENT_SPACE) { if (err == HS_INSUFFICIENT_SPACE) {
temp.resize(used); temp.resize(used);
err = hs_compress_stream(stream.id, temp.data(), temp.size(), &used); err = hs_compress_stream(s.id, temp.data(), temp.size(), &used);
} }
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
@ -223,10 +230,10 @@ void EngineHyperscan::streamCompressExpand(EngineStream &stream,
} }
if (printCompressSize) { if (printCompressSize) {
printf("stream %u: compressed to %zu\n", stream.sn, used); printf("stream %u: compressed to %zu\n", s.sn, used);
} }
err = hs_reset_and_expand_stream(stream.id, temp.data(), temp.size(), err = hs_reset_and_expand_stream(s.id, temp.data(), temp.size(),
nullptr, nullptr, nullptr); nullptr, nullptr, nullptr);
if (err != HS_SUCCESS) { if (err != HS_SUCCESS) {
@ -469,7 +476,7 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode,
hs_free_scratch(scratch); hs_free_scratch(scratch);
// Collect summary information. // Collect summary information.
CompileStats cs; CompileHSStats cs;
cs.sigs_name = sigs_name; cs.sigs_name = sigs_name;
if (!sigs_name.empty()) { if (!sigs_name.empty()) {
const auto pos = name.find_last_of('/'); const auto pos = name.find_last_of('/');

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -30,22 +30,15 @@
#define ENGINEHYPERSCAN_H #define ENGINEHYPERSCAN_H
#include "expressions.h" #include "expressions.h"
#include "common.h" #include "engine.h"
#include "sqldb.h"
#include "hs_runtime.h" #include "hs_runtime.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
/** Structure for the result of a single complete scan. */
struct ResultEntry {
double seconds = 0; //!< Time taken for scan.
unsigned int matches = 0; //!< Count of matches found.
};
/** Infomation about the database compile */ /** Infomation about the database compile */
struct CompileStats { struct CompileHSStats {
std::string sigs_name; std::string sigs_name;
std::string signatures; std::string signatures;
std::string db_info; std::string db_info;
@ -60,38 +53,38 @@ struct CompileStats {
}; };
/** Engine context which is allocated on a per-thread basis. */ /** Engine context which is allocated on a per-thread basis. */
class EngineContext { class EngineHSContext : public EngineContext {
public: public:
explicit EngineContext(const hs_database_t *db); explicit EngineHSContext(const hs_database_t *db);
~EngineContext(); ~EngineHSContext();
hs_scratch_t *scratch = nullptr; hs_scratch_t *scratch = nullptr;
}; };
/** Streaming mode scans have persistent stream state associated with them. */ /** Streaming mode scans have persistent stream state associated with them. */
class EngineStream { class EngineHSStream : public EngineStream {
public: public:
~EngineHSStream();
hs_stream_t *id; hs_stream_t *id;
unsigned int sn; EngineHSContext *ctx;
EngineContext *ctx;
}; };
/** Hyperscan Engine for scanning data. */ /** Hyperscan Engine for scanning data. */
class EngineHyperscan { class EngineHyperscan : public Engine {
public: public:
explicit EngineHyperscan(hs_database_t *db, CompileStats cs); explicit EngineHyperscan(hs_database_t *db, CompileHSStats cs);
~EngineHyperscan(); ~EngineHyperscan();
std::unique_ptr<EngineContext> makeContext() const; std::unique_ptr<EngineContext> makeContext() const;
void scan(const char *data, unsigned int len, unsigned int id, void scan(const char *data, unsigned int len, unsigned int id,
ResultEntry &result, EngineContext &ctx) const; ResultEntry &result, EngineContext &ectx) const;
void scan_vectored(const char *const *data, const unsigned int *len, void scan_vectored(const char *const *data, const unsigned int *len,
unsigned int count, unsigned int streamId, unsigned int count, unsigned int streamId,
ResultEntry &result, EngineContext &ctx) const; ResultEntry &result, EngineContext &ectx) const;
std::unique_ptr<EngineStream> streamOpen(EngineContext &ctx, std::unique_ptr<EngineStream> streamOpen(EngineContext &ectx,
unsigned id) const; unsigned id) const;
void streamClose(std::unique_ptr<EngineStream> stream, void streamClose(std::unique_ptr<EngineStream> stream,
@ -109,7 +102,7 @@ public:
private: private:
hs_database_t *db; hs_database_t *db;
CompileStats compile_stats; CompileHSStats compile_stats;
}; };
namespace ue2 { namespace ue2 {

View File

@ -0,0 +1,388 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "common.h"
#include "engine_pcre.h"
#include "heapstats.h"
#include "huge.h"
#include "sqldb.h"
#include "timer.h"
#include "util/make_unique.h"
#include "util/unicode_def.h"
using namespace std;
/**
 * Allocate the per-thread PCRE output vector.
 *
 * The vector holds three ints for each of (capture_cnt + 1) slots, matching
 * the ovector size that EnginePCRE::scan passes to pcre_exec(). Allocation
 * failure is fatal, in line with the other fatal errors in this file.
 */
EnginePCREContext::EnginePCREContext(int capture_cnt) {
    ovec = (int *)malloc((capture_cnt + 1) * sizeof(int) * 3);
    if (!ovec) {
        printf("Fatal error: could not allocate PCRE output vector\n");
        abort();
    }
}
/** Release the output vector allocated with malloc() in the constructor. */
EnginePCREContext::~EnginePCREContext() {
free(ovec);
}
namespace /* anonymous */ {
/** Scan context structure passed to the onMatch callback function. */
struct ScanPCREContext {
ScanPCREContext(unsigned id_in, ResultEntry &result_in)
: id(id_in), result(result_in) {}
// Identifier of the current scan; echoed after '@' by onMatchEcho.
unsigned id;
// Result record whose match counter the callbacks increment.
ResultEntry &result;
};
} // namespace
/**
 * Match handler used when "echo matches" is off: simply counts the match in
 * the scan's result record. Always returns 0.
 */
static
int onMatch(ScanPCREContext *sc) {
    assert(sc);
    ResultEntry &res = sc->result;
    ++res.matches;
    return 0;
}
/**
 * Match handler used when "echo matches" is enabled: counts the match and
 * prints the scan id, the match end offset and the pattern id. Always
 * returns 0.
 */
static
int onMatchEcho(unsigned int id, unsigned long long, unsigned long long to,
                ScanPCREContext *sc) {
    assert(sc);
    ++sc->result.matches;

    const unsigned scanId = sc->id;
    printf("Match @%u:%llu for %u\n", scanId, to, id);
    return 0;
}
/**
 * Take ownership of the compiled patterns and compile statistics.
 * capture_cnt_in is the capture count used to size each context's output
 * vector (see makeContext and scan).
 */
EnginePCRE::EnginePCRE(vector<unique_ptr<PcreDB>> dbs_in, CompilePCREStats cs,
int capture_cnt_in)
: dbs(move(dbs_in)), compile_stats(move(cs)),
capture_cnt(capture_cnt_in) {}
/**
 * Release every compiled pattern and its study data.
 *
 * PCRE objects are released through the PCRE API rather than raw free():
 * pcre_free_study() also releases any JIT code attached to the extra block,
 * and pcre_free() is the deallocator paired with pcre_compile().
 */
EnginePCRE::~EnginePCRE() {
    for (auto &pcreDB : dbs) {
        pcre_free_study(pcreDB->extra);
        pcre_free(pcreDB->db);
    }
}
/** Create a per-thread context whose output vector is sized from capture_cnt. */
unique_ptr<EngineContext> EnginePCRE::makeContext() const {
return ue2::make_unique<EnginePCREContext>(capture_cnt);
}
/**
 * Scan one buffer with every compiled pattern in turn.
 *
 * Each pattern is run repeatedly with pcre_exec(), restarting after the
 * previous match, until it stops matching, the pattern is single-match
 * (highlander), or the restart offset moves past the end of the buffer.
 * Matches are counted in \p result (and printed when echo_matches is set).
 * Any PCRE error other than "no match" is fatal.
 *
 * \param ectx Must be the EnginePCREContext created by makeContext().
 */
void EnginePCRE::scan(const char *data, unsigned int len, unsigned int id,
                      ResultEntry &result, EngineContext &ectx) const {
    assert(data);

    auto &ctx = static_cast<EnginePCREContext &>(ectx);
    ScanPCREContext sc(id, result);

    int *const ovector = ctx.ovec;
    const int ovectorLen = 3 * (capture_cnt + 1);
    const int dataLen = (int)len;

    for (const auto &pcreDB : dbs) {
        const bool isUtf8 = pcreDB->utf8;
        const bool singleMatch = pcreDB->highlander;
        const int execFlags = 0;
        int offset = 0;
        int rv;

        for (;;) {
            rv = pcre_exec(pcreDB->db, pcreDB->extra, data, len, offset,
                           execFlags, ovector, ovectorLen);
            if (rv <= PCRE_ERROR_NOMATCH) {
                break;
            }

            const int from = ovector[0];
            const int to = ovector[1];
            assert(from <= to);

            if (echo_matches) {
                onMatchEcho(pcreDB->id, from, to, &sc);
            } else {
                onMatch(&sc);
            }

            // A single-match pattern is finished after its first match.
            if (singleMatch) {
                break;
            }

            // Resume after the match. An empty match must be stepped over so
            // that the scan always makes progress; in UTF-8 mode the step
            // skips any continuation bytes to land on a codepoint boundary.
            if (from == to) {
                offset = to + 1;
                if (isUtf8) {
                    while (offset < dataLen &&
                           ((data[offset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
                        ++offset;
                    }
                }
            } else {
                offset = to;
            }

            if (offset > dataLen) {
                break;
            }
        }

        if (rv < PCRE_ERROR_NOMATCH) {
            printf("Fatal error: pcre returned error %d\n", rv);
            abort();
        }
    }
}
// vectoring scan
// Not supported by the PCRE engine: prints a message and aborts the process.
// All parameters are ignored.
void EnginePCRE::scan_vectored(UNUSED const char *const *data,
UNUSED const unsigned int *len,
UNUSED unsigned int count,
UNUSED unsigned int streamId,
UNUSED ResultEntry &result,
UNUSED EngineContext &ectx) const {
printf("PCRE matcher can't support vectored mode.\n");
abort();
}
// Streaming is not supported by the PCRE engine: prints a message and aborts
// the process, so no stream is ever returned.
unique_ptr<EngineStream> EnginePCRE::streamOpen(UNUSED EngineContext &ectx,
UNUSED unsigned id) const {
printf("PCRE matcher can't stream.\n");
abort();
}
// Streaming is not supported by the PCRE engine: prints a message and aborts
// the process. All parameters are ignored.
void EnginePCRE::streamClose(UNUSED unique_ptr<EngineStream> stream,
UNUSED ResultEntry &result) const {
printf("PCRE matcher can't stream.\n");
abort();
}
// Streaming is not supported by the PCRE engine: prints a message and aborts
// the process. All parameters are ignored.
void EnginePCRE::streamScan(UNUSED EngineStream &stream,
UNUSED const char *data,
UNUSED unsigned len, UNUSED unsigned id,
UNUSED ResultEntry &result) const {
printf("PCRE matcher can't stream.\n");
abort();
}
// Streaming is not supported by the PCRE engine: prints a message and aborts
// the process. All parameters are ignored.
void EnginePCRE::streamCompressExpand(UNUSED EngineStream &stream,
UNUSED vector<char> &temp) const {
printf("PCRE matcher can't stream.\n");
abort();
}
void EnginePCRE::printStats() const {
// Output summary information.
if (!compile_stats.sigs_name.empty()) {
printf("Signature set: %s\n", compile_stats.sigs_name.c_str());
}
printf("Signatures: %s\n", compile_stats.signatures.c_str());
printf("PCRE info: %s\n", compile_stats.db_info.c_str());
printf("Expression count: %'zu\n", compile_stats.expressionCount);
printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize);
printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize);
printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs);
printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize);
}
/**
 * Store this engine's compile statistics as one row of the "Compile" table.
 *
 * No CRC is computed for PCRE, so the "crc" column receives an empty string.
 */
void EnginePCRE::sqlStats(SqlDB &sqldb) const {
    static const string Q =
        "INSERT INTO Compile ("
        "sigsName, signatures, dbInfo, exprCount, dbSize, crc,"
        "scratchSize, compileSecs, peakMemory) "
        "VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)";

    const CompilePCREStats &cs = compile_stats;

    // Nothing is ever streamed into this, so str() yields "".
    ostringstream noCrc;

    sqldb.insert_all(Q, cs.sigs_name, cs.signatures, cs.db_info,
                     cs.expressionCount, cs.compiledSize, noCrc.str(),
                     cs.scratchSize, cs.compileSecs, cs.peakMemorySize);
}
/**
 * Split an expression of the form "/pattern/flags" into its parts.
 *
 * On success \p expr is reduced to the bare pattern, \p flags receives the
 * PCRE compile options and \p db records the per-pattern options:
 *   s = PCRE_DOTALL, m = PCRE_MULTILINE, i = PCRE_CASELESS,
 *   8 = PCRE_UTF8 (also sets db.utf8), W = PCRE_UCP,
 *   H = single match only (sets db.highlander).
 *
 * \return false if the expression does not start with '/', has no separate
 *         closing '/', or contains an unknown flag character. \p expr and
 *         \p flags may have been modified even when false is returned.
 */
static
bool decodeExprPCRE(string &expr, unsigned *flags, struct PcreDB &db) {
    if (expr.empty() || expr[0] != '/') {
        return false;
    }

    // The closing delimiter must be a different slash from the opening one;
    // otherwise the whole pattern text would be misread as flags.
    const size_t end = expr.find_last_of('/');
    if (end == string::npos || end == 0) {
        return false;
    }

    const string strFlags = expr.substr(end + 1);

    // strip starting and trailing slashes and the flags
    expr.erase(end);
    expr.erase(0, 1);

    // decode the flags
    *flags = 0;
    for (const char c : strFlags) {
        switch (c) {
        case 's':
            *flags |= PCRE_DOTALL;
            break;
        case 'm':
            *flags |= PCRE_MULTILINE;
            break;
        case 'i':
            *flags |= PCRE_CASELESS;
            break;
        case '8':
            *flags |= PCRE_UTF8;
            db.utf8 = true;
            break;
        case 'W':
            *flags |= PCRE_UCP;
            break;
        case 'H':
            db.highlander = true;
            break;
        default:
            return false;
        }
    }

    return true;
}
/**
 * Compile every expression with PCRE and wrap the results, together with the
 * compile statistics, in an EnginePCRE.
 *
 * \param expressions Map from signature id to "/pattern/flags" text; must not
 *                    be empty.
 * \param name        Signature file name, reported in the statistics.
 * \param sigs_name   Optional signature-set name. When non-empty, only the
 *                    part of \p name after the last '/' is reported.
 * \return The engine, or nullptr if any expression fails to parse, compile,
 *         study or be queried. Everything compiled up to that point is
 *         released before returning.
 */
unique_ptr<EnginePCRE>
buildEnginePcre(const ExpressionMap &expressions, const string &name,
                const string &sigs_name) {
    if (expressions.empty()) {
        assert(0);
        return nullptr;
    }

    long double compileSecs = 0.0;
    size_t compiledSize = 0;
    unsigned int peakMemorySize = 0;
    string db_info("Version: ");
    db_info += string(pcre_version());

    vector<unique_ptr<PcreDB>> dbs;
    int capture_cnt = 0;

    // Failure-path cleanup: releases the pattern currently being built
    // (either pointer may be null) and every pattern already stored in dbs.
    const auto releaseAll = [&dbs](pcre *cur_db, pcre_extra *cur_extra) {
        pcre_free_study(cur_extra);
        if (cur_db) {
            pcre_free(cur_db);
        }
        for (auto &built : dbs) {
            pcre_free_study(built->extra);
            pcre_free(built->db);
        }
        dbs.clear();
    };

    Timer timer;
    timer.start();

    for (const auto &m : expressions) {
        string expr(m.second);
        unsigned int flags = 0;
        auto pcreDB = ue2::make_unique<PcreDB>();
        if (!decodeExprPCRE(expr, &flags, *pcreDB)) {
            printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(),
                   m.first);
            releaseAll(nullptr, nullptr);
            return nullptr;
        }

        const char *errp = nullptr;
        int erro = 0;
        pcre *db = pcre_compile(expr.c_str(), flags, &errp, &erro, NULL);

        if (!db) {
            printf("Compile error %s\n", errp);
            releaseAll(nullptr, nullptr);
            return nullptr;
        }

        pcre_extra *extra = pcre_study(db, PCRE_STUDY_JIT_COMPILE, &errp);
        if (errp) {
            printf("PCRE could not be studied: %s\n", errp);
            releaseAll(db, extra);
            return nullptr;
        }

        // pcre_study() may legitimately return no data. We still need an
        // extra block to carry the match limits, and it must start zeroed
        // because pcre_fullinfo() reads its flags below.
        if (!extra) {
            extra = (pcre_extra *)calloc(1, sizeof(pcre_extra));
            if (!extra) {
                printf("PCRE extra data could not be allocated\n");
                releaseAll(db, nullptr);
                return nullptr;
            }
        }

        int cap = 0; // PCRE_INFO_CAPTURECOUNT demands an int
        if (pcre_fullinfo(db, extra, PCRE_INFO_CAPTURECOUNT, &cap)) {
            printf("PCRE fullinfo error\n");
            releaseAll(db, extra);
            return nullptr;
        }
        assert(cap >= 0);
        capture_cnt = max(capture_cnt, cap);

        size_t db_size = 0;
        if (pcre_fullinfo(db, extra, PCRE_INFO_SIZE, &db_size)) {
            printf("PCRE fullinfo error\n");
            releaseAll(db, extra);
            return nullptr;
        }

        size_t study_size = 0;
        if (pcre_fullinfo(db, extra, PCRE_INFO_STUDYSIZE,
                          &study_size)) {
            printf("PCRE fullinfo error\n");
            releaseAll(db, extra);
            return nullptr;
        }
        compiledSize += db_size + study_size;

        pcreDB->id = m.first;
        pcreDB->db = db;

        // OR the limit bits in rather than assigning: plain assignment would
        // clear the study-data/JIT bits that pcre_study() set, so the work
        // requested above would never be used by pcre_exec().
        extra->flags |=
            PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
        extra->match_limit = 10000000;
        extra->match_limit_recursion = 1500;

        pcreDB->extra = extra;
        dbs.push_back(move(pcreDB));
    }

    timer.complete();
    compileSecs = timer.seconds();
    peakMemorySize = getPeakHeap();

    // Collect summary information.
    CompilePCREStats cs;
    cs.sigs_name = sigs_name;
    if (!sigs_name.empty()) {
        // npos + 1 wraps to 0, so a name without '/' is kept whole.
        const auto pos = name.find_last_of('/');
        cs.signatures = name.substr(pos + 1);
    } else {
        cs.signatures = name;
    }
    cs.db_info = db_info;
    cs.expressionCount = expressions.size();
    cs.compiledSize = compiledSize;
    // Same size as the output vector allocated by EnginePCREContext.
    cs.scratchSize = (capture_cnt + 1) * sizeof(int) * 3;
    cs.compileSecs = compileSecs;
    cs.peakMemorySize = peakMemorySize;

    return ue2::make_unique<EnginePCRE>(move(dbs), move(cs), capture_cnt);
}

114
tools/hsbench/engine_pcre.h Normal file
View File

@ -0,0 +1,114 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef ENGINEPCRE_H
#define ENGINEPCRE_H
#include "expressions.h"
#include "engine.h"
#include <pcre.h>
#include <memory>
#include <string>
#include <vector>
/** Information about the database compile, reported by printStats/sqlStats. */
struct CompilePCREStats {
// Name of the signature set; may be empty.
std::string sigs_name;
// Signature file name (final path component when sigs_name is set).
std::string signatures;
// "Version: " followed by the string returned by pcre_version().
std::string db_info;
// Number of expressions that were compiled.
size_t expressionCount = 0;
// Sum over all patterns of PCRE_INFO_SIZE + PCRE_INFO_STUDYSIZE, in bytes.
size_t compiledSize = 0;
// Size in bytes of the per-context output vector.
size_t scratchSize = 0;
// Time spent compiling and studying all patterns, in seconds.
long double compileSecs = 0;
// Value of getPeakHeap() sampled right after compilation; printed as bytes.
unsigned int peakMemorySize = 0;
};
/** Engine context which is allocated on a per-thread basis. */
class EnginePCREContext : public EngineContext{
public:
// Allocates `ovec` with room for (capture_cnt + 1) * 3 ints.
explicit EnginePCREContext(int capture_cnt);
// Frees `ovec`.
~EnginePCREContext();
// Output vector handed to pcre_exec() by EnginePCRE::scan.
int *ovec = nullptr;
};
/** One compiled PCRE pattern plus the per-pattern options decoded from its flags. */
struct PcreDB {
    bool highlander = false;      //!< 'H' flag: stop after the first match.
    bool utf8 = false;            //!< '8' flag: pattern compiled with PCRE_UTF8.
    u32 id = 0;                   //!< Signature id, echoed when matches are printed.
    pcre *db = nullptr;           //!< Compiled pattern from pcre_compile().
    pcre_extra *extra = nullptr;  //!< Study data and match limits for pcre_exec().
};
/** PCRE Engine for scanning data. */
class EnginePCRE : public Engine {
public:
// Takes ownership of the compiled patterns; capture_cnt_in sizes the
// per-context output vector.
explicit EnginePCRE(std::vector<std::unique_ptr<PcreDB>> dbs_in,
CompilePCREStats cs, int capture_cnt_in);
// Releases the compiled pattern and extra block of every entry in dbs.
~EnginePCRE();
// Creates the per-thread EnginePCREContext used by scan().
std::unique_ptr<EngineContext> makeContext() const;
// Runs every pattern over the buffer, counting matches in `result`.
void scan(const char *data, unsigned int len, unsigned int id,
ResultEntry &result, EngineContext &ectx) const;
// Vectored mode is not supported: prints a message and aborts.
void scan_vectored(const char *const *data, const unsigned int *len,
unsigned int count, unsigned int streamId,
ResultEntry &result, EngineContext &ectx) const;
// The four streaming entry points below are not supported: each prints a
// message and aborts.
std::unique_ptr<EngineStream> streamOpen(EngineContext &ectx,
unsigned id) const;
void streamClose(std::unique_ptr<EngineStream> stream,
ResultEntry &result) const;
void streamCompressExpand(EngineStream &stream,
std::vector<char> &temp) const;
void streamScan(EngineStream &stream, const char *data, unsigned int len,
unsigned int id, ResultEntry &result) const;
// Prints the compile summary with printf.
void printStats() const;
// Inserts the compile summary as a row of the "Compile" table.
void sqlStats(SqlDB &db) const;
private:
// One entry per compiled expression.
std::vector<std::unique_ptr<PcreDB>> dbs;
// Statistics collected when the patterns were built.
CompilePCREStats compile_stats;
// Largest capture count over all patterns.
int capture_cnt;
};
/**
 * Compile each "/pattern/flags" expression with pcre_compile()/pcre_study()
 * and return an engine wrapping the results, or nullptr on failure. `name`
 * and `sigs_name` are only used for the reported statistics.
 */
std::unique_ptr<EnginePCRE>
buildEnginePcre(const ExpressionMap &expressions, const std::string &name,
const std::string &sigs_name);
#endif // ENGINEPCRE_H

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2016-2017, Intel Corporation * Copyright (c) 2016-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -31,6 +31,10 @@
#include "common.h" #include "common.h"
#include "data_corpus.h" #include "data_corpus.h"
#include "engine_hyperscan.h" #include "engine_hyperscan.h"
#if defined(HS_HYBRID)
#include "engine_chimera.h"
#include "engine_pcre.h"
#endif
#include "expressions.h" #include "expressions.h"
#include "sqldb.h" #include "sqldb.h"
#include "thread_barrier.h" #include "thread_barrier.h"
@ -87,6 +91,8 @@ namespace /* anonymous */ {
bool display_per_scan = false; bool display_per_scan = false;
ScanMode scan_mode = ScanMode::STREAMING; ScanMode scan_mode = ScanMode::STREAMING;
bool useHybrid = false;
bool usePcre = false;
unsigned repeats = 20; unsigned repeats = 20;
string exprPath(""); string exprPath("");
string corpusFile(""); string corpusFile("");
@ -102,7 +108,7 @@ typedef void (*thread_func_t)(void *context);
class ThreadContext : boost::noncopyable { class ThreadContext : boost::noncopyable {
public: public:
ThreadContext(unsigned num_in, const EngineHyperscan &db_in, ThreadContext(unsigned num_in, const Engine &db_in,
thread_barrier &tb_in, thread_func_t function_in, thread_barrier &tb_in, thread_func_t function_in,
vector<DataBlock> corpus_data_in) vector<DataBlock> corpus_data_in)
: num(num_in), results(repeats), engine(db_in), : num(num_in), results(repeats), engine(db_in),
@ -155,7 +161,7 @@ public:
unsigned num; unsigned num;
Timer timer; Timer timer;
vector<ResultEntry> results; vector<ResultEntry> results;
const EngineHyperscan &engine; const Engine &engine;
unique_ptr<EngineContext> enginectx; unique_ptr<EngineContext> enginectx;
vector<DataBlock> corpus_data; vector<DataBlock> corpus_data;
@ -181,6 +187,10 @@ void usage(const char *error) {
" (default: streaming).\n"); " (default: streaming).\n");
printf(" -V Benchmark in vectored mode" printf(" -V Benchmark in vectored mode"
" (default: streaming).\n"); " (default: streaming).\n");
#if defined(HS_HYBRID)
printf(" -H Benchmark using Chimera (if supported).\n");
printf(" -P Benchmark using PCRE (if supported).\n");
#endif
#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP #ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP
printf(" -T CPU,CPU,... Benchmark with threads on these CPUs.\n"); printf(" -T CPU,CPU,... Benchmark with threads on these CPUs.\n");
#endif #endif
@ -214,7 +224,7 @@ struct BenchmarkSigs {
static static
void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets, void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
UNUSED unique_ptr<Grey> &grey) { UNUSED unique_ptr<Grey> &grey) {
const char options[] = "-b:c:Cd:e:E:G:hi:n:No:p:sS:Vw:z:" const char options[] = "-b:c:Cd:e:E:G:hHi:n:No:p:PsS:Vw:z:"
#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP #ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP
"T:" // add the thread flag "T:" // add the thread flag
#endif #endif
@ -287,6 +297,14 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
usage(nullptr); usage(nullptr);
exit(0); exit(0);
break; break;
case 'H':
#if defined(HS_HYBRID)
useHybrid = true;
#else
usage("Hybrid matcher not enabled in this build");
exit(1);
#endif
break;
case 'n': case 'n':
if (!fromString(optarg, repeats) || repeats == 0) { if (!fromString(optarg, repeats) || repeats == 0) {
usage("Couldn't parse argument to -n flag, should be" usage("Couldn't parse argument to -n flag, should be"
@ -294,6 +312,14 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
exit(1); exit(1);
} }
break; break;
case 'P':
#if defined(HS_HYBRID)
usePcre = true;
#else
usage("PCRE matcher not enabled in this build");
exit(1);
#endif
break;
case 's': case 's':
in_sigfile = 2; in_sigfile = 2;
break; break;
@ -399,6 +425,24 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
exit(1); exit(1);
} }
// Constraints on Chimera and PCRE engines
if (useHybrid || usePcre) {
if (useHybrid && usePcre) {
usage("Can't run both Chimera and PCRE.");
exit(1);
}
if (scan_mode != ScanMode::BLOCK) {
usage("Must specify block mode in Chimera or PCRE with "
"the -N option.");
exit(1);
}
if (forceEditDistance || loadDatabases || saveDatabases) {
usage("No extended options are supported in Chimera or PCRE.");
exit(1);
}
}
// Read in any -s signature sets. // Read in any -s signature sets.
for (const auto &file : sigFiles) { for (const auto &file : sigFiles) {
SignatureSet sigs; SignatureSet sigs;
@ -503,7 +547,7 @@ static
void benchStreamingInternal(ThreadContext *ctx, vector<StreamInfo> &streams, void benchStreamingInternal(ThreadContext *ctx, vector<StreamInfo> &streams,
bool do_compress) { bool do_compress) {
assert(ctx); assert(ctx);
const EngineHyperscan &e = ctx->engine; const Engine &e = ctx->engine;
const vector<DataBlock> &blocks = ctx->corpus_data; const vector<DataBlock> &blocks = ctx->corpus_data;
vector<char> compress_buf(do_compress ? 1000 : 0); vector<char> compress_buf(do_compress ? 1000 : 0);
@ -812,7 +856,7 @@ void sqlResults(const vector<unique_ptr<ThreadContext>> &threads,
* the same copy of the data. * the same copy of the data.
*/ */
static static
unique_ptr<ThreadContext> makeThreadContext(const EngineHyperscan &db, unique_ptr<ThreadContext> makeThreadContext(const Engine &db,
const vector<DataBlock> &blocks, const vector<DataBlock> &blocks,
unsigned id, unsigned id,
thread_barrier &sync_barrier) { thread_barrier &sync_barrier) {
@ -839,7 +883,7 @@ unique_ptr<ThreadContext> makeThreadContext(const EngineHyperscan &db,
/** Run the given benchmark. */ /** Run the given benchmark. */
static static
void runBenchmark(const EngineHyperscan &db, void runBenchmark(const Engine &db,
const vector<DataBlock> &corpus_blocks) { const vector<DataBlock> &corpus_blocks) {
size_t numThreads; size_t numThreads;
bool useAffinity = false; bool useAffinity = false;
@ -936,8 +980,18 @@ int main(int argc, char *argv[]) {
continue; continue;
} }
auto engine = buildEngineHyperscan(exprMap, scan_mode, s.name, unique_ptr<Engine> engine;
sigName, *grey); if (useHybrid) {
#if defined(HS_HYBRID)
engine = buildEngineChimera(exprMap, s.name, sigName);
} else if (usePcre) {
engine = buildEnginePcre(exprMap, s.name, sigName);
#endif
} else {
engine = buildEngineHyperscan(exprMap, scan_mode, s.name,
sigName, *grey);
}
if (!engine) { if (!engine) {
printf("Error: expressions failed to compile.\n"); printf("Error: expressions failed to compile.\n");
exit(1); exit(1);

View File

@ -5,6 +5,14 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
SET(hscheck_SOURCES SET(hscheck_SOURCES
main.cpp main.cpp
) )
add_executable(hscheck ${hscheck_SOURCES})
target_link_libraries(hscheck hs expressionutil pthread) if (BUILD_CHIMERA)
include_directories(${PCRE_INCLUDE_DIRS})
add_definitions(-DHS_HYBRID)
add_executable(hscheck ${hscheck_SOURCES})
target_link_libraries(hscheck hs chimera ${PCRE_LDFLAGS} expressionutil pthread)
else()
add_executable(hscheck ${hscheck_SOURCES})
target_link_libraries(hscheck hs expressionutil pthread)
endif()

View File

@ -59,6 +59,11 @@
#include "hs_internal.h" #include "hs_internal.h"
#include "ue2common.h" #include "ue2common.h"
#ifdef HS_HYBRID
#include <pcre.h>
#include "chimera/ch.h"
#endif
#include <cassert> #include <cassert>
#include <fstream> #include <fstream>
#include <mutex> #include <mutex>
@ -77,6 +82,7 @@ namespace /* anonymous */ {
// are we in streaming mode? (default: yes) // are we in streaming mode? (default: yes)
bool g_streaming = true; bool g_streaming = true;
bool g_vectored = false; bool g_vectored = false;
bool g_hybrid = false;
string g_exprPath(""); string g_exprPath("");
string g_signatureFile(""); string g_signatureFile("");
bool g_allSignatures = false; bool g_allSignatures = false;
@ -282,34 +288,57 @@ void checkExpression(UNUSED void *threadarg) {
// Try and compile a database. // Try and compile a database.
const char *regexp = regex.c_str(); const char *regexp = regex.c_str();
const hs_expr_ext *extp = &ext;
hs_error_t err; hs_error_t err;
hs_compile_error_t *compile_err;
hs_database_t *db = nullptr; if (g_hybrid) {
#ifdef HS_HYBRID
ch_compile_error_t *ch_compile_err;
ch_database_t *hybrid_db = nullptr;
err = ch_compile_multi(&regexp, &flags, nullptr, 1, CH_MODE_GROUPS,
nullptr, &hybrid_db, &ch_compile_err);
if (err == HS_SUCCESS) {
assert(hybrid_db);
recordSuccess(g_exprMap, it->first);
ch_free_database(hybrid_db);
} else {
assert(!hybrid_db);
assert(ch_compile_err);
recordFailure(g_exprMap, it->first, ch_compile_err->message);
ch_free_compile_error(ch_compile_err);
}
#else
cerr << "Hybrid mode not available in this build." << endl;
exit(1);
#endif // HS_HYBRID
} else {
const hs_expr_ext *extp = &ext;
hs_compile_error_t *compile_err;
hs_database_t *db = nullptr;
#if !defined(RELEASE_BUILD) #if !defined(RELEASE_BUILD)
// This variant is available in non-release builds and allows us to // This variant is available in non-release builds and allows us to
// modify greybox settings. // modify greybox settings.
err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, mode, err = hs_compile_multi_int(&regexp, &flags, nullptr, &extp, 1, mode,
nullptr, &db, &compile_err, *g_grey); nullptr, &db, &compile_err, *g_grey);
#else #else
err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, mode, err = hs_compile_ext_multi(&regexp, &flags, nullptr, &extp, 1, mode,
nullptr, &db, &compile_err); nullptr, &db, &compile_err);
#endif #endif
if (err == HS_SUCCESS) { if (err == HS_SUCCESS) {
assert(db); assert(db);
recordSuccess(g_exprMap, it->first); recordSuccess(g_exprMap, it->first);
hs_free_database(db); hs_free_database(db);
if (check_logical) { if (check_logical) {
cacheSubExpr(it->first, regex, flags, ext); cacheSubExpr(it->first, regex, flags, ext);
}
} else {
assert(!db);
assert(compile_err);
recordFailure(g_exprMap, it->first, compile_err->message);
hs_free_compile_error(compile_err);
} }
} else {
assert(!db);
assert(compile_err);
recordFailure(g_exprMap, it->first, compile_err->message);
hs_free_compile_error(compile_err);
} }
} }
} }
@ -429,6 +458,9 @@ void usage() {
#endif #endif
<< " -V Operate in vectored mode." << endl << " -V Operate in vectored mode." << endl
<< " -N Operate in block mode (default: streaming)." << endl << " -N Operate in block mode (default: streaming)." << endl
#ifdef HS_HYBRID
<< " -H Operate in hybrid mode." << endl
#endif
<< " -L Pass HS_FLAG_SOM_LEFTMOST for all expressions (default: off)." << endl << " -L Pass HS_FLAG_SOM_LEFTMOST for all expressions (default: off)." << endl
<< " -8 Force UTF8 mode on all patterns." << endl << " -8 Force UTF8 mode on all patterns." << endl
<< " -T NUM Run with NUM threads." << endl << " -T NUM Run with NUM threads." << endl
@ -440,7 +472,7 @@ void usage() {
static static
void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) { void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) {
const char options[] = "e:E:s:z:hLNV8G:T:BC"; const char options[] = "e:E:s:z:hHLNV8G:T:BC";
bool signatureSet = false; bool signatureSet = false;
for (;;) { for (;;) {
@ -492,6 +524,9 @@ void processArgs(int argc, char *argv[], UNUSED unique_ptr<Grey> &grey) {
g_streaming = false; g_streaming = false;
g_vectored = true; g_vectored = true;
break; break;
case 'H':
g_hybrid = true;
break;
case 'T': case 'T':
num_of_threads = atoi(optarg); num_of_threads = atoi(optarg);
break; break;

View File

@ -1,9 +1,3 @@
# we have a fixed requirement for PCRE
set(PCRE_REQUIRED_MAJOR_VERSION 8)
set(PCRE_REQUIRED_MINOR_VERSION 41)
set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION})
include (${CMAKE_MODULE_PATH}/pcre.cmake)
if (NOT CORRECT_PCRE_VERSION) if (NOT CORRECT_PCRE_VERSION)
message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found, not building hscollider") message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} not found, not building hscollider")
return() return()
@ -29,6 +23,8 @@ set_source_files_properties(
ragelmaker(ColliderCorporaParser.rl) ragelmaker(ColliderCorporaParser.rl)
add_definitions(-DHS_HYBRID)
# only set these after all tests are done # only set these after all tests are done
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
@ -69,7 +65,7 @@ add_dependencies(hscollider ragel_ColliderCorporaParser)
add_dependencies(hscollider pcre) add_dependencies(hscollider pcre)
if(NOT WIN32) if(NOT WIN32)
target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil
expressionutil corpusomatic crosscompileutil pthread expressionutil corpusomatic crosscompileutil pthread
"${BACKTRACE_LDFLAGS}") "${BACKTRACE_LDFLAGS}")
@ -78,7 +74,7 @@ if(HAVE_BACKTRACE)
"${BACKTRACE_CFLAGS}") "${BACKTRACE_CFLAGS}")
endif() endif()
else() # WIN32 else() # WIN32
target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil
expressionutil corpusomatic crosscompileutil) expressionutil corpusomatic crosscompileutil)
endif() endif()

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -54,10 +54,10 @@ public:
explicit DatabaseProxy(const std::set<unsigned> &expr_ids) explicit DatabaseProxy(const std::set<unsigned> &expr_ids)
: ids(expr_ids) {} : ids(expr_ids) {}
explicit DatabaseProxy(std::shared_ptr<HyperscanDB> built_db) explicit DatabaseProxy(std::shared_ptr<BaseDB> built_db)
: db(built_db) {} : db(built_db) {}
std::shared_ptr<HyperscanDB> get(const UltimateTruth &ultimate) { std::shared_ptr<BaseDB> get(const UltimateTruth &ultimate) {
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
if (failed) { if (failed) {
// We have previously failed to compile this database. // We have previously failed to compile this database.
@ -80,7 +80,7 @@ public:
private: private:
std::mutex mutex; std::mutex mutex;
std::shared_ptr<HyperscanDB> db; std::shared_ptr<BaseDB> db;
std::set<unsigned> ids; std::set<unsigned> ids;
bool failed = false; // Database failed compilation. bool failed = false; // Database failed compilation.
}; };

View File

@ -187,6 +187,14 @@ string pcreErrStr(int err) {
} }
} }
/*
 * Tells whether the given collider mode is one of the scan modes provided by
 * native Hyperscan: block, streaming or vectored. Every other value (for
 * example MODE_HYBRID) yields false.
 */
static
bool isStandardMode(unsigned int mode) {
    switch (mode) {
    case MODE_BLOCK:
    case MODE_STREAMING:
    case MODE_VECTORED:
        return true;
    default:
        return false;
    }
}
GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr, GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
unsigned long int limit, unsigned long int limit,
unsigned long int limit_recursion) unsigned long int limit_recursion)
@ -194,8 +202,10 @@ GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
matchLimitRecursion(limit_recursion) {} matchLimitRecursion(limit_recursion) {}
void GroundTruth::global_prep() { void GroundTruth::global_prep() {
// We're using pcre callouts if (isStandardMode(colliderMode)) {
pcre_callout = &pcreCallOut; // We're using pcre callouts
pcre_callout = &pcreCallOut;
}
} }
static static
@ -262,11 +272,17 @@ GroundTruth::compile(unsigned id, bool no_callouts) {
throw PcreCompileFailure("Unsupported extended flags."); throw PcreCompileFailure("Unsupported extended flags.");
} }
// Hybrid mode implies SOM.
if (colliderMode == MODE_HYBRID) {
assert(!use_NFA);
som = true;
}
// SOM flags might be set globally. // SOM flags might be set globally.
som |= !!somFlags; som |= !!somFlags;
// For traditional Hyperscan, add global callout to pattern. // For traditional Hyperscan, add global callout to pattern.
if (!combination && !no_callouts) { if (!combination && !no_callouts && isStandardMode(colliderMode)) {
addCallout(re); addCallout(re);
} }
@ -403,6 +419,79 @@ int scanBasic(const CompiledPcre &compiled, const string &buffer,
return ret; return ret;
} }
// Reports whether the compiled pattern has the PCRE_UTF8 option set.
static
bool isUtf8(const CompiledPcre &compiled) {
    // Pre-zeroed so the answer is "not UTF-8" should the query below leave
    // the value untouched; the return code of pcre_fullinfo() is not checked.
    unsigned long int opts = 0;
    pcre_fullinfo(compiled.bytecode, nullptr, PCRE_INFO_OPTIONS, &opts);
    return (opts & PCRE_UTF8) != 0;
}
// Converts the first `ret` (from, to) offset pairs of a pcre ovector into a
// CaptureVec. `ret` is the positive value returned by pcre_exec, i.e. the
// number of pairs that were filled in (pair 0 being the overall match).
// Returns an empty vector when group information was not requested
// (no_groups is set).
static
CaptureVec makeCaptureVec(const vector<int> &ovector, int ret) {
    assert(ret > 0);

    CaptureVec cap;
    if (no_groups) {
        return cap; // No group info requested.
    }

    // Exactly `ret` pairs are appended below, so reserve that many (the
    // ovector holds two ints per pair, but the result holds one pair each).
    cap.reserve(ret);
    for (int i = 0; i < ret; i++) {
        int from = ovector[2 * i], to = ovector[2 * i + 1];
        cap.push_back(make_pair(from, to));
    }
    return cap;
}
static
int scanHybrid(const CompiledPcre &compiled, const string &buffer,
const pcre_extra &extra, vector<int> &ovector,
ResultSet &rs, ostream &out) {
int len = (int)buffer.length();
int startoffset = 0;
bool utf8 = isUtf8(compiled);
int flags = 0;
int ret;
do {
ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), len,
startoffset, flags, &ovector[0], ovector.size());
if (ret <= PCRE_ERROR_NOMATCH) {
return ret;
}
int from = ovector.at(0);
int to = ovector.at(1);
rs.addMatch(from, to, makeCaptureVec(ovector, ret));
if (echo_matches) {
out << "PCRE Match @ (" << from << "," << to << ")" << endl;
}
// If we only wanted a single match, we're done.
if (compiled.highlander) break;
// Next scan starts at the first codepoint after the match. It's
// possible that we have a vacuous match, in which case we must step
// past it to ensure that we always progress.
if (from != to) {
startoffset = to;
} else if (utf8) {
startoffset = to + 1;
while (startoffset < len
&& ((buffer[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
++startoffset;
}
} else {
startoffset = to + 1;
}
} while (startoffset <= len);
return ret;
}
static static
int scanOffset(const CompiledPcre &compiled, const string &buffer, int scanOffset(const CompiledPcre &compiled, const string &buffer,
const pcre_extra &extra, vector<int> &ovector, const pcre_extra &extra, vector<int> &ovector,
@ -532,15 +621,24 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
pcre_extra extra; pcre_extra extra;
extra.flags = 0; extra.flags = 0;
// Switch on callouts. // If running in traditional HyperScan mode, switch on callouts.
extra.flags |= PCRE_EXTRA_CALLOUT_DATA; bool usingCallouts = isStandardMode(colliderMode);
extra.callout_data = &ctx; if (usingCallouts) {
// Switch on callouts.
extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
extra.callout_data = &ctx;
}
// Set the match_limit (in order to bound execution time on very complex // Set the match_limit (in order to bound execution time on very complex
// patterns) // patterns)
extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION); extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
extra.match_limit = matchLimit; if (colliderMode == MODE_HYBRID) {
extra.match_limit_recursion = matchLimitRecursion; extra.match_limit = 10000000;
extra.match_limit_recursion = 1500;
} else {
extra.match_limit = matchLimit;
extra.match_limit_recursion = matchLimitRecursion;
}
#ifdef PCRE_NO_START_OPTIMIZE #ifdef PCRE_NO_START_OPTIMIZE
// Switch off optimizations that may result in callouts not occurring. // Switch off optimizations that may result in callouts not occurring.
@ -553,6 +651,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
ovector.resize(ovecsize); ovector.resize(ovecsize);
int ret; int ret;
bool hybrid = false;
switch (colliderMode) { switch (colliderMode) {
case MODE_BLOCK: case MODE_BLOCK:
case MODE_STREAMING: case MODE_STREAMING:
@ -563,6 +662,10 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
ret = scanBasic(compiled, buffer, extra, ovector, ctx); ret = scanBasic(compiled, buffer, extra, ovector, ctx);
} }
break; break;
case MODE_HYBRID:
ret = scanHybrid(compiled, buffer, extra, ovector, rs, out);
hybrid = true;
break;
default: default:
assert(0); assert(0);
ret = PCRE_ERROR_NULL; ret = PCRE_ERROR_NULL;
@ -595,7 +698,7 @@ bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
return true; return true;
} }
if (compiled.som) { if (compiled.som && !hybrid) {
filterLeftmostSom(rs); filterLeftmostSom(rs);
} }

View File

@ -35,25 +35,36 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
// Type for capturing groups: a vector of (from, to) offsets, with both set to
// -1 for inactive groups (like pcre's ovector). Used by hybrid modes.
typedef std::vector<std::pair<int, int> > CaptureVec;
// Class representing a single match, encapsulating to/from offsets. // Class representing a single match, encapsulating to/from offsets.
class MatchResult { class MatchResult {
public: public:
MatchResult(unsigned long long start, unsigned long long end) MatchResult(unsigned long long start, unsigned long long end)
: from(start), to(end) {} : from(start), to(end) {}
MatchResult(unsigned long long start, unsigned long long end,
const CaptureVec &cap)
: from(start), to(end), captured(cap) {}
bool operator<(const MatchResult &a) const { bool operator<(const MatchResult &a) const {
if (from != a.from) { if (from != a.from) {
return from < a.from; return from < a.from;
} }
return to < a.to; if (to != a.to) {
return to < a.to;
}
return captured < a.captured;
} }
bool operator==(const MatchResult &a) const { bool operator==(const MatchResult &a) const {
return from == a.from && to == a.to; return from == a.from && to == a.to && captured == a.captured;
} }
unsigned long long from; unsigned long long from;
unsigned long long to; unsigned long long to;
CaptureVec captured;
}; };
enum ResultSource { enum ResultSource {
@ -114,6 +125,19 @@ public:
} }
} }
// Add a match (with capturing vector)
void addMatch(unsigned long long from, unsigned long long to,
const CaptureVec &cap, int block = 0) {
MatchResult m(from, to, cap);
matches.insert(m);
if (matches_by_block[block].find(m) != matches_by_block[block].end()) {
dupe_matches.insert(m);
} else {
matches_by_block[block].insert(m);
}
}
// Clear all matches. // Clear all matches.
void clear() { void clear() {
matches.clear(); matches.clear();

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -90,19 +90,14 @@ hs_error_t open_magic_stream(const hs_database_t *db, unsigned flags,
#endif // RELEASE_BUILD #endif // RELEASE_BUILD
class HyperscanDB : boost::noncopyable { class BaseDB : boost::noncopyable {
public: public:
// Constructor takes iterators over a container of pattern IDs. // Constructor takes iterators over a container of pattern IDs.
template <class Iter> template <class Iter>
HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end) BaseDB(Iter ids_begin, Iter ids_end)
: db(db_in), ids(ids_begin, ids_end) {} : ids(ids_begin, ids_end) {}
~HyperscanDB() { virtual ~BaseDB();
hs_free_database(db);
}
// Underlying Hyperscan database pointer.
hs_database_t *db;
// The set of expression IDs that must return their matches in order. // The set of expression IDs that must return their matches in order.
unordered_set<unsigned> ordered; unordered_set<unsigned> ordered;
@ -111,15 +106,55 @@ public:
unordered_set<unsigned> ids; unordered_set<unsigned> ids;
}; };
BaseDB::~BaseDB() { }
class HyperscanDB : public BaseDB {
public:
// Constructor takes iterators over a container of pattern IDs.
template <class Iter>
HyperscanDB(hs_database_t *db_in, Iter ids_begin, Iter ids_end)
: BaseDB(ids_begin, ids_end), db(db_in) {}
~HyperscanDB();
// Underlying Hyperscan database pointer.
hs_database_t *db;
};
HyperscanDB::~HyperscanDB() {
hs_free_database(db);
}
#ifdef HS_HYBRID
class HybridDB : public BaseDB {
public:
// Constructor takes iterators over a container of pattern IDs.
template <class Iter>
HybridDB(ch_database_t *db_in, Iter ids_begin, Iter ids_end)
: BaseDB(ids_begin, ids_end), db(db_in) {}
~HybridDB();
// Underlying Hyperscan database pointer.
ch_database_t *db;
};
HybridDB::~HybridDB() {
ch_free_database(db);
}
#endif // HS_HYBRID
// Used to track the ID and result set. // Used to track the ID and result set.
namespace { namespace {
struct MultiContext { struct MultiContext {
MultiContext(unsigned int id_in, const HyperscanDB &db_in, ResultSet *rs_in, MultiContext(unsigned int id_in, const BaseDB &db_in, ResultSet *rs_in,
bool single_in, ostream &os) bool single_in, ostream &os)
: id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {} : id(id_in), db(db_in), rs(rs_in), single(single_in), out(os) {}
unsigned int id; unsigned int id;
int block = 0; int block = 0;
const HyperscanDB &db; const BaseDB &db;
ResultSet *rs; ResultSet *rs;
u64a lastRawMatch = 0; /* store last known unadjusted match location */ u64a lastRawMatch = 0; /* store last known unadjusted match location */
u64a lastOrderMatch = 0; u64a lastOrderMatch = 0;
@ -230,6 +265,75 @@ int callbackMulti(unsigned int id, unsigned long long from,
return 0; return 0;
} }
#ifdef HS_HYBRID
// Match callback handed to the hybrid (Chimera) scanner. `ctx` is the
// MultiContext for the current scan. Matches for the pattern of interest (or
// for any pattern, in single-pattern mode) are stored in the context's
// ResultSet together with their capture groups; inactive groups are stored
// as (-1, -1). Returns CH_CALLBACK_TERMINATE once the configured match limit
// has been reached and CH_CALLBACK_CONTINUE otherwise.
static
ch_callback_t callbackHybrid(unsigned id, unsigned long long from,
                             unsigned long long to, unsigned, unsigned size,
                             const ch_capture_t *captured, void *ctx) {
    MultiContext *mctx = static_cast<MultiContext *>(ctx);
    assert(mctx);
    assert(mctx->rs);
    assert(mctx->in_scan_call);

    ostream &out = mctx->out;

    // Report the end offset relative to the corpus without its prefix.
    to -= g_corpora_prefix.size();

    // A match delivered after we asked the scanner to stop is flagged on
    // the result set (processing of the match then carries on as usual).
    if (mctx->terminated) {
        out << "UE2 Match @ (" << from << "," << to << ") for " << id
            << " after termination" << endl;
        mctx->rs->match_after_halt = true;
    }

    const bool wanted = mctx->single || id == mctx->id;
    if (wanted) {
        CaptureVec groups;
        for (unsigned int g = 0; g < size; g++) {
            const bool active = captured[g].flags & CH_CAPTURE_FLAG_ACTIVE;
            if (active) {
                groups.push_back(make_pair(captured[g].from, captured[g].to));
            } else {
                groups.push_back(make_pair(-1, -1));
            }
        }
        mctx->rs->addMatch(from, to, groups);
    }

    if (echo_matches) {
        out << "Match @ [" << from << "," << to << "] for " << id << endl;
        out << " Captured " << size << " groups: ";
        for (unsigned int g = 0; g < size; g++) {
            const bool active = captured[g].flags & CH_CAPTURE_FLAG_ACTIVE;
            if (active) {
                out << "{" << captured[g].from << "," << captured[g].to << "} ";
            } else {
                out << "{} ";
            }
        }
        out << endl;
    }

    const bool limit_hit =
        limit_matches && mctx->rs->matches.size() == limit_matches;
    if (!limit_hit) {
        return CH_CALLBACK_CONTINUE;
    }

    mctx->terminated = true;
    return CH_CALLBACK_TERMINATE;
}
// Error-event callback handed to the hybrid (Chimera) scanner. The error
// type and pattern id are ignored: apart from sanity-checking the scan
// context (debug builds only), it unconditionally returns
// CH_CALLBACK_SKIP_PATTERN.
static
ch_callback_t errorCallback(UNUSED ch_error_event_t errorType,
                            UNUSED unsigned int id, void *, void *ctx) {
    UNUSED MultiContext *scan_ctx = static_cast<MultiContext *>(ctx);
    assert(scan_ctx);
    assert(scan_ctx->rs);
    assert(scan_ctx->in_scan_call);

    return CH_CALLBACK_SKIP_PATTERN;
}
#endif // HS_HYBRID
static static
void filterLeftmostSom(ResultSet &rs) { void filterLeftmostSom(ResultSet &rs) {
if (rs.matches.size() <= 1) { if (rs.matches.size() <= 1) {
@ -252,6 +356,9 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr,
const Grey &grey_in, unsigned int streamBlocks) const Grey &grey_in, unsigned int streamBlocks)
: grey(grey_in), out(os), m_expr(expr), m_xcompile(false), : grey(grey_in), out(os), m_expr(expr), m_xcompile(false),
m_streamBlocks(streamBlocks), scratch(nullptr), m_streamBlocks(streamBlocks), scratch(nullptr),
#ifdef HS_HYBRID
chimeraScratch(nullptr),
#endif
platform(plat) { platform(plat) {
// Build our mode flags. // Build our mode flags.
@ -265,15 +372,27 @@ UltimateTruth::UltimateTruth(ostream &os, const ExpressionMap &expr,
case MODE_VECTORED: case MODE_VECTORED:
m_mode = HS_MODE_VECTORED; m_mode = HS_MODE_VECTORED;
break; break;
case MODE_HYBRID:
m_mode = 0;
break;
} }
// Set desired SOM precision, if we're in streaming mode. // Set desired SOM precision, if we're in streaming mode.
if (colliderMode == MODE_STREAMING) { if (colliderMode == MODE_STREAMING) {
m_mode |= somPrecisionMode; m_mode |= somPrecisionMode;
} }
#ifdef HS_HYBRID
if (colliderMode == MODE_HYBRID && !no_groups) {
m_mode |= CH_MODE_GROUPS;
}
#endif
} }
UltimateTruth::~UltimateTruth() { UltimateTruth::~UltimateTruth() {
#ifdef HS_HYBRID
ch_free_scratch(chimeraScratch);
#endif
hs_free_scratch(scratch); hs_free_scratch(scratch);
} }
@ -327,13 +446,13 @@ void mangle_scratch(hs_scratch_t *scratch) {
scratch->fdr_conf_offset = 0xe4; scratch->fdr_conf_offset = 0xe4;
} }
bool UltimateTruth::blockScan(const HyperscanDB &hdb, const string &buffer, bool UltimateTruth::blockScan(const BaseDB &bdb, const string &buffer,
size_t align, match_event_handler callback, size_t align, match_event_handler callback,
void *ctx_in, ResultSet *) { void *ctx_in, ResultSet *) {
assert(colliderMode == MODE_BLOCK); assert(colliderMode == MODE_BLOCK);
assert(!m_xcompile); assert(!m_xcompile);
const hs_database_t *db = hdb.db; const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
assert(db); assert(db);
MultiContext *ctx = (MultiContext *)ctx_in; MultiContext *ctx = (MultiContext *)ctx_in;
@ -438,13 +557,13 @@ hs_stream_t *compressAndResetExpandStream(const hs_database_t *db,
return out; return out;
} }
bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer, bool UltimateTruth::streamingScan(const BaseDB &bdb, const string &buffer,
size_t align, match_event_handler callback, size_t align, match_event_handler callback,
void *ctx_in, ResultSet *rs) { void *ctx_in, ResultSet *rs) {
assert(colliderMode == MODE_STREAMING); assert(colliderMode == MODE_STREAMING);
assert(!m_xcompile); assert(!m_xcompile);
const hs_database_t *db = hdb.db; const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
assert(db); assert(db);
MultiContext *ctx = (MultiContext *)ctx_in; MultiContext *ctx = (MultiContext *)ctx_in;
@ -594,13 +713,13 @@ bool UltimateTruth::streamingScan(const HyperscanDB &hdb, const string &buffer,
return ret == HS_SUCCESS; return ret == HS_SUCCESS;
} }
bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer, bool UltimateTruth::vectoredScan(const BaseDB &bdb, const string &buffer,
size_t align, match_event_handler callback, size_t align, match_event_handler callback,
void *ctx_in, ResultSet *rs) { void *ctx_in, ResultSet *rs) {
assert(colliderMode == MODE_VECTORED); assert(colliderMode == MODE_VECTORED);
assert(!m_xcompile); assert(!m_xcompile);
const hs_database_t *db = hdb.db; const hs_database_t *db = reinterpret_cast<const HyperscanDB &>(bdb).db;
assert(db); assert(db);
MultiContext *ctx = (MultiContext *)ctx_in; MultiContext *ctx = (MultiContext *)ctx_in;
@ -682,19 +801,67 @@ bool UltimateTruth::vectoredScan(const HyperscanDB &hdb, const string &buffer,
return true; return true;
} }
bool UltimateTruth::run(unsigned int id, shared_ptr<const HyperscanDB> hdb, #ifdef HS_HYBRID
// Scans `buffer` once with the Chimera database held by `bdb`, delivering
// matches to `callback` and error events to `error_callback`, both of which
// receive `ctx_in` (a MultiContext). `bdb` must actually be a HybridDB.
// Returns false if the scan buffer or scratch could not be prepared;
// otherwise returns true only when ch_scan reported the outcome that matches
// the context state: CH_SCAN_TERMINATED if the callback requested
// termination, CH_SUCCESS if it did not.
bool UltimateTruth::hybridScan(const BaseDB &bdb, const string &buffer,
                               size_t align, ch_match_event_handler callback,
                               ch_error_event_handler error_callback,
                               void *ctx_in, ResultSet *) {
    assert(colliderMode == MODE_HYBRID);
    assert(!m_xcompile);

    // Downcast within the BaseDB hierarchy: static_cast performs the proper
    // base-to-derived conversion, whereas reinterpret_cast would merely
    // reinterpret the object representation.
    const ch_database_t *db = static_cast<const HybridDB &>(bdb).db;
    assert(db);
    MultiContext *ctx = (MultiContext *)ctx_in;

    char *realigned = setupScanBuffer(buffer.c_str(), buffer.size(), align);
    if (!realigned) {
        return false;
    }

    if (use_copy_scratch && !cloneScratch()) {
        return false;
    }

    // The callbacks assert that they are only invoked from inside a scan.
    ctx->in_scan_call = true;
    ch_error_t ret =
        ch_scan(db, realigned, buffer.size(), 0, chimeraScratch, callback,
                error_callback, ctx);
    ctx->in_scan_call = false;

    if (g_verbose) {
        out << "Scan call returned " << ret << endl;
    }

    if (ctx->terminated) {
        if (g_verbose && ret != CH_SCAN_TERMINATED) {
            out << "Scan should have returned CH_SCAN_TERMINATED, returned "
                << ret << " instead." << endl;
        }
        return ret == CH_SCAN_TERMINATED;
    }

    if (g_verbose && ret != CH_SUCCESS) {
        out << "Scan should have returned CH_SUCCESS, returned " << ret
            << " instead." << endl;
    }

    return ret == CH_SUCCESS;
}
#endif
bool UltimateTruth::run(unsigned int id, shared_ptr<const BaseDB> bdb,
const string &buffer, bool single_pattern, const string &buffer, bool single_pattern,
unsigned int align, ResultSet &rs) { unsigned int align, ResultSet &rs) {
assert(!m_xcompile); assert(!m_xcompile);
assert(hdb); assert(bdb);
// Ensure that scratch is appropriate for this database. // Ensure that scratch is appropriate for this database.
if (!allocScratch(hdb)) { if (!allocScratch(bdb)) {
out << "Scratch alloc failed." << endl; out << "Scratch alloc failed." << endl;
return false; return false;
} }
MultiContext ctx(id, *hdb, &rs, single_pattern, out); MultiContext ctx(id, *bdb, &rs, single_pattern, out);
if (!g_corpora_suffix.empty()) { if (!g_corpora_suffix.empty()) {
ctx.use_max_offset = true; ctx.use_max_offset = true;
ctx.max_offset = buffer.size() - g_corpora_suffix.size(); ctx.max_offset = buffer.size() - g_corpora_suffix.size();
@ -702,11 +869,20 @@ bool UltimateTruth::run(unsigned int id, shared_ptr<const HyperscanDB> hdb,
switch (colliderMode) { switch (colliderMode) {
case MODE_BLOCK: case MODE_BLOCK:
return blockScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); return blockScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
case MODE_STREAMING: case MODE_STREAMING:
return streamingScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); return streamingScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
case MODE_VECTORED: case MODE_VECTORED:
return vectoredScan(*hdb, buffer, align, callbackMulti, &ctx, &rs); return vectoredScan(*bdb, buffer, align, callbackMulti, &ctx, &rs);
case MODE_HYBRID:
#ifdef HS_HYBRID
return hybridScan(*bdb, buffer, align, callbackHybrid, errorCallback,
&ctx, &rs);
#else
cerr << "Hybrid mode not available in this build." << endl;
abort();
#endif
break;
} }
assert(0); assert(0);
@ -739,7 +915,7 @@ bool isOrdered(const string &expr, unsigned int flags) {
return ordered; return ordered;
} }
static unique_ptr<HyperscanDB> static unique_ptr<BaseDB>
compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags, compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
vector<unsigned> &idsvec, ptr_vector<hs_expr_ext> &ext, vector<unsigned> &idsvec, ptr_vector<hs_expr_ext> &ext,
unsigned mode, const hs_platform_info *platform, string &error, unsigned mode, const hs_platform_info *platform, string &error,
@ -762,7 +938,30 @@ compileHyperscan(vector<const char *> &patterns, vector<unsigned> &flags,
return ue2::make_unique<HyperscanDB>(db, idsvec.begin(), idsvec.end()); return ue2::make_unique<HyperscanDB>(db, idsvec.begin(), idsvec.end());
} }
shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids, #ifdef HS_HYBRID
/**
 * Compile the given patterns into a Chimera (hybrid) database.
 *
 * patterns, flags and idsvec are parallel arrays describing the expressions
 * to compile; mode and platform are forwarded to ch_compile_multi unchanged.
 *
 * Returns a HybridDB wrapping the compiled database on success. On failure
 * returns nullptr and stores a human-readable message in error.
 */
static unique_ptr<BaseDB>
compileHybrid(vector<const char *> &patterns,
              vector<unsigned> &flags, vector<unsigned> &idsvec,
              unsigned mode, const hs_platform_info *platform, string &error) {
    const unsigned count = patterns.size();
    ch_database_t *db = nullptr;
    // Initialised so that we never read an indeterminate pointer if the
    // compiler fails without producing an error object.
    ch_compile_error_t *compile_err = nullptr;

    // data() is well defined even for an empty vector, unlike &v[0].
    ch_error_t err = ch_compile_multi(patterns.data(), flags.data(),
                                      idsvec.data(), count, mode, platform,
                                      &db, &compile_err);

    // Chimera calls report ch_error_t values, so compare against the
    // Chimera success code rather than the Hyperscan one.
    if (err != CH_SUCCESS) {
        if (compile_err) {
            error = compile_err->message;
            ch_free_compile_error(compile_err);
        } else {
            error = "Unknown Chimera compile error.";
        }
        return nullptr;
    }

    return ue2::make_unique<HybridDB>(db, idsvec.begin(), idsvec.end());
}
#endif
shared_ptr<BaseDB> UltimateTruth::compile(const set<unsigned> &ids,
string &error) const { string &error) const {
// Build our vectors for compilation // Build our vectors for compilation
const size_t count = ids.size(); const size_t count = ids.size();
@ -811,6 +1010,17 @@ shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
ext[n].edit_distance = edit_distance; ext[n].edit_distance = edit_distance;
} }
if (colliderMode == MODE_HYBRID) {
if (ext[n].flags) {
error = "Hybrid does not support extended parameters.";
return nullptr;
}
// We can also strip some other flags in the hybrid matcher.
flags[n] &= ~HS_FLAG_PREFILTER; // prefilter always used
flags[n] &= ~HS_FLAG_ALLOWEMPTY; // empty always allowed
flags[n] &= ~HS_FLAG_SOM_LEFTMOST; // SOM always on
}
n++; n++;
} }
@ -827,8 +1037,18 @@ shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
idsvec.push_back(0); idsvec.push_back(0);
} }
auto db = compileHyperscan(patterns, flags, idsvec, ext, m_mode, platform, unique_ptr<BaseDB> db;
error, grey); if (colliderMode == MODE_HYBRID) {
#ifdef HS_HYBRID
db = compileHybrid(patterns, flags, idsvec, m_mode, platform, error);
#else
error = "Hybrid mode not available in this build.";
#endif
} else {
db = compileHyperscan(patterns, flags, idsvec, ext, m_mode,
platform, error, grey);
}
if (!db) { if (!db) {
return nullptr; return nullptr;
} }
@ -850,18 +1070,29 @@ shared_ptr<HyperscanDB> UltimateTruth::compile(const set<unsigned> &ids,
return move(db); return move(db);
} }
bool UltimateTruth::allocScratch(shared_ptr<const HyperscanDB> db) { bool UltimateTruth::allocScratch(shared_ptr<const BaseDB> db) {
assert(db); assert(db);
// We explicitly avoid running scratch allocators for the same HyperscanDB // We explicitly avoid running scratch allocators for the same BaseDB
// over and over again by retaining a shared_ptr to the last one we saw. // over and over again by retaining a shared_ptr to the last one we saw.
if (db == last_db) { if (db == last_db) {
return true; return true;
} }
hs_error_t err = hs_alloc_scratch(db.get()->db, &scratch); if (colliderMode == MODE_HYBRID) {
if (err != HS_SUCCESS) { #ifdef HS_HYBRID
return false; ch_error_t err = ch_alloc_scratch(
reinterpret_cast<const HybridDB *>(db.get())->db, &chimeraScratch);
if (err != HS_SUCCESS) {
return false;
}
#endif // HS_HYBRID
} else {
hs_error_t err = hs_alloc_scratch(
reinterpret_cast<const HyperscanDB *>(db.get())->db, &scratch);
if (err != HS_SUCCESS) {
return false;
}
} }
last_db = db; last_db = db;
@ -869,20 +1100,40 @@ bool UltimateTruth::allocScratch(shared_ptr<const HyperscanDB> db) {
} }
bool UltimateTruth::cloneScratch(void) { bool UltimateTruth::cloneScratch(void) {
hs_scratch_t *old_scratch = scratch; if (colliderMode == MODE_HYBRID) {
hs_scratch_t *new_scratch; #ifdef HS_HYBRID
hs_error_t ret = hs_clone_scratch(scratch, &new_scratch); ch_scratch_t *old_scratch = chimeraScratch;
if (ret != HS_SUCCESS) { ch_scratch_t *new_scratch;
DEBUG_PRINTF("failure to clone %d\n", ret); ch_error_t ret = ch_clone_scratch(chimeraScratch, &new_scratch);
return false; if (ret != CH_SUCCESS) {
DEBUG_PRINTF("failure to clone %d\n", ret);
return false;
}
chimeraScratch = new_scratch;
ret = ch_free_scratch(old_scratch);
if (ret != CH_SUCCESS) {
DEBUG_PRINTF("failure to free %d\n", ret);
return false;
}
DEBUG_PRINTF("hybrid scratch cloned from %p to %p\n",
old_scratch, chimeraScratch);
#endif // HS_HYBRID
} else {
hs_scratch_t *old_scratch = scratch;
hs_scratch_t *new_scratch;
hs_error_t ret = hs_clone_scratch(scratch, &new_scratch);
if (ret != HS_SUCCESS) {
DEBUG_PRINTF("failure to clone %d\n", ret);
return false;
}
scratch = new_scratch;
ret = hs_free_scratch(old_scratch);
if (ret != HS_SUCCESS) {
DEBUG_PRINTF("failure to free %d\n", ret);
return false;
}
DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch);
} }
scratch = new_scratch;
ret = hs_free_scratch(old_scratch);
if (ret != HS_SUCCESS) {
DEBUG_PRINTF("failure to free %d\n", ret);
return false;
}
DEBUG_PRINTF("scratch cloned from %p to %p\n", old_scratch, scratch);
return true; return true;
} }
@ -947,20 +1198,35 @@ char *UltimateTruth::setupVecScanBuffer(const char *begin, size_t len,
return ptr; return ptr;
} }
bool UltimateTruth::saveDatabase(const HyperscanDB &hdb, bool UltimateTruth::saveDatabase(const BaseDB &bdb,
const string &filename) const { const string &filename) const {
return ::saveDatabase(hdb.db, filename.c_str(), g_verbose); if (colliderMode == MODE_HYBRID) {
cerr << "Hybrid mode doesn't support serialization." << endl;
abort();
} else {
return ::saveDatabase(reinterpret_cast<const HyperscanDB *>(&bdb)->db,
filename.c_str(), g_verbose);
}
return false;
} }
shared_ptr<HyperscanDB> shared_ptr<BaseDB>
UltimateTruth::loadDatabase(const string &filename, UltimateTruth::loadDatabase(const string &filename,
const std::set<unsigned> &ids) const { const std::set<unsigned> &ids) const {
hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose); shared_ptr<BaseDB> db;
if (!hs_db) {
return nullptr; if (colliderMode == MODE_HYBRID) {
cerr << "Hybrid mode doesn't support deserialization." << endl;
abort();
} else {
hs_database_t *hs_db = ::loadDatabase(filename.c_str(), g_verbose);
if (!hs_db) {
return nullptr;
}
db = make_shared<HyperscanDB>(hs_db, ids.begin(), ids.end());
} }
auto db = make_shared<HyperscanDB>(hs_db, ids.begin(), ids.end());
assert(db); assert(db);
// Fill db::ordered with the expressions that require the ordered flag. // Fill db::ordered with the expressions that require the ordered flag.

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -33,6 +33,10 @@
#include "hs.h" #include "hs.h"
#ifdef HS_HYBRID
#include "chimera/ch.h"
#endif
#include <memory> #include <memory>
#include <ostream> #include <ostream>
#include <set> #include <set>
@ -47,7 +51,7 @@ struct Grey;
} // namespace ue2 } // namespace ue2
class HyperscanDB; class BaseDB;
class ResultSet; class ResultSet;
// Wrapper around ue2 to generate results for an expression and corpus. // Wrapper around ue2 to generate results for an expression and corpus.
@ -59,13 +63,13 @@ public:
~UltimateTruth(); ~UltimateTruth();
std::shared_ptr<HyperscanDB> compile(const std::set<unsigned> &ids, std::shared_ptr<BaseDB> compile(const std::set<unsigned> &ids,
std::string &error) const; std::string &error) const;
bool saveDatabase(const HyperscanDB &db, bool saveDatabase(const BaseDB &db,
const std::string &filename) const; const std::string &filename) const;
std::shared_ptr<HyperscanDB> std::shared_ptr<BaseDB>
loadDatabase(const std::string &filename, loadDatabase(const std::string &filename,
const std::set<unsigned> &ids) const; const std::set<unsigned> &ids) const;
@ -74,7 +78,7 @@ public:
return !m_xcompile; return !m_xcompile;
} }
bool run(unsigned id, std::shared_ptr<const HyperscanDB> db, bool run(unsigned id, std::shared_ptr<const BaseDB> db,
const std::string &buffer, bool single_pattern, unsigned align, const std::string &buffer, bool single_pattern, unsigned align,
ResultSet &rs); ResultSet &rs);
@ -84,22 +88,28 @@ public:
std::string dbFilename(const std::set<unsigned int> &ids) const; std::string dbFilename(const std::set<unsigned int> &ids) const;
private: private:
bool blockScan(const HyperscanDB &db, const std::string &buffer, bool blockScan(const BaseDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx, size_t align, match_event_handler callback, void *ctx,
ResultSet *rs); ResultSet *rs);
bool streamingScan(const HyperscanDB &db, const std::string &buffer, bool streamingScan(const BaseDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx, size_t align, match_event_handler callback, void *ctx,
ResultSet *rs); ResultSet *rs);
bool vectoredScan(const HyperscanDB &db, const std::string &buffer, bool vectoredScan(const BaseDB &db, const std::string &buffer,
size_t align, match_event_handler callback, void *ctx, size_t align, match_event_handler callback, void *ctx,
ResultSet *rs); ResultSet *rs);
#ifdef HS_HYBRID
bool hybridScan(const BaseDB &db, const std::string &buffer,
size_t align, ch_match_event_handler callback,
ch_error_event_handler error_callback,
void *ctx, ResultSet *rs);
#endif // HS_HYBRID
char *setupScanBuffer(const char *buf, size_t len, size_t align); char *setupScanBuffer(const char *buf, size_t len, size_t align);
char *setupVecScanBuffer(const char *buf, size_t len, size_t align, char *setupVecScanBuffer(const char *buf, size_t len, size_t align,
unsigned int block_id); unsigned int block_id);
bool allocScratch(std::shared_ptr<const HyperscanDB> db); bool allocScratch(std::shared_ptr<const BaseDB> db);
bool cloneScratch(void); bool cloneScratch(void);
@ -126,6 +136,11 @@ private:
// Scratch space for Hyperscan. // Scratch space for Hyperscan.
hs_scratch_t *scratch; hs_scratch_t *scratch;
#ifdef HS_HYBRID
// Scratch space for Chimera.
ch_scratch_t *chimeraScratch;
#endif // HS_HYBRID
// Temporary scan buffer used for realigned scanning // Temporary scan buffer used for realigned scanning
std::vector<char> m_scanBuf; std::vector<char> m_scanBuf;
@ -134,7 +149,7 @@ private:
// Last database we successfully allocated scratch for, so that we can // Last database we successfully allocated scratch for, so that we can
// avoid unnecessarily reallocating for it. // avoid unnecessarily reallocating for it.
std::shared_ptr<const HyperscanDB> last_db; std::shared_ptr<const BaseDB> last_db;
const hs_platform_info *platform; const hs_platform_info *platform;
}; };

View File

@ -76,6 +76,7 @@ void usage(const char *name, const char *error) {
"blocks.\n"); "blocks.\n");
printf(" -V NUM Use vectored mode, split data into ~NUM " printf(" -V NUM Use vectored mode, split data into ~NUM "
"blocks.\n"); "blocks.\n");
printf(" -H Use hybrid mode.\n");
printf(" -Z {R or 0-%d} Only test one alignment, either as given or " printf(" -Z {R or 0-%d} Only test one alignment, either as given or "
"'R' for random.\n", MAX_MAX_UE2_ALIGN - 1); "'R' for random.\n", MAX_MAX_UE2_ALIGN - 1);
printf(" -q Quiet; display only match differences, no other " printf(" -q Quiet; display only match differences, no other "
@ -90,6 +91,7 @@ void usage(const char *name, const char *error) {
printf(" -E DISTANCE Match all patterns within edit distance" printf(" -E DISTANCE Match all patterns within edit distance"
" DISTANCE.\n"); " DISTANCE.\n");
printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n"); printf(" --prefilter Apply HS_FLAG_PREFILTER to all patterns.\n");
printf(" --no-groups Disable capturing in Hybrid mode.\n");
printf("\n"); printf("\n");
printf("Testing mode options:\n"); printf("Testing mode options:\n");
printf("\n"); printf("\n");
@ -157,7 +159,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
vector<string> *corpora, UNUSED Grey *grey, vector<string> *corpora, UNUSED Grey *grey,
unique_ptr<hs_platform_info> *plat_out) { unique_ptr<hs_platform_info> *plat_out) {
static const char options[] static const char options[]
= "-ab:cC:d:D:e:E:G:hi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8"; = "-ab:cC:d:D:e:E:G:hHi:k:Lm:M:n:o:O:p:P:qr:R:S:s:t:T:vV:w:x:X:Y:z:Z:8";
s32 in_multi = 0; s32 in_multi = 0;
s32 in_corpora = 0; s32 in_corpora = 0;
int pcreFlag = 1; int pcreFlag = 1;
@ -180,6 +182,7 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
{"no-signal-handler", 0, &no_signal_handler, 1}, {"no-signal-handler", 0, &no_signal_handler, 1},
{"compress-expand", 0, &compressFlag, 1}, {"compress-expand", 0, &compressFlag, 1},
{"compress-reset-expand", 0, &compressResetFlag, 1}, {"compress-reset-expand", 0, &compressResetFlag, 1},
{"no-groups", 0, &no_groups, 1},
{nullptr, 0, nullptr, 0}}; {nullptr, 0, nullptr, 0}};
for (;;) { for (;;) {
@ -271,6 +274,15 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
case 'h': case 'h':
usage(argv[0], nullptr); usage(argv[0], nullptr);
exit(0); exit(0);
case 'H':
if (colliderMode != MODE_BLOCK) {
usage(argv[0], "You can only use one mode at a time!");
exit(1);
}
colliderMode = MODE_HYBRID;
// Disable graph truth in hybrid mode
nfaFlag = 0;
break;
case 'i': case 'i':
loadDatabases = true; loadDatabases = true;
serializePath = optarg; serializePath = optarg;
@ -542,6 +554,11 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop,
exit(1); exit(1);
} }
if (colliderMode == MODE_HYBRID && !ue2Flag) {
usage(argv[0], "You cannot disable UE2 engine in Hybrid mode.");
exit(1);
}
// need at least two pattern engines active // need at least two pattern engines active
if (nfaFlag + pcreFlag + ue2Flag < 2) { if (nfaFlag + pcreFlag + ue2Flag < 2) {
usage(argv[0], "At least two pattern engines should be active."); usage(argv[0], "At least two pattern engines should be active.");

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -36,7 +36,8 @@
enum ColliderMode { enum ColliderMode {
MODE_BLOCK, MODE_BLOCK,
MODE_STREAMING, MODE_STREAMING,
MODE_VECTORED MODE_VECTORED,
MODE_HYBRID
}; };
extern unsigned numThreads; extern unsigned numThreads;
@ -68,6 +69,7 @@ extern unsigned max_ue2_align;
extern size_t g_memoryLimit; extern size_t g_memoryLimit;
extern bool force_utf8; extern bool force_utf8;
extern int force_prefilter; extern int force_prefilter;
extern int no_groups;
extern unsigned somPrecisionMode; extern unsigned somPrecisionMode;
extern unsigned limit_matches; extern unsigned limit_matches;
extern unsigned randomSeed; extern unsigned randomSeed;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2017, Intel Corporation * Copyright (c) 2015-2018, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -448,6 +448,9 @@ void printMode(void) {
case MODE_VECTORED: case MODE_VECTORED:
cout << "Vectored-" << g_streamBlocks; cout << "Vectored-" << g_streamBlocks;
break; break;
case MODE_HYBRID:
cout << "Hybrid";
break;
} }
if (use_copy_scratch) { if (use_copy_scratch) {
@ -690,7 +693,7 @@ shared_ptr<DatabaseProxy> constructDatabase(const set<unsigned int> &ids,
if (loadDatabases) { if (loadDatabases) {
string filename = ultimate.dbFilename(ids); string filename = ultimate.dbFilename(ids);
shared_ptr<HyperscanDB> db = ultimate.loadDatabase(filename, ids); shared_ptr<BaseDB> db = ultimate.loadDatabase(filename, ids);
if (!db) { if (!db) {
if (!g_quiet) { if (!g_quiet) {
cout << "FAILED: could not load database " << filename << endl; cout << "FAILED: could not load database " << filename << endl;
@ -706,7 +709,7 @@ shared_ptr<DatabaseProxy> constructDatabase(const set<unsigned int> &ids,
// If we're not runnable (i.e. we're cross-compiling), let's at least // If we're not runnable (i.e. we're cross-compiling), let's at least
// try to build the database. // try to build the database.
if (!ultimate.runnable()) { if (!ultimate.runnable()) {
shared_ptr<HyperscanDB> db = ue2->get(ultimate); shared_ptr<BaseDB> db = ue2->get(ultimate);
assert(db); // throws otherwise assert(db); // throws otherwise
} }
@ -872,7 +875,7 @@ void runTestUnit(ostream &out, GroundTruth &ground, GraphTruth &graph,
assert(use_UE2); assert(use_UE2);
Corpus &corpus = unit.corpus; Corpus &corpus = unit.corpus;
shared_ptr<const HyperscanDB> db; shared_ptr<const BaseDB> db;
if (use_UE2) { if (use_UE2) {
// Acquire UE2 database. // Acquire UE2 database.
debug_stage = STAGE_UE2_COMPILE; debug_stage = STAGE_UE2_COMPILE;
@ -1648,6 +1651,7 @@ void printSettingsV(const vector<string> &corporaFiles,
case MODE_BLOCK: cout << "block mode"; break; case MODE_BLOCK: cout << "block mode"; break;
case MODE_STREAMING: cout << "streaming mode"; break; case MODE_STREAMING: cout << "streaming mode"; break;
case MODE_VECTORED: cout << "vectored mode"; break; case MODE_VECTORED: cout << "vectored mode"; break;
case MODE_HYBRID: cout << "hybrid mode"; break;
} }
cout << endl; cout << endl;
@ -1746,6 +1750,7 @@ void printSettingsQ(const vector<string> &corporaFiles,
case MODE_BLOCK: cout << "block mode"; break; case MODE_BLOCK: cout << "block mode"; break;
case MODE_STREAMING: cout << "streaming mode"; break; case MODE_STREAMING: cout << "streaming mode"; break;
case MODE_VECTORED: cout << "vectored mode"; break; case MODE_VECTORED: cout << "vectored mode"; break;
case MODE_HYBRID: cout << "hybrid mode"; break;
} }
cout << endl; cout << endl;

View File

@ -123,22 +123,58 @@ set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}")
target_link_libraries(unit-internal hs corpusomatic) target_link_libraries(unit-internal hs corpusomatic)
endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) endif(NOT (RELEASE_BUILD OR FAT_RUNTIME))
# if (BUILD_CHIMERA)
# build target to run unit tests # enable Chimera unit tests
# set(unit_chimera_SOURCES
if (NOT RELEASE_BUILD) ${gtest_SOURCES}
add_custom_target( chimera/allocators.cpp
unit chimera/arg_checks.cpp
COMMAND bin/unit-internal chimera/bad_patterns.cpp
COMMAND bin/unit-hyperscan chimera/compat.cpp
WORKING_DIRECTORY ${CMAKE_BINARY_DIR} chimera/main.cpp
DEPENDS unit-internal unit-hyperscan chimera/scan.cpp
) )
else () add_executable(unit-chimera ${unit_chimera_SOURCES})
add_custom_target( target_link_libraries(unit-chimera chimera hs pcre)
unit #
COMMAND bin/unit-hyperscan # build target to run unit tests
WORKING_DIRECTORY ${CMAKE_BINARY_DIR} #
DEPENDS unit-hyperscan if (NOT RELEASE_BUILD)
) add_custom_target(
unit
COMMAND bin/unit-internal
COMMAND bin/unit-hyperscan
COMMAND bin/unit-chimera
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
DEPENDS unit-internal unit-hyperscan unit-chimera
)
else ()
add_custom_target(
unit
COMMAND bin/unit-hyperscan
COMMAND bin/unit-chimera
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
DEPENDS unit-hyperscan unit-chimera
)
endif()
else()
#
# build target to run unit tests
#
if (NOT RELEASE_BUILD)
add_custom_target(
unit
COMMAND bin/unit-internal
COMMAND bin/unit-hyperscan
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
DEPENDS unit-internal unit-hyperscan
)
else ()
add_custom_target(
unit
COMMAND bin/unit-hyperscan
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
DEPENDS unit-hyperscan
)
endif()
endif() endif()

149
unit/chimera/allocators.cpp Normal file
View File

@ -0,0 +1,149 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "gtest/gtest.h"
#include "chimera/ch.h"
#include <cstdlib>
#include <string>
using std::string;
// Allocator stub that always fails: the requested size is ignored and a null
// pointer is returned.
static void *null_malloc(size_t) {
    return nullptr;
}
// Helper: compile the single literal pattern "foobar" and hand the resulting
// database back through hydb. If compilation fails, a fatal gtest assertion
// fires inside this helper and *hydb is not written.
static
void makeDatabase(ch_database_t **hydb) {
    static const char *patterns[] = { "foobar" };

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;

    // Null flags, null ids, one pattern, mode 0, null platform.
    const ch_error_t err = ch_compile_multi(patterns, nullptr, nullptr, 1, 0,
                                            nullptr, &db, &compile_err);
    ASSERT_EQ(CH_SUCCESS, err);
    ASSERT_TRUE(db != nullptr);

    *hydb = db;
}
// ch_database_info is expected to report CH_NOMEM when the installed
// allocator returns null.
TEST(HybridAllocator, DatabaseInfoBadAlloc) {
    ch_database_t *db = nullptr;
    makeDatabase(&db);
    ASSERT_TRUE(db != nullptr);

    ch_set_allocator(null_malloc, nullptr);

    char *info = nullptr;
    ch_error_t err = ch_database_info(db, &info);

    // Undo the allocator override and release the database BEFORE asserting:
    // a failing ASSERT_* returns from the test body immediately, which would
    // otherwise leave the failing allocator installed for every later test.
    ch_set_allocator(nullptr, nullptr);
    ch_free_database(db);

    ASSERT_EQ(CH_NOMEM, err);
}
// Allocator that requests len + 2 bytes from malloc and returns a pointer two
// bytes past the start of that region (or null if malloc fails). Memory
// obtained here must be released with two_aligned_free.
static
void *two_aligned_malloc(size_t len) {
    char *base = static_cast<char *>(malloc(len + 2));
    return base ? base + 2 : nullptr;
}
// Counterpart of two_aligned_malloc: steps the pointer back by the two-byte
// offset applied at allocation time before handing it to free. A null pointer
// is ignored.
static
void two_aligned_free(void *mem) {
    if (mem) {
        free(static_cast<char *>(mem) - 2);
    }
}
// Compiling while the database allocator hands back pointers offset by two
// bytes is expected to fail with CH_COMPILER_ERROR and produce no database.
TEST(HybridAllocator, TwoAlignedCompile) {
    ch_set_database_allocator(two_aligned_malloc, two_aligned_free);

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;
    const hs_platform_info_t *platform = nullptr;
    ch_error_t err =
        ch_compile("foobar", 0, CH_MODE_GROUPS, platform, &db, &compile_err);

    // Undo the database allocator override before any fatal assertion can
    // return early; otherwise a failure here would leave the offset allocator
    // installed for every later test in the binary.
    ch_set_database_allocator(nullptr, nullptr);

    ASSERT_EQ(CH_COMPILER_ERROR, err);
    ASSERT_EQ(nullptr, db);
    ASSERT_NE(nullptr, compile_err);
    ch_free_compile_error(compile_err);
}
// With the misc allocator returning offset pointers, compiling an invalid
// pattern is expected to fail and report the misaligned-memory message.
TEST(HybridAllocator, TwoAlignedCompileError) {
    ch_set_misc_allocator(two_aligned_malloc, two_aligned_free);

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;
    const ch_error_t err =
        ch_compile("\\1", 0, CH_MODE_GROUPS, nullptr, &db, &compile_err);

    ASSERT_EQ(CH_COMPILER_ERROR, err);
    ASSERT_EQ(nullptr, db);
    ASSERT_NE(nullptr, compile_err);
    EXPECT_STREQ("Allocator returned misaligned memory.", compile_err->message);

    // The error object is released while the misc allocator override is still
    // in place; only afterwards are the allocators reset.
    ch_free_compile_error(compile_err);
    ch_set_database_allocator(nullptr, nullptr);
    ch_set_misc_allocator(nullptr, nullptr);
}
// ch_database_info is expected to report CH_BAD_ALLOC when the misc allocator
// hands back pointers offset by two bytes.
TEST(HybridAllocator, TwoAlignedDatabaseInfo) {
    ch_database_t *db = nullptr;
    makeDatabase(&db);
    // makeDatabase's own assertions only abort the helper, so check here too
    // before using the database.
    ASSERT_TRUE(db != nullptr);

    ch_set_misc_allocator(two_aligned_malloc, two_aligned_free);

    char *info = nullptr;
    ch_error_t err = ch_database_info(db, &info);

    // Undo the allocator override and release the database before asserting,
    // so that a failure cannot leak the override into later tests.
    ch_set_misc_allocator(nullptr, nullptr);
    ch_free_database(db);

    ASSERT_EQ(CH_BAD_ALLOC, err);
}
// ch_alloc_scratch is expected to report CH_BAD_ALLOC when the scratch
// allocator hands back pointers offset by two bytes.
TEST(HybridAllocator, TwoAlignedAllocScratch) {
    ch_database_t *db = nullptr;
    makeDatabase(&db);
    // makeDatabase's own assertions only abort the helper, so check here too
    // before using the database.
    ASSERT_TRUE(db != nullptr);

    ch_set_scratch_allocator(two_aligned_malloc, two_aligned_free);

    ch_scratch_t *scratch = nullptr;
    ch_error_t err = ch_alloc_scratch(db, &scratch);

    // Undo the allocator override and release the database before asserting,
    // so that a failure cannot leak the override into later tests.
    ch_set_scratch_allocator(nullptr, nullptr);
    ch_free_database(db);

    ASSERT_EQ(CH_BAD_ALLOC, err);
}

591
unit/chimera/arg_checks.cpp Normal file
View File

@ -0,0 +1,591 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gtest/gtest.h"
#include "chimera/ch.h"
// Arbitrary text bytes (a fragment of test source) that do not form a valid
// Chimera object.
// NOTE(review): no use of this buffer is visible in this part of the file;
// presumably it is passed where a database or scratch pointer is expected to
// exercise argument validation -- confirm against the tests further down.
static char garbage[] = "TEST(HybridArgChecks, DatabaseSizeNoDatabase) {" \
" size_t sz = ch_database_size(0);" \
" ASSERT_EQ(0, sz);";
namespace /* anonymous */ {
// No-op match callback: every argument is ignored and the callback always
// returns CH_CALLBACK_CONTINUE.
ch_callback_t dummyHandler(unsigned,
                           unsigned long long,
                           unsigned long long,
                           unsigned,
                           unsigned,
                           const ch_capture_t *,
                           void *) {
    return CH_CALLBACK_CONTINUE;
}
// Helper: compile the single pattern "foo.*bar" and hand the resulting
// database back through hydb. If compilation fails, a fatal gtest assertion
// fires inside this helper and *hydb is not written.
static
void makeDatabase(ch_database_t **hydb) {
    static const char *patterns[] = { "foo.*bar" };

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;

    // Null flags, null ids, one pattern, mode 0, null platform.
    const ch_error_t err = ch_compile_multi(patterns, nullptr, nullptr, 1, 0,
                                            nullptr, &db, &compile_err);
    ASSERT_EQ(CH_SUCCESS, err);
    ASSERT_TRUE(db != nullptr);

    *hydb = db;
}
// Helper: given a database, build me some scratch.
static
void makeScratch(const ch_database_t *db,
ch_scratch_t **scratch) {
ch_error_t err = ch_alloc_scratch(db, scratch);
ASSERT_EQ(CH_SUCCESS, err);
ASSERT_TRUE(*scratch != nullptr);
}
// Break the magic number of the given database.
// First verifies that the database begins with the bytes 0xde 0xde, then
// overwrites the first byte with 0xdc so the stored magic no longer matches.
void breakDatabaseMagic(ch_database *db) {
// database magic should be 0xdede at the start
ASSERT_TRUE(memcmp("\xde\xde", db, 2) == 0);
*(char *)db = 0xdc;
}
// Corrupt the version number of the given database by incrementing the byte
// at offset 4 (the start of the second u32, where the version is stored).
void breakDatabaseVersion(ch_database *db) {
    char *bytes = reinterpret_cast<char *>(db);
    bytes[4] += 1;
}
// ch_version: the returned string must be non-null, start with a digit and
// have a dot as its second character.
TEST(HybridArgChecks, Version) {
    const char *version = ch_version();

    ASSERT_TRUE(version != nullptr);
    ASSERT_TRUE(version[0] >= '0' && version[0] <= '9')
        << "First byte should be a digit.";
    ASSERT_EQ('.', version[1])
        << "Second byte should be a dot.";
}
// ch_compile: each unsupported flag value must be rejected with a compile
// error carrying the "Unrecognized flag used." message.
TEST(HybridArgChecks, SingleBogusFlags) {
    static const unsigned int badflags[] = {
        0xffffffff,
        16,
        128,
        256,
        512,
    };
    const char expr[] = "foobar";

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;

    for (const unsigned int flag : badflags) {
        const ch_error_t err =
            ch_compile(expr, flag, 0, nullptr, &db, &compile_err);

        EXPECT_EQ(CH_COMPILER_ERROR, err);
        EXPECT_TRUE(db == nullptr);
        EXPECT_TRUE(compile_err != nullptr);
        EXPECT_STREQ("Unrecognized flag used.", compile_err->message);
        ch_free_compile_error(compile_err);
    }
}
// ch_compile: each invalid mode value must be rejected with a compile error
// carrying the "Invalid mode flag supplied." message.
TEST(HybridArgChecks, SingleBogusMode) {
    static const unsigned int badModes[] = {
        0xffffffff,
        1,
        2,
        CH_MODE_GROUPS << 1, // this was our largest mode flag
    };
    const char expr[] = "foobar";

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;

    for (const unsigned int badMode : badModes) {
        const ch_error_t err =
            ch_compile(expr, 0, badMode, nullptr, &db, &compile_err);

        EXPECT_EQ(CH_COMPILER_ERROR, err);
        EXPECT_TRUE(db == nullptr);
        EXPECT_TRUE(compile_err != nullptr);
        EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message);
        ch_free_compile_error(compile_err);
    }
}
// ch_compile: a null pattern must be rejected with a compile error and no
// database.
TEST(HybridArgChecks, SingleCompileBlockNoPattern) {
    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;

    const ch_error_t err =
        ch_compile(nullptr, 0, 0, nullptr, &db, &compile_err);

    EXPECT_EQ(CH_COMPILER_ERROR, err);
    EXPECT_TRUE(db == nullptr);
    EXPECT_TRUE(compile_err != nullptr);
    ch_free_compile_error(compile_err);
}
// ch_compile: a null database output pointer must be rejected with a compile
// error.
TEST(HybridArgChecks, SingleCompileBlockNoDatabase) {
    const char expr[] = "foobar";
    ch_compile_error_t *compile_err = nullptr;

    const ch_error_t err =
        ch_compile(expr, 0, 0, nullptr, nullptr, &compile_err);

    EXPECT_EQ(CH_COMPILER_ERROR, err);
    EXPECT_TRUE(compile_err != nullptr);
    ch_free_compile_error(compile_err);
}
// ch_compile_multi: each unsupported flag value must be rejected with a
// compile error carrying the "Unrecognized flag used." message.
TEST(HybridArgChecks, MultiBogusFlags) {
    static const unsigned int badflags[] = {
        0xffffffff,
        16, // HS_FLAG_ERROREOD
        128,
        256,
        512,
    };
    const char *expr[] = { "foobar" };

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;

    // Iterate by reference so that the address passed as the flags array is
    // the element inside badflags itself.
    for (const unsigned int &flag : badflags) {
        const ch_error_t err = ch_compile_multi(expr, &flag, nullptr, 1, 0,
                                                nullptr, &db, &compile_err);

        EXPECT_EQ(CH_COMPILER_ERROR, err);
        EXPECT_TRUE(db == nullptr);
        EXPECT_TRUE(compile_err != nullptr);
        EXPECT_STREQ("Unrecognized flag used.", compile_err->message);
        ch_free_compile_error(compile_err);
    }
}
// ch_compile_multi: each invalid mode value must be rejected with a compile
// error carrying the "Invalid mode flag supplied." message.
TEST(HybridArgChecks, MultiBogusMode) {
    static const unsigned int badModes[] = {
        0xffffffff,
        1,
        2,
        CH_MODE_GROUPS << 1, // this was our largest mode flag
    };
    const char *expr[] = { "foobar" };

    ch_database_t *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;

    for (const unsigned int badMode : badModes) {
        const ch_error_t err = ch_compile_multi(expr, nullptr, nullptr, 1,
                                                badMode, nullptr, &db,
                                                &compile_err);

        EXPECT_EQ(CH_COMPILER_ERROR, err);
        EXPECT_TRUE(db == nullptr);
        EXPECT_TRUE(compile_err != nullptr);
        EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message);
        ch_free_compile_error(compile_err);
    }
}
// ch_compile_multi: a null expression array (with a count of one) must be
// rejected with CH_COMPILER_ERROR, no database and a compile error object.
TEST(HybridArgChecks, MultiCompileBlockNoPattern) {
    ch_compile_error_t *compileError = nullptr;
    ch_database_t *database = nullptr;

    const ch_error_t rv = ch_compile_multi(nullptr, nullptr, nullptr, 1, 0,
                                           nullptr, &database, &compileError);

    EXPECT_EQ(CH_COMPILER_ERROR, rv);
    EXPECT_TRUE(database == nullptr);
    EXPECT_TRUE(compileError != nullptr);
    ch_free_compile_error(compileError);
}
// ch_compile_multi: an element count of zero must be rejected with
// CH_COMPILER_ERROR even though the expression array itself is valid.
TEST(HybridArgChecks, MultiCompileZeroPatterns) {
    const char *patterns[] = {"foobar"};
    ch_compile_error_t *compileError = nullptr;
    ch_database_t *database = nullptr;

    const ch_error_t rv = ch_compile_multi(patterns, nullptr, nullptr, 0, 0,
                                           nullptr, &database, &compileError);

    EXPECT_EQ(CH_COMPILER_ERROR, rv);
    EXPECT_TRUE(database == nullptr);
    EXPECT_TRUE(compileError != nullptr);
    ch_free_compile_error(compileError);
}
// ch_compile_multi: a null output-database pointer must be rejected with
// CH_COMPILER_ERROR and a non-null compile error object.
TEST(HybridArgChecks, MultiCompileBlockNoDatabase) {
    const char *patterns[] = {"foobar"};
    ch_compile_error_t *compileError = nullptr;

    const ch_error_t rv = ch_compile_multi(patterns, nullptr, nullptr, 1, 0,
                                           nullptr, nullptr, &compileError);

    EXPECT_EQ(CH_COMPILER_ERROR, rv);
    EXPECT_TRUE(compileError != nullptr);
    ch_free_compile_error(compileError);
}
// ch_compile_ext_multi: Hand the compiler a bogus flag.
// Every entry of badflags is used as the only flag word for a single
// pattern; each compile must fail with CH_COMPILER_ERROR, leave the database
// pointer null and report "Unrecognized flag used.".
TEST(HybridArgChecks, ExtMultiBogusFlags) {
    static const unsigned int badflags[] = {
        0xffffffff,
        16, // HS_FLAG_ERROREOD
        128,
        256,
        512,
    };
    for (size_t i = 0; i < sizeof(badflags)/sizeof(badflags[0]); i++) {
        // Fresh outputs per iteration so that a pointer freed in the previous
        // round can never be mistaken for a newly reported error.
        ch_database_t *db = nullptr;
        ch_compile_error_t *compile_err = nullptr;
        const char *expr[] = { "foobar" };
        ch_error_t err = ch_compile_ext_multi(expr, &badflags[i], nullptr, 1,
                                              0, 10000000, 8000, nullptr, &db,
                                              &compile_err);
        EXPECT_EQ(CH_COMPILER_ERROR, err);
        EXPECT_TRUE(db == nullptr);
        // Fatal check: the message comparison below dereferences compile_err.
        ASSERT_TRUE(compile_err != nullptr);
        EXPECT_STREQ("Unrecognized flag used.", compile_err->message);
        ch_free_compile_error(compile_err);
    }
}
// ch_compile_ext_multi: Hand ch_compile_ext_multi a bogus mode.
// Every entry of badModes is passed as the mode argument; each compile must
// fail with CH_COMPILER_ERROR, leave the database pointer null and report
// "Invalid mode flag supplied.".
TEST(HybridArgChecks, ExtMultiBogusMode) {
    static const unsigned int badModes[] = {
        0xffffffff,
        1,
        2,
        CH_MODE_GROUPS << 1, // this was our largest mode flag
    };
    for (size_t i = 0; i < sizeof(badModes)/sizeof(badModes[0]); i++) {
        // Fresh outputs per iteration so that a pointer freed in the previous
        // round can never be mistaken for a newly reported error.
        ch_database_t *db = nullptr;
        ch_compile_error_t *compile_err = nullptr;
        const char *expr[] = { "foobar" };
        ch_error_t err = ch_compile_ext_multi(expr, nullptr, nullptr, 1,
                                              badModes[i], 10000000, 8000,
                                              nullptr, &db, &compile_err);
        EXPECT_EQ(CH_COMPILER_ERROR, err);
        EXPECT_TRUE(db == nullptr);
        // Fatal check: the message comparison below dereferences compile_err.
        ASSERT_TRUE(compile_err != nullptr);
        EXPECT_STREQ("Invalid mode flag supplied.", compile_err->message);
        ch_free_compile_error(compile_err);
    }
}
// ch_compile_ext_multi: a null expression array (with a count of one) must be
// rejected with CH_COMPILER_ERROR, no database and a compile error object.
TEST(HybridArgChecks, ExtMultiCompileBlockNoPattern) {
    ch_compile_error_t *compileError = nullptr;
    ch_database_t *database = nullptr;

    const ch_error_t rv =
        ch_compile_ext_multi(nullptr, nullptr, nullptr, 1, 0, 10000000, 8000,
                             nullptr, &database, &compileError);

    EXPECT_EQ(CH_COMPILER_ERROR, rv);
    EXPECT_TRUE(database == nullptr);
    EXPECT_TRUE(compileError != nullptr);
    ch_free_compile_error(compileError);
}
// ch_compile_ext_multi: an element count of zero must be rejected with
// CH_COMPILER_ERROR even though the expression array itself is valid.
TEST(HybridArgChecks, ExtMultiCompileZeroPatterns) {
    const char *patterns[] = {"foobar"};
    ch_compile_error_t *compileError = nullptr;
    ch_database_t *database = nullptr;

    const ch_error_t rv =
        ch_compile_ext_multi(patterns, nullptr, nullptr, 0, 0, 10000000, 8000,
                             nullptr, &database, &compileError);

    EXPECT_EQ(CH_COMPILER_ERROR, rv);
    EXPECT_TRUE(database == nullptr);
    EXPECT_TRUE(compileError != nullptr);
    ch_free_compile_error(compileError);
}
// ch_compile_ext_multi: a null output-database pointer must be rejected with
// CH_COMPILER_ERROR and a non-null compile error object.
TEST(HybridArgChecks, ExtMultiCompileBlockNoDatabase) {
    const char *patterns[] = {"foobar"};
    ch_compile_error_t *compileError = nullptr;

    const ch_error_t rv =
        ch_compile_ext_multi(patterns, nullptr, nullptr, 1, 0, 10000000, 8000,
                             nullptr, nullptr, &compileError);

    EXPECT_EQ(CH_COMPILER_ERROR, rv);
    EXPECT_TRUE(compileError != nullptr);
    ch_free_compile_error(compileError);
}
// ch_scan: scanning with a null database must fail, and the failure must not
// be reported as CH_SCAN_TERMINATED.
TEST(HybridArgChecks, ScanBlockNoDatabase) {
    ch_database_t *database = nullptr;
    ch_scratch_t *scratchSpace = nullptr;
    makeDatabase(&database);
    makeScratch(database, &scratchSpace);

    const ch_error_t rv = ch_scan(nullptr, "data", 4, 0, scratchSpace,
                                  dummyHandler, nullptr, nullptr);
    ASSERT_NE(CH_SUCCESS, rv);
    EXPECT_NE(CH_SCAN_TERMINATED, rv);

    // teardown
    const ch_error_t freeRv = ch_free_scratch(scratchSpace);
    ASSERT_EQ(CH_SUCCESS, freeRv);
    ch_free_database(database);
}
// ch_scan: a database whose magic has been corrupted must be rejected with
// CH_INVALID.
TEST(HybridArgChecks, ScanBlockBrokenDatabaseMagic) {
    ch_database_t *database = nullptr;
    ch_scratch_t *scratchSpace = nullptr;
    makeDatabase(&database);
    makeScratch(database, &scratchSpace);

    // Corrupt the database only once the scratch has been allocated from it.
    breakDatabaseMagic(database);

    const ch_error_t rv = ch_scan(database, "data", 4, 0, scratchSpace,
                                  dummyHandler, nullptr, nullptr);
    ASSERT_EQ(CH_INVALID, rv);

    // teardown
    const ch_error_t freeRv = ch_free_scratch(scratchSpace);
    ASSERT_EQ(CH_SUCCESS, freeRv);
    // NOTE(review): released with free() rather than ch_free_database(),
    // presumably because the corrupted header would be rejected -- confirm.
    free(database);
}
// ch_scan: a database whose version has been corrupted must be rejected with
// CH_DB_VERSION_ERROR.
TEST(HybridArgChecks, ScanBlockBrokenDatabaseVersion) {
    ch_database_t *database = nullptr;
    ch_scratch_t *scratchSpace = nullptr;
    makeDatabase(&database);
    makeScratch(database, &scratchSpace);

    // Corrupt the database only once the scratch has been allocated from it.
    breakDatabaseVersion(database);

    const ch_error_t rv = ch_scan(database, "data", 4, 0, scratchSpace,
                                  dummyHandler, nullptr, nullptr);
    ASSERT_EQ(CH_DB_VERSION_ERROR, rv);

    // teardown
    const ch_error_t freeRv = ch_free_scratch(scratchSpace);
    ASSERT_EQ(CH_SUCCESS, freeRv);
    ch_free_database(database);
}
// ch_scan: a null data pointer with a non-zero length must fail, and the
// failure must not be reported as CH_SCAN_TERMINATED.
TEST(HybridArgChecks, ScanBlockNoData) {
    ch_database_t *database = nullptr;
    ch_scratch_t *scratchSpace = nullptr;
    makeDatabase(&database);
    makeScratch(database, &scratchSpace);

    const ch_error_t rv = ch_scan(database, nullptr, 4, 0, scratchSpace,
                                  dummyHandler, nullptr, nullptr);
    ASSERT_NE(CH_SUCCESS, rv);
    EXPECT_NE(CH_SCAN_TERMINATED, rv);

    // teardown
    const ch_error_t freeRv = ch_free_scratch(scratchSpace);
    ASSERT_EQ(CH_SUCCESS, freeRv);
    ch_free_database(database);
}
// ch_scan: a null scratch pointer must fail, and the failure must not be
// reported as CH_SCAN_TERMINATED.
TEST(HybridArgChecks, ScanBlockNoScratch) {
    ch_database_t *database = nullptr;
    makeDatabase(&database);

    const ch_error_t rv = ch_scan(database, "data", 4, 0, nullptr,
                                  dummyHandler, nullptr, nullptr);
    ASSERT_NE(CH_SUCCESS, rv);
    EXPECT_NE(CH_SCAN_TERMINATED, rv);

    // teardown
    ch_free_database(database);
}
// ch_scan: a null match callback is accepted; the scan is expected to
// complete with CH_SUCCESS.
TEST(HybridArgChecks, ScanBlockNoHandler) {
    ch_database_t *database = nullptr;
    ch_scratch_t *scratchSpace = nullptr;
    makeDatabase(&database);
    makeScratch(database, &scratchSpace);

    const ch_error_t rv = ch_scan(database, "data", 4, 0, scratchSpace,
                                  nullptr, nullptr, nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);
    EXPECT_NE(CH_SCAN_TERMINATED, rv);

    // teardown
    const ch_error_t freeRv = ch_free_scratch(scratchSpace);
    ASSERT_EQ(CH_SUCCESS, freeRv);
    ch_free_database(database);
}
// ch_alloc_scratch: a null database must fail and leave the scratch pointer
// untouched (still null).
TEST(HybridArgChecks, AllocScratchNoDatabase) {
    ch_scratch_t *scratchSpace = nullptr;

    const ch_error_t rv = ch_alloc_scratch(nullptr, &scratchSpace);

    EXPECT_NE(CH_SUCCESS, rv);
    EXPECT_TRUE(scratchSpace == nullptr);
}
// ch_alloc_scratch: a null pointer-to-scratch must be rejected with
// CH_INVALID.
TEST(HybridArgChecks, AllocScratchNullScratchPtr) {
    ch_database_t *database = nullptr;
    makeDatabase(&database);

    const ch_error_t rv = ch_alloc_scratch(database, nullptr);
    ASSERT_EQ(CH_INVALID, rv);

    // teardown
    ch_free_database(database);
}
// ch_alloc_scratch: an existing "scratch" that is really a buffer filled with
// 0xf0 bytes must be rejected with CH_INVALID.
TEST(HybridArgChecks, AllocScratchBogusScratch) {
    ch_database_t *database = nullptr;
    makeDatabase(&database);

    const size_t bogusBytes = 100;
    void *raw = malloc(bogusBytes);
    memset(raw, 0xf0, bogusBytes);
    ch_scratch_t *bogus = (ch_scratch_t *)raw;

    const ch_error_t rv = ch_alloc_scratch(database, &bogus);
    ASSERT_EQ(CH_INVALID, rv);

    // teardown
    free(bogus);
    ch_free_database(database);
}
// ch_alloc_scratch: a database whose magic has been corrupted must be
// rejected with CH_INVALID.
TEST(HybridArgChecks, AllocScratchBadDatabaseMagic) {
    ch_database_t *database = nullptr;
    ch_scratch_t *scratchSpace = nullptr;
    makeDatabase(&database);
    breakDatabaseMagic(database);

    const ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_INVALID, rv);

    // teardown
    // NOTE(review): released with free() rather than ch_free_database(),
    // presumably because the corrupted header would be rejected -- confirm.
    free(database);
}
// ch_alloc_scratch: a database whose version has been corrupted must be
// rejected with CH_DB_VERSION_ERROR.
TEST(HybridArgChecks, AllocScratchBadDatabaseVersion) {
    ch_database_t *database = nullptr;
    ch_scratch_t *scratchSpace = nullptr;
    makeDatabase(&database);
    breakDatabaseVersion(database);

    const ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_DB_VERSION_ERROR, rv);

    // teardown
    ch_free_database(database);
}
// ch_clone_scratch: cloning from a null source must fail and leave the
// destination pointer null.
TEST(HybridArgChecks, CloneScratchNoSource) {
    ch_scratch_t *source = nullptr;
    ch_scratch_t *clone = nullptr;

    const ch_error_t rv = ch_clone_scratch(source, &clone);

    EXPECT_NE(CH_SUCCESS, rv);
    EXPECT_TRUE(clone == nullptr);
}
// ch_database_size: a null database must be rejected with CH_INVALID and the
// output size must stay at zero.
TEST(HybridArgChecks, DatabaseSizeNoDatabase) {
    size_t reported = 0;

    const ch_error_t rv = ch_database_size(nullptr, &reported);

    ASSERT_EQ(CH_INVALID, rv);
    ASSERT_EQ(0U, reported);
}
// ch_clone_scratch: cloning from a heap copy of the `garbage` buffer must be
// rejected with CH_INVALID.
TEST(HybridArgChecks, CloneBadScratch) {
    // Work on a heap copy of the garbage bytes rather than the buffer itself.
    void *heapCopy = malloc(sizeof(garbage));
    memcpy(heapCopy, garbage, sizeof(garbage));

    ch_scratch_t *fakeScratch = (ch_scratch_t *)heapCopy;
    ch_scratch_t *clone = nullptr;
    const ch_error_t rv = ch_clone_scratch(fakeScratch, &clone);

    // Release the copy before asserting so a failure does not leak it.
    free(heapCopy);
    ASSERT_EQ(CH_INVALID, rv);
}
// ch_scan: scanning with a heap copy of the `garbage` buffer as scratch must
// be rejected with CH_INVALID.
TEST(HybridArgChecks, ScanBadScratch) {
    ch_database_t *database = nullptr;
    makeDatabase(&database);

    // Work on a heap copy of the garbage bytes rather than the buffer itself.
    void *heapCopy = malloc(sizeof(garbage));
    memcpy(heapCopy, garbage, sizeof(garbage));
    ch_scratch_t *fakeScratch = (ch_scratch_t *)heapCopy;

    const ch_error_t rv = ch_scan(database, "data", 4, 0, fakeScratch,
                                  dummyHandler, nullptr, nullptr);

    // Release the copy before asserting so a failure does not leak it.
    free(heapCopy);
    ASSERT_EQ(CH_INVALID, rv);

    // teardown
    ch_free_database(database);
}
// ch_free_database: freeing a null database is expected to succeed.
TEST(HybridArgChecks, ch_free_database_null) {
    const ch_error_t rv = ch_free_database(nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);
}
// ch_free_database: the `garbage` buffer is not a database and must be
// rejected with CH_INVALID.
TEST(HybridArgChecks, ch_free_database_garbage) {
    ch_database_t *notADatabase = (ch_database_t *)garbage;
    const ch_error_t rv = ch_free_database(notADatabase);
    ASSERT_EQ(CH_INVALID, rv);
}
// ch_free_scratch: freeing a null scratch is expected to succeed.
TEST(HybridArgChecks, ch_free_scratch_null) {
    const ch_error_t rv = ch_free_scratch(nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);
}
// ch_free_scratch: the `garbage` buffer is not a scratch and must be rejected
// with CH_INVALID.
TEST(HybridArgChecks, ch_free_scratch_garbage) {
    ch_scratch_t *notAScratch = (ch_scratch_t *)garbage;
    const ch_error_t rv = ch_free_scratch(notAScratch);
    ASSERT_EQ(CH_INVALID, rv);
}
// ch_free_compile_error: freeing a null compile error is expected to succeed.
TEST(HybridArgChecks, ch_free_compile_error_null) {
    const ch_error_t rv = ch_free_compile_error(nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);
}
} // namespace

View File

@ -0,0 +1,95 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gtest/gtest.h"
#include "chimera/ch.h"
using namespace testing;
// Value-parameterised fixture: the parameter is one pattern string. The
// fixture adds no state or behaviour of its own.
class HybridCompile : public TestWithParam<const char *> {};
// Compiling the parameter pattern must fail: no database is produced and a
// compile error object is returned (and released here).
TEST_P(HybridCompile, BadPattern) {
    const char *pattern = GetParam();
    ch_database_t *database = nullptr;
    ch_compile_error_t *compileError = nullptr;

    const ch_error_t rv = ch_compile_multi(&pattern, nullptr, nullptr, 1, 0,
                                           nullptr, &database, &compileError);

    ASSERT_NE(CH_SUCCESS, rv) << "Compile should have failed for expr: "
                              << pattern;
    ASSERT_TRUE(database == nullptr);
    ASSERT_TRUE(compileError != nullptr);
    ch_free_compile_error(compileError);
}
// Patterns that the HybridCompile.BadPattern test expects to be rejected,
// grouped by the kind of error they contain.
static const char * BAD_PATTERNS[] = {
    // --- unbalanced parentheses ---
    "(foo",
    "foo)",
    "((foo)",
    "(foo))",

    // --- quantifier with nothing to repeat ---
    "a+++",
    "a+?+",
    "a???",
    "a??+",
    "?qa",
    "*abc",
    "+abc",

    // --- quantified boundary assertions, which are not allowed (UE-1007) ---
    "^?0",
    "^*0",
    "^+0",
    "^{1,3}0",
    "0$?",
    "0$*",
    "0$+",
    "0${1,3}",

    // --- malformed character classes ---
    "[]",
    "[]foobar",
    "[`-\\80",

    // --- invalid named (POSIX-style) classes ---
    "[[:foo:]]",
    "[[:1234:]]",
    "[[:f\\oo:]]",
    "[[: :]]",
    "[[:...:]]",
    "[[:l\\ower:]]",
    "[[:abc\\:]]",
    "[abc[:x\\]pqr:]]",
    "[[:a\\dz:]]",

    // --- trailing unescaped backslash ---
    "foobar\\",
};

// Run HybridCompile once for every entry of BAD_PATTERNS.
INSTANTIATE_TEST_CASE_P(Compile, HybridCompile, ValuesIn(BAD_PATTERNS));

56
unit/chimera/compat.cpp Normal file
View File

@ -0,0 +1,56 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gtest/gtest.h"
#include "chimera/ch.h"
#include "hs.h"
// We currently depend on the Chimera (CH_*) defines having the same values
// as the Hyperscan (HS_*) defines with the same meaning. This test compares
// each such pair, so a divergence in either header shows up as a failure.
TEST(HybridCompat, Defines) {
    // Pattern flags: each HS_FLAG_* must equal its CH_FLAG_* counterpart.
    EXPECT_EQ(HS_FLAG_CASELESS, CH_FLAG_CASELESS);
    EXPECT_EQ(HS_FLAG_DOTALL, CH_FLAG_DOTALL);
    EXPECT_EQ(HS_FLAG_MULTILINE, CH_FLAG_MULTILINE);
    EXPECT_EQ(HS_FLAG_SINGLEMATCH, CH_FLAG_SINGLEMATCH);
    EXPECT_EQ(HS_FLAG_UTF8, CH_FLAG_UTF8);
    EXPECT_EQ(HS_FLAG_UCP, CH_FLAG_UCP);
    // Error codes: each HS_* code must equal its CH_* counterpart.
    EXPECT_EQ(HS_SUCCESS, CH_SUCCESS);
    EXPECT_EQ(HS_INVALID, CH_INVALID);
    EXPECT_EQ(HS_NOMEM, CH_NOMEM);
    EXPECT_EQ(HS_SCAN_TERMINATED, CH_SCAN_TERMINATED);
    EXPECT_EQ(HS_COMPILER_ERROR, CH_COMPILER_ERROR);
    EXPECT_EQ(HS_DB_VERSION_ERROR, CH_DB_VERSION_ERROR);
    EXPECT_EQ(HS_DB_PLATFORM_ERROR, CH_DB_PLATFORM_ERROR);
    EXPECT_EQ(HS_DB_MODE_ERROR, CH_DB_MODE_ERROR);
    EXPECT_EQ(HS_BAD_ALIGN, CH_BAD_ALIGN);
    EXPECT_EQ(HS_BAD_ALLOC, CH_BAD_ALLOC);
    EXPECT_EQ(HS_SCRATCH_IN_USE, CH_SCRATCH_IN_USE);
}

35
unit/chimera/main.cpp Normal file
View File

@ -0,0 +1,35 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gtest/gtest.h"
// Test driver. Lets GoogleTest parse its command-line options, then runs
// every registered test (they are defined in the other source files of this
// directory) and returns the RUN_ALL_TESTS() result as the exit status.
int main(int argc, char **argv) {
    testing::InitGoogleTest(&argc, argv);
    const int status = RUN_ALL_TESTS();
    return status;
}

551
unit/chimera/scan.cpp Normal file
View File

@ -0,0 +1,551 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <vector>
#include <tuple>
#include "gtest/gtest.h"
#include "chimera/ch.h"
using namespace std;
using namespace testing;
namespace {
// A set of patterns together with their per-pattern flag words, kept in two
// parallel vectors so they can be handed to the multi-pattern compile calls
// as plain arrays.
class HybridScanParams {
public:
    // Create an empty set; populate it with add().
    HybridScanParams() {}

    // Create a set holding the single pattern `s` with flags `f`.
    HybridScanParams(const char *s, unsigned int f)
        : patterns(1, s), flags(1, f) {}

    // Append one pattern and its flags. The pointer is stored as-is; the
    // string is not copied.
    void add(const char *pattern, unsigned int myflags) {
        patterns.push_back(pattern);
        flags.push_back(myflags);
    }

    // Number of patterns in the set.
    size_t size() const {
        return patterns.size();
    }

    // Array of size() pattern pointers. data() is used instead of
    // &patterns[0] so that calling this on an empty set is well defined.
    const char * const * getPatterns() const {
        return patterns.data();
    }

    // Array of size() flag words, parallel to getPatterns(). data() is used
    // instead of &flags[0] so that calling this on an empty set is well
    // defined.
    const unsigned int * getFlags() const {
        return flags.data();
    }

private:
    vector<const char *> patterns;
    vector<unsigned int> flags;
};
// Build the list of pattern sets that the HybridScan fixture is instantiated
// with. The order of the sets and of the patterns inside each set is fixed.
static
vector<HybridScanParams> paramFactory() {
    vector<HybridScanParams> cases;

    // Some simple single-pattern cases.
    cases.emplace_back(".", CH_FLAG_DOTALL);
    cases.emplace_back("foobar", 0);
    cases.emplace_back("foo.*bar", 0);
    cases.emplace_back("fred.*bill", CH_FLAG_DOTALL);
    cases.emplace_back(".*", 0); // vacuosity!
    cases.emplace_back("\\A(.?.{7,27}jf[tmqq]l(f|t|hgmr.+.fg|abks)){3,7}", 0);
    cases.emplace_back("^begin", CH_FLAG_MULTILINE);
    cases.emplace_back("match", CH_FLAG_SINGLEMATCH);

    // Single-pattern cases where the pattern isn't supported by hyperscan but
    // can be prefiltered.
    cases.emplace_back("foo(?!bar)", 0);
    cases.emplace_back("(sens|respons)e and \\1ibility", 0);

    // A case that can't be prefiltered (as of this writing) because it's too
    // gosh-darned big. This tests that the hybrid matcher can run without the
    // multi-matcher (or with a "fake" one).
    cases.emplace_back("((c(p|p)h{2,}bh.|p|((((cq|j|c|(\\b)|.[^nbgn]|(\\B)[qfh]a)){10,12}|ih|a|mnde[pa].|.g)){5,8})){3}", 0);

    // Simple multi-pattern literal case.
    {
        HybridScanParams literals;
        literals.add("hatstand", 0);
        literals.add("teakettle", 0);
        literals.add("badgerbrush", 0);
        literals.add("mnemosyne", 0);
        cases.push_back(literals);
    }

    // More complex multi-pattern case.
    {
        HybridScanParams complexSet;
        complexSet.add("foo.{3,7}bar", 0);
        complexSet.add("foo.{30,70}bar", 0);
        complexSet.add("foobar.*foobar", 0);
        complexSet.add("^blingwrapper.*foo", 0);
        complexSet.add("[0-9a-f]{70,}\\n", 0);
        cases.push_back(complexSet);
    }

    // A couple of trivial Unicode patterns, mostly to make sure we accept
    // the flags.
    {
        HybridScanParams unicodeSet;
        unicodeSet.add("foo.*bar", CH_FLAG_UTF8);
        unicodeSet.add("today", CH_FLAG_UTF8|CH_FLAG_UCP);
        cases.push_back(unicodeSet);
    }

    // PCRE exotica.
    {
        HybridScanParams exotica;
        exotica.add("benign literal", 0);
        exotica.add("(?|(abc)|(def))\\1", 0);
        exotica.add("(?|(abc)|(def))(?1)", 0);
        exotica.add("(sens|respons)e and \\1ibility", 0);
        exotica.add("\\w+(?=;)", 0);
        exotica.add("foo(?!bar)", 0);
        exotica.add("(?<=bullock|donkey)", 0);
        cases.push_back(exotica);
    }

    return cases;
}
// Match callback that ignores all of its arguments and always asks the scan
// to continue.
static
ch_callback_t dummyHandler(unsigned, unsigned long long, unsigned long long,
                           unsigned, unsigned, const ch_capture_t *, void *) {
    return CH_CALLBACK_CONTINUE; // nothing else to do
}
// Sanity-check the capture information passed to a match callback.
//   num      - number of entries in `captured`; must be at least one.
//   captured - capture array; must be non-null, entry 0 must be active, and
//              every active entry must satisfy from <= to.
// Uses fatal gtest assertions, so the first violation ends the check.
static
void checkGroups(unsigned int num, const ch_capture_t *captured) {
    // We should have _some_ group info.
    ASSERT_LT(0U, num);
    ASSERT_TRUE(captured != nullptr);
    // Group 0 is always active.
    ASSERT_TRUE(captured[0].flags & CH_CAPTURE_FLAG_ACTIVE);
    // Sanity-checking.
    for (unsigned int i = 0; i < num; i++) {
        if (!(captured[i].flags & CH_CAPTURE_FLAG_ACTIVE)) {
            continue; // inactive groups carry no offsets to validate
        }
        // Leading space added so the message reads "Group 3 not sane.".
        ASSERT_LE(captured[i].from, captured[i].to) << "Group " << i
                                                    << " not sane.";
    }
}
// Match callback for group-capturing databases: validates the capture array
// through checkGroups() and then asks the scan to continue.
static
ch_callback_t dummyGroupHandler(unsigned, unsigned long long,
                                unsigned long long, unsigned,
                                unsigned groupCount,
                                const ch_capture_t *groupInfo, void *) {
    checkGroups(groupCount, groupInfo);
    return CH_CALLBACK_CONTINUE;
}
// Fixture parameterised on (pattern set, capture-groups on/off). SetUp()
// compiles the pattern set in the requested mode and allocates a scratch for
// it; TearDown() releases both.
class HybridScan : public TestWithParam<tuple<HybridScanParams, bool>> {
protected:
    virtual void SetUp() {
        ch_compile_error_t *compile_err = nullptr;
        const HybridScanParams &hsp = get<0>(GetParam());
        groups = get<1>(GetParam());

        ch_error_t err =
            ch_compile_ext_multi(hsp.getPatterns(), hsp.getFlags(), nullptr,
                                 hsp.size(), groups ? CH_MODE_GROUPS :
                                 CH_MODE_NOGROUPS, 10000000, 8000,
                                 nullptr, &db, &compile_err);
        if (err != CH_SUCCESS) {
            // Report why the compile failed and release the error object
            // before bailing out, instead of leaking it.
            const string reason = (compile_err && compile_err->message)
                                      ? compile_err->message
                                      : "(no message)";
            ch_free_compile_error(compile_err);
            FAIL() << "ch_compile_ext_multi failed with error " << err
                   << ": " << reason;
        }
        ASSERT_TRUE(db != nullptr);

        err = ch_alloc_scratch(db, &scratch);
        ASSERT_EQ(err, CH_SUCCESS);
        ASSERT_TRUE(scratch != nullptr);
    }

    virtual void TearDown() {
        ch_free_database(db);
        ch_free_scratch(scratch);
    }

    ch_database_t *db = nullptr;     // compiled in SetUp()
    ch_scratch_t *scratch = nullptr; // allocated for db in SetUp()
    bool groups = false;             // true when compiled with CH_MODE_GROUPS
};
// Four-line text buffer scanned by the HybridScan tests.
static const string SCAN_DATA =
    "Beware the Jabberwock, my son!\n"
    "The jaws that bite, the claws that catch!\n"
    "Beware the Jubjub bird, and shun\n"
    "The frumious Bandersnatch!\n";
// The compiled database must report a size above 16 bytes, and scanning
// SCAN_DATA with the handler matching the compile mode must succeed.
TEST_P(HybridScan, BuildAndScan) {
    ASSERT_TRUE(db != nullptr);

    size_t dbSize;
    ch_error_t rv = ch_database_size(db, &dbSize);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_LT(16U, dbSize);

    ch_match_event_handler onMatch = dummyHandler;
    if (groups) {
        onMatch = dummyGroupHandler;
    }
    rv = ch_scan(db, SCAN_DATA.c_str(), SCAN_DATA.length(), 0, scratch,
                 onMatch, nullptr, nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);
}
// Scan a buffer of 4000 '*' characters with a few literals spliced in; the
// scan must succeed.
TEST_P(HybridScan, ScanNearly4KData) {
    ASSERT_TRUE(db != nullptr);

    string buffer(4000, '*'); // it's full of stars!

    // Insert some strings that will match a few patterns. The insertions are
    // applied in table order.
    static const struct {
        size_t offset;
        const char *text;
    } insertions[] = {
        { 278, "foo" },
        { 285, "bar" },
        { 1178, "foobar" },
        { 1894, "bar" },
        { 3000, "foobar" },
    };
    for (const auto &ins : insertions) {
        buffer.insert(ins.offset, ins.text);
    }

    ch_match_event_handler onMatch = dummyHandler;
    if (groups) {
        onMatch = dummyGroupHandler;
    }
    const ch_error_t rv = ch_scan(db, buffer.c_str(), buffer.length(), 0,
                                  scratch, onMatch, nullptr, nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);
}
// Scan a 5 MiB buffer of '*' characters with a few literals spliced in; the
// scan must succeed.
TEST_P(HybridScan, ScanBigData) {
    ASSERT_TRUE(db != nullptr);

    // More than 4MB, as that pushes us into using PCRE for non-Pawn cases.
    string buffer(5*1024*1024, '*'); // it's full of stars!

    // Insert some strings that will match a few patterns. The insertions are
    // applied in table order.
    static const struct {
        size_t offset;
        const char *text;
    } insertions[] = {
        { 278, "foo" },
        { 285, "bar" },
        { 1178, "foobar" },
        { 1894, "bar" },
        { 3000, "foobar" },
    };
    for (const auto &ins : insertions) {
        buffer.insert(ins.offset, ins.text);
    }

    ch_match_event_handler onMatch = dummyHandler;
    if (groups) {
        onMatch = dummyGroupHandler;
    }
    const ch_error_t rv = ch_scan(db, buffer.c_str(), buffer.length(), 0,
                                  scratch, onMatch, nullptr, nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);
}
// Clone the fixture's scratch and scan SCAN_DATA with the clone; both steps
// must succeed.
TEST_P(HybridScan, ScanClonedScratch) {
    ASSERT_TRUE(db != nullptr);

    ch_scratch_t *scratchCopy = nullptr;
    ch_error_t rv = ch_clone_scratch(scratch, &scratchCopy);
    ASSERT_EQ(CH_SUCCESS, rv);

    ch_match_event_handler onMatch = dummyHandler;
    if (groups) {
        onMatch = dummyGroupHandler;
    }
    rv = ch_scan(db, SCAN_DATA.c_str(), SCAN_DATA.length(), 0, scratchCopy,
                 onMatch, nullptr, nullptr);
    ASSERT_EQ(CH_SUCCESS, rv);

    ch_free_scratch(scratchCopy);
}
// ch_database_info must succeed and return a string that begins with
// "Chimera ".
TEST_P(HybridScan, DatabaseInfo) {
    ASSERT_TRUE(db != nullptr);

    char *rawInfo = nullptr;
    const ch_error_t rv = ch_database_info(db, &rawInfo);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(rawInfo != nullptr);

    const string expectedPrefix("Chimera ");
    const string description(rawInfo);
    ASSERT_GE(description.size(), expectedPrefix.size());
    const string actualPrefix = description.substr(0, expectedPrefix.size());
    ASSERT_EQ(expectedPrefix, actualPrefix);

    free(rawInfo);
}
// ch_scratch_size must succeed and report a size greater than zero for the
// fixture's scratch.
TEST_P(HybridScan, NonZeroScratchSize) {
    ASSERT_TRUE(db != nullptr);
    // Start from zero so a call that fails to write the size cannot leave an
    // indeterminate value behind.
    size_t curr_size = 0;
    ch_error_t err = ch_scratch_size(scratch, &curr_size);
    ASSERT_EQ(CH_SUCCESS, err);
    // Unsigned literal, matching the other size checks in these tests and
    // avoiding a signed/unsigned comparison inside the assertion.
    ASSERT_LT(0U, curr_size);
}
// Run every HybridScan test for each pattern set produced by paramFactory(),
// once with the bool parameter false and once with it true.
INSTANTIATE_TEST_CASE_P(Scan, HybridScan,
Combine(ValuesIn(paramFactory()), Bool()));
// Match callback: increments the unsigned int that `ctx` points to and asks
// the scan to continue.
static
ch_callback_t countHandler(unsigned, unsigned long long, unsigned long long,
                           unsigned, unsigned, const ch_capture_t *,
                           void *ctx) {
    unsigned int &matchCount = *(unsigned int *)ctx;
    matchCount += 1;
    return CH_CALLBACK_CONTINUE;
}
// Match callback: increments the unsigned int that `ctx` points to and
// returns CH_CALLBACK_SKIP_PATTERN.
static
ch_callback_t skipHandler(unsigned, unsigned long long, unsigned long long,
                          unsigned, unsigned, const ch_capture_t *,
                          void *ctx) {
    unsigned int &matchCount = *(unsigned int *)ctx;
    matchCount += 1;
    return CH_CALLBACK_SKIP_PATTERN;
}
// Match callback: increments the unsigned int that `ctx` points to and
// returns CH_CALLBACK_TERMINATE.
static
ch_callback_t terminateHandler(unsigned, unsigned long long, unsigned long long,
                               unsigned, unsigned, const ch_capture_t *,
                               void *ctx) {
    unsigned int &matchCount = *(unsigned int *)ctx;
    matchCount += 1;
    return CH_CALLBACK_TERMINATE;
}
// Compile `num` expressions (no flags, no ids, mode 0) into *db.
//   db   - receives the compiled database; set to nullptr first.
//   expr - array of `num` pattern strings.
//   num  - number of entries in `expr`.
// On failure a fatal gtest failure carrying the compiler's message is
// recorded and the compile error object is released.
static
void makeDatabase(ch_database_t **db, const char * const expr[], size_t num) {
    *db = nullptr;
    ch_compile_error_t *compile_err = nullptr;
    ch_error_t err = ch_compile_ext_multi(expr, nullptr, nullptr, num, 0,
                                          10000000, 8000, nullptr, db,
                                          &compile_err);
    if (err != CH_SUCCESS) {
        // Copy the message out before freeing the error object that owns it.
        const string reason = (compile_err && compile_err->message)
                                  ? compile_err->message
                                  : "(no message)";
        ch_free_compile_error(compile_err);
        FAIL() << "ch_compile_ext_multi failed with error " << err << ": "
               << reason;
    }
    ASSERT_TRUE(*db != nullptr);
}
// Context handed to rescan_block_cb: the database and scratch of the scan in
// progress, plus a counter of how many times the callback has fired.
struct RescanContext {
    const ch_database_t *db;
    ch_scratch_t *scratch;
    size_t matches;

    RescanContext(const ch_database_t *db_in, ch_scratch_t *scratch_in)
        : db(db_in), scratch(scratch_in), matches(0) {}
};
static
int rescan_block_cb(unsigned, unsigned long long, unsigned long long, unsigned,
unsigned, const ch_capture_t *, void *ctx) {
RescanContext *rctx = (RescanContext *)ctx;
rctx->matches++;
const string data = "___foo___bar_";
hs_error_t err = ch_scan(rctx->db, data.c_str(), data.length(), 0,
rctx->scratch, nullptr, nullptr, nullptr);
EXPECT_EQ(CH_SCRATCH_IN_USE, err);
return 0;
}
// A scan whose callback attempts a nested scan on the same scratch must
// still succeed and deliver exactly one match.
TEST(Scan, ScratchInUse) {
    static const char * const patterns[] = { "foo.*bar" };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 1);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    RescanContext context(database, scratchSpace);
    const string input("___foo___bar_");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 rescan_block_cb, nullptr, &context);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_EQ(1U, context.matches);

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// With a callback that returns CH_CALLBACK_SKIP_PATTERN, the single pattern
// "." must be reported exactly once and the scan must succeed.
TEST(Scan, CallbackSkip1) {
    static const char * const patterns[] = { "." };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 1);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("qwertyuiop");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 skipHandler, nullptr, &hits);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_EQ(1U, hits);

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// With a callback that returns CH_CALLBACK_SKIP_PATTERN, each of the two
// patterns must be reported exactly once and the scan must succeed.
TEST(Scan, CallbackSkip2) {
    static const char * const patterns[] = { "[a-z]+", "[0-9]" };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 2);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("foo 0123 0 bar 39483 n34jfhlqekrcoi3q4");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 skipHandler, nullptr, &hits);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_EQ(2U, hits); // both patterns should match once

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// This case includes a pattern that we use libpcre for.
// With a callback that returns CH_CALLBACK_SKIP_PATTERN, each of the two
// patterns must be reported exactly once and the scan must succeed.
TEST(Scan, CallbackSkip3) {
    static const char * const patterns[] = { "[a-z]+", "foo(?!bar)" };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 2);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("foobaz foobing foobar");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 skipHandler, nullptr, &hits);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_EQ(2U, hits); // both patterns should match once

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// With a callback that always continues, every match of both patterns must
// be reported: seven callbacks for this input.
TEST(Scan, CallbackNoSkip1) {
    static const char * const patterns[] = { "foo|bar", "[0-9]{3}" };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 2);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("foo 012 bar 345 foobar 678");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 countHandler, nullptr, &hits);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_EQ(7U, hits); // seven matches in total

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// With a callback that always continues, every match of both patterns must
// be reported: four callbacks for this input.
TEST(Scan, CallbackNoSkip2) {
    static const char * const patterns[] = { "foo(?!bar)", "[0-9]{3}" };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 2);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("foo 012 bar 345 foobar 678");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 countHandler, nullptr, &hits);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_EQ(4U, hits); // four matches in total

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// With a callback that returns CH_CALLBACK_TERMINATE, the scan must stop
// after one callback and return CH_SCAN_TERMINATED.
TEST(Scan, CallbackTerm1) {
    static const char * const patterns[] = { "." };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 1);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("qwertyuiop");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 terminateHandler, nullptr, &hits);
    ASSERT_EQ(CH_SCAN_TERMINATED, rv);
    ASSERT_EQ(1U, hits);

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// With a callback that returns CH_CALLBACK_TERMINATE and two patterns, the
// scan must stop after one callback and return CH_SCAN_TERMINATED.
TEST(Scan, CallbackTerm2) {
    static const char * const patterns[] = { "[a-z]+", "[0-9]" };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 2);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("foo 0123 0 bar 39483 n34jfhlqekrcoi3q4");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 terminateHandler, nullptr, &hits);
    ASSERT_EQ(CH_SCAN_TERMINATED, rv);
    ASSERT_EQ(1U, hits);

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
// This case includes a pattern that we use libpcre for.
// With a callback that returns CH_CALLBACK_TERMINATE, the scan must stop
// after one callback and return CH_SCAN_TERMINATED.
TEST(Scan, CallbackTerm3) {
    static const char * const patterns[] = { "[a-z]+", "foo(?!bar)" };
    ch_database_t *database = nullptr;
    makeDatabase(&database, patterns, 2);

    ch_scratch_t *scratchSpace = nullptr;
    ch_error_t rv = ch_alloc_scratch(database, &scratchSpace);
    ASSERT_EQ(CH_SUCCESS, rv);
    ASSERT_TRUE(scratchSpace != nullptr);

    unsigned int hits = 0;
    const string input("foobaz foobing foobar");
    rv = ch_scan(database, input.c_str(), input.length(), 0, scratchSpace,
                 terminateHandler, nullptr, &hits);
    ASSERT_EQ(CH_SCAN_TERMINATED, rv);
    ASSERT_EQ(1U, hits);

    ch_free_scratch(scratchSpace);
    ch_free_database(database);
}
} // namespace