chimera: hybrid of Hyperscan and PCRE

This commit is contained in:
Wang, Xiang W
2018-03-09 03:52:12 -05:00
parent 8a1c497f44
commit bf87f8c003
47 changed files with 6985 additions and 202 deletions

32
chimera/CMakeLists.txt Normal file
View File

@@ -0,0 +1,32 @@
# Chimera lib
# Build script for the "chimera" static library target.

# The Chimera sources need the PCRE headers on the include path.
include_directories(${PCRE_INCLUDE_DIRS})
# only set these after all tests are done
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
# Headers that are installed for users of the library.
SET(chimera_HEADERS
ch.h
ch_common.h
ch_compile.h
ch_runtime.h
)
# The headers above are installed into <includedir>/hs.
install(FILES ${chimera_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs")
# Everything compiled into the library: the installed headers plus the
# implementation files and internal headers.
SET(chimera_SRCS
${chimera_HEADERS}
ch_alloc.c
ch_alloc.h
ch_compile.cpp
ch_database.c
ch_database.h
ch_internal.h
ch_runtime.c
ch_scratch.h
ch_scratch.c
)
# chimera is always built as a static library.
add_library(chimera STATIC ${chimera_SRCS})
# The hs and pcre targets are built before chimera and linked into it.
add_dependencies(chimera hs pcre)
target_link_libraries(chimera hs pcre)

45
chimera/ch.h Normal file
View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_H_
#define CH_H_
/**
* @file
* @brief The complete Chimera API definition.
*
* Chimera is a hybrid solution of Hyperscan and PCRE.
*
* This header includes both the Chimera compiler and runtime components. See
* the individual component headers for documentation.
*/
#include "ch_compile.h"
#include "ch_runtime.h"
#endif /* CH_H_ */

109
chimera/ch_alloc.c Normal file
View File

@@ -0,0 +1,109 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Runtime functions for setting custom allocators.
*/
#include "ch.h"
#include "ch_common.h"
#include "ch_internal.h"
#include "hs.h"
#include "ue2common.h"
/* Fallback callbacks: the C library malloc()/free(). The normalise_*()
 * helpers below substitute these whenever a NULL callback is supplied. */
#define default_malloc malloc
#define default_free free
/* Currently installed Chimera allocation callbacks, one per category
 * (database, misc, scratch). Each starts out as the default. */
ch_alloc_t ch_database_alloc = default_malloc;
ch_alloc_t ch_misc_alloc = default_malloc;
ch_alloc_t ch_scratch_alloc = default_malloc;
/* Matching free callbacks for the three categories above. */
ch_free_t ch_database_free = default_free;
ch_free_t ch_misc_free = default_free;
ch_free_t ch_scratch_free = default_free;
/** \brief Map a NULL allocation callback onto the built-in default.
 *
 * \param a Caller-supplied allocation callback, possibly NULL.
 * \return \p a when it is non-NULL, otherwise default_malloc.
 */
static
ch_alloc_t normalise_alloc(ch_alloc_t a) {
    return a ? a : default_malloc;
}
/** \brief Map a NULL free callback onto the built-in default.
 *
 * \param f Caller-supplied free callback, possibly NULL.
 * \return \p f when it is non-NULL, otherwise default_free.
 */
static
ch_free_t normalise_free(ch_free_t f) {
    return f ? f : default_free;
}
/**
 * Install one allocate/free pair for every Chimera allocation category
 * (database, misc, scratch) and for the core Hyperscan library.
 *
 * Every setter is always invoked, so a failure in one of them never leaves
 * the remaining categories untouched. The value returned is the first
 * non-success code seen, or CH_SUCCESS when every call succeeded.
 *
 * @param allocfunc Allocation callback; NULL selects the default.
 * @param freefunc Free callback; NULL selects the default.
 * @return CH_SUCCESS on success, otherwise the first error reported.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t allocfunc,
                                     ch_free_t freefunc) {
    ch_error_t first_err = CH_SUCCESS;
    ch_error_t rc;

    rc = ch_set_database_allocator(allocfunc, freefunc);
    if (first_err == CH_SUCCESS) {
        first_err = rc;
    }

    rc = ch_set_misc_allocator(allocfunc, freefunc);
    if (first_err == CH_SUCCESS) {
        first_err = rc;
    }

    rc = ch_set_scratch_allocator(allocfunc, freefunc);
    if (first_err == CH_SUCCESS) {
        first_err = rc;
    }

    // Set core Hyperscan alloc/free.
    rc = hs_set_allocator(allocfunc, freefunc);
    if (first_err == CH_SUCCESS) {
        first_err = rc;
    }

    return first_err;
}
/**
 * Install the callbacks used for database memory, both for Chimera and for
 * the underlying Hyperscan library.
 *
 * @param allocfunc Allocation callback; NULL selects the default.
 * @param freefunc Free callback; NULL selects the default.
 * @return The result of hs_set_database_allocator().
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t allocfunc,
                                              ch_free_t freefunc) {
    /* Chimera's own hooks never hold NULL: fall back to the defaults. */
    ch_database_alloc = normalise_alloc(allocfunc);
    ch_database_free = normalise_free(freefunc);

    /* Hyperscan is handed the caller's original (possibly NULL) pointers. */
    ch_error_t status = hs_set_database_allocator(allocfunc, freefunc);
    return status;
}
/**
 * Install the callbacks used for miscellaneous memory, both for Chimera and
 * for the underlying Hyperscan library.
 *
 * @param allocfunc Allocation callback; NULL selects the default.
 * @param freefunc Free callback; NULL selects the default.
 * @return The result of hs_set_misc_allocator().
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t allocfunc,
                                          ch_free_t freefunc) {
    /* Chimera's own hooks never hold NULL: fall back to the defaults. */
    ch_misc_alloc = normalise_alloc(allocfunc);
    ch_misc_free = normalise_free(freefunc);

    /* Hyperscan is handed the caller's original (possibly NULL) pointers. */
    ch_error_t status = hs_set_misc_allocator(allocfunc, freefunc);
    return status;
}
/**
 * Install the callbacks used for scratch memory, both for Chimera and for
 * the underlying Hyperscan library.
 *
 * @param allocfunc Allocation callback; NULL selects the default.
 * @param freefunc Free callback; NULL selects the default.
 * @return The result of hs_set_scratch_allocator().
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t allocfunc,
                                             ch_free_t freefunc) {
    /* Chimera's own hooks never hold NULL: fall back to the defaults. */
    ch_scratch_alloc = normalise_alloc(allocfunc);
    ch_scratch_free = normalise_free(freefunc);

    /* Hyperscan is handed the caller's original (possibly NULL) pointers. */
    ch_error_t status = hs_set_scratch_allocator(allocfunc, freefunc);
    return status;
}

65
chimera/ch_alloc.h Normal file
View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_ALLOC_H
#define CH_ALLOC_H
#include "hs_common.h"
#include "ue2common.h"
#include "ch_common.h"
#ifdef __cplusplus
extern "C"
{
#endif
extern hs_alloc_t ch_database_alloc;
extern hs_alloc_t ch_misc_alloc;
extern hs_alloc_t ch_scratch_alloc;
extern hs_free_t ch_database_free;
extern hs_free_t ch_misc_free;
extern hs_free_t ch_scratch_free;
#ifdef __cplusplus
} /* extern C */
#endif
/** \brief Validate a block handed back by an allocation callback.
 *
 * \param mem Pointer returned by the allocator.
 * \return CH_NOMEM when \p mem is NULL, CH_BAD_ALLOC when it is not aligned
 * for unsigned long long, and CH_SUCCESS otherwise. On CH_BAD_ALLOC the
 * block is not released here; the caller should free it. */
static really_inline
ch_error_t ch_check_alloc(const void *mem) {
    if (!mem) {
        return CH_NOMEM;
    }
    if (!ISALIGNED_N(mem, alignof(unsigned long long))) {
        return CH_BAD_ALLOC;
    }
    return CH_SUCCESS;
}
#endif

360
chimera/ch_common.h Normal file
View File

@@ -0,0 +1,360 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_COMMON_H_
#define CH_COMMON_H_
#include "hs_common.h"
#include <stdlib.h>
/**
* @file
* @brief The Chimera common API definition.
*
* Chimera is a hybrid of Hyperscan and PCRE.
*
* This header contains functions available to both the Chimera compiler and
* runtime.
*/
#ifdef __cplusplus
extern "C"
{
#endif
struct ch_database;
/**
* A Chimera pattern database.
*
* Generated by one of the Chimera compiler functions:
* - @ref ch_compile()
* - @ref ch_compile_multi()
* - @ref ch_compile_ext_multi()
*/
typedef struct ch_database ch_database_t;
/**
* A type for errors returned by Chimera functions.
*/
typedef int ch_error_t;
/**
* Free a compiled pattern database.
*
* The free callback set by @ref ch_set_allocator() will be used by this
* function.
*
* @param db
* A compiled pattern database. NULL may also be safely provided, in which
* case the function does nothing.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_free_database(ch_database_t *db);
/**
* Utility function for identifying this release version.
*
* @return
* A string containing the version number of this release build and the
* date of the build. It is allocated statically, so it does not need to
* be freed by the caller.
*/
const char * HS_CDECL ch_version(void);
/**
* Returns the size of the given database.
*
* @param database
* Pointer to compiled expression database.
*
* @param database_size
* On success, the size of the compiled database in bytes is placed in this
* parameter.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_database_size(const ch_database_t *database,
size_t *database_size);
/**
* Utility function providing information about a database.
*
* @param database
* Pointer to a compiled database.
*
* @param info
* On success, a string containing the version and platform information for
* the supplied database is placed in the parameter. The string is
* allocated using the allocator supplied in @ref hs_set_allocator()
* (or malloc() if no allocator was set) and should be freed by the caller.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_database_info(const ch_database_t *database,
char **info);
/**
* The type of the callback function that will be used by Chimera to allocate
* more memory at runtime as required.
*
* If Chimera is to be used in a multi-threaded, or similarly concurrent
* environment, the allocation function will need to be re-entrant, or
* similarly safe for concurrent use.
*
* @param size
* The number of bytes to allocate.
* @return
* A pointer to the region of memory allocated, or NULL on error.
*/
typedef void *(HS_CDECL *ch_alloc_t)(size_t size);
/**
* The type of the callback function that will be used by Chimera to free
* memory regions previously allocated using the @ref ch_alloc_t function.
*
* @param ptr
* The region of memory to be freed.
*/
typedef void (HS_CDECL *ch_free_t)(void *ptr);
/**
* Set the allocate and free functions used by Chimera for allocating
* memory at runtime for stream state, scratch space, database bytecode,
* and various other data structures returned by the Chimera API.
*
* The function is equivalent to calling @ref ch_set_scratch_allocator(),
* @ref ch_set_database_allocator() and
* @ref ch_set_misc_allocator() with the provided parameters.
*
* This call will override any previous allocators that have been set.
*
* Note: there is no way to change the allocator used for temporary objects
* created during the various compile calls (@ref ch_compile() and @ref
* ch_compile_multi()).
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* Set the allocate and free functions used by Chimera for allocating memory
* for database bytecode produced by the compile calls (@ref ch_compile() and @ref
* ch_compile_multi()).
*
* If no database allocation functions are set, or if NULL is used in place of
* both parameters, then memory allocation will default to standard methods
* (such as the system malloc() and free() calls).
*
* This call will override any previous database allocators that have been set.
*
* Note: the database allocator may also be set by calling @ref
* ch_set_allocator().
*
* Note: there is no way to change how temporary objects created during the
* various compile calls (@ref ch_compile() and @ref ch_compile_multi()) are
* allocated.
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_database_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* Set the allocate and free functions used by Chimera for allocating memory
* for items returned by the Chimera API such as @ref ch_compile_error_t.
*
* If no misc allocation functions are set, or if NULL is used in place of both
* parameters, then memory allocation will default to standard methods (such as
* the system malloc() and free() calls).
*
* This call will override any previous misc allocators that have been set.
*
* Note: the misc allocator may also be set by calling @ref ch_set_allocator().
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_misc_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* Set the allocate and free functions used by Chimera for allocating memory
* for scratch space by @ref ch_alloc_scratch() and @ref ch_clone_scratch().
*
* If no scratch allocation functions are set, or if NULL is used in place of
* both parameters, then memory allocation will default to standard methods
* (such as the system malloc() and free() calls).
*
* This call will override any previous scratch allocators that have been set.
*
* Note: the scratch allocator may also be set by calling @ref
* ch_set_allocator().
*
* @param alloc_func
* A callback function pointer that allocates memory. This function must
* return memory suitably aligned for the largest representable data type
* on this platform.
*
* @param free_func
* A callback function pointer that frees allocated memory.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func,
ch_free_t free_func);
/**
* @defgroup CH_ERROR ch_error_t values
*
* @{
*/
/**
* The engine completed normally.
*/
#define CH_SUCCESS 0
/**
* A parameter passed to this function was invalid.
*/
#define CH_INVALID (-1)
/**
* A memory allocation failed.
*/
#define CH_NOMEM (-2)
/**
* The engine was terminated by callback.
*
* This return value indicates that the target buffer was partially scanned,
* but that the callback function requested that scanning cease after a match
* was located.
*/
#define CH_SCAN_TERMINATED (-3)
/**
* The pattern compiler failed, and the @ref ch_compile_error_t should be
* inspected for more detail.
*/
#define CH_COMPILER_ERROR (-4)
/**
* The given database was built for a different version of the Chimera matcher.
*/
#define CH_DB_VERSION_ERROR (-5)
/**
* The given database was built for a different platform (i.e., CPU type).
*/
#define CH_DB_PLATFORM_ERROR (-6)
/**
* The given database was built for a different mode of operation. This error
* is returned when streaming calls are used with a non-streaming database and
* vice versa.
*/
#define CH_DB_MODE_ERROR (-7)
/**
* A parameter passed to this function was not correctly aligned.
*/
#define CH_BAD_ALIGN (-8)
/**
* The memory allocator did not correctly return memory suitably aligned for
* the largest representable data type on this platform.
*/
#define CH_BAD_ALLOC (-9)
/**
* The scratch region was already in use.
*
* This error is returned when Chimera is able to detect that the scratch
* region given is already in use by another Chimera API call.
*
* A separate scratch region, allocated with @ref ch_alloc_scratch() or @ref
* ch_clone_scratch(), is required for every concurrent caller of the Chimera
* API.
*
* For example, this error might be returned when @ref ch_scan() has been
* called inside a callback delivered by a currently-executing @ref ch_scan()
* call using the same scratch region.
*
* Note: Not all concurrent uses of scratch regions may be detected. This error
* is intended as a best-effort debugging tool, not a guarantee.
*/
#define CH_SCRATCH_IN_USE (-10)
/**
* Returned when pcre_exec (called for some expressions internally from @ref
* ch_scan) failed due to a fatal error.
*/
#define CH_FAIL_INTERNAL (-32)
/** @} */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_COMMON_H_ */

878
chimera/ch_compile.cpp Normal file
View File

@@ -0,0 +1,878 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Compiler front-end, including public API calls for compilation.
*/
#include "ch_compile.h"
#include "ch_alloc.h"
#include "ch_internal.h"
#include "ch_database.h"
#include "grey.h"
#include "hs_common.h"
#include "hs_internal.h"
#include "ue2common.h"
#include "util/compile_error.h"
#include "util/make_unique.h"
#include "util/multibit_build.h"
#include "util/target_info.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <memory>
#include <ostream>
#include <sstream>
#include <limits.h>
#include <string>
#include <vector>
#include <boost/core/noncopyable.hpp>
#define PCRE_ERROR_MSG "Internal error building PCRE pattern."
using namespace std;
using namespace ue2;
// Message text for the statically allocated compile errors defined below.
static const char failureNoMemory[] = "Unable to allocate memory.";
static const char failureInternal[] = "Internal error.";
static const char failureBadAlloc[] = "Allocator returned misaligned memory.";
// Pre-built error objects that can be returned without allocating.
// freeChimeraCompileError() recognises their addresses and never frees them.
// The message pointer has its constness cast away only to fit the struct
// field; the text itself is constant.
static const ch_compile_error_t ch_enomem
= { const_cast<char *>(failureNoMemory), 0 };
static const ch_compile_error_t ch_einternal
= { const_cast<char *>(failureInternal), 0 };
static const ch_compile_error_t ch_badalloc
= { const_cast<char *>(failureBadAlloc), 0 };
/** \brief Build a heap-allocated compile error carrying a copy of \p err.
 *
 * Both the error structure and the message copy come from ch_misc_alloc.
 * When an allocation fails or returns misaligned memory, anything already
 * allocated is released and the address of a static error object (ch_enomem
 * or ch_badalloc) is returned instead.
 *
 * \param err Message text to copy into the error.
 * \param expression Value stored in the error's expression field.
 * \return The new error, or a pointer to one of the static error objects.
 */
static
ch_compile_error_t *generateChimeraCompileError(const string &err,
                                                int expression) {
    ch_compile_error_t *ret =
        (struct ch_compile_error *)ch_misc_alloc(sizeof(ch_compile_error_t));
    if (!ret) {
        return const_cast<ch_compile_error_t *>(&ch_enomem);
    }
    if (ch_check_alloc(ret) != CH_SUCCESS) {
        ch_misc_free(ret);
        return const_cast<ch_compile_error_t *>(&ch_badalloc);
    }

    char *msg = (char *)ch_misc_alloc(err.size() + 1);
    if (!msg) {
        ch_misc_free(ret);
        return const_cast<ch_compile_error_t *>(&ch_enomem);
    }
    if (ch_check_alloc(msg) != CH_SUCCESS) {
        // Release the error structure as well as the message buffer so that
        // neither is leaked when the static error is returned.
        ch_misc_free(msg);
        ch_misc_free(ret);
        return const_cast<ch_compile_error_t *>(&ch_badalloc);
    }

    // Copy the text including its terminating NUL.
    memcpy(msg, err.c_str(), err.size() + 1);
    ret->message = msg;
    ret->expression = expression;
    return ret;
}
/** \brief Release a compile error.
 *
 * NULL and the static error objects (ch_enomem, ch_einternal, ch_badalloc)
 * are ignored; anything else has its message and then the structure itself
 * released with ch_misc_free.
 */
static
void freeChimeraCompileError(ch_compile_error_t *error) {
    // The static objects are not allocated and must never be freed.
    const bool isStatic = error == &ch_enomem || error == &ch_einternal ||
                          error == &ch_badalloc;
    if (error && !isStatic) {
        ch_misc_free(error->message);
        ch_misc_free(error);
    }
}
/** \brief Check that \p mode contains no bits other than CH_MODE_GROUPS.
 *
 * \param mode Mode flags supplied by the caller.
 * \param comp_error Receives a newly generated compile error when the mode
 * is rejected; left untouched otherwise.
 * \return true when the mode is acceptable, false otherwise.
 */
static
bool checkMode(unsigned int mode, ch_compile_error_t **comp_error) {
    static const unsigned int supported = CH_MODE_GROUPS;
    const unsigned int unknownBits = mode & ~supported;
    if (!unknownBits) {
        return true;
    }
    *comp_error =
        generateChimeraCompileError("Invalid mode flag supplied.", -1);
    return false;
}
/** \brief Reject any pattern flag outside the set handled here.
 *
 * Throws CompileError when \p flags has a bit set other than DOTALL,
 * MULTILINE, CASELESS, SINGLEMATCH, UCP or UTF8. */
static
void checkFlags(const unsigned int flags) {
    static const unsigned int supported = HS_FLAG_DOTALL
                                        | HS_FLAG_MULTILINE
                                        | HS_FLAG_CASELESS
                                        | HS_FLAG_SINGLEMATCH
                                        | HS_FLAG_UCP
                                        | HS_FLAG_UTF8;
    const bool allKnown = (flags & ~supported) == 0;
    if (!allKnown) {
        throw CompileError("Unrecognized flag used.");
    }
}
static
bool isHyperscanSupported(const char *expression, unsigned int flags,
const hs_platform_info *platform) {
hs_database_t *db = nullptr;
hs_compile_error *comp_error = nullptr;
unsigned int id = 0;
hs_error_t err = hs_compile_multi(&expression, &flags, &id,
1, HS_MODE_BLOCK, platform, &db,
&comp_error);
if (err != HS_SUCCESS) {
assert(!db);
assert(comp_error);
DEBUG_PRINTF("unsupported: %s\n", comp_error->message);
hs_free_compile_error(comp_error);
return false;
}
assert(db);
assert(!comp_error);
hs_free_database(db);
return true;
}
/** \brief Place a copy of the Hyperscan database \p db at address \p ptr.
 *
 * The database is serialized and then deserialized in place at \p ptr; the
 * temporary serialized buffer is released with ch_misc_free.
 *
 * \return true on success, false when serialization or deserialization
 * fails (both cases also assert in debug builds).
 */
static
bool writeHyperscanDatabase(char *ptr, hs_database_t *db) {
    // Note: we must use our serialization calls to re-home the database.
    size_t slen = 0;
    char *serialized = nullptr;
    hs_error_t err = hs_serialize_database(db, &serialized, &slen);
    if (err != HS_SUCCESS) {
        DEBUG_PRINTF("hs_serialize_database returned %d\n", err);
        assert(0);
        return false;
    }

    DEBUG_PRINTF("writing database to ptr %p\n", ptr);

    // deserialize_at without the platform tests.
    err = hs_deserialize_database_at(serialized, slen, (hs_database_t *)ptr);
    const bool written = (err == HS_SUCCESS);
    if (!written) {
        DEBUG_PRINTF("hs_deserialize_database_at returned %d\n", err);
        assert(0);
    }

    // The serialized copy is temporary whichever way deserialization went.
    ch_misc_free(serialized);
    return written;
}
/** \brief Write \p hs_db into the Chimera bytecode \p db.
 *
 * The Hyperscan database goes at the first cacheline-rounded offset past
 * the ch_bytecode header, and that offset is recorded in
 * db->databaseOffset.
 *
 * \return Result of the pointer-based writeHyperscanDatabase overload.
 */
static
bool writeHyperscanDatabase(ch_bytecode *db, hs_database_t *hs_db) {
    db->databaseOffset = ROUNDUP_CL(sizeof(*db));
    char *const hsRegion = reinterpret_cast<char *>(db) + db->databaseOffset;
    return writeHyperscanDatabase(hsRegion, hs_db);
}
/** \brief Translate Hyperscan pattern flags into PCRE compile options.
 *
 * Only CASELESS, DOTALL, MULTILINE, UTF8 and UCP are translated; every
 * other flag is meaningless to PCRE and is dropped.
 *
 * \param flags Hyperscan HS_FLAG_* bits.
 * \return The corresponding PCRE_* option bits.
 */
static
int convertFlagsToPcreOptions(unsigned int flags) {
    static const struct {
        unsigned int hsFlag;
        int pcreOption;
    } mapping[] = {
        { HS_FLAG_CASELESS, PCRE_CASELESS },
        { HS_FLAG_DOTALL, PCRE_DOTALL },
        { HS_FLAG_MULTILINE, PCRE_MULTILINE },
        { HS_FLAG_UTF8, PCRE_UTF8 },
        { HS_FLAG_UCP, PCRE_UCP },
    };

    int options = 0;
    for (const auto &entry : mapping) {
        if (flags & entry.hsFlag) {
            options |= entry.pcreOption;
        }
    }
    return options;
}
namespace {
/** \brief Data about a single pattern.
 *
 * Owns the PCRE objects built for the pattern (compiled code and study
 * data) and releases them on destruction. */
struct PatternData : boost::noncopyable {
    PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
                unsigned mode, unsigned long int match_limit,
                unsigned long int match_limit_recursion,
                const hs_platform_info *platform);

    ~PatternData() {
        pcre_free(compiled);
        // The study data is requested with PCRE_STUDY_JIT_COMPILE, so it may
        // own JIT code. pcre_free_study() releases that together with the
        // pcre_extra block, whereas plain pcre_free() would leak it.
        if (extra) {
            pcre_free_study(extra);
        }
    }

    void buildPcre(const char *pattern, u32 flags);

    size_t patternSize() const;

    void writePattern(ch_pattern *pattern) const;

    pcre *compiled; //!< pcre_compile output
    pcre_extra *extra; //!< pcre_study output
    size_t compiled_size; //!< size of the compiled pcre (PCRE_INFO_SIZE)
    int study_size; //!< size of the study data, zero if there is none
    int capture_cnt; //!< capture count (PCRE_INFO_CAPTURECOUNT)
    bool utf8; //!< PCRE reports the UTF8 option for this pattern
    u32 id; //!< ID from the user
    u32 expr_index; //!< index in the expression array
    bool singlematch; //!< pattern is in highlander mode
    bool guard; //!< this pattern should be guarded by the multimatcher
    u32 minWidth; //!< min match width
    u32 maxWidth; //!< max match width
    u32 fixedWidth; //!< fixed pattern width
    unsigned long int matchLimit; //!< pcre match limit
    unsigned long int matchLimitRecursion; //!< pcre match_limit_recursion
};
/** \brief Compile \p pattern with PCRE and gather Hyperscan information.
 *
 * After the PCRE build, hs_expression_info is queried for a prefiltering,
 * non-singlematch version of the pattern. When that succeeds, the match
 * widths are recorded, fixedWidth is set if PCRE confirmation can be
 * skipped, and guard records whether Hyperscan can compile the pattern in
 * prefilter mode. When it fails, the defaults from the initialiser list are
 * kept. Exceptions thrown by buildPcre propagate to the caller.
 */
PatternData::PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
                         unsigned mode, unsigned long int match_limit,
                         unsigned long int match_limit_recursion,
                         const hs_platform_info *platform)
    : compiled(nullptr), extra(nullptr), id(id_in), expr_index(idx),
      singlematch(flags & HS_FLAG_SINGLEMATCH),
      guard(false), minWidth(0), maxWidth(UINT_MAX),
      fixedWidth(UINT_MAX), matchLimit(match_limit),
      matchLimitRecursion(match_limit_recursion) {
    assert(pattern);

    // Allow empty matches so that things are not handed off to pcre for no
    // reason.
    flags |= HS_FLAG_ALLOWEMPTY;

    buildPcre(pattern, flags);

    // Flag sets for the Hyperscan queries below; neither is singlematch.
    const u32 prefilterFlags =
        (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH;
    const u32 somFlags =
        (flags | HS_FLAG_SOM_LEFTMOST) & ~HS_FLAG_SINGLEMATCH;

    // Fetch the expression info for a prefiltering, non-singlematch version
    // of this pattern, if possible.
    hs_expr_info *info = nullptr;
    hs_compile_error_t *error = nullptr;
    if (hs_expression_info(pattern, prefilterFlags, &info, &error)
        != HS_SUCCESS) {
        // We can't even prefilter this pattern, so we're dependent on Big
        // Dumb Pcre Scans.
        DEBUG_PRINTF("hs_expression_info failed, falling back to pcre\n");
        hs_free_compile_error(error);
        return;
    }

    assert(info);
    minWidth = info->min_width;
    maxWidth = info->max_width;
    const bool ordered = !info->unordered_matches;

    // Only enable capturing if required
    const u32 captureCnt = (mode & CH_MODE_GROUPS) ? (u32)capture_cnt : 0u;

    // No need to confirm with PCRE if:
    // 1) pattern is fixed width
    // 2) pattern isn't vacuous as it can't combine with start of match
    // 3) no capturing in this pattern
    // 4) no offset adjust in this pattern as hyperscan match callback
    //    will arrive without order, i.e. [^a]\z has offset adjust
    // 5) hyperscan compile succeeds without prefiltering
    const bool skipConfirm =
        minWidth == maxWidth && minWidth && maxWidth != UINT_MAX &&
        !captureCnt && ordered &&
        isHyperscanSupported(pattern, somFlags, platform);
    if (skipConfirm) {
        fixedWidth = maxWidth;
    }

    DEBUG_PRINTF("gathered info: widths=[%u,%u]\n", minWidth, maxWidth);

    ch_misc_free(info);

    guard = isHyperscanSupported(pattern, prefilterFlags, platform);
}
/** \brief Compile and study \p pattern with PCRE and record its properties.
 *
 * On success this fills in compiled, extra, compiled_size, study_size,
 * capture_cnt and utf8.
 *
 * On any failure a CompileError is thrown. Whatever PCRE objects had been
 * created are released first and the members reset to null: this function
 * runs inside the constructor, so the destructor would not run to free them.
 */
void PatternData::buildPcre(const char *pattern, u32 flags) {
    int options = convertFlagsToPcreOptions(flags);
    const char *errptr = nullptr;
    int erroffset = 0;

    compiled = pcre_compile(pattern, options, &errptr, &erroffset, nullptr);
    if (!compiled) {
        DEBUG_PRINTF("PCRE failed to compile: %s\n", pattern);
        string err("PCRE compilation failed: ");
        err += string(errptr);
        err += ".";
        throw CompileError(expr_index, err);
    }

    try {
        extra = pcre_study(compiled, PCRE_STUDY_JIT_COMPILE, &errptr);
        // Note that it's OK for pcre_study to return NULL if there's nothing
        // to be found, but a non-NULL error is always bad.
        if (errptr) {
            DEBUG_PRINTF("PCRE could not be studied: %s\n", errptr);
            string err("PCRE compilation failed: ");
            err += string(errptr);
            err += ".";
            throw CompileError(expr_index, err);
        }

        if (pcre_fullinfo(compiled, extra, PCRE_INFO_SIZE, &compiled_size)) {
            throw CompileError(PCRE_ERROR_MSG);
        }

        // PCRE_INFO_STUDYSIZE stores its result through a size_t pointer, so
        // query into a size_t local rather than directly into the narrower
        // study_size member.
        size_t studyBytes = 0;
        if (extra && pcre_fullinfo(compiled, extra, PCRE_INFO_STUDYSIZE,
                                   &studyBytes)) {
            throw CompileError(PCRE_ERROR_MSG);
        }
        if (studyBytes > (size_t)INT_MAX) {
            throw CompileError(PCRE_ERROR_MSG);
        }
        study_size = static_cast<decltype(study_size)>(studyBytes);

        if (pcre_fullinfo(compiled, extra, PCRE_INFO_CAPTURECOUNT,
                          &capture_cnt)) {
            throw CompileError(PCRE_ERROR_MSG);
        }

        /* We use the pcre rather than hs to get this information as we may
         * need it even in the pure unguarded pcre mode where there is no hs
         * available. We can not use the compile flags due to (*UTF8) verb */
        unsigned long int opts = 0; // PCRE_INFO_OPTIONS demands an unsigned long
        if (pcre_fullinfo(compiled, extra, PCRE_INFO_OPTIONS, &opts)) {
            throw CompileError(PCRE_ERROR_MSG);
        }
        utf8 = opts & PCRE_UTF8;
    } catch (...) {
        // Release what was built so far before the exception propagates.
        if (extra) {
            pcre_free_study(extra);
            extra = nullptr;
        }
        pcre_free(compiled);
        compiled = nullptr;
        throw;
    }
}
/** \brief Number of bytes needed to write this pattern with writePattern.
 *
 * The layout is the ch_pattern header padded to 8 bytes, then the compiled
 * PCRE, then (only when there is study data) padding to 8 bytes followed by
 * the study data. */
size_t PatternData::patternSize() const {
    // ch_pattern header, padded so the compiled pcre is 8-byte aligned.
    size_t total = ROUNDUP_N(sizeof(ch_pattern), 8);
    DEBUG_PRINTF("compiled pcre at %zu\n", total);
    total += compiled_size;

    // PCRE study data, which may be zero.
    if (study_size) {
        total = ROUNDUP_N(total, 8);
        DEBUG_PRINTF("study at %zu\n", total);
        total += (size_t)study_size;
    }

    DEBUG_PRINTF("pattern size %zu\n", total);
    return total;
}
/** \brief Write out an ch_pattern structure, which should already be sized
 * correctly according to PatternData::patternSize.
 *
 * Layout written, starting at \p pattern: the ch_pattern header, padding to
 * an 8-byte boundary, the compiled PCRE, and (when study data exists)
 * padding to an 8-byte boundary followed by a copy of the study data.
 *
 * \param pattern Destination; must be non-NULL and cacheline aligned. */
void PatternData::writePattern(ch_pattern *pattern) const {
assert(pattern);
assert(ISALIGNED_CL(pattern));
pattern->id = id;
// Translate the per-pattern booleans into CHIMERA_PATTERN_FLAG_* bits.
u32 flags = 0;
if (singlematch) {
flags |= CHIMERA_PATTERN_FLAG_SINGLEMATCH;
}
if (utf8) {
flags |= CHIMERA_PATTERN_FLAG_UTF8;
}
pattern->flags = flags;
pattern->maxWidth = maxWidth;
// A minimum width of UINT_MAX is stored as zero.
pattern->minWidth = minWidth == UINT_MAX ? 0 : minWidth;
pattern->fixedWidth = fixedWidth;
// Compiled PCRE pattern.
// It starts at the first 8-byte boundary after the header.
char *ptr = (char *)pattern;
ptr += ROUNDUP_N(sizeof(*pattern), 8);
DEBUG_PRINTF("compiled pcre at %zu\n", (size_t)(ptr - (char *)pattern));
memcpy(ptr, compiled, compiled_size);
ptr += compiled_size;
// PCRE match limits
// Limits of zero are replaced by the defaults below (10000000 matches,
// recursion depth 1500).
pattern->extra.flags = PCRE_EXTRA_MATCH_LIMIT |
PCRE_EXTRA_MATCH_LIMIT_RECURSION;
pattern->extra.match_limit = matchLimit ? matchLimit : 10000000;
// Set to avoid segment fault
pattern->extra.match_limit_recursion =
matchLimitRecursion ? matchLimitRecursion : 1500;
// PCRE study_data.
// studyOffset stays zero when there is no study data.
u32 studyOffset = 0;
if (extra) {
assert(extra->study_data);
ptr = ROUNDUP_PTR(ptr, 8);
DEBUG_PRINTF("study at %zu\n", (size_t)(ptr - (char *)pattern));
memcpy(ptr, extra->study_data, study_size);
studyOffset = (size_t)(ptr - (char *)pattern);
pattern->extra.flags |= PCRE_EXTRA_STUDY_DATA;
// NOTE(review): this stores the build-time address of the copied study
// data; presumably consumers of the bytecode re-derive the pointer from
// studyOffset — confirm against the runtime code.
pattern->extra.study_data = ptr;
ptr += study_size;
} else {
pattern->extra.flags &= ~PCRE_EXTRA_STUDY_DATA;
}
pattern->studyOffset = studyOffset;
// Total number of bytes written for this pattern, header included.
size_t pcreLen = (ptr - (char *)pattern);
assert(pcreLen <= patternSize());
pattern->length = (u32)pcreLen;
// We shouldn't overrun the space we've allocated for this pattern.
assert(patternSize() >= (size_t)(ptr - (char *)pattern));
}
} // namespace
namespace ch {
static
void ch_compile_multi_int(const char *const *expressions, const unsigned *flags,
const unsigned *ids, unsigned elements,
unsigned mode, unsigned long int match_limit,
unsigned long int match_limit_recursion,
const hs_platform_info_t *platform,
ch_database_t **out) {
vector<unique_ptr<PatternData>> pcres;
pcres.reserve(elements);
vector<u32> unguarded; // indices of unguarded PCREs.
vector<const char *> multiExpr;
vector<unsigned int> multiFlags;
vector<unsigned int> multiIds;
bool allConfirm = true;
bool allSingleMatch = true;
for (unsigned int i = 0; i < elements; i++) {
const char *myExpr = expressions[i];
unsigned int myFlags = flags ? flags[i] : 0;
unsigned int myId = ids ? ids[i] : 0;
checkFlags(myFlags);
// First, build with libpcre. A build failure from libpcre will throw
// an exception up to the caller.
auto patternData =
ue2::make_unique<PatternData>(myExpr, myFlags, i, myId, mode, match_limit,
match_limit_recursion, platform);
pcres.push_back(move(patternData));
PatternData &curr = *pcres.back();
if (!(myFlags & HS_FLAG_SINGLEMATCH)) {
allSingleMatch = false;
}
// in the multimatch, we always run in prefilter mode and accept vacuous
// patterns.
myFlags |=
HS_FLAG_ALLOWEMPTY | HS_FLAG_PREFILTER;
if (curr.fixedWidth != UINT_MAX) {
myFlags |= HS_FLAG_SOM_LEFTMOST;
DEBUG_PRINTF("fixed width, turn off prefiltering\n");
myFlags &= ~HS_FLAG_PREFILTER;
allConfirm = false;
// Single match can't coexist with SOM.
myFlags &= ~HS_FLAG_SINGLEMATCH;
}
if (curr.guard) {
// We use the index into the PCREs array as the Hyperscan idx.
multiExpr.push_back(myExpr);
multiFlags.push_back(myFlags);
multiIds.push_back(i);
} else {
// No Hyperscan support, PCRE is unguarded.
unguarded.push_back(i);
}
}
DEBUG_PRINTF("built %zu PCREs, %zu of which are unguarded\n",
pcres.size(), unguarded.size());
// Work out our sizing for the output database.
size_t patternSize = 0;
for (unsigned int i = 0; i < elements; i++) {
size_t len = pcres[i]->patternSize();
patternSize += ROUNDUP_CL(len);
}
DEBUG_PRINTF("pcre bytecode takes %zu bytes\n", patternSize);
bool noMulti = multiExpr.empty();
size_t multiSize = 0;
hs_database *multidb = nullptr;
if (!noMulti) {
hs_compile_error_t *hs_comp_error = nullptr;
hs_error_t err = hs_compile_multi(&multiExpr[0], &multiFlags[0],
&multiIds[0], multiExpr.size(),
HS_MODE_BLOCK, platform, &multidb,
&hs_comp_error);
if (err != HS_SUCCESS) {
assert(hs_comp_error);
DEBUG_PRINTF("hs_compile_multi returned error: %s\n",
hs_comp_error->message);
assert(0);
hs_free_compile_error(hs_comp_error);
throw CompileError("Internal error.");
}
assert(multidb);
err = hs_database_size(multidb, &multiSize);
if (err != HS_SUCCESS) {
assert(0);
throw CompileError("Internal error.");
}
DEBUG_PRINTF("built hyperscan database with len %zu bytes\n", multiSize);
}
size_t bytecodeLen = sizeof(ch_bytecode) +
multiSize + alignof(u32) +
(sizeof(u32) * unguarded.size()) +
(sizeof(u32) * elements) +
patternSize +
128; // padding for alignment
size_t totalSize = sizeof(ch_database) + bytecodeLen;
DEBUG_PRINTF("allocating %zu bytes for database\n", totalSize);
char *ptr = (char *)ch_database_alloc(totalSize);
if (ch_check_alloc(ptr) != CH_SUCCESS) {
ch_database_free(ptr);
throw std::bad_alloc();
}
memset(ptr, 0, totalSize);
// First, the header.
ch_database *hydb = (ch_database *)ptr;
hydb->magic = CH_DB_MAGIC;
hydb->version = HS_VERSION_32BIT;
hydb->length = bytecodeLen;
// Then, the bytecode.
size_t shift = (size_t)hydb->bytes & 0x3f;
hydb->bytecode = offsetof(struct ch_database, bytes) - shift;
ch_bytecode *db = (ch_bytecode *)((char *)hydb + hydb->bytecode);
db->patternCount = elements;
db->activeSize = mmbit_size(elements);
db->flags = 0;
db->length = bytecodeLen;
if (noMulti) {
db->flags |= CHIMERA_FLAG_NO_MULTIMATCH;
}
if (mode & CH_MODE_GROUPS) {
db->flags |= CHIMERA_FLAG_GROUPS;
}
if (allConfirm) {
db->flags |= CHIMERA_FLAG_ALL_CONFIRM;
}
if (allSingleMatch) {
db->flags |= CHIMERA_FLAG_ALL_SINGLE;
}
// Find and set the max ovector size by looking at the capture count for
// each pcre.
u32 maxCaptureGroups = 0;
for (unsigned int i = 0; i < elements; i++) {
maxCaptureGroups = max(maxCaptureGroups, (u32)pcres[i]->capture_cnt);
}
db->maxCaptureGroups = maxCaptureGroups;
DEBUG_PRINTF("max capture groups is %u\n", maxCaptureGroups);
if (!noMulti) {
DEBUG_PRINTF("write hyperscan database\n");
// Write Hyperscan database directly after the header struct, then free it.
if (!writeHyperscanDatabase(db, multidb)) {
ch_database_free(hydb);
hs_free_database(multidb);
throw CompileError("Internal error.");
}
hs_free_database(multidb);
} else {
db->databaseOffset = ROUNDUP_CL(sizeof(*db));
}
// Then, write our unguarded PCRE list.
db->unguardedCount = unguarded.size();
db->unguardedOffset = ROUNDUP_N(db->databaseOffset + multiSize, 4);
ptr = (char *)db + db->unguardedOffset;
copy(unguarded.begin(), unguarded.end(), (u32 *)ptr);
// Then, write all our compiled PCRE patterns and the lookup table for
// them.
db->patternOffset = db->unguardedOffset + unguarded.size() * sizeof(u32);
u32 *patternOffset = (u32 *)((char *)db + db->patternOffset);
u32 offset = ROUNDUP_CL(db->patternOffset + elements * sizeof(u32));
for (unsigned int i = 0; i < elements; i++) {
*patternOffset = offset;
size_t len = pcres[i]->patternSize();
ptr = (char *)db + offset;
struct ch_pattern *pattern = (struct ch_pattern *)ptr;
pcres[i]->writePattern(pattern);
DEBUG_PRINTF("wrote pcre %u into offset %u, len %zu\n", i, offset, len);
offset += ROUNDUP_CL(len);
patternOffset++;
}
assert(offset <= totalSize);
assert(hydb->magic == CH_DB_MAGIC);
DEBUG_PRINTF("built hybrid database, size %zu bytes\n", totalSize);
DEBUG_PRINTF("offset=%u\n", offset);
*out = hydb;
}
} // namespace ch
/** \brief Public entry point: compile a single expression into a Chimera
 * database. The expression is compiled with an ID of zero.
 *
 * On any failure CH_COMPILER_ERROR is returned, \p *db (when \p db is
 * non-NULL) is set to NULL, and \p *comp_error (when \p comp_error is
 * non-NULL) describes the problem. */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_compile(const char *expression, unsigned flags,
                               unsigned mode,
                               const hs_platform_info_t *platform,
                               ch_database_t **db,
                               ch_compile_error_t **comp_error) {
    if (!comp_error) {
        if (db) {
            // Clear the caller's handle, not our local copy of the pointer.
            *db = nullptr;
        }
        // nowhere to write the string, but we can still report an error code
        return CH_COMPILER_ERROR;
    }
    if (!db) {
        *comp_error =
            generateChimeraCompileError("Invalid parameter: db is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!expression) {
        *db = nullptr;
        *comp_error = generateChimeraCompileError(
            "Invalid parameter: expression is NULL", -1);
        return CH_COMPILER_ERROR;
    }

    if (!checkMode(mode, comp_error)) {
        *db = nullptr;
        assert(*comp_error); // set by checkMode
        return CH_COMPILER_ERROR;
    }

    try {
        unsigned id = 0; // single expressions get zero as an ID

        // Internal function to do all the work, now that we've handled all the
        // argument checking.
        ch::ch_compile_multi_int(&expression, &flags, &id, 1, mode, 0, 0,
                                 platform, db);
    }
    catch (const CompileError &e) {
        // Compiler error occurred
        *db = nullptr;
        *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ?
                                                  (int)e.index : -1);
        return CH_COMPILER_ERROR;
    }
    catch (const std::bad_alloc &) {
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
        return CH_COMPILER_ERROR;
    }
    catch (...) {
        assert(!"Internal error, unexpected exception");
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_einternal);
        return CH_COMPILER_ERROR;
    }

    DEBUG_PRINTF("success!\n");
    return CH_SUCCESS;
}
/** \brief Public entry point: compile a set of expressions into a Chimera
 * database, using default PCRE match limits (both limits are passed as zero).
 *
 * On any failure CH_COMPILER_ERROR is returned, \p *db (when \p db is
 * non-NULL) is set to NULL, and \p *comp_error (when \p comp_error is
 * non-NULL) describes the problem. */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
                                     const unsigned *flags, const unsigned *ids,
                                     unsigned elements, unsigned mode,
                                     const hs_platform_info_t *platform,
                                     ch_database_t **db,
                                     ch_compile_error_t **comp_error) {
    if (!comp_error) {
        if (db) {
            // Clear the caller's handle, not our local copy of the pointer.
            *db = nullptr;
        }
        // nowhere to write the string, but we can still report an error code
        return CH_COMPILER_ERROR;
    }
    if (!db) {
        *comp_error =
            generateChimeraCompileError("Invalid parameter: db is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!expressions) {
        *db = nullptr;
        *comp_error = generateChimeraCompileError(
            "Invalid parameter: expressions is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!elements) {
        *db = nullptr;
        *comp_error = generateChimeraCompileError(
            "Invalid parameter: elements is zero", -1);
        return CH_COMPILER_ERROR;
    }

    if (!checkMode(mode, comp_error)) {
        *db = nullptr;
        assert(*comp_error); // set by checkMode
        return CH_COMPILER_ERROR;
    }

    try {
        // Internal function to do all the work, now that we've handled all the
        // argument checking.
        ch::ch_compile_multi_int(expressions, flags, ids, elements, mode, 0, 0,
                                 platform, db);
    }
    catch (const CompileError &e) {
        // Compiler error occurred
        *db = nullptr;
        *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ?
                                                  (int)e.index : -1);
        return CH_COMPILER_ERROR;
    }
    catch (const std::bad_alloc &) {
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
        return CH_COMPILER_ERROR;
    }
    catch (...) {
        assert(!"Internal error, unexpected exception");
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_einternal);
        return CH_COMPILER_ERROR;
    }

    DEBUG_PRINTF("success!\n");
    return CH_SUCCESS;
}
/** \brief Public entry point: compile a set of expressions into a Chimera
 * database, forwarding caller-supplied PCRE match limits to the internal
 * compiler.
 *
 * On any failure CH_COMPILER_ERROR is returned, \p *db (when \p db is
 * non-NULL) is set to NULL, and \p *comp_error (when \p comp_error is
 * non-NULL) describes the problem. */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_compile_ext_multi(
                                     const char *const *expressions,
                                     const unsigned *flags,
                                     const unsigned *ids,
                                     unsigned elements, unsigned mode,
                                     unsigned long int match_limit,
                                     unsigned long int match_limit_recursion,
                                     const hs_platform_info_t *platform,
                                     ch_database_t **db,
                                     ch_compile_error_t **comp_error) {
    if (!comp_error) {
        if (db) {
            // Clear the caller's handle, not our local copy of the pointer.
            *db = nullptr;
        }
        // nowhere to write the string, but we can still report an error code
        return CH_COMPILER_ERROR;
    }
    if (!db) {
        *comp_error =
            generateChimeraCompileError("Invalid parameter: db is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!expressions) {
        *db = nullptr;
        *comp_error = generateChimeraCompileError(
            "Invalid parameter: expressions is NULL", -1);
        return CH_COMPILER_ERROR;
    }
    if (!elements) {
        *db = nullptr;
        *comp_error = generateChimeraCompileError(
            "Invalid parameter: elements is zero", -1);
        return CH_COMPILER_ERROR;
    }

    if (!checkMode(mode, comp_error)) {
        *db = nullptr;
        assert(*comp_error); // set by checkMode
        return CH_COMPILER_ERROR;
    }

    try {
        // Internal function to do all the work, now that we've handled all the
        // argument checking.
        ch::ch_compile_multi_int(expressions, flags, ids, elements, mode,
                                 match_limit, match_limit_recursion, platform,
                                 db);
    }
    catch (const CompileError &e) {
        // Compiler error occurred
        *db = nullptr;
        *comp_error = generateChimeraCompileError(e.reason, e.hasIndex ?
                                                  (int)e.index : -1);
        return CH_COMPILER_ERROR;
    }
    catch (const std::bad_alloc &) {
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_enomem);
        return CH_COMPILER_ERROR;
    }
    catch (...) {
        assert(!"Internal error, unexpected exception");
        *db = nullptr;
        *comp_error = const_cast<ch_compile_error_t *>(&ch_einternal);
        return CH_COMPILER_ERROR;
    }

    DEBUG_PRINTF("success!\n");
    return CH_SUCCESS;
}
/** \brief Public entry point: release a compile error object.
 *
 * Passes \p error straight to freeChimeraCompileError and then reports
 * CH_SUCCESS unconditionally. */
extern "C" HS_PUBLIC_API
ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error) {
freeChimeraCompileError(error);
return CH_SUCCESS;
}

394
chimera/ch_compile.h Normal file
View File

@@ -0,0 +1,394 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_COMPILE_H_
#define CH_COMPILE_H_
/**
* @file
* @brief The Chimera compiler API definition.
*
* Chimera is a hybrid solution of Hyperscan and PCRE.
*
* This header contains functions for compiling regular expressions into
* Chimera databases that can be used by the Chimera runtime.
*/
#include "ch_common.h"
#include "hs_compile.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* A type containing error details that is returned by the compile calls (@ref
* ch_compile(), @ref ch_compile_multi() and @ref ch_compile_ext_multi()) on
* failure. The caller may inspect the values returned in this type to
* determine the cause of failure.
*/
typedef struct ch_compile_error {
/**
* A human-readable error message describing the error.
*/
char *message;
/**
* The zero-based number of the expression that caused the error (if this
* can be determined). If the error is not specific to an expression, then
* this value will be less than zero.
*/
int expression;
} ch_compile_error_t;
/**
* The basic regular expression compiler.
*
* This is the function call with which an expression is compiled into a
* Chimera database which can be passed to the runtime function (
* @ref ch_scan())
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @a flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @a expression, and @ref CH_FLAG_CASELESS as the @a
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated for the
* expression per call to @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param mode
* Compiler mode flag that affect the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the
* @a compile_error parameter.
*/
ch_error_t HS_CDECL ch_compile(const char *expression, unsigned int flags,
unsigned int mode,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* The multiple regular expression compiler.
*
* This is the function call with which a set of expressions is compiled into a
* database which can be passed to the runtime function (@ref ch_scan()).
* Each expression can be labelled with a unique integer which is passed into
* the match callback to identify the pattern that has matched.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* ch_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
* first value in the @a flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per call to @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flag that affect the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the
* @a compile_error parameter.
*
*/
ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
unsigned int elements, unsigned int mode,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* The multiple regular expression compiler with extended match limits support.
*
* This is the function call with which a set of expressions is compiled into a
* database in the same way as @ref ch_compile_multi(), but allows additional
* parameters to be specified via match_limit and match_limit_recursion to
* define match limits for PCRE runtime.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* ch_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
* first value in the @a flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per call to @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flag that affect the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param match_limit
* A limit, stored in pcre_extra, on the number of times PCRE may call its
* internal match function; this bounds the amount of backtracking that
* can take place.
*
* @param match_limit_recursion
* A limit from pcre_extra on the recursion depth of match function
* in PCRE.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the
* @a compile_error parameter.
*
*/
ch_error_t HS_CDECL ch_compile_ext_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
unsigned int elements,
unsigned int mode,
unsigned long int match_limit,
unsigned long int match_limit_recursion,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* Free an error structure generated by @ref ch_compile(), @ref
* ch_compile_multi() or @ref ch_compile_ext_multi().
*
* @param error
* The @ref ch_compile_error_t to be freed. NULL may also be safely
* provided.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error);
/**
* @defgroup CH_PATTERN_FLAG Pattern flags
*
* @{
*/
/**
* Compile flag: Set case-insensitive matching.
*
* This flag sets the expression to be matched case-insensitively by default.
* The expression may still use PCRE tokens (notably `(?i)` and
* `(?-i)`) to switch case-insensitive matching on and off.
*/
#define CH_FLAG_CASELESS 1
/**
* Compile flag: Matching a `.` will not exclude newlines.
*
* This flag sets any instances of the `.` token to match newline characters as
* well as all other characters. The PCRE specification states that the `.`
* token does not match newline characters by default, so without this flag the
* `.` token will not cross line boundaries.
*/
#define CH_FLAG_DOTALL 2
/**
* Compile flag: Set multi-line anchoring.
*
* This flag instructs the expression to make the `^` and `$` tokens match
* newline characters as well as the start and end of the stream. If this flag
* is not specified, the `^` token will only ever match at the start of a
* stream, and the `$` token will only ever match at the end of a stream within
* the guidelines of the PCRE specification.
*/
#define CH_FLAG_MULTILINE 4
/**
* Compile flag: Set single-match only mode.
*
* This flag sets the expression's match ID to match at most once: only the
* first match for each invocation of @ref ch_scan() will be returned.
*
*/
#define CH_FLAG_SINGLEMATCH 8
/**
* Compile flag: Enable UTF-8 mode for this expression.
*
* This flag instructs Chimera to treat the pattern as a sequence of UTF-8
* characters. The results of scanning invalid UTF-8 sequences with a Chimera
* library that has been compiled with one or more patterns using this flag are
* undefined.
*/
#define CH_FLAG_UTF8 32
/**
* Compile flag: Enable Unicode property support for this expression.
*
* This flag instructs Chimera to use Unicode properties, rather than the
* default ASCII interpretations, for character mnemonics like `\w` and `\s` as
* well as the POSIX character classes. It is only meaningful in conjunction
* with @ref CH_FLAG_UTF8.
*/
#define CH_FLAG_UCP 64
/** @} */
/**
* @defgroup CH_MODE_FLAG Compile mode flags
*
* The mode flags are used as values for the mode parameter of the various
* compile calls (@ref ch_compile(), @ref ch_compile_multi() and @ref
* ch_compile_ext_multi()).
*
* By default, the matcher will only supply the start and end offsets of the
* match when the match callback is called. Using mode flag @ref CH_MODE_GROUPS
* will also fill the `captured` array with the start and end offsets of all
* the capturing groups specified by the pattern that has matched.
*
* @{
*/
/**
* Compiler mode flag: Disable capturing groups.
*/
#define CH_MODE_NOGROUPS 0
/**
* Compiler mode flag: Enable capturing groups.
*/
#define CH_MODE_GROUPS 1048576
/** @} */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_COMPILE_H_ */

126
chimera/ch_database.c Normal file
View File

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: database construction, etc.
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "allocator.h"
#include "database.h"
#include "hs.h"
#include "ch.h"
#include "hs_internal.h"
#include "ch_common.h"
#include "ch_alloc.h"
#include "ch_database.h"
#include "ch_internal.h"
/** \brief Alignment check for a database pointer: evaluates ISALIGNED_N on
 * \p db against the alignment of unsigned long long. ch_database_info uses it
 * to reject a pointer before reading the header's magic field. */
static really_inline
int db_correctly_aligned(const void *db) {
return ISALIGNED_N(db, alignof(unsigned long long));
}
/**
 * Release a Chimera database.
 *
 * A non-NULL database whose magic number is not CH_DB_MAGIC is rejected with
 * CH_INVALID and left untouched. Otherwise the pointer (NULL included) is
 * passed to ch_database_free and CH_SUCCESS is returned.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_free_database(ch_database_t *hydb) {
    if (hydb != NULL) {
        if (hydb->magic != CH_DB_MAGIC) {
            return CH_INVALID;
        }
    }

    ch_database_free(hydb);
    return CH_SUCCESS;
}
/**
 * Report the size in bytes of a Chimera database.
 *
 * @param hydb  Database to measure; validated with hydbIsValid.
 * @param size  Receives sizeof(struct ch_database) plus the length recorded
 *              in the database header.
 * @return CH_INVALID if @a size is NULL, the error from hydbIsValid if the
 *         database is rejected, CH_SUCCESS otherwise.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_database_size(const ch_database_t *hydb, size_t *size) {
    if (size == NULL) {
        return CH_INVALID;
    }

    const ch_error_t status = hydbIsValid(hydb);
    if (unlikely(status != CH_SUCCESS)) {
        return status;
    }

    /* Header struct plus everything that follows it. */
    *size = hydb->length + sizeof(struct ch_database);
    return status;
}
/** \brief Identifier prepended to database info. */
static const char CHIMERA_IDENT[] = "Chimera ";
/**
 * Build a human-readable description of a Chimera database.
 *
 * The result is CHIMERA_IDENT on its own when the database carries the
 * CHIMERA_FLAG_NO_MULTIMATCH flag, and CHIMERA_IDENT followed by the text
 * from hs_database_info() otherwise.
 *
 * @param hydb  Database to describe.
 * @param info  Receives a NUL-terminated string allocated with
 *              ch_misc_alloc; set to NULL on failure.
 * @return CH_SUCCESS on success. CH_INVALID if @a info is NULL, if @a hydb is
 *         NULL, misaligned or has the wrong magic, or if allocation fails.
 *         If hs_database_info() fails, its error code is returned.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_database_info(const ch_database_t *hydb, char **info) {
    if (!info) {
        return CH_INVALID;
    }
    *info = NULL;

    if (!hydb || !db_correctly_aligned(hydb) || hydb->magic != CH_DB_MAGIC) {
        return CH_INVALID;
    }

    const struct ch_bytecode *bytecode = ch_get_bytecode(hydb);
    /* Test the masked bit explicitly: narrowing the masked u32 into a char
     * would silently drop a flag bit that lies above the low byte. */
    int noMulti = (bytecode->flags & CHIMERA_FLAG_NO_MULTIMATCH) != 0;
    if (noMulti) {
        /* No embedded Hyperscan database: the identifier is the whole text. */
        size_t len = strlen(CHIMERA_IDENT);
        *info = ch_misc_alloc(len + 1);
        if (!(*info)) {
            return CH_INVALID;
        }
        memcpy((*info), CHIMERA_IDENT, len);
        (*info)[len] = '\0';
        return CH_SUCCESS;
    }

    char *hsinfo = NULL;
    hs_error_t ret = hs_database_info(getHyperscanDatabase(bytecode), &hsinfo);
    if (ret != HS_SUCCESS) {
        assert(!hsinfo);
        return ret;
    }

    size_t hybridlen = strlen(CHIMERA_IDENT);
    size_t hslen = strlen(hsinfo);
    *info = ch_misc_alloc(hybridlen + hslen + 1);
    if (!(*info)) {
        /* NOTE(review): hsinfo was allocated inside hs_database_info();
         * confirm ch_misc_free is the matching deallocator. */
        ch_misc_free(hsinfo);
        return CH_INVALID;
    }
    memcpy((*info), CHIMERA_IDENT, hybridlen);
    memcpy((*info) + hybridlen, hsinfo, hslen);
    (*info)[hybridlen + hslen] = '\0';
    ch_misc_free(hsinfo);

    return CH_SUCCESS;
}

158
chimera/ch_database.h Normal file
View File

@@ -0,0 +1,158 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Runtime code for ch_database manipulation.
*/
#ifndef CH_DATABASE_H_
#define CH_DATABASE_H_
#ifdef __cplusplus
extern "C"
{
#endif
#define PCRE_STATIC
#include <pcre.h>
#include "ch_compile.h" // for CH_MODE_ flags
#include "ue2common.h"
#include "hs_version.h"
#include "hs.h"
#define CH_DB_MAGIC 0xdedededeU //!< Magic number stored in \ref ch_database
/** \brief Main Chimera database header.
 *
 * The header is followed by a variable-length payload (\ref bytes) that holds
 * the \ref ch_bytecode. */
struct ch_database {
u32 magic; //!< must be \ref CH_DB_MAGIC
u32 version; //!< release version (the compiler stores HS_VERSION_32BIT)
u32 length; //!< length in bytes of the data following this header;
            //!< ch_database_size() adds sizeof(struct ch_database) to it
u32 reserved0; //!< unused
u32 reserved1; //!< unused
u32 bytecode; //!< offset of the \ref ch_bytecode relative to db start
u32 padding[16]; //!< padding for alignment of rest of bytecode
char bytes[]; //!< start of the variable-length payload
};
/** \brief Chimera bytecode header, which follows the \ref ch_database and is
 * always 64-byte aligned.
 *
 * All *Offset fields are byte offsets measured from the start of this struct
 * (see getHyperscanDatabase(), getUnguarded() and getPattern()).
 */
struct ch_bytecode {
u32 length; //!< length of bytecode including this header struct
u32 flags; //!< whole-database flags (CHIMERA_FLAG_NO_MULTIMATCH,
//!< CHIMERA_FLAG_GROUPS, CHIMERA_FLAG_ALL_CONFIRM,
//!< CHIMERA_FLAG_ALL_SINGLE)
u32 patternCount; //!< total number of patterns
u32 activeSize; //!< size of mmbit to store active pattern ids
u32 databaseOffset; //!< offset for database following \ref ch_bytecode
//!< header
u32 patternOffset; //!< points to an array of u32 offsets, each pointing to
//!< a \ref ch_pattern
u32 unguardedOffset; //!< pointer to a list of unguarded pattern indices
u32 unguardedCount; //!< number of unguarded patterns
u32 maxCaptureGroups; //!< max number of capture groups used by any pattern
};
/** \brief Per-pattern header.
 *
 * struct is followed in bytecode by:
 * 1. pcre bytecode (always present)
 * 2. pcre study data (sometimes)
 *
 * The runtime locates the pcre bytecode at sizeof(struct ch_pattern) rounded
 * up to a multiple of 8 bytes past the start of this struct.
 */
struct ch_pattern {
u32 id; //!< pattern ID to report to the user
u32 flags; //!< per-pattern flags (e.g. \ref CHIMERA_PATTERN_FLAG_UTF8,
//!< \ref CHIMERA_PATTERN_FLAG_SINGLEMATCH)
u32 maxWidth; //!< maximum width of a match, or UINT_MAX for inf.
u32 minWidth; //!< minimum width of a match.
u32 fixedWidth;//!< pattern has fixed width. The runtime treats the value
//!< ~0U as "Hyperscan matches need confirming with PCRE".
u32 studyOffset; //!< offset relative to struct start of study data,
//!< or zero if there is none
u32 length; //!< length of struct plus pcre bytecode and study data
pcre_extra extra; //!< pcre_extra struct, used to store study data ptr for
//!< the currently-running pcre at runtime.
};
/** \brief Locate the Chimera bytecode inside a database.
 *
 * \param db Database header; must not be NULL.
 * \return Pointer to the bytecode, found db->bytecode bytes past the start of
 *         \p db. Debug builds assert that it is 16-byte aligned.
 */
static really_inline
const void *ch_get_bytecode(const struct ch_database *db) {
    assert(db);

    const char *base = (const char *)db;
    const void *code = base + db->bytecode;

    assert(ISALIGNED_16(code));
    return code;
}
struct hs_database;

/** \brief Locate the embedded Hyperscan database inside Chimera bytecode.
 *
 * \param db Chimera bytecode header; must not be NULL.
 * \return Pointer db->databaseOffset bytes past the start of \p db. Debug
 *         builds assert that it is cache-line aligned.
 */
static really_inline
const struct hs_database *getHyperscanDatabase(const struct ch_bytecode *db) {
    assert(db);

    const struct hs_database *embedded =
        (const struct hs_database *)((const char *)db + db->databaseOffset);

    assert(ISALIGNED_CL(embedded));
    return embedded;
}
/** \brief Locate the list of unguarded pattern indices.
 *
 * \param db Chimera bytecode header; must not be NULL.
 * \return Pointer to the u32 array stored db->unguardedOffset bytes past the
 *         start of \p db (db->unguardedCount entries).
 */
static really_inline
const u32 *getUnguarded(const struct ch_bytecode *db) {
    assert(db);

    const u32 *list = (const u32 *)((const char *)db + db->unguardedOffset);

    assert(ISALIGNED_N(list, sizeof(u32)));
    return list;
}
/** \brief Look up the header of the i-th pattern.
 *
 * \param db Chimera bytecode header; must not be NULL.
 * \param i  Pattern index; must be less than db->patternCount.
 * \return Pointer to the pattern header, resolved through the table of u32
 *         offsets stored at db->patternOffset.
 */
static really_inline
const struct ch_pattern *getPattern(const struct ch_bytecode *db, u32 i) {
    assert(db);
    assert(i < db->patternCount);

    const char *base = (const char *)db;
    const u32 *table = (const u32 *)(base + db->patternOffset);

    // Each table entry is itself an offset from the start of the bytecode.
    assert(table[i] < db->length);
    return (const struct ch_pattern *)(base + table[i]);
}
/** \brief Sanity-check a Chimera database header.
 *
 * \param hydb Database to check; may be NULL.
 * \return CH_INVALID if \p hydb is NULL or its magic number is not
 *         \ref CH_DB_MAGIC; CH_DB_VERSION_ERROR if its version differs from
 *         HS_VERSION_32BIT; CH_SUCCESS otherwise.
 */
static really_inline
ch_error_t hydbIsValid(const struct ch_database *hydb) {
    // Handle NULL on its own: the "bad magic" debug message below reads
    // hydb->magic and must never be reached with a NULL pointer.
    if (!hydb) {
        DEBUG_PRINTF("null database\n");
        return CH_INVALID;
    }

    if (hydb->magic != CH_DB_MAGIC) {
        DEBUG_PRINTF("bad magic (%u != %u)\n", hydb->magic, CH_DB_MAGIC);
        return CH_INVALID;
    }

    if (hydb->version != HS_VERSION_32BIT) {
        DEBUG_PRINTF("bad version\n");
        return CH_DB_VERSION_ERROR;
    }

    return CH_SUCCESS;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_DATABASE_H_ */

44
chimera/ch_internal.h Normal file
View File

@@ -0,0 +1,44 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: data structures and internals.
*/
#ifndef CH_INTERNAL_H
#define CH_INTERNAL_H
/* Whole-database flags, stored in ch_bytecode::flags. */
#define CHIMERA_FLAG_NO_MULTIMATCH 1 //!< Don't run a multimatch scan
#define CHIMERA_FLAG_GROUPS 2 //!< Return capturing groups
#define CHIMERA_FLAG_ALL_CONFIRM 4 //!< All patterns need confirm
#define CHIMERA_FLAG_ALL_SINGLE 8 //!< All patterns need only one match
/* Per-pattern flags, stored in ch_pattern::flags. */
#define CHIMERA_PATTERN_FLAG_SINGLEMATCH 1 //!< only report the first match
#define CHIMERA_PATTERN_FLAG_UTF8 2 //!< pattern is in UTF-8 mode
#endif

629
chimera/ch_runtime.c Normal file
View File

@@ -0,0 +1,629 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: main runtime.
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ch.h"
#include "hs.h"
#include "hs_internal.h"
#include "ue2common.h"
#include "ch_database.h"
#include "ch_internal.h"
#include "ch_scratch.h"
#include "util/multibit.h"
#include "util/unicode_def.h"
typedef struct queue_item PQ_T;
/** \brief Priority-queue ordering: compare two items of \p pqc_items.
 *
 * Items are ordered by end offset (to), then start offset (from), then
 * pattern id, all ascending.
 *
 * \return Non-zero if item \p a orders before item \p b.
 */
static
char PQ_COMP(PQ_T *pqc_items, int a, int b) {
    const PQ_T *lhs = &pqc_items[a];
    const PQ_T *rhs = &pqc_items[b];

    if (lhs->to != rhs->to) {
        return lhs->to < rhs->to;
    }
    if (lhs->from != rhs->from) {
        return lhs->from < rhs->from;
    }
    return lhs->id < rhs->id;
}
/** \brief Priority-queue ordering: compare a queued item with a loose item.
 *
 * Same ordering as PQ_COMP (to, then from, then id, ascending), but the
 * right-hand side is passed by value instead of by index.
 *
 * \return Non-zero if item \p a of \p pqc_items orders before \p b_fixed.
 */
static
char PQ_COMP_B(PQ_T *pqc_items, int a, PQ_T b_fixed) {
    const PQ_T *lhs = &pqc_items[a];

    if (lhs->to != b_fixed.to) {
        return lhs->to < b_fixed.to;
    }
    if (lhs->from != b_fixed.from) {
        return lhs->from < b_fixed.from;
    }
    return lhs->id < b_fixed.id;
}
#include "util/pqueue.h"
/** \brief Push a match (from, to, id) onto the match priority queue.
 *
 * Builds a queue_item from the arguments, inserts it with pq_insert() and
 * bumps the queue's size.
 */
static really_inline
void pq_insert_with(struct match_pq *pq, int from, int to, u32 id) {
    DEBUG_PRINTF("inserting pattern%u in pq at %u\n", id, to);

    const struct queue_item entry = {
        .from = from,
        .to = to,
        .id = id,
    };

    pq_insert(pq->item, pq->size, entry);
    pq->size += 1;
}
/** \brief Remove the top item of the match priority queue and shrink its
 * recorded size by one. */
static really_inline
void pq_pop_nice(struct match_pq *pq) {
    pq_pop(pq->item, pq->size);
    --pq->size;
}
/** \brief No-op match handler, installed when the caller passes no match
 * callback. Ignores every argument and always returns 0. */
static
int null_onEvent(UNUSED unsigned id,
                 UNUSED unsigned long long from,
                 UNUSED unsigned long long to,
                 UNUSED unsigned flags,
                 UNUSED unsigned size,
                 UNUSED const ch_capture_t *captured,
                 UNUSED void *ctxt) {
    return 0;
}
/** \brief Chimera runtime context.
 *
 * Built on the stack by ch_scan_i() for the duration of one scan and handed
 * to the Hyperscan match callback as its opaque context pointer.
 */
struct HybridContext {
const char *data; //!< buffer being scanned
u32 length; //!< length of data buffer
u32 valid_utf8_highwater; //!< UTF-8 has been validated up to here.
const struct ch_bytecode *db; //!< bytecode of the database being scanned
struct ch_scratch *scratch; //!< scratch space in use for this scan
struct match_pq *pq; //!< queue of pending matches (ch_scan_i() points
//!< this at scratch->pq)
/** \brief user-supplied match callback (ch_scan_i() substitutes
 * null_onEvent when the user passes NULL, so this is never NULL there) */
int (*match_callback)(unsigned int id, unsigned long long from,
unsigned long long to, unsigned int flags,
unsigned int size, const ch_capture_t *capture,
void *ctx);
/** \brief user-supplied error callback; may be NULL */
int (*error_callback)(ch_error_event_t error_type, unsigned int id,
void *info, void *ctx);
/** \brief user-supplied context, passed back to both callbacks */
void *context;
};
// Internal PCRE func.
extern int _pcre_valid_utf(const unsigned char *, int, int *);
/** \brief UTF-8 validity check with memoisation.
 *
 * Validates only the bytes between the context's high-water mark and \p end,
 * and advances the high-water mark when they are valid.
 *
 * \return 1 if the data up to \p end is valid UTF-8, 0 otherwise.
 */
static
char isValidUTF8(struct HybridContext *hyctx, u32 end) {
    assert(hyctx);

    const u32 checked = hyctx->valid_utf8_highwater;
    if (end <= checked) {
        return 1; // Nothing new to validate.
    }

    const unsigned char *start = (const unsigned char *)hyctx->data + checked;
    int validate_len = end - checked;
    DEBUG_PRINTF("validating %d bytes\n", validate_len);

    int erroroffset = 0;
    if (_pcre_valid_utf(start, validate_len, &erroroffset) != 0) {
        DEBUG_PRINTF("UTF8 invalid at offset %d\n", erroroffset);
        return 0;
    }

    hyctx->valid_utf8_highwater = end;
    return 1;
}
/** \brief Locate the compiled pcre that follows a pattern header.
 *
 * \return Pointer to the pcre bytecode, which starts at
 *         sizeof(struct ch_pattern) rounded up to 8 bytes past \p pattern.
 */
static
const pcre *getPcre(const struct ch_pattern *pattern) {
    const char *base = (const char *)pattern;
    const pcre *code = (const pcre *)(base + ROUNDUP_N(sizeof(*pattern), 8));

    assert(ISALIGNED_N(code, 8));
    return code;
}
/** \brief Fill the Chimera groups array from a pcre_exec ovector.
 *
 * \param groups   Destination array with room for \p numPairs entries.
 * \param numPairs Number of (start, end) pairs to convert.
 * \param ovector  pcre_exec output vector; pair g lives at indices 2g, 2g+1.
 *
 * A start offset of -1 marks the group inactive, in which case its from/to
 * fields are left untouched.
 */
static
void fillGroupsFromOvector(ch_capture_t *groups, int numPairs, int *ovector) {
    assert(groups);
    assert(ISALIGNED_N(groups, alignof(ch_capture_t)));

    DEBUG_PRINTF("filling %d groups (@ %p) from pcre ovector\n",
                 numPairs, groups);

    for (int g = 0; g < numPairs; g++) {
        ch_capture_t *cap = &groups[g];
        const int begin = ovector[2 * g];

        if (begin == -1) {
            cap->flags = CH_CAPTURE_FLAG_INACTIVE;
            continue;
        }

        const int end = ovector[2 * g + 1];
        cap->flags = CH_CAPTURE_FLAG_ACTIVE;
        assert(begin <= end);
        cap->from = begin;
        cap->to = end;
    }
}
/** \brief Translate a negative pcre_exec() return code into a Chimera result.
 *
 * \param pattern     Pattern that was being run (its id is reported).
 * \param rv          Negative pcre_exec() return value.
 * \param onError     User error callback, or NULL.
 * \param userContext User context forwarded to \p onError.
 *
 * \return CH_SUCCESS for "no match"; for match-limit and recursion-limit
 *         errors, whatever \p onError returns (CH_SUCCESS when \p onError is
 *         NULL); CH_FAIL_INTERNAL for every other code.
 */
static
ch_error_t handlePcreNonMatch(const struct ch_pattern *pattern, int rv,
                              ch_error_event_handler onError,
                              void *userContext) {
    assert(rv < 0);

    int event;
    switch (rv) {
    case PCRE_ERROR_NOMATCH:
        DEBUG_PRINTF("no match found by libpcre\n");
        return CH_SUCCESS;
    case PCRE_ERROR_MATCHLIMIT:
        DEBUG_PRINTF("pcre hit match limit\n");
        event = CH_ERROR_MATCHLIMIT;
        break;
    case PCRE_ERROR_RECURSIONLIMIT:
        DEBUG_PRINTF("pcre hit recursion limit\n");
        event = CH_ERROR_RECURSIONLIMIT;
        break;
    default:
        // All other errors not handled above are fatal.
        return CH_FAIL_INTERNAL;
    }

    // Resource-limit errors are only surfaced if the user asked for them.
    if (!onError) {
        return CH_SUCCESS;
    }
    return onError(event, pattern->id, NULL, userContext);
}
/** \brief Run one PCRE match attempt for pattern \p id and queue the result.
 *
 * Executes the pattern's pcre over the whole scan buffer starting at
 * \p offset. On a match, the match is pushed onto the priority queue, the
 * per-pattern capture data is filled in (when the database was built with
 * CHIMERA_FLAG_GROUPS) and the pattern's scanStart is advanced past the match.
 *
 * \param hyctx  Runtime context.
 * \param length Only used for debug output.
 * \param offset Offset in the buffer at which PCRE starts matching.
 * \param id     Index of the pattern to run.
 *
 * \return CH_SUCCESS on a match; otherwise the result of
 *         handlePcreNonMatch() for the pcre return code.
 */
static
ch_error_t scanPcre(struct HybridContext *hyctx, UNUSED unsigned int length,
                    unsigned int offset, u32 id) {
    const struct ch_pattern *pattern = getPattern(hyctx->db, id);
    const pcre *code = getPcre(pattern);
    // The PCRE extra block lives inside the pattern header.
    const pcre_extra *extra = &pattern->extra;

    const char *buf = hyctx->data;
    const unsigned int buf_len = hyctx->length;
    ch_error_event_handler onError = hyctx->error_callback;
    void *userContext = hyctx->context;

    int *ovector = hyctx->scratch->ovector;
    const int ovectorSize = (hyctx->scratch->maxCaptureGroups + 1) * 3;
    assert(ovectorSize >= 2);

    const int startoffset = offset;
    DEBUG_PRINTF("scanning %u bytes, pattern %u, startoffset %d\n",
                 length, id, startoffset);

    const int is_utf8 = (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) != 0;
    int options = 0;
    if (is_utf8) {
        // We validate UTF-8 ourselves (once per buffer), so tell PCRE to
        // skip its own check.
        options |= PCRE_NO_UTF8_CHECK;
        if (!isValidUTF8(hyctx, buf_len)) {
            return handlePcreNonMatch(pattern, PCRE_ERROR_BADUTF8, onError,
                                      userContext);
        }
    }

    int rv = pcre_exec(code, extra, buf, buf_len, startoffset, options,
                       ovector, ovectorSize);
    DEBUG_PRINTF("pcre return code is %d\n", rv);

    // Any negative code is a non-match or an error.
    if (rv < 0) {
        return handlePcreNonMatch(pattern, rv, onError, userContext);
    }

    // A match always leaves at least the overall start/end pair in ovector.
    assert(rv >= 1);
    assert(rv < ovectorSize);

    const int from = ovector[0];
    const int to = ovector[1];
    DEBUG_PRINTF("match %d -> %d\n", from, to);

    struct ch_patterndata *pd = hyctx->scratch->patternData + id;

    // Capture groups are only recorded when the database asks for them.
    int groupCount = 0;
    if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) {
        fillGroupsFromOvector(pd->match, rv, ovector);
        groupCount = rv;
    }
    pd->groupCount = (u32)groupCount;

    // Insert new matched item to the queue
    pq_insert_with(hyctx->pq, from, to, id);

    // The next scan resumes at the end of this match. An empty match must be
    // stepped over (by one byte, or one whole codepoint in UTF-8 mode) so
    // that we always make progress.
    int next = to;
    if (from == to) {
        next = to + 1;
        if (is_utf8) {
            while (next < (int)buf_len &&
                   ((buf[next] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
                ++next;
            }
        }
    }

    pd->scanStart = next;
    DEBUG_PRINTF("new offset %u\n", pd->scanStart);
    return CH_SUCCESS;
}
/** \brief Report queued matches that end at or before \p to.
 *
 * Repeatedly takes the top item of the match priority queue. Items ending
 * after \p to stop the loop (after (from, to, id) has been queued); other
 * items are popped and reported through the user match callback, and the
 * pattern they belong to is re-run with PCRE to find its next match.
 *
 * \param hyctx Runtime context.
 * \param id    Pattern index of the match that triggered the catch-up; the
 *              loop stops once a match for this index has been reported.
 * \param from  Start offset of the triggering match.
 * \param to    End offset of the triggering match.
 *
 * \return CH_SCAN_TERMINATED if a callback asked to stop, CH_FAIL_INTERNAL
 *         if PCRE failed fatally, CH_SUCCESS otherwise.
 */
static
ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id,
unsigned long long from, unsigned long long to) {
ch_match_event_handler onEvent = hyctx->match_callback;
void *userContext = hyctx->context;
DEBUG_PRINTF("priority queue size %u\n", hyctx->pq->size);
while (hyctx->pq->size) {
// Remember the size before popping so we can tell below whether the
// rescan pushed a replacement item.
u32 num_item = hyctx->pq->size;
struct queue_item *item = pq_top(hyctx->pq->item);
size_t top_from = item->from;
size_t top_to = item->to;
u32 top_id = item->id;
// The earliest queued match ends after the triggering match: queue the
// triggering match and stop.
if (top_to > to) {
pq_insert_with(hyctx->pq, from, to, id);
break;
}
pq_pop_nice(hyctx->pq);
const struct ch_pattern *pattern = getPattern(hyctx->db, top_id);
struct ch_patterndata *pd = hyctx->scratch->patternData + top_id;
// Report match for pattern
DEBUG_PRINTF("trigger match@%zu\n", top_to);
ch_callback_t cbrv =
onEvent(pattern->id, top_from, top_to, 0 /* flags */,
pd->groupCount, pd->match, userContext);
if (cbrv == CH_CALLBACK_TERMINATE) {
DEBUG_PRINTF("user callback told us to terminate scanning\n");
return CH_SCAN_TERMINATED;
} else if (cbrv == CH_CALLBACK_SKIP_PATTERN) {
DEBUG_PRINTF("user callback told us to skip this pattern\n");
// Moving scanStart to the end of the buffer disables the pattern.
pd->scanStart = hyctx->length;
}
// We have just reported the match for the triggering pattern: done.
if (top_id == id) {
break;
}
// Push a new match to replace the old one
unsigned int start = pd->scanStart;
// NOTE: len wraps if scanStart > length, but it is only passed to
// scanPcre inside the guarded branch below.
unsigned int len = hyctx->length - pd->scanStart;
if (hyctx->length >= pd->scanStart &&
!(pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH)) {
DEBUG_PRINTF("get a new match item\n");
int ret = scanPcre(hyctx, len, start, top_id);
if (ret == CH_CALLBACK_TERMINATE) {
DEBUG_PRINTF("user callback told us to terminate scanning\n");
return CH_SCAN_TERMINATED;
} else if (ret == CH_CALLBACK_SKIP_PATTERN) {
DEBUG_PRINTF("user callback told us to skip this pattern\n");
pd->scanStart = hyctx->length;
ret = CH_SUCCESS;
} else if (ret == CH_FAIL_INTERNAL) {
return ret;
}
// No further match is found
if (hyctx->pq->size == num_item - 1) {
pd->scanStart = hyctx->length;
}
}
}
return CH_SUCCESS;
}
/** \brief Callback used for internal Hyperscan multi-matcher.
 *
 * Invoked by hs_scan() for every Hyperscan match. Patterns whose fixedWidth
 * is ~0U are confirmed by running PCRE (once, the first time they are seen);
 * for all other patterns the Hyperscan match itself is queued as the result,
 * after first flushing older queued matches via catchupPcre().
 *
 * \param id    Pattern index reported by Hyperscan.
 * \param from  Start offset of the Hyperscan match.
 * \param to    End offset of the Hyperscan match.
 * \param flags Unused.
 * \param ctx   The struct HybridContext for this scan.
 *
 * \return 0 to let Hyperscan continue; any non-zero value is returned to
 *         Hyperscan as a request to stop scanning. The PCRE-side status is
 *         also recorded in scratch->ret for ch_scan_i() to inspect.
 */
static
int multiCallback(unsigned int id, unsigned long long from,
unsigned long long to, UNUSED unsigned int flags,
void *ctx) {
assert(ctx);
struct HybridContext *hyctx = ctx;
DEBUG_PRINTF("match for ID %u at offset %llu\n", id, to);
assert(id < hyctx->db->patternCount);
const struct ch_pattern *pattern = getPattern(hyctx->db, id);
struct ch_patterndata *pd = hyctx->scratch->patternData + id;
char needConfirm = pattern->fixedWidth == ~0U;
// A confirm pattern that is already active has had its PCRE scan started;
// nothing more to do for it. If every pattern is in that state, Hyperscan
// has nothing left to tell us, so ask it to stop.
if (needConfirm &&
mmbit_isset(hyctx->scratch->active, hyctx->db->patternCount, id)) {
if ((hyctx->db->flags & CHIMERA_FLAG_ALL_CONFIRM) &&
mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) {
return 1;
}
return 0;
}
// Store the fact that we've seen this bit.
char already = mmbit_set(hyctx->scratch->active,
hyctx->db->patternCount, id);
DEBUG_PRINTF("match from %u to %llu\n", pd->scanStart, to);
if (!already) {
// First time this pattern is seen in this scan: start from offset 0.
pd->scanStart = 0;
} else if (to < pd->scanStart + pattern->minWidth) {
// Match ends too early to fit after the current scan start: ignore it.
return 0;
} else if (pattern->flags & CHIMERA_PATTERN_FLAG_SINGLEMATCH) {
if ((hyctx->db->flags & CHIMERA_FLAG_ALL_SINGLE) &&
mmbit_all(hyctx->scratch->active, hyctx->db->patternCount)) {
return 1;
}
// Note: we may have unordered match from Hyperscan,
// thus possibly get to < pd->scanStart.
return 0;
}
int ret = HS_SUCCESS;
unsigned int start = pd->scanStart;
unsigned int len = hyctx->length - pd->scanStart;
assert(hyctx->length >= pd->scanStart);
const char *data = hyctx->data;
if (needConfirm) {
DEBUG_PRINTF("run confirm for the first time\n");
ret = scanPcre(hyctx, len, start, id);
hyctx->scratch->ret = ret;
if (ret == CH_CALLBACK_TERMINATE) {
DEBUG_PRINTF("user callback told us to terminate scanning\n");
return HS_SCAN_TERMINATED;
} else if (ret == CH_CALLBACK_SKIP_PATTERN) {
DEBUG_PRINTF("user callback told us to skip this pattern\n");
pd->scanStart = hyctx->length;
// NOTE: scratch->ret keeps the CH_CALLBACK_SKIP_PATTERN value here.
ret = HS_SUCCESS;
} else if (ret == CH_FAIL_INTERNAL) {
return ret;
}
} else {
if (already) {
DEBUG_PRINTF("catch up with new matches\n");
ret = catchupPcre(hyctx, id, from, to);
hyctx->scratch->ret = ret;
// scanStart at or past the end means this pattern has been disabled.
if (pd->scanStart >= hyctx->length) {
return ret;
}
}
int startoffset = 0;
// Next scan starts at the first codepoint after the match. It's
// possible that we have a vacuous match, in which case we must step
// past it to ensure that we always progress.
if (from != to) {
startoffset = to;
} else if (pattern->flags & CHIMERA_PATTERN_FLAG_UTF8) {
startoffset = to + 1;
while (startoffset < (int)hyctx->length &&
((data[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
++startoffset;
}
} else {
startoffset = to + 1;
}
pd->scanStart = startoffset;
// Without PCRE there are no sub-groups: when groups are requested, report
// only the whole match as capture 0.
int rv = 0;
if (hyctx->db->flags & CHIMERA_FLAG_GROUPS) {
ch_capture_t *groups = pd->match;
groups->flags = CH_CAPTURE_FLAG_ACTIVE;
groups->from = from;
groups->to = to;
rv = 1;
}
pd->groupCount = (u32)rv;
pq_insert_with(hyctx->pq, from, to, id);
}
return ret;
}
/** \brief Run the embedded Hyperscan database over the buffer.
 *
 * Matches are delivered to multiCallback() with \p hyctx as its context.
 *
 * \return The hs_scan() return value.
 */
static
hs_error_t scanHyperscan(struct HybridContext *hyctx, const char *data,
                         unsigned int length) {
    DEBUG_PRINTF("scanning %u bytes with Hyperscan\n", length);

    const hs_database_t *hs_db = getHyperscanDatabase(hyctx->db);
    hs_scratch_t *hs_scratch = hyctx->scratch->multi_scratch;

    return hs_scan(hs_db, data, length, 0, hs_scratch, multiCallback, hyctx);
}
/** \brief Init match priority queue.
 *
 * Clears the active-pattern set and the queue, then runs PCRE from offset 0
 * for every unguarded pattern (those not supported by Hyperscan with
 * prefiltering) so that each contributes its first match to the queue.
 *
 * \return CH_SCAN_TERMINATED if an error callback asked to stop,
 *         CH_FAIL_INTERNAL on a fatal PCRE error, CH_SUCCESS otherwise.
 */
static really_inline
ch_error_t initQueue(struct HybridContext *hyctx, struct match_pq *pq) {
    const struct ch_bytecode *db = hyctx->db;
    u8 *active = hyctx->scratch->active;
    const unsigned int length = hyctx->length;

    mmbit_clear(active, db->patternCount);
    // Init match queue size
    pq->size = 0;

    const u32 *unguarded = getUnguarded(db);
    for (u32 n = 0; n < db->unguardedCount; n++) {
        const u32 patternId = unguarded[n];

        DEBUG_PRINTF("switch on unguarded pcre %u\n", patternId);
        mmbit_set(active, db->patternCount, patternId);

        DEBUG_PRINTF("get a new match item\n");
        const int ret = scanPcre(hyctx, length, 0, patternId);

        if (ret == CH_CALLBACK_TERMINATE) {
            DEBUG_PRINTF("user callback told us to terminate scanning\n");
            return CH_SCAN_TERMINATED;
        }
        if (ret == CH_CALLBACK_SKIP_PATTERN) {
            DEBUG_PRINTF("user callback told us to skip this pattern\n");
            // Parking scanStart at the end of the buffer disables the pattern.
            struct ch_patterndata *pd =
                hyctx->scratch->patternData + patternId;
            pd->scanStart = length;
            continue;
        }
        if (ret == CH_FAIL_INTERNAL) {
            return ret;
        }
    }

    return CH_SUCCESS;
}
/** \brief Implementation of ch_scan().
 *
 * Validates the arguments, marks the scratch as in use, seeds the match
 * queue with the unguarded patterns, runs Hyperscan (unless the database has
 * CHIMERA_FLAG_NO_MULTIMATCH set) and finally flushes every queued match to
 * the user callback. The scratch is unmarked on every exit path taken after
 * it was marked.
 *
 * \param hydb        Compiled Chimera database.
 * \param data        Buffer to scan.
 * \param length      Number of bytes in \p data; must not exceed INT_MAX.
 * \param flags       Unused.
 * \param scratch     Scratch space; must be cache-line aligned and not in use.
 * \param onEvent     Match callback, or NULL to discard matches.
 * \param onError     Error callback, or NULL.
 * \param userContext Opaque pointer handed to both callbacks.
 *
 * \return CH_SUCCESS on success; CH_INVALID for bad arguments;
 *         CH_SCRATCH_IN_USE if the scratch is already marked in use;
 *         CH_SCAN_TERMINATED if a callback asked to stop; otherwise the error
 *         reported by the database check, PCRE or Hyperscan.
 */
static really_inline
ch_error_t ch_scan_i(const ch_database_t *hydb,
                     const char *data, unsigned int length,
                     UNUSED unsigned int flags,
                     ch_scratch_t *scratch,
                     ch_match_event_handler onEvent,
                     ch_error_event_handler onError,
                     void *userContext) {
    if (unlikely(!hydb || !scratch || !data)) {
        DEBUG_PRINTF("args invalid\n");
        return CH_INVALID;
    }

    ch_error_t ret = hydbIsValid(hydb);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("database invalid\n");
        return ret;
    }

    if (!ISALIGNED_CL(scratch)) {
        DEBUG_PRINTF("bad alignment %p\n", scratch);
        return CH_INVALID;
    }

    if (scratch->magic != CH_SCRATCH_MAGIC) {
        DEBUG_PRINTF("scratch invalid\n");
        return CH_INVALID;
    }

    if (unlikely(markScratchInUse(scratch))) {
        return CH_SCRATCH_IN_USE;
    }

    // Hyperscan underlying scratch and database validity will be checked by
    // the hs_scan() call, so no need to do it here.

    // PCRE takes the data region length in as an int, so this limits our block
    // size to INT_MAX.
    if (length > INT_MAX) {
        DEBUG_PRINTF("length invalid\n");
        unmarkScratchInUse(scratch);
        return CH_INVALID;
    }

    const struct ch_bytecode *db = ch_get_bytecode(hydb);

    scratch->pq.size = 0;
    scratch->ret = CH_SUCCESS;

    // Firstly, we run Hyperscan in block mode and add its matches into the
    // active list for subsequent confirmation with pcre.
    struct HybridContext hyctx = {
        .data = data,
        .length = length,
        .valid_utf8_highwater = 0,
        .db = db,
        .scratch = scratch,
        .pq = &scratch->pq,
        .match_callback = onEvent ? onEvent : null_onEvent,
        .error_callback = onError,
        .context = userContext
    };

    // Init priority queue.
    ret = initQueue(&hyctx, &scratch->pq);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("Chimera returned error %d\n", ret);
        unmarkScratchInUse(scratch);
        return ret;
    }

    if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) {
        ret = scanHyperscan(&hyctx, data, length);
        if (ret != HS_SUCCESS) {
            // scratch->ret holds the PCRE-side status recorded by the
            // Hyperscan match callback.
            if (scratch->ret == CH_CALLBACK_TERMINATE) {
                // The error callback asked to stop: report it the same way
                // initQueue() and catchupPcre() do, not as the raw callback
                // value.
                DEBUG_PRINTF("callback terminated the scan\n");
                unmarkScratchInUse(scratch);
                return CH_SCAN_TERMINATED;
            }
            if (scratch->ret != CH_SUCCESS &&
                scratch->ret != CH_CALLBACK_SKIP_PATTERN) {
                // CH_CALLBACK_SKIP_PATTERN is excluded: it may be left over
                // from an earlier callback that let the scan continue and is
                // not an error.
                DEBUG_PRINTF("Hyperscan returned error %d\n", scratch->ret);
                unmarkScratchInUse(scratch);
                return scratch->ret;
            }
            if (ret != HS_SCAN_TERMINATED) {
                // Genuine Hyperscan failure (e.g. bad database or scratch).
                // HS_SCAN_TERMINATED alone is not an error: our own callback
                // stops Hyperscan on purpose once it has nothing more to
                // contribute.
                DEBUG_PRINTF("Hyperscan returned error %d\n", ret);
                unmarkScratchInUse(scratch);
                return ret;
            }
        }
    }

    DEBUG_PRINTF("Flush priority queue\n");
    // Catch up with PCRE and make up id and offsets as we don't really care
    // about their values
    ret = catchupPcre(&hyctx, ~0U, length, length);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("PCRE catch up returned error %d\n", ret);
        unmarkScratchInUse(scratch);
        return ret;
    }

    unmarkScratchInUse(scratch);
    return CH_SUCCESS;
}
/* Public entry point for block-mode scanning: forwards every argument
 * unchanged to ch_scan_i() and returns its result. */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_scan(const ch_database_t *hydb, const char *data,
                            unsigned int length, unsigned int flags,
                            ch_scratch_t *scratch,
                            ch_match_event_handler onEvent,
                            ch_error_event_handler onError, void *userContext) {
    return ch_scan_i(hydb, data, length, flags, scratch, onEvent, onError,
                     userContext);
}
/* Returns the version string, HS_VERSION_STRING. */
HS_PUBLIC_API
const char * HS_CDECL ch_version(void) {
    const char *version = HS_VERSION_STRING;
    return version;
}

377
chimera/ch_runtime.h Normal file
View File

@@ -0,0 +1,377 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_RUNTIME_H_
#define CH_RUNTIME_H_
#include <stdlib.h>
/**
* @file
* @brief The Chimera runtime API definition.
*
* Chimera is a hybrid of the Hyperscan and PCRE regular expression engines.
*
* This header contains functions for using compiled Chimera databases for
* scanning data at runtime.
*/
#include "hs_common.h"
#ifdef __cplusplus
extern "C"
{
#endif
struct ch_scratch;
/**
* A Chimera scratch space.
*/
typedef struct ch_scratch ch_scratch_t;
/**
* Callback return value used to tell the Chimera matcher what to do after
* processing this match.
*/
typedef int ch_callback_t;
/**
* @defgroup CH_CALLBACK ch_callback_t values
*
* @{
*/
/**
* Continue matching.
*/
#define CH_CALLBACK_CONTINUE 0
/**
* Terminate matching.
*/
#define CH_CALLBACK_TERMINATE 1
/**
* Skip remaining matches for this ID and continue.
*/
#define CH_CALLBACK_SKIP_PATTERN 2
/** @} */
/**
* Type used to differentiate the errors raised with the @ref
* ch_error_event_handler callback.
*/
typedef int ch_error_event_t;
/**
* @defgroup CH_ERROR_EVENT ch_error_event_t values
*
* @{
*/
/**
* PCRE hits its match limit and reports PCRE_ERROR_MATCHLIMIT.
*/
#define CH_ERROR_MATCHLIMIT 1
/**
* PCRE hits its recursion limit and reports PCRE_ERROR_RECURSIONLIMIT.
*/
#define CH_ERROR_RECURSIONLIMIT 2
/** @} */
/**
* Structure representing a captured subexpression within a match. An array of
* these structures corresponding to capture groups in order is passed to the
* callback on match, with active structures identified by the
* CH_CAPTURE_FLAG_ACTIVE flag.
*/
typedef struct ch_capture {
/**
* The flags indicating if this structure is active: either
* CH_CAPTURE_FLAG_ACTIVE or CH_CAPTURE_FLAG_INACTIVE.
*/
unsigned int flags;
/**
* offset at which this capture group begins.
*/
unsigned long long from;
/**
* offset at which this capture group ends.
*/
unsigned long long to;
} ch_capture_t;
/**
* @defgroup CH_CAPTURE ch_capture_t flags
*
* These flags are used in @ref ch_capture_t::flags to indicate if this
* structure is active.
*
* @{
*/
/**
* Flag indicating that a particular capture group is inactive, used in @ref
* ch_capture_t::flags.
*/
#define CH_CAPTURE_FLAG_INACTIVE 0
/**
* Flag indicating that a particular capture group is active, used in @ref
* ch_capture_t::flags.
*/
#define CH_CAPTURE_FLAG_ACTIVE 1
/** @} */
/**
 * Definition of the match event callback function type.
 *
 * A callback function matching the defined type must be provided by the
 * application calling the @ref ch_scan() function.
 *
 * This callback function will be invoked whenever a match is located in the
 * target data during the execution of a scan. The details of the match are
 * passed in as parameters to the callback function, and the callback function
 * should return a value indicating whether or not matching should continue on
 * the target data. If no callbacks are desired from a scan call, NULL may be
 * provided in order to suppress match production.
 *
 * @param id
 *      The ID number of the expression that matched. If the expression was a
 *      single expression compiled with @ref ch_compile(), this value will be
 *      zero.
 *
 * @param from
 *      The offset of the first byte that matches the expression.
 *
 * @param to
 *      The offset after the last byte that matches the expression.
 *
 * @param flags
 *      This is provided for future use and is unused at present (the runtime
 *      currently always passes zero).
 *
 * @param size
 *      The number of valid entries pointed to by the captured parameter. This
 *      is zero when capture groups are not being returned for the database.
 *
 * @param captured
 *      A pointer to an array of @ref ch_capture_t structures that
 *      contain the start and end offsets of entire pattern match and
 *      each captured subexpression.
 *
 * @param ctx
 *      The pointer supplied by the user to the @ref ch_scan() function.
 *
 * @return
 *      The callback can return @ref CH_CALLBACK_TERMINATE to stop matching.
 *      Otherwise, a return value of @ref CH_CALLBACK_CONTINUE will continue,
 *      with the current pattern if configured to produce multiple matches per
 *      pattern, while a return value of @ref CH_CALLBACK_SKIP_PATTERN will
 *      cease matching this pattern but continue matching the next pattern.
 */
typedef ch_callback_t (*ch_match_event_handler)(unsigned int id,
unsigned long long from,
unsigned long long to,
unsigned int flags,
unsigned int size,
const ch_capture_t *captured,
void *ctx);
/**
 * Definition of the Chimera error event callback function type.
 *
 * A callback function matching the defined type may be provided by the
 * application calling the @ref ch_scan function. This callback function
 * will be invoked when an error event occurs during matching; this indicates
 * that some matches for a given expression may not be reported.
 *
 * @param error_type
 *      The type of error event that occurred. Currently these errors
 *      correspond to resource limits on PCRE backtracking
 *      @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT.
 *
 * @param id
 *      The ID number of the expression that was being matched when the error
 *      event occurred.
 *
 * @param info
 *      Event-specific data, for future use. Currently unused (the runtime
 *      passes NULL).
 *
 * @param ctx
 *      The context pointer supplied by the user to the @ref ch_scan
 *      function.
 *
 * @return
 *      The callback can return @ref CH_CALLBACK_SKIP_PATTERN to cease matching this
 *      pattern but continue matching the next pattern. Otherwise, we stop
 *      matching for all patterns with @ref CH_CALLBACK_TERMINATE.
 */
typedef ch_callback_t (*ch_error_event_handler)(ch_error_event_t error_type,
unsigned int id, void *info,
void *ctx);
/**
 * The block regular expression scanner.
 *
 * This is the function call in which the actual pattern matching takes place
 * for block-mode pattern databases.
 *
 * @param db
 *      A compiled pattern database.
 *
 * @param data
 *      Pointer to the data to be scanned.
 *
 * @param length
 *      The number of bytes to scan. This must not exceed INT_MAX, because
 *      PCRE takes the data length as an int; larger values are rejected with
 *      @ref CH_INVALID.
 *
 * @param flags
 *      Flags modifying the behaviour of this function. This parameter is
 *      provided for future use and is unused at present.
 *
 * @param scratch
 *      A per-thread scratch space allocated by @ref ch_alloc_scratch() for this
 *      database.
 *
 * @param onEvent
 *      Pointer to a match event callback function. If a NULL pointer is given,
 *      no matches will be returned.
 *
 * @param onError
 *      Pointer to an error event callback function. If a NULL pointer is
 *      given, @ref CH_ERROR_MATCHLIMIT and @ref CH_ERROR_RECURSIONLIMIT errors
 *      will be ignored and matching will continue.
 *
 * @param context
 *      The user defined pointer which will be passed to the callback function.
 *
 * @return
 *      Returns @ref CH_SUCCESS on success; @ref CH_SCAN_TERMINATED if the
 *      match callback indicated that scanning should stop; other values on
 *      error.
 */
ch_error_t HS_CDECL ch_scan(const ch_database_t *db, const char *data,
unsigned int length, unsigned int flags,
ch_scratch_t *scratch,
ch_match_event_handler onEvent,
ch_error_event_handler onError,
void *context);
/**
* Allocate a "scratch" space for use by Chimera.
*
* This is required for runtime use, and one scratch space per thread, or
* concurrent caller, is required. Any allocator callback set by @ref
* ch_set_scratch_allocator() or @ref ch_set_allocator() will be used by this
* function.
*
* @param db
* The database, as produced by @ref ch_compile().
*
* @param scratch
* On first allocation, a pointer to NULL should be provided so a new
* scratch can be allocated. If a scratch block has been previously
* allocated, then a pointer to it should be passed back in to see if it
* is valid for this database block. If a new scratch block is required,
* the original will be freed and the new one returned, otherwise the
* previous scratch block will be returned. On success, the scratch block
* will be suitable for use with the provided database in addition to any
* databases that original scratch space was suitable for. If an allocation
* fails, NULL is stored through this pointer.
*
* @return
* @ref CH_SUCCESS on successful allocation; @ref CH_NOMEM if the
* allocation fails; @ref CH_INVALID if either argument is NULL or the
* scratch passed in is not a valid scratch block; @ref CH_SCRATCH_IN_USE
* if the scratch passed in is currently in use. Other errors may be
* returned if invalid parameters are specified.
*/
ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *db,
ch_scratch_t **scratch);
/**
* Allocate a scratch space that is a clone of an existing scratch space.
*
* This is useful when multiple concurrent threads will be using the same set
* of compiled databases, and another scratch space is required. Any allocator
* callback set by @ref ch_set_scratch_allocator() or @ref ch_set_allocator()
* will be used by this function.
*
* @param src
* The existing @ref ch_scratch_t to be cloned.
*
* @param dest
* A pointer to the new scratch space will be returned here.
*
* @return
* @ref CH_SUCCESS on success; @ref CH_NOMEM if the allocation fails;
* @ref CH_INVALID if either argument is NULL or @p src is not a valid
* scratch block. Other errors may be returned if invalid parameters are
* specified.
*/
ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src,
ch_scratch_t **dest);
/**
* Provides the size of the given scratch space.
*
* The reported size covers the Chimera scratch block plus, when one is
* attached, the underlying Hyperscan scratch space.
*
* @param scratch
* A per-thread scratch space allocated by @ref ch_alloc_scratch() or @ref
* ch_clone_scratch().
*
* @param scratch_size
* On success, the size of the scratch space in bytes is placed in this
* parameter.
*
* @return
* @ref CH_SUCCESS on success; @ref CH_INVALID if either argument is NULL
* or @p scratch is not a valid scratch block; other values on failure.
*/
ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch,
size_t *scratch_size);
/**
* Free a scratch block previously allocated by @ref ch_alloc_scratch() or @ref
* ch_clone_scratch().
*
* The free callback set by @ref ch_set_scratch_allocator() or @ref
* ch_set_allocator() will be used by this function. Any Hyperscan scratch
* space attached to the block is released as well.
*
* @param scratch
* The scratch block to be freed. NULL may also be safely provided.
*
* @return
* @ref CH_SUCCESS on success; @ref CH_INVALID if @p scratch is not a valid
* scratch block; @ref CH_SCRATCH_IN_USE if it is currently in use; other
* values on failure.
*/
ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_RUNTIME_H_ */

317
chimera/ch_scratch.c Normal file
View File

@@ -0,0 +1,317 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Chimera: scratch space alloc.
*/
#include <string.h>
#include "allocator.h"
#include "ch.h"
#include "hs.h"
#include "hs_internal.h"
#include "ue2common.h"
#include "ch_alloc.h"
#include "ch_internal.h"
#include "ch_scratch.h"
#include "ch_database.h"
/** \brief Bytes needed for the per-pattern data region of a scratch.
 *
 * The region holds one ch_patterndata header per pattern, followed by
 * (maxCaptureGroups + 1) ch_capture slots for every pattern.
 */
static
size_t getPatternDataSize(const ch_scratch_t *s) {
    // One header per pattern.
    const size_t headerBytes = sizeof(struct ch_patterndata) * s->patternCount;

    // Capture slots for all patterns together.
    const size_t captureSlots = s->patternCount * (s->maxCaptureGroups + 1);
    const size_t captureBytes = sizeof(struct ch_capture) * captureSlots;

    // alignof() bytes of slack let the capture array be aligned after the
    // header array.
    return headerBytes + alignof(struct ch_capture) + captureBytes;
}
static
void initPatternData(const ch_scratch_t *s) {
// ch_capture array is aligned, directly after the patterndata array.
char *ptr = (char *)s->patternData +
(sizeof(struct ch_patterndata) * s->patternCount);
struct ch_capture *cap = (struct ch_capture *)
(ROUNDUP_PTR(ptr, alignof(struct ch_capture)));
for (u32 i = 0; i < s->patternCount; i++) {
struct ch_patterndata *pd = &s->patternData[i];
pd->match = cap;
DEBUG_PRINTF("pattern %u: pd=%p, match=%p\n", i, pd, pd->match);
cap += (s->maxCaptureGroups + 1);
}
}
/** \brief Allocate and lay out a scratch sized according to \p proto.
 *
 * A single allocation holds, in order: the ch_scratch header (placed at a
 * 64-byte boundary), the PCRE ovector, the captured-group array, the
 * per-pattern data, the priority queue items and the active multibit.
 *
 * \param proto   Header supplying the sizing fields; its contents are copied
 *                into the new header.
 * \param scratch Receives the new scratch, or NULL on failure.
 * \return CH_SUCCESS, or the error reported by ch_check_alloc().
 */
static
ch_error_t alloc_scratch(const ch_scratch_t *proto, ch_scratch_t **scratch) {
    // Size of each region that follows the header.
    const size_t groupSlots = proto->maxCaptureGroups + 1;
    const size_t ovectorBytes = groupSlots * sizeof(int) * 3;
    const size_t capturedBytes = sizeof(struct ch_capture) * groupSlots;
    const size_t patternBytes = getPatternDataSize(proto);
    const size_t activeBytes = proto->activeSize;
    const size_t queueBytes = proto->patternCount * sizeof(struct queue_item);

    // Worst-case slack consumed by the per-region alignment below.
    const size_t alignSlack = alignof(int) + alignof(struct ch_capture) +
                              alignof(struct ch_patterndata) +
                              alignof(struct queue_item);

    const size_t totalBytes = sizeof(ch_scratch_t) + ovectorBytes +
                              capturedBytes + patternBytes + activeBytes +
                              queueBytes + alignSlack +
                              256; /* padding for cacheline alignment */

    ch_scratch_t *raw = ch_scratch_alloc(totalBytes);
    ch_error_t status = ch_check_alloc(raw);
    if (status != CH_SUCCESS) {
        ch_scratch_free(raw);
        *scratch = NULL;
        return status;
    }

    memset(raw, 0, totalBytes);
    ch_scratch_t *hdr = ROUNDUP_PTR(raw, 64);

    // Ordinary members: start from the prototype, then stamp our own.
    *hdr = *proto;
    hdr->magic = CH_SCRATCH_MAGIC;
    hdr->in_use = 0;
    hdr->scratch_alloc = (char *)raw; // what must eventually be freed

    // Carve the trailing regions out of the allocation.
    char *cursor = (char *)(hdr + 1);

    cursor = ROUNDUP_PTR(cursor, alignof(int));
    hdr->ovector = (int *)cursor;
    cursor += ovectorBytes;

    cursor = ROUNDUP_PTR(cursor, alignof(struct ch_capture));
    hdr->captured = (struct ch_capture *)cursor;
    cursor += capturedBytes;

    cursor = ROUNDUP_PTR(cursor, alignof(struct ch_patterndata));
    hdr->patternData = (struct ch_patterndata *)cursor;
    cursor += patternBytes;

    // patternData is in place, so the per-pattern capture pointers can be
    // filled in now.
    initPatternData(hdr);

    cursor = ROUNDUP_PTR(cursor, alignof(struct queue_item));
    hdr->pq.item = (struct queue_item *)cursor;
    cursor += queueBytes;

    hdr->active = (u8 *)cursor;

    // Record the size of the whole allocation.
    hdr->scratchSize = totalBytes;

    // The layout must fit inside what was allocated.
    assert((cursor + activeBytes) - (char *)hdr <= (ptrdiff_t)totalBytes);

    *scratch = hdr;
    return CH_SUCCESS;
}
/*
 * Allocate a scratch for hydb, or grow an existing one so that it also fits
 * hydb.
 *
 * Returns CH_INVALID for NULL arguments or a scratch that is misaligned or
 * lacks the magic number, CH_SCRATCH_IN_USE if the scratch is busy, the
 * hydbIsValid() error for a bad database, and an allocation error if any
 * allocation fails. When an allocation fails, everything owned by the scratch
 * is released and *scratch is set to NULL.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_alloc_scratch(const ch_database_t *hydb,
                                     ch_scratch_t **scratch) {
    if (!hydb || !scratch) {
        DEBUG_PRINTF("invalid args\n");
        return CH_INVALID;
    }

    DEBUG_PRINTF("hydb=%p, &scratch=%p\n", hydb, scratch);
    ch_error_t rv = hydbIsValid(hydb);
    if (rv != CH_SUCCESS) {
        DEBUG_PRINTF("invalid database\n");
        return rv;
    }

    if (*scratch != NULL) {
        /* has to be aligned before we can do anything with it */
        if (!ISALIGNED_CL(*scratch)) {
            return CH_INVALID;
        }
        if ((*scratch)->magic != CH_SCRATCH_MAGIC) {
            return CH_INVALID;
        }
        if (markScratchInUse(*scratch)) {
            return CH_SCRATCH_IN_USE;
        }
    }

    // We allocate a prototype of the scratch header to do our sizing with.
    ch_scratch_t *proto;
    ch_scratch_t *proto_tmp = ch_scratch_alloc(sizeof(ch_scratch_t) + 256);
    ch_error_t proto_ret = ch_check_alloc(proto_tmp);
    if (proto_ret != CH_SUCCESS) {
        ch_scratch_free(proto_tmp);
        if (*scratch) {
            // The header sits at an aligned offset inside its allocation, so
            // the pointer to hand back to the allocator is scratch_alloc, not
            // the header itself. The Hyperscan scratch is owned by this
            // scratch and must go with it.
            hs_free_scratch((*scratch)->multi_scratch);
            ch_scratch_free((*scratch)->scratch_alloc);
        }
        *scratch = NULL;
        return proto_ret;
    }

    proto = ROUNDUP_PTR(proto_tmp, 64);

    int resize = 0;
    if (*scratch) {
        *proto = **scratch;
    } else {
        memset(proto, 0, sizeof(*proto));
        resize = 1;
    }
    proto->scratch_alloc = (char *)proto_tmp;

    // Grow the prototype wherever this database needs more than we have.
    const struct ch_bytecode *db = ch_get_bytecode(hydb);
    if (db->maxCaptureGroups > proto->maxCaptureGroups) {
        proto->maxCaptureGroups = db->maxCaptureGroups;
        resize = 1;
    }

    if (db->patternCount > proto->patternCount) {
        proto->patternCount = db->patternCount;
        proto->activeSize = db->activeSize;
        resize = 1;
    }

    if (resize) {
        // The prototype carries the old scratch's Hyperscan scratch pointer
        // across the reallocation; keep hold of it so it can be released if
        // the new allocation fails.
        struct hs_scratch *carried_multi = proto->multi_scratch;

        if (*scratch) {
            ch_scratch_free((*scratch)->scratch_alloc);
        }

        ch_error_t alloc_ret = alloc_scratch(proto, scratch);
        ch_scratch_free(proto_tmp);
        if (alloc_ret != CH_SUCCESS) {
            hs_free_scratch(carried_multi);
            *scratch = NULL;
            return alloc_ret;
        }
    } else {
        ch_scratch_free(proto_tmp);
        unmarkScratchInUse(*scratch);
    }

    if (db->flags & CHIMERA_FLAG_NO_MULTIMATCH) {
        // This database needs no Hyperscan scratch. Leave multi_scratch as
        // it is: it is already NULL for a fresh scratch, and for a reused
        // scratch it may still be required by a database this scratch was
        // previously sized for.
        return CH_SUCCESS;
    }

    // We may still have to realloc the underlying Hyperscan scratch.
    rv = hs_alloc_scratch(getHyperscanDatabase(db),
                          &(*scratch)->multi_scratch);
    if (rv != HS_SUCCESS) {
        DEBUG_PRINTF("hs_alloc_scratch for multi_scratch failed\n");
        hs_free_scratch((*scratch)->multi_scratch);
        ch_scratch_free((*scratch)->scratch_alloc);
        *scratch = NULL;
        return rv;
    }

    return CH_SUCCESS;
}
/*
 * Create a new scratch with the same sizing as src, including a clone of its
 * Hyperscan scratch when it has one.
 *
 * Returns CH_INVALID if either argument is NULL or src is misaligned or lacks
 * the magic number; otherwise the error from the failing allocation, or
 * CH_SUCCESS. On any failure after validation *dest is set to NULL.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_clone_scratch(const ch_scratch_t *src,
                                     ch_scratch_t **dest) {
    if (!dest || !src || !ISALIGNED_CL(src) ||
        src->magic != CH_SCRATCH_MAGIC) {
        DEBUG_PRINTF("scratch invalid\n");
        return CH_INVALID;
    }

    ch_error_t ret = alloc_scratch(src, dest);
    if (ret != CH_SUCCESS) {
        DEBUG_PRINTF("alloc_scratch failed\n");
        *dest = NULL;
        return ret;
    }

    if (src->multi_scratch) {
        // alloc_scratch() copied src's pointer into the new header; clear it
        // so the clone never aliases the source's Hyperscan scratch.
        (*dest)->multi_scratch = NULL;
        ret = hs_clone_scratch(src->multi_scratch, &(*dest)->multi_scratch);
        if (ret != HS_SUCCESS) {
            DEBUG_PRINTF("hs_clone_scratch(multi_scratch,...) failed\n");
            // Free the pointer the allocator returned (the header lives at an
            // aligned offset inside it) and do not hand a dangling pointer
            // back to the caller.
            ch_scratch_free((*dest)->scratch_alloc);
            *dest = NULL;
            return ret;
        }
    }

    return CH_SUCCESS;
}
/*
 * Release a scratch together with its Hyperscan scratch. NULL is accepted
 * and reported as success.
 */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_free_scratch(ch_scratch_t *scratch) {
    if (!scratch) {
        return CH_SUCCESS;
    }

    /* has to be aligned before we can do anything with it */
    if (!ISALIGNED_CL(scratch) || scratch->magic != CH_SCRATCH_MAGIC) {
        return CH_INVALID;
    }

    if (markScratchInUse(scratch)) {
        return CH_SCRATCH_IN_USE;
    }

    // Result of freeing the Hyperscan scratch is what the caller gets back.
    ch_error_t status = CH_SUCCESS;
    if (scratch->multi_scratch) {
        status = hs_free_scratch(scratch->multi_scratch);
    }

    // Invalidate the header before the memory goes away.
    scratch->magic = 0;
    assert(scratch->scratch_alloc);
    DEBUG_PRINTF("scratch %p is really at %p : freeing\n", scratch,
                 scratch->scratch_alloc);
    ch_scratch_free(scratch->scratch_alloc);

    return status;
}
/** Report the total size of a scratch. The Chimera scratch block and its
 * Hyperscan scratch are separate allocations, so the value is the sum of the
 * two rather than the extent of one contiguous memory region. If the
 * Hyperscan size query fails, only the Chimera block is counted and the
 * failure code is returned. */
HS_PUBLIC_API
ch_error_t HS_CDECL ch_scratch_size(const ch_scratch_t *scratch, size_t *size) {
    if (!size || !scratch || !ISALIGNED_CL(scratch) ||
        scratch->magic != CH_SCRATCH_MAGIC) {
        return CH_INVALID;
    }

    ch_error_t status = CH_SUCCESS;
    size_t hsBytes = 0;
    if (scratch->multi_scratch) {
        status = hs_scratch_size(scratch->multi_scratch, &hsBytes);
        if (status) {
            hsBytes = 0;
        }
    }

    *size = scratch->scratchSize + hsBytes;
    return status;
}

119
chimera/ch_scratch.h Normal file
View File

@@ -0,0 +1,119 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Scratch and associated data structures.
*
* This header gets pulled into many places (many deep, slow to compile
* places). Try to keep the included headers under control.
*/
#ifndef CH_SCRATCH_H_
#define CH_SCRATCH_H_
#include "ch_common.h"
#include "ch_runtime.h"
#ifdef __cplusplus
extern "C"
{
#endif
/* Written into the header when a scratch is allocated and checked by the
 * scratch API entry points before the rest of the header is used. */
#define CH_SCRATCH_MAGIC 0x554F4259 //!< Magic number stored in \ref ch_scratch
/** \brief One entry of the match priority queue (see \ref match_pq). */
struct queue_item {
int from; /**< used to store the start location. */
int to; /**< used to store the current location. */
u32 id; /**< pattern index. */
};
/** \brief Priority queue of \ref queue_item entries, embedded in
 * \ref ch_scratch. */
struct match_pq {
struct queue_item *item; /**< entry storage; the scratch allocator points
                          *   this into the scratch allocation. */
u32 size; /**< current size of the priority queue */
};
/** \brief Information about a pattern stored at runtime when a match is
* encountered.
*
* One of these exists per pattern in the scratch; \c match is pointed at that
* pattern's own run of capture slots when the scratch is laid out. */
struct ch_patterndata {
struct ch_capture *match; //!< buffered group info
u32 groupCount; //!< number of capturing groups
u32 scanStart; //!< start of match window (still to be single-scanned).
};
/** \brief Scratch space header for Chimera.
*
* The header is placed at a 64-byte-aligned address inside a single
* allocation; ovector, captured, patternData, pq.item and active all point
* into the remainder of that same allocation. */
struct ch_scratch {
u32 magic; //!< must be \ref CH_SCRATCH_MAGIC
u8 in_use; /**< non-zero when being used by an API call. */
struct hs_scratch *multi_scratch; //!< for hyperscan scratch.
int *ovector; //!< maximally-sized ovector for PCRE usage.
struct ch_capture *captured; //!< max-sized capture group struct.
u8 *active; //!< active multibit.
struct ch_patterndata *patternData; //!< per-pattern match data, indexed by
//!< pattern ID.
struct match_pq pq; //!< priority queue to ensure matching ordering
u32 patternCount; //!< number of patterns, used to size active multibit
u32 activeSize; //!< size of active multibit
u32 maxCaptureGroups; //!< largest num of capturing groups required
u32 scratchSize; //!< size of allocation
int ret; //!< return value in Hyperscan callback
char *scratch_alloc; /**< pointer returned by the scratch allocator; this,
                      *   not the aligned header address, is what gets
                      *   passed to the scratch free function. */
};
/**
 * \brief Claim a scratch for the current API call.
 *
 * Sets the in-use flag unless it is already set. The return value is 1 when
 * the scratch was already in use (and is left untouched), 0 when the claim
 * succeeded.
 */
static really_inline
char markScratchInUse(struct ch_scratch *scratch) {
    DEBUG_PRINTF("marking scratch as in use\n");
    assert(scratch && scratch->magic == CH_SCRATCH_MAGIC);

    const char alreadyBusy = scratch->in_use ? 1 : 0;
    if (alreadyBusy) {
        DEBUG_PRINTF("scratch already in use!\n");
    } else {
        scratch->in_use = 1;
    }
    return alreadyBusy;
}
/**
* \brief Mark scratch as no longer in use.
*
* Counterpart of markScratchInUse(): the scratch must carry the magic number
* and must currently be marked in use (both are asserted).
*/
static really_inline
void unmarkScratchInUse(struct ch_scratch *scratch) {
DEBUG_PRINTF("marking scratch as not in use\n");
assert(scratch && scratch->magic == CH_SCRATCH_MAGIC);
assert(scratch->in_use == 1);
scratch->in_use = 0;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_SCRATCH_H_ */