mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-30 19:47:43 +03:00
chimera: hybrid of Hyperscan and PCRE
This commit is contained in:
394
chimera/ch_compile.h
Normal file
394
chimera/ch_compile.h
Normal file
@@ -0,0 +1,394 @@
|
||||
/*
|
||||
* Copyright (c) 2018, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef CH_COMPILE_H_
|
||||
#define CH_COMPILE_H_
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @brief The Chimera compiler API definition.
|
||||
*
|
||||
* Chimera is a hybrid solution of Hyperscan and PCRE.
|
||||
*
|
||||
* This header contains functions for compiling regular expressions into
|
||||
* Chimera databases that can be used by the Chimera runtime.
|
||||
*/
|
||||
|
||||
#include "ch_common.h"
|
||||
#include "hs_compile.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
/**
 * A type containing error details that is returned by the compile calls (@ref
 * ch_compile(), @ref ch_compile_multi() and @ref ch_compile_ext_multi()) on
 * failure. The caller may inspect the values returned in this type to
 * determine the cause of failure.
 */
typedef struct ch_compile_error {
    /**
     * A human-readable error message describing the error.
     */
    char *message;

    /**
     * The zero-based number of the expression that caused the error (if this
     * can be determined). If the error is not specific to an expression, then
     * this value will be less than zero.
     */
    int expression;
} ch_compile_error_t;
|
||||
|
||||
/**
|
||||
* The basic regular expression compiler.
|
||||
*
|
||||
* This is the function call with which an expression is compiled into a
|
||||
* Chimera database which can be passed to the runtime function (
|
||||
* @ref ch_scan())
|
||||
*
|
||||
* @param expression
|
||||
* The NULL-terminated expression to parse. Note that this string must
|
||||
* represent ONLY the pattern to be matched, with no delimiters or flags;
|
||||
* any global flags should be specified with the @a flags argument. For
|
||||
* example, the expression `/abc?def/i` should be compiled by providing
|
||||
* `abc?def` as the @a expression, and @ref CH_FLAG_CASELESS as the @a
|
||||
* flags.
|
||||
*
|
||||
* @param flags
|
||||
* Flags which modify the behaviour of the expression. Multiple flags may
|
||||
* be used by ORing them together. Valid values are:
|
||||
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
|
||||
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
|
||||
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
|
||||
* - CH_FLAG_SINGLEMATCH - Only one match will be generated for the
|
||||
* expression per stream.
|
||||
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
|
||||
* - CH_FLAG_UCP - Use Unicode properties for character classes.
|
||||
*
|
||||
* @param mode
|
||||
* Compiler mode flag that affect the database as a whole for capturing
|
||||
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
|
||||
* See @ref CH_MODE_FLAG for more details.
|
||||
*
|
||||
* @param platform
|
||||
* If not NULL, the platform structure is used to determine the target
|
||||
* platform for the database. If NULL, a database suitable for running
|
||||
* on the current host platform is produced.
|
||||
*
|
||||
* @param db
|
||||
* On success, a pointer to the generated database will be returned in
|
||||
* this parameter, or NULL on failure. The caller is responsible for
|
||||
* deallocating the buffer using the @ref ch_free_database() function.
|
||||
*
|
||||
* @param compile_error
|
||||
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
|
||||
* returned, providing details of the error condition. The caller is
|
||||
* responsible for deallocating the buffer using the @ref
|
||||
* ch_free_compile_error() function.
|
||||
*
|
||||
* @return
|
||||
* @ref CH_SUCCESS is returned on successful compilation; @ref
|
||||
* CH_COMPILER_ERROR on failure, with details provided in the error
|
||||
* parameter.
|
||||
*/
|
||||
ch_error_t HS_CDECL ch_compile(const char *expression, unsigned int flags,
|
||||
unsigned int mode,
|
||||
const hs_platform_info_t *platform,
|
||||
ch_database_t **db,
|
||||
ch_compile_error_t **compile_error);
|
||||
|
||||
/**
|
||||
* The multiple regular expression compiler.
|
||||
*
|
||||
* This is the function call with which a set of expressions is compiled into a
|
||||
* database which can be passed to the runtime function (@ref ch_scan()).
|
||||
* Each expression can be labelled with a unique integer which is passed into
|
||||
* the match callback to identify the pattern that has matched.
|
||||
*
|
||||
* @param expressions
|
||||
* Array of NULL-terminated expressions to compile. Note that (as for @ref
|
||||
* ch_compile()) these strings must contain only the pattern to be
|
||||
* matched, with no delimiters or flags. For example, the expression
|
||||
* `/abc?def/i` should be compiled by providing `abc?def` as the first
|
||||
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
|
||||
* first value in the @a flags array.
|
||||
*
|
||||
* @param flags
|
||||
* Array of flags which modify the behaviour of each expression. Multiple
|
||||
* flags may be used by ORing them together. Specifying the NULL pointer
|
||||
* in place of an array will set the flags value for all patterns to zero.
|
||||
* Valid values are:
|
||||
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
|
||||
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
|
||||
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
|
||||
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
|
||||
* with this match id per stream.
|
||||
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
|
||||
* - CH_FLAG_UCP - Use Unicode properties for character classes.
|
||||
*
|
||||
* @param ids
|
||||
* An array of integers specifying the ID number to be associated with the
|
||||
* corresponding pattern in the expressions array. Specifying the NULL
|
||||
* pointer in place of an array will set the ID value for all patterns to
|
||||
* zero.
|
||||
*
|
||||
* @param elements
|
||||
* The number of elements in the input arrays.
|
||||
*
|
||||
* @param mode
|
||||
* Compiler mode flag that affect the database as a whole for capturing
|
||||
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
|
||||
* See @ref CH_MODE_FLAG for more details.
|
||||
*
|
||||
* @param platform
|
||||
* If not NULL, the platform structure is used to determine the target
|
||||
* platform for the database. If NULL, a database suitable for running
|
||||
* on the current host platform is produced.
|
||||
*
|
||||
* @param db
|
||||
* On success, a pointer to the generated database will be returned in
|
||||
* this parameter, or NULL on failure. The caller is responsible for
|
||||
* deallocating the buffer using the @ref ch_free_database() function.
|
||||
*
|
||||
* @param compile_error
|
||||
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
|
||||
* returned, providing details of the error condition. The caller is
|
||||
* responsible for deallocating the buffer using the @ref
|
||||
* ch_free_compile_error() function.
|
||||
*
|
||||
* @return
|
||||
* @ref CH_SUCCESS is returned on successful compilation; @ref
|
||||
* CH_COMPILER_ERROR on failure, with details provided in the @a error
|
||||
* parameter.
|
||||
*
|
||||
*/
|
||||
ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
|
||||
const unsigned int *flags,
|
||||
const unsigned int *ids,
|
||||
unsigned int elements, unsigned int mode,
|
||||
const hs_platform_info_t *platform,
|
||||
ch_database_t **db,
|
||||
ch_compile_error_t **compile_error);
|
||||
|
||||
/**
|
||||
* The multiple regular expression compiler with extended match limits support.
|
||||
*
|
||||
* This is the function call with which a set of expressions is compiled into a
|
||||
* database in the same way as @ref ch_compile_multi(), but allows additional
|
||||
* parameters to be specified via match_limit and match_limit_recursion to
|
||||
* define match limits for PCRE runtime.
|
||||
*
|
||||
* @param expressions
|
||||
* Array of NULL-terminated expressions to compile. Note that (as for @ref
|
||||
* ch_compile()) these strings must contain only the pattern to be
|
||||
* matched, with no delimiters or flags. For example, the expression
|
||||
* `/abc?def/i` should be compiled by providing `abc?def` as the first
|
||||
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
|
||||
* first value in the @a flags array.
|
||||
*
|
||||
* @param flags
|
||||
* Array of flags which modify the behaviour of each expression. Multiple
|
||||
* flags may be used by ORing them together. Specifying the NULL pointer
|
||||
* in place of an array will set the flags value for all patterns to zero.
|
||||
* Valid values are:
|
||||
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
|
||||
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
|
||||
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
|
||||
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
|
||||
* with this match id per stream.
|
||||
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
|
||||
* - CH_FLAG_UCP - Use Unicode properties for character classes.
|
||||
*
|
||||
* @param ids
|
||||
* An array of integers specifying the ID number to be associated with the
|
||||
* corresponding pattern in the expressions array. Specifying the NULL
|
||||
* pointer in place of an array will set the ID value for all patterns to
|
||||
* zero.
|
||||
*
|
||||
* @param elements
|
||||
* The number of elements in the input arrays.
|
||||
*
|
||||
* @param mode
|
||||
* Compiler mode flag that affect the database as a whole for capturing
|
||||
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
|
||||
* See @ref CH_MODE_FLAG for more details.
|
||||
*
|
||||
* @param match_limit
|
||||
* A limit from pcre_extra on the amount of match function called in PCRE
|
||||
* to limit backtracking that can take place.
|
||||
*
|
||||
* @param match_limit_recursion
|
||||
* A limit from pcre_extra on the recursion depth of match function
|
||||
* in PCRE.
|
||||
*
|
||||
* @param platform
|
||||
* If not NULL, the platform structure is used to determine the target
|
||||
* platform for the database. If NULL, a database suitable for running
|
||||
* on the current host platform is produced.
|
||||
*
|
||||
* @param db
|
||||
* On success, a pointer to the generated database will be returned in
|
||||
* this parameter, or NULL on failure. The caller is responsible for
|
||||
* deallocating the buffer using the @ref ch_free_database() function.
|
||||
*
|
||||
* @param compile_error
|
||||
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
|
||||
* returned, providing details of the error condition. The caller is
|
||||
* responsible for deallocating the buffer using the @ref
|
||||
* ch_free_compile_error() function.
|
||||
*
|
||||
* @return
|
||||
* @ref CH_SUCCESS is returned on successful compilation; @ref
|
||||
* CH_COMPILER_ERROR on failure, with details provided in the @a error
|
||||
* parameter.
|
||||
*
|
||||
*/
|
||||
ch_error_t HS_CDECL ch_compile_ext_multi(const char *const *expressions,
|
||||
const unsigned int *flags,
|
||||
const unsigned int *ids,
|
||||
unsigned int elements,
|
||||
unsigned int mode,
|
||||
unsigned long int match_limit,
|
||||
unsigned long int match_limit_recursion,
|
||||
const hs_platform_info_t *platform,
|
||||
ch_database_t **db,
|
||||
ch_compile_error_t **compile_error);
|
||||
|
||||
/**
|
||||
* Free an error structure generated by @ref ch_compile(), @ref
|
||||
* ch_compile_multi().
|
||||
*
|
||||
* @param error
|
||||
* The @ref ch_compile_error_t to be freed. NULL may also be safely
|
||||
* provided.
|
||||
*
|
||||
* @return
|
||||
* @ref CH_SUCCESS on success, other values on failure.
|
||||
*/
|
||||
ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error);
|
||||
|
||||
/**
|
||||
* @defgroup CH_PATTERN_FLAG Pattern flags
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
 * Compile flag: Set case-insensitive matching.
 *
 * This flag sets the expression to be matched case-insensitively by default.
 * The expression may still use PCRE tokens (notably `(?i)` and
 * `(?-i)`) to switch case-insensitive matching on and off.
 */
#define CH_FLAG_CASELESS 1
|
||||
|
||||
/**
 * Compile flag: Matching a `.` will not exclude newlines.
 *
 * This flag sets any instances of the `.` token to match newline characters as
 * well as all other characters. The PCRE specification states that the `.`
 * token does not match newline characters by default, so without this flag the
 * `.` token will not cross line boundaries.
 */
#define CH_FLAG_DOTALL 2
|
||||
|
||||
/**
 * Compile flag: Set multi-line anchoring.
 *
 * This flag instructs the expression to make the `^` and `$` tokens match
 * newline characters as well as the start and end of the stream. If this flag
 * is not specified, the `^` token will only ever match at the start of a
 * stream, and the `$` token will only ever match at the end of a stream within
 * the guidelines of the PCRE specification.
 */
#define CH_FLAG_MULTILINE 4
|
||||
|
||||
/**
 * Compile flag: Set single-match only mode.
 *
 * This flag sets the expression's match ID to match at most once; only the
 * first match for each invocation of @ref ch_scan() will be returned.
 */
#define CH_FLAG_SINGLEMATCH 8
|
||||
|
||||
/**
 * Compile flag: Enable UTF-8 mode for this expression.
 *
 * This flag instructs Chimera to treat the pattern as a sequence of UTF-8
 * characters. The results of scanning invalid UTF-8 sequences with a Chimera
 * library that has been compiled with one or more patterns using this flag are
 * undefined.
 */
#define CH_FLAG_UTF8 32
|
||||
|
||||
/**
 * Compile flag: Enable Unicode property support for this expression.
 *
 * This flag instructs Chimera to use Unicode properties, rather than the
 * default ASCII interpretations, for character mnemonics like `\w` and `\s` as
 * well as the POSIX character classes. It is only meaningful in conjunction
 * with @ref CH_FLAG_UTF8.
 */
#define CH_FLAG_UCP 64
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* @defgroup CH_MODE_FLAG Compile mode flags
|
||||
*
|
||||
* The mode flags are used as values for the mode parameter of the various
|
||||
* compile calls (@ref ch_compile(), @ref ch_compile_multi() and @ref ch_compile_ext_multi()).
|
||||
*
|
||||
* By default, the matcher will only supply the start and end offsets of the
|
||||
* match when the match callback is called. Using mode flag @ref CH_MODE_GROUPS
|
||||
* will also fill the `captured` array with the start and end offsets of all
|
||||
* the capturing groups specified by the pattern that has matched.
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
 * Compiler mode flag: Disable capturing groups.
 */
#define CH_MODE_NOGROUPS 0
|
||||
|
||||
/**
 * Compiler mode flag: Enable capturing groups.
 */
#define CH_MODE_GROUPS 1048576 /* 1 << 20 */
|
||||
|
||||
/** @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* CH_COMPILE_H_ */
|
Reference in New Issue
Block a user