/*
 * Mirror of https://github.com/VectorCamp/vectorscan.git
 * (synced 2025-06-28 16:41:01 +03:00; 395 lines, 16 KiB, C).
 */

/*
 * Copyright (c) 2018, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef CH_COMPILE_H_
#define CH_COMPILE_H_

/**
 * @file
 * @brief The Chimera compiler API definition.
 *
 * Chimera is a hybrid solution of Hyperscan and PCRE.
 *
 * This header contains functions for compiling regular expressions into
 * Chimera databases that can be used by the Chimera runtime.
 */

#include "ch_common.h"
#include "hs_compile.h"

#ifdef __cplusplus
extern "C"
{
#endif

/**
 * A type containing error details that is returned by the compile calls
 * (@ref ch_compile(), @ref ch_compile_multi() and
 * @ref ch_compile_ext_multi()) on failure. The caller may inspect the values
 * returned in this type to determine the cause of failure, and must release
 * it with @ref ch_free_compile_error() once it is no longer needed.
 */
typedef struct ch_compile_error {
    /**
     * A human-readable error message describing the error.
     */
    char *message;

    /**
     * The zero-based number of the expression that caused the error (if this
     * can be determined). If the error is not specific to an expression, then
     * this value will be less than zero.
     */
    int expression;
} ch_compile_error_t;

/**
|
|
* The basic regular expression compiler.
|
|
*
|
|
* This is the function call with which an expression is compiled into a
|
|
* Chimera database which can be passed to the runtime function (
|
|
* @ref ch_scan())
|
|
*
|
|
* @param expression
|
|
* The NULL-terminated expression to parse. Note that this string must
|
|
* represent ONLY the pattern to be matched, with no delimiters or flags;
|
|
* any global flags should be specified with the @a flags argument. For
|
|
* example, the expression `/abc?def/i` should be compiled by providing
|
|
* `abc?def` as the @a expression, and @ref CH_FLAG_CASELESS as the @a
|
|
* flags.
|
|
*
|
|
* @param flags
|
|
* Flags which modify the behaviour of the expression. Multiple flags may
|
|
* be used by ORing them together. Valid values are:
|
|
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
|
|
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
|
|
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
|
|
* - CH_FLAG_SINGLEMATCH - Only one match will be generated for the
|
|
* expression per stream.
|
|
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
|
|
* - CH_FLAG_UCP - Use Unicode properties for character classes.
|
|
*
|
|
* @param mode
|
|
* Compiler mode flag that affect the database as a whole for capturing
|
|
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
|
|
* See @ref CH_MODE_FLAG for more details.
|
|
*
|
|
* @param platform
|
|
* If not NULL, the platform structure is used to determine the target
|
|
* platform for the database. If NULL, a database suitable for running
|
|
* on the current host platform is produced.
|
|
*
|
|
* @param db
|
|
* On success, a pointer to the generated database will be returned in
|
|
* this parameter, or NULL on failure. The caller is responsible for
|
|
* deallocating the buffer using the @ref ch_free_database() function.
|
|
*
|
|
* @param compile_error
|
|
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
|
|
* returned, providing details of the error condition. The caller is
|
|
* responsible for deallocating the buffer using the @ref
|
|
* ch_free_compile_error() function.
|
|
*
|
|
* @return
|
|
* @ref CH_SUCCESS is returned on successful compilation; @ref
|
|
* CH_COMPILER_ERROR on failure, with details provided in the error
|
|
* parameter.
|
|
*/
|
|
ch_error_t HS_CDECL ch_compile(const char *expression, unsigned int flags,
|
|
unsigned int mode,
|
|
const hs_platform_info_t *platform,
|
|
ch_database_t **db,
|
|
ch_compile_error_t **compile_error);
|
|
|
|
/**
|
|
* The multiple regular expression compiler.
|
|
*
|
|
* This is the function call with which a set of expressions is compiled into a
|
|
* database which can be passed to the runtime function (@ref ch_scan()).
|
|
* Each expression can be labelled with a unique integer which is passed into
|
|
* the match callback to identify the pattern that has matched.
|
|
*
|
|
* @param expressions
|
|
* Array of NULL-terminated expressions to compile. Note that (as for @ref
|
|
* ch_compile()) these strings must contain only the pattern to be
|
|
* matched, with no delimiters or flags. For example, the expression
|
|
* `/abc?def/i` should be compiled by providing `abc?def` as the first
|
|
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
|
|
* first value in the @a flags array.
|
|
*
|
|
* @param flags
|
|
* Array of flags which modify the behaviour of each expression. Multiple
|
|
* flags may be used by ORing them together. Specifying the NULL pointer
|
|
* in place of an array will set the flags value for all patterns to zero.
|
|
* Valid values are:
|
|
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
|
|
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
|
|
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
|
|
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
|
|
* with this match id per stream.
|
|
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
|
|
* - CH_FLAG_UCP - Use Unicode properties for character classes.
|
|
*
|
|
* @param ids
|
|
* An array of integers specifying the ID number to be associated with the
|
|
* corresponding pattern in the expressions array. Specifying the NULL
|
|
* pointer in place of an array will set the ID value for all patterns to
|
|
* zero.
|
|
*
|
|
* @param elements
|
|
* The number of elements in the input arrays.
|
|
*
|
|
* @param mode
|
|
* Compiler mode flag that affect the database as a whole for capturing
|
|
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
|
|
* See @ref CH_MODE_FLAG for more details.
|
|
*
|
|
* @param platform
|
|
* If not NULL, the platform structure is used to determine the target
|
|
* platform for the database. If NULL, a database suitable for running
|
|
* on the current host platform is produced.
|
|
*
|
|
* @param db
|
|
* On success, a pointer to the generated database will be returned in
|
|
* this parameter, or NULL on failure. The caller is responsible for
|
|
* deallocating the buffer using the @ref ch_free_database() function.
|
|
*
|
|
* @param compile_error
|
|
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
|
|
* returned, providing details of the error condition. The caller is
|
|
* responsible for deallocating the buffer using the @ref
|
|
* ch_free_compile_error() function.
|
|
*
|
|
* @return
|
|
* @ref CH_SUCCESS is returned on successful compilation; @ref
|
|
* CH_COMPILER_ERROR on failure, with details provided in the @a error
|
|
* parameter.
|
|
*
|
|
*/
|
|
ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
|
|
const unsigned int *flags,
|
|
const unsigned int *ids,
|
|
unsigned int elements, unsigned int mode,
|
|
const hs_platform_info_t *platform,
|
|
ch_database_t **db,
|
|
ch_compile_error_t **compile_error);
|
|
|
|
/**
|
|
* The multiple regular expression compiler with extended match limits support.
|
|
*
|
|
* This is the function call with which a set of expressions is compiled into a
|
|
* database in the same way as @ref ch_compile_multi(), but allows additional
|
|
* parameters to be specified via match_limit and match_limit_recursion to
|
|
* define match limits for PCRE runtime.
|
|
*
|
|
* @param expressions
|
|
* Array of NULL-terminated expressions to compile. Note that (as for @ref
|
|
* ch_compile()) these strings must contain only the pattern to be
|
|
* matched, with no delimiters or flags. For example, the expression
|
|
* `/abc?def/i` should be compiled by providing `abc?def` as the first
|
|
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
|
|
* first value in the @a flags array.
|
|
*
|
|
* @param flags
|
|
* Array of flags which modify the behaviour of each expression. Multiple
|
|
* flags may be used by ORing them together. Specifying the NULL pointer
|
|
* in place of an array will set the flags value for all patterns to zero.
|
|
* Valid values are:
|
|
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
|
|
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
|
|
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
|
|
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
|
|
* with this match id per stream.
|
|
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
|
|
* - CH_FLAG_UCP - Use Unicode properties for character classes.
|
|
*
|
|
* @param ids
|
|
* An array of integers specifying the ID number to be associated with the
|
|
* corresponding pattern in the expressions array. Specifying the NULL
|
|
* pointer in place of an array will set the ID value for all patterns to
|
|
* zero.
|
|
*
|
|
* @param elements
|
|
* The number of elements in the input arrays.
|
|
*
|
|
* @param mode
|
|
* Compiler mode flag that affect the database as a whole for capturing
|
|
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
|
|
* See @ref CH_MODE_FLAG for more details.
|
|
*
|
|
* @param match_limit
|
|
* A limit from pcre_extra on the amount of match function called in PCRE
|
|
* to limit backtracking that can take place.
|
|
*
|
|
* @param match_limit_recursion
|
|
* A limit from pcre_extra on the recursion depth of match function
|
|
* in PCRE.
|
|
*
|
|
* @param platform
|
|
* If not NULL, the platform structure is used to determine the target
|
|
* platform for the database. If NULL, a database suitable for running
|
|
* on the current host platform is produced.
|
|
*
|
|
* @param db
|
|
* On success, a pointer to the generated database will be returned in
|
|
* this parameter, or NULL on failure. The caller is responsible for
|
|
* deallocating the buffer using the @ref ch_free_database() function.
|
|
*
|
|
* @param compile_error
|
|
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
|
|
* returned, providing details of the error condition. The caller is
|
|
* responsible for deallocating the buffer using the @ref
|
|
* ch_free_compile_error() function.
|
|
*
|
|
* @return
|
|
* @ref CH_SUCCESS is returned on successful compilation; @ref
|
|
* CH_COMPILER_ERROR on failure, with details provided in the @a error
|
|
* parameter.
|
|
*
|
|
*/
|
|
ch_error_t HS_CDECL ch_compile_ext_multi(const char *const *expressions,
|
|
const unsigned int *flags,
|
|
const unsigned int *ids,
|
|
unsigned int elements,
|
|
unsigned int mode,
|
|
unsigned long int match_limit,
|
|
unsigned long int match_limit_recursion,
|
|
const hs_platform_info_t *platform,
|
|
ch_database_t **db,
|
|
ch_compile_error_t **compile_error);
|
|
|
|
/**
|
|
* Free an error structure generated by @ref ch_compile(), @ref
|
|
* ch_compile_multi().
|
|
*
|
|
* @param error
|
|
* The @ref ch_compile_error_t to be freed. NULL may also be safely
|
|
* provided.
|
|
*
|
|
* @return
|
|
* @ref CH_SUCCESS on success, other values on failure.
|
|
*/
|
|
ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error);
|
|
|
|
/**
 * @defgroup CH_PATTERN_FLAG Pattern flags
 *
 * @{
 */

/**
 * Compile flag: Set case-insensitive matching.
 *
 * This flag sets the expression to be matched case-insensitively by default.
 * The expression may still use PCRE tokens (notably `(?i)` and
 * `(?-i)`) to switch case-insensitive matching on and off.
 */
#define CH_FLAG_CASELESS 1

/**
 * Compile flag: Matching a `.` will not exclude newlines.
 *
 * This flag sets any instances of the `.` token to match newline characters as
 * well as all other characters. The PCRE specification states that the `.`
 * token does not match newline characters by default, so without this flag the
 * `.` token will not cross line boundaries.
 */
#define CH_FLAG_DOTALL 2

/**
 * Compile flag: Set multi-line anchoring.
 *
 * This flag instructs the expression to make the `^` and `$` tokens match
 * newline characters as well as the start and end of the stream. If this flag
 * is not specified, the `^` token will only ever match at the start of a
 * stream, and the `$` token will only ever match at the end of a stream within
 * the guidelines of the PCRE specification.
 */
#define CH_FLAG_MULTILINE 4

/**
 * Compile flag: Set single-match only mode.
 *
 * This flag sets the expression's match ID to match at most once; only the
 * first match for each invocation of @ref ch_scan() will be returned.
 */
#define CH_FLAG_SINGLEMATCH 8

/**
 * Compile flag: Enable UTF-8 mode for this expression.
 *
 * This flag instructs Chimera to treat the pattern as a sequence of UTF-8
 * characters. The results of scanning invalid UTF-8 sequences with a Chimera
 * library that has been compiled with one or more patterns using this flag are
 * undefined.
 */
#define CH_FLAG_UTF8 32

/**
 * Compile flag: Enable Unicode property support for this expression.
 *
 * This flag instructs Chimera to use Unicode properties, rather than the
 * default ASCII interpretations, for character mnemonics like `\w` and `\s` as
 * well as the POSIX character classes. It is only meaningful in conjunction
 * with @ref CH_FLAG_UTF8.
 */
#define CH_FLAG_UCP 64

/** @} */

/**
 * @defgroup CH_MODE_FLAG Compile mode flags
 *
 * The mode flags are used as values for the mode parameter of the various
 * compile calls (@ref ch_compile(), @ref ch_compile_multi() and
 * @ref ch_compile_ext_multi()).
 *
 * By default, the matcher will only supply the start and end offsets of the
 * match when the match callback is called. Using mode flag @ref CH_MODE_GROUPS
 * will also fill the `captured` array with the start and end offsets of all
 * the capturing groups specified by the pattern that has matched.
 *
 * @{
 */

/**
 * Compiler mode flag: Disable capturing groups.
 */
#define CH_MODE_NOGROUPS 0

/**
 * Compiler mode flag: Enable capturing groups.
 */
#define CH_MODE_GROUPS 1048576

/** @} */

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* CH_COMPILE_H_ */