vectorscan/src/hs_compile.h
2022-08-29 15:03:14 +03:00

1219 lines
49 KiB
C

/*
* Copyright (c) 2015-2021, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef HS_COMPILE_H_
#define HS_COMPILE_H_
/**
* @file
* @brief The Hyperscan compiler API definition.
*
* Hyperscan is a high speed regular expression engine.
*
* This header contains functions for compiling regular expressions into
* Hyperscan databases that can be used by the Hyperscan runtime.
*/
#include "hs_common.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* A type containing error details that is returned by the compile calls (@ref
* hs_compile(), @ref hs_compile_multi() and @ref hs_compile_ext_multi()) on
* failure. The caller may inspect the values returned in this type to
* determine the cause of failure.
*
* Common errors generated during the compile process include:
*
* - *Invalid parameter*
*
* An invalid argument was specified in the compile call.
*
* - *Unrecognised flag*
*
* An unrecognised value was passed in the flags argument.
*
* - *Pattern matches empty buffer*
*
* By default, Hyperscan only supports patterns that will *always*
* consume at least one byte of input. Patterns that do not have this
* property (such as `/(abc)?/`) will produce this error unless
* the @ref HS_FLAG_ALLOWEMPTY flag is supplied. Note that such
* patterns will produce a match for *every* byte when scanned.
*
* - *Embedded anchors not supported*
*
* Hyperscan only supports the use of anchor meta-characters (such as
* `^` and `$`) in patterns where they could *only* match
* at the start or end of a buffer. A pattern containing an embedded
* anchor, such as `/abc^def/`, can never match, as there is no
* way for `abc` to precede the start of the data stream.
*
* - *Bounded repeat is too large*
*
* The pattern contains a repeated construct with very large finite
* bounds.
*
* - *Unsupported component type*
*
* An unsupported PCRE construct was used in the pattern.
*
* - *Unable to generate bytecode*
*
* This error indicates that Hyperscan was unable to compile a pattern
* that is syntactically valid. The most common cause is a pattern that is
* very long and complex or contains a large repeated subpattern.
*
* - *Unable to allocate memory*
*
* The library was unable to allocate temporary storage used during
* compilation time.
*
* - *Allocator returned misaligned memory*
*
* The memory allocator (either malloc() or the allocator set with @ref
* hs_set_allocator()) did not correctly return memory suitably aligned
* for the largest representable data type on this platform.
*
* - *Internal error*
*
* An unexpected error occurred: if this error is reported, please contact
* the Hyperscan team with a description of the situation.
*/
typedef struct hs_compile_error {
    /**
     * Human-readable text describing what went wrong.
     */
    char *message;

    /**
     * Zero-based index of the expression that triggered the error, when that
     * can be determined. A negative value means the error is not tied to any
     * particular expression.
     */
    int expression;
} hs_compile_error_t;
/**
 * Describes the target platform. A pointer to this structure may optionally
 * be supplied to the compile calls (@ref hs_compile(), @ref hs_compile_multi(),
 * @ref hs_compile_ext_multi()).
 *
 * The @ref hs_populate_platform() call can be used to fill in a
 * hs_platform_info structure for the current platform.
 */
typedef struct hs_platform_info {
    /**
     * Tuning hint describing the target platform, which may be used to guide
     * the optimisation process of the compile.
     *
     * Setting this field does not restrict which processors the resulting
     * database can run on, but it may affect that database's performance.
     */
    unsigned int tune;

    /**
     * Relevant CPU features available on the target platform.
     *
     * The value may be built by or'ing together one or more
     * HS_CPU_FEATURES_* flags (such as @ref HS_CPU_FEATURES_AVX2).
     */
    unsigned long long cpu_features;

    /**
     * Reserved for future use.
     */
    unsigned long long reserved1;

    /**
     * Reserved for future use.
     */
    unsigned long long reserved2;
} hs_platform_info_t;
/**
 * Information about an expression, as returned by @ref hs_expression_info()
 * or @ref hs_expression_ext_info().
 */
typedef struct hs_expr_info {
    /**
     * Minimum length, in bytes, of a match for the pattern.
     *
     * Note: when advanced features that suppress matches are in use (such as
     * extended parameters or the @ref HS_FLAG_SINGLEMATCH flag), this may be
     * a conservative lower bound rather than the true minimum match length.
     */
    unsigned int min_width;

    /**
     * Maximum length, in bytes, of a match for the pattern. For a pattern
     * whose maximum length is unbounded, this is set to the maximum value of
     * an unsigned int (UINT_MAX).
     *
     * Note: when advanced features that suppress matches are in use (such as
     * extended parameters or the @ref HS_FLAG_SINGLEMATCH flag), this may be
     * a conservative upper bound rather than the true maximum match length.
     */
    unsigned int max_width;

    /**
     * Non-zero if this expression can produce matches that are not returned
     * in order (for example, those produced by assertions); zero otherwise.
     */
    char unordered_matches;

    /**
     * Non-zero if this expression can produce matches at end of data (EOD);
     * zero otherwise. In streaming mode, EOD matches are raised during
     * @ref hs_close_stream(), because the EOD location only becomes known
     * when @ref hs_close_stream() is called.
     *
     * Note: a trailing `\b` word boundary assertion may also result in EOD
     * matches, as end-of-data can act as a word boundary.
     */
    char matches_at_eod;

    /**
     * Non-zero if this expression can *only* produce matches at end of data
     * (EOD); zero otherwise. In streaming mode, all matches for such an
     * expression are raised during @ref hs_close_stream().
     */
    char matches_only_at_eod;
} hs_expr_info_t;
/**
 * Additional per-expression parameters, supplied at build time to
 * @ref hs_compile_ext_multi() or @ref hs_expression_ext_info().
 *
 * They let the set of matches produced by a pattern be constrained at
 * compile time, so that the application does not have to process unwanted
 * matches at runtime.
 */
typedef struct hs_expr_ext {
    /**
     * Flags selecting which of the fields below the compiler should use.
     * See @ref HS_EXT_FLAG.
     */
    unsigned long long flags;

    /**
     * Minimum end offset in the data stream at which this expression should
     * match successfully. Enabled by setting @ref HS_EXT_FLAG_MIN_OFFSET in
     * the hs_expr_ext::flags field.
     */
    unsigned long long min_offset;

    /**
     * Maximum end offset in the data stream at which this expression should
     * match successfully. Enabled by setting @ref HS_EXT_FLAG_MAX_OFFSET in
     * the hs_expr_ext::flags field.
     */
    unsigned long long max_offset;

    /**
     * Minimum match length (from start to end) required for this expression
     * to match successfully. Enabled by setting @ref HS_EXT_FLAG_MIN_LENGTH
     * in the hs_expr_ext::flags field.
     */
    unsigned long long min_length;

    /**
     * Allow the pattern to match approximately, within this edit distance.
     * Enabled by setting @ref HS_EXT_FLAG_EDIT_DISTANCE in the
     * hs_expr_ext::flags field.
     */
    unsigned edit_distance;

    /**
     * Allow the pattern to match approximately, within this Hamming
     * distance. Enabled by setting @ref HS_EXT_FLAG_HAMMING_DISTANCE in the
     * hs_expr_ext::flags field.
     */
    unsigned hamming_distance;
} hs_expr_ext_t;
/**
 * @defgroup HS_EXT_FLAG hs_expr_ext_t flags
 *
 * Bit flags for @ref hs_expr_ext_t::flags. Each flag marks one field of the
 * structure as being in use; flags may be or'ed together.
 *
 * @{
 */

/** The hs_expr_ext::min_offset field is used. */
#define HS_EXT_FLAG_MIN_OFFSET (1ULL << 0)

/** The hs_expr_ext::max_offset field is used. */
#define HS_EXT_FLAG_MAX_OFFSET (1ULL << 1)

/** The hs_expr_ext::min_length field is used. */
#define HS_EXT_FLAG_MIN_LENGTH (1ULL << 2)

/** The hs_expr_ext::edit_distance field is used. */
#define HS_EXT_FLAG_EDIT_DISTANCE (1ULL << 3)

/** The hs_expr_ext::hamming_distance field is used. */
#define HS_EXT_FLAG_HAMMING_DISTANCE (1ULL << 4)

/** @} */
/**
* The basic regular expression compiler.
*
* This is the function call with which an expression is compiled into a
* Hyperscan database which can be passed to the runtime functions (such as
* @ref hs_scan(), @ref hs_open_stream(), etc.)
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @p flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated for the
* expression per stream.
* - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
* empty string, such as `.*`.
* - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - HS_FLAG_UCP - Use Unicode properties for character classes.
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
* - HS_FLAG_COMBINATION - Parse the expression in logical combination
* syntax.
* - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
* the sub-expressions in logical combinations.
*
* @param mode
* Compiler mode flags that affect the database as a whole. One of @ref
* HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
* supplied, to select between the generation of a streaming, block or
* vectored database. In addition, other flags (beginning with HS_MODE_)
* may be supplied to enable specific features. See @ref HS_MODE_FLAG for
* more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref hs_free_database() function.
*
* @param error
* If the compile fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful compilation; @ref
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t HS_CDECL hs_compile(
    const char *expression,
    unsigned int flags,
    unsigned int mode,
    const hs_platform_info_t *platform,
    hs_database_t **db,
    hs_compile_error_t **error);
/**
* The multiple regular expression compiler.
*
* This is the function call with which a set of expressions is compiled into a
* database which can be passed to the runtime functions (such as @ref
* hs_scan(), @ref hs_open_stream(), etc.) Each expression can be labelled with
* a unique integer which is passed into the match callback to identify the
* pattern that has matched.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* hs_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @p expressions array, and @ref HS_FLAG_CASELESS as the
* first value in the @p flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per stream.
* - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
* empty string, such as `.*`.
* - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - HS_FLAG_UCP - Use Unicode properties for character classes.
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
* - HS_FLAG_COMBINATION - Parse the expression in logical combination
* syntax.
* - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
* the sub-expressions in logical combinations.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flags that affect the database as a whole. One of @ref
* HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
* supplied, to select between the generation of a streaming, block or
* vectored database. In addition, other flags (beginning with HS_MODE_)
* may be supplied to enable specific features. See @ref HS_MODE_FLAG for
* more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref hs_free_database() function.
*
* @param error
* If the compile fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful compilation; @ref
* HS_COMPILER_ERROR on failure, with details provided in the @p error
* parameter.
*
*/
hs_error_t HS_CDECL hs_compile_multi(
    const char *const *expressions,
    const unsigned int *flags,
    const unsigned int *ids,
    unsigned int elements,
    unsigned int mode,
    const hs_platform_info_t *platform,
    hs_database_t **db,
    hs_compile_error_t **error);
/**
* The multiple regular expression compiler with extended parameter support.
*
* This function call compiles a group of expressions into a database in the
* same way as @ref hs_compile_multi(), but allows additional parameters to be
* specified via an @ref hs_expr_ext_t structure per expression.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* hs_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @p expressions array, and @ref HS_FLAG_CASELESS as the
* first value in the @p flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per stream.
* - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
* empty string, such as `.*`.
* - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - HS_FLAG_UCP - Use Unicode properties for character classes.
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
* - HS_FLAG_COMBINATION - Parse the expression in logical combination
* syntax.
* - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
* the sub-expressions in logical combinations.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param ext
* An array of pointers to filled @ref hs_expr_ext_t structures that
* define extended behaviour for each pattern. NULL may be specified if no
* extended behaviour is needed for an individual pattern, or in place of
* the whole array if it is not needed for any expressions. Memory used by
* these structures must be both allocated and freed by the caller.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flags that affect the database as a whole. One of @ref
* HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
* supplied, to select between the generation of a streaming, block or
* vectored database. In addition, other flags (beginning with HS_MODE_)
* may be supplied to enable specific features. See @ref HS_MODE_FLAG for
* more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref hs_free_database() function.
*
* @param error
* If the compile fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful compilation; @ref
* HS_COMPILER_ERROR on failure, with details provided in the @p error
* parameter.
*
*/
hs_error_t HS_CDECL hs_compile_ext_multi(
    const char *const *expressions,
    const unsigned int *flags,
    const unsigned int *ids,
    const hs_expr_ext_t *const *ext,
    unsigned int elements,
    unsigned int mode,
    const hs_platform_info_t *platform,
    hs_database_t **db,
    hs_compile_error_t **error);
/**
* The basic pure literal expression compiler.
*
* This is the function call with which a pure literal expression (not a
* common regular expression) is compiled into a Hyperscan database which
* can be passed to the runtime functions (such as @ref hs_scan(),
* @ref hs_open_stream(), etc.)
*
* @param expression
* The literal expression to parse. Its end is determined by @p len rather
* than by a terminating `\0` character. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @p flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @p
* flags. Meanwhile, the string content is interpreted purely literally,
* without any regular expression syntax. For example, the @p expression
* `abc?` simply means the character sequence `a`, `b`, `c`, `?`; the `?`
* here is not a zero-or-one quantifier as it would be in a regular
* expression.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Compared to @ref hs_compile(), fewer
* valid values are provided:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated for the
* expression per stream.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
*
* @param len
* The length of the text content of the pure literal expression. Because
* the content indicated by @p expression is treated as a plain sequence of
* characters, the character `\0` is allowed to appear within it and is
* not treated as a string terminator. The end of a pure literal
* expression is therefore determined by this length, not by locating a
* `\0`.
*
* @param mode
* Compiler mode flags that affect the database as a whole. One of @ref
* HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
* supplied, to select between the generation of a streaming, block or
* vectored database. In addition, other flags (beginning with HS_MODE_)
* may be supplied to enable specific features. See @ref HS_MODE_FLAG for
* more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref hs_free_database() function.
*
* @param error
* If the compile fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful compilation; @ref
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t HS_CDECL hs_compile_lit(
    const char *expression,
    unsigned flags,
    const size_t len,
    unsigned mode,
    const hs_platform_info_t *platform,
    hs_database_t **db,
    hs_compile_error_t **error);
/**
* The multiple pure literal expression compiler.
*
* This is the function call with which a set of pure literal expressions is
* compiled into a database which can be passed to the runtime functions (such
* as @ref hs_scan(), @ref hs_open_stream(), etc.) Each expression can be
* labelled with a unique integer which is passed into the match callback to
* identify the pattern that has matched.
*
* @param expressions
* Array of pure literal expressions to compile. The end of each
* expression is determined by the corresponding entry in @p lens rather
* than by a terminating `\0` character. Note that (as for @ref
* hs_compile_lit()) these strings must contain ONLY the pattern to be
* matched, with no delimiters or flags; any flags should be specified via
* the @p flags array. For example, the expression `/abc?def/i` should be
* compiled by providing `abc?def` as the first string in the
* @p expressions array, and @ref HS_FLAG_CASELESS as the first value in
* the @p flags array. Meanwhile, the string content is interpreted purely
* literally, without any regular expression syntax. For example, the
* expression `abc?` simply means the character sequence `a`, `b`, `c`,
* `?`; the `?` here is not a zero-or-one quantifier as it would be in a
* regular expression.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Compared to @ref hs_compile_multi(), fewer valid values are provided:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated for the
* expression per stream.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param lens
* Array of lengths of the text content of each pure literal expression.
* Because the content of each entry of @p expressions is treated as a
* plain sequence of characters, the character `\0` is allowed to appear
* within it and is not treated as a string terminator. The end of a pure
* literal expression is therefore determined by its length, not by
* locating a `\0`.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flags that affect the database as a whole. One of @ref
* HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
* supplied, to select between the generation of a streaming, block or
* vectored database. In addition, other flags (beginning with HS_MODE_)
* may be supplied to enable specific features. See @ref HS_MODE_FLAG for
* more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref hs_free_database() function.
*
* @param error
* If the compile fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful compilation; @ref
* HS_COMPILER_ERROR on failure, with details provided in the error
* parameter.
*/
hs_error_t HS_CDECL hs_compile_lit_multi(
    const char * const *expressions,
    const unsigned *flags,
    const unsigned *ids,
    const size_t *lens,
    unsigned elements,
    unsigned mode,
    const hs_platform_info_t *platform,
    hs_database_t **db,
    hs_compile_error_t **error);
/**
* Free an error structure generated by one of the compile or expression
* analysis calls declared in this header, such as @ref hs_compile(), @ref
* hs_compile_multi(), @ref hs_compile_ext_multi(), @ref hs_compile_lit(),
* @ref hs_compile_lit_multi() or @ref hs_expression_info().
*
* @param error
* The @ref hs_compile_error_t to be freed. NULL may also be safely
* provided.
*
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error);
/**
* Utility function providing information about a regular expression. The
* information provided in @ref hs_expr_info_t includes the minimum and maximum
* width of a pattern match.
*
* Note: successful analysis of an expression with this function does not imply
* that compilation of the same expression (via @ref hs_compile(), @ref
* hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This
* function may return @ref HS_SUCCESS for regular expressions that Hyperscan
* cannot compile.
*
* Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref
* HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect
* the properties returned in the @ref hs_expr_info_t structure, they will not
* affect the outcome of this function.
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @p flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated by the
* expression per stream.
* - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
* empty string, such as `.*`.
* - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - HS_FLAG_UCP - Use Unicode properties for character classes.
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
* - HS_FLAG_QUIET - This flag will be ignored.
*
* @param info
* On success, a pointer to the pattern information will be returned in
* this parameter, or NULL on failure. This structure is allocated using
* the allocator supplied in @ref hs_set_allocator() (or malloc() if no
* allocator was set) and should be freed by the caller.
*
* @param error
* If the call fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful analysis of the expression;
* @ref HS_COMPILER_ERROR on failure, with details provided in the @p error
* parameter.
*/
hs_error_t HS_CDECL hs_expression_info(
    const char *expression,
    unsigned int flags,
    hs_expr_info_t **info,
    hs_compile_error_t **error);
/**
* Utility function providing information about a regular expression, with
* extended parameter support. The information provided in @ref hs_expr_info_t
* includes the minimum and maximum width of a pattern match.
*
* Note: successful analysis of an expression with this function does not imply
* that compilation of the same expression (via @ref hs_compile(), @ref
* hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This
* function may return @ref HS_SUCCESS for regular expressions that Hyperscan
* cannot compile.
*
* Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref
* HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect
* the properties returned in the @ref hs_expr_info_t structure, they will not
* affect the outcome of this function.
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @p flags argument. For
* example, the expression `/abc?def/i` should be analysed by providing
* `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @p
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
* - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - HS_FLAG_SINGLEMATCH - Only one match will be generated by the
* expression per stream.
* - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
* empty string, such as `.*`.
* - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - HS_FLAG_UCP - Use Unicode properties for character classes.
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
* when a match is found.
* - HS_FLAG_QUIET - This flag will be ignored.
*
* @param ext
* A pointer to a filled @ref hs_expr_ext_t structure that defines
* extended behaviour for this pattern. NULL may be specified if no
* extended parameters are needed.
*
* @param info
* On success, a pointer to the pattern information will be returned in
* this parameter, or NULL on failure. This structure is allocated using
* the allocator supplied in @ref hs_set_allocator() (or malloc() if no
* allocator was set) and should be freed by the caller.
*
* @param error
* If the call fails, a pointer to a @ref hs_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* hs_free_compile_error() function.
*
* @return
* @ref HS_SUCCESS is returned on successful analysis of the expression;
* @ref HS_COMPILER_ERROR on failure, with details provided in the @p error
* parameter.
*/
hs_error_t HS_CDECL hs_expression_ext_info(const char *expression,
unsigned int flags,
const hs_expr_ext_t *ext,
hs_expr_info_t **info,
hs_compile_error_t **error);
/**
* Populates the platform information based on the current host.
*
* @param platform
* A pointer to a caller-provided @ref hs_platform_info_t structure. On
* success, the pointed-to structure is populated based on the current
* host.
*
* @return
* @ref HS_SUCCESS on success, other values on failure.
*/
hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
/**
* @defgroup HS_PATTERN_FLAG Pattern flags
*
* @{
*/
/**
* Compile flag: Set case-insensitive matching.
*
* With this flag the expression is matched case-insensitively by default.
* PCRE tokens (notably `(?i)` and `(?-i)`) may still be used within the
* expression to switch case-insensitive matching on and off.
*/
#define HS_FLAG_CASELESS (1 << 0)
/**
* Compile flag: Matching a `.` will not exclude newlines.
*
* With this flag every instance of the `.` token matches newline characters
* in addition to all other characters. According to the PCRE specification
* the `.` token does not match newline characters by default, so when this
* flag is absent the `.` token will not cross line boundaries.
*/
#define HS_FLAG_DOTALL (1 << 1)
/**
* Compile flag: Set multi-line anchoring.
*
* With this flag the `^` and `$` tokens match newline characters as well as
* the start and end of the stream. When the flag is not specified, the `^`
* token only ever matches at the start of a stream and the `$` token only
* ever matches at the end of a stream, within the guidelines of the PCRE
* specification.
*/
#define HS_FLAG_MULTILINE (1 << 2)
/**
* Compile flag: Set single-match only mode.
*
* With this flag the expression's match ID matches at most once. In streaming
* mode the expression returns only a single match over the lifetime of the
* stream, instead of reporting every match as per standard Hyperscan
* semantics. In block mode or vectored mode, only the first match for each
* invocation of @ref hs_scan() or @ref hs_scan_vector() is returned.
*
* When several expressions in the database share the same match ID, either
* all of them must specify @ref HS_FLAG_SINGLEMATCH or none of them may
* specify it. If a group of expressions sharing a match ID specify the flag,
* at most one match with that match ID is generated per stream.
*
* Note: Combining this flag with @ref HS_FLAG_SOM_LEFTMOST is not currently
* supported.
*/
#define HS_FLAG_SINGLEMATCH (1 << 3)
/**
* Compile flag: Allow expressions that can match against empty buffers.
*
* By default the compiler returns an error when an attempt is made to compile
* an expression that can match against an empty buffer, such as `.?`, `.*` or
* `(a|)`: since Hyperscan can return every possible match for an expression,
* such expressions generally execute very slowly. Setting this flag forces
* the compiler to allow such an expression.
*/
#define HS_FLAG_ALLOWEMPTY (1 << 4)
/**
* Compile flag: Enable UTF-8 mode for this expression.
*
* With this flag Hyperscan treats the pattern as a sequence of UTF-8
* characters. The results of scanning invalid UTF-8 sequences with a
* Hyperscan library that has been compiled with one or more patterns using
* this flag are undefined.
*/
#define HS_FLAG_UTF8 (1 << 5)
/**
* Compile flag: Enable Unicode property support for this expression.
*
* With this flag Hyperscan uses Unicode properties, instead of the default
* ASCII interpretations, for character mnemonics like `\w` and `\s` as well
* as for the POSIX character classes. The flag is only meaningful in
* conjunction with @ref HS_FLAG_UTF8.
*/
#define HS_FLAG_UCP (1 << 6)
/**
* Compile flag: Enable prefiltering mode for this expression.
*
* With this flag Hyperscan compiles an "approximate" version of the pattern
* for use in a prefiltering application, even if Hyperscan does not support
* the pattern in normal operation.
*
* The matches returned when this flag is used are guaranteed to be a superset
* of the matches specified by the non-prefiltering expression.
*
* Pattern constructs that Hyperscan does not support (such as zero-width
* assertions, back-references or conditional references) are replaced
* internally with broader constructs that may match more often.
*
* In addition, in prefiltering mode Hyperscan may simplify a pattern that
* would otherwise return a "Pattern too large" error at compile time, or for
* performance reasons (subject to the matching guarantee above).
*
* The application is generally expected to confirm prefilter matches
* afterwards with another regular expression matcher that can provide exact
* matches for the pattern.
*
* Note: Combining this flag with @ref HS_FLAG_SOM_LEFTMOST is not currently
* supported.
*/
#define HS_FLAG_PREFILTER (1 << 7)
/**
* Compile flag: Enable leftmost start of match reporting.
*
* With this flag Hyperscan reports the leftmost possible start of match
* offset when a match is reported for this expression. (By default, no start
* of match is returned.)
*
* Enabling this behaviour may reduce performance in all three modes; in
* streaming mode in particular, it may also increase stream state
* requirements.
*/
#define HS_FLAG_SOM_LEFTMOST (1 << 8)
/**
* Compile flag: Logical combination.
*
* With this flag Hyperscan parses the expression as logical combination
* syntax.
* A logical combination is made up of operands, operators and parentheses.
* The operands are expression indices, and the operators can be
* '!'(NOT), '&'(AND) or '|'(OR).
* For example:
* (101&102&103)|(104&!105)
* ((301|302)&303)&(304|305)
*/
#define HS_FLAG_COMBINATION (1 << 9)
/**
* Compile flag: Don't do any match reporting.
*
* With this flag Hyperscan ignores match reporting for this expression. The
* flag is designed to be used on the sub-expressions in logical combinations.
*/
#define HS_FLAG_QUIET (1 << 10)
/** @} */
/**
* @defgroup HS_CPU_FEATURES_FLAG CPU feature support flags
*
* Each flag in this group is a distinct single-bit `unsigned long long`
* value.
*
* @{
*/
/**
* CPU features flag - Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
*
* Setting this flag indicates that the target platform supports AVX2
* instructions.
*/
#define HS_CPU_FEATURES_AVX2 (1ULL << 2)
/**
* CPU features flag - Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX512)
*
* Setting this flag indicates that the target platform supports AVX512
* instructions, specifically AVX-512BW. Using AVX512 implies the use of AVX2.
*/
#define HS_CPU_FEATURES_AVX512 (1ULL << 3)
/**
* CPU features flag - Intel(R) Advanced Vector Extensions 512
* Vector Byte Manipulation Instructions (Intel(R) AVX512VBMI)
*
* Setting this flag indicates that the target platform supports AVX512VBMI
* instructions. Using AVX512VBMI implies the use of AVX512.
*/
#define HS_CPU_FEATURES_AVX512VBMI (1ULL << 4)
/** @} */
/**
* @defgroup HS_TUNE_FLAG Tuning flags
*
* The tuning families are consecutive identifiers (0 to 10), not bit masks:
* ORing two of them together yields the value of a different family (for
* example, 1 | 2 == 3), so they must not be combined.
*
* @{
*/
/**
* Tuning Parameter - Generic
*
* This indicates that the compiled database should not be tuned for any
* particular target platform.
*/
#define HS_TUNE_FAMILY_GENERIC 0
/**
* Tuning Parameter - Intel(R) microarchitecture code name Sandy Bridge
*
* This indicates that the compiled database should be tuned for the
* Sandy Bridge microarchitecture.
*/
#define HS_TUNE_FAMILY_SNB 1
/**
* Tuning Parameter - Intel(R) microarchitecture code name Ivy Bridge
*
* This indicates that the compiled database should be tuned for the
* Ivy Bridge microarchitecture.
*/
#define HS_TUNE_FAMILY_IVB 2
/**
* Tuning Parameter - Intel(R) microarchitecture code name Haswell
*
* This indicates that the compiled database should be tuned for the
* Haswell microarchitecture.
*/
#define HS_TUNE_FAMILY_HSW 3
/**
* Tuning Parameter - Intel(R) microarchitecture code name Silvermont
*
* This indicates that the compiled database should be tuned for the
* Silvermont microarchitecture.
*/
#define HS_TUNE_FAMILY_SLM 4
/**
* Tuning Parameter - Intel(R) microarchitecture code name Broadwell
*
* This indicates that the compiled database should be tuned for the
* Broadwell microarchitecture.
*/
#define HS_TUNE_FAMILY_BDW 5
/**
* Tuning Parameter - Intel(R) microarchitecture code name Skylake
*
* This indicates that the compiled database should be tuned for the
* Skylake microarchitecture.
*/
#define HS_TUNE_FAMILY_SKL 6
/**
* Tuning Parameter - Intel(R) microarchitecture code name Skylake Server
*
* This indicates that the compiled database should be tuned for the
* Skylake Server microarchitecture.
*/
#define HS_TUNE_FAMILY_SKX 7
/**
* Tuning Parameter - Intel(R) microarchitecture code name Goldmont
*
* This indicates that the compiled database should be tuned for the
* Goldmont microarchitecture.
*/
#define HS_TUNE_FAMILY_GLM 8
/**
* Tuning Parameter - Intel(R) microarchitecture code name Icelake
*
* This indicates that the compiled database should be tuned for the
* Icelake microarchitecture.
*/
#define HS_TUNE_FAMILY_ICL 9
/**
* Tuning Parameter - Intel(R) microarchitecture code name Icelake Server
*
* This indicates that the compiled database should be tuned for the
* Icelake Server microarchitecture.
*/
#define HS_TUNE_FAMILY_ICX 10
/** @} */
/**
* @defgroup HS_MODE_FLAG Compile mode flags
*
* These flags provide the value of the mode parameter taken by the various
* compile calls (@ref hs_compile(), @ref hs_compile_multi() and @ref
* hs_compile_ext_multi()).
*
* A mode value is assembled by ORing flag values together. The only required
* component is one of @ref HS_MODE_BLOCK, @ref HS_MODE_STREAM or @ref
* HS_MODE_VECTORED; further flags may be added to enable support for
* additional features.
*
* @{
*/
/**
* Compiler mode flag: Block scan (non-streaming) database.
*/
#define HS_MODE_BLOCK (1 << 0)
/**
* Compiler mode flag: Alias for @ref HS_MODE_BLOCK.
*/
#define HS_MODE_NOSTREAM HS_MODE_BLOCK
/**
* Compiler mode flag: Streaming database.
*/
#define HS_MODE_STREAM (1 << 1)
/**
* Compiler mode flag: Vectored scanning database.
*/
#define HS_MODE_VECTORED (1 << 2)
/**
* Compiler mode flag: use full precision to track start of match offsets in
* stream state.
*
* Of the SOM_HORIZON modes this one uses the most stream state per pattern,
* but it always returns an accurate start of match offset, regardless of how
* far back in the past it was found.
*
* The @ref HS_FLAG_SOM_LEFTMOST expression flag can only be used when one of
* the SOM_HORIZON modes has been selected.
*/
#define HS_MODE_SOM_HORIZON_LARGE (1U << 24)
/**
* Compiler mode flag: use medium precision to track start of match offsets in
* stream state.
*
* Compared with @ref HS_MODE_SOM_HORIZON_LARGE this mode uses less stream
* state, and start of match accuracy is limited to offsets within 2^32 bytes
* of the end of match offset reported.
*
* The @ref HS_FLAG_SOM_LEFTMOST expression flag can only be used when one of
* the SOM_HORIZON modes has been selected.
*/
#define HS_MODE_SOM_HORIZON_MEDIUM (1U << 25)
/**
* Compiler mode flag: use limited precision to track start of match offsets in
* stream state.
*
* Compared with @ref HS_MODE_SOM_HORIZON_LARGE this mode uses less stream
* state, and start of match accuracy is limited to offsets within 2^16 bytes
* of the end of match offset reported.
*
* The @ref HS_FLAG_SOM_LEFTMOST expression flag can only be used when one of
* the SOM_HORIZON modes has been selected.
*/
#define HS_MODE_SOM_HORIZON_SMALL (1U << 26)
/** @} */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* HS_COMPILE_H_ */