mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-29 19:24:25 +03:00
Initial commit of Hyperscan
This commit is contained in:
35
doc/dev-reference/CMakeLists.txt
Normal file
35
doc/dev-reference/CMakeLists.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
# Locate Doxygen. It is run over the generated hyperscan.doxyfile to produce
# the XML that the COMMENT below describes as the input for the API reference.
find_program(DOXYGEN doxygen)

# find_program() leaves DOXYGEN set to "DOXYGEN-NOTFOUND" on failure; if(NOT ...)
# treats any *-NOTFOUND (or empty) value as false.
if (NOT DOXYGEN)
    message(STATUS "Doxygen not found, unable to generate API reference")
else()
    # Expand @VAR@ references only, so any ${...} text in the template is
    # passed through to the generated doxyfile unchanged.
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/hyperscan.doxyfile.in"
                   "${CMAKE_CURRENT_BINARY_DIR}/hyperscan.doxyfile" @ONLY)

    # Paths are quoted and VERBATIM is set so that source/build directories
    # containing spaces reach doxygen as single, correctly escaped arguments.
    add_custom_target(dev-reference-doxygen
                      COMMAND "${DOXYGEN}" "${CMAKE_CURRENT_BINARY_DIR}/hyperscan.doxyfile"
                      COMMENT "Building doxygen XML for API reference"
                      VERBATIM)
endif()
|
||||
|
||||
# Locate sphinx-build, which renders the reStructuredText sources in this
# directory into the HTML developer reference.
find_program(SPHINX_BUILD sphinx-build)

# find_program() leaves SPHINX_BUILD set to "SPHINX_BUILD-NOTFOUND" on failure;
# if(NOT ...) treats any *-NOTFOUND (or empty) value as false.
if (NOT SPHINX_BUILD)
    message(STATUS "Sphinx not found, unable to generate developer reference")
else()
    set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
    set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
    set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")

    # conf.py is generated into the build tree; sphinx-build is pointed at it
    # with -c below while the .rst sources are read from the source tree.
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
                   "${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)

    # VERBATIM keeps argument escaping identical across platforms/generators.
    add_custom_target(dev-reference
                      COMMAND "${SPHINX_BUILD}"
                      -b html
                      -c "${CMAKE_CURRENT_BINARY_DIR}"
                      -d "${SPHINX_CACHE_DIR}"
                      "${CMAKE_CURRENT_SOURCE_DIR}"
                      "${SPHINX_HTML_DIR}"
                      COMMENT "Building HTML dev reference with Sphinx"
                      VERBATIM)

    # The doxygen target only exists when Doxygen was found. Order the two
    # targets with add_dependencies() (the documented way to depend on another
    # target) and only when that target is actually defined, so a Sphinx-only
    # machine does not end up with a dependency on a non-existent rule.
    if (TARGET dev-reference-doxygen)
        add_dependencies(dev-reference dev-reference-doxygen)
    else()
        # NOTE(review): without the doxygen XML the API reference pages are
        # expected to be incomplete -- confirm how breathe reports this.
        message(STATUS "Doxygen target unavailable, developer reference will be built without fresh API XML")
    endif()
endif()
|
4
doc/dev-reference/_static/hyperscan.css
Normal file
4
doc/dev-reference/_static/hyperscan.css
Normal file
@@ -0,0 +1,4 @@
|
||||
/* Differentiate the way we display regex fragments. */
|
||||
.regexp {
|
||||
color: darkred !important;
|
||||
}
|
53
doc/dev-reference/api_constants.rst
Normal file
53
doc/dev-reference/api_constants.rst
Normal file
@@ -0,0 +1,53 @@
|
||||
.. _api_constants:
|
||||
|
||||
########################
|
||||
API Reference: Constants
|
||||
########################
|
||||
|
||||
***********
|
||||
Error Codes
|
||||
***********
|
||||
|
||||
.. doxygengroup:: HS_ERROR
|
||||
:content-only:
|
||||
:no-link:
|
||||
|
||||
*****************
|
||||
hs_expr_ext flags
|
||||
*****************
|
||||
|
||||
.. doxygengroup:: HS_EXT_FLAG
|
||||
:content-only:
|
||||
:no-link:
|
||||
|
||||
*************
|
||||
Pattern flags
|
||||
*************
|
||||
|
||||
.. doxygengroup:: HS_PATTERN_FLAG
|
||||
:content-only:
|
||||
:no-link:
|
||||
|
||||
*************************
|
||||
CPU feature support flags
|
||||
*************************
|
||||
|
||||
.. doxygengroup:: HS_CPU_FEATURES_FLAG
|
||||
:content-only:
|
||||
:no-link:
|
||||
|
||||
****************
|
||||
CPU tuning flags
|
||||
****************
|
||||
|
||||
.. doxygengroup:: HS_TUNE_FLAG
|
||||
:content-only:
|
||||
:no-link:
|
||||
|
||||
******************
|
||||
Compile mode flags
|
||||
******************
|
||||
|
||||
.. doxygengroup:: HS_MODE_FLAG
|
||||
:content-only:
|
||||
:no-link:
|
29
doc/dev-reference/api_files.rst
Normal file
29
doc/dev-reference/api_files.rst
Normal file
@@ -0,0 +1,29 @@
|
||||
.. _api_files:
|
||||
|
||||
####################
|
||||
API Reference: Files
|
||||
####################
|
||||
|
||||
**********
|
||||
File: hs.h
|
||||
**********
|
||||
|
||||
.. doxygenfile:: hs.h
|
||||
|
||||
*****************
|
||||
File: hs_common.h
|
||||
*****************
|
||||
|
||||
.. doxygenfile:: hs_common.h
|
||||
|
||||
******************
|
||||
File: hs_compile.h
|
||||
******************
|
||||
|
||||
.. doxygenfile:: hs_compile.h
|
||||
|
||||
******************
|
||||
File: hs_runtime.h
|
||||
******************
|
||||
|
||||
.. doxygenfile:: hs_runtime.h
|
365
doc/dev-reference/compilation.rst
Normal file
365
doc/dev-reference/compilation.rst
Normal file
@@ -0,0 +1,365 @@
|
||||
.. include:: <isonum.txt>
|
||||
.. _compilation:
|
||||
|
||||
##################
|
||||
Compiling Patterns
|
||||
##################
|
||||
|
||||
*******************
|
||||
Building a Database
|
||||
*******************
|
||||
|
||||
The Hyperscan compiler API accepts regular expressions and converts them into a
|
||||
compiled pattern database that can then be used to scan data.
|
||||
|
||||
The API provides three functions that compile regular expressions into
|
||||
databases:
|
||||
|
||||
#. :c:func:`hs_compile`: compiles a single expression into a pattern database.
|
||||
|
||||
#. :c:func:`hs_compile_multi`: compiles an array of expressions into a pattern
|
||||
database. All of the supplied patterns will be scanned for concurrently at
|
||||
scan time, with user-supplied identifiers returned when they match.
|
||||
|
||||
#. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
|
||||
but allows :ref:`extparam` to be specified for each expression.
|
||||
|
||||
Compilation allows the Hyperscan library to analyze the given pattern(s) and
|
||||
pre-determine how to scan for these patterns in an optimized fashion that would
|
||||
be far too expensive to compute at run-time.
|
||||
|
||||
When compiling expressions, a decision needs to be made whether the resulting
|
||||
compiled patterns are to be used in a streaming, block or vectored mode:
|
||||
|
||||
- **Streaming mode**: the target data to be scanned is a continuous stream, not
|
||||
all of which is available at once; blocks of data are scanned in sequence and
|
||||
matches may span multiple blocks in a stream. In streaming mode, each stream
|
||||
requires a block of memory to store its state between scan calls.
|
||||
|
||||
- **Block mode**: the target data is a discrete, contiguous block which can be
|
||||
scanned in one call and does not require state to be retained.
|
||||
|
||||
- **Vectored mode**: the target data consists of a list of non-contiguous
|
||||
blocks that are available all at once. As for block mode, no retention of
|
||||
state is required.
|
||||
|
||||
To compile patterns to be used in streaming mode, the ``mode`` parameter of
|
||||
:c:func:`hs_compile` must be set to :c:member:`HS_MODE_STREAM`; similarly,
|
||||
block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
|
||||
requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
|
||||
for one mode (streaming, block or vectored) can only be used in that mode. The
|
||||
version of Hyperscan used to produce a compiled pattern database must match the
|
||||
version of Hyperscan used to scan with it.
|
||||
|
||||
Hyperscan provides support for targeting a database at a particular CPU
|
||||
platform; see :ref:`instr_specialization` for details.
|
||||
|
||||
***************
|
||||
Pattern Support
|
||||
***************
|
||||
|
||||
Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
|
||||
described at <http://www.pcre.org/>. However, not all constructs available in
|
||||
libpcre are supported. The use of unsupported constructs will result in
|
||||
compilation errors.
|
||||
|
||||
====================
|
||||
Supported Constructs
|
||||
====================
|
||||
|
||||
The following regex constructs are supported by Hyperscan:
|
||||
|
||||
* Literal characters and strings, with all libpcre quoting and character
|
||||
escapes.
|
||||
|
||||
* Character classes such as :regexp:`.` (dot), :regexp:`[abc]`, and
|
||||
:regexp:`[^abc]`, as well as the predefined character classes :regexp:`\\s`,
|
||||
:regexp:`\\d`, :regexp:`\\w`, :regexp:`\\v`, and :regexp:`\\h` and their
|
||||
negated counterparts (:regexp:`\\S`, :regexp:`\\D`, :regexp:`\\W`,
|
||||
:regexp:`\\V`, and :regexp:`\\H`).
|
||||
|
||||
* The POSIX named character classes :regexp:`[[:xxx:]]` and negated named
|
||||
character classes :regexp:`[[:^xxx:]]`.
|
||||
|
||||
* Unicode character properties, such as :regexp:`\\p{L}`, :regexp:`\\P{Sc}`,
|
||||
:regexp:`\\p{Greek}`.
|
||||
|
||||
* Quantifiers:
|
||||
|
||||
* Quantifiers such as :regexp:`?`, :regexp:`*` and :regexp:`+` are supported
|
||||
when applied to arbitrary supported sub-expressions.
|
||||
|
||||
* Bounded repeat qualifiers such as :regexp:`{n}`, :regexp:`{m,n}`,
|
||||
:regexp:`{n,}` are supported with limitations.
|
||||
|
||||
* For arbitrary repeated sub-patterns: *n* and *m* should be either small
|
||||
or infinite, e.g. :regexp:`(a|b){4}`, :regexp:`(ab?c?d){4,10}` or
|
||||
:regexp:`(ab(cd)*){6,}`.
|
||||
|
||||
* For single-character width sub-patterns such as :regexp:`[^\\a]` or
|
||||
:regexp:`.` or :regexp:`x`, nearly all repeat counts are supported, except
|
||||
where repeats are extremely large (maximum bound greater than 32767).
|
||||
Stream states may be very large for large bounded repeats, e.g.
|
||||
:regexp:`a.{2000}b`. Note: such sub-patterns may be considerably
|
||||
cheaper if at the beginning or end of patterns and especially if the
|
||||
:c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
|
||||
|
||||
* Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
|
||||
:regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
|
||||
matches).
|
||||
|
||||
* Parenthesization, including the named and unnamed capturing and
|
||||
non-capturing forms. However, capturing is ignored.
|
||||
|
||||
* Alternation with the :regexp:`|` symbol, as in :regexp:`foo|bar`.
|
||||
|
||||
* The anchors :regexp:`^`, :regexp:`$`, :regexp:`\\A`, :regexp:`\\Z` and
|
||||
:regexp:`\\z`.
|
||||
|
||||
* Option modifiers for:
|
||||
|
||||
* Case-sensitivity: :regexp:`(?i)` and :regexp:`(?-i)`
|
||||
* Multi-line: :regexp:`(?m)` and :regexp:`(?-m)`
|
||||
* Dot-all: :regexp:`(?s)` and :regexp:`(?-s)`
|
||||
* Extended syntax: :regexp:`(?x)` and :regexp:`(?-x)`
|
||||
|
||||
* The :regexp:`\\b` and :regexp:`\\B` zero-width assertions (word boundary and
|
||||
'not word boundary', respectively).
|
||||
|
||||
* Comments in :regexp:`(?# comment)` syntax.
|
||||
|
||||
* The :regexp:`(*UTF8)` and :regexp:`(*UCP)` control verbs at the beginning of a
|
||||
pattern, used to enable UTF-8 and UCP mode.
|
||||
|
||||
.. note:: Bounded-repeat quantifiers with large repeat counts of arbitrary
|
||||
expressions (e.g. :regexp:`([a-z]|bc*d|xy?z){1000,5000}`) will result in a
|
||||
"Pattern too large" error at pattern compile time.
|
||||
|
||||
.. note:: At this time, not all patterns can be successfully compiled with the
|
||||
:c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
|
||||
:ref:`som`. The patterns that support this flag are a subset of patterns that
|
||||
can be successfully compiled with Hyperscan; notably, many bounded repeat
|
||||
forms that can be compiled with Hyperscan without the Start of Match flag
|
||||
enabled cannot be compiled with the flag enabled.
|
||||
|
||||
======================
|
||||
Unsupported Constructs
|
||||
======================
|
||||
|
||||
The following regex constructs are not supported by Hyperscan:
|
||||
|
||||
* Backreferences and capturing sub-expressions.
|
||||
* Arbitrary zero-width assertions.
|
||||
* Subroutine references and recursive patterns.
|
||||
* Conditional patterns.
|
||||
* Backtracking control verbs.
|
||||
* The :regexp:`\\C` "single-byte" directive (which breaks UTF-8 sequences).
|
||||
* The :regexp:`\\R` newline match.
|
||||
* The :regexp:`\\K` start of match reset directive.
|
||||
* Callouts and embedded code.
|
||||
* Atomic grouping and possessive quantifiers.
|
||||
|
||||
*********
|
||||
Semantics
|
||||
*********
|
||||
|
||||
While Hyperscan follows libpcre syntax, it provides different semantics. The
|
||||
major departures from libpcre semantics are motivated by the requirements of
|
||||
streaming and multiple simultaneous pattern matching.
|
||||
|
||||
The major departures from libpcre semantics are:
|
||||
|
||||
#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
|
||||
several patterns simultaneously. This is not equivalent to separating the
|
||||
patterns by :regexp:`|` in libpcre, which evaluates alternations
|
||||
left-to-right.
|
||||
|
||||
#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
|
||||
guaranteed to be ordered, although they will always fall within the bounds of
|
||||
the current scan.
|
||||
|
||||
#. **End offsets only**: Hyperscan's default behaviour is only to report the end
|
||||
offset of a match. Reporting of the start offset can be enabled with
|
||||
per-expression flags at pattern compile time. See :ref:`som` for details.
|
||||
|
||||
#. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
|
||||
``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
|
||||
corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
|
||||
libpcre semantics by default would report only one match at ``fooxyzbarbar``
|
||||
(greedy semantics) or, if non-greedy semantics were switched on, one match at
|
||||
``fooxyzbar``. This means that switching between greedy and non-greedy
|
||||
semantics is a no-op in Hyperscan.
|
||||
|
||||
Supporting libpcre quantifier semantics while accurately reporting streaming
|
||||
matches at the time they occur is impossible. For example, consider the pattern
|
||||
above, :regexp:`/foo.*bar/`, in streaming mode, against the following
|
||||
stream (three blocks scanned in sequence):
|
||||
|
||||
============= ======= ========
|
||||
block 1 block 2 block 3
|
||||
============= ======= ========
|
||||
``fooxyzbar`` ``baz`` ``qbar``
|
||||
============= ======= ========
|
||||
|
||||
Since the :regexp:`.*` repeat in the pattern is a *greedy* repeat in libpcre, it
|
||||
must match as much as possible without causing the rest of the pattern to fail.
|
||||
However, in streaming mode, this would require knowledge of data in the stream
|
||||
beyond the current block being scanned.
|
||||
|
||||
In this example, the match at offset 9 in the first block is only the correct
|
||||
match (under libpcre semantics) if there is no ``bar`` in a subsequent block --
|
||||
as in block 3 -- which would constitute a better match for the pattern.
|
||||
|
||||
.. _som:
|
||||
|
||||
==============
|
||||
Start of Match
|
||||
==============
|
||||
|
||||
In standard operation, Hyperscan will only provide the end offset of a match
|
||||
when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
|
||||
is specified for a particular pattern, then the same set of matches is
|
||||
returned, but each match will also provide the leftmost possible start offset
|
||||
corresponding to its end offset.
|
||||
|
||||
Using the SOM flag entails a number of trade-offs and limitations:
|
||||
|
||||
* Reduced pattern support: For many patterns, tracking SOM is complex and can
|
||||
result in Hyperscan failing to compile a pattern with a "Pattern too
|
||||
large" error, even if the pattern is supported in normal operation.
|
||||
* Increased stream state: At scan time, state space is required to track
|
||||
potential SOM offsets, and this must be stored in persistent stream state in
|
||||
streaming mode. Accordingly, SOM will generally increase the stream state
|
||||
required to match a pattern.
|
||||
* Performance overhead: Similarly, there is generally a performance cost
|
||||
associated with tracking SOM.
|
||||
* Incompatible features: Some other Hyperscan pattern flags (such as
|
||||
:c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
|
||||
used in combination with SOM. Specifying them together with
|
||||
:c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
|
||||
|
||||
In streaming mode, the amount of precision delivered by SOM can be controlled
|
||||
with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
|
||||
information within a certain distance of the end offset, and return a special
|
||||
start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
|
||||
small or medium SOM horizon will usually reduce the stream state required for a
|
||||
given database.
|
||||
|
||||
.. note:: In streaming mode, the start offset returned for a match may refer to
|
||||
a point in the stream *before* the current block being scanned. Hyperscan
|
||||
provides no facility for accessing earlier blocks; if the calling application
|
||||
needs to inspect historical data, then it must store it itself.
|
||||
|
||||
.. _extparam:
|
||||
|
||||
===================
|
||||
Extended Parameters
|
||||
===================
|
||||
|
||||
In some circumstances, more control over the matching behaviour of a pattern is
|
||||
required than can be specified easily using regular expression syntax. For
|
||||
these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
|
||||
that allows a set of "extended parameters" to be set on a per-pattern basis.
|
||||
|
||||
Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
|
||||
which provides the following fields:
|
||||
|
||||
* ``flags``: Flags governing which of the other fields in the structure are
|
||||
used.
|
||||
* ``min_offset``: The minimum end offset in the data stream at which this
|
||||
expression should match successfully.
|
||||
* ``max_offset``: The maximum end offset in the data stream at which this
|
||||
expression should match successfully.
|
||||
* ``min_length``: The minimum match length (from start to end) required to
|
||||
successfully match this expression.
|
||||
|
||||
These parameters allow the set of matches produced by a pattern to be
|
||||
constrained at compile time, rather than relying on the application to process
|
||||
unwanted matches at runtime.
|
||||
|
||||
For example, the pattern :regexp:`/foo.*bar/` when given a ``min_offset`` of 10
|
||||
and a ``max_offset`` of 15 will not produce matches when scanned against
|
||||
``foobar`` or ``foo0123456789bar`` but will produce a match against the data
|
||||
streams ``foo0123bar`` or ``foo0123456bar``.
|
||||
|
||||
=================
|
||||
Prefiltering Mode
|
||||
=================
|
||||
|
||||
Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
|
||||
be used to implement a prefilter for a pattern that Hyperscan would not
|
||||
ordinarily support.
|
||||
|
||||
This flag instructs Hyperscan to compile an "approximate" version of this
|
||||
pattern for use in a prefiltering application, even if Hyperscan does not
|
||||
support the pattern in normal operation.
|
||||
|
||||
The set of matches returned when this flag is used is guaranteed to be a
|
||||
superset of the matches specified by the non-prefiltering expression.
|
||||
|
||||
If the pattern contains pattern constructs not supported by Hyperscan (such as
|
||||
zero-width assertions, back-references or conditional references) these
|
||||
constructs will be replaced internally with broader constructs that may match
|
||||
more often.
|
||||
|
||||
For example, the pattern :regexp:`/(\\w+) again \\1/` contains the
|
||||
back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
|
||||
approximated by having its back-reference replaced with its referent, forming
|
||||
:regexp:`/\\w+ again \\w+/`.
|
||||
|
||||
Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
|
||||
otherwise return a "Pattern too large" error at compile time, or for performance
|
||||
reasons (subject to the matching guarantee above).
|
||||
|
||||
It is generally expected that the application will subsequently confirm
|
||||
prefilter matches with another regular expression matcher that can provide exact
|
||||
matches for the pattern.
|
||||
|
||||
.. note:: The use of this flag in combination with Start of Match mode (using
|
||||
the :c:member:`HS_FLAG_SOM_LEFTMOST` flag) is not currently supported and
|
||||
will result in a pattern compilation error.
|
||||
|
||||
.. _instr_specialization:
|
||||
|
||||
******************************
|
||||
Instruction Set Specialization
|
||||
******************************
|
||||
|
||||
Hyperscan is able to make use of several modern instruction set features found
|
||||
on x86 processors to provide improvements in scanning performance.
|
||||
|
||||
Some of these features are selected when the library is built; for example,
|
||||
Hyperscan will use the native ``POPCNT`` instruction on processors where it is
|
||||
available and the library has been optimized for the host architecture.
|
||||
|
||||
.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
|
||||
compiler flag and (where possible) will make use of all instructions known by
|
||||
the host's C compiler.
|
||||
|
||||
To use some instruction set features, however, Hyperscan must build a
|
||||
specialized database to support them. This means that the target platform must
|
||||
be specified at pattern compile time.
|
||||
|
||||
The Hyperscan compiler API functions all accept an optional
|
||||
:c:type:`hs_platform_info_t` argument, which describes the target platform
|
||||
for the database to be built. If this argument is NULL, the database will be
|
||||
targeted at the current host platform.
|
||||
|
||||
The :c:type:`hs_platform_info_t` structure has two fields:
|
||||
|
||||
#. ``tune``: This allows the application to specify information about the target
|
||||
platform which may be used to guide the optimisation process of the compile.
|
||||
Use of this field does not limit the processors that the resulting database
|
||||
can run on, but may impact the performance of the resulting database.
|
||||
|
||||
#. ``cpu_features``: This allows the application to specify a mask of CPU
|
||||
features that may be used on the target platform. For example,
|
||||
:c:member:`HS_CPU_FEATURES_AVX2` can be specified for Intel\ |reg| Advanced
|
||||
Vector Extensions 2 (Intel\ |reg| AVX2) instruction set support. If a flag
|
||||
for a particular CPU feature is specified, the database will not be usable on
|
||||
a CPU without that feature.
|
||||
|
||||
An :c:type:`hs_platform_info_t` structure targeted at the current host can be
|
||||
built with the :c:func:`hs_populate_platform` function.
|
||||
|
||||
See :ref:`api_constants` for the full list of CPU tuning and feature flags.
|
275
doc/dev-reference/conf.py.in
Normal file
275
doc/dev-reference/conf.py.in
Normal file
@@ -0,0 +1,275 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Hyperscan documentation build configuration file, created by
|
||||
# sphinx-quickstart on Tue Sep 29 15:59:19 2015.
|
||||
#
|
||||
# This file is execfile()d with the current directory set to its
|
||||
# containing dir.
|
||||
#
|
||||
# Note that not all possible configuration values are present in this
|
||||
# autogenerated file.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
# -- General configuration ------------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
#needs_sphinx = '1.0'
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = ['breathe']
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
|
||||
# The suffix of source filenames.
|
||||
source_suffix = '.rst'
|
||||
|
||||
# The encoding of source files.
|
||||
#source_encoding = 'utf-8-sig'
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'Hyperscan'
|
||||
copyright = u'2015, Intel Corporation'
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
# built documents.
|
||||
#
|
||||
# The short X.Y version.
|
||||
version = '@HS_MAJOR_VERSION@.@HS_MINOR_VERSION@'
|
||||
# The full version, including alpha/beta/rc tags.
|
||||
release = '@HS_VERSION@'
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#language = None
|
||||
|
||||
# There are two options for replacing |today|: either, you set today to some
|
||||
# non-false value, then it is used:
|
||||
#today = ''
|
||||
# Else, today_fmt is used as the format for a strftime call.
|
||||
#today_fmt = '%B %d, %Y'
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
exclude_patterns = ['_build']
|
||||
|
||||
# The reST default role (used for this markup: `text`) to use for all
|
||||
# documents.
|
||||
#default_role = None
|
||||
|
||||
# If true, '()' will be appended to :func: etc. cross-reference text.
|
||||
#add_function_parentheses = True
|
||||
|
||||
# If true, the current module name will be prepended to all description
|
||||
# unit titles (such as .. function::).
|
||||
#add_module_names = True
|
||||
|
||||
# If true, sectionauthor and moduleauthor directives will be shown in the
|
||||
# output. They are ignored by default.
|
||||
#show_authors = False
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = 'sphinx'
|
||||
|
||||
# A list of ignored prefixes for module index sorting.
|
||||
#modindex_common_prefix = []
|
||||
|
||||
# If true, keep warnings as "system message" paragraphs in the built documents.
|
||||
#keep_warnings = False
|
||||
|
||||
|
||||
# -- Options for HTML output ----------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
html_theme = 'alabaster'
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
html_theme_options = {
|
||||
# Change some style colors; these are used for admonitions
|
||||
'pink_1' : '#e0f8ff',
|
||||
'pink_2' : '#e0f8ff'
|
||||
}
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
#html_theme_path = []
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
#html_title = None
|
||||
|
||||
# A shorter title for the navigation bar. Default is the same as html_title.
|
||||
#html_short_title = None
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top
|
||||
# of the sidebar.
|
||||
#html_logo = None
|
||||
|
||||
# The name of an image file (within the static path) to use as favicon of the
|
||||
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||
# pixels large.
|
||||
#html_favicon = None
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['@CMAKE_CURRENT_SOURCE_DIR@/_static']
|
||||
|
||||
# Add any extra paths that contain custom files (such as robots.txt or
|
||||
# .htaccess) here, relative to this directory. These files are copied
|
||||
# directly to the root of the documentation.
|
||||
#html_extra_path = []
|
||||
|
||||
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
||||
# using the given strftime format.
|
||||
#html_last_updated_fmt = '%b %d, %Y'
|
||||
|
||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||
# typographically correct entities.
|
||||
#html_use_smartypants = True
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
html_sidebars = {
|
||||
'**': ['globaltoc.html', 'searchbox.html']
|
||||
}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names to
|
||||
# template names.
|
||||
#html_additional_pages = {}
|
||||
|
||||
# If false, no module index is generated.
|
||||
#html_domain_indices = True
|
||||
|
||||
# If false, no index is generated.
|
||||
#html_use_index = True
|
||||
|
||||
# If true, the index is split into individual pages for each letter.
|
||||
#html_split_index = False
|
||||
|
||||
# If true, links to the reST sources are added to the pages.
|
||||
html_show_sourcelink = False
|
||||
|
||||
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
||||
#html_show_sphinx = True
|
||||
|
||||
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
||||
#html_show_copyright = True
|
||||
|
||||
# If true, an OpenSearch description file will be output, and all pages will
|
||||
# contain a <link> tag referring to it. The value of this option must be the
|
||||
# base URL from which the finished HTML is served.
|
||||
#html_use_opensearch = ''
|
||||
|
||||
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
||||
#html_file_suffix = None
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'Hyperscandoc'
|
||||
|
||||
|
||||
# -- Options for LaTeX output ---------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
# The paper size ('letterpaper' or 'a4paper').
|
||||
#'papersize': 'letterpaper',
|
||||
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#'pointsize': '10pt',
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#'preamble': '',
|
||||
}
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
('index', 'Hyperscan.tex', u'Hyperscan Documentation',
|
||||
u'Intel Corporation', 'manual'),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top of
|
||||
# the title page.
|
||||
#latex_logo = None
|
||||
|
||||
# For "manual" documents, if this is true, then toplevel headings are parts,
|
||||
# not chapters.
|
||||
#latex_use_parts = False
|
||||
|
||||
# If true, show page references after internal links.
|
||||
#latex_show_pagerefs = False
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
#latex_show_urls = False
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#latex_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#latex_domain_indices = True
|
||||
|
||||
|
||||
# -- Options for manual page output ---------------------------------------
|
||||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
('index', 'hyperscan', u'Hyperscan Documentation',
|
||||
[u'Intel Corporation'], 1)
|
||||
]
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
#man_show_urls = False
|
||||
|
||||
|
||||
# -- Options for Texinfo output -------------------------------------------
|
||||
|
||||
# Grouping the document tree into Texinfo files. List of tuples
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
('index', 'Hyperscan', u'Hyperscan Documentation',
|
||||
u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
|
||||
'Miscellaneous'),
|
||||
]
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#texinfo_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#texinfo_domain_indices = True
|
||||
|
||||
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
||||
#texinfo_show_urls = 'footnote'
|
||||
|
||||
# If true, do not generate a @detailmenu in the "Top" node's menu.
|
||||
#texinfo_no_detailmenu = False
|
||||
|
||||
# -- Options for Breathe doxygen import -----------------------------------
|
||||
|
||||
breathe_projects = { "hyperscan": "doxygen_xml" }
|
||||
breathe_default_project = "hyperscan"
|
||||
breathe_domain_by_extension = {"h" : "c"}
|
||||
|
||||
# -- Add some customisation -----------------------------------------------
|
||||
|
||||
def setup(app):
|
||||
app.add_stylesheet("hyperscan.css") # Custom stylesheet for e.g. :regex:
|
33
doc/dev-reference/copyright.rst
Normal file
33
doc/dev-reference/copyright.rst
Normal file
@@ -0,0 +1,33 @@
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
#########
|
||||
Copyright
|
||||
#########
|
||||
|
||||
No license (express or implied, by estoppel or otherwise) to any intellectual
|
||||
property rights is granted by this document.
|
||||
|
||||
Intel disclaims all express and implied warranties, including without
|
||||
limitation, the implied warranties of merchantability, fitness for a particular
|
||||
purpose, and non-infringement, as well as any warranty arising from course of
|
||||
performance, course of dealing, or usage in trade.
|
||||
|
||||
This document contains information on products, services and/or processes in
|
||||
development. All information provided here is subject to change without
|
||||
notice. Contact your Intel representative to obtain the latest forecast,
|
||||
schedule, specifications and roadmaps.
|
||||
|
||||
The products and services described may contain defects or errors known as
|
||||
errata which may cause deviations from published specifications. Current
|
||||
characterized errata are available on request.
|
||||
|
||||
Copies of documents which have an order number and are referenced in this
|
||||
document, or other Intel literature, may be obtained by calling 1-800-548-4725,
|
||||
or go to: <http://www.intel.com/design/literature.htm>.
|
||||
|
||||
Intel, and the Intel logo, are trademarks of Intel Corporation in the U.S.
|
||||
and/or other countries.
|
||||
|
||||
\*Other names and brands may be claimed as the property of others.
|
||||
|
||||
Copyright |copy| 2015, Intel Corporation. All rights reserved.
|
211
doc/dev-reference/getting_started.rst
Normal file
211
doc/dev-reference/getting_started.rst
Normal file
@@ -0,0 +1,211 @@
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
###############
|
||||
Getting Started
|
||||
###############
|
||||
|
||||
Very Quick Start
|
||||
****************
|
||||
|
||||
#. Clone Hyperscan ::
|
||||
|
||||
cd <where-you-want-hyperscan-source>
|
||||
git clone git://github.com/01org/hyperscan
|
||||
|
||||
#. Configure Hyperscan
|
||||
|
||||
Ensure that you have the correct :ref:`dependencies <software>` present,
|
||||
and then:
|
||||
|
||||
::
|
||||
|
||||
cd <where-you-want-to-build-hyperscan>
|
||||
mkdir <build-dir>
|
||||
cd <build-dir>
|
||||
cmake [-G <generator>] [options] <hyperscan-source-path>
|
||||
|
||||
Known working generators:
|
||||
* ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
|
||||
* ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
|
||||
|
||||
Generators that might work include:
|
||||
* ``Xcode`` --- OS X Xcode projects.
|
||||
* ``Visual Studio`` --- Visual Studio projects - very experimental
|
||||
|
||||
#. Build Hyperscan
|
||||
|
||||
Depending on the generator used:
|
||||
* ``cmake --build .`` --- will build everything
|
||||
* ``make -j<jobs>`` --- use makefiles in parallel
|
||||
* ``ninja`` --- use Ninja build
|
||||
* etc.
|
||||
|
||||
#. Check Hyperscan
|
||||
|
||||
Run the Hyperscan unit tests: ::
|
||||
|
||||
bin/unit-hyperscan
|
||||
|
||||
Requirements
|
||||
************
|
||||
|
||||
Hardware
|
||||
========
|
||||
|
||||
Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
|
||||
32-bit (IA-32 Architecture) modes.
|
||||
|
||||
Hyperscan is a high performance software library that takes advantage of recent
|
||||
Intel architecture advances. At a minimum, support for Supplemental Streaming
|
||||
SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
|
||||
x86 processor.
|
||||
|
||||
Additionally, Hyperscan can make use of:
|
||||
|
||||
* Intel Streaming SIMD Extensions 4.2 (SSE4.2)
|
||||
* the POPCNT instruction
|
||||
* Bit Manipulation Instructions (BMI, BMI2)
|
||||
* Intel Advanced Vector Extensions 2 (Intel AVX2)
|
||||
|
||||
if present.
|
||||
|
||||
These can be determined at library compile time, see :ref:`target_arch`.
|
||||
|
||||
.. _software:
|
||||
|
||||
Software
|
||||
========
|
||||
|
||||
As a software library, Hyperscan doesn't impose any particular runtime
|
||||
software requirements, however to build the Hyperscan library we require a
|
||||
modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
|
||||
compiler support. The supported compilers are:
|
||||
|
||||
* GCC, v4.8.1 or higher
|
||||
* Clang, v3.4 or higher (with libstdc++ or libc++)
|
||||
* Intel C++ Compiler v15 or higher
|
||||
|
||||
Examples of operating systems that Hyperscan is known to work on include:
|
||||
|
||||
Linux:
|
||||
|
||||
* Ubuntu 14.04 LTS or newer
|
||||
* RedHat/CentOS 7 or newer
|
||||
|
||||
FreeBSD:
|
||||
|
||||
* 10.0 or newer
|
||||
|
||||
Mac OS X:
|
||||
|
||||
* 10.8 or newer, using Xcode/Clang
|
||||
|
||||
Hyperscan *may* compile and run on other platforms, but there is no guarantee.
|
||||
We currently have experimental support for Windows using Intel C++ Compiler
|
||||
or Visual Studio 2015.
|
||||
|
||||
In addition, the following software is required for compiling the Hyperscan library:
|
||||
|
||||
======================================================= =========== ======================================
|
||||
Dependency Version Notes
|
||||
======================================================= =========== ======================================
|
||||
`CMake <http://www.cmake.org/>`_ >=2.8.11
|
||||
`Ragel <http://www.colm.net/open-source/ragel/>`_ 6.9
|
||||
`Python <http://www.python.org/>`_ 2.7
|
||||
`Boost <http://boost.org/>`_ >=1.57 Boost headers required
|
||||
`Pcap <http://tcpdump.org>`_ >=0.8 Optional: needed for example code only
|
||||
======================================================= =========== ======================================
|
||||
|
||||
Most of these dependencies can be provided by the package manager on the build
|
||||
system (e.g. Debian/Ubuntu/RedHat packages, FreeBSD ports, etc). However,
|
||||
ensure that the correct version is present.
|
||||
|
||||
Boost Headers
|
||||
-------------
|
||||
|
||||
Compiling Hyperscan depends on a recent version of the Boost C++ header
|
||||
library. If the Boost libraries are installed on the build machine in the
|
||||
usual paths, CMake will find them. An alternative is to put a copy of (or a
|
||||
symlink to) the boost subdirectory in ``<hyperscan-source-path>/include/boost``.
|
||||
|
||||
For example: for the Boost-1.59.0 release: ::
|
||||
|
||||
ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
|
||||
|
||||
As Hyperscan uses the header-only parts of Boost, it is not necessary to
|
||||
compile the Boost libraries.
|
||||
|
||||
CMake Configuration
|
||||
===================
|
||||
|
||||
When CMake is invoked, it generates build files using the given options.
|
||||
Options are passed to CMake in the form ``-D<variable name>=<value>``.
|
||||
Common options for CMake include:
|
||||
|
||||
+------------------------+----------------------------------------------------+
|
||||
| Variable | Description |
|
||||
+========================+====================================================+
|
||||
| CMAKE_C_COMPILER | C compiler to use. Default is /usr/bin/cc. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| CMAKE_CXX_COMPILER | C++ compiler to use. Default is /usr/bin/c++. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| CMAKE_INSTALL_PREFIX | Install directory for ``install`` target |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| CMAKE_BUILD_TYPE | Define which kind of build to generate. |
|
||||
| | Valid options are Debug, Release, RelWithDebInfo, |
|
||||
| | and MinSizeRel. Default is RelWithDebInfo. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_SHARED_LIBS | Build Hyperscan as a shared library instead of |
|
||||
| | the default static library. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs. |
|
||||
| | Default off. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| DEBUG_OUTPUT | Enable very verbose debug output. Default off. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
|
||||
For example, to generate a ``Debug`` build: ::
|
||||
|
||||
cd <build-dir>
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
|
||||
|
||||
|
||||
|
||||
Build Type
|
||||
----------
|
||||
|
||||
CMake determines a number of features for a build based on the Build Type.
|
||||
Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
|
||||
information". This is a performance optimized build without runtime assertions
|
||||
but with debug symbols enabled.
|
||||
|
||||
The other types of builds are:
|
||||
|
||||
* ``Release``: as above, but without debug symbols
|
||||
* ``MinSizeRel``: a stripped release build
|
||||
* ``Debug``: used when developing Hyperscan. Includes runtime assertions
|
||||
(which have a large impact on runtime performance), and will also enable
|
||||
some other build features like building internal unit
|
||||
tests.
|
||||
|
||||
.. _target_arch:
|
||||
|
||||
Target Architecture
|
||||
-------------------
|
||||
|
||||
By default, Hyperscan will be compiled to target the instruction set of the
|
||||
processor of the machine that is being used for compilation. This is done via
|
||||
the use of ``-march=native``. As a result, a library built on
|
||||
one machine may not work on a different machine if they differ in supported
|
||||
instruction subsets.
|
||||
|
||||
To override the use of ``-march=native``, set appropriate flags for the
|
||||
compiler in ``CFLAGS`` and ``CXXFLAGS`` environment variables before invoking
|
||||
CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. For
|
||||
example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
|
||||
|
||||
cmake -DCMAKE_C_FLAGS="-march=corei7" \
|
||||
-DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
|
||||
|
||||
For more information, refer to :ref:`instr_specialization`.
|
||||
|
2383
doc/dev-reference/hyperscan.doxyfile.in
Normal file
2383
doc/dev-reference/hyperscan.doxyfile.in
Normal file
File diff suppressed because it is too large
Load Diff
20
doc/dev-reference/index.rst
Normal file
20
doc/dev-reference/index.rst
Normal file
@@ -0,0 +1,20 @@
|
||||
###############################################
|
||||
Hyperscan |version| Developer's Reference Guide
|
||||
###############################################
|
||||
|
||||
-------
|
||||
|today|
|
||||
-------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
copyright
|
||||
preface
|
||||
intro
|
||||
getting_started
|
||||
compilation
|
||||
runtime
|
||||
performance
|
||||
api_constants
|
||||
api_files
|
78
doc/dev-reference/intro.rst
Normal file
78
doc/dev-reference/intro.rst
Normal file
@@ -0,0 +1,78 @@
|
||||
.. include:: <isonum.txt>
|
||||
.. _intro:
|
||||
|
||||
############
|
||||
Introduction
|
||||
############
|
||||
|
||||
Hyperscan is a software regular expression matching engine designed with
|
||||
high performance and flexibility in mind. It is implemented as a library that
|
||||
exposes a straightforward C API.
|
||||
|
||||
The Hyperscan API itself is composed of two major components:
|
||||
|
||||
***********
|
||||
Compilation
|
||||
***********
|
||||
|
||||
These functions take a group of regular expressions, along with identifiers and
|
||||
option flags, and compile them into an immutable database that can be used by
|
||||
the Hyperscan scanning API. This compilation process performs considerable
|
||||
analysis and optimization work in order to build a database that will match the
|
||||
given expressions efficiently.
|
||||
|
||||
If a pattern cannot be built into a database for any reason (such as the use of
|
||||
an unsupported expression construct, or the overflowing of a resource limit),
|
||||
an error will be returned by the pattern compiler.
|
||||
|
||||
Compiled databases can be serialized and relocated, so that they can be stored
|
||||
to disk or moved between hosts. They can also be targeted to particular
|
||||
platform features (for example, the use of Intel\ |reg| Advanced Vector Extensions
|
||||
2 (Intel\ |reg| AVX2) instructions).
|
||||
|
||||
See :ref:`compilation` for more detail.
|
||||
|
||||
********
|
||||
Scanning
|
||||
********
|
||||
|
||||
Once a Hyperscan database has been created, it can be used to scan data in
|
||||
memory. Hyperscan provides several scanning modes, depending on whether the
|
||||
data to be scanned is available as a single contiguous block, whether it is
|
||||
distributed amongst several blocks in memory at the same time, or whether it is
|
||||
to be scanned as a sequence of blocks in a stream.
|
||||
|
||||
Matches are delivered to the application via a user-supplied callback function
|
||||
that is called synchronously for each match.
|
||||
|
||||
For a given database, Hyperscan provides several guarantees:
|
||||
|
||||
* No memory allocations occur at runtime with the exception of two
|
||||
fixed-size allocations, both of which should be done ahead of time for
|
||||
performance-critical applications:
|
||||
|
||||
- **Scratch space**: temporary memory used for internal data at scan time.
|
||||
Structures in scratch space do not persist beyond the end of a single scan
|
||||
call.
|
||||
- **Stream state**: in streaming mode only, some state space is required to
|
||||
store data that persists between scan calls for each stream. This allows
|
||||
Hyperscan to track matches that span multiple blocks of data.
|
||||
|
||||
* The sizes of the scratch space and stream state (in streaming mode) required
|
||||
for a given database are fixed and determined at database compile time. This
|
||||
means that the memory requirements of the application are known ahead of
|
||||
time, and these structures can be pre-allocated if required for performance
|
||||
reasons.
|
||||
|
||||
* Any pattern that has successfully been compiled by the Hyperscan compiler can
|
||||
be scanned against any input. There are no internal resource limits or other
|
||||
limitations at runtime that could cause a scan call to return an error.
|
||||
|
||||
See :ref:`runtime` for more detail.
|
||||
|
||||
************
|
||||
Example Code
|
||||
************
|
||||
|
||||
Some simple example code demonstrating the use of the Hyperscan API is
|
||||
available in the ``examples/`` subdirectory of the Hyperscan distribution.
|
335
doc/dev-reference/performance.rst
Normal file
335
doc/dev-reference/performance.rst
Normal file
@@ -0,0 +1,335 @@
|
||||
.. _perf:
|
||||
|
||||
##########################
|
||||
Performance Considerations
|
||||
##########################
|
||||
|
||||
Hyperscan supports a wide range of patterns in all three scanning modes. It is
|
||||
capable of extremely high levels of performance, but certain patterns can
|
||||
reduce performance markedly.
|
||||
|
||||
The following guidelines will help construct patterns and pattern sets that
|
||||
will perform better:
|
||||
|
||||
*****************************
|
||||
Regular expression constructs
|
||||
*****************************
|
||||
|
||||
.. tip:: Do not hand-optimize regular expression constructs.
|
||||
|
||||
Quite a large number of regular expressions can be written in multiple ways.
|
||||
For example, caseless matching of :regexp:`/abc/` can be written as:
|
||||
|
||||
* :regexp:`/[Aa][Bb][Cc]/`
|
||||
* :regexp:`/(A|a)(B|b)(C|c)/`
|
||||
* :regexp:`/(?i)abc(?-i)/`
|
||||
* :regexp:`/abc/i`
|
||||
|
||||
Hyperscan is capable of handling all these constructs. Unless there is a
|
||||
specific reason otherwise, do not rewrite patterns from one form to another.
|
||||
|
||||
As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
|
||||
equivalently written as:
|
||||
|
||||
* :regexp:`/foobarfrotz|foobazfrotz|foobar|foobaz/`
|
||||
|
||||
This change will not improve performance or reduce overheads.
|
||||
|
||||
*************
|
||||
Library usage
|
||||
*************
|
||||
|
||||
.. tip:: Do not hand-optimize library usage.
|
||||
|
||||
The Hyperscan library is capable of dealing with small writes, unusually large
|
||||
and small pattern sets, etc. Unless there is a specific performance problem
|
||||
with some usage of the library, it is best to use Hyperscan in a simple and
|
||||
direct fashion. For example, it is unlikely for there to be much benefit in
|
||||
buffering input to the library into larger blocks unless streaming writes are
|
||||
tiny (say, 1-2 bytes at a time).
|
||||
|
||||
Unlike many other pattern matching products, Hyperscan will run faster with
|
||||
small numbers of patterns and slower with large numbers of patterns in a smooth
|
||||
fashion (as opposed to, typically, running at a moderate speed up to some fixed
|
||||
limit then either breaking or running half as fast).
|
||||
|
||||
Hyperscan also provides high-throughput matching with a single thread of
|
||||
control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
|
||||
3000-bit block of data will be scanned in 1 microsecond in a single thread of
|
||||
control, not that it is required to scan 22 3000-bit blocks of data in 22
|
||||
microseconds. Thus, it is not usually necessary to buffer data to supply
|
||||
Hyperscan with available parallelism.
|
||||
|
||||
********************
|
||||
Block-based matching
|
||||
********************
|
||||
|
||||
.. tip:: Prefer block-based matching to streaming matching where possible.
|
||||
|
||||
Whenever input data appears in discrete records, or already requires some sort
|
||||
of transformation (e.g. URI normalization) that requires all the data to be
|
||||
accumulated before processing, it should be scanned in block rather than in
|
||||
streaming mode.
|
||||
|
||||
Unnecessary use of streaming mode reduces the number of optimizations that can
|
||||
be applied in Hyperscan and may make some patterns run slower.
|
||||
|
||||
If there is a mixture of 'block' and 'streaming' mode patterns, these should be
|
||||
scanned in separate databases except in the case that the streaming patterns
|
||||
vastly outnumber the block mode patterns.
|
||||
|
||||
*********************
|
||||
Unnecessary databases
|
||||
*********************
|
||||
|
||||
.. tip:: Avoid unnecessary 'union' databases.
|
||||
|
||||
If there are 5 different types of network traffic T1 through T5 that must
|
||||
be scanned against 5 different signature sets, it will be far more efficient to
|
||||
construct 5 separate databases and scan traffic against the appropriate one
|
||||
than it will be to merge all 5 signature sets and remove inappropriate matches
|
||||
after the fact.
|
||||
|
||||
This will be true even in the case where there is substantial overlap among the
|
||||
signatures. Only if the common subset of the signatures is overwhelmingly large
|
||||
(say, 90% of the signatures appear in all 5 traffic types) should a database
|
||||
that merges all 5 signature sets be considered, and only then if there are no
|
||||
performance issues with specific patterns that appear outside the common
|
||||
subset.
|
||||
|
||||
******************************
|
||||
Allocate scratch ahead of time
|
||||
******************************
|
||||
|
||||
.. tip:: Do not allocate scratch space for your pattern database just before
|
||||
calling a scan function. Instead, do it just after the pattern database is
|
||||
compiled or deserialized.
|
||||
|
||||
Scratch allocation is not necessarily a cheap operation. Since it is the first
|
||||
time (after compilation or deserialization) that a pattern database is used,
|
||||
Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
|
||||
must also allocate memory.
|
||||
|
||||
Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
|
||||
called in the application's scanning path just before :c:func:`hs_scan` (for
|
||||
example).
|
||||
|
||||
Instead, scratch should be allocated immediately after a pattern database is
|
||||
compiled or deserialized, then retained for later scanning operations.
|
||||
|
||||
***********************************************
|
||||
Allocate one scratch space per scanning context
|
||||
***********************************************
|
||||
|
||||
.. tip:: A scratch space can be allocated so that it can be used with any one of
|
||||
a number of databases. Each concurrent scan operation (such as a thread)
|
||||
needs its own scratch space.
|
||||
|
||||
The :c:func:`hs_alloc_scratch` function can accept an existing scratch space and
|
||||
"grow" it to support scanning with another pattern database. This means that
|
||||
instead of allocating one scratch space for every database used by an
|
||||
application, one can call :c:func:`hs_alloc_scratch` with a pointer to the same
|
||||
:c:type:`hs_scratch_t` and it will be sized appropriately for use with any of
|
||||
the given databases. For example:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
hs_database_t *db1 = buildDatabaseOne();
|
||||
hs_database_t *db2 = buildDatabaseTwo();
|
||||
hs_database_t *db3 = buildDatabaseThree();
|
||||
|
||||
hs_error_t err;
|
||||
hs_scratch_t *scratch = NULL;
|
||||
err = hs_alloc_scratch(db1, &scratch);
|
||||
if (err != HS_SUCCESS) {
|
||||
printf("hs_alloc_scratch failed!");
|
||||
exit(1);
|
||||
}
|
||||
err = hs_alloc_scratch(db2, &scratch);
|
||||
if (err != HS_SUCCESS) {
|
||||
printf("hs_alloc_scratch failed!");
|
||||
exit(1);
|
||||
}
|
||||
err = hs_alloc_scratch(db3, &scratch);
|
||||
if (err != HS_SUCCESS) {
|
||||
printf("hs_alloc_scratch failed!");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* scratch may now be used to scan against any of
|
||||
the databases db1, db2, db3. */
|
||||
|
||||
*****************
|
||||
Anchored patterns
|
||||
*****************
|
||||
|
||||
.. tip:: If a pattern is meant to appear at the start of data, be sure to
|
||||
anchor it.
|
||||
|
||||
Anchored patterns (:regexp:`/^.../`) are far simpler to match than other
|
||||
patterns, especially patterns anchored to the start of the buffer (or stream, in
|
||||
streaming mode). Anchoring patterns to the end of the buffer results in less of
|
||||
a performance gain, especially in streaming mode.
|
||||
|
||||
There are a variety of ways to anchor a pattern to a particular offset:
|
||||
|
||||
- The :regexp:`^` and :regexp:`\\A` constructs anchor the pattern to the start
|
||||
of the buffer. For example, :regexp:`/^foo/` can *only* match at offset 3.
|
||||
|
||||
- The :regexp:`$`, :regexp:`\\z` and :regexp:`\\Z` constructs anchor the pattern
|
||||
to the end of the buffer. For example, :regexp:`/foo\\z/` can only match when
|
||||
the data buffer being scanned ends in ``foo``. (It should be noted that
|
||||
:regexp:`$` and :regexp:`\\Z` will also match before a newline at the end of
|
||||
the buffer, so :regexp:`/foo\\Z/` would match against either ``abc foo`` or
|
||||
``abc foo\n``.)
|
||||
|
||||
- The ``min_offset`` and ``max_offset`` extended parameters may also be used to
|
||||
constrain where a pattern could match. For example, the pattern
|
||||
:regexp:`/foo/` with a ``max_offset`` of 10 will only match at offsets less
|
||||
than or equal to 10 in the buffer. (This pattern could also be written as
|
||||
:regexp:`/^.{0,7}foo/`, compiled with the :c:member:`HS_FLAG_DOTALL` flag).
|
||||
|
||||
|
||||
*******************
|
||||
Matching everywhere
|
||||
*******************
|
||||
|
||||
.. tip:: Avoid patterns that match everywhere, and remember that our semantics
|
||||
are 'match everywhere, end of match only'.
|
||||
|
||||
Patterns that match everywhere will run slowly due to the sheer number of
|
||||
matches that they return.
|
||||
|
||||
Patterns like :regexp:`/.*/` in an automata-based matcher will match before and
|
||||
after every single character position, so a buffer with 100 characters will
|
||||
return 101 matches. Greedy pattern matchers such as libpcre will return a
|
||||
single match in this case, but our semantics is to return all matches. This is
|
||||
likely to be very expensive for our code and for the client code of the
|
||||
library.
|
||||
|
||||
Another result of our semantics ("match everywhere") is that patterns that have
|
||||
optional start or ending sections -- for example :regexp:`/x?abcd*/` -- may not
|
||||
perform as expected.
|
||||
|
||||
Firstly, the :regexp:`x?` portion of the pattern is unnecessary, as it will not
|
||||
affect the match results.
|
||||
|
||||
Secondly, the above pattern will match 'more' than :regexp:`/abc/` but
|
||||
:regexp:`/abc/` will always detect any input data that will be matched by
|
||||
:regexp:`/x?abcd*/` -- it will just produce fewer matches.
|
||||
|
||||
For example, input data ``0123abcdddd`` will match :regexp:`/abc/` once but
|
||||
:regexp:`/abcd*/` five times (at ``abc``, ``abcd``, ``abcdd``, ``abcddd``, and
|
||||
``abcdddd``).
|
||||
|
||||
*********************************
|
||||
Bounded repeats in streaming mode
|
||||
*********************************
|
||||
|
||||
.. tip:: Bounded repeats are expensive in streaming mode.
|
||||
|
||||
A bounded repeat construction such as :regexp:`/X.{1000,1001}abcd/` is extremely
|
||||
expensive in streaming mode, of necessity. It requires us to take action on
|
||||
each ``X`` character (itself expensive, relative to searching for longer strings)
|
||||
and potentially record a history of hundreds of offsets where ``X`` occurred in
|
||||
case the ``X`` and ``abcd`` characters are separated by a stream boundary.
|
||||
|
||||
Heavy and unnecessary use of bounded repeats should be avoided, especially
|
||||
where other parts of a signature are quite specific. For example, a virus
|
||||
signature that matches a virus payload may be sufficient without including a
|
||||
prefix that includes, for example, a 2-character Windows executable prefix and
|
||||
a bounded repeat beforehand.
|
||||
|
||||
***************
|
||||
Prefer literals
|
||||
***************
|
||||
|
||||
.. tip:: Where possible, prefer patterns which 'require' literals, especially
|
||||
longer literals, and in streaming mode, prefer signatures that 'require'
|
||||
literals earlier in the pattern.
|
||||
|
||||
Patterns which must match on a literal will run faster than patterns that do
|
||||
not. For example:
|
||||
|
||||
- :regexp:`/\\wab\\d*\\w\\w\\w/` will run faster than
|
||||
- :regexp:`/\\w\\w\\d*\\w\\w/`, or, for that matter
|
||||
- :regexp:`/\\w(abc)?\\d*\\w\\w\\w/` (this contains a literal but it need
|
||||
not appear in the input).
|
||||
|
||||
Even implicit literals are better than none: :regexp:`/[0-2][3-5].*\\w\\w/`
|
||||
still effectively contains 9 2-character literals. No hand-optimization of this
|
||||
case is required; this pattern will not run faster if rewritten as:
|
||||
:regexp:`/(03|04|05|13|14|15|23|24|25).*\\w\\w/`.
|
||||
|
||||
Under all circumstances it is better to use longer literals than shorter ones.
|
||||
A database consisting of 100 14-character literals will scan considerably
|
||||
faster than one consisting of 100 4-character literals and return fewer
|
||||
positives.
|
||||
|
||||
Additionally, in streaming mode, a signature that contains a longer literal
|
||||
early in the pattern is preferred to one that does not.
|
||||
|
||||
For example: :regexp:`/b\\w*foobar/` is not as good a pattern as
|
||||
:regexp:`/blah\\w*foobar/`.
|
||||
|
||||
The disparity between these patterns is much smaller in block mode.
|
||||
|
||||
Longer literals anywhere in the pattern are still preferred in streaming mode.
|
||||
For example, both of the above patterns are stronger and will scan faster than
|
||||
:regexp:`/b\\w*fo/` even in streaming mode.
|
||||
|
||||
**************
|
||||
"Dot all" mode
|
||||
**************
|
||||
|
||||
.. tip:: Use "dot all" mode where possible.
|
||||
|
||||
Not using the :c:member:`HS_FLAG_DOTALL` pattern flag can be expensive, as
|
||||
implicitly, it means that patterns of the form :regexp:`/A.*B/` become
|
||||
:regexp:`/A[^\\n]*B/`.
|
||||
|
||||
It is likely that scanning tasks without the DOTALL flag are better done 'line
|
||||
at a time', with the newline sequences marking the beginning and end of each
|
||||
block.
|
||||
|
||||
This will be true in most use-cases (an exception being where the DOTALL flag
|
||||
is off but the pattern contains either explicit newlines or constructs such as
|
||||
:regexp:`\\s` that implicitly match a newline character).
|
||||
|
||||
*****************
|
||||
Single-match flag
|
||||
*****************
|
||||
|
||||
.. tip:: Consider using the single-match flag to limit matches to one match per
|
||||
pattern only if possible.
|
||||
|
||||
If only one match per pattern is required, use the flag provided to indicate
|
||||
this (:c:member:`HS_FLAG_SINGLEMATCH`). This flag can allow a number of
|
||||
optimizations to be applied, allowing both performance improvements and state
|
||||
space reductions when streaming.
|
||||
|
||||
However, there is some overhead associated with tracking whether each pattern in
|
||||
the pattern set has matched, and some applications with infrequent matches may
|
||||
see reduced performance when the single-match flag is used.
|
||||
|
||||
********************
|
||||
Start of Match flag
|
||||
********************
|
||||
|
||||
.. tip:: Do not request Start of Match information if it is not needed.
|
||||
|
||||
Start of Match (SOM) information can be expensive to gather and can require
|
||||
large amounts of stream state to store in streaming mode. As such, SOM
|
||||
information should only be requested with the :c:member:`HS_FLAG_SOM_LEFTMOST`
|
||||
flag for patterns that require it.
|
||||
|
||||
SOM information is not generally expected to be cheaper (in either performance
|
||||
terms or in stream state overhead) than the use of bounded repeats.
|
||||
Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
|
||||
the callback is considerably more expensive and general than
|
||||
:regexp:`/foo.{300}bar/`.
|
||||
|
||||
Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
|
||||
used to specify a lower bound on the length of the matches for a pattern. Using
|
||||
this facility may be more lightweight in some circumstances than using the SOM
|
||||
flag and post-confirming match length in the calling application.
|
47
doc/dev-reference/preface.rst
Normal file
47
doc/dev-reference/preface.rst
Normal file
@@ -0,0 +1,47 @@
|
||||
#######
|
||||
Preface
|
||||
#######
|
||||
|
||||
********
|
||||
Overview
|
||||
********
|
||||
|
||||
Hyperscan is a regular expression engine designed to offer high performance, the
|
||||
ability to match multiple expressions simultaneously and flexibility in
|
||||
scanning operation.
|
||||
|
||||
Patterns are provided to a compilation interface which generates an immutable
|
||||
pattern database. The scan interface then can be used to scan a target data
|
||||
buffer for the given patterns, returning any matching results from that data
|
||||
buffer. Hyperscan also provides a streaming mode, in which matches that span
|
||||
several blocks in a stream are detected.
|
||||
|
||||
This document is designed to facilitate code-level integration of the Hyperscan
|
||||
library with existing or new applications.
|
||||
|
||||
:ref:`intro` is a short overview of the Hyperscan library, with more detail on
|
||||
the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
|
||||
:ref:`runtime`.
|
||||
|
||||
:ref:`perf` provides details on various factors which may impact the
|
||||
performance of a Hyperscan integration.
|
||||
|
||||
:ref:`api_constants` and :ref:`api_files` provide a detailed summary of the
|
||||
Hyperscan Application Programming Interface (API).
|
||||
|
||||
********
|
||||
Audience
|
||||
********
|
||||
|
||||
This guide is aimed at developers interested in integrating Hyperscan into an
|
||||
application. For information on building the Hyperscan library, see the Quick
|
||||
Start Guide.
|
||||
|
||||
***********
|
||||
Conventions
|
||||
***********
|
||||
|
||||
* Text in a ``fixed-width font`` refers to a code element, e.g. type name;
|
||||
function or method name.
|
||||
* Text in a :regexp:`coloured fixed-width font` refers to a regular
|
||||
expression or a part of a regular expression.
|
198
doc/dev-reference/runtime.rst
Normal file
198
doc/dev-reference/runtime.rst
Normal file
@@ -0,0 +1,198 @@
|
||||
.. _runtime:
|
||||
|
||||
#####################
|
||||
Scanning for Patterns
|
||||
#####################
|
||||
|
||||
Hyperscan provides three different scanning modes, each with its own scan
|
||||
function beginning with ``hs_scan``. In addition, streaming mode has a number
|
||||
of other API functions for managing stream state.
|
||||
|
||||
****************
|
||||
Handling Matches
|
||||
****************
|
||||
|
||||
All of these functions will call a user-supplied callback function when a match
|
||||
is found. This function has the following signature:
|
||||
|
||||
.. doxygentypedef:: match_event_handler
|
||||
:outline:
|
||||
:no-link:
|
||||
|
||||
The *id* argument will be set to the identifier for the matching expression
|
||||
provided at compile time, and the *to* argument will be set to the end-offset
|
||||
of the match. If SOM was requested for the pattern (see :ref:`som`), the
|
||||
*from* argument will be set to the leftmost possible start-offset for the match.
|
||||
|
||||
The match callback function has the capability to halt scanning
|
||||
by returning a non-zero value.
|
||||
|
||||
See :c:type:`match_event_handler` for more information.
|
||||
|
||||
**************
|
||||
Streaming Mode
|
||||
**************
|
||||
|
||||
The streaming runtime API consists of functions to open, scan, and close
|
||||
Hyperscan data streams -- these functions being :c:func:`hs_open_stream`,
|
||||
:c:func:`hs_scan_stream`, and :c:func:`hs_close_stream`. Any matches detected
|
||||
in the written data are returned to the calling application via a function
|
||||
pointer callback.
|
||||
|
||||
The match callback function has the capability to halt scanning of the current
|
||||
data stream by returning a non-zero value. In streaming mode, the result of
|
||||
this is that the stream is then left in a state where no more data can be
|
||||
scanned, and any subsequent calls to :c:func:`hs_scan_stream` for that stream
|
||||
will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
|
||||
still call :c:func:`hs_close_stream` to complete the clean-up process for that
|
||||
stream.
|
||||
|
||||
Streams exist in the Hyperscan library so that pattern matching state can be
|
||||
maintained across multiple blocks of target data -- without maintaining this
|
||||
state, it would not be possible to detect patterns that span these blocks of
|
||||
data. This, however, does come at the cost of requiring an amount of storage
|
||||
per-stream (the size of this storage is fixed at compile time), and a slight
|
||||
performance penalty in some cases to manage the state.
|
||||
|
||||
While Hyperscan does always support a strict ordering of multiple matches,
|
||||
streaming matches will not be delivered at offsets before the current stream
|
||||
write, with the exception of zero-width asserts, where constructs such as
|
||||
:regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
|
||||
stream write to be delayed until the next stream write or stream close
|
||||
operation.
|
||||
|
||||
=================
|
||||
Stream Management
|
||||
=================
|
||||
|
||||
In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
|
||||
:c:func:`hs_close_stream`, the Hyperscan API provides a number of other
|
||||
functions for the management of streams:
|
||||
|
||||
* :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
|
||||
equivalent to calling :c:func:`hs_close_stream` but will not free the memory
|
||||
used for stream state.
|
||||
|
||||
* :c:func:`hs_copy_stream`: constructs a (newly allocated) duplicate of a
|
||||
stream.
|
||||
|
||||
* :c:func:`hs_reset_and_copy_stream`: constructs a duplicate of a stream into
|
||||
another, resetting the destination stream first. This call avoids the
|
||||
allocation done by :c:func:`hs_copy_stream`.
|
||||
|
||||
**********
|
||||
Block Mode
|
||||
**********
|
||||
|
||||
The block mode runtime API consists of a single function: :c:func:`hs_scan`. Using
|
||||
the compiled patterns, this function identifies matches in the target data,
|
||||
using a function pointer callback to communicate with the application.
|
||||
|
||||
This single :c:func:`hs_scan` function is essentially equivalent to calling
|
||||
:c:func:`hs_open_stream`, making a single call to :c:func:`hs_scan_stream`, and
|
||||
then :c:func:`hs_close_stream`, except that block mode operation does not
|
||||
incur all the stream related overhead.
|
||||
|
||||
*************
|
||||
Vectored Mode
|
||||
*************
|
||||
|
||||
The vectored mode runtime API, like the block mode API, consists of a single
|
||||
function: :c:func:`hs_scan_vector`. This function accepts an array of data
|
||||
pointers and lengths, facilitating the scanning in sequence of a set of data
|
||||
blocks that are not contiguous in memory.
|
||||
|
||||
From the caller's perspective, this mode will produce the same matches as if
|
||||
the set of data blocks were (a) scanned in sequence with a series of streaming
|
||||
mode scans, or (b) copied in sequence into a single block of memory and then
|
||||
scanned in block mode.
|
||||
|
||||
*************
|
||||
Scratch Space
|
||||
*************
|
||||
|
||||
While scanning data, Hyperscan needs a small amount of temporary memory to store
|
||||
on-the-fly internal data. This amount is unfortunately too large to fit on the
|
||||
stack, particularly for embedded applications, and allocating memory dynamically
|
||||
is too expensive, so a pre-allocated "scratch" space must be provided to the
|
||||
scanning functions.
|
||||
|
||||
The function :c:func:`hs_alloc_scratch` allocates a large enough region of
|
||||
scratch space to support a given database. If the application uses multiple
|
||||
databases, only a single scratch region is necessary: in this case, calling
|
||||
:c:func:`hs_alloc_scratch` on each database (with the same ``scratch`` pointer)
|
||||
will ensure that the scratch space is large enough to support scanning against
|
||||
any of the given databases.
|
||||
|
||||
Importantly, only one such space is required per thread and can (and indeed
|
||||
should) be allocated before data scanning is to commence. In a scenario where a
|
||||
set of expressions are compiled by a single "master" thread and data will be
|
||||
scanned by multiple "worker" threads, the convenience function
|
||||
:c:func:`hs_clone_scratch` allows multiple copies of an existing scratch space
|
||||
to be made for each thread (rather than forcing the caller to pass all the
|
||||
compiled databases through :c:func:`hs_alloc_scratch` multiple times).
|
||||
|
||||
For example:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
hs_error_t err;
|
||||
hs_scratch_t *scratch_prototype = NULL;
|
||||
err = hs_alloc_scratch(db, &scratch_prototype);
|
||||
if (err != HS_SUCCESS) {
|
||||
printf("hs_alloc_scratch failed!");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
hs_scratch_t *scratch_thread1 = NULL;
|
||||
hs_scratch_t *scratch_thread2 = NULL;
|
||||
|
||||
err = hs_clone_scratch(scratch_prototype, &scratch_thread1);
|
||||
if (err != HS_SUCCESS) {
|
||||
printf("hs_clone_scratch failed!");
|
||||
exit(1);
|
||||
}
|
||||
err = hs_clone_scratch(scratch_prototype, &scratch_thread2);
|
||||
if (err != HS_SUCCESS) {
|
||||
printf("hs_clone_scratch failed!");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
hs_free_scratch(scratch_prototype);
|
||||
|
||||
/* Now two threads can both scan against database db,
|
||||
each with its own scratch space. */
|
||||
|
||||
While the Hyperscan library is re-entrant, the use of scratch spaces is not.
|
||||
For example, if by design it is deemed necessary to run recursive or nested
|
||||
scanning (say, from the match callback function), then an additional scratch
|
||||
space is required for that context.
|
||||
|
||||
The easiest way to achieve this is to build up a single scratch space as a
|
||||
prototype, then clone it for each context, as shown in the example above.
|
||||
|
||||
*****************
|
||||
Custom Allocators
|
||||
*****************
|
||||
|
||||
By default, structures used by Hyperscan at runtime (scratch space, stream
|
||||
state, etc.) are allocated with the default system allocators, usually
|
||||
``malloc()`` and ``free()``.
|
||||
|
||||
The Hyperscan API provides a facility for changing this behaviour to support
|
||||
applications that use custom memory allocators.
|
||||
|
||||
The functions for setting custom allocators are:
|
||||
|
||||
- :c:func:`hs_set_database_allocator`, which sets the allocate and free functions
|
||||
used for compiled pattern databases.
|
||||
- :c:func:`hs_set_scratch_allocator`, which sets the allocate and free
|
||||
functions used for scratch space.
|
||||
- :c:func:`hs_set_stream_allocator`, which sets the allocate and free functions
|
||||
used for stream state in streaming mode.
|
||||
- :c:func:`hs_set_misc_allocator`, which sets the allocate and free functions
|
||||
used for miscellaneous data, such as compile error structures and
|
||||
informational strings.
|
||||
|
||||
The :c:func:`hs_set_allocator` function can be used to set all of the custom
|
||||
allocators to the same allocate/free pair.
|
Reference in New Issue
Block a user