Initial commit of Hyperscan

2025-11-15 08:52:15 +03:00 · 2015-10-20 09:13:35 +11:00
commit 904e436f11
610 changed files with 213627 additions and 0 deletions
--- a/doc/dev-reference/CMakeLists.txt
+++ b/doc/dev-reference/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Build targets for the developer reference documentation.
+#
+#   dev-reference-doxygen  runs Doxygen on the configured doxyfile to build the
+#                          XML used for the API reference.
+#   dev-reference          runs sphinx-build to render the HTML reference.
+#
+# Each target is only defined when its tool is found; a missing tool is
+# reported with a STATUS message rather than failing the configure step.
+
+find_program(DOXYGEN doxygen)
+
+# find_program() leaves <VAR>-NOTFOUND in the variable on failure, which
+# if(NOT <VAR>) evaluates as false.
+if (NOT DOXYGEN)
+    message(STATUS "Doxygen not found, unable to generate API reference")
+else()
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/hyperscan.doxyfile.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/hyperscan.doxyfile" @ONLY)
+
+    # VERBATIM keeps argument escaping identical across generators/platforms
+    # (e.g. when the build directory path contains spaces).
+    add_custom_target(dev-reference-doxygen
+        "${DOXYGEN}" "${CMAKE_CURRENT_BINARY_DIR}/hyperscan.doxyfile"
+        COMMENT "Building doxygen XML for API reference"
+        VERBATIM)
+endif()
+
+find_program(SPHINX_BUILD sphinx-build)
+
+if (NOT SPHINX_BUILD)
+    message(STATUS "Sphinx not found, unable to generate developer reference")
+else()
+    set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
+    set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
+    set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
+
+    # conf.py is generated into the binary directory, so sphinx-build is
+    # pointed at it with -c while the .rst sources stay in the source tree.
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
+
+    add_custom_target(dev-reference
+        "${SPHINX_BUILD}"
+            -b html
+            -c "${CMAKE_CURRENT_BINARY_DIR}"
+            -d "${SPHINX_CACHE_DIR}"
+            "${CMAKE_CURRENT_SOURCE_DIR}"
+            "${SPHINX_HTML_DIR}"
+        COMMENT "Building HTML dev reference with Sphinx"
+        VERBATIM)
+
+    # The DEPENDS option of add_custom_target() is documented for files and
+    # custom-command outputs; ordering against another target is expressed
+    # with add_dependencies(). Guard it so that a missing Doxygen does not
+    # leave dev-reference depending on a target that was never created.
+    if (TARGET dev-reference-doxygen)
+        add_dependencies(dev-reference dev-reference-doxygen)
+    else()
+        message(STATUS
+            "dev-reference will be built without regenerating the Doxygen XML")
+    endif()
+endif()
--- a/doc/dev-reference/_static/hyperscan.css
+++ b/doc/dev-reference/_static/hyperscan.css
@@ -0,0 +1,4 @@
+/* Differentiate the way we display regex fragments: elements carrying the
+ * "regexp" class (e.g. the output of the :regexp: role used in these docs)
+ * are drawn in dark red. !important makes this rule take precedence over
+ * normal (non-!important) colour declarations. */
+.regexp {
+  color: darkred !important;
+}
--- a/doc/dev-reference/api_constants.rst
+++ b/doc/dev-reference/api_constants.rst
@@ -0,0 +1,53 @@
+.. _api_constants:
+
+########################
+API Reference: Constants
+########################
+
+***********
+Error Codes
+***********
+
+.. doxygengroup:: HS_ERROR
+   :content-only:
+   :no-link:
+
+*****************
+hs_expr_ext flags
+*****************
+
+.. doxygengroup:: HS_EXT_FLAG
+   :content-only:
+   :no-link:
+
+*************
+Pattern flags
+*************
+
+.. doxygengroup:: HS_PATTERN_FLAG
+   :content-only:
+   :no-link:
+
+*************************
+CPU feature support flags
+*************************
+
+.. doxygengroup:: HS_CPU_FEATURES_FLAG
+   :content-only:
+   :no-link:
+
+****************
+CPU tuning flags
+****************
+
+.. doxygengroup:: HS_TUNE_FLAG
+   :content-only:
+   :no-link:
+
+******************
+Compile mode flags
+******************
+
+.. doxygengroup:: HS_MODE_FLAG
+   :content-only:
+   :no-link:
--- a/doc/dev-reference/api_files.rst
+++ b/doc/dev-reference/api_files.rst
@@ -0,0 +1,29 @@
+.. _api_files:
+
+####################
+API Reference: Files
+####################
+
+**********
+File: hs.h
+**********
+
+.. doxygenfile:: hs.h
+
+*****************
+File: hs_common.h
+*****************
+
+.. doxygenfile:: hs_common.h
+
+******************
+File: hs_compile.h
+******************
+
+.. doxygenfile:: hs_compile.h
+
+******************
+File: hs_runtime.h
+******************
+
+.. doxygenfile:: hs_runtime.h
--- a/doc/dev-reference/compilation.rst
+++ b/doc/dev-reference/compilation.rst
@@ -0,0 +1,365 @@
+.. include:: <isonum.txt>
+.. _compilation:
+
+##################
+Compiling Patterns
+##################
+
+*******************
+Building a Database
+*******************
+
+The Hyperscan compiler API accepts regular expressions and converts them into a
+compiled pattern database that can then be used to scan data.
+
+The API provides three functions that compile regular expressions into
+databases:
+
+#. :c:func:`hs_compile`: compiles a single expression into a pattern database.
+
+#. :c:func:`hs_compile_multi`: compiles an array of expressions into a pattern
+   database. All of the supplied patterns will be scanned for concurrently at
+   scan time, with user-supplied identifiers returned when they match.
+
+#. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
+   but allows :ref:`extparam` to be specified for each expression.
+
+Compilation allows the Hyperscan library to analyze the given pattern(s) and
+pre-determine how to scan for these patterns in an optimized fashion that would
+be far too expensive to compute at run-time.
+
+When compiling expressions, a decision needs to be made whether the resulting
+compiled patterns are to be used in a streaming, block or vectored mode:
+
+- **Streaming mode**: the target data to be scanned is a continuous stream, not
+  all of which is available at once; blocks of data are scanned in sequence and
+  matches may span multiple blocks in a stream. In streaming mode, each stream
+  requires a block of memory to store its state between scan calls.
+
+- **Block mode**: the target data is a discrete, contiguous block which can be
+  scanned in one call and does not require state to be retained.
+
+- **Vectored mode**: the target data consists of a list of non-contiguous
+  blocks that are available all at once. As for block mode, no retention of
+  state is required.
+
+To compile patterns to be used in streaming mode, the ``mode`` parameter of
+:c:func:`hs_compile` must be set to :c:member:`HS_MODE_STREAM`; similarly,
+block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
+requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
+for one mode (streaming, block or vectored) can only be used in that mode. The
+version of Hyperscan used to produce a compiled pattern database must match the
+version of Hyperscan used to scan with it.
+
+Hyperscan provides support for targeting a database at a particular CPU
+platform; see :ref:`instr_specialization` for details.
+
+***************
+Pattern Support
+***************
+
+Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
+described at <http://www.pcre.org/>. However, not all constructs available in
+libpcre are supported. The use of unsupported constructs will result in
+compilation errors.
+
+====================
+Supported Constructs
+====================
+
+The following regex constructs are supported by Hyperscan:
+
+* Literal characters and strings, with all libpcre quoting and character
+  escapes.
+
+* Character classes such as :regexp:`.` (dot), :regexp:`[abc]`, and
+  :regexp:`[^abc]`, as well as the predefined character classes :regexp:`\\s`,
+  :regexp:`\\d`, :regexp:`\\w`, :regexp:`\\v`, and :regexp:`\\h` and their
+  negated counterparts (:regexp:`\\S`, :regexp:`\\D`, :regexp:`\\W`,
+  :regexp:`\\V`, and :regexp:`\\H`).
+
+* The POSIX named character classes :regexp:`[[:xxx:]]` and negated named
+  character classes :regexp:`[[:^xxx:]]`.
+
+* Unicode character properties, such as :regexp:`\\p{L}`, :regexp:`\\P{Sc}`,
+  :regexp:`\\p{Greek}`.
+
+* Quantifiers:
+
+  * Quantifiers such as :regexp:`?`, :regexp:`*` and :regexp:`+` are supported
+    when applied to arbitrary supported sub-expressions.
+
+  * Bounded repeat quantifiers such as :regexp:`{n}`, :regexp:`{m,n}`,
+    :regexp:`{n,}` are supported with limitations.
+
+    * For arbitrary repeated sub-patterns: *n* and *m* should be either small
+      or infinite, e.g. :regexp:`(a|b){4}`, :regexp:`(ab?c?d){4,10}` or
+      :regexp:`(ab(cd)*){6,}`.
+
+    * For single-character width sub-patterns such as :regexp:`[^\\a]` or
+      :regexp:`.` or :regexp:`x`, nearly all repeat counts are supported, except
+      where repeats are extremely large (maximum bound greater than 32767).
+      Stream states may be very large for large bounded repeats, e.g.
+      :regexp:`a.{2000}b`. Note: such sub-patterns may be considerably
+      cheaper if at the beginning or end of patterns and especially if the
+      :c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
+
+  * Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
+    :regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
+    matches).
+
+* Parenthesization, including the named and unnamed capturing and
+  non-capturing forms. However, capturing is ignored.
+
+* Alternation with the :regexp:`|` symbol, as in :regexp:`foo|bar`.
+
+* The anchors :regexp:`^`, :regexp:`$`, :regexp:`\\A`, :regexp:`\\Z` and
+  :regexp:`\\z`.
+
+* Option modifiers for:
+
+    * Case-insensitivity: :regexp:`(?i)` and :regexp:`(?-i)`
+    * Multi-line: :regexp:`(?m)` and :regexp:`(?-m)`
+    * Dot-all: :regexp:`(?s)` and :regexp:`(?-s)`
+    * Extended syntax: :regexp:`(?x)` and :regexp:`(?-x)`
+
+* The :regexp:`\\b` and :regexp:`\\B` zero-width assertions (word boundary and
+  'not word boundary', respectively).
+
+* Comments in :regexp:`(?# comment)` syntax.
+
+* The :regexp:`(*UTF8)` and :regexp:`(*UCP)` control verbs at the beginning of a
+  pattern, used to enable UTF-8 and UCP mode.
+
+.. note:: Bounded-repeat quantifiers with large repeat counts of arbitrary
+   expressions (e.g. :regexp:`([a-z]|bc*d|xy?z){1000,5000}`) will result in a
+   "Pattern too large" error at pattern compile time.
+
+.. note:: At this time, not all patterns can be successfully compiled with the
+  :c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
+  :ref:`som`. The patterns that support this flag are a subset of patterns that
+  can be successfully compiled with Hyperscan; notably, many bounded repeat
+  forms that can be compiled with Hyperscan without the Start of Match flag
+  enabled cannot be compiled with the flag enabled.
+
+======================
+Unsupported Constructs
+======================
+
+The following regex constructs are not supported by Hyperscan:
+
+* Backreferences and capturing sub-expressions.
+* Arbitrary zero-width assertions.
+* Subroutine references and recursive patterns.
+* Conditional patterns.
+* Backtracking control verbs.
+* The :regexp:`\\C` "single-byte" directive (which breaks UTF-8 sequences).
+* The :regexp:`\\R` newline match.
+* The :regexp:`\\K` start of match reset directive.
+* Callouts and embedded code.
+* Atomic grouping and possessive quantifiers.
+
+*********
+Semantics
+*********
+
+While Hyperscan follows libpcre syntax, it provides different semantics. These
+differences are motivated by the requirements of streaming and multiple
+simultaneous pattern matching.
+
+The major departures from libpcre semantics are:
+
+#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
+   several patterns simultaneously. This is not equivalent to separating the
+   patterns by :regexp:`|` in libpcre, which evaluates alternations
+   left-to-right.
+
+#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
+   guaranteed to be ordered, although they will always fall within the bounds of
+   the current scan.
+
+#. **End offsets only**: Hyperscan's default behaviour is only to report the end
+   offset of a match. Reporting of the start offset can be enabled with
+   per-expression flags at pattern compile time. See :ref:`som` for details.
+
+#. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
+   ``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
+   corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
+   libpcre semantics by default would report only one match at ``fooxyzbarbar``
+   (greedy semantics) or, if non-greedy semantics were switched on, one match at
+   ``fooxyzbar``. This means that switching between greedy and non-greedy
+   semantics is a no-op in Hyperscan.
+
+It is impossible to support libpcre quantifier semantics while accurately
+reporting streaming matches at the time they occur. For example, consider the
+pattern above, :regexp:`/foo.*bar/`, in streaming mode, against the following
+stream (three blocks scanned in sequence):
+
+    =============   =======     ========
+    block 1         block 2     block 3
+    =============   =======     ========
+    ``fooxyzbar``   ``baz``     ``qbar``
+    =============   =======     ========
+
+Since the :regexp:`.*` repeat in the pattern is a *greedy* repeat in libpcre, it
+must match as much as possible without causing the rest of the pattern to fail.
+However, in streaming mode, this would require knowledge of data in the stream
+beyond the current block being scanned.
+
+In this example, the match at offset 9 in the first block is the correct match
+(under libpcre semantics) only if no subsequent block contains ``bar``. Here
+block 3 does, and that later occurrence would constitute a better match.
+
+.. _som:
+
+==============
+Start of Match
+==============
+
+In standard operation, Hyperscan will only provide the end offset of a match
+when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
+is specified for a particular pattern, then the same set of matches is
+returned, but each match will also provide the leftmost possible start offset
+corresponding to its end offset.
+
+Using the SOM flag entails a number of trade-offs and limitations:
+
+* Reduced pattern support: For many patterns, tracking SOM is complex and can
+  result in Hyperscan failing to compile a pattern with a "Pattern too
+  large" error, even if the pattern is supported in normal operation.
+* Increased stream state: At scan time, state space is required to track
+  potential SOM offsets, and this must be stored in persistent stream state in
+  streaming mode. Accordingly, SOM will generally increase the stream state
+  required to match a pattern.
+* Performance overhead: Similarly, there is generally a performance cost
+  associated with tracking SOM.
+* Incompatible features: Some other Hyperscan pattern flags (such as
+  :c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) cannot be
+  used in combination with SOM. Specifying them together with
+  :c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
+
+In streaming mode, the amount of precision delivered by SOM can be controlled
+with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
+information within a certain distance of the end offset, and return a special
+start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
+small or medium SOM horizon will usually reduce the stream state required for a
+given database.
+
+.. note:: In streaming mode, the start offset returned for a match may refer to
+   a point in the stream *before* the current block being scanned. Hyperscan
+   provides no facility for accessing earlier blocks; if the calling application
+   needs to inspect historical data, then it must store it itself.
+
+.. _extparam:
+
+===================
+Extended Parameters
+===================
+
+In some circumstances, more control over the matching behaviour of a pattern is
+required than can be specified easily using regular expression syntax. For
+these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
+that allows a set of "extended parameters" to be set on a per-pattern basis.
+
+Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
+which provides the following fields:
+
+* ``flags``: Flags governing which of the other fields in the structure are
+  used.
+* ``min_offset``: The minimum end offset in the data stream at which this
+  expression should match successfully.
+* ``max_offset``: The maximum end offset in the data stream at which this
+  expression should match successfully.
+* ``min_length``: The minimum match length (from start to end) required to
+  successfully match this expression.
+
+These parameters allow the set of matches produced by a pattern to be
+constrained at compile time, rather than relying on the application to process
+unwanted matches at runtime.
+
+For example, the pattern :regexp:`/foo.*bar/` when given a ``min_offset`` of 10
+and a ``max_offset`` of 15 will not produce matches when scanned against
+``foobar`` or ``foo0123456789bar`` but will produce a match against the data
+streams ``foo0123bar`` or ``foo0123456bar``.
+
+=================
+Prefiltering Mode
+=================
+
+Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
+be used to implement a prefilter for a pattern that Hyperscan would not
+ordinarily support.
+
+This flag instructs Hyperscan to compile an "approximate" version of this
+pattern for use in a prefiltering application, even if Hyperscan does not
+support the pattern in normal operation.
+
+The set of matches returned when this flag is used is guaranteed to be a
+superset of the matches specified by the non-prefiltering expression.
+
+If the pattern contains pattern constructs not supported by Hyperscan (such as
+zero-width assertions, back-references or conditional references) these
+constructs will be replaced internally with broader constructs that may match
+more often.
+
+For example, the pattern :regexp:`/(\\w+) again \\1/` contains the
+back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
+approximated by having its back-reference replaced with its referent, forming
+:regexp:`/\\w+ again \\w+/`.
+
+Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
+otherwise return a "Pattern too large" error at compile time, or for performance
+reasons (subject to the matching guarantee above).
+
+It is generally expected that the application will subsequently confirm
+prefilter matches with another regular expression matcher that can provide exact
+matches for the pattern.
+
+.. note:: The use of this flag in combination with Start of Match mode (using
+   the :c:member:`HS_FLAG_SOM_LEFTMOST` flag) is not currently supported and
+   will result in a pattern compilation error.
+
+.. _instr_specialization:
+
+******************************
+Instruction Set Specialization
+******************************
+
+Hyperscan is able to make use of several modern instruction set features found
+on x86 processors to provide improvements in scanning performance.
+
+Some of these features are selected when the library is built; for example,
+Hyperscan will use the native ``POPCNT`` instruction on processors where it is
+available and the library has been optimized for the host architecture.
+
+.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
+   compiler flag and (where possible) will make use of all instructions known by
+   the host's C compiler.
+
+To use some instruction set features, however, Hyperscan must build a
+specialized database to support them. This means that the target platform must
+be specified at pattern compile time.
+
+The Hyperscan compiler API functions all accept an optional
+:c:type:`hs_platform_info_t` argument, which describes the target platform
+for the database to be built. If this argument is NULL, the database will be
+targeted at the current host platform.
+
+The :c:type:`hs_platform_info_t` structure has two fields:
+
+#. ``tune``: This allows the application to specify information about the target
+   platform which may be used to guide the optimisation process of the compile.
+   Use of this field does not limit the processors that the resulting database
+   can run on, but may impact the performance of the resulting database.
+
+#. ``cpu_features``: This allows the application to specify a mask of CPU
+   features that may be used on the target platform. For example,
+   :c:member:`HS_CPU_FEATURES_AVX2` can be specified for Intel\ |reg| Advanced
+   Vector Extensions 2 (Intel\ |reg| AVX2) instruction set support. If a flag
+   for a particular CPU feature is specified, the database will not be usable on
+   a CPU without that feature.
+
+An :c:type:`hs_platform_info_t` structure targeted at the current host can be
+built with the :c:func:`hs_populate_platform` function.
+
+See :ref:`api_constants` for the full list of CPU tuning and feature flags.
--- a/doc/dev-reference/conf.py.in
+++ b/doc/dev-reference/conf.py.in
@@ -0,0 +1,275 @@
+# -*- coding: utf-8 -*-
+#
+# Hyperscan documentation build configuration file, created by
+# sphinx-quickstart on Tue Sep 29 15:59:19 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['breathe']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Hyperscan'
+copyright = u'2015, Intel Corporation'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '@HS_MAJOR_VERSION@.@HS_MINOR_VERSION@'
+# The full version, including alpha/beta/rc tags.
+release = '@HS_VERSION@'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+html_theme_options = {
+    # Change some style colors; these are used for admonitions
+    'pink_1' : '#e0f8ff',
+    'pink_2' : '#e0f8ff'
+}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# Here the path is not relative: the @CMAKE_CURRENT_SOURCE_DIR@ placeholder is
+# substituted when CMake configures this template, so the entry points at the
+# _static directory in the source tree.
+html_static_path = ['@CMAKE_CURRENT_SOURCE_DIR@/_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+html_sidebars = {
+    '**': ['globaltoc.html', 'searchbox.html']
+}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+html_show_sourcelink = False
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Hyperscandoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  ('index', 'Hyperscan.tex', u'Hyperscan Documentation',
+   u'Intel Corporation', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'hyperscan', u'Hyperscan Documentation',
+     [u'Intel Corporation'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'Hyperscan', u'Hyperscan Documentation',
+   u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
+
+# -- Options for Breathe doxygen import -----------------------------------
+
+# Maps each Breathe project name to the directory holding its Doxygen XML.
+# NOTE(review): "doxygen_xml" is a relative path; presumably it has to match
+# the XML output location configured in hyperscan.doxyfile.in -- confirm.
+breathe_projects = { "hyperscan": "doxygen_xml" }
+# Project used by doxygen* directives that do not name one explicitly.
+breathe_default_project = "hyperscan"
+# Document declarations from .h files using the C domain.
+breathe_domain_by_extension = {"h" : "c"}
+
+# -- Add some customisation -----------------------------------------------
+
+def setup(app):
+    """Sphinx setup hook: register the project's custom stylesheet.
+
+    ``hyperscan.css`` is looked up through ``html_static_path`` and styles,
+    among other things, the output of the ``:regexp:`` role.
+    """
+    # Sphinx 1.8 renamed add_stylesheet() to add_css_file() and the old name
+    # was later removed; use whichever this Sphinx version provides.
+    add_css = getattr(app, "add_css_file", None) or app.add_stylesheet
+    add_css("hyperscan.css")
--- a/doc/dev-reference/copyright.rst
+++ b/doc/dev-reference/copyright.rst
@@ -0,0 +1,33 @@
+.. include:: <isonum.txt>
+
+#########
+Copyright
+#########
+
+No license (express or implied, by estoppel or otherwise) to any intellectual
+property rights is granted by this document.
+
+Intel disclaims all express and implied warranties, including without
+limitation, the implied warranties of merchantability, fitness for a particular
+purpose, and non-infringement, as well as any warranty arising from course of
+performance, course of dealing, or usage in trade.
+
+This document contains information on products, services and/or processes in
+development.  All information provided here is subject to change without
+notice. Contact your Intel representative to obtain the latest forecast,
+schedule, specifications and roadmaps.
+
+The products and services described may contain defects or errors known as
+errata which may cause deviations from published specifications. Current
+characterized errata are available on request.
+
+Copies of documents which have an order number and are referenced in this
+document, or other Intel literature, may be obtained by calling 1-800-548-4725,
+or go to: <http://www.intel.com/design/literature.htm>.
+
+Intel, and the Intel logo, are trademarks of Intel Corporation in the U.S.
+and/or other countries.
+
+\*Other names and brands may be claimed as the property of others.
+
+Copyright |copy| 2015, Intel Corporation. All rights reserved.
--- a/doc/dev-reference/getting_started.rst
+++ b/doc/dev-reference/getting_started.rst
@@ -0,0 +1,211 @@
+.. include:: <isonum.txt>
+
+###############
+Getting Started
+###############
+
+Very Quick Start
+****************
+
+#. Clone Hyperscan ::
+
+     cd <where-you-want-hyperscan-source>
+     git clone https://github.com/01org/hyperscan
+
+#. Configure Hyperscan
+
+   Ensure that you have the correct :ref:`dependencies <software>` present,
+   and then:
+
+   ::
+
+     cd <where-you-want-to-build-hyperscan>
+     mkdir <build-dir>
+     cd <build-dir>
+     cmake [-G <generator>] [options] <hyperscan-source-path>
+
+   Known working generators:
+      * ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
+      * ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
+
+   Generators that might work include:
+      * ``Xcode`` --- OS X Xcode projects.
+      * ``Visual Studio`` --- Visual Studio projects - very experimental
+
+#. Build Hyperscan
+
+   Depending on the generator used:
+     * ``cmake --build .`` --- will build everything
+     * ``make -j<jobs>`` --- use makefiles in parallel
+     * ``ninja`` --- use Ninja build
+     * etc.
+
+#. Check Hyperscan
+
+   Run the Hyperscan unit tests: ::
+
+     bin/unit-hyperscan
+
+Requirements
+************
+
+Hardware
+========
+
+Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
+32-bit (IA-32 Architecture) modes.
+
+Hyperscan is a high performance software library that takes advantage of recent
+Intel architecture advances. At a minimum, support for Supplemental Streaming
+SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
+x86 processor.
+
+Additionally, Hyperscan can make use of:
+
+    * Intel Streaming SIMD Extensions 4.2 (SSE4.2)
+    * the POPCNT instruction
+    * Bit Manipulation Instructions (BMI, BMI2)
+    * Intel Advanced Vector Extensions 2 (Intel AVX2)
+
+if present.
+
+These can be determined at library compile time, see :ref:`target_arch`.
+
+.. _software:
+
+Software
+========
+
+As a software library, Hyperscan doesn't impose any particular runtime
+software requirements, however to build the Hyperscan library we require a
+modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
+compiler support. The supported compilers are:
+
+    * GCC, v4.8.1 or higher
+    * Clang, v3.4 or higher (with libstdc++ or libc++)
+    * Intel C++ Compiler v15 or higher
+
+Examples of operating systems that Hyperscan is known to work on include:
+
+Linux:
+
+* Ubuntu 14.04 LTS or newer
+* RedHat/CentOS 7 or newer
+
+FreeBSD:
+
+* 10.0 or newer
+
+Mac OS X:
+
+* 10.8 or newer, using Xcode/Clang
+
+Hyperscan *may* compile and run on other platforms, but there is no guarantee.
+We currently have experimental support for Windows using Intel C++ Compiler
+or Visual Studio 2015.
+
+In addition, the following software is required for compiling the Hyperscan library:
+
+======================================================= =========== ======================================
+Dependency                                              Version     Notes
+======================================================= =========== ======================================
+`CMake <http://www.cmake.org/>`_                        >=2.8.11
+`Ragel <http://www.colm.net/open-source/ragel/>`_       6.9
+`Python <http://www.python.org/>`_                      2.7
+`Boost <http://boost.org/>`_                            >=1.57      Boost headers required
+`Pcap <http://tcpdump.org>`_                            >=0.8       Optional: needed for example code only
+======================================================= =========== ======================================
+
+Most of these dependencies can be provided by the package manager on the build
+system (e.g. Debian/Ubuntu/RedHat packages, FreeBSD ports, etc). However,
+ensure that the correct version is present.
+
+Boost Headers
+-------------
+
+Compiling Hyperscan depends on a recent version of the Boost C++ header
+library. If the Boost libraries are installed on the build machine in the
+usual paths, CMake will find them. An alternative is to put a copy of (or a
+symlink to) the boost subdirectory in ``<hyperscan-source-path>/include/boost``.
+
+For example: for the Boost-1.59.0 release: ::
+
+    ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
+
+As Hyperscan uses the header-only parts of Boost, it is not necessary to
+compile the Boost libraries.
+
+CMake Configuration
+===================
+
+When CMake is invoked, it generates build files using the given options.
+Options are passed to CMake in the form ``-D<variable name>=<value>``.
+Common options for CMake include:
+
++------------------------+----------------------------------------------------+
+| Variable               | Description                                        |
++========================+====================================================+
+| CMAKE_C_COMPILER       | C compiler to use. Default is /usr/bin/cc.         |
++------------------------+----------------------------------------------------+
+| CMAKE_CXX_COMPILER     | C++ compiler to use. Default is /usr/bin/c++.      |
++------------------------+----------------------------------------------------+
+| CMAKE_INSTALL_PREFIX   | Install directory for ``install`` target           |
++------------------------+----------------------------------------------------+
+| CMAKE_BUILD_TYPE       | Define which kind of build to generate.            |
+|                        | Valid options are Debug, Release, RelWithDebInfo,  |
+|                        | and MinSizeRel. Default is RelWithDebInfo.         |
++------------------------+----------------------------------------------------+
+| BUILD_SHARED_LIBS      | Build Hyperscan as a shared library instead of     |
+|                        | the default static library.                        |
++------------------------+----------------------------------------------------+
+| BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs.       |
+|                        | Default off.                                       |
++------------------------+----------------------------------------------------+
+| DEBUG_OUTPUT           | Enable very verbose debug output. Default off.     |
++------------------------+----------------------------------------------------+
+
+For example, to generate a ``Debug`` build: ::
+
+    cd <build-dir>
+    cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
+
+
+
+Build Type
+----------
+
+CMake determines a number of features for a build based on the Build Type.
+Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
+information". This is a performance optimized build without runtime assertions
+but with debug symbols enabled.
+
+The other types of builds are:
+
+ * ``Release``: as above, but without debug symbols
+ * ``MinSizeRel``: a stripped release build
+ * ``Debug``: used when developing Hyperscan. Includes runtime assertions
+   (which have a large impact on runtime performance), and will also enable
+   some other build features like building internal unit
+   tests.
+
+.. _target_arch:
+
+Target Architecture
+-------------------
+
+By default, Hyperscan will be compiled to target the instruction set of the
+processor of the machine that is being used for compilation. This is done via
+the use of ``-march=native``. The result of this means that a library built on
+one machine may not work on a different machine if they differ in supported
+instruction subsets.
+
+To override the use of ``-march=native``, set appropriate flags for the
+compiler in ``CFLAGS`` and ``CXXFLAGS`` environment variables before invoking
+CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. For
+example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
+
+    cmake -DCMAKE_C_FLAGS="-march=corei7" \
+      -DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
+
+For more information, refer to :ref:`instr_specialization`.
+
--- a/doc/dev-reference/hyperscan.doxyfile.in
+++ b/doc/dev-reference/hyperscan.doxyfile.in
--- a/doc/dev-reference/index.rst
+++ b/doc/dev-reference/index.rst
@@ -0,0 +1,20 @@
+###############################################
+Hyperscan |version| Developer's Reference Guide
+###############################################
+
+-------
+|today|
+-------
+
+.. toctree::
+   :maxdepth: 2
+
+   copyright
+   preface
+   intro
+   getting_started
+   compilation
+   runtime
+   performance
+   api_constants
+   api_files
--- a/doc/dev-reference/intro.rst
+++ b/doc/dev-reference/intro.rst
@@ -0,0 +1,78 @@
+.. include:: <isonum.txt>
+.. _intro:
+
+############
+Introduction
+############
+
+Hyperscan is a software regular expression matching engine designed with
+high performance and flexibility in mind. It is implemented as a library that
+exposes a straightforward C API.
+
+The Hyperscan API itself is composed of two major components:
+
+***********
+Compilation
+***********
+
+These functions take a group of regular expressions, along with identifiers and
+option flags, and compile them into an immutable database that can be used by
+the Hyperscan scanning API. This compilation process performs considerable
+analysis and optimization work in order to build a database that will match the
+given expressions efficiently.
+
+If a pattern cannot be built into a database for any reason (such as the use of
+an unsupported expression construct, or the overflowing of a resource limit),
+an error will be returned by the pattern compiler.  
+
+Compiled databases can be serialized and relocated, so that they can be stored
+to disk or moved between hosts. They can also be targeted to particular
+platform features (for example, the use of Intel\ |reg| Advanced Vector Extensions
+2 (Intel\ |reg| AVX2) instructions).
+
+See :ref:`compilation` for more detail.
+
+********
+Scanning
+********
+
+Once a Hyperscan database has been created, it can be used to scan data in
+memory. Hyperscan provides several scanning modes, depending on whether the
+data to be scanned is available as a single contiguous block, whether it is
+distributed amongst several blocks in memory at the same time, or whether it is
+to be scanned as a sequence of blocks in a stream.
+
+Matches are delivered to the application via a user-supplied callback function
+that is called synchronously for each match.
+
+For a given database, Hyperscan provides several guarantees:
+
+* No memory allocations occur at runtime with the exception of two
+  fixed-size allocations, both of which should be done ahead of time for
+  performance-critical applications:
+
+  - **Scratch space**: temporary memory used for internal data at scan time.
+    Structures in scratch space do not persist beyond the end of a single scan
+    call.
+  - **Stream state**: in streaming mode only, some state space is required to
+    store data that persists between scan calls for each stream. This allows
+    Hyperscan to track matches that span multiple blocks of data.
+
+* The sizes of the scratch space and stream state (in streaming mode) required
+  for a given database are fixed and determined at database compile time. This
+  means that the memory requirements of the application are known ahead of
+  time, and these structures can be pre-allocated if required for performance
+  reasons.
+
+* Any pattern that has successfully been compiled by the Hyperscan compiler can
+  be scanned against any input. There are no internal resource limits or other
+  limitations at runtime that could cause a scan call to return an error.
+
+See :ref:`runtime` for more detail.
+
+************
+Example Code
+************
+
+Some simple example code demonstrating the use of the Hyperscan API is
+available in the ``examples/`` subdirectory of the Hyperscan distribution.
--- a/doc/dev-reference/performance.rst
+++ b/doc/dev-reference/performance.rst
@@ -0,0 +1,335 @@
+.. _perf:
+
+##########################
+Performance Considerations
+##########################
+
+Hyperscan supports a wide range of patterns in all three scanning modes. It is
+capable of extremely high levels of performance, but certain patterns can
+reduce performance markedly.
+
+The following guidelines will help construct patterns and pattern sets that
+will perform better:
+
+*****************************
+Regular expression constructs
+*****************************
+
+.. tip:: Do not hand-optimize regular expression constructs.
+
+Quite a large number of regular expressions can be written in multiple ways.
+For example, caseless matching of :regexp:`/abc/` can be written as:
+
+* :regexp:`/[Aa][Bb][Cc]/`
+* :regexp:`/(A|a)(B|b)(C|c)/`
+* :regexp:`/(?i)abc(?-i)/`
+* :regexp:`/abc/i`
+
+Hyperscan is capable of handling all these constructs. Unless there is a
+specific reason otherwise, do not rewrite patterns from one form to another.
+
+As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
+equivalently written as:
+
+* :regexp:`/foobarfrotz|foobazfrotz|foobar|foobaz/`
+
+This change will not improve performance or reduce overheads.
+
+*************
+Library usage
+*************
+
+.. tip:: Do not hand-optimize library usage.
+
+The Hyperscan library is capable of dealing with small writes, unusually large
+and small pattern sets, etc. Unless there is a specific performance problem
+with some usage of the library, it is best to use Hyperscan in a simple and
+direct fashion. For example, it is unlikely for there to be much benefit in
+buffering input to the library into larger blocks unless streaming writes are
+tiny (say, 1-2 bytes at a time).
+
+Unlike many other pattern matching products, Hyperscan will run faster with
+small numbers of patterns and slower with large numbers of patterns in a smooth
+fashion (as opposed to, typically, running at a moderate speed up to some fixed
+limit then either breaking or running half as fast).
+
+Hyperscan also provides high-throughput matching with a single thread of
+control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
+3000-bit block of data will be scanned in 1 microsecond in a single thread of
+control, not that it is required to scan 22 3000-bit blocks of data in 22
+microseconds. Thus, it is not usually necessary to buffer data to supply
+Hyperscan with available parallelism.
+
+********************
+Block-based matching
+********************
+
+.. tip:: Prefer block-based matching to streaming matching where possible.
+
+Whenever input data appears in discrete records, or already requires some sort
+of transformation (e.g. URI normalization) that requires all the data to be
+accumulated before processing, it should be scanned in block rather than in
+streaming mode.
+
+Unnecessary use of streaming mode reduces the number of optimizations that can
+be applied in Hyperscan and may make some patterns run slower.
+
+If there is a mixture of 'block' and 'streaming' mode patterns, these should be
+scanned in separate databases except in the case that the streaming patterns
+vastly outnumber the block mode patterns.
+
+*********************
+Unnecessary databases
+*********************
+
+.. tip:: Avoid unnecessary 'union' databases.
+
+If there are 5 different types of network traffic T1 through T5 that must
+be scanned against 5 different signature sets, it will be far more efficient to
+construct 5 separate databases and scan traffic against the appropriate one
+than it will be to merge all 5 signature sets and remove inappropriate matches
+after the fact.
+
+This will be true even in the case where there is substantial overlap among the
+signatures. Only if the common subset of the signatures is overwhelmingly large
+(say, 90% of the signatures appear in all 5 traffic types) should a database
+that merges all 5 signature sets be considered, and only then if there are no
+performance issues with specific patterns that appear outside the common
+subset.
+
+******************************
+Allocate scratch ahead of time
+******************************
+
+.. tip:: Do not allocate scratch space for your pattern database just before
+   calling a scan function. Instead, do it just after the pattern database is
+   compiled or deserialized.
+
+Scratch allocation is not necessarily a cheap operation. Since it is the first
+time (after compilation or deserialization) that a pattern database is used,
+Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
+must also allocate memory.
+
+Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
+called in the application's scanning path just before :c:func:`hs_scan` (for
+example).
+
+Instead, scratch should be allocated immediately after a pattern database is
+compiled or deserialized, then retained for later scanning operations.
+
+***********************************************
+Allocate one scratch space per scanning context
+***********************************************
+
+.. tip:: A scratch space can be allocated so that it can be used with any one of
+   a number of databases. Each concurrent scan operation (such as a thread)
+   needs its own scratch space.
+
+The :c:func:`hs_alloc_scratch` function can accept an existing scratch space and
+"grow" it to support scanning with another pattern database. This means that
+instead of allocating one scratch space for every database used by an
+application, one can call :c:func:`hs_alloc_scratch` with a pointer to the same
+:c:type:`hs_scratch_t` and it will be sized appropriately for use with any of
+the given databases. For example:
+
+.. code-block:: c
+
+    hs_database_t *db1 = buildDatabaseOne();
+    hs_database_t *db2 = buildDatabaseTwo();
+    hs_database_t *db3 = buildDatabaseThree();
+
+    hs_error_t err;
+    hs_scratch_t *scratch = NULL;
+    err = hs_alloc_scratch(db1, &scratch);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+    err = hs_alloc_scratch(db2, &scratch);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+    err = hs_alloc_scratch(db3, &scratch);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+
+    /* scratch may now be used to scan against any of
+       the databases db1, db2, db3. */
+
+*****************
+Anchored patterns
+*****************
+
+.. tip:: If a pattern is meant to appear at the start of data, be sure to
+   anchor it.
+
+Anchored patterns (:regexp:`/^.../`) are far simpler to match than other
+patterns, especially patterns anchored to the start of the buffer (or stream, in
+streaming mode). Anchoring patterns to the end of the buffer results in less of
+a performance gain, especially in streaming mode.
+
+There are a variety of ways to anchor a pattern to a particular offset:
+
+- The :regexp:`^` and :regexp:`\\A` constructs anchor the pattern to the start
+  of the buffer. For example, :regexp:`/^foo/` can *only* match at offset 3.
+
+- The :regexp:`$`, :regexp:`\\z` and :regexp:`\\Z` constructs anchor the pattern
+  to the end of the buffer. For example, :regexp:`/foo\\z/` can only match when
+  the data buffer being scanned ends in ``foo``. (It should be noted that
+  :regexp:`$` and :regexp:`\\Z` will also match before a newline at the end of
+  the buffer, so :regexp:`/foo\\Z/` would match against either ``abc foo`` or
+  ``abc foo\n``.)
+
+- The ``min_offset`` and ``max_offset`` extended parameters may also be used to
+  constrain where a pattern could match. For example, the pattern
+  :regexp:`/foo/` with a ``max_offset`` of 10 will only match at offsets less
+  than or equal to 10 in the buffer. (This pattern could also be written as
+  :regexp:`/^.{0,7}foo/`, compiled with the :c:member:`HS_FLAG_DOTALL` flag).
+
+
+*******************
+Matching everywhere
+*******************
+
+.. tip:: Avoid patterns that match everywhere, and remember that our semantics
+   are 'match everywhere, end of match only'.
+
+Patterns that match everywhere will run slowly due to the sheer number of
+matches that they return.
+
+Patterns like :regexp:`/.*/` in an automata-based matcher will match before and
+after every single character position, so a buffer with 100 characters will
+return 101 matches. Greedy pattern matchers such as libpcre will return a
+single match in this case, but our semantics is to return all matches. This is
+likely to be very expensive for our code and for the client code of the
+library.
+
+Another result of our semantics ("match everywhere") is that patterns that have
+optional start or ending sections -- for example :regexp:`/x?abcd*/` -- may not
+perform as expected.
+
+Firstly, the :regexp:`x?` portion of the pattern is unnecessary, as it will not
+affect the match results.
+
+Secondly, the above pattern will match 'more' than :regexp:`/abc/` but
+:regexp:`/abc/` will always detect any input data that will be matched by
+:regexp:`/x?abcd*/` -- it will just produce fewer matches.
+
+For example, input data ``0123abcdddd`` will match :regexp:`/abc/` once but
+:regexp:`/abcd*/` five times (at ``abc``, ``abcd``, ``abcdd``, ``abcddd``, and
+``abcdddd``).
+
+*********************************
+Bounded repeats in streaming mode
+*********************************
+
+.. tip:: Bounded repeats are expensive in streaming mode.
+
+A bounded repeat construction such as :regexp:`/X.{1000,1001}abcd/` is extremely
+expensive in streaming mode, of necessity. It requires us to take action on
+each ``X`` character (itself expensive, relative to searching for longer strings)
+and potentially record a history of hundreds of offsets where ``X`` occurred in
+case the ``X`` and ``abcd`` characters are separated by a stream boundary.
+
+Heavy and unnecessary use of bounded repeats should be avoided, especially
+where other parts of a signature are quite specific. For example, a virus
+signature that matches a virus payload may be sufficient without including a
+prefix that includes, for example, a 2-character Windows executable prefix and
+a bounded repeat beforehand.
+
+***************
+Prefer literals
+***************
+
+.. tip:: Where possible, prefer patterns which 'require' literals, especially
+   longer literals, and in streaming mode, prefer signatures that 'require'
+   literals earlier in the pattern.
+
+Patterns which must match on a literal will run faster than patterns that do
+not. For example:
+
+- :regexp:`/\\wab\\d*\\w\\w\\w/` will run faster than
+- :regexp:`/\\w\\w\\d*\\w\\w/`, or, for that matter
+- :regexp:`/\\w(abc)?\\d*\\w\\w\\w/` (this contains a literal but it need
+  not appear in the input).
+
+Even implicit literals are better than none: :regexp:`/[0-2][3-5].*\\w\\w/`
+still effectively contains 9 2-character literals. No hand-optimization of this
+case is required; this pattern will not run faster if rewritten as:
+:regexp:`/(03|04|05|13|14|15|23|24|25).*\\w\\w/`.
+
+Under all circumstances it is better to use longer literals than shorter ones.
+A database consisting of 100 14-character literals will scan considerably
+faster than one consisting of 100 4-character literals and return fewer
+positives.
+
+Additionally, in streaming mode, a signature that contains a longer literal
+early in the pattern is preferred to one that does not.
+
+For example: :regexp:`/b\\w*foobar/` is not as good a pattern as
+:regexp:`/blah\\w*foobar/`.
+
+The disparity between these patterns is much smaller in block mode.
+
+Longer literals anywhere in the pattern are still preferred in streaming mode.
+For example, both of the above patterns are stronger and will scan faster than
+:regexp:`/b\\w*fo/` even in streaming mode.
+
+**************
+"Dot all" mode
+**************
+
+.. tip:: Use "dot all" mode where possible.
+
+Not using the :c:member:`HS_FLAG_DOTALL` pattern flag can be expensive, as
+implicitly, it means that patterns of the form :regexp:`/A.*B/` become
+:regexp:`/A[^\\n]*B/`.
+
+It is likely that scanning tasks without the DOTALL flag are better done 'line
+at a time', with the newline sequences marking the beginning and end of each
+block.
+
+This will be true in most use-cases (an exception being where the DOTALL flag
+is off but the pattern contains either explicit newlines or constructs such as
+:regexp:`\\s` that implicitly match a newline character).
+
+*****************
+Single-match flag
+*****************
+
+.. tip:: Consider using the single-match flag to limit matches to one match per
+   pattern only if possible.
+
+If only one match per pattern is required, use the flag provided to indicate
+this (:c:member:`HS_FLAG_SINGLEMATCH`). This flag can allow a number of
+optimizations to be applied, allowing both performance improvements and state
+space reductions when streaming.
+
+However, there is some overhead associated with tracking whether each pattern in
+the pattern set has matched, and some applications with infrequent matches may
+see reduced performance when the single-match flag is used.
+
+********************
+Start of Match flag
+********************
+
+.. tip:: Do not request Start of Match information if it is not needed.
+
+Start of Match (SOM) information can be expensive to gather and can require
+large amounts of stream state to store in streaming mode. As such, SOM
+information should only be requested with the :c:member:`HS_FLAG_SOM_LEFTMOST`
+flag for patterns that require it.
+
+SOM information is not generally expected to be cheaper (in either performance
+terms or in stream state overhead) than the use of bounded repeats.
+Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
+the callback is considerably more expensive and general than
+:regexp:`/foo.{300}bar/`.
+
+Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
+used to specify a lower bound on the length of the matches for a pattern. Using
+this facility may be more lightweight in some circumstances than using the SOM
+flag and post-confirming match length in the calling application.
--- a/doc/dev-reference/preface.rst
+++ b/doc/dev-reference/preface.rst
@@ -0,0 +1,47 @@
+#######
+Preface
+#######
+
+********
+Overview
+********
+
+Hyperscan is a regular expression engine designed to offer high performance, the
+ability to match multiple expressions simultaneously and flexibility in
+scanning operation.
+
+Patterns are provided to a compilation interface which generates an immutable
+pattern database. The scan interface then can be used to scan a target data
+buffer for the given patterns, returning any matching results from that data
+buffer. Hyperscan also provides a streaming mode, in which matches that span
+several blocks in a stream are detected.
+
+This document is designed to facilitate code-level integration of the Hyperscan
+library with existing or new applications.
+
+:ref:`intro` is a short overview of the Hyperscan library, with more detail on
+the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
+:ref:`runtime`.
+
+:ref:`perf` provides details on various factors which may impact the
+performance of a Hyperscan integration.
+
+:ref:`api_constants` and :ref:`api_files` provide a detailed summary of the
+Hyperscan Application Programming Interface (API).
+
+********
+Audience
+********
+
+This guide is aimed at developers interested in integrating Hyperscan into an
+application. For information on building the Hyperscan library, see the Quick
+Start Guide.
+
+***********
+Conventions
+***********
+
+* Text in a ``fixed-width font`` refers to a code element, e.g. type name;
+  function or method name.
+* Text in a :regexp:`coloured fixed-width font` refers to a regular
+  expression or a part of a regular expression.
--- a/doc/dev-reference/runtime.rst
+++ b/doc/dev-reference/runtime.rst
@@ -0,0 +1,198 @@
+.. _runtime:
+
+#####################
+Scanning for Patterns
+#####################
+
+Hyperscan provides three different scanning modes, each with its own scan
+function beginning with ``hs_scan``. In addition, streaming mode has a number
+of other API functions for managing stream state.
+
+****************
+Handling Matches
+****************
+
+All of these functions will call a user-supplied callback function when a match
+is found. This function has the following signature:
+
+  .. doxygentypedef:: match_event_handler
+     :outline:
+     :no-link:
+
+The *id* argument will be set to the identifier for the matching expression
+provided at compile time, and the *to* argument will be set to the end-offset
+of the match. If SOM was requested for the pattern (see :ref:`som`), the
+*from* argument will be set to the leftmost possible start-offset for the match.
+
+The match callback function has the capability to halt scanning
+by returning a non-zero value.
+
+See :c:type:`match_event_handler` for more information.
+
+**************
+Streaming Mode
+**************
+
+The streaming runtime API consists of functions to open, scan, and close
+Hyperscan data streams -- these functions being :c:func:`hs_open_stream`,
+:c:func:`hs_scan_stream`, and :c:func:`hs_close_stream`. Any matches detected
+in the written data are returned to the calling application via a function
+pointer callback.
+
+The match callback function has the capability to halt scanning of the current
+data stream by returning a non-zero value. In streaming mode, the result of
+this is that the stream is then left in a state where no more data can be
+scanned, and any subsequent calls to :c:func:`hs_scan_stream` for that stream
+will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
+still call :c:func:`hs_close_stream` to complete the clean-up process for that
+stream.
+
+Streams exist in the Hyperscan library so that pattern matching state can be
+maintained across multiple blocks of target data -- without maintaining this
+state, it would not be possible to detect patterns that span these blocks of
+data. This, however, does come at the cost of requiring an amount of storage
+per-stream (the size of this storage is fixed at compile time), and a slight
+performance penalty in some cases to manage the state.
+
+While Hyperscan does always support a strict ordering of multiple matches,
+streaming matches will not be delivered at offsets before the current stream
+write, with the exception of zero-width asserts, where constructs such as
+:regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
+stream write to be delayed until the next stream write or stream close
+operation.
+
+=================
+Stream Management
+=================
+
+In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
+:c:func:`hs_close_stream`, the Hyperscan API provides a number of other
+functions for the management of streams:
+
+* :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
+  equivalent to calling :c:func:`hs_close_stream` followed by
+  :c:func:`hs_open_stream`, but without freeing and reallocating stream state.
+
+* :c:func:`hs_copy_stream`: constructs a (newly allocated) duplicate of a
+  stream.
+
+* :c:func:`hs_reset_and_copy_stream`: constructs a duplicate of a stream into
+  another, resetting the destination stream first. This call avoids the
+  allocation done by :c:func:`hs_copy_stream`.
+
+**********
+Block Mode
+**********
+
+The block mode runtime API consists of a single function: :c:func:`hs_scan`. Using
+the compiled patterns this function identifies matches in the target data,
+using a function pointer callback to communicate with the application.
+
+This single :c:func:`hs_scan` function is essentially equivalent to calling
+:c:func:`hs_open_stream`, making a single call to :c:func:`hs_scan_stream`, and
+then :c:func:`hs_close_stream`, except that block mode operation does not
+incur all the stream related overhead.
+
+*************
+Vectored Mode
+*************
+
+The vectored mode runtime API, like the block mode API, consists of a single
+function: :c:func:`hs_scan_vector`. This function accepts an array of data
+pointers and lengths, facilitating the scanning in sequence of a set of data
+blocks that are not contiguous in memory.
+
+From the caller's perspective, this mode will produce the same matches as if
+the set of data blocks were (a) scanned in sequence with a series of streaming
+mode scans, or (b) copied in sequence into a single block of memory and then
+scanned in block mode.
+
+*************
+Scratch Space
+*************
+
+While scanning data, Hyperscan needs a small amount of temporary memory to store
+on-the-fly internal data. This amount is unfortunately too large to fit on the
+stack, particularly for embedded applications, and allocating memory dynamically
+is too expensive, so a pre-allocated "scratch" space must be provided to the
+scanning functions.
+
+The function :c:func:`hs_alloc_scratch` allocates a large enough region of
+scratch space to support a given database. If the application uses multiple
+databases, only a single scratch region is necessary: in this case, calling
+:c:func:`hs_alloc_scratch` on each database (with the same ``scratch`` pointer)
+will ensure that the scratch space is large enough to support scanning against
+any of the given databases.
+
+Importantly, only one such space is required per thread, and it can (and indeed
+should) be allocated before data scanning is to commence. In a scenario where a
+set of expressions is compiled by a single "master" thread and data will be
+scanned by multiple "worker" threads, the convenience function
+:c:func:`hs_clone_scratch` allows multiple copies of an existing scratch space
+to be made for each thread (rather than forcing the caller to pass all the
+compiled databases through :c:func:`hs_alloc_scratch` multiple times).
+
+For example:
+
+.. code-block:: c
+
+    hs_error_t err;
+    hs_scratch_t *scratch_prototype = NULL;
+    err = hs_alloc_scratch(db, &scratch_prototype);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+
+    hs_scratch_t *scratch_thread1 = NULL;
+    hs_scratch_t *scratch_thread2 = NULL;
+
+    err = hs_clone_scratch(scratch_prototype, &scratch_thread1);
+    if (err != HS_SUCCESS) {
+        printf("hs_clone_scratch failed!");
+        exit(1);
+    }
+    err = hs_clone_scratch(scratch_prototype, &scratch_thread2);
+    if (err != HS_SUCCESS) {
+        printf("hs_clone_scratch failed!");
+        exit(1);
+    }
+
+    hs_free_scratch(scratch_prototype);
+
+    /* Now two threads can both scan against database db,
+       each with its own scratch space. */
+
+While the Hyperscan library is re-entrant, the use of scratch spaces is not.
+For example, if by design it is deemed necessary to run recursive or nested
+scanning (say, from the match callback function), then an additional scratch
+space is required for that context.
+
+The easiest way to achieve this is to build up a single scratch space as a
+prototype, then clone it for each context, as shown in the example above.
+
+*****************
+Custom Allocators
+*****************
+
+By default, structures used by Hyperscan at runtime (scratch space, stream
+state, etc.) are allocated with the default system allocators, usually
+``malloc()`` and ``free()``.
+
+The Hyperscan API provides a facility for changing this behaviour to support
+applications that use custom memory allocators.
+
+These functions are:
+
+- :c:func:`hs_set_database_allocator`, which sets the allocate and free functions
+  used for compiled pattern databases.
+- :c:func:`hs_set_scratch_allocator`, which sets the allocate and free
+  functions used for scratch space.
+- :c:func:`hs_set_stream_allocator`, which sets the allocate and free functions
+  used for stream state in streaming mode.
+- :c:func:`hs_set_misc_allocator`, which sets the allocate and free functions
+  used for miscellaneous data, such as compile error structures and
+  informational strings.
+
+The :c:func:`hs_set_allocator` function can be used to set all of the custom
+allocators to the same allocate/free pair.