Initial commit of Hyperscan

2025-06-28 16:41:01 +03:00 · 2015-10-20 09:13:35 +11:00 · 2015-10-20 09:13:35 +11:00 · 904e436f11
commit 904e436f11
610 changed files with 213627 additions and 0 deletions
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,6 @@
+# clang-format settings: start from the LLVM preset and override the options
+# listed below.
+BasedOnStyle: LLVM
+IndentWidth: 4
+UseTab: false
+AllowShortIfStatementsOnASingleLine: false
+IndentCaseLabels: false
+AccessModifierOffset: -4
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,103 @@
+##
+## There are some more .gitignore files in subdirs, but this is the main place
+## to add new entries. These are mostly for the common case when ue2 is built
+## in place
+##
+
+# Autogenerated stuff that we don't want to know about
+.deps
+autom4te.cache
+autojunk
+.dirstamp
+
+# Temp and swap files
+*~
+.*.swp
+.sw?
+
+# compiler output and binaries
+*.a
+*.o
+*.lo
+*.la
+*.so
+*.pyc
+.libs
+bin
+
+# Merge files created by git.
+*.orig
+
+# sigs dir is handled externally
+signatures
+
+# ignore pcre symlink if it exists
+pcre
+# but not pcre subdirs!
+!pcre/
+
+# ignore boost symlink if it exists
+include/boost
+
+# ignore sqlite3 symlink if it exists
+sqlite3
+
+# Generated files
+src/config.h
+src/config.h.in
+src/hs_version.h
+src/fdr/fdr_autogen.c
+src/fdr/fdr_autogen_compiler.cpp
+src/fdr/teddy_autogen.c
+src/fdr/teddy_autogen_compiler.cpp
+src/parser/Parser.cpp
+
+# Generated PCRE files
+pcre/pcre_chartables.c
+pcre/pcregrep
+pcre/pcretest
+
+# Autoconf/automake/libtool noise
+Makefile
+Makefile.in
+aclocal.m4
+config.cache
+config.log
+config.status
+configure
+libhs.pc
+libtool
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+src/stamp-h1
+
+# Docs
+# doc/dev-reference/Makefile is not generated, so re-include it after the
+# blanket "Makefile" rule above. The note has to sit on its own line: git only
+# treats '#' as a comment marker at the start of a line, so a trailing
+# "# ..." would become part of the pattern and the negation would never match.
+!doc/dev-reference/Makefile
+doc/dev-reference/doxygen_sqlite3.db
+doc/dev-reference/doxygen_xml/
+doc/dev-reference/_build/
+
+# Autotools noise in pcre
+pcre/INSTALL
+pcre/Makefile
+pcre/Makefile.in
+pcre/aclocal.m4
+pcre/ar-lib
+pcre/compile
+pcre/config.*
+pcre/configure
+pcre/depcomp
+pcre/install-sh
+pcre/*.pc
+pcre/libtool
+pcre/ltmain.sh
+pcre/missing
+pcre/pcre-config
+pcre/pcre.h
+pcre/pcre_stringpiece.h
+pcre/pcrecpparg.h
+pcre/stamp-h1
+pcre/test-driver
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,944 @@
+# string(TIMESTAMP) and the OBJECT libraries / $<TARGET_OBJECTS:...> used in
+# this file are not available in CMake 2.8.0, so require a release that
+# provides them instead of failing later with an obscure error.
+cmake_minimum_required (VERSION 2.8.11)
+project (Hyperscan C CXX)
+
+# Library version; HS_VERSION is the dotted major.minor.patch string.
+set (HS_MAJOR_VERSION 4)
+set (HS_MINOR_VERSION 0)
+set (HS_PATCH_VERSION 0)
+set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
+
+# Configure-time date in YYYY-MM-DD form.
+string (TIMESTAMP BUILD_DATE "%Y-%m-%d")
+
+set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+# check_c_source_compiles() / check_cxx_source_compiles() are called directly
+# in this file, so include their modules explicitly rather than relying on
+# the compiler-flag modules pulling them in as a side effect.
+include(CheckCSourceCompiles)
+include(CheckCXXSourceCompiles)
+INCLUDE (CheckFunctionExists)
+INCLUDE (CheckIncludeFiles)
+INCLUDE (CheckIncludeFileCXX)
+INCLUDE (CheckLibraryExists)
+INCLUDE (CheckSymbolExists)
+include (CMakeDependentOption)
+# Project-local helper modules from the cmake/ directory.
+include (${CMAKE_MODULE_PATH}/platform.cmake)
+include (${CMAKE_MODULE_PATH}/ragel.cmake)
+
+# pkg-config is optional; PKG_CONFIG_FOUND is tested before libhs.pc is
+# generated and installed.
+find_package(PkgConfig QUIET)
+
+# Normalise the requested build type to upper case, or fall back to
+# RELWITHDEBINFO when the user did not ask for one.
+if (CMAKE_BUILD_TYPE)
+    string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE)
+    message(STATUS "Build type ${CMAKE_BUILD_TYPE}")
+else ()
+    message(STATUS "Default build type 'Release with debug info'")
+    set(CMAKE_BUILD_TYPE "RELWITHDEBINFO")
+endif ()
+
+# RELEASE_BUILD is TRUE when the build type matches RELEASE or
+# RELWITHDEBINFO and FALSE otherwise.
+set(RELEASE_BUILD FALSE)
+if (CMAKE_BUILD_TYPE MATCHES "RELEASE|RELWITHDEBINFO")
+    set(RELEASE_BUILD TRUE)
+endif ()
+
+# All build products are collected in two directories under the build tree.
+set(BINDIR "${PROJECT_BINARY_DIR}/bin")
+set(LIBDIR "${PROJECT_BINARY_DIR}/lib")
+
+# The unsuffixed variables cover the generic no-config case ...
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBDIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBDIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${BINDIR})
+# ... and the per-configuration variants cover multi-config builds
+# (e.g. msvc), pointing every configuration at the same directories.
+foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
+    string(TOUPPER "${OUTPUTCONFIG}" OUTPUTCONFIG)
+    set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${LIBDIR})
+    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${LIBDIR})
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${BINDIR})
+endforeach ()
+
+
+# Remember whether an Xcode project is being generated; XCODE is checked
+# again when the Boost include path is added.
+if (CMAKE_GENERATOR STREQUAL "Xcode")
+    set(XCODE TRUE)
+endif ()
+
+# Project headers: src/, the source root and the build tree (where the
+# configured headers are written). The in-tree include/ directory is added
+# as a system include path.
+include_directories(src . ${CMAKE_BINARY_DIR})
+include_directories(SYSTEM include)
+
+# FindBoost only reads the mixed-case "Boost_*" spelling of these switches;
+# the all-caps names are used for location hints such as BOOST_INCLUDEDIR.
+# With the all-caps spelling the settings below were silently ignored.
+set(Boost_USE_STATIC_LIBS OFF)
+set(Boost_USE_MULTITHREADED OFF)
+set(Boost_USE_STATIC_RUNTIME OFF)
+set(BOOST_MINVERSION 1.57.0)
+set(Boost_NO_BOOST_CMAKE ON)
+
+# first check for Boost installed on the system
+find_package(Boost ${BOOST_MINVERSION})
+if(NOT Boost_FOUND)
+    # we might have boost in tree, so provide a hint and try again
+    message(STATUS "trying include dir for boost")
+    set(BOOST_INCLUDEDIR ${CMAKE_SOURCE_DIR}/include)
+    find_package(Boost ${BOOST_MINVERSION})
+    if(NOT Boost_FOUND)
+        # Neither a system install nor in-tree headers were found: stop here.
+        message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system packages if available or extract Boost headers to ${CMAKE_SOURCE_DIR}/include")
+    endif()
+endif()
+
+# -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6)
+# Locate the external tools: a Python interpreter (mandatory) and the ragel
+# executable (only looked up here, not validated in this block).
+find_package(PythonInterp)
+find_program(RAGEL ragel)
+
+if (NOT PYTHONINTERP_FOUND)
+    message(FATAL_ERROR "No python interpreter found")
+endif ()
+# PYTHON holds the interpreter path reported by find_package(PythonInterp).
+set(PYTHON ${PYTHON_EXECUTABLE})
+
+# OPTIMISE selects the optimised (-O2/-O3) flag set further down; when it is
+# off the code is built with -O0. The help text previously said "Turns off",
+# which is the opposite of what the option does.
+option(OPTIMISE "Turns on compiler optimizations (on by default unless debug output enabled or coverage testing)" TRUE)
+
+option(DEBUG_OUTPUT "Enable debug output (warning: very verbose)" FALSE)
+
+# Debug output defines DEBUG for every target and switches optimisation off
+# for the rest of this configure run.
+if(DEBUG_OUTPUT)
+    add_definitions(-DDEBUG)
+    set(OPTIMISE FALSE)
+endif()
+
+# Library flavour switches: shared only, or shared alongside static.
+option(BUILD_SHARED_LIBS "Build shared libs instead of static" OFF)
+option(BUILD_STATIC_AND_SHARED "Build shared libs as well as static" OFF)
+
+# Any shared build is rejected on Windows and announced elsewhere.
+if (BUILD_SHARED_LIBS OR BUILD_STATIC_AND_SHARED)
+    if (NOT WIN32)
+        message(STATUS "Building shared libraries")
+    else ()
+        message(FATAL_ERROR "Windows DLLs currently not supported")
+    endif ()
+endif ()
+
+# for config: copy the *value* of OPTIMISE. Without the ${} expansion
+# HS_OPTIMIZE was set to the literal word "OPTIMISE", which is never a false
+# value, so the setting could not be switched off.
+# NOTE(review): presumably consumed by the configured config.h - confirm.
+set(HS_OPTIMIZE "${OPTIMISE}")
+
+# DUMP_SUPPORT is a user option defaulting to ON while RELEASE_BUILD is
+# false; in release builds it is forced OFF.
+cmake_dependent_option(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF)
+
+option(DISABLE_ASSERTS "Disable assert(); enabled in debug builds, disabled in release builds" FALSE)
+
+# NDEBUG is only added here for the DEBUG build type.
+# NOTE(review): presumably other build types already get NDEBUG from CMake's
+# default per-configuration flags - confirm.
+if (DISABLE_ASSERTS AND CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+    add_definitions(-DNDEBUG)
+endif ()
+
+option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF)
+
+# TODO: per platform config files?
+
+# TODO: windows generator on cmake always uses msvc, even if we plan to build with icc
+# Choose the base compiler flags. The MSVC/ICC-on-Windows branch edits
+# CMAKE_*_FLAGS directly; every other compiler accumulates its flags in
+# EXTRA_C_FLAGS / EXTRA_CXX_FLAGS.
+if(MSVC OR MSVC_IDE)
+    message(STATUS "Building for Windows")
+    if (MSVC_VERSION LESS 1700)
+        message(FATAL_ERROR "The project requires C++11 features.")
+    else()
+        if (WINDOWS_ICC)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99 /Qrestrict /QxHost /O3 /wd4267 /Qdiag-disable:remark")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qstd=c++11 /Qrestrict /QxHost /O2 /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
+        else()
+            #TODO: don't hardcode arch
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX /O2 /wd4267")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX /O2 /wd4244 /wd4267 /wd4800 /wd2586 /wd1170 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
+        endif()
+        # remove /RTC1 from the debug flag sets
+        string(REGEX REPLACE "/RTC1" ""
+            CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" )
+        string(REGEX REPLACE "/RTC1" ""
+            CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}" )
+
+    endif()
+
+else()
+
+    # compiler version checks TODO: test more compilers
+    if (CMAKE_COMPILER_IS_GNUCXX)
+        set (GNUCXX_MINVER "4.8.1")
+        # Use the version CMake already detected instead of scraping the
+        # output of "g++ --version": the old regex only accepted single-digit
+        # version components, so it could not parse e.g. 10.2.0 and the
+        # comparison below then ran against unparsed text.
+        set (GNUCXX_VERSION "${CMAKE_CXX_COMPILER_VERSION}")
+        message(STATUS "g++ version ${GNUCXX_VERSION}")
+        if (GNUCXX_VERSION VERSION_LESS ${GNUCXX_MINVER})
+            message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support")
+        endif()
+    endif()
+
+    # set compiler flags - more are tested and added later
+    set(EXTRA_C_FLAGS "-std=c99 -Wall -Wextra -Wshadow -Wcast-qual -Werror")
+    set(EXTRA_CXX_FLAGS "-std=c++11 -Wall -Wextra -Werror -Wno-shadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor")
+
+    # Only target the host CPU when the user has not already passed a
+    # -march option in the corresponding flags variable.
+    if (NOT CMAKE_C_FLAGS MATCHES .*march.*)
+        message(STATUS "Building for current host CPU")
+        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -march=native -mtune=native")
+    endif()
+    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.*)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -march=native -mtune=native")
+    endif()
+
+    if(CMAKE_COMPILER_IS_GNUCC)
+        # spurious warnings?
+        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
+    endif()
+
+    if(CMAKE_COMPILER_IS_GNUCXX)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0 -Wno-unused-local-typedefs -Wno-maybe-uninitialized")
+    endif()
+
+    # The optimisation level goes first so the remaining flags follow it.
+    if(OPTIMISE)
+        set(EXTRA_C_FLAGS "-O3 ${EXTRA_C_FLAGS}")
+        set(EXTRA_CXX_FLAGS "-O2 ${EXTRA_CXX_FLAGS}")
+    else()
+        set(EXTRA_C_FLAGS "-O0 ${EXTRA_C_FLAGS}")
+        set(EXTRA_CXX_FLAGS "-O0 ${EXTRA_CXX_FLAGS}")
+    endif()
+
+    # Non-release builds keep the frame pointer.
+    if(NOT RELEASE_BUILD)
+        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
+    endif()
+
+endif()
+
+# Header availability probes; the C and C++ compilers are checked separately
+# for the intrinsics headers.
+check_include_files(unistd.h HAVE_UNISTD_H)
+check_include_files(intrin.h HAVE_C_INTRIN_H)
+check_include_file_cxx(intrin.h HAVE_CXX_INTRIN_H)
+check_include_files(tmmintrin.h HAVE_TMMINTRIN_H)
+check_include_files(x86intrin.h HAVE_C_X86INTRIN_H)
+check_include_file_cxx(x86intrin.h HAVE_CXX_X86INTRIN_H)
+
+# Aligned allocation functions.
+check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
+check_function_exists(_aligned_malloc HAVE__ALIGNED_MALLOC)
+
+# these end up in the config file
+check_c_compiler_flag(-fvisibility=hidden HAS_C_HIDDEN)
+check_cxx_compiler_flag(-fvisibility=hidden HAS_CXX_HIDDEN)
+
+# testing a builtin takes a little more work: compile a tiny translation
+# unit that calls it, once per language
+check_c_source_compiles("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
+check_cxx_source_compiles("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
+
+if (NOT WIN32)
+    # Optional C warnings: each one is probed and appended to EXTRA_C_FLAGS
+    # only when the C compiler accepts it.
+    set(C_FLAGS_TO_CHECK
+        # Variable length arrays are way bad, most especially at run time
+        "-Wvla"
+        # Pointer arithmetic on void pointers is doing it wrong.
+        "-Wpointer-arith"
+        # Build our C code with -Wstrict-prototypes -Wmissing-prototypes
+        "-Wstrict-prototypes"
+        "-Wmissing-prototypes"
+    )
+    foreach (FLAG ${C_FLAGS_TO_CHECK})
+        # munge the flag into a usable variable name (dashes -> underscores)
+        string(REPLACE "-" "_" FNAME "C_FLAG${FLAG}")
+        check_c_compiler_flag("${FLAG}" ${FNAME})
+        if (${FNAME})
+            set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} ${FLAG}")
+        endif ()
+    endforeach ()
+
+    # Same procedure for the C++ compiler and EXTRA_CXX_FLAGS.
+    set(CXX_FLAGS_TO_CHECK
+        "-Wvla"
+        "-Wpointer-arith"
+    )
+    foreach (FLAG ${CXX_FLAGS_TO_CHECK})
+        string(REPLACE "-" "_" FNAME "CXX_FLAG${FLAG}")
+        check_cxx_compiler_flag("${FLAG}" ${FNAME})
+        if (${FNAME})
+            set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} ${FLAG}")
+        endif ()
+    endforeach ()
+
+    # self-assign should be thrown away, but clang whinges; the positive
+    # form of the warning is probed and the -Wno- form is what gets added
+    check_c_compiler_flag("-Wself-assign" CC_SELF_ASSIGN)
+    if (CC_SELF_ASSIGN)
+        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-self-assign")
+    endif ()
+    check_cxx_compiler_flag("-Wself-assign" CXX_SELF_ASSIGN)
+    if (CXX_SELF_ASSIGN)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-self-assign")
+    endif ()
+
+    # clang gets up in our face for going paren crazy with macros
+    check_c_compiler_flag("-Wparentheses-equality" CC_PAREN_EQUALITY)
+    if (CC_PAREN_EQUALITY)
+        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-parentheses-equality")
+    endif ()
+
+    # clang complains about unused const vars in our Ragel-generated code.
+    check_cxx_compiler_flag("-Wunused-const-variable" CXX_UNUSED_CONST_VAR)
+    if (CXX_UNUSED_CONST_VAR)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable")
+    endif ()
+
+    # note this for later
+    # g++ doesn't have this flag but clang does
+    check_cxx_compiler_flag("-Wweak-vtables" CXX_WEAK_VTABLES)
+    if (CXX_WEAK_VTABLES)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wweak-vtables")
+    endif ()
+
+    check_cxx_compiler_flag("-Wmissing-declarations" CXX_MISSING_DECLARATIONS)
+    if (CXX_MISSING_DECLARATIONS)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wmissing-declarations")
+    endif ()
+
+    # gcc5 complains about this; only the probe result is recorded here
+    check_cxx_compiler_flag("-Wunused-variable" CXX_WUNUSED_VARIABLE)
+
+endif ()
+
+# Add the Boost headers as a system include path.
+if (XCODE)
+    # cmake doesn't think Xcode supports isystem, so pass the flag by hand
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${Boost_INCLUDE_DIR}")
+else ()
+    include_directories(SYSTEM ${Boost_INCLUDE_DIR})
+endif ()
+
+
+# Record whether CMAKE_SYSTEM_NAME identifies Linux or FreeBSD.
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    set(LINUX TRUE)
+endif ()
+
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    set(FREEBSD true)
+endif ()
+
+# Diagnostic tuning for the Intel compiler outside Windows, applied per
+# language because the C and C++ compilers are identified independently.
+if (NOT WIN32)
+    if (CMAKE_C_COMPILER_ID MATCHES "Intel")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -diag-error 10006 -diag-disable 177 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable=remark")
+    endif ()
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -diag-error 10006 -diag-disable 177 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable 1170 -diag-disable 3373 -diag-disable=remark")
+    endif ()
+endif ()
+
+# Sub-projects. These are added before EXTRA_C_FLAGS / EXTRA_CXX_FLAGS are
+# folded into CMAKE_C_FLAGS / CMAKE_CXX_FLAGS further down in this file.
+add_subdirectory(util)
+add_subdirectory(unit)
+add_subdirectory(doc/dev-reference)
+# tools/ is optional and only added when the directory exists
+if (EXISTS ${CMAKE_SOURCE_DIR}/tools)
+    add_subdirectory(tools)
+endif()
+
+# do substitutions: both generated headers are written into the build tree
+configure_file(${CMAKE_MODULE_PATH}/config.h.in ${CMAKE_BINARY_DIR}/config.h)
+configure_file(src/hs_version.h.in hs_version.h)
+
+if (PKG_CONFIG_FOUND)
+    # we really only need to do this if we have pkg-config
+    configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars
+    # A relative DESTINATION is resolved against the install prefix at
+    # install time. This matches the other install() rules in this file
+    # ("lib", "include/hs") and keeps the .pc file next to the libraries if
+    # the prefix is overridden when installing or packaging.
+    install(FILES ${CMAKE_BINARY_DIR}/libhs.pc
+            DESTINATION "lib/pkgconfig")
+endif()
+
+# only set these after all tests are done
+# (the compiler probes above therefore ran without the EXTRA_* flags)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
+
+
+# include the autogen targets
+add_subdirectory(src/fdr)
+
+# make files generated under src/fdr in the build tree visible to #include
+include_directories(${CMAKE_BINARY_DIR}/src/fdr)
+
+# RAGEL_C_FLAGS is left unset on WIN32, so COMPILE_FLAGS below is empty there
+if(NOT WIN32)
+set(RAGEL_C_FLAGS "-Wno-unused")
+endif()
+
+# extra compile flags for the generated parser source only
+set_source_files_properties(
+    ${CMAKE_BINARY_DIR}/src/parser/Parser.cpp
+    PROPERTIES
+        COMPILE_FLAGS "${RAGEL_C_FLAGS}")
+
+# NOTE(review): ragelmaker is presumably provided by cmake/ragel.cmake
+# (included near the top of this file) and presumably creates the
+# ragel_Parser target the hs libraries depend on - confirm.
+ragelmaker(src/parser/Parser.rl)
+
+# Public API headers; they are installed under include/hs and also listed
+# in the library source lists.
+set(hs_HEADERS
+    src/hs.h
+    src/hs_common.h
+    src/hs_compile.h
+    src/hs_runtime.h
+)
+install(
+    FILES ${hs_HEADERS}
+    DESTINATION include/hs
+)
+
+# Targets that the hs_exec object libraries are made to depend on (see the
+# add_dependencies() calls where those libraries are declared).
+# NOTE(review): presumably defined by src/fdr - confirm.
+set(fdr_autogen_targets autogen_runtime autogen_teddy_runtime)
+
+# Sources and headers compiled into the hs_exec object libraries, whose
+# objects are linked into both the hs_runtime and the hs libraries.
+set (hs_exec_SRCS
+    ${hs_HEADERS}
+    src/hs_version.h
+    src/ue2common.h
+    src/alloc.c
+    src/allocator.h
+    src/runtime.c
+    src/fdr/fdr.c
+    src/fdr/fdr.h
+    src/fdr/fdr_internal.h
+    src/fdr/fdr_confirm.h
+    src/fdr/fdr_confirm_runtime.h
+    src/fdr/fdr_streaming_runtime.h
+    src/fdr/flood_runtime.h
+    src/fdr/fdr_loadval.h
+    src/fdr/teddy.c
+    src/fdr/teddy_internal.h
+    src/hwlm/hwlm.c
+    src/hwlm/hwlm.h
+    src/hwlm/hwlm_internal.h
+    src/hwlm/noodle_engine.c
+    src/hwlm/noodle_engine.h
+    src/hwlm/noodle_internal.h
+    src/nfa/accel.c
+    src/nfa/accel.h
+    src/nfa/castle.c
+    src/nfa/castle.h
+    src/nfa/castle_internal.h
+    src/nfa/gough.c
+    src/nfa/gough_internal.h
+    src/nfa/lbr.c
+    src/nfa/lbr.h
+    src/nfa/lbr_common_impl.h
+    src/nfa/lbr_internal.h
+    src/nfa/mcclellan.c
+    src/nfa/mcclellan.h
+    src/nfa/mcclellan_common_impl.h
+    src/nfa/mcclellan_internal.h
+    src/nfa/limex_accel.c
+    src/nfa/limex_accel.h
+    src/nfa/limex_exceptional.h
+    src/nfa/limex_native.c
+    src/nfa/limex_ring.h
+    src/nfa/limex_simd128.c
+    src/nfa/limex_simd256.c
+    src/nfa/limex_simd384.c
+    src/nfa/limex_simd512a.c
+    src/nfa/limex_simd512b.c
+    src/nfa/limex_simd512c.c
+    src/nfa/limex.h
+    src/nfa/limex_common_impl.h
+    src/nfa/limex_context.h
+    src/nfa/limex_internal.h
+    src/nfa/limex_runtime.h
+    src/nfa/limex_runtime_impl.h
+    src/nfa/limex_state_impl.h
+    src/nfa/mpv.h
+    src/nfa/mpv.c
+    src/nfa/mpv_internal.h
+    src/nfa/nfa_api.h
+    src/nfa/nfa_api_dispatch.c
+    src/nfa/nfa_internal.h
+    src/nfa/nfa_rev_api.h
+    src/nfa/repeat.c
+    src/nfa/repeat.h
+    src/nfa/repeat_internal.h
+    src/nfa/shufti.c
+    src/nfa/shufti.h
+    src/nfa/truffle.c
+    src/nfa/truffle.h
+    src/nfa/vermicelli.h
+    src/nfa/vermicelli_run.h
+    src/nfa/vermicelli_sse.h
+    src/sidecar/sidecar.c
+    src/sidecar/sidecar.h
+    src/sidecar/sidecar_generic.h
+    src/sidecar/sidecar_internal.h
+    src/sidecar/sidecar_shufti.c
+    src/sidecar/sidecar_shufti.h
+    src/som/som.h
+    src/som/som_runtime.h
+    src/som/som_runtime.c
+    src/som/som_stream.c
+    src/som/som_stream.h
+    src/rose/block.c
+    src/rose/catchup.h
+    src/rose/catchup.c
+    src/rose/eod.c
+    src/rose/infix.h
+    src/rose/init.h
+    src/rose/init.c
+    src/rose/stream.c
+    src/rose/match.h
+    src/rose/match.c
+    src/rose/miracle.h
+    src/rose/runtime.h
+    src/rose/rose_sidecar_runtime.h
+    src/rose/rose.h
+    src/rose/rose_internal.h
+    src/rose/rose_types.h
+    src/rose/rose_common.h
+    src/util/bitutils.h
+    src/util/exhaust.h
+    src/util/fatbit.h
+    src/util/fatbit.c
+    src/util/join.h
+    src/util/masked_move.c
+    src/util/masked_move.h
+    src/util/multibit.h
+    src/util/multibit_internal.h
+    src/util/multibit.c
+    src/util/pack_bits.h
+    src/util/popcount.h
+    src/util/pqueue.h
+    src/util/scatter.h
+    src/util/scatter_runtime.h
+    src/util/shuffle.h
+    src/util/shuffle_ssse3.h
+    src/util/simd_utils.h
+    src/util/simd_utils_ssse3.h
+    src/util/state_compress.h
+    src/util/state_compress.c
+    src/util/unaligned.h
+    src/util/uniform_ops.h
+    src/scratch.h
+    src/scratch.c
+    src/crc32.c
+    src/crc32.h
+    src/database.c
+    src/database.h
+)
+
+
+# Sources and headers compiled directly into the hs library (static, and
+# shared when enabled), in addition to the hs_exec objects. The dump sources
+# are appended to this list when DUMP_SUPPORT is on.
+SET (hs_SRCS
+    ${hs_HEADERS}
+    src/crc32.h
+    src/database.h
+    src/grey.cpp
+    src/grey.h
+    src/hs.cpp
+    src/hs_internal.h
+    src/hs_version.c
+    src/hs_version.h
+    src/scratch.h
+    src/state.h
+    src/ue2common.h
+    src/compiler/asserts.cpp
+    src/compiler/asserts.h
+    src/compiler/compiler.cpp
+    src/compiler/compiler.h
+    src/compiler/error.cpp
+    src/compiler/error.h
+    src/fdr/engine_description.cpp
+    src/fdr/engine_description.h
+    src/fdr/fdr_compile.cpp
+    src/fdr/fdr_compile.h
+    src/fdr/fdr_compile_internal.h
+    src/fdr/fdr_compile_util.cpp
+    src/fdr/fdr_confirm_compile.cpp
+    src/fdr/fdr_confirm.h
+    src/fdr/fdr_engine_description.cpp
+    src/fdr/fdr_engine_description.h
+    src/fdr/fdr_internal.h
+    src/fdr/fdr_streaming_compile.cpp
+    src/fdr/fdr_streaming_internal.h
+    src/fdr/flood_compile.cpp
+    src/fdr/teddy_compile.cpp
+    src/fdr/teddy_compile.h
+    src/fdr/teddy_engine_description.cpp
+    src/fdr/teddy_engine_description.h
+    src/fdr/teddy_internal.h
+    src/hwlm/hwlm_build.cpp
+    src/hwlm/hwlm_build.h
+    src/hwlm/hwlm_internal.h
+    src/hwlm/hwlm_literal.cpp
+    src/hwlm/hwlm_literal.h
+    src/hwlm/noodle_build.cpp
+    src/hwlm/noodle_build.h
+    src/hwlm/noodle_internal.h
+    src/nfa/accel.h
+    src/nfa/accelcompile.cpp
+    src/nfa/accelcompile.h
+    src/nfa/callback.h
+    src/nfa/castlecompile.cpp
+    src/nfa/castlecompile.h
+    src/nfa/dfa_min.cpp
+    src/nfa/dfa_min.h
+    src/nfa/goughcompile.cpp
+    src/nfa/goughcompile.h
+    src/nfa/goughcompile_accel.cpp
+    src/nfa/goughcompile_internal.h
+    src/nfa/goughcompile_reg.cpp
+    src/nfa/mcclellan.h
+    src/nfa/mcclellan_internal.h
+    src/nfa/mcclellancompile.cpp
+    src/nfa/mcclellancompile.h
+    src/nfa/mcclellancompile_util.cpp
+    src/nfa/mcclellancompile_util.h
+    src/nfa/limex_compile.cpp
+    src/nfa/limex_compile.h
+    src/nfa/limex_accel.h
+    src/nfa/limex_internal.h
+    src/nfa/mpv_internal.h
+    src/nfa/mpvcompile.cpp
+    src/nfa/mpvcompile.h
+    src/nfa/nfa_api.h
+    src/nfa/nfa_api_queue.h
+    src/nfa/nfa_api_util.h
+    src/nfa/nfa_build_util.cpp
+    src/nfa/nfa_build_util.h
+    src/nfa/nfa_internal.h
+    src/nfa/nfa_kind.h
+    src/nfa/rdfa.h
+    src/nfa/rdfa_merge.cpp
+    src/nfa/rdfa_merge.h
+    src/nfa/repeat_internal.h
+    src/nfa/repeatcompile.cpp
+    src/nfa/repeatcompile.h
+    src/nfa/shufticompile.cpp
+    src/nfa/shufticompile.h
+    src/nfa/trufflecompile.cpp
+    src/nfa/trufflecompile.h
+    src/nfagraph/ng.cpp
+    src/nfagraph/ng.h
+    src/nfagraph/ng_anchored_acyclic.cpp
+    src/nfagraph/ng_anchored_acyclic.h
+    src/nfagraph/ng_anchored_dots.cpp
+    src/nfagraph/ng_anchored_dots.h
+    src/nfagraph/ng_asserts.cpp
+    src/nfagraph/ng_asserts.h
+    src/nfagraph/ng_builder.cpp
+    src/nfagraph/ng_builder.h
+    src/nfagraph/ng_calc_components.cpp
+    src/nfagraph/ng_calc_components.h
+    src/nfagraph/ng_cyclic_redundancy.cpp
+    src/nfagraph/ng_cyclic_redundancy.h
+    src/nfagraph/ng_depth.cpp
+    src/nfagraph/ng_depth.h
+    src/nfagraph/ng_dominators.cpp
+    src/nfagraph/ng_dominators.h
+    src/nfagraph/ng_edge_redundancy.cpp
+    src/nfagraph/ng_edge_redundancy.h
+    src/nfagraph/ng_equivalence.cpp
+    src/nfagraph/ng_equivalence.h
+    src/nfagraph/ng_execute.cpp
+    src/nfagraph/ng_execute.h
+    src/nfagraph/ng_expr_info.cpp
+    src/nfagraph/ng_expr_info.h
+    src/nfagraph/ng_extparam.cpp
+    src/nfagraph/ng_extparam.h
+    src/nfagraph/ng_fixed_width.cpp
+    src/nfagraph/ng_fixed_width.h
+    src/nfagraph/ng_graph.h
+    src/nfagraph/ng_haig.cpp
+    src/nfagraph/ng_haig.h
+    src/nfagraph/ng_holder.cpp
+    src/nfagraph/ng_holder.h
+    src/nfagraph/ng_is_equal.cpp
+    src/nfagraph/ng_is_equal.h
+    src/nfagraph/ng_lbr.cpp
+    src/nfagraph/ng_lbr.h
+    src/nfagraph/ng_literal_analysis.cpp
+    src/nfagraph/ng_literal_analysis.h
+    src/nfagraph/ng_literal_component.cpp
+    src/nfagraph/ng_literal_component.h
+    src/nfagraph/ng_literal_decorated.cpp
+    src/nfagraph/ng_literal_decorated.h
+    src/nfagraph/ng_mcclellan.cpp
+    src/nfagraph/ng_mcclellan.h
+    src/nfagraph/ng_mcclellan_internal.h
+    src/nfagraph/ng_limex.cpp
+    src/nfagraph/ng_limex.h
+    src/nfagraph/ng_limex_accel.cpp
+    src/nfagraph/ng_limex_accel.h
+    src/nfagraph/ng_misc_opt.cpp
+    src/nfagraph/ng_misc_opt.h
+    src/nfagraph/ng_netflow.cpp
+    src/nfagraph/ng_netflow.h
+    src/nfagraph/ng_prefilter.cpp
+    src/nfagraph/ng_prefilter.h
+    src/nfagraph/ng_prune.cpp
+    src/nfagraph/ng_prune.h
+    src/nfagraph/ng_puff.cpp
+    src/nfagraph/ng_puff.h
+    src/nfagraph/ng_redundancy.cpp
+    src/nfagraph/ng_redundancy.h
+    src/nfagraph/ng_region.cpp
+    src/nfagraph/ng_region.h
+    src/nfagraph/ng_region_redundancy.cpp
+    src/nfagraph/ng_region_redundancy.h
+    src/nfagraph/ng_repeat.cpp
+    src/nfagraph/ng_repeat.h
+    src/nfagraph/ng_reports.cpp
+    src/nfagraph/ng_reports.h
+    src/nfagraph/ng_restructuring.cpp
+    src/nfagraph/ng_restructuring.h
+    src/nfagraph/ng_revacc.cpp
+    src/nfagraph/ng_revacc.h
+    src/nfagraph/ng_rose.cpp
+    src/nfagraph/ng_rose.h
+    src/nfagraph/ng_sep.cpp
+    src/nfagraph/ng_sep.h
+    src/nfagraph/ng_small_literal_set.cpp
+    src/nfagraph/ng_small_literal_set.h
+    src/nfagraph/ng_som.cpp
+    src/nfagraph/ng_som.h
+    src/nfagraph/ng_som_add_redundancy.cpp
+    src/nfagraph/ng_som_add_redundancy.h
+    src/nfagraph/ng_som_util.cpp
+    src/nfagraph/ng_som_util.h
+    src/nfagraph/ng_split.cpp
+    src/nfagraph/ng_split.h
+    src/nfagraph/ng_squash.cpp
+    src/nfagraph/ng_squash.h
+    src/nfagraph/ng_stop.cpp
+    src/nfagraph/ng_stop.h
+    src/nfagraph/ng_uncalc_components.cpp
+    src/nfagraph/ng_uncalc_components.h
+    src/nfagraph/ng_undirected.h
+    src/nfagraph/ng_utf8.cpp
+    src/nfagraph/ng_utf8.h
+    src/nfagraph/ng_util.cpp
+    src/nfagraph/ng_util.h
+    src/nfagraph/ng_vacuous.cpp
+    src/nfagraph/ng_vacuous.h
+    src/nfagraph/ng_width.cpp
+    src/nfagraph/ng_width.h
+    src/parser/AsciiComponentClass.cpp
+    src/parser/AsciiComponentClass.h
+    src/parser/Component.cpp
+    src/parser/Component.h
+    src/parser/ComponentAlternation.cpp
+    src/parser/ComponentAlternation.h
+    src/parser/ComponentAssertion.cpp
+    src/parser/ComponentAssertion.h
+    src/parser/ComponentAtomicGroup.cpp
+    src/parser/ComponentAtomicGroup.h
+    src/parser/ComponentBackReference.cpp
+    src/parser/ComponentBackReference.h
+    src/parser/ComponentBoundary.cpp
+    src/parser/ComponentBoundary.h
+    src/parser/ComponentByte.cpp
+    src/parser/ComponentByte.h
+    src/parser/ComponentClass.cpp
+    src/parser/ComponentClass.h
+    src/parser/ComponentCondReference.cpp
+    src/parser/ComponentCondReference.h
+    src/parser/ComponentEUS.cpp
+    src/parser/ComponentEUS.h
+    src/parser/ComponentEmpty.cpp
+    src/parser/ComponentEmpty.h
+    src/parser/ComponentRepeat.cpp
+    src/parser/ComponentRepeat.h
+    src/parser/ComponentSequence.cpp
+    src/parser/ComponentSequence.h
+    src/parser/ComponentVisitor.cpp
+    src/parser/ComponentVisitor.h
+    src/parser/ComponentWordBoundary.cpp
+    src/parser/ComponentWordBoundary.h
+    src/parser/ConstComponentVisitor.cpp
+    src/parser/ConstComponentVisitor.h
+    src/parser/Parser.cpp
+    src/parser/Parser.h
+    src/parser/Utf8ComponentClass.cpp
+    src/parser/Utf8ComponentClass.h
+    src/parser/buildstate.cpp
+    src/parser/buildstate.h
+    src/parser/check_refs.cpp
+    src/parser/check_refs.h
+    src/parser/parse_error.cpp
+    src/parser/parse_error.h
+    src/parser/parser_util.cpp
+    src/parser/position.h
+    src/parser/position_info.h
+    src/parser/prefilter.cpp
+    src/parser/prefilter.h
+    src/parser/shortcut_literal.cpp
+    src/parser/shortcut_literal.h
+    src/parser/ucp_table.cpp
+    src/parser/ucp_table.h
+    src/parser/unsupported.cpp
+    src/parser/unsupported.h
+    src/parser/utf8_validate.h
+    src/parser/utf8_validate.cpp
+    src/sidecar/sidecar_compile.cpp
+    src/sidecar/sidecar_compile.h
+    src/smallwrite/smallwrite_build.cpp
+    src/smallwrite/smallwrite_build.h
+    src/smallwrite/smallwrite_internal.h
+    src/som/slot_manager.cpp
+    src/som/slot_manager.h
+    src/som/slot_manager_internal.h
+    src/som/som.h
+    src/rose/rose_build.h
+    src/rose/rose_build_add.cpp
+    src/rose/rose_build_add_internal.h
+    src/rose/rose_build_add_mask.cpp
+    src/rose/rose_build_anchored.cpp
+    src/rose/rose_build_anchored.h
+    src/rose/rose_build_bytecode.cpp
+    src/rose/rose_build_compile.cpp
+    src/rose/rose_build_convert.cpp
+    src/rose/rose_build_convert.h
+    src/rose/rose_build_impl.h
+    src/rose/rose_build_infix.cpp
+    src/rose/rose_build_infix.h
+    src/rose/rose_build_lookaround.cpp
+    src/rose/rose_build_lookaround.h
+    src/rose/rose_build_merge.cpp
+    src/rose/rose_build_merge.h
+    src/rose/rose_build_misc.cpp
+    src/rose/rose_build_role_aliasing.cpp
+    src/rose/rose_build_scatter.cpp
+    src/rose/rose_build_scatter.h
+    src/rose/rose_build_util.h
+    src/rose/rose_build_width.cpp
+    src/rose/rose_build_width.h
+    src/rose/rose_graph.h
+    src/rose/rose_in_graph.h
+    src/rose/rose_in_util.cpp
+    src/rose/rose_in_util.h
+    src/util/alloc.cpp
+    src/util/alloc.h
+    src/util/bitfield.h
+    src/util/boundary_reports.h
+    src/util/charreach.cpp
+    src/util/charreach.h
+    src/util/charreach_util.h
+    src/util/compare.h
+    src/util/compile_context.cpp
+    src/util/compile_context.h
+    src/util/compile_error.cpp
+    src/util/compile_error.h
+    src/util/container.h
+    src/util/cpuid_flags.c
+    src/util/cpuid_flags.h
+    src/util/depth.cpp
+    src/util/depth.h
+    src/util/determinise.h
+    src/util/dump_mask.cpp
+    src/util/dump_mask.h
+    src/util/graph.h
+    src/util/internal_report.h
+    src/util/multibit_build.cpp
+    src/util/multibit_build.h
+    src/util/order_check.h
+    src/util/partial_store.h
+    src/util/partitioned_set.h
+    src/util/popcount.h
+    src/util/queue_index_factory.h
+    src/util/report.cpp
+    src/util/report.h
+    src/util/report_manager.cpp
+    src/util/report_manager.h
+    src/util/simd_utils.h
+    src/util/simd_utils_ssse3.h
+    src/util/target_info.cpp
+    src/util/target_info.h
+    src/util/ue2_containers.h
+    src/util/ue2string.cpp
+    src/util/ue2string.h
+    src/util/unaligned.h
+    src/util/unicode_def.h
+    src/util/unicode_set.h
+    src/util/uniform_ops.h
+    src/util/verify_types.h
+)
+
+# Dump-support sources; these are only added to hs_SRCS when DUMP_SUPPORT
+# is enabled.
+set(hs_dump_SRCS
+    src/scratch_dump.cpp
+    src/scratch_dump.h
+    src/fdr/fdr_dump.cpp
+    src/hwlm/hwlm_dump.cpp
+    src/hwlm/hwlm_dump.h
+    src/nfa/accel_dump.cpp
+    src/nfa/accel_dump.h
+    src/nfa/castle_dump.cpp
+    src/nfa/castle_dump.h
+    src/nfagraph/ng_dump.cpp
+    src/nfagraph/ng_dump.h
+    src/nfa/goughcompile_dump.cpp
+    src/nfa/goughcompile_dump.h
+    src/nfa/goughdump.cpp
+    src/nfa/goughdump.h
+    src/nfa/lbr_dump.cpp
+    src/nfa/limex_dump.cpp
+    src/nfa/mcclellandump.cpp
+    src/nfa/mcclellandump.h
+    src/nfa/mpv_dump.cpp
+    src/nfa/nfa_dump_api.h
+    src/nfa/nfa_dump_dispatch.cpp
+    src/nfa/nfa_dump_internal.cpp
+    src/nfa/nfa_dump_internal.h
+    src/parser/dump.cpp
+    src/parser/dump.h
+    src/parser/position_dump.h
+    src/sidecar/sidecar_dump.cpp
+    src/sidecar/sidecar_dump.h
+    src/smallwrite/smallwrite_dump.cpp
+    src/smallwrite/smallwrite_dump.h
+    src/som/slot_manager_dump.cpp
+    src/som/slot_manager_dump.h
+    src/rose/rose_build_dump.cpp
+    src/rose/rose_build_dump.h
+    src/rose/rose_in_dump.cpp
+    src/rose/rose_in_dump.h
+    src/rose/rose_dump.cpp
+    src/rose/rose_dump.h
+    src/util/dump_charclass.cpp
+    src/util/dump_charclass.h
+)
+
+# With dump support enabled the dump sources become part of the hs library.
+if (DUMP_SUPPORT)
+    list(APPEND hs_SRCS ${hs_dump_SRCS})
+endif ()
+
+# we group things by sublibraries, specifying shared and static and then
+# choose which ones to build
+
+# Version numbers applied to the shared libraries declared below.
+set (LIB_VERSION ${HS_VERSION})
+set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION})
+
+# Object library with the hs_exec sources; the static libraries reuse its
+# objects.
+add_library(hs_exec OBJECT ${hs_exec_SRCS})
+add_dependencies(hs_exec ${fdr_autogen_targets})
+
+# Second build of the same sources as position-independent code, consumed
+# by the shared libraries.
+if (BUILD_SHARED_LIBS OR BUILD_STATIC_AND_SHARED)
+    add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
+    add_dependencies(hs_exec_shared ${fdr_autogen_targets})
+    set_target_properties(hs_exec_shared
+        PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+endif ()
+
+# hs_version.c is added explicitly to avoid some build systems that refuse to
+# create a lib without any src (I'm looking at you Xcode)
+
+add_library(hs_runtime STATIC
+    src/hs_version.c
+    $<TARGET_OBJECTS:hs_exec>)
+set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
+
+# The static runtime is installed unless BUILD_SHARED_LIBS is set.
+if (NOT BUILD_SHARED_LIBS)
+    install(TARGETS hs_runtime DESTINATION lib)
+endif ()
+
+# Shared runtime: built from the position-independent objects and given the
+# same output name as the static library.
+if (BUILD_SHARED_LIBS OR BUILD_STATIC_AND_SHARED)
+    add_library(hs_runtime_shared SHARED
+        src/hs_version.c
+        $<TARGET_OBJECTS:hs_exec_shared>)
+    set_target_properties(hs_runtime_shared PROPERTIES
+        OUTPUT_NAME hs_runtime
+        VERSION ${LIB_VERSION}
+        SOVERSION ${LIB_SOVERSION}
+        LINKER_LANGUAGE C
+        MACOSX_RPATH ON)
+    install(TARGETS hs_runtime_shared DESTINATION lib)
+endif ()
+
+# we want the static lib for testing
+add_library(hs STATIC
+    ${hs_SRCS}
+    $<TARGET_OBJECTS:hs_exec>)
+
+# The generated parser and the fdr/teddy autogen targets must be built
+# before the library itself.
+add_dependencies(hs ragel_Parser)
+add_dependencies(hs autogen_compiler autogen_teddy_compiler)
+
+if (NOT BUILD_SHARED_LIBS)
+    install(TARGETS hs DESTINATION lib)
+endif ()
+
+# Shared flavour of the hs library, with the same build-order dependencies.
+if (BUILD_SHARED_LIBS OR BUILD_STATIC_AND_SHARED)
+    add_library(hs_shared SHARED
+        ${hs_SRCS}
+        $<TARGET_OBJECTS:hs_exec_shared>)
+    add_dependencies(hs_shared ragel_Parser)
+    add_dependencies(hs_shared autogen_compiler autogen_teddy_compiler)
+    set_target_properties(hs_shared PROPERTIES
+        VERSION ${LIB_VERSION}
+        SOVERSION ${LIB_SOVERSION}
+        OUTPUT_NAME hs
+        MACOSX_RPATH ON)
+    install(TARGETS hs_shared DESTINATION lib)
+endif ()
+
+# The examples are skipped on Windows.
+if (NOT WIN32)
+    add_subdirectory(examples)
+endif ()
--- a/26
+++ b/26
@ -0,0 +1,26 @@
+Copyright (c) 2015, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
--- a/118
+++ b/118
@ -0,0 +1,118 @@
+Hyperscan is licensed under the BSD License.
+
+Copyright (c) 2015, Intel Corporation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  * Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of Intel Corporation nor the names of its contributors
+    may be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+--------------------------------------------------------------------------------
+
+This product also contains code from third parties, under the following
+licenses:
+
+Intel's Slicing-by-8 CRC32 implementation
+-----------------------------------------
+
+Copyright (c) 2004-2006, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  * Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Boost C++ Headers Library
+-------------------------
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+
+The Google C++ Testing Framework (Google Test)
+----------------------------------------------
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
--- a/README.md
+++ b/README.md
@ -0,0 +1,22 @@
+# Hyperscan
+
+Hyperscan is a high-performance multiple regex matching library. It follows the
+regular expression syntax of the commonly-used libpcre library, but is a
+standalone library with its own C API.
+
+Hyperscan uses hybrid automata techniques to allow simultaneous matching of
+large numbers (up to tens of thousands) of regular expressions and for the
+matching of regular expressions across streams of data.
+
+Hyperscan is typically used in a deep packet inspection (DPI) library stack.
+
+# Documentation
+
+Information on building the Hyperscan library and using its API is available in
+the [Developer Reference Guide](http://01org.github.io/hyperscan/dev-reference/).
+
+# License
+
+Hyperscan is licensed under the BSD License. See the LICENSE file in the
+project repository.
+
--- a/cmake/backtrace.cmake
+++ b/cmake/backtrace.cmake
@ -0,0 +1,54 @@
+# backtrace() ships with glibc on Linux; on FreeBSD it is only available when
+# the 'libexecinfo' package is installed.
+
+# First probe: does a call to backtrace() build without any extra libraries?
+check_c_source_compiles(
+    "#include <stdlib.h>\n#include <execinfo.h>\nint main () { backtrace(NULL, 0); }"
+    BACKTRACE_LIBC)
+
+if (BACKTRACE_LIBC)
+    # Found in the C library itself: no extra compile or link flags required.
+    set(BACKTRACE_LDFLAGS "")
+    set(BACKTRACE_CFLAGS "")
+    set(HAVE_BACKTRACE TRUE)
+endif()
+
+# Fallback probes, tried only when backtrace() was not found in the C library.
+# Each probe temporarily extends CMAKE_REQUIRED_LIBRARIES (and, for the ports
+# case, CMAKE_REQUIRED_INCLUDES) and removes its additions again afterwards so
+# later checks are not affected.
+if(NOT BACKTRACE_LIBC)
+    # FreeBSD 10 has backtrace but requires libexecinfo
+    list(INSERT CMAKE_REQUIRED_LIBRARIES 0 "-lexecinfo")
+    CHECK_C_SOURCE_COMPILES(
+        "#include <stdlib.h>\n#include <execinfo.h>\nint main () { backtrace(NULL, 0); }"
+        BACKTRACE_LIBEXECINFO)
+    list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES "-lexecinfo")
+
+    if(BACKTRACE_LIBEXECINFO)
+        set(HAVE_BACKTRACE TRUE)
+        set(BACKTRACE_CFLAGS "")
+        set(BACKTRACE_LDFLAGS "-lexecinfo")
+    else()
+        # older FreeBSD requires it from ports
+        list(INSERT CMAKE_REQUIRED_INCLUDES 0 "/usr/local/include")
+        list(INSERT CMAKE_REQUIRED_LIBRARIES 0 "-L/usr/local/lib -lexecinfo")
+        CHECK_C_SOURCE_COMPILES(
+            "#include <stdlib.h>\n#include <execinfo.h>\nint main () { backtrace(NULL, 0); }"
+            BACKTRACE_LIBEXECINFO_LOCAL)
+        # REMOVE_ITEM takes values, not an index: only the entries added above
+        # are named here, so unrelated elements are left in place.
+        list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "/usr/local/include")
+        list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES "-L/usr/local/lib -lexecinfo")
+        if(BACKTRACE_LIBEXECINFO_LOCAL)
+            set(HAVE_BACKTRACE TRUE)
+            set(BACKTRACE_CFLAGS "-I/usr/local/include")
+            set(BACKTRACE_LDFLAGS "-L/usr/local/lib -lexecinfo")
+        endif()
+    endif()
+endif()
+
+# Final step: either leave both flag variables empty (no backtrace support
+# found) or prepend -rdynamic to the link flags when the compiler accepts it.
+if (NOT HAVE_BACKTRACE)
+    set(BACKTRACE_LDFLAGS "")
+    set(BACKTRACE_CFLAGS "")
+else()
+    check_c_compiler_flag(-rdynamic HAS_RDYNAMIC)
+    if (HAS_RDYNAMIC)
+        list(INSERT BACKTRACE_LDFLAGS 0 -rdynamic)
+    endif()
+    # cmake scope fun: re-export the result to the enclosing scope
+    set(HAVE_BACKTRACE ${HAVE_BACKTRACE} PARENT_SCOPE)
+endif()
+
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@ -0,0 +1,101 @@
+/* Template used by cmake to generate the build configuration header.
+ *
+ * Every `#cmakedefine NAME' line below is rewritten by configure_file(): it
+ * becomes `#define NAME' when the CMake variable NAME is set to a true value,
+ * and a commented-out `#undef NAME' otherwise. No value is attached to the
+ * macro, so code should test these with #ifdef / defined(). */
+
+/* Defined if the build is 32 bit */
+#cmakedefine ARCH_32_BIT
+
+/* Defined if the build is 64 bit */
+#cmakedefine ARCH_64_BIT
+
+/* Defined if building for IA32 */
+#cmakedefine ARCH_IA32
+
+/* Defined if building for EM64T */
+#cmakedefine ARCH_X86_64
+
+/* internal build, switch on dump support. */
+#cmakedefine DUMP_SUPPORT
+
+/* Build tools with threading support */
+#cmakedefine ENABLE_TOOLS_THREADS
+
+/* Defined if `backtrace' works. */
+#cmakedefine HAVE_BACKTRACE
+
+/* C compiler has __builtin_assume_aligned */
+#cmakedefine HAVE_CC_BUILTIN_ASSUME_ALIGNED
+
+/* C++ compiler has __builtin_assume_aligned */
+#cmakedefine HAVE_CXX_BUILTIN_ASSUME_ALIGNED
+
+/* C++ compiler has x86intrin.h */
+#cmakedefine HAVE_CXX_X86INTRIN_H
+
+/* C compiler has x86intrin.h */
+#cmakedefine HAVE_C_X86INTRIN_H
+
+/* C++ compiler has intrin.h */
+#cmakedefine HAVE_CXX_INTRIN_H
+
+/* C compiler has intrin.h */
+#cmakedefine HAVE_C_INTRIN_H
+
+/* Defined if you have the declaration of `pthread_barrier_init'; left
+   undefined if you don't. */
+#cmakedefine HAVE_DECL_PTHREAD_BARRIER_INIT
+
+/* Defined if you have the declaration of `pthread_setaffinity_np'; left
+   undefined if you don't. */
+#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
+
+/* Defined if you have the `malloc_info' function. */
+#cmakedefine HAVE_MALLOC_INFO
+
+/* Defined if you have the `memmem' function. */
+#cmakedefine HAVE_MEMMEM
+
+/* Defined if you have a working `mmap' system call. */
+#cmakedefine HAVE_MMAP
+
+/* Defined if `posix_memalign' works. */
+#cmakedefine HAVE_POSIX_MEMALIGN
+
+/* Defined if you have the <pthread.h> header file. */
+#cmakedefine HAVE_PTHREAD_H
+
+/* Defined if you have the `setrlimit' function. */
+#cmakedefine HAVE_SETRLIMIT
+
+/* Defined if you have the `shmget' function. */
+#cmakedefine HAVE_SHMGET
+
+/* Defined if you have the `sigaction' function. */
+#cmakedefine HAVE_SIGACTION
+
+/* Defined if you have the `sigaltstack' function. */
+#cmakedefine HAVE_SIGALTSTACK
+
+/* Defined if the sqlite3_open_v2 call is available */
+#cmakedefine HAVE_SQLITE3_OPEN_V2
+
+/* Defined if you have the <tmmintrin.h> header file. */
+#cmakedefine HAVE_TMMINTRIN_H
+
+/* Defined if you have the <unistd.h> header file. */
+#cmakedefine HAVE_UNISTD_H
+
+/* Defined if you have the `_aligned_malloc' function. */
+#cmakedefine HAVE__ALIGNED_MALLOC
+
+/* Optimize, inline critical functions */
+#cmakedefine HS_OPTIMIZE
+
+/* Version markers. NOTE(review): as written these are defined without a
+   value, so the version numbers themselves are not available through these
+   macros -- confirm that consumers obtain them elsewhere. */
+#cmakedefine HS_VERSION
+#cmakedefine HS_MAJOR_VERSION
+#cmakedefine HS_MINOR_VERSION
+#cmakedefine HS_PATCH_VERSION
+
+/* NOTE(review): likewise defined without a value; the date string is not
+   substituted here -- confirm intent. */
+#cmakedefine BUILD_DATE
+
+/* define if this is a release build. */
+#cmakedefine RELEASE_BUILD
+
--- a/cmake/platform.cmake
+++ b/cmake/platform.cmake
@ -0,0 +1,9 @@
+# determine the target arch
+#
+# Sets ARCH_64_BIT / ARCH_X86_64 when the compiler targets x86-64 and
+# ARCH_32_BIT / ARCH_IA32 when it targets 32-bit x86. Each probe program
+# compiles only if the corresponding predefined macro is present.
+
+# really only interested in the preprocessor here
+CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
+
+CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
+
+# architecture-specific aliases of the two results above
+set(ARCH_X86_64 ${ARCH_64_BIT})
+set(ARCH_IA32 ${ARCH_32_BIT})
--- a/cmake/ragel.cmake
+++ b/cmake/ragel.cmake
@ -0,0 +1,16 @@
+# function for doing all the dirty work in turning a .rl into C++
+#
+# ragelmaker(<src_rl>)
+#
+#   src_rl - path of the Ragel source, relative to CMAKE_CURRENT_SOURCE_DIR.
+#
+# The generated file keeps the same relative directory and base name, with a
+# .cpp extension, under CMAKE_CURRENT_BINARY_DIR. A custom target named
+# ragel_<base name> is created that depends on the generated file. The RAGEL
+# variable is used as the command to run.
+function(ragelmaker src_rl)
+    get_filename_component(src_dir ${src_rl} PATH) # old cmake needs PATH
+    get_filename_component(src_file ${src_rl} NAME_WE)
+    set(rl_out_dir ${CMAKE_CURRENT_BINARY_DIR}/${src_dir})
+    set(rl_out ${rl_out_dir}/${src_file}.cpp)
+    add_custom_command(
+        OUTPUT ${rl_out}
+        # the output directory may not exist yet in a fresh build tree
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${rl_out_dir}
+        COMMAND ${RAGEL} ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} -o ${rl_out}
+        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl}
+        # VERBATIM keeps argument escaping identical across platforms
+        VERBATIM
+        )
+    add_custom_target(ragel_${src_file} DEPENDS ${rl_out})
+    set_source_files_properties(${rl_out} PROPERTIES GENERATED TRUE)
+endfunction()
+
--- a/doc/dev-reference/CMakeLists.txt
+++ b/doc/dev-reference/CMakeLists.txt
@ -0,0 +1,35 @@
+# Doxygen step: when the tool is available, generate the doxyfile from its
+# template and add a target that runs doxygen on it.
+find_program(DOXYGEN doxygen)
+
+if (NOT DOXYGEN STREQUAL DOXYGEN-NOTFOUND)
+    configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/hyperscan.doxyfile.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/hyperscan.doxyfile"
+        @ONLY)
+
+    add_custom_target(dev-reference-doxygen
+        COMMAND ${DOXYGEN} ${CMAKE_CURRENT_BINARY_DIR}/hyperscan.doxyfile
+        COMMENT "Building doxygen XML for API reference")
+else()
+    message(STATUS "Doxygen not found, unable to generate API reference")
+endif()
+
+# Sphinx step: when sphinx-build is available, generate conf.py from its
+# template and add the `dev-reference' target that builds the HTML docs.
+find_program(SPHINX_BUILD sphinx-build)
+
+if (SPHINX_BUILD STREQUAL SPHINX_BUILD-NOTFOUND)
+    message(STATUS "Sphinx not found, unable to generate developer reference")
+else()
+    set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
+    set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
+    set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
+
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
+
+    add_custom_target(dev-reference
+        ${SPHINX_BUILD}
+            -b html
+            -c "${CMAKE_CURRENT_BINARY_DIR}"
+            -d "${SPHINX_CACHE_DIR}"
+            "${CMAKE_CURRENT_SOURCE_DIR}"
+            "${SPHINX_HTML_DIR}"
+        COMMENT "Building HTML dev reference with Sphinx"
+        VERBATIM)
+
+    # The doxygen target only exists when doxygen was found. Depending on it
+    # unconditionally would leave dev-reference with a dependency that nothing
+    # can build, so the ordering is added only when the target is present.
+    if (TARGET dev-reference-doxygen)
+        add_dependencies(dev-reference dev-reference-doxygen)
+    endif()
+endif()
--- a/doc/dev-reference/_static/hyperscan.css
+++ b/doc/dev-reference/_static/hyperscan.css
@ -0,0 +1,4 @@
+/* Differentiate the way we display regex fragments: elements carrying the
+ * "regexp" class are shown in dark red. `!important' makes this colour win
+ * over other colour declarations for the same element. */
+.regexp {
+  color: darkred !important;
+}
--- a/doc/dev-reference/api_constants.rst
+++ b/doc/dev-reference/api_constants.rst
@ -0,0 +1,53 @@
+.. _api_constants:
+
+########################
+API Reference: Constants
+########################
+
+***********
+Error Codes
+***********
+
+.. doxygengroup:: HS_ERROR
+   :content-only:
+   :no-link:
+
+*****************
+hs_expr_ext flags
+*****************
+
+.. doxygengroup:: HS_EXT_FLAG
+   :content-only:
+   :no-link:
+
+*************
+Pattern flags
+*************
+
+.. doxygengroup:: HS_PATTERN_FLAG
+   :content-only:
+   :no-link:
+
+*************************
+CPU feature support flags
+*************************
+
+.. doxygengroup:: HS_CPU_FEATURES_FLAG
+   :content-only:
+   :no-link:
+
+****************
+CPU tuning flags
+****************
+
+.. doxygengroup:: HS_TUNE_FLAG
+   :content-only:
+   :no-link:
+
+******************
+Compile mode flags
+******************
+
+.. doxygengroup:: HS_MODE_FLAG
+   :content-only:
+   :no-link:
--- a/doc/dev-reference/api_files.rst
+++ b/doc/dev-reference/api_files.rst
@ -0,0 +1,29 @@
+.. _api_files:
+
+####################
+API Reference: Files
+####################
+
+**********
+File: hs.h
+**********
+
+.. doxygenfile:: hs.h
+
+*****************
+File: hs_common.h
+*****************
+
+.. doxygenfile:: hs_common.h
+
+******************
+File: hs_compile.h
+******************
+
+.. doxygenfile:: hs_compile.h
+
+******************
+File: hs_runtime.h
+******************
+
+.. doxygenfile:: hs_runtime.h
--- a/doc/dev-reference/compilation.rst
+++ b/doc/dev-reference/compilation.rst
@ -0,0 +1,365 @@
+.. include:: <isonum.txt>
+.. _compilation:
+
+##################
+Compiling Patterns
+##################
+
+*******************
+Building a Database
+*******************
+
+The Hyperscan compiler API accepts regular expressions and converts them into a
+compiled pattern database that can then be used to scan data.
+
+The API provides three functions that compile regular expressions into
+databases:
+
+#. :c:func:`hs_compile`: compiles a single expression into a pattern database.
+
+#. :c:func:`hs_compile_multi`: compiles an array of expressions into a pattern
+   database. All of the supplied patterns will be scanned for concurrently at
+   scan time, with user-supplied identifiers returned when they match.
+
+#. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
+   but allows :ref:`extparam` to be specified for each expression.
+
+Compilation allows the Hyperscan library to analyze the given pattern(s) and
+pre-determine how to scan for these patterns in an optimized fashion that would
+be far too expensive to compute at run-time.
+
+When compiling expressions, a decision needs to be made whether the resulting
+compiled patterns are to be used in a streaming, block or vectored mode:
+
+- **Streaming mode**: the target data to be scanned is a continuous stream, not
+  all of which is available at once; blocks of data are scanned in sequence and
+  matches may span multiple blocks in a stream. In streaming mode, each stream
+  requires a block of memory to store its state between scan calls.
+
+- **Block mode**: the target data is a discrete, contiguous block which can be
+  scanned in one call and does not require state to be retained.
+
+- **Vectored mode**: the target data consists of a list of non-contiguous
+  blocks that are available all at once. As for block mode, no retention of
+  state is required.
+
+To compile patterns to be used in streaming mode, the ``mode`` parameter of
+:c:func:`hs_compile` must be set to :c:member:`HS_MODE_STREAM`; similarly,
+block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
+requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
+for one mode (streaming, block or vectored) can only be used in that mode. The
+version of Hyperscan used to produce a compiled pattern database must match the
+version of Hyperscan used to scan with it.
+
+Hyperscan provides support for targeting a database at a particular CPU
+platform; see :ref:`instr_specialization` for details.
+
+***************
+Pattern Support
+***************
+
+Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
+described at <http://www.pcre.org/>. However, not all constructs available in
+libpcre are supported. The use of unsupported constructs will result in
+compilation errors.
+
+====================
+Supported Constructs
+====================
+
+The following regex constructs are supported by Hyperscan:
+
+* Literal characters and strings, with all libpcre quoting and character
+  escapes.
+
+* Character classes such as :regexp:`.` (dot), :regexp:`[abc]`, and
+  :regexp:`[^abc]`, as well as the predefined character classes :regexp:`\\s`,
+  :regexp:`\\d`, :regexp:`\\w`, :regexp:`\\v`, and :regexp:`\\h` and their
+  negated counterparts (:regexp:`\\S`, :regexp:`\\D`, :regexp:`\\W`,
+  :regexp:`\\V`, and :regexp:`\\H`).
+
+* The POSIX named character classes :regexp:`[[:xxx:]]` and negated named
+  character classes :regexp:`[[:^xxx:]]`.
+
+* Unicode character properties, such as :regexp:`\\p{L}`, :regexp:`\\P{Sc}`,
+  :regexp:`\\p{Greek}`.
+
+* Quantifiers:
+
+  * Quantifiers such as :regexp:`?`, :regexp:`*` and :regexp:`+` are supported
+    when applied to arbitrary supported sub-expressions.
+
+  * Bounded repeat quantifiers such as :regexp:`{n}`, :regexp:`{m,n}`,
+    :regexp:`{n,}` are supported with limitations.
+
+    * For arbitrary repeated sub-patterns: *n* and *m* should be either small
+      or infinite, e.g. :regexp:`(a|b){4}`, :regexp:`(ab?c?d){4,10}` or
+      :regexp:`(ab(cd)*){6,}`.
+
+    * For single-character width sub-patterns such as :regexp:`[^\\a]` or
+      :regexp:`.` or :regexp:`x`, nearly all repeat counts are supported, except
+      where repeats are extremely large (maximum bound greater than 32767).
+      Stream states may be very large for large bounded repeats, e.g.
+      :regexp:`a.{2000}b`. Note: such sub-patterns may be considerably
+      cheaper if at the beginning or end of patterns and especially if the
+      :c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
+
+  * Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
+    :regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
+    matches).
+
+* Parenthesization, including the named and unnamed capturing and
+  non-capturing forms. However, capturing is ignored.
+
+* Alternation with the :regexp:`|` symbol, as in :regexp:`foo|bar`.
+
+* The anchors :regexp:`^`, :regexp:`$`, :regexp:`\\A`, :regexp:`\\Z` and
+  :regexp:`\\z`.
+
+* Option modifiers for:
+
+    * Case-sensitivity: :regexp:`(?i)` and :regexp:`(?-i)`
+    * Multi-line: :regexp:`(?m)` and :regexp:`(?-m)`
+    * Dot-all: :regexp:`(?s)` and :regexp:`(?-s)`
+    * Extended syntax: :regexp:`(?x)` and :regexp:`(?-x)`
+
+* The :regexp:`\\b` and :regexp:`\\B` zero-width assertions (word boundary and
+  'not word boundary', respectively).
+
+* Comments in :regexp:`(?# comment)` syntax.
+
+* The :regexp:`(*UTF8)` and :regexp:`(*UCP)` control verbs at the beginning of a
+  pattern, used to enable UTF-8 and UCP mode.
+
+.. note:: Bounded-repeat quantifiers with large repeat counts of arbitrary
+   expressions (e.g. :regexp:`([a-z]|bc*d|xy?z){1000,5000}`) will result in a
+   "Pattern too large" error at pattern compile time.
+
+.. note:: At this time, not all patterns can be successfully compiled with the
+  :c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
+  :ref:`som`. The patterns that support this flag are a subset of patterns that
+  can be successfully compiled with Hyperscan; notably, many bounded repeat
+  forms that can be compiled with Hyperscan without the Start of Match flag
+  enabled cannot be compiled with the flag enabled.
+
+======================
+Unsupported Constructs
+======================
+
+The following regex constructs are not supported by Hyperscan:
+
+* Backreferences and capturing sub-expressions.
+* Arbitrary zero-width assertions.
+* Subroutine references and recursive patterns.
+* Conditional patterns.
+* Backtracking control verbs.
+* The :regexp:`\\C` "single-byte" directive (which breaks UTF-8 sequences).
+* The :regexp:`\\R` newline match.
+* The :regexp:`\\K` start of match reset directive.
+* Callouts and embedded code.
+* Atomic grouping and possessive quantifiers.
+
+*********
+Semantics
+*********
+
+While Hyperscan follows libpcre syntax, it provides different semantics. The
+major departures from libpcre semantics are motivated by the requirements of
+streaming and multiple simultaneous pattern matching.
+
+The major departures from libpcre semantics are:
+
+#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
+   several patterns simultaneously. This is not equivalent to separating the
+   patterns by :regexp:`|` in libpcre, which evaluates alternations
+   left-to-right.
+
+#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
+   guaranteed to be ordered, although they will always fall within the bounds of
+   the current scan.
+
+#. **End offsets only**: Hyperscan's default behaviour is only to report the end
+   offset of a match. Reporting of the start offset can be enabled with
+   per-expression flags at pattern compile time. See :ref:`som` for details.
+
+#. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
+   ``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
+   corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
+   libpcre semantics by default would report only one match at ``fooxyzbarbar``
+   (greedy semantics) or, if non-greedy semantics were switched on, one match at
+   ``fooxyzbar``. This means that switching between greedy and non-greedy
+   semantics is a no-op in Hyperscan.
+
+It is impossible to support libpcre quantifier semantics while accurately
+reporting streaming matches at the time they occur. For example, consider the
+pattern above, :regexp:`/foo.*bar/`, in streaming mode, against the following
+stream (three blocks scanned in sequence):
+
+    =============   =======     ========
+    block 1         block 2     block 3
+    =============   =======     ========
+    ``fooxyzbar``   ``baz``     ``qbar``
+    =============   =======     ========
+
+Since the :regexp:`.*` repeat in the pattern is a *greedy* repeat in libpcre, it
+must match as much as possible without causing the rest of the pattern to fail.
+However, in streaming mode, this would require knowledge of data in the stream
+beyond the current block being scanned.
+
+In this example, the match at offset 9 in the first block is only the correct
+match (under libpcre semantics) if there is no ``bar`` in a subsequent block --
+as in block 3 -- which would constitute a better match for the pattern.
+
+.. _som:
+
+==============
+Start of Match
+==============
+
+In standard operation, Hyperscan will only provide the end offset of a match
+when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
+is specified for a particular pattern, then the same set of matches is
+returned, but each match will also provide the leftmost possible start offset
+corresponding to its end offset.
+
+Using the SOM flag entails a number of trade-offs and limitations:
+
+* Reduced pattern support: For many patterns, tracking SOM is complex and can
+  result in Hyperscan failing to compile a pattern with a "Pattern too
+  large" error, even if the pattern is supported in normal operation.
+* Increased stream state: At scan time, state space is required to track
+  potential SOM offsets, and this must be stored in persistent stream state in
+  streaming mode. Accordingly, SOM will generally increase the stream state
+  required to match a pattern.
+* Performance overhead: Similarly, there is generally a performance cost
+  associated with tracking SOM.
+* Incompatible features: Some other Hyperscan pattern flags (such as
+  :c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
+  used in combination with SOM. Specifying them together with
+  :c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
+
+In streaming mode, the amount of precision delivered by SOM can be controlled
+with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
+information within a certain distance of the end offset, and return a special
+start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
+small or medium SOM horizon will usually reduce the stream state required for a
+given database.
+
+.. note:: In streaming mode, the start offset returned for a match may refer to
+   a point in the stream *before* the current block being scanned. Hyperscan
+   provides no facility for accessing earlier blocks; if the calling application
+   needs to inspect historical data, then it must store it itself.
+
+.. _extparam:
+
+===================
+Extended Parameters
+===================
+
+In some circumstances, more control over the matching behaviour of a pattern is
+required than can be specified easily using regular expression syntax. For
+these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
+that allows a set of "extended parameters" to be set on a per-pattern basis.
+
+Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
+which provides the following fields:
+
+* ``flags``: Flags governing which of the other fields in the structure are
+  used.
+* ``min_offset``: The minimum end offset in the data stream at which this
+  expression should match successfully.
+* ``max_offset``: The maximum end offset in the data stream at which this
+  expression should match successfully.
+* ``min_length``: The minimum match length (from start to end) required to
+  successfully match this expression.
+
+These parameters allow the set of matches produced by a pattern to be
+constrained at compile time, rather than relying on the application to process
+unwanted matches at runtime.
+
+For example, the pattern :regexp:`/foo.*bar/` when given a ``min_offset`` of 10
+and a ``max_offset`` of 15 will not produce matches when scanned against
+``foobar`` or ``foo0123456789bar`` but will produce a match against the data
+streams ``foo0123bar`` or ``foo0123456bar``.
+
+=================
+Prefiltering Mode
+=================
+
+Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
+be used to implement a prefilter for a pattern that Hyperscan would not
+ordinarily support.
+
+This flag instructs Hyperscan to compile an "approximate" version of this
+pattern for use in a prefiltering application, even if Hyperscan does not
+support the pattern in normal operation.
+
+The set of matches returned when this flag is used is guaranteed to be a
+superset of the matches specified by the non-prefiltering expression.
+
+If the pattern contains pattern constructs not supported by Hyperscan (such as
+zero-width assertions, back-references or conditional references) these
+constructs will be replaced internally with broader constructs that may match
+more often.
+
+For example, the pattern :regexp:`/(\\w+) again \\1/` contains the
+back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
+approximated by having its back-reference replaced with its referent, forming
+:regexp:`/\\w+ again \\w+/`.
+
+Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
+otherwise return a "Pattern too large" error at compile time, or for performance
+reasons (subject to the matching guarantee above).
+
+It is generally expected that the application will subsequently confirm
+prefilter matches with another regular expression matcher that can provide exact
+matches for the pattern.
+
+.. note:: The use of this flag in combination with Start of Match mode (using
+   the :c:member:`HS_FLAG_SOM_LEFTMOST` flag) is not currently supported and
+   will result in a pattern compilation error.
+
+.. _instr_specialization:
+
+******************************
+Instruction Set Specialization
+******************************
+
+Hyperscan is able to make use of several modern instruction set features found
+on x86 processors to provide improvements in scanning performance.
+
+Some of these features are selected when the library is built; for example,
+Hyperscan will use the native ``POPCNT`` instruction on processors where it is
+available and the library has been optimized for the host architecture.
+
+.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
+   compiler flag and (where possible) will make use of all instructions known by
+   the host's C compiler.
+
+To use some instruction set features, however, Hyperscan must build a
+specialized database to support them. This means that the target platform must
+be specified at pattern compile time.
+
+The Hyperscan compiler API functions all accept an optional
+:c:type:`hs_platform_info_t` argument, which describes the target platform
+for the database to be built. If this argument is NULL, the database will be
+targeted at the current host platform.
+
+The :c:type:`hs_platform_info_t` structure has two fields:
+
+#. ``tune``: This allows the application to specify information about the target
+   platform which may be used to guide the optimisation process of the compile.
+   Use of this field does not limit the processors that the resulting database
+   can run on, but may impact the performance of the resulting database.
+
+#. ``cpu_features``: This allows the application to specify a mask of CPU
+   features that may be used on the target platform. For example,
+   :c:member:`HS_CPU_FEATURES_AVX2` can be specified for Intel\ |reg| Advanced
+   Vector Extensions 2 (Intel\ |reg| AVX2) instruction set support. If a flag
+   for a particular CPU feature is specified, the database will not be usable on
+   a CPU without that feature.
+
+An :c:type:`hs_platform_info_t` structure targeted at the current host can be
+built with the :c:func:`hs_populate_platform` function.
+
+See :ref:`api_constants` for the full list of CPU tuning and feature flags.
--- a/doc/dev-reference/conf.py.in
+++ b/doc/dev-reference/conf.py.in
@ -0,0 +1,275 @@
+# -*- coding: utf-8 -*-
+#
+# Hyperscan documentation build configuration file, created by
+# sphinx-quickstart on Tue Sep 29 15:59:19 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+# 'breathe' is not one of the 'sphinx.ext.*' extensions; its settings are the
+# breathe_* values in the "Options for Breathe doxygen import" section near
+# the end of this file.
+extensions = ['breathe']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames. The pages of this guide (index.rst,
+# intro.rst, getting_started.rst, ...) are reStructuredText files.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document: 'index' refers to index.rst, whose toctree
+# lists the chapters of the guide.
+master_doc = 'index'
+
+# General information about the project.
+# The u'' prefix makes these unicode strings on Python 2.
+project = u'Hyperscan'
+copyright = u'2015, Intel Corporation'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# NOTE: the @...@ tokens below are placeholders. This file is a template
+# (conf.py.in) and the tokens must be replaced with real values before Sphinx
+# reads it -- presumably by CMake's configure_file(); confirm against the
+# build scripts.
+#
+# The short X.Y version.
+version = '@HS_MAJOR_VERSION@.@HS_MINOR_VERSION@'
+# The full version, including alpha/beta/rc tags.
+release = '@HS_VERSION@'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# Only '_build' is excluded here.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use. This affects
+# highlighted listings such as the ``.. code-block:: c`` example in
+# performance.rst.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+# The option names in html_theme_options below are specific to this theme.
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+# Both 'pink_*' colors are overridden with the same value, #e0f8ff (a pale
+# blue).
+html_theme_options = {
+    # Change some style colors; these are used for admonitions
+    'pink_1' : '#e0f8ff',
+    'pink_2' : '#e0f8ff'
+}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# NOTE: unlike the generic description above, the entry below is not a plain
+# relative path: it is prefixed with the @CMAKE_CURRENT_SOURCE_DIR@
+# placeholder, so it resolves to the _static directory of the source tree
+# once the template is expanded (assumes expansion by CMake -- TODO confirm).
+# Presumably this is where the hyperscan.css registered in setup() lives.
+html_static_path = ['@CMAKE_CURRENT_SOURCE_DIR@/_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+# The single '**' key is a wildcard (presumably matching every document --
+# confirm against the Sphinx html_sidebars documentation), giving each page a
+# sidebar built from the global table of contents and the search box.
+html_sidebars = {
+    '**': ['globaltoc.html', 'searchbox.html']
+}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+# Turned off for this guide, so no source links are added.
+html_show_sourcelink = False
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder (the project name followed by
+# 'doc').
+htmlhelp_basename = 'Hyperscandoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+# LaTeX builder overrides. Every entry below is commented out, so this dict is
+# empty; the commented entries only show what could be set.
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+# A single 'manual'-class document, Hyperscan.tex, is produced, starting from
+# the 'index' document.
+latex_documents = [
+  ('index', 'Hyperscan.tex', u'Hyperscan Documentation',
+   u'Intel Corporation', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+# A single page named 'hyperscan' in manual section 1 is produced, starting
+# from the 'index' document.
+man_pages = [
+    ('index', 'hyperscan', u'Hyperscan Documentation',
+     [u'Intel Corporation'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+# A single Texinfo document named 'Hyperscan' is produced, starting from the
+# 'index' document and filed under the 'Miscellaneous' category.
+texinfo_documents = [
+  ('index', 'Hyperscan', u'Hyperscan Documentation',
+   u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
+
+# -- Options for Breathe doxygen import -----------------------------------
+
+# Maps a Breathe project name to the location of its Doxygen XML output; the
+# relative path "doxygen_xml" is presumably resolved against the directory
+# Sphinx runs in -- confirm against the Breathe documentation.
+breathe_projects = { "hyperscan": "doxygen_xml" }
+# Project used when a Breathe directive does not name one explicitly.
+breathe_default_project = "hyperscan"
+# Document declarations from ".h" files in the C domain, which matches the
+# :c:func: / :c:type: / :c:member: roles used throughout the .rst sources.
+breathe_domain_by_extension = {"h" : "c"}
+
+# -- Add some customisation -----------------------------------------------
+
+def setup(app):
+    app.add_stylesheet("hyperscan.css") # Custom stylesheet for e.g. :regex:
--- a/doc/dev-reference/copyright.rst
+++ b/doc/dev-reference/copyright.rst
@ -0,0 +1,33 @@
+.. include:: <isonum.txt>
+
+#########
+Copyright
+#########
+
+No license (express or implied, by estoppel or otherwise) to any intellectual
+property rights is granted by this document.
+
+Intel disclaims all express and implied warranties, including without
+limitation, the implied warranties of merchantability, fitness for a particular
+purpose, and non-infringement, as well as any warranty arising from course of
+performance, course of dealing, or usage in trade.
+
+This document contains information on products, services and/or processes in
+development.  All information provided here is subject to change without
+notice. Contact your Intel representative to obtain the latest forecast,
+schedule, specifications and roadmaps.
+
+The products and services described may contain defects or errors known as
+errata which may cause deviations from published specifications. Current
+characterized errata are available on request.
+
+Copies of documents which have an order number and are referenced in this
+document, or other Intel literature, may be obtained by calling 1-800-548-4725,
+or go to: <http://www.intel.com/design/literature.htm>.
+
+Intel, and the Intel logo, are trademarks of Intel Corporation in the U.S.
+and/or other countries.
+
+\*Other names and brands may be claimed as the property of others.
+
+Copyright |copy| 2015, Intel Corporation. All rights reserved.
--- a/doc/dev-reference/getting_started.rst
+++ b/doc/dev-reference/getting_started.rst
@ -0,0 +1,211 @@
+.. include:: <isonum.txt>
+
+###############
+Getting Started
+###############
+
+Very Quick Start
+****************
+
+#. Clone Hyperscan ::
+
+     cd <where-you-want-hyperscan-source>
+     git clone git://github.com/01org/hyperscan
+
+#. Configure Hyperscan
+
+   Ensure that you have the correct :ref:`dependencies <software>` present,
+   and then:
+
+   ::
+
+     cd <where-you-want-to-build-hyperscan>
+     mkdir <build-dir>
+     cd <build-dir>
+     cmake [-G <generator>] [options] <hyperscan-source-path>
+
+   Known working generators:
+      * ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
+      * ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
+
+   Generators that might work include:
+      * ``Xcode`` --- OS X Xcode projects.
+      * ``Visual Studio`` --- Visual Studio projects - very experimental
+
+#. Build Hyperscan
+
+   Depending on the generator used:
+     * ``cmake --build .`` --- will build everything
+     * ``make -j<jobs>`` --- use makefiles in parallel
+     * ``ninja`` --- use Ninja build
+     * etc.
+
+#. Check Hyperscan
+
+   Run the Hyperscan unit tests: ::
+
+     bin/unit-hyperscan
+
+Requirements
+************
+
+Hardware
+========
+
+Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
+32-bit (IA-32 Architecture) modes.
+
+Hyperscan is a high performance software library that takes advantage of recent
+Intel architecture advances. At a minimum, support for Supplemental Streaming
+SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
+x86 processor.
+
+Additionally, Hyperscan can make use of:
+
+    * Intel Streaming SIMD Extensions 4.2 (SSE4.2)
+    * the POPCNT instruction
+    * Bit Manipulation Instructions (BMI, BMI2)
+    * Intel Advanced Vector Extensions 2 (Intel AVX2)
+
+if present.
+
+These can be determined at library compile time, see :ref:`target_arch`.
+
+.. _software:
+
+Software
+========
+
+As a software library, Hyperscan doesn't impose any particular runtime
+software requirements; however, to build the Hyperscan library we require a
+modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
+compiler support. The supported compilers are:
+
+    * GCC, v4.8.1 or higher
+    * Clang, v3.4 or higher (with libstdc++ or libc++)
+    * Intel C++ Compiler v15 or higher
+
+Examples of operating systems that Hyperscan is known to work on include:
+
+Linux:
+
+* Ubuntu 14.04 LTS or newer
+* RedHat/CentOS 7 or newer
+
+FreeBSD:
+
+* 10.0 or newer
+
+Mac OS X:
+
+* 10.8 or newer, using Xcode/Clang
+
+Hyperscan *may* compile and run on other platforms, but there is no guarantee.
+We currently have experimental support for Windows using Intel C++ Compiler
+or Visual Studio 2015.
+
+In addition, the following software is required for compiling the Hyperscan library:
+
+======================================================= =========== ======================================
+Dependency                                              Version     Notes
+======================================================= =========== ======================================
+`CMake <http://www.cmake.org/>`_                        >=2.8.11
+`Ragel <http://www.colm.net/open-source/ragel/>`_       6.9
+`Python <http://www.python.org/>`_                      2.7
+`Boost <http://boost.org/>`_                            >=1.57      Boost headers required
+`Pcap <http://tcpdump.org>`_                            >=0.8       Optional: needed for example code only
+======================================================= =========== ======================================
+
+Most of these dependencies can be provided by the package manager on the build
+system (e.g. Debian/Ubuntu/RedHat packages, FreeBSD ports, etc). However,
+ensure that the correct version is present.
+
+Boost Headers
+-------------
+
+Compiling Hyperscan depends on a recent version of the Boost C++ header
+library. If the Boost libraries are installed on the build machine in the
+usual paths, CMake will find them. An alternative is to put a copy of (or a
+symlink to) the boost subdirectory in ``<hyperscan-source-path>/include/boost``.
+
+For example: for the Boost-1.59.0 release: ::
+
+    ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
+
+As Hyperscan uses the header-only parts of Boost, it is not necessary to
+compile the Boost libraries.
+
+CMake Configuration
+===================
+
+When CMake is invoked, it generates build files using the given options.
+Options are passed to CMake in the form ``-D<variable name>=<value>``.
+Common options for CMake include:
+
++------------------------+----------------------------------------------------+
+| Variable               | Description                                        |
++========================+====================================================+
+| CMAKE_C_COMPILER       | C compiler to use. Default is /usr/bin/cc.         |
++------------------------+----------------------------------------------------+
+| CMAKE_CXX_COMPILER     | C++ compiler to use. Default is /usr/bin/c++.      |
++------------------------+----------------------------------------------------+
+| CMAKE_INSTALL_PREFIX   | Install directory for ``install`` target           |
++------------------------+----------------------------------------------------+
+| CMAKE_BUILD_TYPE       | Define which kind of build to generate.            |
+|                        | Valid options are Debug, Release, RelWithDebInfo,  |
+|                        | and MinSizeRel. Default is RelWithDebInfo.         |
++------------------------+----------------------------------------------------+
+| BUILD_SHARED_LIBS      | Build Hyperscan as a shared library instead of     |
+|                        | the default static library.                        |
++------------------------+----------------------------------------------------+
+| BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs.       |
+|                        | Default off.                                       |
++------------------------+----------------------------------------------------+
+| DEBUG_OUTPUT           | Enable very verbose debug output. Default off.     |
++------------------------+----------------------------------------------------+
+
+For example, to generate a ``Debug`` build: ::
+
+    cd <build-dir>
+    cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
+
+
+
+Build Type
+----------
+
+CMake determines a number of features for a build based on the Build Type.
+Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
+information". This is a performance optimized build without runtime assertions
+but with debug symbols enabled.
+
+The other types of builds are:
+
+ * ``Release``: as above, but without debug symbols
+ * ``MinSizeRel``: a stripped release build
+ * ``Debug``: used when developing Hyperscan. Includes runtime assertions
+   (which have a large impact on runtime performance), and will also enable
+   some other build features like building internal unit
+   tests.
+
+.. _target_arch:
+
+Target Architecture
+-------------------
+
+By default, Hyperscan will be compiled to target the instruction set of the
+processor of the machine that is being used for compilation. This is done via
+the use of ``-march=native``. As a result, a library built on one machine may
+not work on a different machine if they differ in supported instruction
+subsets.
+
+To override the use of ``-march=native``, set appropriate flags for the
+compiler in ``CFLAGS`` and ``CXXFLAGS`` environment variables before invoking
+CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. For
+example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
+
+    cmake -DCMAKE_C_FLAGS="-march=corei7" \
+      -DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
+
+For more information, refer to :ref:`instr_specialization`.
+
--- a/doc/dev-reference/hyperscan.doxyfile.in
+++ b/doc/dev-reference/hyperscan.doxyfile.in
--- a/doc/dev-reference/index.rst
+++ b/doc/dev-reference/index.rst
@ -0,0 +1,20 @@
+###############################################
+Hyperscan |version| Developer's Reference Guide
+###############################################
+
+-------
+|today|
+-------
+
+.. toctree::
+   :maxdepth: 2
+
+   copyright
+   preface
+   intro
+   getting_started
+   compilation
+   runtime
+   performance
+   api_constants
+   api_files
--- a/doc/dev-reference/intro.rst
+++ b/doc/dev-reference/intro.rst
@ -0,0 +1,78 @@
+.. include:: <isonum.txt>
+.. _intro:
+
+############
+Introduction
+############
+
+Hyperscan is a software regular expression matching engine designed with
+high performance and flexibility in mind. It is implemented as a library that
+exposes a straightforward C API.
+
+The Hyperscan API itself is composed of two major components:
+
+***********
+Compilation
+***********
+
+These functions take a group of regular expressions, along with identifiers and
+option flags, and compile them into an immutable database that can be used by
+the Hyperscan scanning API. This compilation process performs considerable
+analysis and optimization work in order to build a database that will match the
+given expressions efficiently.
+
+If a pattern cannot be built into a database for any reason (such as the use of
+an unsupported expression construct, or the overflowing of a resource limit),
+an error will be returned by the pattern compiler.  
+
+Compiled databases can be serialized and relocated, so that they can be stored
+to disk or moved between hosts. They can also be targeted to particular
+platform features (for example, the use of Intel\ |reg| Advanced Vector Extensions
+2 (Intel\ |reg| AVX2) instructions).
+
+See :ref:`compilation` for more detail.
+
+********
+Scanning
+********
+
+Once a Hyperscan database has been created, it can be used to scan data in
+memory. Hyperscan provides several scanning modes, depending on whether the
+data to be scanned is available as a single contiguous block, whether it is
+distributed amongst several blocks in memory at the same time, or whether it is
+to be scanned as a sequence of blocks in a stream.
+
+Matches are delivered to the application via a user-supplied callback function
+that is called synchronously for each match.
+
+For a given database, Hyperscan provides several guarantees:
+
+* No memory allocations occur at runtime with the exception of two
+  fixed-size allocations, both of which should be done ahead of time for
+  performance-critical applications:
+
+  - **Scratch space**: temporary memory used for internal data at scan time.
+    Structures in scratch space do not persist beyond the end of a single scan
+    call.
+  - **Stream state**: in streaming mode only, some state space is required to
+    store data that persists between scan calls for each stream. This allows
+    Hyperscan to track matches that span multiple blocks of data.
+
+* The sizes of the scratch space and stream state (in streaming mode) required
+  for a given database are fixed and determined at database compile time. This
+  means that the memory requirements of the application are known ahead of
+  time, and these structures can be pre-allocated if required for performance
+  reasons.
+
+* Any pattern that has successfully been compiled by the Hyperscan compiler can
+  be scanned against any input. There are no internal resource limits or other
+  limitations at runtime that could cause a scan call to return an error.
+
+See :ref:`runtime` for more detail.
+
+************
+Example Code
+************
+
+Some simple example code demonstrating the use of the Hyperscan API is
+available in the ``examples/`` subdirectory of the Hyperscan distribution.
--- a/doc/dev-reference/performance.rst
+++ b/doc/dev-reference/performance.rst
@ -0,0 +1,335 @@
+.. _perf:
+
+##########################
+Performance Considerations
+##########################
+
+Hyperscan supports a wide range of patterns in all three scanning modes. It is
+capable of extremely high levels of performance, but certain patterns can
+reduce performance markedly.
+
+The following guidelines will help construct patterns and pattern sets that
+will perform better:
+
+*****************************
+Regular expression constructs
+*****************************
+
+.. tip:: Do not hand-optimize regular expression constructs.
+
+Quite a large number of regular expressions can be written in multiple ways.
+For example, caseless matching of :regexp:`/abc/` can be written as:
+
+* :regexp:`/[Aa][Bb][Cc]/`
+* :regexp:`/(A|a)(B|b)(C|c)/`
+* :regexp:`/(?i)abc(?-i)/`
+* :regexp:`/abc/i`
+
+Hyperscan is capable of handling all these constructs. Unless there is a
+specific reason otherwise, do not rewrite patterns from one form to another.
+
+As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
+equivalently written as:
+
+* :regexp:`/foobarfrotz|foobazfrotz|foobar|foobaz/`
+
+This change will not improve performance or reduce overheads.
+
+*************
+Library usage
+*************
+
+.. tip:: Do not hand-optimize library usage.
+
+The Hyperscan library is capable of dealing with small writes, unusually large
+and small pattern sets, etc. Unless there is a specific performance problem
+with some usage of the library, it is best to use Hyperscan in a simple and
+direct fashion. For example, it is unlikely for there to be much benefit in
+buffering input to the library into larger blocks unless streaming writes are
+tiny (say, 1-2 bytes at a time).
+
+Unlike many other pattern matching products, Hyperscan will run faster with
+small numbers of patterns and slower with large numbers of patterns in a smooth
+fashion (as opposed to, typically, running at a moderate speed up to some fixed
+limit then either breaking or running half as fast).
+
+Hyperscan also provides high-throughput matching with a single thread of
+control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
+3000-bit block of data will be scanned in 1 microsecond in a single thread of
+control, not that it is required to scan 22 3000-bit blocks of data in 22
+microseconds. Thus, it is not usually necessary to buffer data to supply
+Hyperscan with available parallelism.
+
+********************
+Block-based matching
+********************
+
+.. tip:: Prefer block-based matching to streaming matching where possible.
+
+Whenever input data appears in discrete records, or already requires some sort
+of transformation (e.g. URI normalization) that requires all the data to be
+accumulated before processing, it should be scanned in block rather than in
+streaming mode.
+
+Unnecessary use of streaming mode reduces the number of optimizations that can
+be applied in Hyperscan and may make some patterns run slower.
+
+If there is a mixture of 'block' and 'streaming' mode patterns, these should be
+scanned in separate databases except in the case that the streaming patterns
+vastly outnumber the block mode patterns.
+
+*********************
+Unnecessary databases
+*********************
+
+.. tip:: Avoid unnecessary 'union' databases.
+
+If there are 5 different types of network traffic T1 through T5 that must
+be scanned against 5 different signature sets, it will be far more efficient to
+construct 5 separate databases and scan traffic against the appropriate one
+than it will be to merge all 5 signature sets and remove inappropriate matches
+after the fact.
+
+This will be true even in the case where there is substantial overlap among the
+signatures. Only if the common subset of the signatures is overwhelmingly large
+(say, 90% of the signatures appear in all 5 traffic types) should a database
+that merges all 5 signature sets be considered, and only then if there are no
+performance issues with specific patterns that appear outside the common
+subset.
+
+******************************
+Allocate scratch ahead of time
+******************************
+
+.. tip:: Do not allocate scratch space for your pattern database just before
+   calling a scan function. Instead, do it just after the pattern database is
+   compiled or deserialized.
+
+Scratch allocation is not necessarily a cheap operation. Since it is the first
+time (after compilation or deserialization) that a pattern database is used,
+Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
+must also allocate memory.
+
+Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
+called in the application's scanning path just before :c:func:`hs_scan` (for
+example).
+
+Instead, scratch should be allocated immediately after a pattern database is
+compiled or deserialized, then retained for later scanning operations.
+
+***********************************************
+Allocate one scratch space per scanning context
+***********************************************
+
+.. tip:: A scratch space can be allocated so that it can be used with any one of
+   a number of databases. Each concurrent scan operation (such as a thread)
+   needs its own scratch space.
+
+The :c:func:`hs_alloc_scratch` function can accept an existing scratch space and
+"grow" it to support scanning with another pattern database. This means that
+instead of allocating one scratch space for every database used by an
+application, one can call :c:func:`hs_alloc_scratch` with a pointer to the same
+:c:type:`hs_scratch_t` and it will be sized appropriately for use with any of
+the given databases. For example:
+
+.. code-block:: c
+
+    hs_database_t *db1 = buildDatabaseOne();
+    hs_database_t *db2 = buildDatabaseTwo();
+    hs_database_t *db3 = buildDatabaseThree();
+
+    hs_error_t err;
+    hs_scratch_t *scratch = NULL;
+    err = hs_alloc_scratch(db1, &scratch);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+    err = hs_alloc_scratch(db2, &scratch);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+    err = hs_alloc_scratch(db3, &scratch);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+
+    /* scratch may now be used to scan against any of
+       the databases db1, db2, db3. */
+
+*****************
+Anchored patterns
+*****************
+
+.. tip:: If a pattern is meant to appear at the start of data, be sure to
+   anchor it.
+
+Anchored patterns (:regexp:`/^.../`) are far simpler to match than other
+patterns, especially patterns anchored to the start of the buffer (or stream, in
+streaming mode). Anchoring patterns to the end of the buffer results in less of
+a performance gain, especially in streaming mode.
+
+There are a variety of ways to anchor a pattern to a particular offset:
+
+- The :regexp:`^` and :regexp:`\\A` constructs anchor the pattern to the start
+  of the buffer. For example, :regexp:`/^foo/` can *only* match at offset 3
+  (matches are reported at the offset where they end).
+
+- The :regexp:`$`, :regexp:`\\z` and :regexp:`\\Z` constructs anchor the pattern
+  to the end of the buffer. For example, :regexp:`/foo\\z/` can only match when
+  the data buffer being scanned ends in ``foo``. (It should be noted that
+  :regexp:`$` and :regexp:`\\Z` will also match before a newline at the end of
+  the buffer, so :regexp:`/foo\\z/` would match against either ``abc foo`` or
+  ``abc foo\n``.)
+
+- The ``min_offset`` and ``max_offset`` extended parameters may also be used to
+  constrain where a pattern could match. For example, the pattern
+  :regexp:`/foo/` with a ``max_offset`` of 10 will only match at offsets less
+  than or equal to 10 in the buffer. (This pattern could also be written as
+  :regexp:`/^.{0,7}foo/`, compiled with the :c:member:`HS_FLAG_DOTALL` flag).
+
+
+*******************
+Matching everywhere
+*******************
+
+.. tip:: Avoid patterns that match everywhere, and remember that our semantics
+   are 'match everywhere, end of match only'.
+
+Patterns that match everywhere will run slowly due to the sheer number of
+matches that they return.
+
+Patterns like :regexp:`/.*/` in an automata-based matcher will match before and
+after every single character position, so a buffer with 100 characters will
+return 101 matches. Greedy pattern matchers such as libpcre will return a
+single match in this case, but our semantics is to return all matches. This is
+likely to be very expensive for our code and for the client code of the
+library.
+
+Another result of our semantics ("match everywhere") is that patterns that have
+optional start or ending sections -- for example :regexp:`/x?abcd*/` -- may not
+perform as expected.
+
+Firstly, the :regexp:`x?` portion of the pattern is unnecessary, as it will not
+affect the match results.
+
+Secondly, the above pattern will match 'more' than :regexp:`/abc/` but
+:regexp:`/abc/` will always detect any input data that will be matched by
+:regexp:`/x?abcd*/` -- it will just produce fewer matches.
+
+For example, input data ``0123abcdddd`` will match :regexp:`/abc/` once but
+:regexp:`/abcd*/` five times (at ``abc``, ``abcd``, ``abcdd``, ``abcddd``, and
+``abcdddd``).
+
+*********************************
+Bounded repeats in streaming mode
+*********************************
+
+.. tip:: Bounded repeats are expensive in streaming mode.
+
+A bounded repeat construction such as :regexp:`/X.{1000,1001}abcd/` is extremely
+expensive in streaming mode, of necessity. It requires us to take action on
+each ``X`` character (itself expensive, relative to searching for longer strings)
+and potentially record a history of hundreds of offsets where ``X`` occurred in
+case the ``X`` and ``abcd`` characters are separated by a stream boundary.
+
+Heavy and unnecessary use of bounded repeats should be avoided, especially
+where other parts of a signature are quite specific. For example, a virus
+signature that matches a virus payload may be sufficient without including a
+prefix that includes, for example, a 2-character Windows executable prefix and
+a bounded repeat beforehand.
+
+***************
+Prefer literals
+***************
+
+.. tip:: Where possible, prefer patterns which 'require' literals, especially
+   longer literals, and in streaming mode, prefer signatures that 'require'
+   literals earlier in the pattern.
+
+Patterns which must match on a literal will run faster than patterns that do
+not. For example:
+
+- :regexp:`/\\wab\\d*\\w\\w\\w/` will run faster than
+- :regexp:`/\\w\\w\\d*\\w\\w/`, or, for that matter
+- :regexp:`/\\w(abc)?\\d*\\w\\w\\w/` (this contains a literal but it need
+  not appear in the input).
+
+Even implicit literals are better than none: :regexp:`/[0-2][3-5].*\\w\\w/`
+still effectively contains 9 2-character literals. No hand-optimization of this
+case is required; this pattern will not run faster if rewritten as:
+:regexp:`/(03|04|05|13|14|15|23|24|25).*\\w\\w/`.
+
+Under all circumstances it is better to use longer literals than shorter ones.
+A database consisting of 100 14-character literals will scan considerably
+faster than one consisting of 100 4-character literals and return fewer
+positives.
+
+Additionally, in streaming mode, a signature that contains a longer literal
+early in the pattern is preferred to one that does not.
+
+For example: :regexp:`/b\\w*foobar/` is not as good a pattern as
+:regexp:`/blah\\w*foobar/`.
+
+The disparity between these patterns is much smaller in block mode.
+
+Longer literals anywhere in the pattern are still preferred in streaming mode.
+For example, both of the above patterns are stronger and will scan faster than
+:regexp:`/b\\w*fo/` even in streaming mode.
+
+**************
+"Dot all" mode
+**************
+
+.. tip:: Use "dot all" mode where possible.
+
+Not using the :c:member:`HS_FLAG_DOTALL` pattern flag can be expensive, as
+implicitly, it means that patterns of the form :regexp:`/A.*B/` become
+:regexp:`/A[^\\n]*B/`.
+
+It is likely that scanning tasks without the DOTALL flag are better done 'line
+at a time', with the newline sequences marking the beginning and end of each
+block.
+
+This will be true in most use-cases (an exception being where the DOTALL flag
+is off but the pattern contains either explicit newlines or constructs such as
+:regexp:`\\s` that implicitly match a newline character).
+
+*****************
+Single-match flag
+*****************
+
+.. tip:: Consider using the single-match flag to limit matches to one match per
+   pattern only if possible.
+
+If only one match per pattern is required, use the flag provided to indicate
+this (:c:member:`HS_FLAG_SINGLEMATCH`). This flag can allow a number of
+optimizations to be applied, allowing both performance improvements and state
+space reductions when streaming.
+
+However, there is some overhead associated with tracking whether each pattern in
+the pattern set has matched, and some applications with infrequent matches may
+see reduced performance when the single-match flag is used.
+
+********************
+Start of Match flag
+********************
+
+.. tip:: Do not request Start of Match information if it is not needed.
+
+Start of Match (SOM) information can be expensive to gather and can require
+large amounts of stream state to store in streaming mode. As such, SOM
+information should only be requested with the :c:member:`HS_FLAG_SOM_LEFTMOST`
+flag for patterns that require it.
+
+SOM information is not generally expected to be cheaper (in either performance
+terms or in stream state overhead) than the use of bounded repeats.
+Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
+the callback is considerably more expensive and general than
+:regexp:`/foo.{300}bar/`.
+
+Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
+used to specify a lower bound on the length of the matches for a pattern. Using
+this facility may be more lightweight in some circumstances than using the SOM
+flag and post-confirming match length in the calling application.
--- a/doc/dev-reference/preface.rst
+++ b/doc/dev-reference/preface.rst
@ -0,0 +1,47 @@
+#######
+Preface
+#######
+
+********
+Overview
+********
+
+Hyperscan is a regular expression engine designed to offer high performance, the
+ability to match multiple expressions simultaneously and flexibility in
+scanning operation.
+
+Patterns are provided to a compilation interface which generates an immutable
+pattern database. The scan interface then can be used to scan a target data
+buffer for the given patterns, returning any matching results from that data
+buffer. Hyperscan also provides a streaming mode, in which matches that span
+several blocks in a stream are detected.
+
+This document is designed to facilitate code-level integration of the Hyperscan
+library with existing or new applications.
+
+:ref:`intro` is a short overview of the Hyperscan library, with more detail on
+the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
+:ref:`runtime`.
+
+:ref:`perf` provides details on various factors which may impact the
+performance of a Hyperscan integration.
+
+:ref:`api_constants` and :ref:`api_files` provide a detailed summary of the
+Hyperscan Application Programming Interface (API).
+
+********
+Audience
+********
+
+This guide is aimed at developers interested in integrating Hyperscan into an
+application. For information on building the Hyperscan library, see the Quick
+Start Guide.
+
+***********
+Conventions
+***********
+
+* Text in a ``fixed-width font`` refers to a code element, e.g. type name;
+  function or method name.
+* Text in a :regexp:`coloured fixed-width font` refers to a regular
+  expression or a part of a regular expression.
--- a/doc/dev-reference/runtime.rst
+++ b/doc/dev-reference/runtime.rst
@ -0,0 +1,198 @@
+.. _runtime:
+
+#####################
+Scanning for Patterns
+#####################
+
+Hyperscan provides three different scanning modes, each with its own scan
+function beginning with ``hs_scan``. In addition, streaming mode has a number
+of other API functions for managing stream state.
+
+****************
+Handling Matches
+****************
+
+All of these functions will call a user-supplied callback function when a match
+is found. This function has the following signature:
+
+  .. doxygentypedef:: match_event_handler
+     :outline:
+     :no-link:
+
+The *id* argument will be set to the identifier for the matching expression
+provided at compile time, and the *to* argument will be set to the end-offset
+of the match. If SOM was requested for the pattern (see :ref:`som`), the
+*from* argument will be set to the leftmost possible start-offset for the match.
+
+The match callback function has the capability to halt scanning
+by returning a non-zero value.
+
+See :c:type:`match_event_handler` for more information.
+
+**************
+Streaming Mode
+**************
+
+The streaming runtime API consists of functions to open, scan, and close
+Hyperscan data streams -- these functions being :c:func:`hs_open_stream`,
+:c:func:`hs_scan_stream`, and :c:func:`hs_close_stream`. Any matches detected
+in the written data are returned to the calling application via a function
+pointer callback.
+
+The match callback function has the capability to halt scanning of the current
+data stream by returning a non-zero value. In streaming mode, the result of
+this is that the stream is then left in a state where no more data can be
+scanned, and any subsequent calls to :c:func:`hs_scan_stream` for that stream
+will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
+still call :c:func:`hs_close_stream` to complete the clean-up process for that
+stream.
+
+Streams exist in the Hyperscan library so that pattern matching state can be
+maintained across multiple blocks of target data -- without maintaining this
+state, it would not be possible to detect patterns that span these blocks of
+data. This, however, does come at the cost of requiring an amount of storage
+per-stream (the size of this storage is fixed at compile time), and a slight
+performance penalty in some cases to manage the state.
+
+While Hyperscan does always support a strict ordering of multiple matches,
+streaming matches will not be delivered at offsets before the current stream
+write, with the exception of zero-width asserts, where constructs such as
+:regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
+stream write to be delayed until the next stream write or stream close
+operation.
+
+=================
+Stream Management
+=================
+
+In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
+:c:func:`hs_close_stream`, the Hyperscan API provides a number of other
+functions for the management of streams:
+
+* :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
+  equivalent to calling :c:func:`hs_close_stream` but will not free the memory
+  used for stream state.
+
+* :c:func:`hs_copy_stream`: constructs a (newly allocated) duplicate of a
+  stream.
+
+* :c:func:`hs_reset_and_copy_stream`: constructs a duplicate of a stream into
+  another, resetting the destination stream first. This call avoids the
+  allocation done by :c:func:`hs_copy_stream`.
+
+**********
+Block Mode
+**********
+
+The block mode runtime API consists of a single function: :c:func:`hs_scan`. Using
+the compiled patterns this function identifies matches in the target data,
+using a function pointer callback to communicate with the application.
+
+This single :c:func:`hs_scan` function is essentially equivalent to calling
+:c:func:`hs_open_stream`, making a single call to :c:func:`hs_scan_stream`, and
+then :c:func:`hs_close_stream`, except that block mode operation does not
+incur all the stream related overhead.
+
+*************
+Vectored Mode
+*************
+
+The vectored mode runtime API, like the block mode API, consists of a single
+function: :c:func:`hs_scan_vector`. This function accepts an array of data
+pointers and lengths, facilitating the scanning in sequence of a set of data
+blocks that are not contiguous in memory.
+
+From the caller's perspective, this mode will produce the same matches as if
+the set of data blocks were (a) scanned in sequence with a series of streaming
+mode scans, or (b) copied in sequence into a single block of memory and then
+scanned in block mode.
+
+*************
+Scratch Space
+*************
+
+While scanning data, Hyperscan needs a small amount of temporary memory to store
+on-the-fly internal data. This amount is unfortunately too large to fit on the
+stack, particularly for embedded applications, and allocating memory dynamically
+is too expensive, so a pre-allocated "scratch" space must be provided to the
+scanning functions.
+
+The function :c:func:`hs_alloc_scratch` allocates a large enough region of
+scratch space to support a given database. If the application uses multiple
+databases, only a single scratch region is necessary: in this case, calling
+:c:func:`hs_alloc_scratch` on each database (with the same ``scratch`` pointer)
+will ensure that the scratch space is large enough to support scanning against
+any of the given databases.
+
+Importantly, only one such space is required per thread and can (and indeed
+should) be allocated before data scanning is to commence. In a scenario where a
+set of expressions are compiled by a single "master" thread and data will be
+scanned by multiple "worker" threads, the convenience function
+:c:func:`hs_clone_scratch` allows multiple copies of an existing scratch space
+to be made for each thread (rather than forcing the caller to pass all the
+compiled databases through :c:func:`hs_alloc_scratch` multiple times).
+
+For example:
+
+.. code-block:: c
+
+    hs_error_t err;
+    hs_scratch_t *scratch_prototype = NULL;
+    err = hs_alloc_scratch(db, &scratch_prototype);
+    if (err != HS_SUCCESS) {
+        printf("hs_alloc_scratch failed!");
+        exit(1);
+    }
+
+    hs_scratch_t *scratch_thread1 = NULL;
+    hs_scratch_t *scratch_thread2 = NULL;
+
+    err = hs_clone_scratch(scratch_prototype, &scratch_thread1);
+    if (err != HS_SUCCESS) {
+        printf("hs_clone_scratch failed!");
+        exit(1);
+    }
+    err = hs_clone_scratch(scratch_prototype, &scratch_thread2);
+    if (err != HS_SUCCESS) {
+        printf("hs_clone_scratch failed!");
+        exit(1);
+    }
+
+    hs_free_scratch(scratch_prototype);
+
+    /* Now two threads can both scan against database db,
+       each with its own scratch space. */
+
+While the Hyperscan library is re-entrant, the use of scratch spaces is not.
+For example, if by design it is deemed necessary to run recursive or nested
+scanning (say, from the match callback function), then an additional scratch
+space is required for that context.
+
+The easiest way to achieve this is to build up a single scratch space as a
+prototype, then clone it for each context, as shown in the example above.
+
+*****************
+Custom Allocators
+*****************
+
+By default, structures used by Hyperscan at runtime (scratch space, stream
+state, etc) are allocated with the default system allocators, usually
+``malloc()`` and ``free()``.
+
+The Hyperscan API provides a facility for changing this behaviour to support
+applications that use custom memory allocators.
+
+These functions are:
+
+- :c:func:`hs_set_database_allocator`, which sets the allocate and free functions
+  used for compiled pattern databases.
+- :c:func:`hs_set_scratch_allocator`, which sets the allocate and free
+  functions used for scratch space.
+- :c:func:`hs_set_stream_allocator`, which sets the allocate and free functions
+  used for stream state in streaming mode.
+- :c:func:`hs_set_misc_allocator`, which sets the allocate and free functions
+  used for miscellaneous data, such as compile error structures and
+  informational strings.
+
+The :c:func:`hs_set_allocator` function can be used to set all of the custom
+allocators to the same allocate/free pair.
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -0,0 +1,24 @@
+# Example programs. pcapscan and patbench read PCAP files and so are only
+# built when libpcap is found; simplegrep has no such dependency.
+find_library(PCAP_LIBRARY pcap)
+
+if (NOT PCAP_LIBRARY)
+    message(STATUS "Could not find libpcap - some examples will not be built")
+endif()
+
+add_executable(simplegrep simplegrep.c)
+set_source_files_properties(simplegrep.c PROPERTIES COMPILE_FLAGS
+    "-Wall -Wno-unused-parameter")
+target_link_libraries(simplegrep hs)
+
+if (PCAP_LIBRARY)
+    # Link the exact library that find_library() located. A bare "pcap" would
+    # be handed to the linker as -lpcap, which fails when libpcap was found
+    # outside the linker's default search path.
+    add_executable(pcapscan pcapscan.cc)
+    add_executable(patbench patbench.cc)
+    set_source_files_properties(pcapscan.cc patbench.cc PROPERTIES
+        COMPILE_FLAGS "-Wall -Wno-unused-parameter")
+    target_link_libraries(pcapscan hs "${PCAP_LIBRARY}")
+    target_link_libraries(patbench hs "${PCAP_LIBRARY}")
+endif()
--- a/examples/README.md
+++ b/examples/README.md
@ -0,0 +1,155 @@
+Hyperscan Example Code
+======================
+
+Copyright (C) 2015 Intel Corporation. All rights reserved.
+
+The files in this directory contain example code demonstrating the use of the
+Hyperscan regular expression matching library. The examples have been
+constructed to be useful utility programs, but they have been simplified
+somewhat, so generally contain "shortcuts" that one would not take if building
+a "real" system.
+
+The examples each contain a short description in a comment at the top of the
+file, including build instructions.
+
+---
+
+
+Example 1: simplegrep
+---------------------
+
+The first example program (`simplegrep.c`) is modelled on the ubiquitous grep
+tool to search a file for a single regular expression. 'simplegrep' does the
+same, but eschews a lot of grep's complexity: it is unable to read data from
+`stdin`, and doesn't support grep's plethora of command-line arguments.
+
+This code is intended to be simple portable C99.
+
+simplegrep demonstrates the following Hyperscan concepts:
+
+- Single pattern compilation: As simplegrep can scan for one pattern only, it
+  uses the `hs_compile` function instead of the multi-pattern variant:
+  `hs_compile_multi`.
+
+- Block mode pattern-matching: simplegrep will search a single data buffer
+  for the given pattern, so it has no need to set up and tear down streams.
+  (See the next section for a streaming mode example)
+
+- Scratch space allocation and use: Hyperscan requires a small amount of
+  temporary memory that is used in the `hs_scan` call. The caller needs to
+  guarantee that only one instance of `hs_scan` is using the scratch space at a
+  time, but there is no requirement that the same scratch area be used on
+  consecutive calls to `hs_scan`. Given that it is expensive to allocate the
+  scratch space, one would typically allocate all necessary scratch space at
+  system startup and reuse it throughout execution of the program.
+
+
+Example 2: pcapscan
+-------------------
+
+The second example program (`pcapscan.cc`) is a very simple packet scanning
+benchmark. It scans a given PCAP file full of network traffic against a group
+of regular expressions and returns some coarse performance measurements.  This
+example provides a quick way to examine the performance achievable on a
+particular combination of platform, pattern set and input data.
+
+In block mode, pcapscan scans each packet individually against a Hyperscan
+database. In streaming mode, pcapscan assigns packets to flows using a
+rudimentary connection tracker, then scans the packets in each flow with
+Hyperscan's streaming mode interface. This demonstrates the use of streaming
+mode operation to detect matches that straddle packet boundaries.
+
+**Note**: the flow assignment implemented here is intended as a simple demo; it
+merely ensures that packets with the same 5-tuple are written to the same
+stream in the order in which they appear in the PCAP file.  No packet
+re-ordering or connection state tracking (as you would expect to find in a real
+network scanning application) is done.
+
+pcapscan introduces the following Hyperscan concepts:
+
+- Multi-pattern compilation: Unlike simplegrep, pcapscan requires a file of
+  expressions as input instead of a single pattern. pcapscan will read this
+  file in, one pattern per line, and use it as input to the `hs_compile_multi`
+  function. This function generates a pattern database that will match all the
+  input patterns in parallel.
+
+- Streamed pattern-matching: pcapscan uses the `hs_scan_stream` function
+  (instead of the block-mode `hs_scan` call) to allow it to identify matches
+  that occur in a stream of data, even if they straddle the boundaries between blocks.
+  Streaming mode operation has a number of unique properties:
+
+  - Stream state that persists for the lifetime of the stream must be allocated
+    with the `hs_open_stream` function before scanning can take place.
+    Similarly, it must be freed with `hs_close_stream` after it is no longer
+    needed. Each stream being scanned concurrently requires its own stream
+    state.
+
+  - In streaming mode, a non-zero return from the user-specified event-handler
+    function has consequences for the rest of that stream's lifetime: when a
+    non-zero return occurs, it signals that no more of the stream should be
+    scanned. Consequently if the user makes a subsequent call to
+    `hs_scan_stream` on a stream whose processing was terminated in this way,
+    `hs_scan_stream` will return `HS_SCAN_TERMINATED`. This case has not been
+    demonstrated in pcapscan, as its callback always returns 0.
+
+  - Match handling during stream shutdown: As matches may occur when the
+    `hs_close_stream` function is called, it too must be provided with scratch
+    space in order to perform this match processing. Similarly, the user must
+    be prepared to be issued match event callbacks during the `hs_close_stream`
+    call. For this reason, we advise that stream shutdown be an integral part
+    of the system design.
+
+
+Example 3: patbench
+-------------------
+
+This program allows users to detect which signatures may be the most expensive
+in a set of patterns. It is designed for use with small to medium pattern set
+sizes (e.g. 5-500). If used with very large pattern sets it may take a very
+long time - the number of recompiles done is `g * O(lg2(n))` where `g` is the
+number of generations and `n` is the number of patterns (assuming that `n >>
+g`).
+
+This utility will return a cumulative series of removed patterns. The first
+generation will find and remove a single pattern. The second generation will
+begin with the first pattern removed and find another pattern to remove, etc.
+So if we have 100 patterns and 15 generations, the final generation's score
+will be a run over 85 patterns.
+
+This utility is probabilistic. It is possible that the pattern removed in a
+generation is not a particularly expensive pattern. To reduce noise in the
+results use 'taskset' and set the number of repeats to a level that still
+completes in reasonable time (this will reduce the effect of random measurement
+noise).
+
+The criterion for performance can be altered by use of the `-C<x>` flag where
+`<x>` can be `t,r,s,c,b`, selecting pattern matching throughput, scratch size,
+stream state size (only available in streaming mode), compile time and bytecode
+size respectively.
+
+This utility will also not produce good results if all the patterns are roughly
+equally expensive.
+
+### Factor Group Size:
+
+If there are multiple expensive patterns that are very similar on the
+left-hand-side or identical, this utility will typically not find these groups
+unless the `-F` flag is used to search for a group size that is equal to or
+larger than the size of the group of similar patterns.
+
+Otherwise, removing a portion of the similar patterns will have no or almost no
+effect, and the search procedure used relies on the ability to remove all of
+the similar patterns in at least one search case, something which will only
+happen if the `factor_group_size` is large enough.
+
+This alters the operation of the tool so that instead of trying to find the
+single pattern whose removal has the most effect by binary search (the default
+with `factor_group_size == 1`), we attempt to find the N patterns whose removal
+has the most effect by searching over `N + 1` evenly sized groups, removing
+only `1/(N + 1)` of the search signatures per iteration.
+
+Note that the number of recompiles done greatly increases with increased factor
+group size.  For example, with `factor_group_size = 1`, we do `g * 2 * lg2(n)`
+recompiles, while with `factor_group_size = 4`, we do `g * 4 * log(5/4)(n)`.
+Informally the number of generations we require goes up as we eliminate a
+smaller number of signatures and we have to do more work per generation.
--- a/examples/patbench.cc
+++ b/examples/patbench.cc
@ -0,0 +1,892 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Hyperscan pattern benchmarker.
+ *
+ * This program allows users to detect which signatures may be the most
+ * expensive in a set of patterns. It is designed for use with small to medium
+ * pattern set sizes (e.g. 5-500). If used with very large pattern sets it may
+ * take a very long time - the number of recompiles done is g * O(lg2(n)) where
+ * g is the number of generations and n is the number of patterns (assuming
+ * that n >> g).
+ *
+ * This utility will return a cumulative series of removed patterns. The first
+ * generation will find and remove a single pattern. The second generation will
+ * begin with the first pattern removed and find another pattern to remove,
+ * etc. So if we have 100 patterns and 15 generations, the final generation's
+ * score will be a run over 85 patterns.
+ *
+ * This utility is probabilistic. It is possible that the pattern removed in a
+ * generation is not a particularly expensive pattern. To reduce noise in the
+ * results use 'taskset' and set the number of repeats to a level that still
+ * completes in reasonable time (this will reduce the effect of random
+ * measurement noise).
+ *
+ * The criterion for performance can be altered by use of the -C<x> flag where
+ * <x> can be t,r,s,c,b, selecting pattern matching throughput, scratch size,
+ * stream state size (only available in streaming mode), compile time and
+ * bytecode size respectively.
+ *
+ * This utility will also not produce good results if all the patterns are
+ * roughly equally expensive.
+ *
+ * Factor Group Size:
+ *
+ * If there are multiple expensive patterns that are very similar on the
+ * left-hand-side or identical, this utility will typically not find these
+ * groups unless the -F flag is used to search for a group size that is equal
+ * to or larger than the size of the group of similar patterns.
+ *
+ * Otherwise, removing a portion of the similar patterns will have no or almost
+ * no effect, and the search procedure used relies on the ability to remove all
+ * of the similar patterns in at least one search case, something which will
+ * only happen if the factor_group_size is large enough.
+ *
+ * This alters the operation of our tool so that instead of trying to find the
+ * single pattern whose removal has the most effect by binary search (the
+ * default with factor_group_size == 1), we attempt to find the N patterns
+ * whose removal has the most effect by searching over N+1 evenly sized groups,
+ * removing only 1/(N+1) of the search signatures per iteration.
+ *
+ * Note that the number of recompiles done greatly increases with increased
+ * factor group size.  For example, with factor_group_size = 1, we do g * 2 *
+ * lg2(n) recompiles, while with factor_group_size = 4, we do g * 4 *
+ * log(5/4)(n). Informally the number of generations we require goes up as we
+ * eliminate a smaller number of signatures and we have to do more work per
+ * generation.
+ *
+ *
+ * Build instructions:
+ *
+ *     g++ -o patbench patbench.cc $(pkg-config --cflags --libs libhs) -lpcap
+ *
+ * Usage:
+ *
+ *     ./patbench [ -n repeats] [ -G generations] [ -C criterion ]
+ *             [ -F factor_group_size ] [ -N | -S ] <pattern file> <pcap file>
+ *
+ *     -n repeats sets the number of times the PCAP is repeatedly scanned
+ *        with the pattern
+ *     -G generations sets the number of generations that the algorithm is
+ *        run for
+ *     -N sets non-streaming mode, -S sets streaming mode (default)
+ *     -F sets the factor group size (must be >0); this allows the detection
+ *        of multiple interacting factors
+ *
+ *     -C sets the "criterion", which can be either:
+ *          t  throughput (the default) - this requires a pcap file
+ *          r  scratch size
+ *          s  stream state size
+ *          c  compile time
+ *          b  bytecode size
+ *
+ * We recommend the use of a utility like 'taskset' on multiprocessor hosts to
+ * lock execution to a single processor: this will remove processor migration
+ * by the scheduler as a source of noise in the results.
+ *
+ */
+
+#include <algorithm>
+#include <cstring>
+#include <chrono>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <set>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include <unistd.h>
+
+// We use the BSD primitives throughout as they exist on both BSD and Linux.
+#define __FAVOR_BSD
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
+#include <net/ethernet.h>
+#include <arpa/inet.h>
+
+#include <pcap.h>
+
+#include <hs.h>
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::ifstream;
+using std::string;
+using std::unordered_map;
+using std::vector;
+using std::set;
+using std::min;
+using std::max;
+using std::copy;
+
+// The measurement used to score a pattern set; chosen with the -C option
+// described in the usage notes at the top of this file.
+enum Criterion {
+    // Scanning throughput; printed in Megabits/s.
+    CRITERION_THROUGHPUT,
+    // Printed as a byte count.
+    CRITERION_BYTECODE_SIZE,
+    // Printed in seconds.
+    CRITERION_COMPILE_TIME,
+    // Printed as a byte count.
+    CRITERION_STREAM_STATE,
+    // Printed as a byte count.
+    CRITERION_SCRATCH_SIZE
+};
+
+// Reports whether a larger value of criterion c is the better result. Only
+// throughput is treated that way; every other criterion returns false.
+static bool higher_is_better(Criterion c) {
+    switch (c) {
+    case CRITERION_THROUGHPUT:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Writes val to cout followed by the unit that belongs to criterion c.
+// Throughput and compile time are printed in fixed notation with three
+// decimal places; any other criterion is truncated to a whole byte count.
+static void print_criterion(Criterion c, double val) {
+    if (c == CRITERION_THROUGHPUT) {
+        cout << std::fixed << std::setprecision(3) << val << " Megabits/s";
+    } else if (c == CRITERION_COMPILE_TIME) {
+        cout << std::fixed << std::setprecision(3) << val << " seconds";
+    } else {
+        // Bytecode size, stream state size, scratch size and anything
+        // unrecognised are all reported as sizes.
+        cout << static_cast<size_t>(val) << " bytes";
+    }
+}
+
+// Identifies one stream in the pcap input by the protocol, addresses and
+// ports found in a packet's headers.
+struct FiveTuple {
+    unsigned int protocol;
+    unsigned int srcAddr;
+    unsigned int srcPort;
+    unsigned int dstAddr;
+    unsigned int dstPort;
+
+    // Build the key from the IP header of a TCP or UDP packet. Addresses and
+    // ports are stored exactly as they appear in the headers; no byte-order
+    // conversion is applied.
+    FiveTuple(const struct ip *iphdr) {
+        // ip_hl counts 32-bit words, so the transport header begins
+        // ip_hl * 4 bytes after the start of the IP header.
+        const unsigned int ipHeaderBytes = iphdr->ip_hl * 4;
+        const char *ipStart = reinterpret_cast<const char *>(iphdr);
+
+        // Both ports are read through a udphdr view of the transport header.
+        const struct udphdr *ports =
+            reinterpret_cast<const struct udphdr *>(ipStart + ipHeaderBytes);
+
+        protocol = iphdr->ip_p;
+        srcAddr = iphdr->ip_src.s_addr;
+        srcPort = ports->uh_sport;
+        dstAddr = iphdr->ip_dst.s_addr;
+        dstPort = ports->uh_dport;
+    }
+
+    // Two keys are equal only when all five fields agree.
+    bool operator==(const FiveTuple &a) const {
+        if (protocol != a.protocol) {
+            return false;
+        }
+        if (srcAddr != a.srcAddr || srcPort != a.srcPort) {
+            return false;
+        }
+        return dstAddr == a.dstAddr && dstPort == a.dstPort;
+    }
+};
+
+// Deliberately minimal hash for FiveTuple keys, used by the unordered_map
+// that assigns packets to streams: the five fields are XORed together.
+struct FiveTupleHash {
+    size_t operator()(const FiveTuple &x) const {
+        unsigned int mixed = x.srcAddr;
+        mixed ^= x.dstAddr;
+        mixed ^= x.protocol;
+        mixed ^= x.srcPort;
+        mixed ^= x.dstPort;
+        return mixed;
+    }
+};
+
+// Helper function, defined at the end of this file. Locates the TCP/UDP
+// payload inside a captured Ethernet frame: on success it returns true and
+// stores the payload's offset into the frame and its length through the
+// out-parameters.
+static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
+                          unsigned int *length);
+
+// Match event handler, passed to the Hyperscan scan and close-stream calls and
+// invoked for every match found. It only counts matches; the match details
+// (id, from, to, flags) are ignored.
+static
+int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
+            unsigned int flags, void *ctx) {
+    // ctx is the address of the size_t match counter supplied by the caller.
+    size_t &counter = *static_cast<size_t *>(ctx);
+    ++counter;
+    return 0; // continue matching
+}
+
+// Simple stopwatch: call start(), run the work, call stop(), then read the
+// elapsed time with seconds().
+class Clock {
+public:
+    // Record the beginning of the measured interval.
+    void start() {
+        time_start = clock_type::now();
+    }
+
+    // Record the end of the measured interval.
+    void stop() {
+        time_end = clock_type::now();
+    }
+
+    // Time between the most recent start() and stop() calls, in seconds.
+    // Returns 0 if neither has been called yet.
+    double seconds() const {
+        std::chrono::duration<double> delta = time_end - time_start;
+        return delta.count();
+    }
+private:
+    // steady_clock is monotonic. system_clock follows the wall clock, so an
+    // NTP or manual time adjustment during a run could produce a negative or
+    // wildly wrong interval.
+    typedef std::chrono::steady_clock clock_type;
+
+    std::chrono::time_point<clock_type> time_start, time_end;
+};
+
+// Class wrapping all state associated with the benchmark: the packets read
+// from the PCAP file, the compiled Hyperscan database, its scratch space and
+// the per-stream state.
+class Benchmark {
+private:
+    // Packet data to be scanned
+    vector<string> packets;
+
+    // Stream ID for each packet
+    vector<size_t> stream_ids;
+
+    // Map used to construct stream_ids
+    unordered_map<FiveTuple, size_t, FiveTupleHash> stream_map;
+
+    // Hyperscan compiled database
+    hs_database_t *db = nullptr;
+
+    // Hyperscan temporary scratch space
+    hs_scratch_t *scratch = nullptr;
+
+    // Vector of Hyperscan stream state
+    vector<hs_stream_t *> streams;
+
+    // Count of matches found while scanning
+    size_t matchCount = 0;
+
+    // Returns true if the first capLen bytes of the frame cover the Ethernet
+    // header, the whole IP header and the fixed part of the transport header,
+    // i.e. every byte that payloadOffset() and FiveTuple read. PCAP files
+    // written with a small snapshot length may hold less than that.
+    static bool headersCaptured(const unsigned char *pktData, size_t capLen) {
+        const size_t ipStart = sizeof(struct ether_header);
+        if (capLen < ipStart + sizeof(struct ip)) {
+            return false;
+        }
+        const struct ip *iphdr = (const struct ip *)(pktData + ipStart);
+        size_t need = ipStart + static_cast<size_t>(iphdr->ip_hl) * 4;
+        if (iphdr->ip_p == IPPROTO_TCP) {
+            need += sizeof(struct tcphdr);
+        } else {
+            need += sizeof(struct udphdr);
+        }
+        return capLen >= need;
+    }
+public:
+    Benchmark() = default;
+
+    // The object owns db and scratch; a copy would free them twice.
+    Benchmark(const Benchmark &) = delete;
+    Benchmark &operator=(const Benchmark &) = delete;
+
+    ~Benchmark() {
+        hs_free_scratch(scratch);
+        hs_free_database(db);
+    }
+
+    // Initialisation; after this call, Benchmark owns the database and will
+    // ensure it is freed. Exits the process if scratch cannot be allocated.
+    void setDatabase(hs_database_t *hs_db) {
+        hs_free_database(db); // Free previous database.
+        db = hs_db;
+        // (Re)allocate scratch to ensure that it is large enough to handle the
+        // database.
+        hs_error_t err = hs_alloc_scratch(db, &scratch);
+        if (err != HS_SUCCESS) {
+            cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
+            exit(-1);
+        }
+    }
+
+    // Returns the database most recently passed to setDatabase().
+    const hs_database_t *getDatabase() const {
+        return db;
+    }
+
+    // Returns the size of the scratch space as reported by hs_scratch_size();
+    // exits the process if the query fails.
+    size_t getScratchSize() const {
+        size_t scratch_size;
+        hs_error_t err = hs_scratch_size(scratch, &scratch_size);
+        if (err != HS_SUCCESS) {
+            cerr << "ERROR: could not query scratch space size. Exiting."
+                 << endl;
+            exit(-1);
+        }
+        return scratch_size;
+    }
+
+    // Read a set of streams from a pcap file. Every TCP/UDP payload becomes
+    // one entry in packets, tagged with a stream ID derived from its
+    // FiveTuple. Returns false if the file cannot be opened or yields no
+    // usable packets.
+    bool readStreams(const char *pcapFile) {
+        // Open PCAP file for input
+        char errbuf[PCAP_ERRBUF_SIZE];
+        pcap_t *pcapHandle = pcap_open_offline(pcapFile, errbuf);
+        if (pcapHandle == nullptr) {
+            cerr << "ERROR: Unable to open pcap file \"" << pcapFile
+                 << "\": " << errbuf << endl;
+            return false;
+        }
+
+        struct pcap_pkthdr pktHeader;
+        const unsigned char *pktData;
+        while ((pktData = pcap_next(pcapHandle, &pktHeader)) != nullptr) {
+            // Number of bytes actually present in pktData.
+            const unsigned int captured = pktHeader.caplen;
+
+            // Skip frames too short to contain the headers we are about to
+            // parse, so that no header field is read out of bounds.
+            if (!headersCaptured(pktData, captured)) {
+                continue;
+            }
+
+            unsigned int offset = 0, length = 0;
+            if (!payloadOffset(pktData, &offset, &length)) {
+                continue;
+            }
+
+            // The length comes from the IP header and may exceed what was
+            // captured; only scan the bytes that are really there.
+            if (offset >= captured) {
+                continue;
+            }
+            length = min(length, captured - offset);
+
+            // Valid TCP or UDP packet
+            const struct ip *iphdr = (const struct ip *)(pktData
+                    + sizeof(struct ether_header));
+            const char *payload = (const char *)pktData + offset;
+
+            // insert() leaves an existing entry untouched, so a known stream
+            // keeps its ID and a new stream gets the next free one.
+            size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
+                                          stream_map.size())).first->second;
+
+            packets.push_back(string(payload, length));
+            stream_ids.push_back(id);
+        }
+        pcap_close(pcapHandle);
+
+        return !packets.empty();
+    }
+
+    // Return the number of bytes scanned
+    size_t bytes() const {
+        size_t sum = 0;
+        for (const auto &packet : packets) {
+            sum += packet.size();
+        }
+        return sum;
+    }
+
+    // Return the number of matches found.
+    size_t matches() const {
+        return matchCount;
+    }
+
+    // Clear the number of matches found.
+    void clearMatches() {
+        matchCount = 0;
+    }
+
+    // Open a Hyperscan stream for each stream in stream_ids
+    void openStreams() {
+        streams.resize(stream_map.size());
+        for (auto &stream : streams) {
+            hs_error_t err = hs_open_stream(db, 0, &stream);
+            if (err != HS_SUCCESS) {
+                cerr << "ERROR: Unable to open stream. Exiting." << endl;
+                exit(-1);
+            }
+        }
+    }
+
+    // Close all open Hyperscan streams (potentially generating any
+    // end-anchored matches)
+    void closeStreams() {
+        for (auto &stream : streams) {
+            hs_error_t err =
+                hs_close_stream(stream, scratch, onMatch, &matchCount);
+            if (err != HS_SUCCESS) {
+                cerr << "ERROR: Unable to close stream. Exiting." << endl;
+                exit(-1);
+            }
+        }
+    }
+
+    // Scan each packet (in the ordering given in the PCAP file) through
+    // Hyperscan using the streaming interface.
+    void scanStreams() {
+        for (size_t i = 0; i != packets.size(); ++i) {
+            const std::string &pkt = packets[i];
+            hs_error_t err = hs_scan_stream(streams[stream_ids[i]],
+                                            pkt.c_str(), pkt.length(), 0,
+                                            scratch, onMatch, &matchCount);
+            if (err != HS_SUCCESS) {
+                cerr << "ERROR: Unable to scan packet. Exiting." << endl;
+                exit(-1);
+            }
+        }
+    }
+
+    // Scan each packet (in the ordering given in the PCAP file) through
+    // Hyperscan using the block-mode interface.
+    void scanBlock() {
+        for (size_t i = 0; i != packets.size(); ++i) {
+            const std::string &pkt = packets[i];
+            hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0,
+                                     scratch, onMatch, &matchCount);
+            if (err != HS_SUCCESS) {
+                cerr << "ERROR: Unable to scan packet. Exiting." << endl;
+                exit(-1);
+            }
+        }
+    }
+};
+
+// Helper function, defined at the end of this file. Reads the pattern file
+// and appends one entry per signature to each of the four output vectors.
+static void parseFile(const char *filename, vector<string> &patterns,
+                      vector<unsigned> &flags, vector<unsigned> &ids,
+                      vector<string> &originals);
+
+// A set of signatures. The four vectors are parallel: entry i of each holds
+// the compile flags, the ID, the pattern text and the original pattern-file
+// line of signature i.
+class Sigdata {
+    vector<unsigned> flags;
+    vector<unsigned> ids;
+    vector<string> patterns;
+    vector<string> originals;
+
+public:
+    // An empty signature set.
+    Sigdata() {}
+
+    // Load every signature found in the given pattern file.
+    Sigdata(const char *filename) {
+        parseFile(filename, patterns, flags, ids, originals);
+    }
+
+    // The pattern-file line that signature `index` was parsed from.
+    const string &get_original(unsigned index) const {
+        return originals[index];
+    }
+
+    // Compile all signatures into one database for the given mode and store
+    // the time hs_compile_multi() took, in seconds, in *compileTime. The
+    // caller owns the returned database. On a compile error the problem is
+    // reported on stderr and the process exits.
+    hs_database_t *compileDatabase(unsigned mode, double *compileTime) const {
+        // hs_compile_multi wants an array of C strings; the pointers stay
+        // valid because the strings in `patterns` outlive this call.
+        vector<const char *> expressions(patterns.size());
+        for (size_t i = 0; i != patterns.size(); ++i) {
+            expressions[i] = patterns[i].c_str();
+        }
+
+        hs_database_t *compiled = nullptr;
+        hs_compile_error_t *failure;
+
+        Clock timer;
+        timer.start();
+        const hs_error_t rc =
+            hs_compile_multi(expressions.data(), flags.data(), ids.data(),
+                             expressions.size(), mode, nullptr, &compiled,
+                             &failure);
+        timer.stop();
+
+        if (rc != HS_SUCCESS) {
+            if (failure->expression >= 0) {
+                cerr << "ERROR: Pattern '"
+                     << patterns[failure->expression]
+                     << "' failed with error '" << failure->message << "'"
+                     << endl;
+            } else {
+                // The error does not refer to a particular expression.
+                cerr << "ERROR: " << failure->message << endl;
+            }
+            // The error object is dynamically allocated and only exists when
+            // compilation failed, so it is released on this path only.
+            hs_free_compile_error(failure);
+            exit(-1);
+        }
+
+        *compileTime = timer.seconds();
+        return compiled;
+    }
+
+    // Number of signatures in the set.
+    unsigned size() const {
+        return patterns.size();
+    }
+
+    // Returns a copy of this set without the signatures whose index is in
+    // excludeIndexSet; the remaining signatures keep their relative order.
+    Sigdata cloneExclude(const set<unsigned> &excludeIndexSet) const {
+        Sigdata kept;
+        const unsigned total = size();
+        for (unsigned idx = 0; idx < total; ++idx) {
+            if (excludeIndexSet.count(idx) != 0) {
+                continue;
+            }
+            kept.flags.push_back(flags[idx]);
+            kept.ids.push_back(ids[idx]);
+            kept.patterns.push_back(patterns[idx]);
+            kept.originals.push_back(originals[idx]);
+        }
+        return kept;
+    }
+};
+
+// Print the command line help text to stderr. The argument (callers pass the
+// program name) is not used.
+static
+void usage(const char *) {
+    cerr << "Usage:" << endl << endl;
+    cerr << "  patbench [-n repeats] [ -G generations] [ -C criterion ]" << endl
+         << "           [ -F factor_group_size ] [ -N | -S ] "
+         << "<pattern file> <pcap file>" << endl << endl
+         << "    -n repeats sets the number of times the PCAP is repeatedly "
+            "scanned" << endl << "       with the pattern." << endl
+         << "    -G generations sets the number of generations that the "
+            "algorithm is" << endl << "       run for." << endl
+         << "    -N sets non-streaming mode, -S sets streaming mode (default)."
+         << endl << "    -F sets the factor group size (must be >0); this "
+                    "allows the detection" << endl
+         << "       of multiple interacting factors." << endl << "" << endl
+         << "    -C sets the 'criterion', which can be either:" << endl
+         << "         t  throughput (the default) - this requires a pcap file"
+         << endl << "         r  scratch size" << endl
+         << "         s  stream state size" << endl
+         << "         c  compile time" << endl << "         b  bytecode size"
+         << endl << endl
+         << "We recommend the use of a utility like 'taskset' on "
+            "multiprocessor hosts to" << endl
+         << "lock execution to a single processor: this will remove processor "
+            "migration" << endl
+         << "by the scheduler as a source of noise in the results." << endl;
+}
+
+// Scans the whole packet set repeatCount times in streaming mode, opening and
+// closing every stream on each pass, and returns the total elapsed time in
+// seconds. The match counter is reset before timing starts.
+static
+double measure_stream_time(Benchmark &bench, unsigned int repeatCount) {
+    bench.clearMatches();
+
+    Clock timer;
+    timer.start();
+    for (unsigned int pass = 0; pass != repeatCount; ++pass) {
+        bench.openStreams();
+        bench.scanStreams();
+        bench.closeStreams();
+    }
+    timer.stop();
+
+    return timer.seconds();
+}
+
+// Scans the whole packet set repeatCount times in block mode and returns the
+// total elapsed time in seconds. The match counter is reset before timing
+// starts.
+static
+double measure_block_time(Benchmark &bench, unsigned int repeatCount) {
+    bench.clearMatches();
+
+    Clock timer;
+    timer.start();
+    for (unsigned int pass = 0; pass != repeatCount; ++pass) {
+        bench.scanBlock();
+    }
+    timer.stop();
+
+    return timer.seconds();
+}
+
+// Compiles the signature set, installs the database in bench and returns the
+// score of the set for the requested criterion:
+//   bytecode size / stream state / scratch size - a size in bytes
+//   compile time                                - seconds
+//   throughput (and any other value)            - Megabits per second
+// For throughput the packets are scanned repeatCount times in the given mode
+// and, if diagnose is set, a summary line is printed to stdout.
+static
+double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
+                unsigned repeatCount, Criterion criterion,
+                bool diagnose = true) {
+    double compileTime = 0;
+    bench.setDatabase(sigs.compileDatabase(mode, &compileTime));
+
+    // Criteria that need no scanning are answered straight away.
+    if (criterion == CRITERION_COMPILE_TIME) {
+        return compileTime;
+    }
+    if (criterion == CRITERION_SCRATCH_SIZE) {
+        return bench.getScratchSize();
+    }
+    if (criterion == CRITERION_BYTECODE_SIZE) {
+        size_t dbSize;
+        if (hs_database_size(bench.getDatabase(), &dbSize) != HS_SUCCESS) {
+            cerr << "ERROR: could not retrieve bytecode size" << endl;
+            exit(1);
+        }
+        return dbSize;
+    }
+    if (criterion == CRITERION_STREAM_STATE) {
+        size_t streamStateSize;
+        if (hs_stream_size(bench.getDatabase(), &streamStateSize) !=
+            HS_SUCCESS) {
+            cerr << "ERROR: could not retrieve stream state size" << endl;
+            exit(1);
+        }
+        return streamStateSize;
+    }
+
+    // Everything else is treated as THROUGHPUT: time the scan.
+    const double scan_time = (mode == HS_MODE_NOSTREAM)
+                                 ? measure_block_time(bench, repeatCount)
+                                 : measure_stream_time(bench, repeatCount);
+
+    // bytes is the size of one pass over the packets; the scan made
+    // repeatCount passes.
+    const size_t bytes = bench.bytes();
+    const double megabitsPerSec =
+        (bytes * 8 * repeatCount) / (scan_time * 1000000);
+
+    if (diagnose) {
+        cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
+             << " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
+             << std::fixed << std::setprecision(3) << megabitsPerSec
+             << " Mbps, Matches " << bench.matches() << endl;
+    }
+    return megabitsPerSec;
+}
+
+// Main entry point. Parses the command line, loads the pattern file (and the
+// PCAP file when the criterion is throughput), measures a baseline score and
+// then, once per generation, searches for the group of at most factor_max
+// signatures whose removal gives the best score, reports it and cuts it from
+// the set for all later generations.
+int main(int argc, char **argv) {
+    unsigned int repeatCount = 1;
+    unsigned int mode = HS_MODE_STREAM;
+    Criterion criterion = CRITERION_THROUGHPUT;
+    unsigned int gen_max = 10;
+    unsigned int factor_max = 1;
+
+    // Converts a numeric option argument and rejects anything below `lowest`.
+    // atoi() yields 0 for non-numeric text, and a negative number would wrap
+    // to a huge value once stored in an unsigned; both end up rejected here
+    // for options whose minimum is 1. In particular -F 0 must never get
+    // through, because factor_max is used as a divisor below.
+    auto parseCount = [argv](char optName, const char *text,
+                             int lowest) -> unsigned int {
+        const int value = atoi(text);
+        if (value < lowest) {
+            cerr << "ERROR: option -" << optName << " must be at least "
+                 << lowest << " (got '" << text << "')" << endl;
+            usage(argv[0]);
+            exit(-1);
+        }
+        return static_cast<unsigned int>(value);
+    };
+
+    // Process command line arguments.
+    int opt;
+    while ((opt = getopt(argc, argv, "SNn:G:F:C:")) != -1) {
+        switch (opt) {
+        case 'F':
+            factor_max = parseCount('F', optarg, 1);
+            break;
+        case 'G':
+            // Zero generations is allowed: only the baseline is reported.
+            gen_max = parseCount('G', optarg, 0);
+            break;
+        case 'S':
+            mode = HS_MODE_STREAM;
+            break;
+        case 'N':
+            mode = HS_MODE_NOSTREAM;
+            break;
+        case 'C':
+            switch (optarg[0]) {
+            case 't':
+                criterion = CRITERION_THROUGHPUT;
+                break;
+            case 'b':
+                criterion = CRITERION_BYTECODE_SIZE;
+                break;
+            case 'c':
+                criterion = CRITERION_COMPILE_TIME;
+                break;
+            case 's':
+                criterion = CRITERION_STREAM_STATE;
+                break;
+            case 'r':
+                criterion = CRITERION_SCRATCH_SIZE;
+                break;
+            default:
+                cerr << "Unrecognised criterion: " << optarg[0] << endl;
+                usage(argv[0]);
+                exit(-1);
+            }
+            break;
+        case 'n':
+            repeatCount = parseCount('n', optarg, 1);
+            break;
+        default:
+            usage(argv[0]);
+            exit(-1);
+        }
+    }
+
+    // Throughput needs both a pattern file and a PCAP file; every other
+    // criterion takes the pattern file only.
+    if (argc - optind != ((criterion == CRITERION_THROUGHPUT) ? 2 : 1)) {
+        usage(argv[0]);
+        exit(-1);
+    }
+
+    const char *patternFile = argv[optind];
+    // When only one positional argument was given this is argv[argc], which
+    // is a null pointer.
+    const char *pcapFile = argv[optind + 1];
+
+    // Read our input PCAP file in
+    Benchmark bench;
+    if (criterion == CRITERION_THROUGHPUT) {
+        if (!bench.readStreams(pcapFile)) {
+            cerr << "Unable to read packets from PCAP file. Exiting." << endl;
+            exit(-1);
+        }
+    }
+
+    if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) {
+        cerr << "Cannot evaluate stream state for block mode compile. Exiting."
+             << endl;
+        exit(-1);
+    }
+
+    cout << "Base signatures: " << patternFile;
+    if (pcapFile) {
+        cout << "\tPCAP input file: " << pcapFile
+             << "\tRepeat count: " << repeatCount;
+    }
+    if (mode == HS_MODE_STREAM) {
+        cout << "\tMode: streaming";
+    } else {
+        cout << "\tMode: block";
+    }
+    cout << endl;
+
+    Sigdata sigs(patternFile);
+
+    // Without signatures there is nothing to evaluate, and sigs.size() - 1
+    // below would wrap around.
+    if (sigs.size() == 0) {
+        cerr << "ERROR: no patterns found in \"" << patternFile
+             << "\". Exiting." << endl;
+        exit(-1);
+    }
+
+    // calculate and show a baseline
+    eval_set(bench, sigs, mode, repeatCount, criterion);
+
+    // work_sigs: indices still in play; exclude: indices already cut.
+    set<unsigned> work_sigs, exclude;
+
+    for (unsigned i = 0; i < sigs.size(); ++i) {
+        work_sigs.insert(i);
+    }
+
+    double score_base =
+        eval_set(bench, sigs, mode, repeatCount, criterion, false);
+    bool maximize = higher_is_better(criterion);
+    cout << "Number of signatures: " << sigs.size() << endl;
+    cout << "Base performance: ";
+    print_criterion(criterion, score_base);
+    cout << endl;
+
+    // Each generation cuts up to factor_max signatures; stop early enough
+    // that more than factor_max signatures remain at the start of every
+    // generation.
+    unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max);
+
+    cout << "Cutting signatures cumulatively for " << generations
+         << " generations" << endl;
+    for (unsigned gen = 0; gen < generations; ++gen) {
+        cout << "Generation " << gen << " ";
+        // s holds the candidates for cutting; it shrinks on every pass of
+        // the loop below until at most factor_max candidates are left.
+        set<unsigned> s(work_sigs.begin(), work_sigs.end());
+        double best = maximize ? 0 : 1000000000000.0;
+        unsigned count = 0;
+        while (s.size() > factor_max) {
+            count++;
+            cout << "." << std::flush;
+            vector<unsigned> sv(s.begin(), s.end());
+            random_shuffle(sv.begin(), sv.end());
+            unsigned groups = factor_max + 1;
+            for (unsigned current_group = 0; current_group < groups;
+                 current_group++) {
+                // [lo, hi) is the slice of sv that forms the current group.
+                unsigned sz = sv.size();
+                unsigned lo = (current_group * sz) / groups;
+                unsigned hi = ((current_group + 1) * sz) / groups;
+
+                // s_tmp: all candidates except the current group.
+                set<unsigned> s_part1(sv.begin(), sv.begin() + lo);
+                set<unsigned> s_part2(sv.begin() + hi, sv.end());
+                set<unsigned> s_tmp = s_part1;
+                s_tmp.insert(s_part2.begin(), s_part2.end());
+                // Score the signature set with s_tmp and the previously cut
+                // signatures removed.
+                set<unsigned> tmp = s_tmp;
+                tmp.insert(exclude.begin(), exclude.end());
+                Sigdata sigs_tmp = sigs.cloneExclude(tmp);
+                double score = eval_set(bench, sigs_tmp, mode, repeatCount,
+                                        criterion, false);
+
+                // The first group always becomes the candidate set; a later
+                // group replaces it only if its score is strictly better.
+                if ((current_group == 0) ||
+                    (!maximize ? (score < best) : (score > best))) {
+                    s = s_tmp;
+                    best = score;
+                }
+            }
+        }
+        // Pad the progress dots so the results line up in a column.
+        for (unsigned i = count; i < 16; i++) {
+            cout << " ";
+        }
+        cout << "Performance: ";
+        print_criterion(criterion, best);
+        cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
+             << "x) after cutting:" << endl;
+
+        // s now has factor_max signatures
+        for (const auto &found : s) {
+            exclude.insert(found);
+            work_sigs.erase(found);
+            cout << sigs.get_original(found) << endl;
+        }
+
+        cout << endl;
+    }
+    return 0;
+}
+
+/**
+ * Helper function to locate the offset of the first byte of the payload in the
+ * given ethernet frame. Offset into the packet, and the length of the payload
+ * are returned in the arguments @a offset and @a length.
+ *
+ * Returns false, leaving @a offset and @a length untouched, if the frame is
+ * not an unfragmented IPv4 TCP or UDP packet, if its header length fields are
+ * inconsistent, or if it carries no payload.
+ *
+ * The frame's captured length is not known here; the caller must make sure
+ * the headers read by this function are present in @a pkt_data.
+ */
+static
+bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
+                   unsigned int *length) {
+    const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
+    const tcphdr *th = nullptr;
+
+    // Ignore packets that aren't IPv4
+    if (iph->ip_v != 4) {
+        return false;
+    }
+
+    // Ignore fragmented packets.
+    if (iph->ip_off & htons(IP_MF | IP_OFFMASK)) {
+        return false;
+    }
+
+    // IP header length, and transport header length.
+    unsigned int ihlen = iph->ip_hl * 4;
+    unsigned int thlen = 0;
+
+    // A header length smaller than the fixed IP header is malformed.
+    if (ihlen < sizeof(ip)) {
+        return false;
+    }
+
+    switch (iph->ip_p) {
+    case IPPROTO_TCP:
+        th = (const tcphdr *)((const char *)iph + ihlen);
+        thlen = th->th_off * 4;
+        // Likewise for a data offset smaller than the fixed TCP header.
+        if (thlen < sizeof(tcphdr)) {
+            return false;
+        }
+        break;
+    case IPPROTO_UDP:
+        thlen = sizeof(udphdr);
+        break;
+    default:
+        return false;
+    }
+
+    // ip_len covers the IP header, the transport header and the payload. If
+    // it does not exceed the two header lengths there is no payload, and the
+    // unsigned subtraction below would wrap to a huge bogus length.
+    const unsigned int totalLen = ntohs(iph->ip_len);
+    if (totalLen <= ihlen + thlen) {
+        return false;
+    }
+
+    *offset = sizeof(ether_header) + ihlen + thlen;
+    *length = totalLen - ihlen - thlen;
+
+    return true;
+}
+
+// Translates the flag letters that follow a pattern's closing '/' into the
+// corresponding HS_FLAG_* bits, OR-ed together. An unknown letter is reported
+// on stderr and terminates the process.
+static unsigned parseFlags(const string &flagsStr) {
+    unsigned flags = 0;
+    for (size_t pos = 0; pos != flagsStr.size(); ++pos) {
+        const char c = flagsStr[pos];
+        unsigned bit = 0;
+        if (c == 'i') {
+            bit = HS_FLAG_CASELESS;
+        } else if (c == 'm') {
+            bit = HS_FLAG_MULTILINE;
+        } else if (c == 's') {
+            bit = HS_FLAG_DOTALL;
+        } else if (c == 'H') {
+            bit = HS_FLAG_SINGLEMATCH;
+        } else if (c == 'V') {
+            bit = HS_FLAG_ALLOWEMPTY;
+        } else if (c == '8') {
+            bit = HS_FLAG_UTF8;
+        } else if (c == 'W') {
+            bit = HS_FLAG_UCP;
+        } else {
+            cerr << "Unsupported flag \'" << c << "\'" << endl;
+            exit(-1);
+        }
+        flags |= bit;
+    }
+    return flags;
+}
+
+// Reads a pattern file and appends one entry per signature to each of the
+// output vectors: the pattern text between the slashes, its HS_FLAG_* bits,
+// its numeric ID and the complete original line.
+//
+// Every line that is neither empty nor a '#' comment must have the form
+// ID:/PCRE/flags, e.g. 10001:/foobar/is. Any malformed line, or a file that
+// cannot be opened, is reported on stderr and terminates the process.
+static void parseFile(const char *filename, vector<string> &patterns,
+                      vector<unsigned> &flags, vector<unsigned> &ids,
+                      vector<string> &originals) {
+    ifstream inFile(filename);
+    if (!inFile.good()) {
+        cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
+        exit(-1);
+    }
+
+    // Loop on the result of getline() rather than on eof(): this also ends
+    // the loop on a read error, where eof() would stay false forever.
+    // i is the 1-based number of the current line, counting skipped lines.
+    string line;
+    for (unsigned i = 1; getline(inFile, line); ++i) {
+        // if line is empty, or a comment, we can skip it
+        if (line.empty() || line[0] == '#') {
+            continue;
+        }
+
+        // otherwise, it should be ID:PCRE, e.g.
+        //  10001:/foobar/is
+
+        size_t colonIdx = line.find_first_of(':');
+        if (colonIdx == string::npos) {
+            cerr << "ERROR: Could not parse line " << i << endl;
+            exit(-1);
+        }
+
+        // we should have an unsigned int as an ID, before the colon. stoul
+        // throws on text that is not a number; treat that, and a value that
+        // does not fit in an unsigned, as a parse error instead of letting
+        // the exception terminate the program.
+        bool idValid = false;
+        unsigned long parsedId = 0;
+        try {
+            parsedId = std::stoul(line.substr(0, colonIdx));
+            idValid = parsedId <= static_cast<unsigned>(-1);
+        } catch (...) {
+            idValid = false;
+        }
+        if (!idValid) {
+            cerr << "ERROR: Could not parse line " << i << endl;
+            exit(-1);
+        }
+        unsigned id = static_cast<unsigned>(parsedId);
+
+        // rest of the expression is the PCRE
+        const string expr(line.substr(colonIdx + 1));
+
+        // The pattern must open with '/'; otherwise its first character
+        // would be dropped silently below.
+        if (expr.empty() || expr[0] != '/') {
+            cerr << "ERROR: no leading '/' char" << endl;
+            exit(-1);
+        }
+
+        // The closing '/' must be a different one from the opening '/'.
+        size_t flagsStart = expr.find_last_of('/');
+        if (flagsStart == string::npos || flagsStart == 0) {
+            cerr << "ERROR: no trailing '/' char" << endl;
+            exit(-1);
+        }
+
+        string pcre(expr.substr(1, flagsStart - 1));
+        string flagsStr(expr.substr(flagsStart + 1));
+        unsigned flag = parseFlags(flagsStr);
+
+        originals.push_back(line);
+        patterns.push_back(pcre);
+        flags.push_back(flag);
+        ids.push_back(id);
+    }
+}
--- a/examples/pcapscan.cc
+++ b/examples/pcapscan.cc
@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Hyperscan example program 2: pcapscan
+ *
+ * This example is a very simple packet scanning benchmark. It scans a given
+ * PCAP file full of network traffic against a group of regular expressions and
+ * returns some coarse performance measurements.  This example provides a quick
+ * way to examine the performance achievable on a particular combination of
+ * platform, pattern set and input data.
+ *
+ * Build instructions:
+ *
+ *     g++ -std=c++11 -O2 -o pcapscan pcapscan.cc $(pkg-config --cflags --libs libhs) -lpcap
+ *
+ * Usage:
+ *
+ *     ./pcapscan [-n repeats] <pattern file> <pcap file>
+ *
+ * We recommend the use of a utility like 'taskset' on multiprocessor hosts to
+ * pin execution to a single processor: this will remove processor migration
+ * by the scheduler as a source of noise in the results.
+ *
+ */
+
+#include <cstring>
+#include <chrono>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <unistd.h>
+
+// We use the BSD primitives throughout as they exist on both BSD and Linux.
+#define __FAVOR_BSD
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
+#include <net/ethernet.h>
+#include <arpa/inet.h>
+
+#include <pcap.h>
+
+#include <hs.h>
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::ifstream;
+using std::string;
+using std::unordered_map;
+using std::vector;
+
+// Identifies a stream in the pcap input by the protocol, addresses and ports
+// found in a packet's IP and transport headers.
+struct FiveTuple {
+    unsigned int protocol;
+    unsigned int srcAddr;
+    unsigned int srcPort;
+    unsigned int dstAddr;
+    unsigned int dstPort;
+
+    // Build the key from the IP header of a TCP or UDP packet. All values are
+    // stored exactly as they appear in the headers (no byte-order conversion).
+    FiveTuple(const struct ip *iphdr)
+        : protocol(iphdr->ip_p), srcAddr(iphdr->ip_src.s_addr), srcPort(0),
+          dstAddr(iphdr->ip_dst.s_addr), dstPort(0) {
+        // The transport header starts ip_hl 32-bit words after the start of
+        // the IP header. It is read through udphdr for both protocols
+        // (presumably because TCP keeps its ports at the same offsets).
+        const char *ipBytes = (const char *)iphdr;
+        const struct udphdr *transport =
+            (const struct udphdr *)(ipBytes + (iphdr->ip_hl * 4));
+        srcPort = transport->uh_sport;
+        dstPort = transport->uh_dport;
+    }
+
+    // Two keys are equal only if all five fields are equal.
+    bool operator==(const FiveTuple &a) const {
+        if (protocol != a.protocol) {
+            return false;
+        }
+        if (srcAddr != a.srcAddr || srcPort != a.srcPort) {
+            return false;
+        }
+        return dstAddr == a.dstAddr && dstPort == a.dstPort;
+    }
+};
+
+// Deliberately trivial hash for FiveTuple keys, used by the unordered_map that
+// assigns stream IDs: it simply XORs the five fields together.
+struct FiveTupleHash {
+    size_t operator()(const FiveTuple &x) const {
+        unsigned int mixed = x.srcAddr;
+        mixed ^= x.dstAddr;
+        mixed ^= x.protocol;
+        mixed ^= x.srcPort;
+        mixed ^= x.dstPort;
+        return mixed;
+    }
+};
+
+// Helper function; its definition is at the end of the file. Called once per
+// captured frame: when it returns true, the caller reads the payload's offset
+// into the frame and its length from the two out-parameters.
+static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
+                          unsigned int *length);
+
+// Match event handler, passed to the Hyperscan scan and close-stream calls and
+// invoked for every match found. It only counts matches; the match details
+// (id, from, to, flags) are ignored.
+static
+int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
+            unsigned int flags, void *ctx) {
+    // ctx is the address of the size_t match counter supplied by the caller.
+    size_t &counter = *static_cast<size_t *>(ctx);
+    ++counter;
+    return 0; // continue matching
+}
+
+// Simple stopwatch: call start(), run the work, call stop(), then read the
+// elapsed time with seconds().
+class Clock {
+public:
+    // Record the beginning of the measured interval.
+    void start() {
+        time_start = clock_type::now();
+    }
+
+    // Record the end of the measured interval.
+    void stop() {
+        time_end = clock_type::now();
+    }
+
+    // Time between the most recent start() and stop() calls, in seconds.
+    // Returns 0 if neither has been called yet.
+    double seconds() const {
+        std::chrono::duration<double> delta = time_end - time_start;
+        return delta.count();
+    }
+private:
+    // steady_clock is monotonic. system_clock follows the wall clock, so an
+    // NTP or manual time adjustment during a run could produce a negative or
+    // wildly wrong interval.
+    typedef std::chrono::steady_clock clock_type;
+
+    std::chrono::time_point<clock_type> time_start, time_end;
+};
+
+// Class wrapping all state associated with the benchmark
+class Benchmark {
+private:
+    // Packet data to be scanned.
+    vector<string> packets;
+
+    // The stream ID to which each packet belongs
+    vector<size_t> stream_ids;
+
+    // Map used to construct stream_ids
+    unordered_map<FiveTuple, size_t, FiveTupleHash> stream_map;
+
+    // Hyperscan compiled database (streaming mode)
+    const hs_database_t *db_streaming;
+
+    // Hyperscan compiled database (block mode)
+    const hs_database_t *db_block;
+
+    // Hyperscan temporary scratch space (used in both modes)
+    hs_scratch_t *scratch;
+
+    // Vector of Hyperscan stream state (used in streaming mode)
+    vector<hs_stream_t *> streams;
+
+    // Count of matches found during scanning
+    size_t matchCount;
+
+public:
+    Benchmark(const hs_database_t *streaming, const hs_database_t *block)
+        : db_streaming(streaming), db_block(block), scratch(nullptr),
+          matchCount(0) {
+        // Allocate enough scratch space to handle either streaming or block
+        // mode, so we only need the one scratch region.
+        hs_error_t err = hs_alloc_scratch(db_streaming, &scratch);
+        if (err != HS_SUCCESS) {
+            cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
+            exit(-1);
+        }
+        // This second call will increase the scratch size if more is required
+        // for block mode.
+        err = hs_alloc_scratch(db_block, &scratch);
+        if (err != HS_SUCCESS) {
+            cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
+            exit(-1);
+        }
+    }
+
+    ~Benchmark() {
+        // Free scratch region
+        hs_free_scratch(scratch);
+    }
+
+    // Read a set of streams from a pcap file
+    bool readStreams(const char *pcapFile) {
+        // Open PCAP file for input
+        char errbuf[PCAP_ERRBUF_SIZE];
+        pcap_t *pcapHandle = pcap_open_offline(pcapFile, errbuf);
+        if (pcapHandle == nullptr) {
+            cerr << "ERROR: Unable to open pcap file \"" << pcapFile
+                << "\": " << errbuf << endl;
+            return false;
+        }
+
+        struct pcap_pkthdr pktHeader;
+        const unsigned char *pktData;
+        while ((pktData = pcap_next(pcapHandle, &pktHeader)) != nullptr) {
+            unsigned int offset = 0, length = 0;
+            if (!payloadOffset(pktData, &offset, &length)) {
+                continue;
+            }
+
+            // Valid TCP or UDP packet
+            const struct ip *iphdr = (const struct ip *)(pktData
+                    + sizeof(struct ether_header));
+            const char *payload = (const char *)pktData + offset;
+
+            size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
+                                          stream_map.size())).first->second;
+
+            packets.push_back(string(payload, length));
+            stream_ids.push_back(id);
+        }
+        pcap_close(pcapHandle);
+
+        return !packets.empty();
+    }
+
+    // Return the number of bytes scanned
+    size_t bytes() const {
+        size_t sum = 0;
+        for (const auto &packet : packets) {
+            sum += packet.size();
+        }
+        return sum;
+    }
+
+    // Return the number of matches found.
+    size_t matches() const {
+        return matchCount;
+    }
+
+    // Clear the number of matches found.
+    void clearMatches() {
+        matchCount = 0;
+    }
+
+    // Open a Hyperscan stream for each stream in stream_ids
+    void openStreams() {
+        streams.resize(stream_map.size());
+        for (auto &stream : streams) {
+            hs_error_t err = hs_open_stream(db_streaming, 0, &stream);
+            if (err != HS_SUCCESS) {
+                cerr << "ERROR: Unable to open stream. Exiting." << endl;
+                exit(-1);
+            }
+        }
+    }
+
+    // Close every stream held in `streams`. Closing a stream can still
+    // produce matches (for example end-anchored ones), so the match callback
+    // and the match counter are supplied to each close call. A failure is
+    // fatal: an error is printed and the process exits.
+    void closeStreams() {
+        for (size_t idx = 0; idx != streams.size(); ++idx) {
+            const hs_error_t rc =
+                hs_close_stream(streams[idx], scratch, onMatch, &matchCount);
+            if (rc == HS_SUCCESS) {
+                continue;
+            }
+            cerr << "ERROR: Unable to close stream. Exiting." << endl;
+            exit(-1);
+        }
+    }
+
+    // Streaming-mode scan: walk the packets in the order they were stored
+    // and write each one to the stream selected by the matching entry of
+    // stream_ids. A scan failure is fatal: an error is printed and the
+    // process exits.
+    void scanStreams() {
+        for (size_t idx = 0; idx != packets.size(); ++idx) {
+            const std::string &payload = packets[idx];
+            hs_stream_t *target = streams[stream_ids[idx]];
+            const hs_error_t rc =
+                hs_scan_stream(target, payload.c_str(), payload.length(), 0,
+                               scratch, onMatch, &matchCount);
+            if (rc == HS_SUCCESS) {
+                continue;
+            }
+            cerr << "ERROR: Unable to scan packet. Exiting." << endl;
+            exit(-1);
+        }
+    }
+
+    // Block-mode scan: each packet, in the order it was stored, is scanned
+    // on its own against the block database. A scan failure is fatal: an
+    // error is printed and the process exits.
+    void scanBlock() {
+        for (const auto &payload : packets) {
+            const hs_error_t rc =
+                hs_scan(db_block, payload.c_str(), payload.length(), 0,
+                        scratch, onMatch, &matchCount);
+            if (rc == HS_SUCCESS) {
+                continue;
+            }
+            cerr << "ERROR: Unable to scan packet. Exiting." << endl;
+            exit(-1);
+        }
+    }
+
+    // Print a summary of the loaded capture (packet, stream and byte counts
+    // and the average packet/stream length), followed by the sizes that
+    // Hyperscan reports for the streaming database, the block database and
+    // the per-stream state. Size queries that fail are reported on stdout
+    // and do not abort the program.
+    void displayStats() {
+        size_t numPackets = packets.size();
+        size_t numStreams = stream_map.size();
+        size_t numBytes = bytes();
+        hs_error_t err;
+
+        // The averages are integer divisions. Report zero instead of
+        // dividing by zero when no packets or streams have been loaded.
+        size_t avgPacketLen = numPackets ? numBytes / numPackets : 0;
+        size_t avgStreamLen = numStreams ? numBytes / numStreams : 0;
+
+        cout << numPackets << " packets in " << numStreams
+             << " streams, totalling " << numBytes << " bytes." << endl;
+        cout << "Average packet length: " << avgPacketLen << " bytes."
+             << endl;
+        cout << "Average stream length: " << avgStreamLen << " bytes."
+             << endl;
+        cout << endl;
+
+        size_t dbStream_size = 0;
+        err = hs_database_size(db_streaming, &dbStream_size);
+        if (err == HS_SUCCESS) {
+            cout << "Streaming mode Hyperscan database size    : "
+                 << dbStream_size << " bytes." << endl;
+        } else {
+            cout << "Error getting streaming mode Hyperscan database size"
+                 << endl;
+        }
+
+        size_t dbBlock_size = 0;
+        err = hs_database_size(db_block, &dbBlock_size);
+        if (err == HS_SUCCESS) {
+            cout << "Block mode Hyperscan database size        : "
+                 << dbBlock_size << " bytes." << endl;
+        } else {
+            cout << "Error getting block mode Hyperscan database size"
+                 << endl;
+        }
+
+        size_t stream_size = 0;
+        err = hs_stream_size(db_streaming, &stream_size);
+        if (err == HS_SUCCESS) {
+            cout << "Streaming mode Hyperscan stream state size: "
+                 << stream_size << " bytes (per stream)." << endl;
+        } else {
+            cout << "Error getting stream state size" << endl;
+        }
+    }
+};
+
+// Pattern-file parser; the definition is at the end of this file. It is
+// declared here so that databasesFromFile() can call it. For every pattern
+// line it appends one entry to each of the three output vectors.
+static void parseFile(const char *filename, vector<string> &patterns,
+                      vector<unsigned> &flags, vector<unsigned> &ids);
+
+/**
+ * Compile @a expressions, with their parallel @a flags and @a ids, into a
+ * single Hyperscan database for the given @a mode, and report on stdout how
+ * long the compile took.
+ *
+ * On a compile failure the error (and the offending pattern, when the error
+ * refers to one) is printed to stderr and the process exits.
+ *
+ * NOTE(review): flags and ids are taken by value, so each call copies both
+ * vectors.
+ */
+static hs_database_t *buildDatabase(const vector<const char *> &expressions,
+                                    const vector<unsigned> flags,
+                                    const vector<unsigned> ids,
+                                    unsigned int mode) {
+    hs_database_t *database;
+    hs_compile_error_t *error;
+
+    // Time only the compile call itself.
+    Clock timer;
+    timer.start();
+    const hs_error_t rc =
+        hs_compile_multi(expressions.data(), flags.data(), ids.data(),
+                         expressions.size(), mode, nullptr, &database, &error);
+    timer.stop();
+
+    if (rc != HS_SUCCESS) {
+        if (error->expression >= 0) {
+            cerr << "ERROR: Pattern '" << expressions[error->expression]
+                 << "' failed compilation with error: " << error->message
+                 << endl;
+        } else {
+            // A negative index means no single expression is to blame.
+            cerr << "ERROR: " << error->message << endl;
+        }
+        // The error object is dynamically allocated and only exists on
+        // failure, so it is released here before exiting.
+        hs_free_compile_error(error);
+        exit(-1);
+    }
+
+    const char *modeName = (mode == HS_MODE_STREAM) ? "streaming" : "block";
+    cout << "Hyperscan " << modeName << " mode database compiled in "
+         << timer.seconds() << " seconds." << endl;
+
+    return database;
+}
+
+/**
+ * Read the pattern file @a filename (one expression per line; see
+ * parseFile() for the accepted syntax) and compile its patterns twice: a
+ * streaming-mode database is stored through @a db_streaming and a block-mode
+ * database through @a db_block.
+ */
+static void databasesFromFile(const char *filename,
+                              hs_database_t **db_streaming,
+                              hs_database_t **db_block) {
+    // hs_compile_multi takes three parallel arrays (patterns, flags, ids).
+    // parseFile() pushes one entry onto each vector per pattern line.
+    vector<string> patterns;
+    vector<unsigned> flags;
+    vector<unsigned> ids;
+    parseFile(filename, patterns, flags, ids);
+
+    // hs_compile_multi wants C strings. These pointers borrow the storage
+    // owned by `patterns`, which stays alive until this function returns.
+    vector<const char*> cstrPatterns;
+    for (size_t idx = 0; idx != patterns.size(); ++idx) {
+        cstrPatterns.push_back(patterns[idx].c_str());
+    }
+
+    cout << "Compiling Hyperscan databases with " << patterns.size()
+         << " patterns." << endl;
+
+    hs_database_t *streamingDb =
+        buildDatabase(cstrPatterns, flags, ids, HS_MODE_STREAM);
+    *db_streaming = streamingDb;
+
+    hs_database_t *blockDb =
+        buildDatabase(cstrPatterns, flags, ids, HS_MODE_BLOCK);
+    *db_block = blockDb;
+}
+
+// Print the command-line synopsis, using @a prog as the program name, to
+// stderr.
+static void usage(const char *prog) {
+    cerr << "Usage: " << prog;
+    cerr << " [-n repeats] <pattern file> <pcap file>" << endl;
+}
+
+// Main entry point.
+//
+// Usage: <prog> [-n repeats] <pattern file> <pcap file>
+//
+// Compiles the pattern file into streaming and block mode databases, loads
+// the packets from the PCAP file, scans them `repeats` times in each mode and
+// prints match counts and throughput figures. Any error is fatal and ends the
+// process with exit(-1).
+int main(int argc, char **argv) {
+    unsigned int repeatCount = 1;
+
+    // Process command line arguments.
+    int opt;
+    while ((opt = getopt(argc, argv, "n:")) != -1) {
+        switch (opt) {
+        case 'n': {
+            // Accept only a complete, strictly positive decimal number that
+            // survives the conversion to unsigned int. A zero count would
+            // make every throughput figure a division by zero, and a
+            // negative one would wrap around to a huge repeat count.
+            char *end = nullptr;
+            const long repeats = strtol(optarg, &end, 10);
+            repeatCount = static_cast<unsigned int>(repeats);
+            if (end == optarg || *end != '\0' || repeats <= 0 ||
+                static_cast<long>(repeatCount) != repeats) {
+                cerr << "ERROR: Invalid repeat count \"" << optarg << "\""
+                     << endl;
+                usage(argv[0]);
+                exit(-1);
+            }
+            break;
+        }
+        default:
+            usage(argv[0]);
+            exit(-1);
+        }
+    }
+
+    // Exactly two positional arguments must remain after the options.
+    if (argc - optind != 2) {
+        usage(argv[0]);
+        exit(-1);
+    }
+
+    const char *patternFile = argv[optind];
+    const char *pcapFile = argv[optind + 1];
+
+    // Read our pattern set in and build Hyperscan databases from it.
+    cout << "Pattern file: " << patternFile << endl;
+    hs_database_t *db_streaming, *db_block;
+    databasesFromFile(patternFile, &db_streaming, &db_block);
+
+    // Read our input PCAP file in
+    Benchmark bench(db_streaming, db_block);
+    cout << "PCAP input file: " << pcapFile << endl;
+    if (!bench.readStreams(pcapFile)) {
+        cerr << "Unable to read packets from PCAP file. Exiting." << endl;
+        exit(-1);
+    }
+
+    if (repeatCount != 1) {
+        cout << "Repeating PCAP scan " << repeatCount << " times." << endl;
+    }
+
+    bench.displayStats();
+
+    Clock clock;
+
+    // Streaming mode scans. Scan time and stream open/close time are
+    // accumulated separately so throughput can be shown with and without the
+    // stream management overhead.
+    double secsStreamingScan = 0.0, secsStreamingOpenClose = 0.0;
+    for (unsigned int i = 0; i < repeatCount; i++) {
+        // Open streams.
+        clock.start();
+        bench.openStreams();
+        clock.stop();
+        secsStreamingOpenClose += clock.seconds();
+
+        // Scan all our packets in streaming mode.
+        clock.start();
+        bench.scanStreams();
+        clock.stop();
+        secsStreamingScan += clock.seconds();
+
+        // Close streams.
+        clock.start();
+        bench.closeStreams();
+        clock.stop();
+        secsStreamingOpenClose += clock.seconds();
+    }
+
+    // Collect data from streaming mode scans. Throughput is in bits/second.
+    size_t bytes = bench.bytes();
+    double tputStreamScanning = (bytes * 8 * repeatCount) / secsStreamingScan;
+    double tputStreamOverhead = (bytes * 8 * repeatCount) / (secsStreamingScan + secsStreamingOpenClose);
+    size_t matchesStream = bench.matches();
+    double matchRateStream = matchesStream / ((bytes * repeatCount) / 1024.0); // matches per kilobyte
+
+    // Scan all our packets in block mode. The counter is reset first so the
+    // block mode total does not include the streaming mode matches.
+    bench.clearMatches();
+    clock.start();
+    for (unsigned int i = 0; i < repeatCount; i++) {
+        bench.scanBlock();
+    }
+    clock.stop();
+    double secsScanBlock = clock.seconds();
+
+    // Collect data from block mode scans.
+    double tputBlockScanning = (bytes * 8 * repeatCount) / secsScanBlock;
+    size_t matchesBlock = bench.matches();
+    double matchRateBlock = matchesBlock / ((bytes * repeatCount) / 1024.0); // matches per kilobyte
+
+    cout << endl << "Streaming mode:" << endl << endl;
+    cout << "  Total matches: " << matchesStream << endl;
+    cout << std::fixed << std::setprecision(4);
+    cout << "  Match rate:    " << matchRateStream << " matches/kilobyte" << endl;
+    cout << std::fixed << std::setprecision(2);
+    cout << "  Throughput (with stream overhead): "
+              << tputStreamOverhead/1000000 << " megabits/sec" << endl;
+    cout << "  Throughput (no stream overhead):   "
+              << tputStreamScanning/1000000 << " megabits/sec" << endl;
+
+    cout << endl << "Block mode:" << endl << endl;
+    cout << "  Total matches: " << matchesBlock << endl;
+    cout << std::fixed << std::setprecision(4);
+    cout << "  Match rate:    " << matchRateBlock << " matches/kilobyte" << endl;
+    cout << std::fixed << std::setprecision(2);
+    cout << "  Throughput:    "
+              << tputBlockScanning/1000000 << " megabits/sec" << endl;
+
+    cout << endl;
+    if (bytes < (2*1024*1024)) {
+        cout << endl << "WARNING: Input PCAP file is less than 2MB in size." << endl
+                  << "This test may have been too short to calculate accurate results." << endl;
+    }
+
+    // Close Hyperscan databases
+    hs_free_database(db_streaming);
+    hs_free_database(db_block);
+
+    return 0;
+}
+
+/**
+ * Helper function to locate the first byte of the TCP or UDP payload in the
+ * given ethernet frame. On success the offset of the payload from the start
+ * of the frame and the payload length are stored in @a offset and @a length
+ * and true is returned.
+ *
+ * False is returned, and the outputs are left untouched, for anything that
+ * is not an unfragmented IPv4 TCP or UDP packet with a non-empty payload.
+ * That includes packets whose header length fields are too small to be valid
+ * or whose IP total length does not cover the headers.
+ *
+ * NOTE(review): the captured length of the frame is not passed in, so this
+ * function cannot verify that the headers or the payload actually lie within
+ * the captured bytes, and the ethernet type field is not inspected. Callers
+ * should confirm both.
+ */
+static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
+                          unsigned int *length) {
+    const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
+    const tcphdr *th = nullptr;
+
+    // Ignore packets that aren't IPv4
+    if (iph->ip_v != 4) {
+        return false;
+    }
+
+    // Ignore fragmented packets.
+    if (iph->ip_off & htons(IP_MF|IP_OFFMASK)) {
+        return false;
+    }
+
+    // IP header length, and transport header length.
+    unsigned int ihlen = iph->ip_hl * 4;
+    unsigned int thlen = 0;
+
+    // A header length smaller than the fixed IPv4 header is malformed.
+    if (ihlen < sizeof(ip)) {
+        return false;
+    }
+
+    switch (iph->ip_p) {
+    case IPPROTO_TCP:
+        th = (const tcphdr *)((const char *)iph + ihlen);
+        thlen = th->th_off * 4;
+        // Likewise, the TCP data offset must cover the fixed TCP header.
+        if (thlen < sizeof(tcphdr)) {
+            return false;
+        }
+        break;
+    case IPPROTO_UDP:
+        thlen = sizeof(udphdr);
+        break;
+    default:
+        return false;
+    }
+
+    // The IP total length counts the IP header, the transport header and the
+    // payload. If it does not exceed the two headers there is no payload;
+    // subtracting anyway would wrap the unsigned length around to a huge
+    // value.
+    const unsigned int hdrlen = ihlen + thlen;
+    const unsigned int iplen = ntohs(iph->ip_len);
+    if (iplen <= hdrlen) {
+        return false;
+    }
+
+    *offset = sizeof(ether_header) + hdrlen;
+    *length = iplen - hdrlen;
+
+    return true;
+}
+
+// Translate a string of single-letter pattern flags into the corresponding
+// combination of HS_FLAG_* bits. A letter that is not in the table below is
+// reported on stderr and ends the process.
+static unsigned parseFlags(const string &flagsStr) {
+    static const struct {
+        char letter;
+        unsigned bit;
+    } known[] = {
+        {'i', HS_FLAG_CASELESS},
+        {'m', HS_FLAG_MULTILINE},
+        {'s', HS_FLAG_DOTALL},
+        {'H', HS_FLAG_SINGLEMATCH},
+        {'V', HS_FLAG_ALLOWEMPTY},
+        {'8', HS_FLAG_UTF8},
+        {'W', HS_FLAG_UCP},
+    };
+
+    unsigned result = 0;
+    for (size_t pos = 0; pos != flagsStr.size(); ++pos) {
+        const char c = flagsStr[pos];
+
+        bool recognised = false;
+        for (const auto &entry : known) {
+            if (entry.letter == c) {
+                result |= entry.bit;
+                recognised = true;
+                break;
+            }
+        }
+
+        if (!recognised) {
+            cerr << "Unsupported flag \'" << c << "\'" << endl;
+            exit(-1);
+        }
+    }
+    return result;
+}
+
+/**
+ * Parse the pattern file @a filename. Empty lines and lines starting with
+ * '#' are skipped. Every other line must have the form
+ *
+ *     ID:/PCRE/FLAGS        e.g.  10001:/foobar/is
+ *
+ * where ID is an unsigned decimal number and FLAGS is a (possibly empty) run
+ * of the flag letters understood by parseFlags(). For each such line the
+ * expression, its flags and its ID are appended to @a patterns, @a flags and
+ * @a ids respectively, so the three vectors stay parallel.
+ *
+ * Any problem (unreadable file, malformed line, bad ID, unknown flag) is
+ * reported on stderr and ends the process with exit(-1).
+ */
+static void parseFile(const char *filename, vector<string> &patterns,
+                      vector<unsigned> &flags, vector<unsigned> &ids) {
+    ifstream inFile(filename);
+    if (!inFile.good()) {
+        cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
+        exit(-1);
+    }
+
+    // Loop on the result of getline() rather than on eof(): this also stops
+    // on a read error instead of spinning forever. i is the 1-based line
+    // number and counts skipped lines too.
+    string line;
+    for (unsigned i = 1; getline(inFile, line); ++i) {
+        // if line is empty, or a comment, we can skip it
+        if (line.empty() || line[0] == '#') {
+            continue;
+        }
+
+        size_t colonIdx = line.find_first_of(':');
+        if (colonIdx == string::npos) {
+            cerr << "ERROR: Could not parse line " << i << endl;
+            exit(-1);
+        }
+
+        // The ID before the colon must be a plain unsigned decimal number.
+        // It is converted by hand so that an empty field, stray characters,
+        // a sign, or a value too large for unsigned are all rejected with a
+        // clear error.
+        const string idStr(line.substr(0, colonIdx));
+        const unsigned maxId = ~0U;
+        unsigned id = 0;
+        bool idOk = !idStr.empty();
+        for (size_t j = 0; idOk && j != idStr.size(); ++j) {
+            const char c = idStr[j];
+            if (c < '0' || c > '9') {
+                idOk = false;
+                break;
+            }
+            const unsigned digit = c - '0';
+            if (id > (maxId - digit) / 10) {
+                idOk = false; // id * 10 + digit would overflow
+            } else {
+                id = id * 10 + digit;
+            }
+        }
+        if (!idOk) {
+            cerr << "ERROR: Invalid pattern ID on line " << i << endl;
+            exit(-1);
+        }
+
+        // rest of the line is the /PCRE/FLAGS expression
+        const string expr(line.substr(colonIdx + 1));
+
+        if (expr.empty() || expr[0] != '/') {
+            cerr << "ERROR: no leading '/' char on line " << i << endl;
+            exit(-1);
+        }
+
+        // The closing delimiter is the last '/', and it must not be the
+        // opening one.
+        size_t flagsStart = expr.find_last_of('/');
+        if (flagsStart == string::npos || flagsStart == 0) {
+            cerr << "ERROR: no trailing '/' char on line " << i << endl;
+            exit(-1);
+        }
+
+        string pcre(expr.substr(1, flagsStart - 1));
+        string flagsStr(expr.substr(flagsStart + 1));
+        unsigned flag = parseFlags(flagsStr);
+
+        patterns.push_back(pcre);
+        flags.push_back(flag);
+        ids.push_back(id);
+    }
+}
+
--- a/examples/simplegrep.c
+++ b/examples/simplegrep.c
@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Hyperscan example program 1: simplegrep
+ *
+ * This is a simple example of Hyperscan's most basic functionality: it will
+ * search a given input file for a pattern supplied as a command-line argument.
+ * It is intended to demonstrate correct usage of the hs_compile and hs_scan
+ * functions of Hyperscan.
+ *
+ * Patterns are scanned in 'DOTALL' mode, which is equivalent to PCRE's '/s'
+ * modifier. This behaviour can be changed by modifying the "flags" argument to
+ * hs_compile.
+ *
+ * Build instructions:
+ *
+ *     gcc -o simplegrep simplegrep.c $(pkg-config --cflags --libs libhs)
+ *
+ * Usage:
+ *
+ *     ./simplegrep <pattern> <input file>
+ *
+ * Example:
+ *
+ *     ./simplegrep int simplegrep.c
+ *
+ */
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <hs.h>
+
+/**
+ * Match callback: invoked once for every match that is reported. @a ctx is
+ * the application-defined context pointer; this example passes the pattern
+ * string through it so that it can be printed together with the end offset
+ * (@a to) of the match. The other arguments are not used here.
+ *
+ * Always returns 0.
+ */
+static int eventHandler(unsigned int id, unsigned long long from,
+                        unsigned long long to, unsigned int flags, void *ctx) {
+    const char *pattern = (const char *)ctx;
+    printf("Match for pattern \"%s\" at offset %llu\n", pattern, to);
+    return 0;
+}
+
+/**
+ * Read the contents of the file @a inputFN into a newly allocated buffer.
+ *
+ * On success the buffer is returned and the number of bytes stored in it is
+ * written to @a length; the caller releases the buffer with free(). Input
+ * larger than UINT_MAX bytes is clipped to UINT_MAX with a warning.
+ *
+ * Returns NULL, after printing a message to stderr, if the file cannot be
+ * opened, sized or read, if it is empty, or if memory cannot be allocated.
+ */
+static char *readInputData(const char *inputFN, unsigned int *length) {
+    /* Binary mode: the bytes are scanned as-is, and the size obtained from
+     * ftell() must agree with what fread() can deliver. */
+    FILE *f = fopen(inputFN, "rb");
+    if (!f) {
+        fprintf(stderr, "ERROR: unable to open file \"%s\": %s\n", inputFN,
+                strerror(errno));
+        return NULL;
+    }
+
+    /* We use fseek/ftell to get our data length, in order to keep this example
+     * code as portable as possible. */
+    if (fseek(f, 0, SEEK_END) != 0) {
+        fprintf(stderr, "ERROR: unable to seek file \"%s\": %s\n", inputFN,
+                strerror(errno));
+        fclose(f);
+        return NULL;
+    }
+    long dataLen = ftell(f);
+    if (dataLen < 0) {
+        fprintf(stderr, "ERROR: ftell() failed: %s\n", strerror(errno));
+        fclose(f);
+        return NULL;
+    }
+    if (fseek(f, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "ERROR: unable to seek file \"%s\": %s\n", inputFN,
+                strerror(errno));
+        fclose(f);
+        return NULL;
+    }
+
+    /* Hyperscan's hs_scan function accepts length as an unsigned int, so we
+     * limit the size of our buffer appropriately. */
+    if ((unsigned long)dataLen > UINT_MAX) {
+        dataLen = UINT_MAX;
+        printf("WARNING: clipping data to %ld bytes\n", dataLen);
+    } else if (dataLen == 0) {
+        fprintf(stderr, "ERROR: input file \"%s\" is empty\n", inputFN);
+        fclose(f);
+        return NULL;
+    }
+
+    char *inputData = malloc(dataLen);
+    if (!inputData) {
+        fprintf(stderr, "ERROR: unable to malloc %ld bytes\n", dataLen);
+        fclose(f);
+        return NULL;
+    }
+
+    char *p = inputData;
+    size_t bytesLeft = dataLen;
+    while (bytesLeft) {
+        size_t bytesRead = fread(p, 1, bytesLeft, f);
+        bytesLeft -= bytesRead;
+        p += bytesRead;
+        if (ferror(f) != 0) {
+            fprintf(stderr, "ERROR: fread() failed\n");
+            free(inputData);
+            fclose(f);
+            return NULL;
+        }
+        /* End of file before the expected size (for example the file shrank
+         * after it was measured): stop instead of looping forever on reads
+         * that return nothing. */
+        if (bytesRead == 0) {
+            break;
+        }
+    }
+
+    fclose(f);
+
+    /* Report only what was actually read. */
+    size_t total = (size_t)dataLen - bytesLeft;
+    if (total == 0) {
+        fprintf(stderr, "ERROR: input file \"%s\" is empty\n", inputFN);
+        free(inputData);
+        return NULL;
+    }
+
+    *length = (unsigned int)total;
+    return inputData;
+}
+
+/*
+ * Entry point: compile the pattern given as argv[1], read the file named by
+ * argv[2] into memory, scan it once in block mode and print every match.
+ * Returns 0 on success and -1 on any failure.
+ */
+int main(int argc, char *argv[]) {
+    if (argc != 3) {
+        fprintf(stderr, "Usage: %s <pattern> <input file>\n", argv[0]);
+        return -1;
+    }
+
+    char *pattern = argv[1];
+    char *inputFN = argv[2];
+
+    int status = -1;
+    hs_database_t *database;
+    hs_compile_error_t *compile_err;
+    hs_scratch_t *scratch = NULL;
+    char *inputData = NULL;
+    unsigned int length = 0;
+
+    /* Step 1: compile the command-line pattern. 'DOTALL' semantics are
+     * requested, so the '.' meta-character also matches newlines. The result
+     * is either a compiled database or an error object with a message that
+     * explains why the pattern was rejected.
+     */
+    if (hs_compile(pattern, HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, &database,
+                   &compile_err) != HS_SUCCESS) {
+        fprintf(stderr, "ERROR: Unable to compile pattern \"%s\": %s\n",
+                pattern, compile_err->message);
+        hs_free_compile_error(compile_err);
+        return -1;
+    }
+
+    /* Step 2: load the input file into a buffer. */
+    inputData = readInputData(inputFN, &length);
+    if (!inputData) {
+        goto release_database;
+    }
+
+    /* Step 3: allocate the scratch space that hs_scan needs. Typical
+     * programs keep one scratch region around and reuse it for many scans;
+     * since only one scan happens here it is allocated right before the scan
+     * and freed right after it.
+     */
+    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
+        fprintf(stderr, "ERROR: Unable to allocate scratch space. Exiting.\n");
+        goto release_input;
+    }
+
+    printf("Scanning %u bytes with Hyperscan\n", length);
+
+    /* Step 4: scan. eventHandler (defined in this file) is called for each
+     * match. Although this looks like an asynchronous API, the scan is
+     * synchronous: all matches are found and all callbacks have run *before*
+     * hs_scan returns. The pattern is handed over as the context pointer so
+     * the callback can print which pattern matched.
+     */
+    if (hs_scan(database, inputData, length, 0, scratch, eventHandler,
+                pattern) != HS_SUCCESS) {
+        fprintf(stderr, "ERROR: Unable to scan input buffer. Exiting.\n");
+    } else {
+        status = 0;
+    }
+
+    /* Scanning is over, successfully or not: release everything in the
+     * reverse order of acquisition.
+     */
+    hs_free_scratch(scratch);
+release_input:
+    free(inputData);
+release_database:
+    hs_free_database(database);
+    return status;
+}
--- a/include/boost-patched/graph/dominator_tree.hpp
+++ b/include/boost-patched/graph/dominator_tree.hpp
@ -0,0 +1,501 @@
+//=======================================================================
+// Copyright (C) 2005-2009 Jongsoo Park <jongsoo.park -at- gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+//=======================================================================
+
+#ifndef BOOST_GRAPH_DOMINATOR_HPP
+#define BOOST_GRAPH_DOMINATOR_HPP
+
+#include <boost/config.hpp>
+#include <deque>
+#include <set>
+#include <boost/graph/depth_first_search.hpp>
+#include <boost/concept/assert.hpp>
+
+// Dominator tree computation
+
+// NOTE: This file contains modifications from the distributed Boost version to
+// correctly support supplying a vertex index map to the algorithm. To
+// differentiate it, it has been moved into the boost_ue2 namespace.
+
+namespace boost_ue2 {
+
+  using namespace boost;
+
+  namespace detail {
+    /**
+     * Visitor that wraps a time_stamper and additionally stores each visited
+     * vertex in a caller-owned vector, at the position given by the wrapped
+     * stamper's current time value.
+     */
+    template<class TimeMap, class VertexVector, class TimeT, class Tag>
+    class time_stamper_with_vertex_vector
+      : public base_visitor<
+      time_stamper_with_vertex_vector<TimeMap, VertexVector, TimeT, Tag> >
+    {
+    public :
+      typedef Tag event_filter;
+
+      time_stamper_with_vertex_vector(TimeMap timeMap, VertexVector& v,
+                                      TimeT& t)
+        : stamper_(timeMap, t), byTime_(v) { }
+
+      template<class Graph>
+      void
+      operator()(const typename property_traits<TimeMap>::key_type& v,
+                 const Graph& g)
+      {
+        // Delegate to the wrapped stamper first, then file the vertex under
+        // the time value the stamper now holds.
+        stamper_(v, g);
+        byTime_[stamper_.m_time] = v;
+      }
+
+    private :
+      // Wrapped stamper; constructed from the time map and time counter.
+      time_stamper<TimeMap, TimeT, Tag> stamper_;
+      // Caller-owned vector, held by reference, indexed by time value.
+      VertexVector& byTime_;
+    };
+
+    /**
+     * Factory helper: builds a time_stamper_with_vertex_vector with its
+     * template arguments deduced from the call. The Tag argument is used only
+     * for its type.
+     */
+    template<class TimeMap, class VertexVector, class TimeT, class Tag>
+    time_stamper_with_vertex_vector<TimeMap, VertexVector, TimeT, Tag>
+    stamp_times_with_vertex_vector(TimeMap timeMap, VertexVector& v, TimeT& t,
+                                   Tag)
+    {
+      typedef time_stamper_with_vertex_vector<TimeMap, VertexVector, TimeT,
+                                              Tag> stamper_type;
+      return stamper_type(timeMap, v, t);
+    }
+
+    /**
+     * Per-vertex worker for the dominator tree computation. It owns the
+     * working arrays (semidominator, forest ancestor, best ancestor, deferred
+     * "same dominator" link and per-vertex buckets) and exposes them through
+     * property maps that are all indexed via the supplied vertex index map.
+     *
+     * Calling the visitor on a vertex n computes n's semidominator, links n
+     * into the forest under its parent, and then resolves (or defers via
+     * samedomMap) the dominators of the vertices waiting in the parent's
+     * bucket.
+     */
+    template<class Graph, class IndexMap, class TimeMap, class PredMap,
+             class DomTreePredMap>
+    class dominator_visitor
+    {
+      typedef typename graph_traits<Graph>::vertex_descriptor Vertex;
+      typedef typename graph_traits<Graph>::vertices_size_type VerticesSizeType;
+
+    public :
+      /**
+       * @param g [in] the target graph of the dominator tree
+       * @param entry [in] the entry node of g
+       * @param indexMap [in] the vertex index map for g
+       * @param domTreePredMap [out] the immediate dominator map
+       *                             (parent map in dominator tree)
+       *
+       * @note @a entry is stored by reference (see entry_), so the referenced
+       *       vertex must outlive the visitor.
+       */
+      dominator_visitor(const Graph& g, const Vertex& entry,
+                        const IndexMap& indexMap,
+                        DomTreePredMap domTreePredMap)
+        : semi_(num_vertices(g)),
+          ancestor_(num_vertices(g), graph_traits<Graph>::null_vertex()),
+          // samedom_ and best_ start out as copies of ancestor_ and semi_
+          samedom_(ancestor_),
+          best_(semi_),
+          semiMap_(make_iterator_property_map(semi_.begin(),
+                                              indexMap)),
+          ancestorMap_(make_iterator_property_map(ancestor_.begin(),
+                                                  indexMap)),
+          bestMap_(make_iterator_property_map(best_.begin(),
+                                              indexMap)),
+          buckets_(num_vertices(g)),
+          bucketMap_(make_iterator_property_map(buckets_.begin(),
+                                                indexMap)),
+          entry_(entry),
+          domTreePredMap_(domTreePredMap),
+          numOfVertices_(num_vertices(g)),
+          samedomMap(make_iterator_property_map(samedom_.begin(),
+                                                indexMap))
+      {
+      }
+
+      /**
+       * Process vertex @a n; does nothing when n is the entry vertex.
+       *
+       * @param n [in] the vertex to process
+       * @param dfnumMap [in] DFS number of each vertex
+       * @param parentMap [in] parent of each vertex in the DFS spanning tree
+       * @param g [in] the graph
+       */
+      void
+      operator()(const Vertex& n, const TimeMap& dfnumMap,
+                 const PredMap& parentMap, const Graph& g)
+      {
+        if (n == entry_) return;
+
+        const Vertex p(get(parentMap, n));
+        // s is the running best semidominator candidate; the spanning-tree
+        // parent is the initial candidate.
+        Vertex s(p);
+
+        // 1. Calculate the semidominator of n,
+        // based on the semidominator thm.
+        // * Semidominator thm. : To find the semidominator of a node n,
+        //   consider all predecessors v of n in the CFG (Control Flow Graph).
+        //  - If v is a proper ancestor of n in the spanning tree
+        //    (so dfnum(v) < dfnum(n)), then v is a candidate for semi(n)
+        //  - If v is a non-ancestor of n (so dfnum(v) > dfnum(n))
+        //    then for each u that is an ancestor of v (or u = v),
+        //    Let semi(u) be a candidate for semi(n)
+        //   of all these candidates, the one with lowest dfnum is
+        //   the semidominator of n.
+
+        // For each predecessor of n
+        typename graph_traits<Graph>::in_edge_iterator inItr, inEnd;
+        for (boost::tie(inItr, inEnd) = in_edges(n, g); inItr != inEnd; ++inItr)
+          {
+            const Vertex v = source(*inItr, g);
+            // To deal with unreachable nodes
+            if (get(dfnumMap, v) < 0 || get(dfnumMap, v) >= numOfVertices_)
+              continue;
+
+            Vertex s2;
+            if (get(dfnumMap, v) <= get(dfnumMap, n))
+              s2 = v;
+            else
+              s2 = get(semiMap_, ancestor_with_lowest_semi_(v, dfnumMap));
+
+            if (get(dfnumMap, s2) < get(dfnumMap, s))
+              s = s2;
+          }
+        put(semiMap_, n, s);
+
+        // 2. Calculation of n's dominator is deferred until
+        // the path from s to n has been linked into the forest
+        get(bucketMap_, s).push_back(n);
+        get(ancestorMap_, n) = p;
+        get(bestMap_, n) = n;
+
+        // 3. Now that the path from p to v has been linked into
+        // the spanning forest, these lines calculate the dominator of v,
+        // based on the dominator thm., or else defer the calculation
+        // until y's dominator is known
+        // * Dominator thm. : On the spanning-tree path below semi(n) and
+        //   above or including n, let y be the node
+        //   with the smallest-numbered semidominator. Then,
+        //
+        //  idom(n) = semi(n) if semi(y)=semi(n) or
+        //            idom(y) if semi(y) != semi(n)
+        typename std::deque<Vertex>::iterator buckItr;
+        for (buckItr = get(bucketMap_, p).begin();
+             buckItr != get(bucketMap_, p).end();
+             ++buckItr)
+          {
+            const Vertex v(*buckItr);
+            const Vertex y(ancestor_with_lowest_semi_(v, dfnumMap));
+            if (get(semiMap_, y) == get(semiMap_, v))
+              put(domTreePredMap_, v, p);
+            else
+              put(samedomMap, v, y);
+          }
+
+        // Every vertex that was waiting on p has now been handled.
+        get(bucketMap_, p).clear();
+      }
+
+    protected :
+
+      /**
+       * Evaluate function in Tarjan's path compression
+       *
+       * Returns the entry of bestMap_ for @a v after (recursively) updating
+       * the ancestor and best entries along v's ancestor chain.
+       */
+      const Vertex
+      ancestor_with_lowest_semi_(const Vertex& v, const TimeMap& dfnumMap)
+      {
+        const Vertex a(get(ancestorMap_, v));
+
+        if (get(ancestorMap_, a) != graph_traits<Graph>::null_vertex())
+          {
+            const Vertex b(ancestor_with_lowest_semi_(a, dfnumMap));
+
+            // Compress the path: v now points at its ancestor's ancestor.
+            put(ancestorMap_, v, get(ancestorMap_, a));
+
+            if (get(dfnumMap, get(semiMap_, b)) <
+                get(dfnumMap, get(semiMap_, get(bestMap_, v))))
+              put(bestMap_, v, b);
+          }
+
+        return get(bestMap_, v);
+      }
+
+      // Backing storage for the property maps below, one slot per vertex.
+      std::vector<Vertex> semi_, ancestor_, samedom_, best_;
+      PredMap semiMap_, ancestorMap_, bestMap_;
+      std::vector< std::deque<Vertex> > buckets_;
+
+      iterator_property_map<typename std::vector<std::deque<Vertex> >::iterator,
+                            IndexMap> bucketMap_;
+
+      // Reference to the caller's entry vertex (not a copy).
+      const Vertex& entry_;
+      DomTreePredMap domTreePredMap_;
+      const VerticesSizeType numOfVertices_;
+
+    public :
+
+      // Deferred links written in step 3 of operator(); public so that the
+      // driver function can read them after all vertices have been visited.
+      PredMap samedomMap;
+    };
+
+  } // namespace detail
+
+  /**
+   * @brief Build dominator tree using Lengauer-Tarjan algorithm.
+   *                It takes O((V+E)log(V+E)) time.
+   *
+   * @pre dfnumMap, parentMap and verticesByDFNum hold DFS results
+   *      corresponding to indexMap.
+   *      If dfs has already run before,
+   *      this function would be good for saving computations.
+   * @pre Unreachable nodes must be masked as
+   *      graph_traits<Graph>::null_vertex in parentMap.
+   * @pre Unreachable nodes must be masked as
+   *      (std::numeric_limits<VerticesSizeType>::max)() in dfnumMap.
+   *
+   * @param g [in] the graph; must model BidirectionalGraphConcept
+   * @param entry [in] the entry node of g
+   * @param indexMap [in] the vertex index map for g
+   * @param dfnumMap [in] DFS number of each vertex
+   * @param parentMap [in] parent of each vertex in the DFS spanning tree
+   * @param verticesByDFNum [in] vertices indexed by DFS number; entries equal
+   *        to null_vertex are skipped
+   * @param domTreePredMap [out] : immediate dominator map (parent map
+   * in dom. tree)
+   *
+   * @note reference Appel. p. 452~453. algorithm 19.9, 19.10.
+   *
+   * @todo : Optimization in Finding Dominators in Practice, Loukas Georgiadis
+   */
+  template<class Graph, class IndexMap, class TimeMap, class PredMap,
+           class VertexVector, class DomTreePredMap>
+  void
+  lengauer_tarjan_dominator_tree_without_dfs
+    (const Graph& g,
+     const typename graph_traits<Graph>::vertex_descriptor& entry,
+     const IndexMap& indexMap,
+     TimeMap dfnumMap, PredMap parentMap, VertexVector& verticesByDFNum,
+     DomTreePredMap domTreePredMap)
+  {
+    // Typedefs and concept check
+    typedef typename graph_traits<Graph>::vertex_descriptor Vertex;
+    typedef typename graph_traits<Graph>::vertices_size_type VerticesSizeType;
+
+    BOOST_CONCEPT_ASSERT(( BidirectionalGraphConcept<Graph> ));
+
+    // Nothing to do for an empty graph.
+    const VerticesSizeType numOfVertices = num_vertices(g);
+    if (numOfVertices == 0) return;
+
+    // 1. Visit each vertex in reverse post order and calculate sdom.
+    detail::dominator_visitor<Graph, IndexMap, TimeMap, PredMap, DomTreePredMap>
+      visitor(g, entry, indexMap, domTreePredMap);
+
+    // Walk verticesByDFNum from its last entry down to its first.
+    VerticesSizeType i;
+    for (i = 0; i < numOfVertices; ++i)
+      {
+        const Vertex u(verticesByDFNum[numOfVertices - 1 - i]);
+        if (u != graph_traits<Graph>::null_vertex())
+          visitor(u, dfnumMap, parentMap, g);
+      }
+
+    // 2. Now all the deferred dominator calculations,
+    // based on the second clause of the dominator thm., are performed
+    for (i = 0; i < numOfVertices; ++i)
+      {
+        const Vertex n(verticesByDFNum[i]);
+
+        if (n == entry || n == graph_traits<Graph>::null_vertex())
+          continue;
+
+        // A non-null samedom entry means n shares its immediate dominator
+        // with u, so copy u's entry.
+        Vertex u = get(visitor.samedomMap, n);
+        if (u != graph_traits<Graph>::null_vertex())
+          {
+            put(domTreePredMap, n, get(domTreePredMap, u));
+          }
+      }
+  }
+
+  /**
+   * Same computation as lengauer_tarjan_dominator_tree_without_dfs, except
+   * that the depth first visit is performed here and its results are written
+   * to dfnumMap, parentMap and verticesByDFNum.
+   *
+   * If the dfs results are needed after this algorithm has run, calling this
+   * overload removes the need to run the dfs again.
+   */
+  template<class Graph, class IndexMap, class TimeMap, class PredMap,
+           class VertexVector, class DomTreePredMap>
+  void
+  lengauer_tarjan_dominator_tree
+    (const Graph& g,
+     const typename graph_traits<Graph>::vertex_descriptor& entry,
+     const IndexMap& indexMap,
+     TimeMap dfnumMap, PredMap parentMap, VertexVector& verticesByDFNum,
+     DomTreePredMap domTreePredMap)
+  {
+    typedef typename graph_traits<Graph>::vertices_size_type VerticesSizeType;
+    typedef std::vector<default_color_type> ColorVector;
+    typedef iterator_property_map<ColorVector::iterator, IndexMap> ColorMap;
+
+    BOOST_CONCEPT_ASSERT(( BidirectionalGraphConcept<Graph> ));
+
+    const VerticesSizeType numOfVertices = num_vertices(g);
+    if (numOfVertices == 0) return;
+
+    // Step 1: depth first visit from entry, recording tree predecessors and
+    // discovery stamps.
+    ColorVector colorStorage(numOfVertices,
+                             color_traits<default_color_type>::white());
+    ColorMap colorMap(colorStorage.begin(), indexMap);
+
+    // The clock starts at the largest representable value.
+    // NOTE(review): presumably the stamper increments before stamping so the
+    // first stamp wraps to 0 -- confirm in
+    // detail::stamp_times_with_vertex_vector.
+    VerticesSizeType dfsClock =
+      (std::numeric_limits<VerticesSizeType>::max)();
+
+    depth_first_visit
+      (g, entry,
+       make_dfs_visitor
+         (make_pair(record_predecessors(parentMap, on_tree_edge()),
+                    detail::stamp_times_with_vertex_vector
+                      (dfnumMap, verticesByDFNum, dfsClock,
+                       on_discover_vertex()))),
+       colorMap);
+
+    // Step 2: run the main algorithm on the dfs results.
+    lengauer_tarjan_dominator_tree_without_dfs(g, entry, indexMap, dfnumMap,
+                                               parentMap, verticesByDFNum,
+                                               domTreePredMap);
+  }
+
+  /**
+   * Convenience overload: uses the graph's vertex_index property as IndexMap
+   * and creates dfnumMap, parentMap and verticesByDFNum internally.
+   * Prefer it when the dfs results (dfnumMap, parentMap, verticesByDFNum)
+   * are not needed by the caller.
+   */
+  template<class Graph, class DomTreePredMap>
+  void
+  lengauer_tarjan_dominator_tree
+    (const Graph& g,
+     const typename graph_traits<Graph>::vertex_descriptor& entry,
+     DomTreePredMap domTreePredMap)
+  {
+    typedef graph_traits<Graph> Traits;
+    typedef typename Traits::vertex_descriptor Vertex;
+    typedef typename Traits::vertices_size_type VerticesSizeType;
+    typedef typename property_map<Graph, vertex_index_t>::const_type IndexMap;
+    typedef
+      iterator_property_map<typename std::vector<VerticesSizeType>::iterator,
+                            IndexMap> TimeMap;
+    typedef
+      iterator_property_map<typename std::vector<Vertex>::iterator, IndexMap>
+      PredMap;
+
+    const VerticesSizeType numOfVertices = num_vertices(g);
+    if (numOfVertices == 0) return;
+
+    const IndexMap indexMap = get(vertex_index, g);
+    const Vertex nullVertex = Traits::null_vertex();
+
+    // Backing storage for the dfs results: discovery numbers start at 0,
+    // parents and the by-number vertex list start out as the null vertex.
+    std::vector<VerticesSizeType> dfnum(numOfVertices, 0);
+    std::vector<Vertex> parent(numOfVertices, nullVertex);
+    std::vector<Vertex> verticesByDFNum(numOfVertices, nullVertex);
+
+    // Property-map views over that storage.
+    TimeMap dfnumMap(dfnum.begin(), indexMap);
+    PredMap parentMap(parent.begin(), indexMap);
+
+    // Delegate to the overload that runs the dfs and the main algorithm.
+    lengauer_tarjan_dominator_tree(g, entry,
+                                   indexMap, dfnumMap, parentMap,
+                                   verticesByDFNum, domTreePredMap);
+  }
+
+  /**
+   * @brief Build the dominator tree by iterating dominator sets to a fixed
+   *        point.
+   *
+   * Muchnick. p. 182, 184
+   *
+   * using iterative bit vector analysis
+   *
+   * Every vertex other than entry starts with the full vertex set as its
+   * dominator set. Each set is then repeatedly replaced by the intersection
+   * of its predecessors' sets plus the vertex itself, until a full sweep
+   * changes nothing. Afterwards each set is stripped of the vertex itself
+   * and of every member that also belongs to another member's set; a vertex
+   * whose set is left with exactly one element has that element written to
+   * domTreePredMap. Other vertices (and entry) are not written.
+   *
+   * @param g graph to analyse; must model BidirectionalGraphConcept
+   * @param entry vertex whose dominator set is fixed to { entry }
+   * @param indexMap vertex-to-index map used to address a vector holding
+   *        num_vertices(g) dominator sets
+   * @param domTreePredMap [out] : immediate dominator map (parent map
+   *        in dom. tree)
+   */
+  template<class Graph, class IndexMap, class DomTreePredMap>
+  void
+  iterative_bit_vector_dominator_tree
+    (const Graph& g,
+     const typename graph_traits<Graph>::vertex_descriptor& entry,
+     const IndexMap& indexMap,
+     DomTreePredMap domTreePredMap)
+  {
+    typedef typename graph_traits<Graph>::vertex_descriptor Vertex;
+    typedef typename graph_traits<Graph>::vertex_iterator vertexItr;
+    typedef typename graph_traits<Graph>::vertices_size_type VerticesSizeType;
+    typedef
+      iterator_property_map<typename std::vector< std::set<Vertex> >::iterator,
+                            IndexMap> vertexSetMap;
+
+    BOOST_CONCEPT_ASSERT(( BidirectionalGraphConcept<Graph> ));
+
+    // 1. Finding dominator
+    // 1.1. Initialize
+    const VerticesSizeType numOfVertices = num_vertices(g);
+    if (numOfVertices == 0) return;
+
+    vertexItr vi, viend;
+    boost::tie(vi, viend) = vertices(g);
+    const std::set<Vertex> N(vi, viend);
+
+    bool change = true;
+
+    std::vector< std::set<Vertex> > dom(numOfVertices, N);
+    vertexSetMap domMap(make_iterator_property_map(dom.begin(), indexMap));
+    get(domMap, entry).clear();
+    get(domMap, entry).insert(entry);
+
+    // 1.2. Sweep over all vertices until no dominator set changes
+    while (change)
+      {
+        change = false;
+        for (boost::tie(vi, viend) = vertices(g); vi != viend; ++vi)
+          {
+            if (*vi == entry) continue;
+
+            // T = intersection of the dominator sets of all predecessors
+            // (the full vertex set when there are no in-edges)
+            std::set<Vertex> T(N);
+
+            typename graph_traits<Graph>::in_edge_iterator inItr, inEnd;
+            for (boost::tie(inItr, inEnd) = in_edges(*vi, g); inItr != inEnd; ++inItr)
+              {
+                const Vertex p = source(*inItr, g);
+
+                std::set<Vertex> tempSet;
+                std::set_intersection(T.begin(), T.end(),
+                                      get(domMap, p).begin(),
+                                      get(domMap, p).end(),
+                                      std::inserter(tempSet, tempSet.begin()));
+                T.swap(tempSet);
+              }
+
+            T.insert(*vi);
+            if (T != get(domMap, *vi))
+              {
+                change = true;
+                get(domMap, *vi).swap(T);
+              }
+          } // end of for (boost::tie(vi, viend) = vertices(g)
+      } // end of while(change)
+
+    // 2. Build dominator tree
+    // 2.1. Drop each vertex from its own set
+    for (boost::tie(vi, viend) = vertices(g); vi != viend; ++vi)
+      get(domMap, *vi).erase(*vi);
+
+    // 2.2. From the set of *vi, remove every member that also appears in the
+    // set of another member s
+    for (boost::tie(vi, viend) = vertices(g); vi != viend; ++vi)
+      {
+        if (*vi == entry) continue;
+
+        std::set<Vertex>& candidates = get(domMap, *vi);
+
+        // We have to iterate through copied dominator set
+        const std::set<Vertex> tempSet(candidates);
+        typename std::set<Vertex>::const_iterator s;
+        for (s = tempSet.begin(); s != tempSet.end(); ++s)
+          {
+            // *s is never *vi (removed in 2.1), so this is a different set
+            // from the one being erased from.
+            const std::set<Vertex>& domOfS = get(domMap, *s);
+
+            typename std::set<Vertex>::iterator t;
+            for (t = candidates.begin(); t != candidates.end(); )
+              {
+                typename std::set<Vertex>::iterator old_t = t;
+                ++t; // Done early because t may become invalid
+                if (*old_t == *s) continue;
+                if (domOfS.find(*old_t) != domOfS.end())
+                  candidates.erase(old_t);
+              }
+          }
+      }
+
+    // 2.3. A set reduced to a single element names the immediate dominator
+    for (boost::tie(vi, viend) = vertices(g); vi != viend; ++vi)
+      {
+        if (*vi != entry && get(domMap, *vi).size() == 1)
+          {
+            Vertex temp = *get(domMap, *vi).begin();
+            put(domTreePredMap, *vi, temp);
+          }
+      }
+  }
+
+  /**
+   * Convenience overload of iterative_bit_vector_dominator_tree that uses
+   * the graph's vertex_index property as the index map.
+   */
+  template<class Graph, class DomTreePredMap>
+  void
+  iterative_bit_vector_dominator_tree
+    (const Graph& g,
+     const typename graph_traits<Graph>::vertex_descriptor& entry,
+     DomTreePredMap domTreePredMap)
+  {
+    typedef typename property_map<Graph, vertex_index_t>::const_type IndexMap;
+
+    const IndexMap indexMap(get(vertex_index, g));
+    iterative_bit_vector_dominator_tree(g, entry, indexMap, domTreePredMap);
+  }
+} // namespace boost
+
+#endif // BOOST_GRAPH_DOMINATOR_HPP
--- a/libhs.pc.in
+++ b/libhs.pc.in
@ -0,0 +1,10 @@
+# pkg-config template for libhs; the @VAR@ placeholders are filled in when the
+# file is configured. All directories are derived from ${prefix} so that the
+# location can be overridden in one place (e.g. --define-variable=prefix=...).
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: libhs
+Description: Intel(R) Hyperscan Library
+Version: @HS_VERSION@
+Libs: -L${libdir} -lhs
+Cflags: -I${includedir}/hs
--- a/src/alloc.c
+++ b/src/alloc.c
@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Runtime functions for setting custom allocators.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "allocator.h"
+
+#define default_malloc malloc /* used whenever no allocator is supplied */
+#define default_free free     /* used whenever no free function is supplied */
+/* Allocation hooks currently in effect, one per category. Each starts out as
+ * default_malloc and is replaced by the matching hs_set_*_allocator() call. */
+hs_alloc_t hs_database_alloc = default_malloc;
+hs_alloc_t hs_misc_alloc = default_malloc;
+hs_alloc_t hs_scratch_alloc = default_malloc;
+hs_alloc_t hs_stream_alloc = default_malloc;
+/* Free hooks paired with the allocation hooks above; default_free initially. */
+hs_free_t hs_database_free = default_free;
+hs_free_t hs_misc_free = default_free;
+hs_free_t hs_scratch_free = default_free;
+hs_free_t hs_stream_free = default_free;
+
+/** \brief Substitute the default allocator for a NULL one.
+ *
+ * Returns \p a unchanged when it is non-NULL, default_malloc otherwise. */
+static
+hs_alloc_t normalise_alloc(hs_alloc_t a) {
+    return a ? a : default_malloc;
+}
+
+/** \brief Substitute the default free function for a NULL one.
+ *
+ * Returns \p f unchanged when it is non-NULL, default_free otherwise. */
+static
+hs_free_t normalise_free(hs_free_t f) {
+    return f ? f : default_free;
+}
+
+/** \brief Install one allocator/free pair for every allocation category.
+ *
+ * Forwards \p allocfunc and \p freefunc to the database, misc, scratch and
+ * stream setters; a NULL argument therefore selects the default for that
+ * hook. Always returns HS_SUCCESS. */
+HS_PUBLIC_API
+hs_error_t hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
+    /* Each setter only touches its own pair of hooks and always succeeds, so
+     * the individual return values carry no information. */
+    (void)hs_set_database_allocator(allocfunc, freefunc);
+    (void)hs_set_misc_allocator(allocfunc, freefunc);
+    (void)hs_set_scratch_allocator(allocfunc, freefunc);
+    (void)hs_set_stream_allocator(allocfunc, freefunc);
+
+    return HS_SUCCESS;
+}
+
+/** \brief Set the database hooks (hs_database_alloc / hs_database_free).
+ *
+ * A NULL \p allocfunc or \p freefunc selects the default for that hook.
+ * Always returns HS_SUCCESS. */
+HS_PUBLIC_API
+hs_error_t hs_set_database_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
+    hs_database_free = normalise_free(freefunc);
+    hs_database_alloc = normalise_alloc(allocfunc);
+    return HS_SUCCESS;
+}
+
+/** \brief Set the misc hooks (hs_misc_alloc / hs_misc_free).
+ *
+ * A NULL \p allocfunc or \p freefunc selects the default for that hook.
+ * Always returns HS_SUCCESS. */
+HS_PUBLIC_API
+hs_error_t hs_set_misc_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
+    hs_misc_free = normalise_free(freefunc);
+    hs_misc_alloc = normalise_alloc(allocfunc);
+    return HS_SUCCESS;
+}
+
+/** \brief Set the scratch hooks (hs_scratch_alloc / hs_scratch_free).
+ *
+ * A NULL \p allocfunc or \p freefunc selects the default for that hook.
+ * Always returns HS_SUCCESS. */
+HS_PUBLIC_API
+hs_error_t hs_set_scratch_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
+    hs_scratch_free = normalise_free(freefunc);
+    hs_scratch_alloc = normalise_alloc(allocfunc);
+    return HS_SUCCESS;
+}
+
+/** \brief Set the stream hooks (hs_stream_alloc / hs_stream_free).
+ *
+ * A NULL \p allocfunc or \p freefunc selects the default for that hook.
+ * Always returns HS_SUCCESS. */
+HS_PUBLIC_API
+hs_error_t hs_set_stream_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) {
+    hs_stream_free = normalise_free(freefunc);
+    hs_stream_alloc = normalise_alloc(allocfunc);
+    return HS_SUCCESS;
+}
--- a/src/allocator.h
+++ b/src/allocator.h
@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ALLOCATOR_H
+#define ALLOCATOR_H
+
+#include "hs_common.h"
+#include "ue2common.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+extern hs_alloc_t hs_database_alloc; /* set by hs_set_database_allocator() */
+extern hs_alloc_t hs_misc_alloc;     /* set by hs_set_misc_allocator() */
+extern hs_alloc_t hs_scratch_alloc;  /* set by hs_set_scratch_allocator() */
+extern hs_alloc_t hs_stream_alloc;   /* set by hs_set_stream_allocator() */
+
+extern hs_free_t hs_database_free; /* free hooks, set by the same calls */
+extern hs_free_t hs_misc_free;
+extern hs_free_t hs_scratch_free;
+extern hs_free_t hs_stream_free;
+#ifdef __cplusplus
+} /* extern C */
+#endif
+/** \brief Validate a block returned by one of the hs_alloc_t hooks.
+ *
+ * Returns HS_NOMEM for a NULL pointer, HS_BAD_ALLOC when the pointer is not
+ * aligned to alignof(unsigned long long), and HS_SUCCESS otherwise. On a bad
+ * alignment the caller should free the offending block. */
+static really_inline
+hs_error_t hs_check_alloc(const void *mem) {
+    if (!mem) {
+        return HS_NOMEM;
+    }
+    if (!ISALIGNED_N(mem, alignof(unsigned long long))) {
+        return HS_BAD_ALLOC;
+    }
+    return HS_SUCCESS;
+}
+
+#endif
--- a/src/compiler/asserts.cpp
+++ b/src/compiler/asserts.cpp
@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Convert temporary assert vertices (from construction method) to
+ * edge-based flags.
+ *
+ * This pass converts the temporary assert vertices created by the Glushkov
+ * construction process above (vertices with special assertions flags) into
+ * edges between those vertices' neighbours in the graph.
+ *
+ * These edges have the appropriate flags applied to them -- a path (u,t,v)
+ * through an assert vertex t will be replaced with the edge (u,v) with the
+ * assertion flags from t.
+ *
+ * Edges with mutually incompatible flags (such as the conjunction of
+ * word-to-word and word-to-nonword) are dropped.
+ */
+#include "asserts.h"
+#include "nfagraph/ng.h"
+#include "nfagraph/ng_prune.h"
+#include "nfagraph/ng_redundancy.h"
+#include "nfagraph/ng_util.h"
+#include "parser/position.h" // for POS flags
+#include "util/compile_error.h"
+#include "util/graph_range.h"
+
+#include <queue>
+#include <set>
+
+using namespace std;
+
+namespace ue2 {
+
+/** Hard limit on the maximum number of edges we'll clone before we throw up
+ * our hands and report 'Pattern is too large.' */
+static const size_t MAX_ASSERT_EDGES = 300000;
+
+/** Flags representing the word-boundary assertions, \\b or \\B. */
+static const int WORDBOUNDARY_FLAGS = POS_FLAG_ASSERT_WORD_TO_WORD
+                                    | POS_FLAG_ASSERT_WORD_TO_NONWORD
+                                    | POS_FLAG_ASSERT_NONWORD_TO_WORD
+                                    | POS_FLAG_ASSERT_NONWORD_TO_NONWORD
+                                    | POS_FLAG_ASSERT_WORD_TO_WORD_UCP
+                                    | POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP
+                                    | POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP
+                                    | POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP;
+/* Sentinel values for edge assertion flags: OPEN_EDGE (no bits set) places no
+ * condition on the edge, DEAD_EDGE (all bits set) marks a combination that
+ * conjunct() found impassable. */
+#define OPEN_EDGE 0U
+#define DEAD_EDGE (~0U)
+
+/** \brief Combine the flags of two asserts that lie in parallel.
+ *
+ * A DEAD_EDGE operand is ignored in favour of the other one, an OPEN_EDGE
+ * operand makes the result OPEN_EDGE, and otherwise the two flag sets are
+ * OR-ed together. */
+static
+u32 disjunct(u32 flags1, u32 flags2) {
+    DEBUG_PRINTF("disjunct %x %x\n", flags1, flags2);
+
+    const u32 rv = (flags1 == DEAD_EDGE) ? flags2
+                 : (flags2 == DEAD_EDGE) ? flags1
+                 : (flags1 == OPEN_EDGE || flags2 == OPEN_EDGE) ? OPEN_EDGE
+                 : (flags1 | flags2);
+
+    DEBUG_PRINTF("--> %x\n", rv);
+    return rv;
+}
+
+/** \brief Combine the flags of two asserts that lie in series.
+ *
+ * An OPEN_EDGE operand yields the other operand unchanged. Otherwise the
+ * result is the set of flags common to both, or DEAD_EDGE when they share
+ * none. */
+static
+u32 conjunct(u32 flags1, u32 flags2) {
+    DEBUG_PRINTF("conjunct %x %x\n", flags1, flags2);
+
+    /* Default: the conjunction of two different word boundary assertions is
+     * impassable. */
+    u32 rv = DEAD_EDGE;
+
+    if (flags1 == OPEN_EDGE) {
+        rv = flags2;
+    } else if (flags2 == OPEN_EDGE) {
+        rv = flags1;
+    } else {
+        const u32 common = flags1 & flags2;
+        if (common) {
+            rv = common;
+        }
+    }
+
+    DEBUG_PRINTF("--> %x\n", rv);
+    return rv;
+}
+
+typedef map<pair<NFAVertex, NFAVertex>, NFAEdge> edge_cache_t;
+/** \brief Bypass assert vertex \p t with direct predecessor-to-successor
+ * edges, then strip all edges from \p t.
+ *
+ * \param g graph, modified in place; \p t itself is not removed here.
+ * \param t vertex whose assert_flags are transferred onto the new edges.
+ * \param edge_cache map from (source, target) to an existing edge; consulted
+ *        before adding an edge and extended with every edge added here.
+ * \param assert_edge_count running total of edges added; incremented for each
+ *        new edge. CompileError is thrown once it exceeds MAX_ASSERT_EDGES.
+ */
+static
+void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache,
+                         u32 &assert_edge_count) {
+    DEBUG_PRINTF("replacing assert vertex %u\n", g[t].index);
+
+    const u32 flags = g[t].assert_flags;
+    DEBUG_PRINTF("consider assert vertex %u with flags %u\n",
+                 g[t].index, flags);
+
+    // Wire up all the predecessors to all the successors. Paths whose combined
+    // flags come out as DEAD_EDGE are dropped rather than wired.
+    for (const auto &inEdge : in_edges_range(t, g)) {
+        NFAVertex u = source(inEdge, g);
+        if (u == t) {
+            continue; // ignore self-loops
+        }
+
+        const u32 flags_inc_in = conjunct(g[inEdge].assert_flags,
+                                          flags);
+        if (flags_inc_in == DEAD_EDGE) {
+            DEBUG_PRINTF("fail, in-edge has bad flags %d\n",
+                         g[inEdge].assert_flags);
+            continue;
+        }
+        // Pair this predecessor with every successor of t.
+        for (const auto &outEdge : out_edges_range(t, g)) {
+            NFAVertex v = target(outEdge, g);
+
+            DEBUG_PRINTF("consider path [%u,%u,%u]\n", g[u].index,
+                         g[t].index, g[v].index);
+
+            if (v == t) {
+                continue; // ignore self-loops
+            }
+
+            const u32 flags_final = conjunct(g[outEdge].assert_flags,
+                                             flags_inc_in);
+
+            if (flags_final == DEAD_EDGE) {
+                DEBUG_PRINTF("fail, out-edge has bad flags %d\n",
+                             g[outEdge].assert_flags);
+                continue;
+            }
+
+            if ((g[u].assert_flags & POS_FLAG_MULTILINE_START)
+                && v == g.acceptEod) {
+                DEBUG_PRINTF("fail, (?m)^ does not match \\n at eod\n");
+                continue;
+            }
+
+            /* Replace path (u,t,v) with direct edge (u,v), unless the edge
+             * already exists, in which case we just need to edit its
+             * properties.
+             *
+             * Use edge_cache to prevent us going O(N).
+             *
+             * NOTE(review): a newly created edge is stamped with `flags`
+             * (t's own flags), whereas an existing edge is merged with
+             * `flags_final` (which also folds in the in- and out-edge flags).
+             * Confirm that this asymmetry is intentional.
+             */
+            auto cache_key = make_pair(u, v);
+            auto ecit = edge_cache.find(cache_key);
+            if (ecit == edge_cache.end()) {
+                DEBUG_PRINTF("adding edge %u %u\n", g[u].index,
+                              g[v].index);
+                NFAEdge e = add_edge(u, v, g).first;
+                edge_cache.emplace(cache_key, e);
+                g[e].assert_flags = flags;
+                if (++assert_edge_count > MAX_ASSERT_EDGES) {
+                    throw CompileError(g.expressionIndex,
+                                       "Pattern is too large.");
+                }
+            } else {
+                NFAEdge e = ecit->second;
+                DEBUG_PRINTF("updating edge %u %u [a %u]\n", g[u].index,
+                             g[v].index, g[t].index);
+                // Edge already exists.
+                u32 &e_flags = g[e].assert_flags;
+                e_flags = disjunct(e_flags, flags_final);
+                assert(e_flags != DEAD_EDGE);
+            }
+        }
+    }
+
+    // Clear vertex t to remove all the old edges.
+    /* no need to clear the cache, as we will never look up its edge as it is
+     * unreachable */
+    clear_vertex(t, g);
+}
+
+/** \brief Give vertex \p v a single internal report.
+ *
+ * The report is obtained from rm.getBasicInternalReport(g, adj) and its
+ * internal ID is inserted into the vertex's report set.
+ *
+ * Preconditions (asserted): \p v is not a special vertex and has no reports
+ * yet. */
+static
+void setReportId(ReportManager &rm, NGWrapper &g, NFAVertex v, s32 adj) {
+    assert(!is_special(v, g));
+    assert(g[v].reports.empty());
+
+    Report basic_report = rm.getBasicInternalReport(g, adj);
+    const auto internal_id = rm.getInternalId(basic_report);
+    g[v].reports.insert(internal_id);
+
+    DEBUG_PRINTF("set report id for vertex %u, adj %d\n",
+                 g[v].index, adj);
+}
+
+/** \brief Resolve multi-line start flags on the successors of start.
+ *
+ * For every successor of g.start that carries POS_FLAG_MULTILINE_START, each
+ * edge from it directly to g.accept is rerouted through a new vertex that
+ * matches any character and reports with an adjustment of -1, so that ^ does
+ * not match a trailing \\n. The flag is cleared on the vertex afterwards. */
+static
+void checkForMultilineStart(ReportManager &rm, NGWrapper &g) {
+    // Edges into accept that have to be replaced; collected first so that the
+    // graph is not modified while start's successors are being walked.
+    vector<NFAEdge> to_reroute;
+
+    for (auto v : adjacent_vertices_range(g.start, g)) {
+        if (g[v].assert_flags & POS_FLAG_MULTILINE_START) {
+            /* a multi-line start; there may be more than one */
+            DEBUG_PRINTF("mls %u %08x\n", g[v].index,
+                         g[v].assert_flags);
+
+            for (const auto &e : out_edges_range(v, g)) {
+                if (target(e, g) != g.accept) {
+                    continue;
+                }
+                to_reroute.push_back(e);
+            }
+
+            /* assert has been resolved; clear flag */
+            g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
+        }
+    }
+
+    // Interpose a dummy dot vertex on each collected edge.
+    for (const auto &e : to_reroute) {
+        NFAVertex dot = add_vertex(g);
+        g[dot].char_reach.setall();
+        setReportId(rm, g, dot, -1);
+        add_edge(source(e, g), dot, g[e], g);
+        add_edge(dot, g.accept, g);
+    }
+
+    // The original direct edges are no longer wanted.
+    remove_edges(to_reroute, g);
+}
+
+/** \brief True if any vertex in \p g has a word-boundary assertion flag set
+ * (any bit of WORDBOUNDARY_FLAGS). */
+static
+bool hasAssertVertices(const NGHolder &g) {
+    for (auto v : vertices_range(g)) {
+        if (g[v].assert_flags & WORDBOUNDARY_FLAGS) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/** \brief Convert temporary assert vertices (from construction method) to
+ * edge-based flags.
+ *
+ * Every vertex carrying a word-boundary flag is bypassed with direct edges
+ * that hold the assertion flags instead, multi-line start flags are resolved,
+ * and the graph is then pruned and renumbered. The rest of the code base
+ * never has to deal with assert vertices as a result. */
+void removeAssertVertices(ReportManager &rm, NGWrapper &g) {
+    DEBUG_PRINTF("before: graph has %zu vertices\n", num_vertices(g));
+
+    // Nothing to do unless at least one vertex carries an assertion flag.
+    if (!hasAssertVertices(g)) {
+        DEBUG_PRINTF("no assert vertices, done\n");
+        return;
+    }
+
+    // Index the existing edges by their (source, target) pair.
+    edge_cache_t edge_cache;
+    for (const auto &e : edges_range(g)) {
+        const auto key = make_pair(source(e, g), target(e, g));
+        edge_cache[key] = e;
+    }
+
+    size_t resolved = 0;
+    u32 assert_edge_count = 0;
+    for (auto v : vertices_range(g)) {
+        if (!(g[v].assert_flags & WORDBOUNDARY_FLAGS)) {
+            continue;
+        }
+        replaceAssertVertex(g, v, edge_cache, assert_edge_count);
+        ++resolved;
+    }
+
+    checkForMultilineStart(rm, g);
+
+    if (resolved) {
+        DEBUG_PRINTF("resolved %zu assert vertices\n", resolved);
+        pruneUseless(g);
+        pruneEmptyVertices(g);
+        g.renumberVertices();
+        g.renumberEdges();
+    }
+
+    DEBUG_PRINTF("after: graph has %zu vertices\n", num_vertices(g));
+    assert(!hasAssertVertices(g));
+}
+
+} // namespace ue2
--- a/src/compiler/asserts.h
+++ b/src/compiler/asserts.h
@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Convert temporary assert vertices (from construction method) to
+ * edge-based flags.
+ */
+#ifndef ASSERTS_H
+#define ASSERTS_H
+
+namespace ue2 {
+
+class ReportManager;
+class NGWrapper;
+
+/** \brief Convert temporary assert vertices (from construction method) to
+ * edge-based flags.
+ *
+ * Remove the horrors that are the temporary assert vertices which arise from
+ * our construction method. Allows the rest of our code base to live in
+ * blissful ignorance of their existence.
+ *
+ * \param rm report manager, used to create reports for vertices added while
+ *        resolving multi-line start assertions.
+ * \param g graph to rewrite in place.
+ *
+ * Throws CompileError ("Pattern is too large.") if too many edges have to be
+ * added while replacing assert vertices. */
+void removeAssertVertices(ReportManager &rm, NGWrapper &g);
+
+} // namespace ue2
+
+#endif // ASSERTS_H
--- a/src/compiler/compiler.cpp
+++ b/src/compiler/compiler.cpp
@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Compiler front-end interface.
+ */
+#include "asserts.h"
+#include "compiler.h"
+#include "database.h"
+#include "grey.h"
+#include "hs_internal.h"
+#include "hs_runtime.h"
+#include "ue2common.h"
+#include "nfagraph/ng_builder.h"
+#include "nfagraph/ng_dump.h"
+#include "nfagraph/ng.h"
+#include "nfagraph/ng_util.h"
+#include "parser/buildstate.h"
+#include "parser/dump.h"
+#include "parser/Component.h"
+#include "parser/parse_error.h"
+#include "parser/Parser.h"          // for flags
+#include "parser/position.h"
+#include "parser/position_dump.h"
+#include "parser/position_info.h"
+#include "parser/prefilter.h"
+#include "parser/shortcut_literal.h"
+#include "parser/unsupported.h"
+#include "parser/utf8_validate.h"
+#include "smallwrite/smallwrite_build.h"
+#include "rose/rose_build.h"
+#include "rose/rose_build_dump.h"
+#include "som/slot_manager_dump.h"
+#include "util/alloc.h"
+#include "util/compile_error.h"
+#include "util/target_info.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <sstream>
+
+using namespace std;
+
+namespace ue2 {
+
+
+/**
+ * \brief Check that the extended parameters in \a ext are self-consistent.
+ *
+ * Throws CompileError if an unknown flag bit is set, or if an enabled
+ * min_offset or min_length is larger than an enabled max_offset.
+ */
+static
+void validateExt(const hs_expr_ext &ext) {
+    static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET |
+                                                    HS_EXT_FLAG_MAX_OFFSET |
+                                                    HS_EXT_FLAG_MIN_LENGTH;
+    const bool has_unknown = (ext.flags & ~ALL_EXT_FLAGS) != 0;
+    if (has_unknown) {
+        throw CompileError("Invalid hs_expr_ext flag set.");
+    }
+
+    // Both remaining checks compare against max_offset, so they only apply
+    // when the caller has supplied one.
+    if (!(ext.flags & HS_EXT_FLAG_MAX_OFFSET)) {
+        return;
+    }
+
+    const bool use_min_offset = (ext.flags & HS_EXT_FLAG_MIN_OFFSET) != 0;
+    if (use_min_offset && ext.min_offset > ext.max_offset) {
+        throw CompileError("In hs_expr_ext, min_offset must be less than or "
+                           "equal to max_offset.");
+    }
+
+    const bool use_min_length = (ext.flags & HS_EXT_FLAG_MIN_LENGTH) != 0;
+    if (use_min_length && ext.min_length > ext.max_offset) {
+        throw CompileError("In hs_expr_ext, min_length must be less than or "
+                           "equal to max_offset.");
+    }
+}
+
+/**
+ * \brief Parse \a expression and record its flags and extended parameters.
+ *
+ * Throws ParseError if the expression cannot be parsed or is not valid UTF-8
+ * while in UTF-8 mode, and CompileError for unrecognised flags, unsupported
+ * flag combinations or inconsistent extended parameters.
+ */
+ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
+                                   unsigned flags, ReportID actionId,
+                                   const hs_expr_ext *ext)
+    : utf8(false),
+      allow_vacuous(flags & HS_FLAG_ALLOWEMPTY),
+      highlander(flags & HS_FLAG_SINGLEMATCH),
+      prefilter(flags & HS_FLAG_PREFILTER),
+      som(SOM_NONE),
+      index(index_in),
+      id(actionId),
+      min_offset(0),
+      max_offset(MAX_OFFSET),
+      min_length(0) {
+    ParseMode mode(flags);
+    component = parse(expression, mode);
+
+    // parse() may switch UTF-8 mode on, so take the final setting from the
+    // mode rather than from the flags.
+    utf8 = mode.utf8;
+    if (utf8 && !isValidUtf8(expression)) {
+        throw ParseError("Expression is not valid UTF-8.");
+    }
+
+    if (!component) {
+        assert(0); // parse() should have thrown a ParseError.
+        throw ParseError("Parse error.");
+    }
+
+    const bool has_unknown_flag = (flags & ~HS_FLAG_ALL) != 0;
+    if (has_unknown_flag) {
+        DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags);
+        throw CompileError("Unrecognised flag.");
+    }
+
+    if (flags & HS_FLAG_SOM_LEFTMOST) {
+        // FIXME: we disallow highlander + SOM, see UE-1850.
+        if (flags & HS_FLAG_SINGLEMATCH) {
+            throw CompileError("HS_FLAG_SINGLEMATCH is not supported in "
+                               "combination with HS_FLAG_SOM_LEFTMOST.");
+        }
+
+        // FIXME: we disallow prefilter + SOM, see UE-1899.
+        if (flags & HS_FLAG_PREFILTER) {
+            throw CompileError("HS_FLAG_PREFILTER is not supported in "
+                               "combination with HS_FLAG_SOM_LEFTMOST.");
+        }
+
+        // Set SOM type.
+        som = SOM_LEFT;
+    }
+
+    // Pick up extended parameters, if any, once they have been checked for
+    // consistency.
+    if (ext) {
+        validateExt(*ext);
+
+        const auto ext_flags = ext->flags;
+        if (ext_flags & HS_EXT_FLAG_MIN_OFFSET) {
+            min_offset = ext->min_offset;
+        }
+        if (ext_flags & HS_EXT_FLAG_MAX_OFFSET) {
+            max_offset = ext->max_offset;
+        }
+        if (ext_flags & HS_EXT_FLAG_MIN_LENGTH) {
+            min_length = ext->min_length;
+        }
+    }
+
+    // validateExt has already thrown if either of these relationships does
+    // not hold.
+    assert(max_offset >= min_offset);
+    assert(max_offset >= min_length);
+
+    // Since prefiltering and SOM aren't supported together, we must squash any
+    // min_length constraint as well.
+    if ((flags & HS_FLAG_PREFILTER) && min_length) {
+        DEBUG_PRINTF("prefiltering mode: squashing min_length constraint\n");
+        min_length = 0;
+    }
+}
+
+#if defined(DUMP_SUPPORT) || defined(DEBUG)
+/**
+ * \brief Emit the component tree of \a expr for inspection.
+ *
+ * In DEBUG builds the tree is printed with printf. In DUMP_SUPPORT builds it
+ * is also written to a text file under grey.dumpPath when Grey::DUMP_PARSE is
+ * set in grey.dumpFlags; \a stage becomes part of that file's name.
+ */
+void dumpExpression(UNUSED const ParsedExpression &expr,
+                    UNUSED const char *stage, UNUSED const Grey &grey) {
+#if defined(DEBUG)
+    DEBUG_PRINTF("===== Rule ID: %u (internalID:  %u) =====\n", expr.id,
+                 expr.index);
+    ostringstream tree_text;
+    dumpTree(tree_text, expr.component.get());
+    printf("%s\n", tree_text.str().c_str());
+#endif // DEBUG
+
+#if defined(DUMP_SUPPORT)
+    if (!(grey.dumpFlags & Grey::DUMP_PARSE)) {
+        return;
+    }
+
+    ostringstream path;
+    path << grey.dumpPath << "Expr_" << expr.index << "_componenttree_"
+         << stage << ".txt";
+    ofstream out(path.str().c_str());
+    out << "Component Tree for " << expr.id << endl;
+    dumpTree(out, expr.component.get());
+    if (expr.utf8) {
+        out << "UTF8 mode" << endl;
+    }
+#endif // DUMP_SUPPORT
+}
+#endif // DUMP_SUPPORT || DEBUG
+
+/**
+ * \brief Run Component tree optimisations on \a expr.
+ *
+ * Does nothing for expressions that carry a min_length constraint or a SOM
+ * mode.
+ */
+static
+void optimise(ParsedExpression &expr) {
+    const bool eligible = !expr.min_length && !expr.som;
+    if (eligible) {
+        DEBUG_PRINTF("optimising\n");
+        expr.component->optimise(true /* root is connected to sds */);
+    }
+}
+
+/**
+ * \brief Parse and check a single expression, then hand it to \a ng.
+ *
+ * The expression is added either through the literal shortcut or as an NFA
+ * graph. Any failure is reported by throwing (ParseError from the parsing
+ * stages, CompileError otherwise).
+ */
+void addExpression(NG &ng, unsigned index, const char *expression,
+                   unsigned flags, const hs_expr_ext *ext, ReportID id) {
+    assert(expression);
+    const CompileContext &cc = ng.cc;
+    DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%s'\n", index, id, flags,
+                 expression);
+
+    // Reject patterns that are too long (in characters) before parsing them.
+    const size_t pattern_len = strlen(expression);
+    if (pattern_len > cc.grey.limitPatternLength) {
+        throw CompileError("Pattern length exceeds limit.");
+    }
+
+    // Per-expression processing: any error raised from here on propagates to
+    // our caller as an exception.
+    ParsedExpression expr(index, expression, flags, id, ext);
+    dumpExpression(expr, "orig", cc.grey);
+
+    // Prefiltering transformations are applied only when requested.
+    if (expr.prefilter) {
+        prefilterTree(expr.component, ParseMode(flags));
+        dumpExpression(expr, "prefiltered", cc.grey);
+    }
+
+    // Zero-width assertions and other extended pcre constructs aren't
+    // supported yet; this throws a ParseError if the tree contains one.
+    checkUnsupported(*expr.component);
+
+    expr.component->checkEmbeddedStartAnchor(true);
+    expr.component->checkEmbeddedEndAnchor(true);
+
+    if (cc.grey.optimiseComponentTree) {
+        optimise(expr);
+        dumpExpression(expr, "opt", cc.grey);
+    }
+
+    DEBUG_PRINTF("component=%p, nfaId=%u, reportId=%u\n",
+                 expr.component.get(), expr.index, expr.id);
+
+    // SOM flags in streaming mode are only usable together with an SOM
+    // precision mode.
+    const bool wants_som = expr.som != SOM_NONE;
+    if (wants_som && cc.streaming && !ng.ssm.somPrecision()) {
+        throw CompileError("To use a SOM expression flag in streaming mode, "
+                           "an SOM precision mode (e.g. "
+                           "HS_MODE_SOM_HORIZON_LARGE) must be specified.");
+    }
+
+    // A literal expression can be fed directly to Rose, skipping the NFA
+    // graph build entirely.
+    if (shortcutLiteral(ng, expr)) {
+        DEBUG_PRINTF("took literal short cut\n");
+        return;
+    }
+
+    auto graph = buildWrapper(ng.rm, cc, expr);
+    if (!graph) {
+        DEBUG_PRINTF("NFA build failed on ID %u, but no exception was "
+                     "thrown.\n", expr.id);
+        throw CompileError("Internal error.");
+    }
+
+    const bool vacuous_ok = expr.allow_vacuous;
+    if (!vacuous_ok && matches_everywhere(*graph)) {
+        throw CompileError("Pattern matches empty buffer; use "
+                           "HS_FLAG_ALLOWEMPTY to enable support.");
+    }
+
+    const bool added = ng.addGraph(*graph);
+    if (!added) {
+        DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", expr.id);
+        throw CompileError("Error compiling expression.");
+    }
+}
+
+/**
+ * \brief Build the Rose engine for \a ng and run the associated dump hooks.
+ *
+ * Unless the engine is a pure literal case, a small-write engine is also built
+ * and, if one is produced, combined with the Rose engine. Returns nullptr if
+ * the Rose build fails.
+ */
+static
+aligned_unique_ptr<RoseEngine> generateRoseEngine(NG &ng) {
+    u32 minWidth = ROSE_BOUND_INF;
+    if (ng.minWidth.is_finite()) {
+        minWidth = verify_u32(ng.minWidth);
+    }
+
+    auto rose = ng.rose->buildRose(minWidth);
+    if (!rose) {
+        DEBUG_PRINTF("error building rose\n");
+        assert(0);
+        return nullptr;
+    }
+
+    /* avoid building a smwr if just a pure floating case. */
+    const bool pure_literal = roseIsPureLiteral(rose.get());
+    if (!pure_literal) {
+        const u32 quality = roseQuality(rose.get());
+        auto small_write = ng.smwr->build(quality);
+        if (small_write) {
+            rose = roseAddSmallWrite(rose.get(), small_write.get());
+        }
+    }
+
+    const Grey &grey = ng.cc.grey;
+    dumpRose(*ng.rose, rose.get(), grey);
+    dumpReportManager(ng.rm, grey);
+    dumpSomSlotManager(ng.ssm, grey);
+    dumpSmallWrite(rose.get(), grey);
+
+    return rose;
+}
+
+/**
+ * \brief Translate \a target_info into platform_t flag bits.
+ *
+ * HS_PLATFORM_NOAVX2 is set when the target does not have AVX2; otherwise the
+ * result is zero.
+ */
+platform_t target_to_platform(const target_t &target_info) {
+    const platform_t avx2_bits =
+        target_info.has_avx2() ? 0 : HS_PLATFORM_NOAVX2;
+    return avx2_bits;
+}
+
+/**
+ * \brief Generate the Rose bytecode for \a ng and wrap it in a database.
+ *
+ * The bytecode size is written to \a length. Throws CompileError if the
+ * engine cannot be generated, is empty, or the database cannot be allocated.
+ */
+struct hs_database *build(NG &ng, unsigned int *length) {
+    assert(length);
+
+    const auto engine = generateRoseEngine(ng);
+    if (!engine) {
+        throw CompileError("Unable to generate bytecode.");
+    }
+
+    *length = roseSize(engine.get());
+    if (*length == 0) {
+        DEBUG_PRINTF("RoseEngine has zero length\n");
+        assert(0);
+        throw CompileError("Internal error.");
+    }
+
+    // The database is created from the raw bytes of the engine.
+    const char *bytecode = reinterpret_cast<const char *>(engine.get());
+    const platform_t platform = target_to_platform(ng.cc.target_info);
+    struct hs_database *db = dbCreate(bytecode, *length, platform);
+    if (db == nullptr) {
+        throw CompileError("Could not allocate memory for bytecode.");
+    }
+
+    return db;
+}
+
+/** \brief Remove every entry equal to PositionInfo(\a pos) from \a v. */
+static
+void stripFromPositions(vector<PositionInfo> &v, Position pos) {
+    // Erase-remove idiom: compact the survivors, then drop the tail.
+    v.erase(remove(v.begin(), v.end(), PositionInfo(pos)), v.end());
+}
+
+/**
+ * \brief Wire the start and start-dotstar states to the first positions of
+ * the expression.
+ *
+ * Epsilon entries in the first set are turned into accept connections from
+ * the start states, carrying the epsilon's flags.
+ */
+static
+void connectInitialStates(GlushkovBuildState &bs,
+                          const ParsedExpression &expr) {
+    vector<PositionInfo> initials = expr.component->first();
+    const NFABuilder &builder = bs.getBuilder();
+    const Position startState = builder.getStart();
+    const Position startDotStarState = builder.getStartDotStar();
+
+    DEBUG_PRINTF("wiring initials = %s\n",
+                 dumpPositions(initials.begin(), initials.end()).c_str());
+
+    vector<PositionInfo> starts = {startState, startDotStarState};
+
+    // start and startDs can be present due to boundaries; they must not be
+    // wired to themselves.
+    stripFromPositions(initials, startState);
+    stripFromPositions(initials, startDotStarState);
+
+    // replace epsilons with accepts
+    for (const auto &init : initials) {
+        if (init.pos == GlushkovBuildState::POS_EPSILON) {
+            assert(starts.size() == 2); /* start, startds */
+            vector<PositionInfo> flagged(starts);
+            for (auto &st : flagged) {
+                st.flags = init.flags;
+            }
+            bs.connectAccepts(flagged);
+        }
+    }
+
+    if (!initials.empty()) {
+        bs.connectRegions(starts, initials);
+    }
+}
+
+/** \brief Wire the last positions of the expression to the accept states. */
+static
+void connectFinalStates(GlushkovBuildState &bs, const ParsedExpression &expr) {
+    vector<PositionInfo> lasts = expr.component->last();
+
+    DEBUG_PRINTF("wiring finals = %s\n",
+                 dumpPositions(lasts.begin(), lasts.end()).c_str());
+
+    bs.connectAccepts(lasts);
+}
+
+#ifndef NDEBUG
+/**
+ * \brief True if checkUnsupported() accepts \a c without throwing a
+ * ParseError. Only compiled when assertions are enabled.
+ */
+static
+bool isSupported(const Component &c) {
+    bool supported = true;
+    try {
+        checkUnsupported(c);
+    } catch (ParseError &) {
+        supported = false;
+    }
+    return supported;
+}
+#endif // NDEBUG
+
+/**
+ * \brief Construct the NFA graph for \a expr using the Glushkov build state.
+ *
+ * The steps below must run in this order: positions are noted first, then the
+ * start states, follow set and accepts are wired, and only then are the edges
+ * created. The temporary assert vertices are removed from the graph before it
+ * is returned.
+ */
+unique_ptr<NGWrapper> buildWrapper(ReportManager &rm, const CompileContext &cc,
+                                   const ParsedExpression &expr) {
+    // Callers are expected to have rejected unsupported constructs already.
+    assert(isSupported(*expr.component));
+
+    const unique_ptr<NFABuilder> builder = makeNFABuilder(rm, cc, expr);
+    assert(builder);
+
+    // Set up START and ACCEPT states; retrieve the special states
+    const auto bs = makeGlushkovBuildState(*builder, expr.prefilter);
+
+    // Map position IDs to characters/components
+    expr.component->notePositions(*bs);
+
+    // Wire the start dotstar state to the firsts
+    connectInitialStates(*bs, expr);
+
+    DEBUG_PRINTF("wire up body of expr\n");
+    // Build the rest of the FOLLOW set
+    vector<PositionInfo> initials = {builder->getStartDotStar(),
+                                     builder->getStart()};
+    expr.component->buildFollowSet(*bs, initials);
+
+    // Wire the lasts to the accept state
+    connectFinalStates(*bs, expr);
+
+    // Create our edges
+    bs->buildEdges();
+
+    auto g = builder->getGraph();
+    assert(g);
+
+    // Dump the graph as built, then convert the assert vertices to edge
+    // flags so later stages never see them.
+    dumpDotWrapper(*g, "00_before_asserts", cc.grey);
+    removeAssertVertices(rm, *g);
+
+    return g;
+}
+
+} // namespace ue2
--- a/src/compiler/compiler.h
+++ b/src/compiler/compiler.h
@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Compiler front-end interface
+ */
+
+#ifndef COMPILER_H
+#define COMPILER_H
+
+#include "ue2common.h"
+#include "database.h"
+#include "parser/Component.h"
+#include "som/som.h"
+
+#include <memory>
+#include <boost/core/noncopyable.hpp>
+
+struct hs_database;
+struct hs_expr_ext;
+
+namespace ue2 {
+
+struct CompileContext;
+struct Grey;
+struct target_t;
+class NG;
+class ReportManager;
+class NGWrapper;
+
+/** \brief Class gathering together the pieces of a parsed expression.
+ *
+ * The constructor parses the expression string and throws on failure. The
+ * object owns the resulting component tree and is non-copyable.
+ */
+class ParsedExpression : boost::noncopyable {
+public:
+    /** \brief Parse \a expression with the given \a flags; \a ext may be
+     * null when there are no extended parameters. */
+    ParsedExpression(unsigned index, const char *expression, unsigned flags,
+                     ReportID actionId, const hs_expr_ext *ext = nullptr);
+
+    bool utf8; //!< UTF-8 mode is in effect (may be enabled during parsing)
+
+    /** \brief Root node of the parsed component tree. */
+    std::unique_ptr<ue2::Component> component;
+
+    const bool allow_vacuous;   //!< HS_FLAG_ALLOWEMPTY specified
+    const bool highlander;      //!< HS_FLAG_SINGLEMATCH specified
+    const bool prefilter;       //!< HS_FLAG_PREFILTER specified
+    som_type som;               //!< chosen SOM mode, or SOM_NONE
+
+    /** \brief Index in expressions array passed to \ref hs_compile_multi */
+    const unsigned index;
+
+    const ReportID id; //!< user-specified pattern ID
+    u64a min_offset;   //!< minimum offset constraint; 0 if not used
+    u64a max_offset;   //!< maximum offset constraint; MAX_OFFSET if not used
+    u64a min_length;   //!< minimum length constraint; 0 if not used
+};
+
+/**
+ * Add an expression to the compiler.
+ *
+ * @param ng
+ *      The global NG object.
+ * @param index
+ *      The index of the expression (used for errors)
+ * @param expression
+ *      NULL-terminated PCRE expression
+ * @param flags
+ *      The full set of Hyperscan flags associated with this rule.
+ * @param ext
+ *      Struct containing extra parameters for this expression, or NULL if
+ *      none.
+ * @param actionId
+ *      The identifier to associate with the expression; returned by engine on
+ *      match.
+ */
+void addExpression(NG &ng, unsigned index, const char *expression,
+                   unsigned flags, const hs_expr_ext *ext, ReportID actionId);
+
+/**
+ * Build a Hyperscan database out of the expressions we've been given. A
+ * fatal error will result in an exception being thrown.
+ *
+ * @param ng
+ *      The global NG object.
+ * @param[out] length
+ *      The number of bytes occupied by the compiled structure.
+ * @return
+ *      The compiled structure. Should be deallocated with the
+ *      hs_database_free() function.
+ */
+struct hs_database *build(NG &ng, unsigned int *length);
+
+/**
+ * Constructs an NFA graph from the given expression tree.
+ *
+ * @param rm
+ *      Global ReportManager for this compile.
+ * @param cc
+ *      Global compile context for this compile.
+ * @param expr
+ *      ParsedExpression object.
+ * @return
+ *      nullptr on error.
+ */
+std::unique_ptr<NGWrapper> buildWrapper(ReportManager &rm,
+                                        const CompileContext &cc,
+                                        const ParsedExpression &expr);
+
+/**
+ * Build a platform_t out of a target_t.
+ */
+platform_t target_to_platform(const target_t &target_info);
+
+/* dumpExpression is a real function only in DUMP_SUPPORT or DEBUG builds; in
+ * all other builds it is an inline no-op, so call sites need no guards. */
+#if defined(DUMP_SUPPORT) || defined(DEBUG)
+void dumpExpression(const ParsedExpression &expr, const char *stage,
+                    const Grey &grey);
+#else
+static really_inline
+void dumpExpression(UNUSED const ParsedExpression &expr,
+                    UNUSED const char *stage, UNUSED const Grey &grey) {
+}
+
+#endif
+
+} // namespace ue2
+
+#endif // COMPILER_H
--- a/src/compiler/error.cpp
+++ b/src/compiler/error.cpp
@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Compile-time error utils.
+ */
+#include "allocator.h"
+#include "error.h"
+#include "ue2common.h"
+#include "hs_compile.h"
+#include "util/compile_error.h"
+
+#include <cstring>
+#include <string>
+
+using std::string;
+
+// Message text for the statically allocated error objects below.
+static const char failureNoMemory[] = "Unable to allocate memory.";
+static const char failureInternal[] = "Internal error.";
+
+// Error objects with static storage, so they can be returned without any
+// allocation. freeCompileError() recognises them by address and does not free
+// them. "extern" gives these const objects external linkage; the const_cast is
+// presumably needed because the message field is a non-const char pointer.
+extern const hs_compile_error_t hs_enomem = {
+    const_cast<char *>(failureNoMemory), 0
+};
+extern const hs_compile_error_t hs_einternal = {
+    const_cast<char *>(failureInternal), 0
+};
+
+namespace ue2 {
+
+/**
+ * \brief Allocate a compile error carrying a copy of \a err and the given
+ * \a expression index.
+ *
+ * Both the error object and its message are obtained from hs_misc_alloc. If
+ * either allocation fails, the static hs_enomem object is returned instead.
+ */
+hs_compile_error_t *generateCompileError(const string &err, int expression) {
+    hs_compile_error_t *const no_memory =
+        const_cast<hs_compile_error_t *>(&hs_enomem);
+
+    hs_compile_error_t *error =
+        (hs_compile_error_t *)hs_misc_alloc(sizeof(hs_compile_error_t));
+    if (!error) {
+        return no_memory;
+    }
+
+    // Copy the text including its terminating NUL.
+    const string::size_type msg_len = err.size() + 1;
+    char *text = (char *)hs_misc_alloc(msg_len);
+    if (!text) {
+        hs_misc_free(error);
+        return no_memory;
+    }
+    memcpy(text, err.c_str(), msg_len);
+
+    error->message = text;
+    error->expression = expression;
+    return error;
+}
+
+/**
+ * \brief Build a compile error from \a e, using -1 as the expression index
+ * when the exception does not carry one.
+ */
+hs_compile_error_t *generateCompileError(const CompileError &e) {
+    const int expression = e.hasIndex ? (int)e.index : -1;
+    return generateCompileError(e.reason, expression);
+}
+
+/**
+ * \brief Release an error returned by generateCompileError().
+ *
+ * Null pointers and the static hs_enomem / hs_einternal objects are ignored,
+ * since none of them were allocated.
+ */
+void freeCompileError(hs_compile_error_t *error) {
+    const bool is_static = (error == &hs_enomem || error == &hs_einternal);
+    if (!error || is_static) {
+        return;
+    }
+
+    hs_misc_free(error->message);
+    hs_misc_free(error);
+}
+
+} // namespace ue2
--- a/src/compiler/error.h
+++ b/src/compiler/error.h
@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Compile-time error utils.
+ */
+
+#ifndef COMPILE_ERROR_H
+#define COMPILE_ERROR_H
+
+#include <string>
+
+struct hs_compile_error;
+
+// Special errors that aren't allocated with hs_misc_alloc/hs_misc_free.
+extern const hs_compile_error hs_enomem;
+extern const hs_compile_error hs_einternal;
+
+namespace ue2 {
+
+class CompileError;
+
+hs_compile_error *generateCompileError(const std::string &err, int expression);
+hs_compile_error *generateCompileError(const CompileError &e);
+
+void freeCompileError(hs_compile_error *error);
+
+} // namespace ue2
+
+#endif
--- a/src/crc32.c
+++ b/src/crc32.c
@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "crc32.h"
+#include "config.h"
+#include "ue2common.h"
+
+#if defined(HAVE_C_X86INTRIN_H)
+#include <x86intrin.h>
+#elif defined(HAVE_C_INTRIN_H)
+#include <intrin.h>
+#endif
+
+#ifndef __SSE4_2__
+
+/***
+ *** What follows is derived from Intel's Slicing-by-8 CRC32 impl, which is BSD
+ *** licensed and available from http://sourceforge.net/projects/slicing-by-8/
+ ***/
+
+/*
+ * Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
+ *
+ *
+ * This software program is licensed subject to the BSD License, 
+ * available at http://www.opensource.org/licenses/bsd-license.html.
+ *
+ * Abstract:
+ *
+ *  Tables for software CRC generation
+ */
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value). This
+ * table is also the one used for the byte-at-a-time steps. It is read-only
+ * data, so it is const-qualified: accidental writes become compile errors. */
+static
+const u32 crc_tableil8_o32[256] =
+{
+ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o32
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value).
+ * Read-only data, so it is const-qualified: accidental writes become compile
+ * errors. */
+static
+const u32 crc_tableil8_o40[256] =
+{
+ 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945,
+ 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD,
+ 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4,
+ 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C,
+ 0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47,
+ 0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF,
+ 0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6,
+ 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E,
+ 0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41,
+ 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9,
+ 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0,
+ 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78,
+ 0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43,
+ 0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB,
+ 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2,
+ 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A,
+ 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC,
+ 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004,
+ 0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D,
+ 0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185,
+ 0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE,
+ 0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306,
+ 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F,
+ 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287,
+ 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8,
+ 0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600,
+ 0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439,
+ 0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781,
+ 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA,
+ 0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502,
+ 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B,
+ 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o40
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value).
+ * Read-only data, so it is const-qualified: accidental writes become compile
+ * errors. */
+static
+const u32 crc_tableil8_o48[256] =
+{
+ 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469,
+ 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC,
+ 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3,
+ 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726,
+ 0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D,
+ 0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8,
+ 0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7,
+ 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32,
+ 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0,
+ 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75,
+ 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A,
+ 0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF,
+ 0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4,
+ 0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161,
+ 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E,
+ 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB,
+ 0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A,
+ 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF,
+ 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0,
+ 0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065,
+ 0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E,
+ 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB,
+ 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4,
+ 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71,
+ 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3,
+ 0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36,
+ 0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79,
+ 0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC,
+ 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7,
+ 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622,
+ 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D,
+ 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o48
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value).
+ * Read-only data, so it is const-qualified: accidental writes become compile
+ * errors. */
+static
+const u32 crc_tableil8_o56[256] =
+{
+ 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA,
+ 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C,
+ 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7,
+ 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11,
+ 0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41,
+ 0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7,
+ 0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C,
+ 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A,
+ 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D,
+ 0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB,
+ 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610,
+ 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6,
+ 0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6,
+ 0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040,
+ 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B,
+ 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D,
+ 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5,
+ 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213,
+ 0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8,
+ 0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E,
+ 0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E,
+ 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698,
+ 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443,
+ 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5,
+ 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12,
+ 0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4,
+ 0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F,
+ 0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9,
+ 0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99,
+ 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F,
+ 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4,
+ 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o56
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value).
+ * Read-only data, so it is const-qualified: accidental writes become compile
+ * errors. */
+static
+const u32 crc_tableil8_o64[256] =
+{
+ 0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44,
+ 0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, 0x2522B521, 0x1D33DA8D, 0x55006A79, 0x6D1105D5,
+ 0x8F2261D3, 0xB7330E7F, 0xFF00BE8B, 0xC711D127, 0x6F67DF63, 0x5776B0CF, 0x1F45003B, 0x27546F97,
+ 0x4A456A42, 0x725405EE, 0x3A67B51A, 0x0276DAB6, 0xAA00D4F2, 0x9211BB5E, 0xDA220BAA, 0xE2336406,
+ 0x1BA8B557, 0x23B9DAFB, 0x6B8A6A0F, 0x539B05A3, 0xFBED0BE7, 0xC3FC644B, 0x8BCFD4BF, 0xB3DEBB13,
+ 0xDECFBEC6, 0xE6DED16A, 0xAEED619E, 0x96FC0E32, 0x3E8A0076, 0x069B6FDA, 0x4EA8DF2E, 0x76B9B082,
+ 0x948AD484, 0xAC9BBB28, 0xE4A80BDC, 0xDCB96470, 0x74CF6A34, 0x4CDE0598, 0x04EDB56C, 0x3CFCDAC0,
+ 0x51EDDF15, 0x69FCB0B9, 0x21CF004D, 0x19DE6FE1, 0xB1A861A5, 0x89B90E09, 0xC18ABEFD, 0xF99BD151,
+ 0x37516AAE, 0x0F400502, 0x4773B5F6, 0x7F62DA5A, 0xD714D41E, 0xEF05BBB2, 0xA7360B46, 0x9F2764EA,
+ 0xF236613F, 0xCA270E93, 0x8214BE67, 0xBA05D1CB, 0x1273DF8F, 0x2A62B023, 0x625100D7, 0x5A406F7B,
+ 0xB8730B7D, 0x806264D1, 0xC851D425, 0xF040BB89, 0x5836B5CD, 0x6027DA61, 0x28146A95, 0x10050539,
+ 0x7D1400EC, 0x45056F40, 0x0D36DFB4, 0x3527B018, 0x9D51BE5C, 0xA540D1F0, 0xED736104, 0xD5620EA8,
+ 0x2CF9DFF9, 0x14E8B055, 0x5CDB00A1, 0x64CA6F0D, 0xCCBC6149, 0xF4AD0EE5, 0xBC9EBE11, 0x848FD1BD,
+ 0xE99ED468, 0xD18FBBC4, 0x99BC0B30, 0xA1AD649C, 0x09DB6AD8, 0x31CA0574, 0x79F9B580, 0x41E8DA2C,
+ 0xA3DBBE2A, 0x9BCAD186, 0xD3F96172, 0xEBE80EDE, 0x439E009A, 0x7B8F6F36, 0x33BCDFC2, 0x0BADB06E,
+ 0x66BCB5BB, 0x5EADDA17, 0x169E6AE3, 0x2E8F054F, 0x86F90B0B, 0xBEE864A7, 0xF6DBD453, 0xCECABBFF,
+ 0x6EA2D55C, 0x56B3BAF0, 0x1E800A04, 0x269165A8, 0x8EE76BEC, 0xB6F60440, 0xFEC5B4B4, 0xC6D4DB18,
+ 0xABC5DECD, 0x93D4B161, 0xDBE70195, 0xE3F66E39, 0x4B80607D, 0x73910FD1, 0x3BA2BF25, 0x03B3D089,
+ 0xE180B48F, 0xD991DB23, 0x91A26BD7, 0xA9B3047B, 0x01C50A3F, 0x39D46593, 0x71E7D567, 0x49F6BACB,
+ 0x24E7BF1E, 0x1CF6D0B2, 0x54C56046, 0x6CD40FEA, 0xC4A201AE, 0xFCB36E02, 0xB480DEF6, 0x8C91B15A,
+ 0x750A600B, 0x4D1B0FA7, 0x0528BF53, 0x3D39D0FF, 0x954FDEBB, 0xAD5EB117, 0xE56D01E3, 0xDD7C6E4F,
+ 0xB06D6B9A, 0x887C0436, 0xC04FB4C2, 0xF85EDB6E, 0x5028D52A, 0x6839BA86, 0x200A0A72, 0x181B65DE,
+ 0xFA2801D8, 0xC2396E74, 0x8A0ADE80, 0xB21BB12C, 0x1A6DBF68, 0x227CD0C4, 0x6A4F6030, 0x525E0F9C,
+ 0x3F4F0A49, 0x075E65E5, 0x4F6DD511, 0x777CBABD, 0xDF0AB4F9, 0xE71BDB55, 0xAF286BA1, 0x9739040D,
+ 0x59F3BFF2, 0x61E2D05E, 0x29D160AA, 0x11C00F06, 0xB9B60142, 0x81A76EEE, 0xC994DE1A, 0xF185B1B6,
+ 0x9C94B463, 0xA485DBCF, 0xECB66B3B, 0xD4A70497, 0x7CD10AD3, 0x44C0657F, 0x0CF3D58B, 0x34E2BA27,
+ 0xD6D1DE21, 0xEEC0B18D, 0xA6F30179, 0x9EE26ED5, 0x36946091, 0x0E850F3D, 0x46B6BFC9, 0x7EA7D065,
+ 0x13B6D5B0, 0x2BA7BA1C, 0x63940AE8, 0x5B856544, 0xF3F36B00, 0xCBE204AC, 0x83D1B458, 0xBBC0DBF4,
+ 0x425B0AA5, 0x7A4A6509, 0x3279D5FD, 0x0A68BA51, 0xA21EB415, 0x9A0FDBB9, 0xD23C6B4D, 0xEA2D04E1,
+ 0x873C0134, 0xBF2D6E98, 0xF71EDE6C, 0xCF0FB1C0, 0x6779BF84, 0x5F68D028, 0x175B60DC, 0x2F4A0F70,
+ 0xCD796B76, 0xF56804DA, 0xBD5BB42E, 0x854ADB82, 0x2D3CD5C6, 0x152DBA6A, 0x5D1E0A9E, 0x650F6532,
+ 0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o64
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value).
+ * Read-only data, so it is const-qualified: accidental writes become compile
+ * errors. */
+static
+const u32 crc_tableil8_o72[256] =
+{
+ 0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD,
+ 0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, 0xD2F32F68, 0x3DC34471, 0x097F8FAB, 0xE64FE4B2,
+ 0xC00C303E, 0x2F3C5B27, 0x1B8090FD, 0xF4B0FBE4, 0x72F90749, 0x9DC96C50, 0xA975A78A, 0x4645CC93,
+ 0xA00A2821, 0x4F3A4338, 0x7B8688E2, 0x94B6E3FB, 0x12FF1F56, 0xFDCF744F, 0xC973BF95, 0x2643D48C,
+ 0x85F4168D, 0x6AC47D94, 0x5E78B64E, 0xB148DD57, 0x370121FA, 0xD8314AE3, 0xEC8D8139, 0x03BDEA20,
+ 0xE5F20E92, 0x0AC2658B, 0x3E7EAE51, 0xD14EC548, 0x570739E5, 0xB83752FC, 0x8C8B9926, 0x63BBF23F,
+ 0x45F826B3, 0xAAC84DAA, 0x9E748670, 0x7144ED69, 0xF70D11C4, 0x183D7ADD, 0x2C81B107, 0xC3B1DA1E,
+ 0x25FE3EAC, 0xCACE55B5, 0xFE729E6F, 0x1142F576, 0x970B09DB, 0x783B62C2, 0x4C87A918, 0xA3B7C201,
+ 0x0E045BEB, 0xE13430F2, 0xD588FB28, 0x3AB89031, 0xBCF16C9C, 0x53C10785, 0x677DCC5F, 0x884DA746,
+ 0x6E0243F4, 0x813228ED, 0xB58EE337, 0x5ABE882E, 0xDCF77483, 0x33C71F9A, 0x077BD440, 0xE84BBF59,
+ 0xCE086BD5, 0x213800CC, 0x1584CB16, 0xFAB4A00F, 0x7CFD5CA2, 0x93CD37BB, 0xA771FC61, 0x48419778,
+ 0xAE0E73CA, 0x413E18D3, 0x7582D309, 0x9AB2B810, 0x1CFB44BD, 0xF3CB2FA4, 0xC777E47E, 0x28478F67,
+ 0x8BF04D66, 0x64C0267F, 0x507CEDA5, 0xBF4C86BC, 0x39057A11, 0xD6351108, 0xE289DAD2, 0x0DB9B1CB,
+ 0xEBF65579, 0x04C63E60, 0x307AF5BA, 0xDF4A9EA3, 0x5903620E, 0xB6330917, 0x828FC2CD, 0x6DBFA9D4,
+ 0x4BFC7D58, 0xA4CC1641, 0x9070DD9B, 0x7F40B682, 0xF9094A2F, 0x16392136, 0x2285EAEC, 0xCDB581F5,
+ 0x2BFA6547, 0xC4CA0E5E, 0xF076C584, 0x1F46AE9D, 0x990F5230, 0x763F3929, 0x4283F2F3, 0xADB399EA,
+ 0x1C08B7D6, 0xF338DCCF, 0xC7841715, 0x28B47C0C, 0xAEFD80A1, 0x41CDEBB8, 0x75712062, 0x9A414B7B,
+ 0x7C0EAFC9, 0x933EC4D0, 0xA7820F0A, 0x48B26413, 0xCEFB98BE, 0x21CBF3A7, 0x1577387D, 0xFA475364,
+ 0xDC0487E8, 0x3334ECF1, 0x0788272B, 0xE8B84C32, 0x6EF1B09F, 0x81C1DB86, 0xB57D105C, 0x5A4D7B45,
+ 0xBC029FF7, 0x5332F4EE, 0x678E3F34, 0x88BE542D, 0x0EF7A880, 0xE1C7C399, 0xD57B0843, 0x3A4B635A,
+ 0x99FCA15B, 0x76CCCA42, 0x42700198, 0xAD406A81, 0x2B09962C, 0xC439FD35, 0xF08536EF, 0x1FB55DF6,
+ 0xF9FAB944, 0x16CAD25D, 0x22761987, 0xCD46729E, 0x4B0F8E33, 0xA43FE52A, 0x90832EF0, 0x7FB345E9,
+ 0x59F09165, 0xB6C0FA7C, 0x827C31A6, 0x6D4C5ABF, 0xEB05A612, 0x0435CD0B, 0x308906D1, 0xDFB96DC8,
+ 0x39F6897A, 0xD6C6E263, 0xE27A29B9, 0x0D4A42A0, 0x8B03BE0D, 0x6433D514, 0x508F1ECE, 0xBFBF75D7,
+ 0x120CEC3D, 0xFD3C8724, 0xC9804CFE, 0x26B027E7, 0xA0F9DB4A, 0x4FC9B053, 0x7B757B89, 0x94451090,
+ 0x720AF422, 0x9D3A9F3B, 0xA98654E1, 0x46B63FF8, 0xC0FFC355, 0x2FCFA84C, 0x1B736396, 0xF443088F,
+ 0xD200DC03, 0x3D30B71A, 0x098C7CC0, 0xE6BC17D9, 0x60F5EB74, 0x8FC5806D, 0xBB794BB7, 0x544920AE,
+ 0xB206C41C, 0x5D36AF05, 0x698A64DF, 0x86BA0FC6, 0x00F3F36B, 0xEFC39872, 0xDB7F53A8, 0x344F38B1,
+ 0x97F8FAB0, 0x78C891A9, 0x4C745A73, 0xA344316A, 0x250DCDC7, 0xCA3DA6DE, 0xFE816D04, 0x11B1061D,
+ 0xF7FEE2AF, 0x18CE89B6, 0x2C72426C, 0xC3422975, 0x450BD5D8, 0xAA3BBEC1, 0x9E87751B, 0x71B71E02,
+ 0x57F4CA8E, 0xB8C4A197, 0x8C786A4D, 0x63480154, 0xE501FDF9, 0x0A3196E0, 0x3E8D5D3A, 0xD1BD3623,
+ 0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o72
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value).
+ * Read-only data, so it is const-qualified: accidental writes become compile
+ * errors. */
+static
+const u32 crc_tableil8_o80[256] =
+{
+ 0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089,
+ 0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, 0xEBCD3882, 0x83CE144A, 0x3BCB6112, 0x53C84DDA,
+ 0x9C5BFAA6, 0xF458D66E, 0x4C5DA336, 0x245E8FFE, 0x39BB3F77, 0x51B813BF, 0xE9BD66E7, 0x81BE4A2F,
+ 0xD27607F5, 0xBA752B3D, 0x02705E65, 0x6A7372AD, 0x7796C224, 0x1F95EEEC, 0xA7909BB4, 0xCF93B77C,
+ 0x3D5B83BD, 0x5558AF75, 0xED5DDA2D, 0x855EF6E5, 0x98BB466C, 0xF0B86AA4, 0x48BD1FFC, 0x20BE3334,
+ 0x73767EEE, 0x1B755226, 0xA370277E, 0xCB730BB6, 0xD696BB3F, 0xBE9597F7, 0x0690E2AF, 0x6E93CE67,
+ 0xA100791B, 0xC90355D3, 0x7106208B, 0x19050C43, 0x04E0BCCA, 0x6CE39002, 0xD4E6E55A, 0xBCE5C992,
+ 0xEF2D8448, 0x872EA880, 0x3F2BDDD8, 0x5728F110, 0x4ACD4199, 0x22CE6D51, 0x9ACB1809, 0xF2C834C1,
+ 0x7AB7077A, 0x12B42BB2, 0xAAB15EEA, 0xC2B27222, 0xDF57C2AB, 0xB754EE63, 0x0F519B3B, 0x6752B7F3,
+ 0x349AFA29, 0x5C99D6E1, 0xE49CA3B9, 0x8C9F8F71, 0x917A3FF8, 0xF9791330, 0x417C6668, 0x297F4AA0,
+ 0xE6ECFDDC, 0x8EEFD114, 0x36EAA44C, 0x5EE98884, 0x430C380D, 0x2B0F14C5, 0x930A619D, 0xFB094D55,
+ 0xA8C1008F, 0xC0C22C47, 0x78C7591F, 0x10C475D7, 0x0D21C55E, 0x6522E996, 0xDD279CCE, 0xB524B006,
+ 0x47EC84C7, 0x2FEFA80F, 0x97EADD57, 0xFFE9F19F, 0xE20C4116, 0x8A0F6DDE, 0x320A1886, 0x5A09344E,
+ 0x09C17994, 0x61C2555C, 0xD9C72004, 0xB1C40CCC, 0xAC21BC45, 0xC422908D, 0x7C27E5D5, 0x1424C91D,
+ 0xDBB77E61, 0xB3B452A9, 0x0BB127F1, 0x63B20B39, 0x7E57BBB0, 0x16549778, 0xAE51E220, 0xC652CEE8,
+ 0x959A8332, 0xFD99AFFA, 0x459CDAA2, 0x2D9FF66A, 0x307A46E3, 0x58796A2B, 0xE07C1F73, 0x887F33BB,
+ 0xF56E0EF4, 0x9D6D223C, 0x25685764, 0x4D6B7BAC, 0x508ECB25, 0x388DE7ED, 0x808892B5, 0xE88BBE7D,
+ 0xBB43F3A7, 0xD340DF6F, 0x6B45AA37, 0x034686FF, 0x1EA33676, 0x76A01ABE, 0xCEA56FE6, 0xA6A6432E,
+ 0x6935F452, 0x0136D89A, 0xB933ADC2, 0xD130810A, 0xCCD53183, 0xA4D61D4B, 0x1CD36813, 0x74D044DB,
+ 0x27180901, 0x4F1B25C9, 0xF71E5091, 0x9F1D7C59, 0x82F8CCD0, 0xEAFBE018, 0x52FE9540, 0x3AFDB988,
+ 0xC8358D49, 0xA036A181, 0x1833D4D9, 0x7030F811, 0x6DD54898, 0x05D66450, 0xBDD31108, 0xD5D03DC0,
+ 0x8618701A, 0xEE1B5CD2, 0x561E298A, 0x3E1D0542, 0x23F8B5CB, 0x4BFB9903, 0xF3FEEC5B, 0x9BFDC093,
+ 0x546E77EF, 0x3C6D5B27, 0x84682E7F, 0xEC6B02B7, 0xF18EB23E, 0x998D9EF6, 0x2188EBAE, 0x498BC766,
+ 0x1A438ABC, 0x7240A674, 0xCA45D32C, 0xA246FFE4, 0xBFA34F6D, 0xD7A063A5, 0x6FA516FD, 0x07A63A35,
+ 0x8FD9098E, 0xE7DA2546, 0x5FDF501E, 0x37DC7CD6, 0x2A39CC5F, 0x423AE097, 0xFA3F95CF, 0x923CB907,
+ 0xC1F4F4DD, 0xA9F7D815, 0x11F2AD4D, 0x79F18185, 0x6414310C, 0x0C171DC4, 0xB412689C, 0xDC114454,
+ 0x1382F328, 0x7B81DFE0, 0xC384AAB8, 0xAB878670, 0xB66236F9, 0xDE611A31, 0x66646F69, 0x0E6743A1,
+ 0x5DAF0E7B, 0x35AC22B3, 0x8DA957EB, 0xE5AA7B23, 0xF84FCBAA, 0x904CE762, 0x2849923A, 0x404ABEF2,
+ 0xB2828A33, 0xDA81A6FB, 0x6284D3A3, 0x0A87FF6B, 0x17624FE2, 0x7F61632A, 0xC7641672, 0xAF673ABA,
+ 0xFCAF7760, 0x94AC5BA8, 0x2CA92EF0, 0x44AA0238, 0x594FB2B1, 0x314C9E79, 0x8949EB21, 0xE14AC7E9,
+ 0x2ED97095, 0x46DA5C5D, 0xFEDF2905, 0x96DC05CD, 0x8B39B544, 0xE33A998C, 0x5B3FECD4, 0x333CC01C,
+ 0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o80
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically
+ * using the following model parameters: 
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 
+ * Directory Name = ....................... .\
+ * File Name = ............................ 8x256_tables.c
+ */
+
+/* Slicing-by-8 CRC lookup table (256 entries, indexed by a byte value).
+ * Read-only data, so it is const-qualified: accidental writes become compile
+ * errors. */
+static
+const u32 crc_tableil8_o88[256] =
+{
+ 0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504,
+ 0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, 0x632686B7, 0x2A1AFB90, 0xF15E7CF9, 0xB86201DE,
+ 0x847609B4, 0xCD4A7493, 0x160EF3FA, 0x5F328EDD, 0xA56B8BD9, 0xEC57F6FE, 0x37137197, 0x7E2F0CB0,
+ 0xC64D0D6E, 0x8F717049, 0x5435F720, 0x1D098A07, 0xE7508F03, 0xAE6CF224, 0x7528754D, 0x3C14086A,
+ 0x0D006599, 0x443C18BE, 0x9F789FD7, 0xD644E2F0, 0x2C1DE7F4, 0x65219AD3, 0xBE651DBA, 0xF759609D,
+ 0x4F3B6143, 0x06071C64, 0xDD439B0D, 0x947FE62A, 0x6E26E32E, 0x271A9E09, 0xFC5E1960, 0xB5626447,
+ 0x89766C2D, 0xC04A110A, 0x1B0E9663, 0x5232EB44, 0xA86BEE40, 0xE1579367, 0x3A13140E, 0x732F6929,
+ 0xCB4D68F7, 0x827115D0, 0x593592B9, 0x1009EF9E, 0xEA50EA9A, 0xA36C97BD, 0x782810D4, 0x31146DF3,
+ 0x1A00CB32, 0x533CB615, 0x8878317C, 0xC1444C5B, 0x3B1D495F, 0x72213478, 0xA965B311, 0xE059CE36,
+ 0x583BCFE8, 0x1107B2CF, 0xCA4335A6, 0x837F4881, 0x79264D85, 0x301A30A2, 0xEB5EB7CB, 0xA262CAEC,
+ 0x9E76C286, 0xD74ABFA1, 0x0C0E38C8, 0x453245EF, 0xBF6B40EB, 0xF6573DCC, 0x2D13BAA5, 0x642FC782,
+ 0xDC4DC65C, 0x9571BB7B, 0x4E353C12, 0x07094135, 0xFD504431, 0xB46C3916, 0x6F28BE7F, 0x2614C358,
+ 0x1700AEAB, 0x5E3CD38C, 0x857854E5, 0xCC4429C2, 0x361D2CC6, 0x7F2151E1, 0xA465D688, 0xED59ABAF,
+ 0x553BAA71, 0x1C07D756, 0xC743503F, 0x8E7F2D18, 0x7426281C, 0x3D1A553B, 0xE65ED252, 0xAF62AF75,
+ 0x9376A71F, 0xDA4ADA38, 0x010E5D51, 0x48322076, 0xB26B2572, 0xFB575855, 0x2013DF3C, 0x692FA21B,
+ 0xD14DA3C5, 0x9871DEE2, 0x4335598B, 0x0A0924AC, 0xF05021A8, 0xB96C5C8F, 0x6228DBE6, 0x2B14A6C1,
+ 0x34019664, 0x7D3DEB43, 0xA6796C2A, 0xEF45110D, 0x151C1409, 0x5C20692E, 0x8764EE47, 0xCE589360,
+ 0x763A92BE, 0x3F06EF99, 0xE44268F0, 0xAD7E15D7, 0x572710D3, 0x1E1B6DF4, 0xC55FEA9D, 0x8C6397BA,
+ 0xB0779FD0, 0xF94BE2F7, 0x220F659E, 0x6B3318B9, 0x916A1DBD, 0xD856609A, 0x0312E7F3, 0x4A2E9AD4,
+ 0xF24C9B0A, 0xBB70E62D, 0x60346144, 0x29081C63, 0xD3511967, 0x9A6D6440, 0x4129E329, 0x08159E0E,
+ 0x3901F3FD, 0x703D8EDA, 0xAB7909B3, 0xE2457494, 0x181C7190, 0x51200CB7, 0x8A648BDE, 0xC358F6F9,
+ 0x7B3AF727, 0x32068A00, 0xE9420D69, 0xA07E704E, 0x5A27754A, 0x131B086D, 0xC85F8F04, 0x8163F223,
+ 0xBD77FA49, 0xF44B876E, 0x2F0F0007, 0x66337D20, 0x9C6A7824, 0xD5560503, 0x0E12826A, 0x472EFF4D,
+ 0xFF4CFE93, 0xB67083B4, 0x6D3404DD, 0x240879FA, 0xDE517CFE, 0x976D01D9, 0x4C2986B0, 0x0515FB97,
+ 0x2E015D56, 0x673D2071, 0xBC79A718, 0xF545DA3F, 0x0F1CDF3B, 0x4620A21C, 0x9D642575, 0xD4585852,
+ 0x6C3A598C, 0x250624AB, 0xFE42A3C2, 0xB77EDEE5, 0x4D27DBE1, 0x041BA6C6, 0xDF5F21AF, 0x96635C88,
+ 0xAA7754E2, 0xE34B29C5, 0x380FAEAC, 0x7133D38B, 0x8B6AD68F, 0xC256ABA8, 0x19122CC1, 0x502E51E6,
+ 0xE84C5038, 0xA1702D1F, 0x7A34AA76, 0x3308D751, 0xC951D255, 0x806DAF72, 0x5B29281B, 0x1215553C,
+ 0x230138CF, 0x6A3D45E8, 0xB179C281, 0xF845BFA6, 0x021CBAA2, 0x4B20C785, 0x906440EC, 0xD9583DCB,
+ 0x613A3C15, 0x28064132, 0xF342C65B, 0xBA7EBB7C, 0x4027BE78, 0x091BC35F, 0xD25F4436, 0x9B633911,
+ 0xA777317B, 0xEE4B4C5C, 0x350FCB35, 0x7C33B612, 0x866AB316, 0xCF56CE31, 0x14124958, 0x5D2E347F,
+ 0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8, 0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o88
+ */
+
+//#define VERIFY_ASSERTION
+
+#ifdef VERIFY_ASSERTION
+
+// Trivial byte-by-byte version: you can switch on the assertion in the
+// Crc32_ComputeBuf function (by defining VERIFY_ASSERTION) to check this
+// against the slicing variant.
+static really_inline
+u32 crc32c(u32 running_crc, const unsigned char* p_buf, size_t length) {
+    // Reference implementation: fold one input byte at a time into the CRC
+    // using only the crc_tableil8_o32 lookup table.
+    u32 acc = running_crc;
+    for (size_t idx = 0; idx < length; idx++) {
+        const u32 slot = (acc ^ p_buf[idx]) & 0x000000FF;
+        acc = crc_tableil8_o32[slot] ^ (acc >> 8);
+    }
+    return acc;
+}
+
+#endif // VERIFY_ASSERTION
+
+// Slicing-by-8 approach, which is much faster. Derived from Intel's
+// BSD-licensed code, with additions to handle unaligned buffers automatically.
+//
+// running_crc is the CRC value to continue from, p_buf the data and length
+// the number of bytes available at p_buf. Returns the updated CRC. Bytes in
+// front of the next 4-byte boundary, and trailing bytes that do not fill an
+// eight-byte block, are folded in one at a time.
+static really_inline
+u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
+                      const size_t length) {
+    u32 crc = running_crc;
+
+    // Process byte-by-byte until p_buf is aligned. The prefix is clamped to
+    // length: for a buffer that ends before the next 4-byte boundary,
+    // (length - init_bytes) would otherwise wrap around and the loops below
+    // would read far beyond the end of the buffer.
+
+    const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, 4);
+    size_t init_bytes = aligned_buf - p_buf;
+    if (init_bytes > length) {
+        init_bytes = length;
+    }
+    size_t running_length = ((length - init_bytes)/8)*8;
+    size_t end_bytes = length - init_bytes - running_length;
+
+    for (size_t li = 0; li < init_bytes; li++) {
+        crc = crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^ (crc >> 8);
+    }
+
+    // Main aligned loop, processes eight bytes at a time.
+
+    u32 term1, term2;
+    for (size_t li = 0; li < running_length/8; li++) {
+        // First four bytes are mixed with the running CRC before the lookups.
+        u32 block = *(const u32 *)p_buf;
+        crc ^= block;
+        p_buf += 4;
+        term1 = crc_tableil8_o88[crc & 0x000000FF] ^
+                crc_tableil8_o80[(crc >> 8) & 0x000000FF];
+        term2 = crc >> 16;
+        crc = term1 ^
+              crc_tableil8_o72[term2 & 0x000000FF] ^
+              crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
+
+
+        // Second four bytes are looked up directly and folded into the CRC.
+        block = *(const u32 *)p_buf;
+
+        term1 = crc_tableil8_o56[block & 0x000000FF] ^
+                crc_tableil8_o48[(block >> 8) & 0x000000FF];
+
+        term2 = block >> 16;
+        crc =   crc ^
+                term1 ^
+                crc_tableil8_o40[term2 & 0x000000FF] ^
+                crc_tableil8_o32[(term2 >> 8) & 0x000000FF];
+        p_buf += 4;
+    }
+
+    // Remaining bytes
+
+    for(size_t li = 0; li < end_bytes; li++) {
+        crc = crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^ (crc >> 8);
+    }
+
+    return crc;
+}
+
+#else // __SSE4_2__
+
+#ifdef ARCH_64_BIT
+#define CRC_WORD 8
+#define CRC_TYPE u64a
+#define CRC_FUNC _mm_crc32_u64
+#else
+#define CRC_WORD 4
+#define CRC_TYPE u32
+#define CRC_FUNC _mm_crc32_u32
+#endif
+
+/*
+ * Use the crc32 instruction from SSE4.2 to compute our checksum - same
+ * polynomial as the above function.
+ *
+ * running_crc is the CRC value to continue from, p_buf the data and length
+ * the number of bytes available at p_buf. Returns the updated CRC. Bytes in
+ * front of the next CRC_WORD boundary, and trailing bytes that do not fill a
+ * whole word, are fed to the instruction one at a time.
+ */
+static really_inline
+u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
+                      const size_t length) {
+    u32 crc = running_crc;
+
+    // Process byte-by-byte until p_buf is aligned. The prefix is clamped to
+    // length: for a buffer that ends before the next word boundary,
+    // (length - init_bytes) would otherwise wrap around and the loops below
+    // would read far beyond the end of the buffer.
+
+    const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
+    size_t init_bytes = aligned_buf - p_buf;
+    if (init_bytes > length) {
+        init_bytes = length;
+    }
+    size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
+    size_t end_bytes = length - init_bytes - running_length;
+
+    for (size_t li = 0; li < init_bytes; li++) {
+        crc = _mm_crc32_u8(crc, *p_buf++);
+    }
+
+    // Main aligned loop, processes a word at a time.
+
+    for (size_t li = 0; li < running_length/CRC_WORD; li++) {
+        CRC_TYPE block = *(const CRC_TYPE *)p_buf;
+        crc = CRC_FUNC(crc, block);
+        p_buf += CRC_WORD;
+    }
+
+    // Remaining bytes
+
+    for(size_t li = 0; li < end_bytes; li++) {
+        crc = _mm_crc32_u8(crc, *p_buf++);
+    }
+
+    return crc;
+}
+#endif
+
+#ifdef VERIFY_ASSERTION
+#include <assert.h>
+#endif
+
+// Externally visible function: continues the checksum inCrc32 over the bufLen
+// bytes at buf, using whichever implementation this build selected.
+u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) {
+    const unsigned char *data = (const unsigned char *)buf;
+    u32 result;
+
+#ifdef __SSE4_2__
+    result = crc32c_sse42(inCrc32, data, bufLen);
+#else
+    result = crc32c_sb8_64_bit(inCrc32, data, bufLen);
+#endif
+
+#ifdef VERIFY_ASSERTION
+    // Cross-check against the trivial byte-by-byte version.
+    assert(result == crc32c(inCrc32, data, bufLen));
+#endif
+
+    return result;
+}
--- a/src/crc32.h
+++ b/src/crc32.h
@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CRC32_H_36A5015B5840C1
+#define CRC32_H_36A5015B5840C1
+
+#include "ue2common.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * \brief Compute a CRC over a buffer, continuing from a previous CRC value.
+ *
+ * \param inCrc32 CRC value to start from.
+ * \param buf     Data to checksum.
+ * \param bufLen  Number of bytes at \p buf.
+ * \return The updated CRC value.
+ */
+u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CRC32_H_36A5015B5840C1 */
+
--- a/src/database.c
+++ b/src/database.c
@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Runtime code for hs_database manipulation.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "allocator.h"
+#include "hs_common.h"
+#include "hs_internal.h"
+#include "hs_version.h"
+#include "ue2common.h"
+#include "database.h"
+#include "crc32.h"
+#include "rose/rose_internal.h"
+#include "util/unaligned.h"
+
+// Alignment test applied to database pointers supplied by the caller: yields
+// the result of ISALIGNED_N for the alignment of unsigned long long.
+static really_inline
+int db_correctly_aligned(const void *db) {
+    return ISALIGNED_N(db, alignof(unsigned long long));
+}
+
+HS_PUBLIC_API
+hs_error_t hs_free_database(hs_database_t *db) {
+    // A NULL pointer is handed straight to the free routine; a non-NULL
+    // pointer is only freed when it carries the database magic value.
+    const int wrong_magic = db && db->magic != HS_DB_MAGIC;
+    if (wrong_magic) {
+        return HS_INVALID;
+    }
+
+    hs_database_free(db);
+    return HS_SUCCESS;
+}
+
+HS_PUBLIC_API
+hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes,
+                                 size_t *serialized_length) {
+    if (!db || !bytes || !serialized_length) {
+        return HS_INVALID;
+    }
+
+    if (!db_correctly_aligned(db)) {
+        return HS_BAD_ALIGN;
+    }
+
+    hs_error_t status = validDatabase(db);
+    if (status != HS_SUCCESS) {
+        return status;
+    }
+
+    const size_t total = sizeof(struct hs_database) + db->length;
+
+    char *blob = hs_misc_alloc(total);
+    status = hs_check_alloc(blob);
+    if (status != HS_SUCCESS) {
+        hs_misc_free(blob);
+        return status;
+    }
+
+    // Anything not written explicitly below stays zero.
+    memset(blob, 0, total);
+
+    // Header fields are written as consecutive 32-bit words; the 64-bit
+    // platform value takes the place of words 3 and 4.
+    u32 *words = (u32 *)blob;
+    words[0] = db->magic;
+    words[1] = db->version;
+    words[2] = db->length;
+    memcpy(&words[3], &db->platform, sizeof(u64a));
+    words[5] = db->crc32;
+    words[6] = db->reserved0;
+    words[7] = db->reserved1;
+
+    // The bytecode follows directly after the eight header words.
+    memcpy(&words[8], hs_get_bytecode(db), db->length);
+
+    *bytes = blob;
+    *serialized_length = total;
+    return HS_SUCCESS;
+}
+
+// Accept a database header's platform value only if it equals one of the two
+// platform values this runtime knows about (current, or current without AVX2).
+static
+hs_error_t db_check_platform(const u64a p) {
+    if (p == hs_current_platform || p == hs_current_platform_no_avx2) {
+        return HS_SUCCESS;
+    }
+    return HS_DB_PLATFORM_ERROR;
+}
+
+// Decode the header at the front of a serialized database into *header and
+// validate magic, version and total length. On success *bytes is advanced to
+// the first byte after the serialized header words and HS_SUCCESS is
+// returned; otherwise an error code is returned. The header is intended to
+// live on the caller's stack and be copied into the deserialized database.
+static
+hs_error_t db_decode_header(const char **bytes, const size_t length,
+                            struct hs_database *header) {
+    if (!*bytes || length < sizeof(struct hs_database)) {
+        return HS_INVALID;
+    }
+
+    // The serialized stream is not required to be 4-byte aligned, so every
+    // field is fetched with an unaligned load.
+    const u32 *words = (const u32 *)*bytes;
+
+    // Start from an all-zero header so that no part of it (e.g. padding) is
+    // left uninitialized.
+    memset(header, 0, sizeof(struct hs_database));
+
+    header->magic = unaligned_load_u32(&words[0]);
+    if (header->magic != HS_DB_MAGIC) {
+        return HS_INVALID;
+    }
+
+    header->version = unaligned_load_u32(&words[1]);
+    if (header->version != HS_DB_VERSION) {
+        return HS_DB_VERSION_ERROR;
+    }
+
+    header->length = unaligned_load_u32(&words[2]);
+    if (length != sizeof(struct hs_database) + header->length) {
+        DEBUG_PRINTF("bad length %zu, expecting %zu\n", length,
+                     sizeof(struct hs_database) + header->length);
+        return HS_INVALID;
+    }
+
+    // The 64-bit platform value sits where words 3 and 4 would be.
+    header->platform = unaligned_load_u64a(&words[3]);
+    header->crc32 = unaligned_load_u32(&words[5]);
+    header->reserved0 = unaligned_load_u32(&words[6]);
+    header->reserved1 = unaligned_load_u32(&words[7]);
+
+    *bytes = (const char *)&words[8];
+
+    return HS_SUCCESS; // Header checks out
+}
+
+// Recompute the checksum of the database's bytecode and compare it with the
+// value stored in the header.
+static
+hs_error_t db_check_crc(const hs_database_t *db) {
+    const u32 computed = Crc32c_ComputeBuf(0, hs_get_bytecode(db), db->length);
+    if (computed == db->crc32) {
+        return HS_SUCCESS;
+    }
+
+    DEBUG_PRINTF("crc mismatch! 0x%x != 0x%x\n", computed, db->crc32);
+    return HS_INVALID;
+}
+
+// Place the serialized bytecode inside db and record its offset in
+// db->bytecode. db->length must already be set.
+static
+void db_copy_bytecode(const char *serialized, hs_database_t *db) {
+    // Alignment is done by hand: move the start back from db->bytes by the
+    // low six bits of its address, so the destination has those bits clear.
+    const uintptr_t misalign = (uintptr_t)db->bytes & 0x3f;
+    db->bytecode = offsetof(struct hs_database, bytes) - misalign;
+
+    memcpy((char *)db + db->bytecode, serialized, db->length);
+}
+
+HS_PUBLIC_API
+hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length,
+                                      hs_database_t *db) {
+    if (!bytes || !db) {
+        return HS_INVALID;
+    }
+
+    // The destination supplied by the user must be 8-byte aligned.
+    if (!ISALIGNED_N(db, 8)) {
+        return HS_BAD_ALIGN;
+    }
+
+    // Decode the header, then make sure it was built for our platform.
+    hs_database_t decoded;
+    hs_error_t status = db_decode_header(&bytes, length, &decoded);
+    if (status == HS_SUCCESS) {
+        status = db_check_platform(decoded.platform);
+    }
+    if (status != HS_SUCCESS) {
+        return status;
+    }
+
+    // Zero the whole destination, then drop the decoded header into place.
+    memset(db, 0, sizeof(struct hs_database) + decoded.length);
+    memcpy(db, &decoded, sizeof(decoded));
+
+    // Copy the bytecode to its aligned location; this also sets the offset.
+    db_copy_bytecode(bytes, db);
+
+    return db_check_crc(db) == HS_SUCCESS ? HS_SUCCESS : HS_INVALID;
+}
+
+HS_PUBLIC_API
+hs_error_t hs_deserialize_database(const char *bytes, const size_t length,
+                                   hs_database_t **db) {
+    if (!bytes || !db) {
+        return HS_INVALID;
+    }
+
+    // The output stays NULL unless every step below succeeds.
+    *db = NULL;
+
+    // Decode the header, then make sure it was built for our platform.
+    hs_database_t decoded;
+    hs_error_t status = db_decode_header(&bytes, length, &decoded);
+    if (status == HS_SUCCESS) {
+        status = db_check_platform(decoded.platform);
+    }
+    if (status != HS_SUCCESS) {
+        return status;
+    }
+
+    // Allocate room for the header plus the bytecode.
+    const size_t total = sizeof(struct hs_database) + decoded.length;
+    struct hs_database *fresh = hs_database_alloc(total);
+    status = hs_check_alloc(fresh);
+    if (status != HS_SUCCESS) {
+        hs_database_free(fresh);
+        return status;
+    }
+
+    // Zero the new space, then drop the decoded header into place.
+    memset(fresh, 0, total);
+    memcpy(fresh, &decoded, sizeof(decoded));
+
+    // Copy the bytecode to its aligned location; this also sets the offset.
+    db_copy_bytecode(bytes, fresh);
+
+    if (db_check_crc(fresh) != HS_SUCCESS) {
+        hs_database_free(fresh);
+        return HS_INVALID;
+    }
+
+    *db = fresh;
+    return HS_SUCCESS;
+}
+
+HS_PUBLIC_API
+hs_error_t hs_database_size(const hs_database_t *db, size_t *size) {
+    // A missing output pointer is rejected before the database is inspected.
+    const hs_error_t status = size ? validDatabase(db) : HS_INVALID;
+    if (unlikely(status != HS_SUCCESS)) {
+        return status;
+    }
+
+    // Reported size is the header structure plus the bytecode length.
+    *size = sizeof(struct hs_database) + db->length;
+    return HS_SUCCESS;
+}
+
+HS_PUBLIC_API
+hs_error_t hs_serialized_database_size(const char *bytes, const size_t length,
+                                       size_t *size) {
+    // The header is decoded and checked first, so header errors take
+    // precedence over a missing output pointer.
+    hs_database_t decoded;
+    const hs_error_t status = db_decode_header(&bytes, length, &decoded);
+    if (status != HS_SUCCESS) {
+        return status;
+    }
+
+    if (!size) {
+        return HS_INVALID;
+    }
+
+    *size = sizeof(struct hs_database) + decoded.length;
+    return HS_SUCCESS;
+}
+
+// Full validity check of a database: magic, version, platform, bytecode
+// alignment and finally the checksum, in that order. Returns the error code
+// of the first check that fails, or HS_SUCCESS.
+hs_error_t dbIsValid(const hs_database_t *db) {
+    if (db->magic != HS_DB_MAGIC) {
+        DEBUG_PRINTF("bad magic\n");
+        return HS_INVALID;
+    }
+
+    if (db->version != HS_DB_VERSION) {
+        DEBUG_PRINTF("bad version\n");
+        return HS_DB_VERSION_ERROR;
+    }
+
+    const hs_error_t platform_status = db_check_platform(db->platform);
+    if (platform_status != HS_SUCCESS) {
+        DEBUG_PRINTF("bad platform\n");
+        return HS_DB_PLATFORM_ERROR;
+    }
+
+    const void *engine = hs_get_bytecode(db);
+    if (!ISALIGNED_16(engine)) {
+        DEBUG_PRINTF("bad alignment\n");
+        return HS_INVALID;
+    }
+
+    // The checksum result is passed through unchanged.
+    const hs_error_t crc_status = db_check_crc(db);
+    if (crc_status != HS_SUCCESS) {
+        DEBUG_PRINTF("bad crc\n");
+    }
+    return crc_status;
+}
+
+/** \brief Wrap the given bytecode (RoseEngine) in a newly-allocated
+ * \ref hs_database, positioning it so that it is cacheline aligned.
+ *
+ * Returns NULL if the allocation fails the allocator checks. */
+hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
+    const size_t total = sizeof(struct hs_database) + len;
+    DEBUG_PRINTF("db size %zu\n", total);
+    DEBUG_PRINTF("db platform %llx\n", platform);
+
+    struct hs_database *db = (struct hs_database *)hs_database_alloc(total);
+    if (hs_check_alloc(db) != HS_SUCCESS) {
+        hs_database_free(db);
+        return NULL;
+    }
+
+    // Leave no byte of the database uninitialized.
+    memset(db, 0, total);
+
+    // Alignment is done by hand: step back from db->bytes by the low six
+    // bits of its address.
+    const size_t misalign = (uintptr_t)db->bytes & 0x3f;
+    DEBUG_PRINTF("shift is %zu\n", misalign);
+
+    db->magic = HS_DB_MAGIC;
+    db->version = HS_DB_VERSION;
+    db->length = len;
+    db->platform = platform;
+    db->bytecode = offsetof(struct hs_database, bytes) - misalign;
+
+    char *dest = (char *)db + db->bytecode;
+    assert(ISALIGNED_CL(dest));
+
+    // Copy the bytecode in, then checksum the copy.
+    memcpy(dest, in_bytecode, len);
+    db->crc32 = Crc32c_ComputeBuf(0, dest, db->length);
+
+    return db;
+}
+
+#if defined(_WIN32)
+#define SNPRINTF_COMPAT _snprintf
+#else
+#define SNPRINTF_COMPAT snprintf
+#endif
+
+/** Format the database info (version, features, mode) into a freshly
+ * allocated buffer and store it in *s. Returns HS_SUCCESS on success or an
+ * appropriate error code on failure, in which case *s is NULL. */
+static
+hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
+                                 u32 raw_mode) {
+    assert(s);
+    *s = NULL;
+
+    // The version word carries major, minor and release in its top three
+    // bytes.
+    const u8 major = (version >> 24) & 0xff;
+    const u8 minor = (version >> 16) & 0xff;
+    const u8 release = (version >> 8) & 0xff;
+
+    const char *features = " AVX2";
+    if (plat & HS_PLATFORM_NOAVX2) {
+        features = "NOAVX2";
+    }
+
+    const char *mode_name = "BLOCK";
+    if (raw_mode == HS_MODE_STREAM) {
+        mode_name = "STREAM";
+    } else if (raw_mode == HS_MODE_VECTORED) {
+        mode_name = "VECTORED";
+    } else {
+        assert(raw_mode == HS_MODE_BLOCK);
+    }
+
+    // Start with a size that should be plenty; if the formatter reports that
+    // more room is needed, retry with exactly that much.
+    size_t capacity = 256;
+
+    for (;;) {
+        char *text = hs_misc_alloc(capacity);
+        hs_error_t status = hs_check_alloc(text);
+        if (status != HS_SUCCESS) {
+            hs_misc_free(text);
+            return status;
+        }
+
+        // SNPRINTF_COMPAT is the macro defined above for systems that have a
+        // workalike instead of snprintf.
+        int written = SNPRINTF_COMPAT(
+            text, capacity, "Version: %u.%u.%u Features: %s Mode: %s",
+            major, minor, release, features, mode_name);
+
+        if (written < 0) {
+            DEBUG_PRINTF("snprintf output error, returned %d\n", written);
+            hs_misc_free(text);
+            return HS_NOMEM;
+        }
+
+        if ((size_t)written < capacity) { // output fit within buffer.
+            assert(text[written] == '\0');
+            *s = text;
+            return HS_SUCCESS;
+        }
+
+        // Output didn't fit: grow (plus one for the terminator) and retry.
+        capacity = (size_t)written + 1;
+        hs_misc_free(text);
+    }
+}
+
+HS_PUBLIC_API
+hs_error_t hs_serialized_database_info(const char *bytes, size_t length,
+                                       char **info) {
+    if (!info) {
+        return HS_INVALID;
+    }
+    *info = NULL;
+
+    if (!bytes || length < sizeof(struct hs_database)) {
+        return HS_INVALID;
+    }
+
+    // View the serialized header as 32-bit words, read with unaligned loads:
+    // 0 magic, 1 version, 2 length, 3-4 platform, 5 crc, 6-7 reserved.
+    const u32 *words = (const u32 *)bytes;
+
+    if (unaligned_load_u32(&words[0]) != HS_DB_MAGIC) {
+        return HS_INVALID;
+    }
+
+    const u32 version = unaligned_load_u32(&words[1]);
+    const platform_t plat = unaligned_load_u64a(&words[3]);
+
+    // The engine bytecode starts right after the eight header words.
+    const char *engine = (const char *)&words[8];
+    const u32 mode =
+        unaligned_load_u32(engine + offsetof(struct RoseEngine, mode));
+
+    return print_database_string(info, version, plat, mode);
+}
+
+HS_PUBLIC_API
+hs_error_t hs_database_info(const hs_database_t *db, char **info) {
+    if (!info) {
+        return HS_INVALID;
+    }
+    *info = NULL;
+
+    // Reject a missing or misaligned database before touching its fields.
+    if (!db) {
+        return HS_INVALID;
+    }
+    if (!db_correctly_aligned(db)) {
+        return HS_INVALID;
+    }
+    if (db->magic != HS_DB_MAGIC) {
+        return HS_INVALID;
+    }
+
+    // The mode is read from the engine stored as the database's bytecode.
+    const struct RoseEngine *engine = hs_get_bytecode(db);
+    const platform_t plat = db->platform;
+
+    return print_database_string(info, db->version, plat, engine->mode);
+}
--- a/src/database.h
+++ b/src/database.h
@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Runtime code for hs_database manipulation.
+ */
+
+#ifndef DATABASE_H_D467FD6F343DDE
+#define DATABASE_H_D467FD6F343DDE
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "hs_compile.h" // for HS_MODE_ flags
+#include "hs_version.h"
+#include "ue2common.h"
+
+#define HS_DB_VERSION HS_VERSION_32BIT
+#define HS_DB_MAGIC   (0xdbdbdbdbU)
+
+// Values in here cannot (easily) change - add new ones!
+
+// CPU type is the low 6 bits (we can't need more than 64, surely!)
+
+#define HS_PLATFORM_INTEL           1
+#define HS_PLATFORM_CPU_MASK        0x3F
+
+#define HS_PLATFORM_NOAVX2          (4<<13)
+
+/** \brief Platform features bitmask. */
+typedef u64a platform_t;
+
+/** \brief Platform value for this build: carries HS_PLATFORM_NOAVX2 when the
+ * compiler does not define __AVX2__, and is zero otherwise. */
+static UNUSED
+const platform_t hs_current_platform = {
+#if !defined(__AVX2__)
+    HS_PLATFORM_NOAVX2 |
+#endif
+    0,
+};
+
+/** \brief Platform value with HS_PLATFORM_NOAVX2 always set, regardless of
+ * how this build was compiled. */
+static UNUSED
+const platform_t hs_current_platform_no_avx2 = {
+    HS_PLATFORM_NOAVX2 |
+    0,
+};
+
+/*
+ * a header to enclose the actual bytecode - useful for keeping info about the
+ * compiled data.
+ */
+struct hs_database {
+    u32 magic;       // HS_DB_MAGIC for a valid database
+    u32 version;     // compared against HS_DB_VERSION
+    u32 length;      // number of bytes of bytecode
+    u64a platform;   // platform_t feature bits the database was built for
+    u32 crc32;       // checksum of the bytecode (see Crc32c_ComputeBuf)
+    u32 reserved0;
+    u32 reserved1;
+    u32 bytecode;    // offset relative to db start
+    u32 padding[16]; // slack in front of bytes; the bytecode start may be
+                     // moved back into it to reach a 64-byte boundary
+    char bytes[];
+};
+
+/** \brief Locate the bytecode inside a database: the address of \p db plus
+ * the byte offset stored in its \c bytecode field. */
+static really_inline
+const void *hs_get_bytecode(const struct hs_database *db) {
+    const char *base = (const char *)db;
+    return base + db->bytecode;
+}
+
+/**
+ * Cheap database sanity checks used in block mode scan calls and streaming
+ * mode open calls: the pointer must be non-NULL and carry the expected magic
+ * and version. The bytecode itself is not inspected.
+ */
+static really_inline
+hs_error_t validDatabase(const hs_database_t *db) {
+    if (!db) {
+        return HS_INVALID;
+    }
+    if (db->magic != HS_DB_MAGIC) {
+        return HS_INVALID;
+    }
+
+    return db->version == HS_DB_VERSION ? HS_SUCCESS : HS_DB_VERSION_ERROR;
+}
+
+/** \brief Thorough database check; unlike validDatabase() this also verifies
+ * the platform, the bytecode alignment and the checksum. */
+hs_error_t dbIsValid(const struct hs_database *db);
+
+/** \brief Allocate a database holding a copy of \p len bytes of \p bytecode
+ * for the given \p platform. Returns NULL if the allocation check fails. */
+struct hs_database *dbCreate(const char *bytecode, size_t len, u64a platform);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* DATABASE_H_D467FD6F343DDE */
--- a/src/fdr/CMakeLists.txt
+++ b/src/fdr/CMakeLists.txt
@ -0,0 +1,39 @@
+# The set of rules and other nastiness for generating FDR/Teddy source
+
+# Python scripts that make up the generator. They are listed explicitly so
+# that they can be passed as DEPENDS to the custom command in fdr_autogen().
+set(AUTOGEN_PY_FILES
+    arch.py
+    autogen.py
+    autogen_utils.py
+    base_autogen.py
+    fdr_autogen.py
+    teddy_autogen.py
+)
+
+# fdr_autogen(<type> <out>)
+#   Runs autogen.py with <type> as its argument and captures its standard
+#   output in <out> inside the current binary directory. Also defines the
+#   custom target autogen_<type>, which depends on that file.
+#   Example: fdr_autogen(runtime fdr_autogen.c)
+function(fdr_autogen type out)
+    set(generated_file ${CMAKE_CURRENT_BINARY_DIR}/${out})
+    add_custom_command (
+        OUTPUT ${generated_file}
+        COMMAND ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/autogen.py ${type} > ${generated_file}
+        DEPENDS ${AUTOGEN_PY_FILES}
+        COMMENT "AUTOGEN ${out}"
+        )
+    add_custom_target(autogen_${type} DEPENDS ${generated_file})
+endfunction()
+
+# Declare one generation rule (and autogen_<type> target) per generated file:
+# the FDR and Teddy runtime sources (C) and compiler sources (C++).
+fdr_autogen(runtime fdr_autogen.c)
+fdr_autogen(compiler fdr_autogen_compiler.cpp)
+fdr_autogen(teddy_runtime teddy_autogen.c)
+fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
+
+# Files produced by the fdr_autogen() rules above. The paths use
+# CMAKE_CURRENT_BINARY_DIR, which is where fdr_autogen() writes its output.
+set(fdr_GENERATED_SRC
+    ${CMAKE_CURRENT_BINARY_DIR}/fdr_autogen.c
+    ${CMAKE_CURRENT_BINARY_DIR}/fdr_autogen_compiler.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/teddy_autogen.c
+    ${CMAKE_CURRENT_BINARY_DIR}/teddy_autogen_compiler.cpp
+)
+
+# The list is defined locally first: set(... PARENT_SCOPE) on its own leaves
+# the variable untouched in this scope, which would hand an empty file list
+# to set_source_files_properties().
+set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
+
+# Publish the list to the including directory.
+set(fdr_GENERATED_SRC ${fdr_GENERATED_SRC} PARENT_SCOPE)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+
--- a/src/fdr/arch.py
+++ b/src/fdr/arch.py
@ -0,0 +1,58 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import autogen_utils
+
+# wrapper for architectures
+
+class Arch:
+    def __init__(self, name, extensions = []):
+        self.name = name
+        self.extensions = extensions
+        self.target = None
+
+    def get_guard(self):
+        # these defines definitely fall into the "belt-and-suspenders"
+        # category of paranoia
+        if (self.guard_list == []):
+            return "#if 1"
+
+        return "#if " + " && ".join(self.guard_list)
+
+class X86Arch(Arch):
+    def __init__(self, name, extensions = []):
+        Arch.__init__(self, name, extensions)
+        self.guard_list = [ ]
+        self.target = "0"
+
+        if "AVX2" in extensions:
+            self.target += " | HS_CPU_FEATURES_AVX2"
+            self.guard_list += [ "defined(__AVX2__)" ]
+
+
+# The two architecture descriptions defined by this module: plain x86_64 and
+# x86_64 with the AVX2 extension.
+arch_x86_64            = X86Arch("x86_64", extensions = [ ])
+arch_x86_64_avx2       = X86Arch("x86_64_avx2", extensions = [ "AVX2" ])
--- a/src/fdr/autogen.py
+++ b/src/fdr/autogen.py
@ -0,0 +1,159 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from fdr_autogen import *
+from teddy_autogen import *
+from arch import *
+
+# FDR setup
+
+# these are either produced - if the guard succeeds, or #defined to zeroes.
+# either the function or the zero is fine in our array of function pointers
+
+def produce_fdr_runtimes(l):
+    for m in l:
+        m.produce_code()
+
+def produce_fdr_compiles(l):
+    print "void getFdrDescriptions(vector<FDREngineDescription> *out) {"
+    print "    static const FDREngineDef defns[] = {"
+    for m in l:
+        m.produce_compile_call()
+    print "    };"
+    print "    out->clear();"
+    print "    for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
+    print "        out->push_back(FDREngineDescription(defns[i]));"
+    print "    }"
+    print "}"
+
+def build_fdr_matchers():
+    all_matchers = [ ]
+    domains = [8, 10, 11, 12, 13]
+    big_domains = [ 14, 15 ]
+
+    common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
+    for d in domains:
+        all_matchers += [ M3(stride = 1, domain = d, **common) ]
+        all_matchers += [ M3(stride = 2, domain = d, **common) ]
+        all_matchers += [ M3(stride = 4, domain = d, **common) ]
+    for d in big_domains:
+        all_matchers += [ M3(stride = 1, domain = d, **common) ]
+
+    return all_matchers
+
+# teddy setup
+
+def build_teddy_matchers():
+    all_matchers = [ ]
+
+    # AVX2
+    all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ]
+    all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ]
+    for n_msk in range(1, 5):
+        all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ]
+        all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ]
+
+    # SSE/SSE2/SSSE3
+    for n_msk in range(1, 5):
+        all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ]
+        all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ]
+
+    return all_matchers
+
+def produce_teddy_compiles(l):
+    print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {"
+    print "    static const TeddyEngineDef defns[] = {"
+    for m in l:
+        m.produce_compile_call()
+    print "    };"
+    print "    out->clear();"
+    print "    for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
+    print "        out->push_back(TeddyEngineDescription(defns[i]));"
+    print "    }"
+    print "}"
+
+# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they
+# are linked. So we either generate the function or we don't - then at the point of the
+# header in fdr_autogen.c we either generate the header or we #define the zero.
+
+def produce_teddy_runtimes(l):
+    # Since we're using -Wmissing-prototypes, we need headers first.
+    for m in l:
+	m.produce_guard()
+        print m.produce_header(visible = True, header_only = True)
+	m.close_guard()
+
+    for m in l:
+	m.produce_guard()
+        m.produce_code()
+	m.close_guard()
+
+# see produce_teddy_runtimes() comment for the rationale
+
+def produce_teddy_headers(l):
+    for m in l:
+	m.produce_guard()
+        print m.produce_header(visible = True, header_only = True)
+	m.produce_zero_alternative()
+
+# general utilities
+
+def make_fdr_function_pointers(matcher_list):
+    print  """
+typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
+static FDRFUNCTYPE funcs[] = {
+"""
+    all_funcs = ",\n".join([ "    %s" % m.get_name() for m in matcher_list ])
+    print all_funcs
+    print """
+};
+"""
+
+def assign_ids(matcher_list, next_id):
+    for m in matcher_list:
+        m.id = next_id
+        next_id += 1
+    return next_id
+
+# Main entry point
+
+m = build_fdr_matchers()
+next_id = assign_ids(m, 0)
+tm = build_teddy_matchers()
+next_id = assign_ids(tm, next_id)
+if sys.argv[1] == "compiler":
+    produce_fdr_compiles(m)
+elif sys.argv[1] == "runtime":
+    produce_fdr_runtimes(m)
+    produce_teddy_headers(tm)
+    make_fdr_function_pointers(m+tm)
+elif sys.argv[1] == "teddy_runtime":
+    produce_teddy_runtimes(tm)
+elif sys.argv[1] == "teddy_compiler":
+    produce_teddy_compiles(tm)
--- a/src/fdr/autogen_utils.py
+++ b/src/fdr/autogen_utils.py
@ -0,0 +1,285 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+def fail_out(msg = ""):
+    print >>sys.stderr, "Internal failure in autogen.py: " + msg
+    sys.exit(1)
+
+class IntegerType:
+    def __init__(self, size):
+        self.size = size
+
+    def get_name(self):
+        return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size]
+
+    def size_in_bytes(self):
+        return self.size / 8
+
+    def isSIMDOnIntel(self):
+        return False
+
+    def zero_expression(self):
+        return "0"
+
+    def constant_to_string(self, n):
+        if self.size == 64:
+            suffix = "ULL"
+        else:
+            suffix = ""
+        return "0x%x%s" % (n & ((1 << self.size) - 1), suffix)
+
+    def lowbits(self, n):
+        return (1 << n) - 1
+
+    def highbits(self, n):
+        return ~(self.lowbits(self.size - n))
+
+    def lowbit_mask(self, n):
+        return self.constant_to_string(self.lowbits(n))
+
+    def highbit_mask(self, n):
+        return self.constant_to_string(self.highbits(n))
+
+    def lowbit_extract_expr(self, expr_string, n):
+         return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
+
+    def highbit_extract_expr(self, expr_string, n):
+        return "(%s >> %d)" % (expr_string, self.size - n)
+
+    def flip_lowbits_expr(self, expr_string, n):
+         return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
+
+    def bit_extract_expr(self, expr_string, low, high):
+        lbm = self.lowbit_mask(high - low)
+        return "((%s >> %d) & %s)" % (expr_string, low, lbm)
+
+    # shifts are +ve if left and -ve if right
+    def shift_expr(self, expr_string, n):
+        if n <= -self.size or n >= self.size:
+            return self.zero_expression()
+        elif (n > 0):
+            return "(%s << %d)" % (expr_string, n)
+        elif (n < 0):
+            return "(%s >> %d)" % (expr_string, -n)
+        else:
+            return "(%s)" % (expr_string)
+
+    # code is:
+    # "normal" (always between buf and len) - the default
+    # "aligned" (means normal + aligned to a natural boundary)
+    # "cautious_forward" (means may go off the end of buf+len)
+    # "cautious_backwards" (means may go off the start of buf)
+    # "cautious_everywhere" (means may go off both)
+
+    def load_expr_data(self, offset = 0, code = "normal",
+                       base_string = "ptr", bounds_lo = "buf", bounds_hi = "buf + len"):
+        if code is "normal":
+            return "lv_%s(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "aligned":
+            if self.size is 8:
+                fail_out("no aligned byte loads")
+            return "lv_%s_a(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "cautious_forward":
+            return "lv_%s_cf(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "cautious_backward":
+            return "lv_%s_cb(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "cautious_everywhere":
+            return "lv_%s_ce(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+
+
+class SIMDIntegerType(IntegerType):
+    def __init__(self, size):
+        IntegerType.__init__(self, size)
+
+    def isSIMDOnIntel(self):
+        return True
+
+    def zero_expression(self):
+        return "zeroes128()"
+
+    def lowbit_extract_expr(self, expr_string, n):
+        if (n <= 32):
+            tmpType = IntegerType(32)
+            tmpExpr = "movd(%s)" % expr_string
+        elif (32 < n <= 64):
+            tmpType = IntegerType(64)
+            tmpExpr = "movq(%s)" % expr_string
+        return tmpType.lowbit_extract_expr(tmpExpr, n)
+
+    def highbit_extract_expr(self, expr_string, n):
+        fail_out("Unimplemented high bit extract on m128")
+
+    def bit_extract_expr(self, expr_string, low, high, flip):
+        fail_out("Unimplemented bit extract on m128")
+
+    def shift_expr(self, expr_string, n):
+        if n % 8 != 0:
+            fail_out("Trying to shift a m128 by a bit granular value")
+
+        # should check that n is divisible by 8
+        if n <= -self.size or n >= self.size:
+            return self.zero_expression()
+        elif (n > 0):
+            return "_mm_slli_si128(%s, %s)" % (expr_string, n / 8)
+        elif (n < 0):
+            return "_mm_srli_si128(%s, %s)" % (expr_string, -n / 8)
+        else:
+            return "(%s)" % (expr_string)
+
+    def lowbit_mask(self, n):
+        if n % 8 != 0:
+            fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
+        return self.shift_expr("ones128()", -(128 - n))
+
+def getRequiredType(bits):
+    if bits == 128:
+        return SIMDIntegerType(bits)
+    for b in [ 8, 16, 32, 64]:
+        if (bits <= b):
+            return IntegerType(b)
+    return None
+
+class IntegerVariable:
+    def __init__(self, name, type):
+        self.name = name
+        self.type = type
+
+    def gen_initializer_stmt(self, initialization_string = None):
+        if initialization_string:
+            return "%s %s = %s;" % (self.type.get_name(), self.name, initialization_string)
+        else:
+            return "%s %s;" % (self.type.get_name(), self.name)
+
+
+class Step:
+    def __init__(self, context, offset = 0):
+        self.context = context
+        self.matcher = context.matcher
+        self.offset = offset
+        self.latency = 1
+        self.dependency_list = []
+        self.latest = None
+        self.context.add_step(self)
+
+    # return a string, complete with indentation
+    def emit(self):
+        indent = " " * (self.offset*2 + self.matcher.default_body_indent)
+        s = "\n".join( [ indent + line for line in self.val.split("\n")] )
+        if self.latest:
+            s += " // " + str(self.debug_step) + " L" + str(self.latency) + " LTST:%d" % self.latest
+            if self.dependency_list:
+                s += " Derps: "
+                for (d,l) in self.dependency_list:
+                    s += "%d/%d " % (d.debug_step,l)
+        return s
+
+    def add_dependency(self, step, anti_dependency = False, output_dependency = False):
+        if anti_dependency or output_dependency:
+            self.dependency_list += [ (step, 1) ]
+        else:
+            self.dependency_list += [ (step, step.latency) ]
+
+    def nv(self, type, var_name):
+        return self.context.new_var(self, type, var_name)
+
+    def gv(self, var_name, reader = True, writer = False):
+        return self.context.get_var(self, var_name, reader = reader, writer = writer)
+
+# utility steps, generic
+
+class LabelStep(Step):
+    def __init__(self, context, offset = 0, label_prefix = "off"):
+        Step.__init__(self, context, offset)
+        self.val = "%s%d: UNUSED;" % (label_prefix, offset)
+
+class OpenScopeStep(Step):
+    def __init__(self, context, offset = 0):
+        Step.__init__(self, context, offset)
+        self.val = "{"
+
+class CloseScopeStep(Step):
+    def __init__(self, context, offset = 0):
+        Step.__init__(self, context, offset)
+        self.val = "}"
+
+
+class CodeGenContext:
+    def __init__(self, matcher):
+        self.vars = {}
+        self.steps = []
+        self.ctr = 0
+        self.matcher = matcher
+        self.var_writer = {} # var to a single writer
+        self.var_readers = {} # var to a list of all the readers that read the last value
+
+    def new_var(self, step, type, var_name):
+        var = IntegerVariable(var_name, type)
+        self.vars[var_name] = var
+        self.var_writer[var_name] = step
+        return var
+
+    def get_var(self, step, var_name, reader = True, writer = False):
+        if reader:
+            writer_step = self.var_writer[var_name]
+            if writer_step:
+                step.add_dependency(writer_step)
+            self.var_readers.setdefault(var_name, []).append(step)
+        if writer and not reader:
+            if self.var_writer[var_name]:
+                step.add_dependency(self.var_writer[var_name], output_dependency = True)
+        if writer:
+            if self.var_readers.has_key(var_name):
+                for reader in [ r for r in self.var_readers[var_name] if r is not step ]:
+                    step.add_dependency(reader, anti_dependency = True)
+                self.var_readers[var_name] = []
+            self.var_writer[var_name] = step
+        return self.vars[var_name]
+
+    def add_step(self, step):
+        self.steps += [ step ]
+        step.debug_step = self.ctr
+        self.ctr += 1
+
+    def dontschedule(self, finals):
+        return "\n".join( [ s.emit() for s in self.steps ] )
+
+    def schedule(self, finals):
+        for f in finals:
+            f.latest = f.latency
+        worklist = finals
+        while worklist:
+            current = worklist[0]
+            worklist = worklist[1:]
+            for (dep, lat) in current.dependency_list:
+                if dep.latest is None or dep.latest < (current.latest + dep.latency):
+                    dep.latest = current.latest + lat
+                    if dep not in worklist:
+                        worklist += [ dep ]
+        self.steps.sort(reverse = True, key = lambda s : s.latest)
+        return "\n".join( [ s.emit() for s in self.steps ] )
--- a/src/fdr/base_autogen.py
+++ b/src/fdr/base_autogen.py
@ -0,0 +1,167 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from base_autogen import *
+from string import Template
+
+class MatcherBase:
+
+    def __init__(self):
+        pass
+
+    def get_name(self):
+        return "fdr_exec_%03d" % self.id
+
+    def produce_header(self, visible, header_only = False):
+        s = ""
+        if not visible:
+            s += "static never_inline"
+        s += """
+hwlm_error_t %s(UNUSED const struct FDR *fdr,
+                UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
+        if header_only:
+            s += ";"
+        else:
+            s += "{"
+        s += "\n"
+        return s
+
+    def produce_guard(self):
+	print self.arch.get_guard()
+    
+    def produce_zero_alternative(self):
+	print """
+#else
+#define %s 0
+#endif
+""" % self.get_name()
+
+    # trivial function for documentation/modularity
+    def close_guard(self):
+	print "#endif"
+
+    def produce_common_declarations(self):
+        return """
+    const u8 * buf = a->buf;
+    const size_t len = a->len;
+    const u8 * ptr = buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t * control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 * tryFloodDetect = a->firstFloodDetect;
+    UNUSED u32 bit, bitRem, confSplit, idx;
+    u32 byte, cf;
+    const struct FDRConfirm *fdrc;
+    u32 last_match = (u32)-1;
+"""
+
+    def produce_continue_check(self):
+        return """if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
+    *a->groups = controlVal;
+    return HWLM_TERMINATED;
+}
+"""
+    def produce_flood_check(self):
+        return """
+        if (P0(ptr > tryFloodDetect)) {
+            tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
+            if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
+                *a->groups = controlVal;
+                return HWLM_TERMINATED;
+            }
+        }
+"""
+
+    def produce_footer(self):
+        return """
+    *a->groups = controlVal;
+    return HWLM_SUCCESS;
+}
+"""
+
+    def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
+        if cautious:
+            caution_string = "VECTORING"
+        else:
+            caution_string = "NOT_CAUTIOUS"
+        conf_split_mask = IntegerType(32).constant_to_string(
+                            self.conf_top_level_split - 1)
+        if enable_confirmless:
+            quick_check_string = """
+        if (!fdrc->mult) {
+            u32 id = fdrc->nBitsOrSoleID;
+            if ((last_match == id) && (fdrc->flags & NoRepeat))
+                continue;
+           last_match = id;
+           controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
+           continue;
+        } """
+        else:
+            quick_check_string = ""
+        if do_bailout:
+            bailout_string = """
+        if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
+        else:
+            bailout_string = ""
+
+        return Template("""
+if (P0(!!$CONFVAR)) {
+    do  {
+        bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
+        byte  = bit / $NUM_BUCKETS + $OFFSET;
+        bitRem  = bit % $NUM_BUCKETS;
+        $BAILOUT_STRING
+        confSplit = *(ptr+byte) & $SPLIT_MASK;
+        idx = confSplit * $NUM_BUCKETS + bitRem;
+        cf = confBase[idx];
+        if (!cf)
+            continue;
+        fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control))
+            continue;
+        $QUICK_CHECK_STRING
+        confWithBit(fdrc, a, ptr - buf + byte, $CAUTION_STRING, $CONF_PULL_BACK, control, &last_match);
+    } while(P0(!!$CONFVAR));
+    if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
+        *a->groups = controlVal;
+        return HWLM_TERMINATED;
+    }
+}""").substitute(CONFVAR = conf_var_name,
+                 CONFVAR_SIZE = conf_var_size,
+                 NUM_BUCKETS = self.num_buckets,
+                 OFFSET = offset,
+                 SPLIT_MASK = conf_split_mask,
+                 QUICK_CHECK_STRING = quick_check_string,
+                 BAILOUT_STRING = bailout_string,
+                 CAUTION_STRING = caution_string,
+                 CONF_PULL_BACK = self.conf_pull_back)
+
+
+def indent(block, depth):
+    return "\n".join([ (" " * (4*depth)) + line for line in block.splitlines() ] )
--- a/src/fdr/engine_description.cpp
+++ b/src/fdr/engine_description.cpp
@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "engine_description.h"
+#include "hs_compile.h" // for hs_platform_info
+#include "util/target_info.h"
+
+namespace ue2 {
+
+// Out-of-line definition of the virtual destructor declared in the header.
+EngineDescription::~EngineDescription() {}
+
+/** Returns whether target_in reports that it can run code built for this
+ * engine's code_target. */
+bool EngineDescription::isValidOnTarget(const target_t &target_in) const {
+    return target_in.can_run_on_code_built_for(code_target);
+}
+
+/** Builds a target with generic tuning and the given CPU feature bits. */
+target_t targetByArchFeatures(u64a cpu_features) {
+    // Value-initialise first so that any member of the platform structure
+    // not assigned below is zero rather than indeterminate when the
+    // structure is handed to target_t.
+    hs_platform_info p = hs_platform_info();
+    p.tune = HS_TUNE_FAMILY_GENERIC;
+    p.cpu_features = cpu_features;
+
+    return target_t(p);
+}
+
+} // namespace ue2
--- a/src/fdr/engine_description.h
+++ b/src/fdr/engine_description.h
@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ENGINE_DESCRIPTION_H
+#define ENGINE_DESCRIPTION_H
+
+#include "ue2common.h"
+#include "util/target_info.h"
+
+namespace ue2 {
+
+/**
+ * Base class describing one generated matching engine: its ID, the target
+ * its code was built for, and its bucket/confirm parameters. Abstract:
+ * subclasses must provide getDefaultFloodSuffixLength().
+ */
+class EngineDescription {
+    u32 id;
+    target_t code_target; // the target that we built this code for
+    u32 numBuckets;
+    u32 confirmPullBackDistance;
+    u32 confirmTopLevelSplit;
+
+public:
+    /** Stores each argument in the member of the corresponding name. */
+    EngineDescription(u32 id_in, const target_t &code_target_in,
+                      u32 numBuckets_in, u32 confirmPullBackDistance_in,
+                      u32 confirmTopLevelSplit_in)
+        : id(id_in), code_target(code_target_in), numBuckets(numBuckets_in),
+          confirmPullBackDistance(confirmPullBackDistance_in),
+          confirmTopLevelSplit(confirmTopLevelSplit_in) {}
+
+    virtual ~EngineDescription();
+
+    // Plain accessors for the values supplied at construction.
+    u32 getID() const { return id; }
+    u32 getNumBuckets() const { return numBuckets; }
+    u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
+    u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
+
+    /** True if target_in can run code built for this engine's code_target. */
+    bool isValidOnTarget(const target_t &target_in) const;
+    /** Must be supplied by each concrete engine description. */
+    virtual u32 getDefaultFloodSuffixLength() const = 0;
+
+    /** Defaults to true; subclasses may override. */
+    virtual bool typicallyHoldsOneCharLits() const { return true; }
+};
+
+/** Returns a target given a CPU feature set value. */
+target_t targetByArchFeatures(u64a cpu_features);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "util/simd_utils.h"
+
+#define P0(cnd) unlikely(cnd)
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "teddy_internal.h"
+
+#include "flood_runtime.h"
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+#include "fdr_streaming_runtime.h"
+#include "fdr_loadval.h"
+
+/* Returns the low numBits bits of the data beginning one byte before the
+ * scan start: one byte when numBits <= 8, otherwise two bytes with the
+ * earlier byte in the low-order position. When scanning starts at offset 0
+ * the earlier byte comes from the history buffer. numBits must be less than
+ * 32 for the mask shift to be defined. */
+static really_inline UNUSED
+u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
+    u32 r = 0;
+    if (a->start_offset == 0) {
+        // With an empty history there is no preceding byte to read;
+        // indexing buf_history[len_history - 1] would land before the start
+        // of the history buffer, so the byte is taken to be zero instead.
+        if (a->len_history) {
+            r = a->buf_history[a->len_history - 1];
+        }
+        if (numBits > 8) {
+            r |= (a->buf[0] << 8);
+        }
+    } else {
+        if (numBits <= 8) {
+            r = a->buf[a->start_offset - 1];
+        } else {
+            r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
+        }
+    }
+    // Unsigned literal keeps the shift well-defined for numBits == 31.
+    return r & ((1U << numBits) - 1);
+}
+
+#include "fdr_autogen.c"
+
+// Placeholder history buffer that fdrExec() passes to the engine together
+// with a history length of 0. Having static storage duration and no
+// initialiser, it is zero-filled.
+#define FAKE_HISTORY_SIZE 16
+static const u8 fake_history[FAKE_HISTORY_SIZE];
+
+/* Runs the engine selected by fdr->engineID over buf[start, len) with no
+ * real history: fake_history is supplied with a length of 0. Returns
+ * HWLM_SUCCESS without scanning when start is at or beyond len. */
+hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t start,
+                     HWLMCallback cb, void *ctxt, hwlm_group_t groups) {
+
+    // Positional initialiser: the order must match struct FDR_Runtime_Args.
+    const struct FDR_Runtime_Args a = {
+        buf,
+        len,
+        fake_history,
+        0,
+        fake_history, // nocase
+        0,
+        start,
+        cb,
+        ctxt,
+        &groups,
+        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
+        0
+    };
+
+    // Nothing to scan.
+    if (unlikely(a.start_offset >= a.len)) {
+        return HWLM_SUCCESS;
+    }
+
+    assert(funcs[fdr->engineID]);
+    return funcs[fdr->engineID](fdr, &a);
+}
+
+/* Streaming variant: runs the engine selected by fdr->engineID over
+ * buf[start, len) with hbuf/hlen as history. Stream state is unpacked into
+ * the runtime arguments before the scan and packed back afterwards, also
+ * when there is nothing to scan. */
+hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
+                              size_t hlen, const u8 *buf, size_t len,
+                              size_t start, HWLMCallback cb, void *ctxt,
+                              hwlm_group_t groups, u8 * stream_state) {
+    // Positional initialiser: the order must match struct FDR_Runtime_Args.
+    struct FDR_Runtime_Args a = {
+        buf,
+        len,
+        hbuf,
+        hlen,
+        hbuf, // nocase - start same as caseful, override later if needed
+        hlen, // nocase
+        start,
+        cb,
+        ctxt,
+        &groups,
+        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
+        hbuf ? CONF_LOADVAL_CALL_CAUTIOUS(hbuf + hlen - 8, hbuf, hbuf + hlen)
+             : (u64a)0
+
+    };
+    fdrUnpackState(fdr, &a, stream_state);
+
+    // Stays HWLM_SUCCESS when there is nothing to scan.
+    hwlm_error_t ret = HWLM_SUCCESS;
+    if (!unlikely(a.start_offset >= a.len)) {
+        assert(funcs[fdr->engineID]);
+        ret = funcs[fdr->engineID](fdr, &a);
+    }
+
+    fdrPackState(fdr, &a, stream_state);
+    return ret;
+}
--- a/src/fdr/fdr.h
+++ b/src/fdr/fdr.h
@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: runtime API.
+ */
+
+#ifndef FDR_H
+#define FDR_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm.h"
+
+// C linkage in the API
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct FDR;
+
+/** \brief Returns size in bytes of the given FDR engine. */
+size_t fdrSize(const struct FDR *fdr);
+
+/** \brief Returns non-zero if the contents of the stream state indicate that
+ * there is active FDR history beyond the regularly used history. */
+u32 fdrStreamStateActive(const struct FDR *fdr, const u8 *stream_state);
+
+/**
+ * \brief Block-mode scan.
+ *
+ * \param fdr FDR matcher engine.
+ * \param buf Buffer to scan.
+ * \param len Length of buffer to scan.
+ * \param start First offset in buf at which a match may end.
+ * \param cb Callback to call when a match is found.
+ * \param ctxt Caller-provided context pointer supplied to callback on match.
+ * \param groups Initial groups mask.
+ *
+ * \return HWLM_SUCCESS if \a start is at or beyond \a len (nothing is
+ * scanned); otherwise the value returned by the engine's scan routine.
+ */
+hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
+                     size_t start, HWLMCallback cb, void *ctxt,
+                     hwlm_group_t groups);
+
+/**
+ * \brief Streaming-mode scan.
+ *
+ * \param fdr FDR matcher engine.
+ * \param hbuf History buffer.
+ * \param hlen Length of history buffer (hbuf).
+ * \param buf Buffer to scan.
+ * \param len Length of buffer to scan (buf).
+ * \param start First offset in buf at which a match may end.
+ * \param cb Callback to call when a match is found.
+ * \param ctxt Caller-provided context pointer supplied to callback on match.
+ * \param groups Initial groups mask.
+ * \param stream_state Persistent stream state for use by FDR.
+ *
+ * \return HWLM_SUCCESS if \a start is at or beyond \a len (nothing is
+ * scanned); otherwise the value returned by the engine's scan routine.
+ */
+hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
+                              size_t hlen, const u8 *buf, size_t len,
+                              size_t start, HWLMCallback cb, void *ctxt,
+                              hwlm_group_t groups, u8 *stream_state);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // FDR_H
--- a/src/fdr/fdr_autogen.py
+++ b/src/fdr/fdr_autogen.py
@ -0,0 +1,574 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from base_autogen import *
+from string import Template
+
+class OrStep(Step):
+    def __init__(self, context, offset, width):
+        Step.__init__(self, context, offset)
+        s_var = self.gv("st%d" % offset)
+        if width < 128:
+            self.val = "s |= %s;" % s_var.name
+        else:
+            self.val = "s = or%d(s, %s);" % (width, s_var.name)
+
+class ShiftStateStep(Step):
+    def __init__(self, context, offset = 0, stride_used = 1):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        state = m.state_variable
+        shift_distance = -1 * stride_used * m.num_buckets
+        self.val = "%s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
+
+class BulkLoadStep(Step):
+    def __init__(self, context, offset, size, define_var = True, aligned = True):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        self.latency = 4
+        blt = m.bulk_load_type
+        if aligned:
+            init_string = blt.load_expr_data(self.offset, code = "aligned")
+        else:
+            init_string = blt.load_expr_data(self.offset)
+
+        var_name = "current_data_%d" % offset
+        if define_var:
+            lb_var = self.nv(blt, var_name)
+            self.val = lb_var.gen_initializer_stmt(init_string)
+        else:
+            lb_var = self.gv(var_name, reader = False, writer = True)
+            self.val = "%s = %s;" % (var_name, init_string)
+
+class ValueExtractStep(Step):
+    def __init__(self, context, offset, sub_load_cautious = False):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        self.latency = 2
+        dsb = m.datasize_bytes
+        modval = offset % dsb
+
+        if m.domain > 8 and modval == dsb - 1:
+            # Case 1: reading more than one byte over the end of the bulk load
+
+            self.latency = 4
+            if sub_load_cautious:
+                code_string = "cautious_forward" 
+            else:
+                code_string = "normal"
+            load_string = m.single_load_type.load_expr_data(self.offset, code_string)
+            temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
+        else:
+            # Case 2: reading a value that can be found entirely in the current register
+            if m.fdr2_force_naive_load:
+                load_string = m.single_load_type.load_expr_data(self.offset, "normal")
+                temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
+            else:
+                lb_var = self.gv("current_data_%d" % (offset - modval))
+                if modval == 0:
+                    # Case 2a: value is at LSB end of the register and must be left-
+                    # shifted into place if there is a "reach_shift_adjust" required
+                    temp_string = "(%s << %d)" % (lb_var.name, m.reach_shift_adjust)
+                else:
+                    # Case 2b: value is in the middle of the register and will be
+                    # right-shifted into place (adjusted by "reach_shift_adjust")
+                    temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
+
+
+        init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
+        v_var = self.nv(m.value_extract_type, "v%d" % offset)
+        self.val = v_var.gen_initializer_stmt(init_string)
+
+class TableLookupStep(Step):
+    def __init__(self, context, reach_multiplier, offset = 0):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        self.latency = 4
+        v_var = self.gv("v%d" % offset)
+        s_var = self.nv(m.state_type, "st%d" % offset)
+        init_string = "*(const %s *)(ft + %s*%dU)" % ( m.state_type.get_name(),
+                                                       v_var.name, reach_multiplier)
+        self.val = s_var.gen_initializer_stmt(init_string)
+
+class ShiftReachMaskStep(Step):
+    def __init__(self, context, offset):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        extr = m.extract_frequency
+        modval = offset % extr
+        s_var = self.gv("st%d" % offset, writer = True)
+        self.val = "%s = %s;" % (s_var.name, s_var.type.shift_expr(s_var.name, modval * m.num_buckets))
+
+class ConfExtractStep(Step):
+    def __init__(self, context, offset):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        if m.state_type.isSIMDOnIntel():
+            self.latency = 2
+        init_string = m.state_type.lowbit_extract_expr("s", m.extract_size)
+        extr_var = self.nv(m.extr_type, "extr%d" % offset)
+        self.val = extr_var.gen_initializer_stmt(init_string)
+
+class ConfAccumulateStep(Step):
+    def __init__(self, context, extract_offset, conf_offset, define_var = True):
+        Step.__init__(self, context, extract_offset)
+        m = self.matcher
+        extr_var = self.gv("extr%d" % extract_offset)
+        extr_var_cast = "((%s)%s)" % (m.conf_type.get_name(), extr_var.name)
+        if extract_offset == conf_offset:
+            # create conf_var as a straight copy of extr
+            if define_var:
+                conf_var = self.nv(m.conf_type, "conf%d" % conf_offset)
+                self.val = conf_var.gen_initializer_stmt(extr_var_cast)
+            else:
+                conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
+                self.val = "%s = %s;" % (conf_var.name, extr_var_cast)
+        else:
+            # shift extr_var and insert/OR it in conf_var
+            conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
+            shift_dist = (extract_offset - conf_offset) * m.num_buckets
+            self.val = "%s |= %s;" % (conf_var.name, m.conf_type.shift_expr(extr_var_cast, shift_dist))
+            self.latency = 2
+
+class ConfirmFlipStep(Step):
+    def __init__(self, context, offset):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        conf_var = self.gv("conf%d" % self.offset, writer = True)
+        self.val = "%s = %s;" % (conf_var.name,
+                       conf_var.type.flip_lowbits_expr(conf_var.name, self.matcher.confirm_frequency * m.num_buckets))
+
+class ConfirmStep(Step):
+    def __init__(self, context, offset, cautious = False):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        conf_var = self.gv("conf%d" % offset, writer = True)
+        self.val = m.produce_confirm_base(conf_var.name, conf_var.type.size, offset, cautious,
+                                          enable_confirmless = m.stride == 1, do_bailout = False)
+
+class M3(MatcherBase):
+    def get_hash_safety_parameters(self):
+        h_size = self.single_load_type.size_in_bytes()
+        return (0, h_size - 1)
+
+    def produce_compile_call(self):
+        print "    { %d, %d, %d, %d, %d, %s, %d, %d }," % (
+              self.id, self.state_width, self.num_buckets,
+              self.stride, self.domain,
+              self.arch.target, self.conf_pull_back, self.conf_top_level_split)
+
+    def produce_main_loop(self, switch_variant = False):
+        stride_offsets = xrange(0, self.loop_bytes, self.stride)
+        stride_offsetSet = set(stride_offsets)
+        so_steps_last_block = []
+        sh = None
+        last_confirm = None
+        ctxt = CodeGenContext(self)
+
+        if switch_variant:
+            print " ptr -= (iterBytes - dist);"
+            print " { " # need an extra scope around switch variant to stop its globals escaping
+        else:
+            print "    if (doMainLoop) {"
+            print "    for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
+            print self.produce_flood_check()
+            print "        __builtin_prefetch(ptr + (iterBytes*4));"
+            print "        assert(((size_t)ptr % START_MOD) == 0);"
+
+
+        # just do globally for now
+        if switch_variant:
+            subsidiary_load_cautious = True
+            confirm_cautious = True
+        else:
+            subsidiary_load_cautious = False
+            confirm_cautious = False
+
+        if not self.fdr2_force_naive_load:
+            bulk_load_steps = [ off for off in range(self.loop_bytes)
+                                if off % self.datasize_bytes == 0 and
+                                   (set(range(off, off + self.datasize_bytes - 1)) & stride_offsetSet)]
+        else:
+            bulk_load_steps = []
+
+        confirm_steps = [ off for off in range(self.loop_bytes) if off % self.confirm_frequency == 0 ]
+
+        for off in bulk_load_steps:
+            lb_var = ctxt.new_var(None, self.bulk_load_type, "current_data_%d" % off)
+            print "        " + lb_var.gen_initializer_stmt()
+
+
+        for off in confirm_steps:
+            var_name = "conf%d" % off
+            conf_def_var = ctxt.new_var(None, self.conf_type, var_name)
+            if switch_variant:
+                init_string = "(%s)-1" % self.conf_type.get_name()
+            else:
+                init_string = ""
+            print "        " + conf_def_var.gen_initializer_stmt(init_string)
+
+        if switch_variant:
+            print "        switch(iterBytes - dist) {"
+            for i in range(0, self.loop_bytes):
+                print "            case %d:" % i
+
+                # init and poison conf; over-precise but harmless
+                conf_id = (i / self.confirm_frequency) * self.confirm_frequency
+                if i % self.confirm_frequency:
+                    conf_fixup_bits = self.conf_type.size - (self.num_buckets * (i % self.confirm_frequency))
+                    print "                conf%d >>= %d;" % (conf_id, conf_fixup_bits)
+                else:
+                    print "                conf%d = 0;" % conf_id
+
+                # init state
+                state_fixup = i % self.extract_frequency
+                state = self.state_variable
+                shift_distance = self.num_buckets * state_fixup
+                if state_fixup:
+                    print "                %s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
+                    if self.state_width < 128:
+                        print "                %s |= %s;" % (state.name, state.type.lowbit_mask(shift_distance))
+                    else:
+                        print "                %s = or%d(%s, %s);" % (state.name, self.state_width, state.name, state.type.lowbit_mask(shift_distance))
+
+                if not self.fdr2_force_naive_load:
+                    # init current_data (could poison it in some cases)
+                    load_mod = i % self.datasize_bytes
+                    load_offset = i - load_mod
+                    if load_mod:
+                        # not coming in on an even boundary means having to do a load var
+                        # actually, there are a bunch of things we can do on this bulk load
+                        # to avoid having to be 'cautious_backwards' but I'm not completely
+                        # sure they are good ideas
+                        init_string = self.bulk_load_type.load_expr_data(load_offset,
+                                                                         code = "cautious_backward")
+                        var_name = "current_data_%d" % load_offset
+                        lb_var = ctxt.get_var(None, var_name, reader = False, writer = True)
+                        print "                %s = %s;" % (lb_var.name, init_string)
+
+                print "                goto off%d;" % i
+            print "            case %d: goto skipSwitch;" % self.loop_bytes
+            print "        }"
+            print "        {"
+
+
+        for off in range(self.loop_bytes):
+            # X_mod is the offset we're up to relative to the last X operation
+            # X_offset is which of the last X operations matches this iteration
+
+            if (switch_variant):
+                LabelStep(ctxt, off)
+
+            if off in bulk_load_steps:
+                if not self.fdr2_force_naive_load:
+                    BulkLoadStep(ctxt, off, self.datasize, define_var = False, aligned = not switch_variant)
+
+            if off in stride_offsets:
+                if switch_variant:
+                    OpenScopeStep(ctxt, off)
+                ValueExtractStep(ctxt, off, sub_load_cautious = subsidiary_load_cautious)
+                TableLookupStep(ctxt, self.reach_mult, off)
+                if off % self.extract_frequency:
+                    ShiftReachMaskStep(ctxt, off)
+                so = OrStep(ctxt, off, self.state_width)
+                if switch_variant:
+                    CloseScopeStep(ctxt, off)
+                if sh != None:
+                    so.add_dependency(sh)
+                so_steps_last_block += [ so ]
+
+            extract_mod = off % self.extract_frequency
+            extract_offset = off - extract_mod
+            extract_ready = extract_mod == self.extract_frequency - 1
+            if extract_ready:
+                if switch_variant:
+                    OpenScopeStep(ctxt, off)
+                ex = ConfExtractStep(ctxt, extract_offset)
+                ConfAccumulateStep(ctxt, extract_offset, confirm_offset, define_var = False)
+                for so_step in so_steps_last_block:
+                    ex.add_dependency(so_step)
+                if switch_variant:
+                    CloseScopeStep(ctxt, off)
+                so_steps_last_block = []
+                sh = ShiftStateStep(ctxt, extract_offset, stride_used = self.extract_frequency)
+                sh.add_dependency(ex)
+
+            confirm_mod = off % self.confirm_frequency
+            confirm_offset = off - confirm_mod
+            confirm_ready = confirm_mod == self.confirm_frequency - 1
+            if confirm_ready:
+                cflip = ConfirmFlipStep(ctxt, confirm_offset)
+                cf = ConfirmStep(ctxt, confirm_offset, cautious = confirm_cautious )
+                if last_confirm:
+                    cf.add_dependency(last_confirm)
+                last_confirm = cf
+
+
+        if not switch_variant:
+            print ctxt.schedule([ last_confirm, sh ])
+        else:
+            print ctxt.dontschedule([ last_confirm, sh ])
+
+        if switch_variant:
+            print "skipSwitch:;"
+            print "    ptr += iterBytes;"
+        print "    }" # close extra scope around switch variant
+        print "    }"
+
+
+    def produce_init_state(self):
+        state = self.state_variable
+        s_type = self.state_type
+        shift_distance = -1 * self.num_buckets
+        shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
+
+        s = Template("""
+            $TYPENAME s;
+            if (a->len_history) {
+                u32 tmp = getPreStartVal(a, $DOMAIN);
+                s = *((const $TYPENAME *)ft + tmp);
+                $SHIFT_EXPR;
+            } else {
+                s = *(const $TYPENAME *)&fdr->start;
+            }
+""").substitute(TYPENAME = s_type.get_name(),
+                ZERO_EXPR = s_type.zero_expression(),
+                DOMAIN = self.domain,
+                SHIFT_EXPR = shift_expr)
+        return s
+
+    def produce_code(self):
+        # Prints the complete C function for this engine: header, a
+        # descriptive comment line, common declarations, table pointers,
+        # state initialisation, loop constants, and an outer while loop
+        # containing first the cautious switch variant and then the main
+        # loop. Also sets reach_mult, reach_shift_adjust and reach_mask,
+        # which the Step classes read while the loops are generated.
+
+        (behind, ahead) = self.get_hash_safety_parameters()
+        loop_read_behind = behind # not used below
+        loop_read_ahead = self.loop_bytes + ahead
+
+        # we set up mask and shift stuff for extracting our masks from registers
+        #
+        # we have a choice as to whether to mask out the value early or
+        # extract the value (shift first) then mask it
+        #
+        # Intel has a free scaling factor from 1/2/4/8 so we want to combine
+        # the extra needed shift for SSE registers with the mask operation
+
+        ssb = self.state_type.size / 8 # state size in bytes
+
+        # Intel path
+        if ssb == 16 and self.domain == 16:
+            # obscure corner - we don't have the room in the register to
+            # do this for all values so we don't. domain==16 is pretty
+            # bad anyhow, of course
+            self.reach_mult = 8
+        else:
+            self.reach_mult = ssb
+
+        # log2 of the part of the entry size not covered by reach_mult; the
+        # extracted value is pre-shifted by this amount to make up the rest
+        shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
+        self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
+        self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
+
+        print self.produce_header(visible = False)
+
+        # trailing commas keep these on one output line (Python 2 print)
+        print "// ",
+        print " Arch: " + self.arch.name,
+        print " State type: " + self.state_type.get_name(),
+        print " Num buckets: %d" % self.num_buckets,
+        print " Domain: %d" % self.domain,
+        print " Stride: %d" % self.stride
+
+        print self.produce_common_declarations()
+        print
+
+        print "\tconst size_t tabSize = %d;" % self.table_size
+        print """
+    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
+    const u32 * confBase = (const u32 *)(ft + tabSize);
+"""
+        print self.produce_init_state()
+        print "\tconst size_t iterBytes = %d;" % self.loop_bytes
+        print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
+        print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
+
+        print """
+    while (ptr < buf + len) {
+
+        u8 doMainLoop = 1;
+        size_t remaining = len - (ptr - buf);
+        size_t dist;
+        if (remaining <= iterBytes) {
+            dist = remaining; // once through the switch and we're done
+        } else if (remaining < 2 * iterBytes) {
+            // nibble some stuff off the front, skip the main loop,
+            // then come back here
+            dist = iterBytes;  // maybe could be cleverer
+        } else {
+            // now, we need to see if we can make it to a main loop iteration
+            // if so, we need to ensure that the main loop iteration is aligned
+            // to a START_MOD boundary and i >= 8 so we can read ptr + i - 8
+
+            // see if we can do it - if not, just switch the main loop off,
+            // eat iterBytes in cautious mode, and come back to this loop
+
+            const u8 * target = MAX(buf + 8, ptr);
+            target = ROUNDUP_PTR(target, START_MOD);
+            dist = target - ptr;
+            if (dist > iterBytes) {
+                doMainLoop = 0;
+                dist = iterBytes;
+            }
+        }
+"""
+        # the cautious switch variant comes first, then the main loop; both
+        # sit inside the while loop opened above
+        self.produce_main_loop(switch_variant = True)
+        self.produce_main_loop(switch_variant = False)
+        print """
+    }
+"""
+        print self.produce_footer()
+
+    def get_name(self):
+        return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
+
+    def __init__(self, state_width, domain, stride,
+                 arch,
+                 table_state_width = None,
+                 num_buckets = 8,
+                 extract_frequency = None,
+                 confirm_frequency = None):
+
+        # First - set up the values that are fundamental to how this matcher will operate
+        self.arch = arch
+
+        # get the width of the state width on which we operate internally
+        if state_width not in [ 128 ]:
+            fail_out("Unknown state width: %d" % state_width)
+        self.state_width = state_width
+        self.state_type = getRequiredType(self.state_width)
+        self.state_variable = IntegerVariable("s", self.state_type)
+
+        table_state_width = state_width
+        self.table_state_width = state_width
+        self.table_state_type = getRequiredType(self.table_state_width)
+
+        # domain is the number of bits that we draw from our input to
+        # index our 'reach' table
+        if not 8 <= domain <= 16:
+            fail_out("Unsupported domain: %d" % domain)
+        self.domain = domain
+        # this is the load type required for this domain if we want to
+        # load it one at a time
+        self.single_load_type = getRequiredType(self.domain)
+
+        # table size
+        self.table_size = 2**domain * table_state_width // 8
+
+        # stride is the frequency with which we make data-driven
+        # accesses to our reach table
+        if stride not in [ 1, 2, 4, 8]:
+            fail_out("Unsupported stride: %d" % stride)
+        if stride * num_buckets > state_width:
+            fail_out("Stride %d is too big for the number of buckets %d given state width %d\n" % (stride, num_buckets, state_width))
+        self.stride = stride
+
+        if num_buckets != 8:
+            fail_out("Unsupported number of buckets: %d" % num_buckets)
+        if state_width % num_buckets and state_width == 128:
+            fail_out("Bucket scheme requires bit-shifts on m128 (failing)")
+        self.num_buckets = num_buckets
+
+        # Second - set up derived or optimization values - these can be
+        # overridden by arguments that are passed in
+
+        self.datasize = 64
+        self.bulk_load_type = IntegerType(self.datasize)
+        self.datasize_bytes = self.datasize/8
+
+        self.value_extract_type = IntegerType(self.datasize)
+
+        self.fdr2_force_naive_load = False # disable everywhere for trunk
+
+        # extract frequency is how frequently (in bytes) we destructively shift
+        # our state value after having pulled out that many bytes into a
+        # confirm register (of one sort or another).
+        # none means a default value - datasize, our biggest easily available GPR
+        if extract_frequency is None:
+            extract_frequency = self.datasize_bytes
+        self.extract_frequency = extract_frequency
+        self.extract_size = self.extract_frequency*self.num_buckets
+        if extract_frequency < stride:
+            fail_out("Can't extract at extract frequency %d with stride %d" % (extract_frequency, stride))
+        if extract_frequency not in [ None, 1, 2, 4, 8, 16]:
+            fail_out("Weird extract frequency: %d" % extract_frequency)
+
+        if self.extract_size <= 32:
+            self.extr_type = IntegerType(32)
+        elif self.extract_size <= 64:
+            self.extr_type = IntegerType(64)
+        else:
+            fail_out("Implausible size %d required for confirm extract step" % size)
+
+        # extract_frequency is how often we pull out our state and place
+        # it somewhere in a lossless fashion
+        # confirm_frequency, on the other hand, is how frequently we
+        # take the state extracted by extract_frequency and cobble it
+        # together into a matching loop
+        # confirm_frequency must be a multiple of extract_frequency
+        # and must fit into a fast register; for now; we're going to
+        # stay in the GPR domain
+        if confirm_frequency is None:
+            confirm_frequency = self.extract_frequency
+        self.confirm_frequency = confirm_frequency
+        if confirm_frequency % self.extract_frequency:
+            fail_out("Confirm frequency %d must be evenly divisible by extract_frequency %d" % (confirm_frequency, self.extract_frequency))
+
+        self.conf_size = self.confirm_frequency * self.num_buckets
+        if self.conf_size <= 32:
+            self.conf_type = IntegerType(32)
+        elif self.conf_size <= 64:
+            self.conf_type = IntegerType(64)
+        else:
+            fail_out("Implausible size %d required for confirm accumulate step" % self.conf_size)
+
+        # how many bytes in flight at once
+        self.loop_bytes = 16
+
+        # confirm configuration
+
+        # how many entries in the top-level confirm table - 256 means
+        # complete split on the last character
+        self.conf_top_level_split = 256
+
+        # how much we 'pull back' in confirm - this is obviously related
+        # to the first level conf but we will keep two separate paramters
+        # for this to avoid the risk of conflating these
+        self.conf_pull_back = 1
+
+        if self.conf_pull_back > 0 and self.conf_top_level_split < 256:
+            fail_out("Pull back distance %d not supported by top level split %d" % (self.conf_pull_back, self.conf_top_level_split))
+
+        # minor stuff
+        self.default_body_indent = 8
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: build API.
+ */
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile.h"
+#include "fdr_confirm.h"
+#include "fdr_compile_internal.h"
+#include "fdr_engine_description.h"
+#include "teddy_compile.h"
+#include "teddy_engine_description.h"
+#include "grey.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/compare.h"
+#include "util/dump_mask.h"
+#include "util/target_info.h"
+#include "util/ue2string.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+using namespace std;
+
+namespace ue2 {
+
+namespace {
+
+/**
+ * \brief Builds an FDR structure from a literal set for one engine
+ * description.
+ *
+ * build() is the only public entry point; it runs assignStringsToBuckets(),
+ * setupTab() and setupFDR() in that order.
+ */
+class FDRCompiler : boost::noncopyable {
+private:
+    const FDREngineDescription &eng; // engine description (held by reference)
+    vector<u8> tab; // mask table, sized eng.getTabSizeBytes() at construction
+    const vector<hwlmLiteral> &lits; // literal set (held by reference)
+    // indices into lits for each bucket, filled by assignStringsToBuckets()
+    map<BucketIndex, std::vector<LiteralIndex> > bucketToLits;
+    bool make_small; // passed through to setupFullMultiConfs()
+
+    // pointer to the mask for table entry indexInTable, inside tab
+    u8 *tabIndexToMask(u32 indexInTable);
+    // append literal l to bucket b in bucketToLits
+    void assignStringToBucket(LiteralIndex l, BucketIndex b);
+    // partition all literals into buckets
+    void assignStringsToBuckets();
+#ifdef DEBUG
+    // print the default mask and every table entry that differs from it
+    void dumpMasks(const u8 *defaultMask);
+#endif
+    // fill tab from the bucket assignment
+    void setupTab();
+    // allocate the FDR and copy table, confirm, flood and link data into it
+    aligned_unique_ptr<FDR> setupFDR(pair<u8 *, size_t> link);
+    // set the bits of fdr->start from per-bucket minimum literal lengths
+    void createInitialState(FDR *fdr);
+
+public:
+    // Both references must outlive this object: they are stored, not copied.
+    FDRCompiler(const vector<hwlmLiteral> &lits_in,
+                const FDREngineDescription &eng_in, bool make_small_in)
+        : eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(lits_in),
+          make_small(make_small_in) {}
+
+    // link is an optional (buffer, size) pair appended to the FDR; see
+    // setupFDR(), which frees the buffer when it is non-null.
+    aligned_unique_ptr<FDR> build(pair<u8 *, size_t> link);
+};
+
+/**
+ * \brief Returns a pointer to the mask for table entry \p indexInTable.
+ *
+ * Each entry is (eng.getSchemeWidth() / 8) bytes wide and entries are stored
+ * back to back in tab.
+ */
+u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) {
+    const size_t entry_size = eng.getSchemeWidth() / 8;
+    const size_t offset = (size_t)indexInTable * entry_size;
+    // tab is sized in bytes, so the whole entry (not just its index) has to
+    // be checked against tab.size(); comparing the raw index would let
+    // out-of-range entries through.
+    assert(offset + entry_size <= tab.size());
+    return &tab[0] + offset;
+}
+
+/** \brief Set bit \p bit of the byte array \p msk; bit 0 is the least
+ * significant bit of msk[0]. */
+static
+void setbit(u8 *msk, u32 bit) {
+    const u32 byte_idx = bit / 8;
+    const u32 bit_in_byte = bit % 8;
+    msk[byte_idx] |= 1U << bit_in_byte;
+}
+
+/** \brief Clear bit \p bit of the byte array \p msk; bit 0 is the least
+ * significant bit of msk[0]. */
+static
+void clearbit(u8 *msk, u32 bit) {
+    const u32 byte_idx = bit / 8;
+    const u32 bit_in_byte = bit % 8;
+    msk[byte_idx] &= ~(1U << bit_in_byte);
+}
+
+/** \brief Byte-wise AND of \p a and \p b into \p dest, over \p num_bytes
+ * bytes. Each output byte depends only on the input bytes at the same
+ * index. */
+static
+void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) {
+    u32 idx = 0;
+    while (idx != num_bytes) {
+        dest[idx] = a[idx] & b[idx];
+        ++idx;
+    }
+}
+
+/**
+ * \brief Set up the initial state mask stored at fdr->start.
+ *
+ * For each bucket, the first (min_len - 1) positions get their scheme bit
+ * set, where min_len is the length of the shortest literal in the bucket.
+ * Other bits are left untouched.
+ */
+void FDRCompiler::createInitialState(FDR *fdr) {
+    u8 *const start = (u8 *)&fdr->start;
+
+    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
+        // Shortest literal in this bucket; remains ~0U if the bucket has no
+        // literals, in which case every position of the bucket is set below.
+        u32 min_len = ~0U;
+        for (const LiteralIndex &lit_idx : bucketToLits[b]) {
+            const u32 len = verify_u32(lits[lit_idx].s.length());
+            if (len < min_len) {
+                min_len = len;
+            }
+        }
+
+        DEBUG_PRINTF("bucket %u has min_len=%u\n", b, min_len);
+        assert(min_len);
+
+        // positions [0, min_len - 1) start out as 1
+        const u32 set_limit = min_len - 1;
+        for (PositionInBucket i = 0;
+             i < eng.getBucketWidth(b) && i < set_limit; i++) {
+            setbit(start, eng.getSchemeBit(b, i));
+        }
+    }
+}
+
+/**
+ * \brief Allocate the final FDR structure and lay out its components.
+ *
+ * Layout, in order: FDR header (rounded up to 16 bytes), mask table, confirm
+ * structures, flood control data, then the optional \p link data.
+ *
+ * The confirm and flood control buffers, and link.first when non-null, are
+ * released with aligned_free() after being copied.
+ */
+aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
+    const size_t tabSize = eng.getTabSizeBytes();
+
+    pair<u8 *, size_t> floodControlTmp = setupFDRFloodControl(lits, eng);
+
+    pair<u8 *, size_t> confirmTmp =
+        setupFullMultiConfs(lits, eng, bucketToLits, make_small);
+
+    const size_t confirmSize = confirmTmp.second;
+    const size_t floodSize = floodControlTmp.second;
+    const size_t linkSize = link.second;
+
+    assert(ISALIGNED_16(tabSize));
+    assert(ISALIGNED_16(confirmSize));
+    assert(ISALIGNED_16(floodSize));
+    assert(ISALIGNED_16(linkSize));
+    const size_t headerSize = ROUNDUP_16(sizeof(FDR));
+    const size_t size = ROUNDUP_16(headerSize + tabSize + confirmSize +
+                                   floodSize + linkSize);
+
+    DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu "
+                 "total=%zu\n",
+                 headerSize, tabSize, confirmSize, floodSize, size);
+
+    aligned_unique_ptr<FDR> fdr = aligned_zmalloc_unique<FDR>(size);
+    assert(fdr); // otherwise would have thrown std::bad_alloc
+
+    fdr->size = size;
+    fdr->engineID = eng.getID();
+    fdr->maxStringLen = verify_u32(maxLen(lits));
+    createInitialState(fdr.get());
+
+    u8 *const fdr_base = (u8 *)fdr.get();
+
+    // mask table directly after the header
+    u8 *cursor = fdr_base + headerSize;
+    copy(tab.begin(), tab.end(), cursor);
+    cursor += tabSize;
+
+    // confirm structures
+    memcpy(cursor, confirmTmp.first, confirmSize);
+    cursor += confirmSize;
+    aligned_free(confirmTmp.first);
+
+    // flood control, located via fdr->floodOffset
+    fdr->floodOffset = verify_u32(cursor - fdr_base);
+    memcpy(cursor, floodControlTmp.first, floodSize);
+    cursor += floodSize;
+    aligned_free(floodControlTmp.first);
+
+    // optional linked data, located via fdr->link (0 means none)
+    if (!link.first) {
+        fdr->link = 0;
+    } else {
+        fdr->link = verify_u32(cursor - fdr_base);
+        memcpy(cursor, link.first, linkSize);
+        aligned_free(link.first);
+    }
+
+    return fdr;
+}
+
+/** \brief Record literal index \p l as a member of bucket \p b (the bucket's
+ * entry in bucketToLits is created on first use). */
+void FDRCompiler::assignStringToBucket(LiteralIndex l, BucketIndex b) {
+    vector<LiteralIndex> &bucket = bucketToLits[b];
+    bucket.push_back(l);
+}
+
+/**
+ * \brief Ordering over literal indices: shorter literals first; literals of
+ * equal length are ordered by comparing their characters from the last
+ * character backwards.
+ */
+struct LitOrder {
+    explicit LitOrder(const vector<hwlmLiteral> &vl_) : vl(vl_) {}
+    bool operator()(const u32 &i1, const u32 &i2) const {
+        const string &lhs = vl[i1].s;
+        const string &rhs = vl[i2].s;
+
+        if (lhs.size() != rhs.size()) {
+            return lhs.size() < rhs.size();
+        }
+
+        // Same length: walk from the back to the first differing character.
+        for (size_t k = lhs.size(); k != 0; --k) {
+            if (lhs[k - 1] != rhs[k - 1]) {
+                return lhs[k - 1] < rhs[k - 1];
+            }
+        }
+        return false; // identical strings
+    }
+
+private:
+    const vector<hwlmLiteral> &vl;
+};
+
+/**
+ * \brief Score for a bucket whose shortest literal has length \p len and
+ * which holds \p count literals; lower is better.
+ *
+ * The score is count * (128 / min(len, 128))^3 using integer division of the
+ * cubes. A length of zero yields the maximum u64a value.
+ */
+static u64a getScoreUtil(u32 len, u32 count) {
+    const u32 LEN_THRESH = 128;
+    if (!len) {
+        return (u64a)-1;
+    }
+    u32 elen = len;
+    if (elen > LEN_THRESH) {
+        elen = LEN_THRESH; // lengths beyond the threshold all score alike
+    }
+    const u32 thresh_cubed = LEN_THRESH * LEN_THRESH * LEN_THRESH;
+    const u32 elen_cubed = elen * elen * elen;
+    const u64a lenScore = thresh_cubed / elen_cubed;
+    // deemphasize count - possibly more than needed
+    // this might be overkill in the other direction
+    return count * lenScore;
+}
+
+//#define DEBUG_ASSIGNMENT
+/**
+ * \brief Partition the literals into the engine's buckets.
+ *
+ * Steps:
+ *  -# sort literal indices with LitOrder (length, then reversed content);
+ *  -# cut the sorted sequence into at most CHUNK_MAX - 1 chunks, starting a
+ *     new chunk on a length change (for lengths below
+ *     MAX_CONSIDERED_LENGTH) or when a chunk reaches maxPerChunk entries;
+ *  -# run a dynamic program over (chunk, buckets remaining) using
+ *     getScoreUtil() to choose where each bucket starts;
+ *  -# follow the recorded links and call assignStringToBucket() for every
+ *     literal.
+ *
+ * An empty literal set leaves bucketToLits untouched.
+ */
+void FDRCompiler::assignStringsToBuckets() {
+    typedef u64a SCORE; // 'Score' type
+    const SCORE MAX_SCORE = (SCORE)-1;
+    const u32 CHUNK_MAX = 512;
+    const u32 BUCKET_MAX = 16;
+    typedef pair<SCORE, u32> SCORE_INDEX_PAIR;
+
+    u32 ls = verify_u32(lits.size());
+    if (!ls) {
+        // Nothing to assign. Without this guard no chunk is ever opened and
+        // the "close off" step below would index count[] with (0 - 1), which
+        // wraps for an unsigned index.
+        return;
+    }
+
+    // make a vector that contains our literals as pointers or u32 LiteralIndex values
+    vector<LiteralIndex> vli;
+    vli.resize(ls);
+    map<u32, u32> lenCounts;
+    for (LiteralIndex l = 0; l < ls; l++) {
+        vli[l] = l;
+        lenCounts[lits[l].s.size()]++;
+    }
+    // sort vector by literal length + if tied on length, 'magic' criteria of some kind (tbd)
+    stable_sort(vli.begin(), vli.end(), LitOrder(lits));
+
+#ifdef DEBUG_ASSIGNMENT
+    for (map<u32, u32>::iterator i = lenCounts.begin(), e = lenCounts.end();
+         i != e; ++i) {
+        printf("l<%d>:%d ", i->first, i->second);
+    }
+    printf("\n");
+#endif
+
+    // TODO: detailed early stage literal analysis for v. small cases (actually look at lits)
+    // yes - after we factor this out and merge in the Teddy style of building we can look
+    // at this, although the teddy merge modelling is quite different. It's still probably
+    // adaptable to some extent for this class of problem
+
+    u32 firstIds[CHUNK_MAX]; // index into vli of the first literal in this chunk
+                             // (the entry after the last chunk holds ls)
+    u32 count[CHUNK_MAX]; // how many are in this chunk
+    u32 length[CHUNK_MAX]; // how long things in the chunk are
+
+    const u32 MAX_CONSIDERED_LENGTH = 16;
+    u32 currentChunk = 0;
+    u32 currentSize = 0;
+    u32 chunkStartID = 0;
+    u32 maxPerChunk  = ls/(CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;
+
+    for (u32 i = 0; i < ls && currentChunk < CHUNK_MAX - 1; i++) {
+        LiteralIndex l = vli[i];
+        if ((currentSize < MAX_CONSIDERED_LENGTH && (lits[l].s.size() != currentSize)) ||
+            (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
+            currentSize = lits[l].s.size();
+            if (currentChunk) {
+                count[currentChunk - 1 ] = i - chunkStartID;
+            }
+            chunkStartID = firstIds[currentChunk] = i;
+            length[currentChunk] = currentSize;
+            currentChunk++;
+        }
+    }
+    // NOTE(review): this relies on at least one chunk having been opened,
+    // i.e. on the first sorted literal being non-empty - confirm that empty
+    // literals are rejected before this point.
+    count[currentChunk - 1] = ls - chunkStartID;
+    // close off chunks with an empty row
+    firstIds[currentChunk] = ls;
+    length[currentChunk] = 0;
+    count[currentChunk] = 0;
+    u32 nChunks = currentChunk + 1;
+
+#ifdef DEBUG_ASSIGNMENT
+    for (u32 j = 0; j < nChunks; j++) {
+        printf("%d %d %d %d\n", j, firstIds[j], count[j], length[j]);
+    }
+#endif
+
+    SCORE_INDEX_PAIR t[CHUNK_MAX][BUCKET_MAX]; // pair of score, index
+    u32 nb = eng.getNumBuckets();
+    assert(nb <= BUCKET_MAX); // t[][] has only BUCKET_MAX columns
+
+    // column 0: everything from chunk j onwards goes into a single bucket
+    for (u32 j = 0; j < nChunks; j++) {
+        u32 cnt = 0;
+        for (u32 k = j; k < nChunks; ++k) {
+            cnt += count[k];
+        }
+        t[j][0] = make_pair(getScoreUtil(length[j], cnt), 0);
+    }
+
+    // column i: best split when i further buckets are available after the
+    // one starting at chunk j; .second records where the next bucket starts
+    for (u32 i = 1; i < nb; i++) {
+        for (u32 j = 0; j < nChunks - 1; j++) { // don't process last, empty row
+            SCORE_INDEX_PAIR best = make_pair(MAX_SCORE, 0);
+            u32 cnt = count[j];
+            for (u32 k = j + 1; k < nChunks - 1; k++, cnt += count[k]) {
+                SCORE score = getScoreUtil(length[j], cnt);
+                if (score > best.first) {
+                    break; // if we're now worse locally than our best score, give up
+                }
+                score += t[k][i-1].first;
+                if (score < best.first) {
+                    best = make_pair(score, k);
+                }
+            }
+            t[j][i] = best;
+        }
+        t[nChunks - 1][i] = make_pair(0,0); // fill in empty final row for next iteration
+    }
+
+#ifdef DEBUG_ASSIGNMENT
+    for (u32 j = 0; j < nChunks; j++) {
+        for (u32 i = 0; i < nb; i++) {
+            SCORE_INDEX_PAIR v = t[j][i];
+            printf("<%7lld,%3d>", v.first, v.second);
+        }
+        printf("\n");
+    }
+#endif
+
+    // our best score is in best[0][N_BUCKETS-1] and we can follow the links
+    // to find where our buckets should start and what goes into them
+    for (u32 i = 0, n = nb; n && (i != nChunks - 1); n--) {
+        u32 j = t[i][n - 1].second;
+        if (j == 0) {
+            // a link of 0 means "no further split": take everything left
+            j = nChunks - 1;
+        }
+        // put chunks between i - j into bucket (NBUCKETS-1) - n
+#ifdef DEBUG_ASSIGNMENT
+        printf("placing from %d to %d in bucket %d\n", firstIds[i], firstIds[j],
+               nb - n);
+#endif
+        for (u32 k = firstIds[i]; k < firstIds[j]; k++) {
+            assignStringToBucket((LiteralIndex)vli[k], nb - n);
+        }
+        i = j;
+    }
+}
+
+#ifdef DEBUG
+/** \brief Debug helper: print \p defaultMask, then every table entry whose
+ * mask differs from it. */
+void FDRCompiler::dumpMasks(const u8 *defaultMask) {
+    const size_t width = eng.getSchemeWidth();
+    const size_t mask_bytes = width / 8;
+    printf("default mask: %s\n", dumpMask(defaultMask, width).c_str());
+    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
+        const u8 *entry = tabIndexToMask(i);
+        if (memcmp(entry, defaultMask, mask_bytes) == 0) {
+            continue; // same as the default, not worth printing
+        }
+        printf("tab %04x: %s\n", i, dumpMask(entry, width).c_str());
+    }
+}
+#endif
+
+/**
+ * \brief Collect, for suffix position \p pos, the (dontCares -> masks) sets
+ * describing which table indices the literals in \p vl can produce.
+ *
+ * For each literal, the bytes at suffix positions pos, pos - 1, ... (as many
+ * bytes as are needed to cover eng.bits bits) are packed little-endian into
+ * a value truncated to eng.bits bits. Bytes that fall outside the literal,
+ * bits beyond eng.bits, and the case bit (0x20) of alphabetic characters in
+ * nocase literals are marked as "don't care".
+ *
+ * \param eng  engine description; only eng.bits is read here.
+ * \param vl   indices (into \p lits) of the literals to consider.
+ * \param lits the full literal set.
+ * \param pos  suffix position; zero is the last byte of the literal.
+ * \param m2   output: dontCares mask -> set of value masks.
+ * \return true as soon as some literal is entirely "don't care" at this
+ *         position (m2 may then be partially filled); false otherwise.
+ */
+static
+bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
+                               const vector<LiteralIndex> &vl,
+                               const vector<hwlmLiteral> &lits,
+                               SuffixPositionInString pos,
+                               std::map<u32, ue2::unordered_set<u32> > &m2) {
+    assert(eng.bits <= 32);
+
+    // number of literal bytes that contribute to a table index
+    u32 distance = 0;
+    if (eng.bits <= 8) {
+        distance = 1;
+    } else if (eng.bits <= 16) {
+        distance = 2;
+    } else if (eng.bits <= 32) {
+        distance = 4;
+    }
+
+    // Mask of the low eng.bits bits. Computed in 64 bits so that
+    // eng.bits == 32 does not shift a 32-bit value by its full width, which
+    // is undefined. Loop-invariant, so it is built once.
+    const u32 domain_mask = (u32)((1ULL << eng.bits) - 1);
+
+    for (vector<LiteralIndex>::const_iterator i = vl.begin(), e = vl.end();
+         i != e; ++i) {
+        if (e - i > 5) {
+            __builtin_prefetch(&lits[*(i + 5)]);
+        }
+        const hwlmLiteral &lit = lits[*i];
+        const size_t sz = lit.s.size();
+        u32 mask = 0;
+        u32 dontCares = 0;
+        for (u32 cnt = 0; cnt < distance; cnt++) {
+            int newPos = pos - cnt;
+            u8 dontCareByte = 0x0;
+            u8 maskByte = 0x0;
+            if (newPos < 0 || ((u32)newPos >= sz)) {
+                // before the end of the string or past its start: no
+                // constraint on this byte
+                dontCareByte = 0xff;
+            } else {
+                u8 c = lit.s[sz - newPos - 1];
+                maskByte = c;
+                u32 remainder = eng.bits - cnt * 8;
+                assert(remainder != 0);
+                if (remainder < 8) {
+                    // only the low 'remainder' bits of this byte are indexed
+                    u8 cmask = (1U << remainder) - 1;
+                    maskByte &= cmask;
+                    dontCareByte |= ~cmask;
+                }
+                if (lit.nocase && ourisalpha(c)) {
+                    maskByte &= 0xdf;
+                    dontCareByte |= 0x20;
+                }
+            }
+            u32 loc =  cnt * 8;
+            // Widen to u32 before shifting: a u8 is otherwise promoted to
+            // (signed) int, and shifting a value >= 0x80 left by 24 would
+            // overflow it.
+            mask |= (u32)maskByte << loc;
+            dontCares |= (u32)dontCareByte << loc;
+        }
+
+        // truncate m and dc down to nBits
+        mask &= domain_mask;
+        dontCares &= domain_mask;
+        if (dontCares == domain_mask) {
+            return true;
+        }
+        m2[dontCares].insert(mask);
+    }
+    return false;
+}
+
+/**
+ * \brief Build the mask table from the bucket assignment.
+ *
+ * Every entry starts as all ones. For each (bucket, position) scheme bit,
+ * the bit is cleared in each table entry that some literal of the bucket can
+ * index at that position. If getMultiEntriesAtPosition() reports that the
+ * position is unconstrained, the bit is instead cleared in a default mask,
+ * which is ANDed into every entry at the end.
+ */
+void FDRCompiler::setupTab() {
+    const size_t mask_size = eng.getSchemeWidth() / 8;
+    assert(mask_size);
+
+    // start with every bit of every entry set
+    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
+        memset(tabIndexToMask(i), 0xff, mask_size);
+    }
+    vector<u8> defaultMask(mask_size, 0xff);
+
+    typedef std::map<u32, ue2::unordered_set<u32> > M2SET;
+
+    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
+        const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
+        const SuffixPositionInString num_positions = eng.getBucketWidth(b);
+        for (SuffixPositionInString pos = 0; pos < num_positions; pos++) {
+            const u32 bit = eng.getSchemeBit(b, pos);
+            M2SET m2;
+            if (getMultiEntriesAtPosition(eng, bucket_lits, lits, pos, m2)) {
+                // unconstrained at this position: clear the bit everywhere
+                clearbit(&defaultMask[0], bit);
+                continue;
+            }
+            for (const auto &entry : m2) {
+                const u32 dc = entry.first;
+                const u32 care = ~dc;
+                // Enumerate every combination of the don't-care bits: v
+                // carries the current combination in its dc positions and
+                // has all other bits forced to one so that the add carries
+                // across them.
+                u32 v = care;
+                do {
+                    const u32 dc_bits = v & dc;
+                    for (const u32 msk : entry.second) {
+                        const u32 val = (msk & care) | dc_bits;
+                        clearbit(tabIndexToMask(val), bit);
+                    }
+                    v = (v + (dc & -dc)) | care;
+                } while (v != care);
+            }
+        }
+    }
+
+    // fold the default mask into every entry
+    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
+        u8 *entry_mask = tabIndexToMask(i);
+        andMask(entry_mask, entry_mask, &defaultMask[0], mask_size);
+    }
+#ifdef DEBUG
+    dumpMasks(&defaultMask[0]);
+#endif
+}
+
+/**
+ * \brief Run the full build: bucket assignment, mask table construction,
+ * then allocation and layout of the FDR structure.
+ */
+aligned_unique_ptr<FDR> FDRCompiler::build(pair<u8 *, size_t> link) {
+    assignStringsToBuckets(); // fills bucketToLits
+    setupTab();               // fills tab from bucketToLits
+    aligned_unique_ptr<FDR> fdr = setupFDR(link);
+    return fdr;
+}
+
+} // namespace
+
+/**
+ * \brief Common implementation behind fdrBuildTable() and
+ * fdrBuildTableHinted().
+ *
+ * If \p stream_control is given, streaming data is built first and passed
+ * on as the link buffer. When grey.fdrAllowTeddy is set, a Teddy build is
+ * attempted and returned if it succeeds. Otherwise an FDR engine description
+ * is chosen (or looked up from \p hint when it is not HINT_INVALID) and an
+ * FDRCompiler builds the result.
+ *
+ * \return the built structure, or nullptr if no engine description is
+ *         available.
+ */
+static
+aligned_unique_ptr<FDR>
+fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
+                      const target_t &target, const Grey &grey, u32 hint,
+                      hwlmStreamingControl *stream_control) {
+    pair<u8 *, size_t> link(nullptr, 0);
+    if (stream_control != nullptr) {
+        link = fdrBuildTableStreaming(lits, stream_control);
+    }
+
+    DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2");
+
+    if (grey.fdrAllowTeddy) {
+        aligned_unique_ptr<FDR> teddy
+            = teddyBuildTableHinted(lits, make_small, hint, target, link);
+        if (!teddy) {
+            DEBUG_PRINTF("build with teddy failed, will try with FDR\n");
+        } else {
+            DEBUG_PRINTF("build with teddy succeeded\n");
+            return teddy;
+        }
+    }
+
+    unique_ptr<FDREngineDescription> des;
+    if (hint == HINT_INVALID) {
+        des = chooseEngine(target, lits, make_small);
+    } else {
+        des = getFdrDescription(hint);
+    }
+
+    if (!des) {
+        return nullptr;
+    }
+
+    FDRCompiler compiler(lits, *des, make_small);
+    return compiler.build(link);
+}
+
+/**
+ * \brief Build an FDR (or Teddy) matcher for \p lits, letting the builder
+ * pick the engine (no hint is supplied).
+ */
+aligned_unique_ptr<FDR> fdrBuildTable(const vector<hwlmLiteral> &lits,
+                                      bool make_small, const target_t &target,
+                                      const Grey &grey,
+                                      hwlmStreamingControl *stream_control) {
+    const u32 no_hint = HINT_INVALID;
+    return fdrBuildTableInternal(lits, make_small, target, grey, no_hint,
+                                 stream_control);
+}
+
+#if !defined(RELEASE_BUILD)
+
+/**
+ * \brief Non-release variant of fdrBuildTable() that forwards an explicit
+ * engine \p hint to the builder.
+ */
+aligned_unique_ptr<FDR>
+fdrBuildTableHinted(const vector<hwlmLiteral> &lits, bool make_small, u32 hint,
+                    const target_t &target, const Grey &grey,
+                    hwlmStreamingControl *stream_control) {
+    // The link buffer is created inside fdrBuildTableInternal(); no local
+    // copy is needed here.
+    return fdrBuildTableInternal(lits, make_small, target, grey, hint,
+                                 stream_control);
+}
+
+#endif
+
+} // namespace ue2
+
+// FIXME: should be compile-time only
+/** \brief Total size recorded in the FDR header (fdr->size). \p fdr must
+ * not be null. */
+size_t fdrSize(const FDR *fdr) {
+    assert(fdr != nullptr);
+    const size_t total = fdr->size;
+    return total;
+}
--- a/src/fdr/fdr_compile.h
+++ b/src/fdr/fdr_compile.h
@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: build API.
+ */
+
+#ifndef FDR_COMPILE_H
+#define FDR_COMPILE_H
+
+#include "ue2common.h"
+#include "util/alloc.h"
+
+#include <vector>
+
+struct FDR;
+
+namespace ue2 {
+
+struct hwlmLiteral;
+struct hwlmStreamingControl;
+struct Grey;
+struct target_t;
+
+/**
+ * \brief Build a literal matcher for \p lits.
+ *
+ * Teddy is tried first when grey.fdrAllowTeddy is set; otherwise, or if that
+ * fails, an FDR engine is chosen for \p target. If \p stream_control is
+ * non-null, streaming data is built and linked into the result.
+ *
+ * \return the built structure, or nullptr if no engine description could be
+ *         obtained.
+ */
+ue2::aligned_unique_ptr<FDR>
+fdrBuildTable(const std::vector<hwlmLiteral> &lits, bool make_small,
+              const target_t &target, const Grey &grey,
+              hwlmStreamingControl *stream_control = nullptr);
+
+#if !defined(RELEASE_BUILD)
+
+/**
+ * \brief As fdrBuildTable(), but with an explicit engine \p hint. Only
+ * available when RELEASE_BUILD is not defined.
+ */
+ue2::aligned_unique_ptr<FDR>
+fdrBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
+                    u32 hint, const target_t &target, const Grey &grey,
+                    hwlmStreamingControl *stream_control = nullptr);
+
+#endif
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr_compile_internal.h
+++ b/src/fdr/fdr_compile_internal.h
@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_COMPILE_INTERNAL_H
+#define FDR_COMPILE_INTERNAL_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm_literal.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+struct FDRConfirm;
+struct LitInfo;
+
+namespace ue2 {
+
+// a pile of decorative typedefs
+// good for documentation purposes more than anything else
+typedef u32 LiteralIndex;
+typedef u32 ConfirmIndex;
+typedef u32 SuffixPositionInString; // zero is last byte, counting back
+                                    // into the string
+typedef u32 BucketIndex;
+typedef u32 SchemeBitIndex;
+typedef u32 PositionInBucket;  // zero is 'we are matching right now!',
+                               // counting towards future matches
+
+class EngineDescription;
+class FDREngineDescription;
+struct hwlmStreamingControl;
+
+// NOTE(review): implementation is not in this header; presumably builds one
+// FDRConfirm for lits, stores it in *fdrc_p and returns its size - confirm.
+size_t getFDRConfirm(const std::vector<hwlmLiteral> &lits, FDRConfirm **fdrc_p,
+                     bool make_small);
+
+// Returns a (buffer, size) pair. The FDR compiler copies the buffer into the
+// final structure and releases it with aligned_free().
+std::pair<u8 *, size_t> setupFullMultiConfs(
+    const std::vector<hwlmLiteral> &lits, const EngineDescription &eng,
+    std::map<BucketIndex, std::vector<LiteralIndex> > &bucketToLits,
+    bool make_small);
+
+// all suffixes include an implicit max_bucket_width suffix to ensure that
+// we always read a full-scale flood "behind" us in terms of what's in our
+// state; if we don't have a flood that's long enough we won't be in the
+// right state yet to allow blindly advancing
+//
+// Returns a (buffer, size) pair; the FDR compiler copies the buffer and
+// releases it with aligned_free().
+std::pair<u8 *, size_t>
+setupFDRFloodControl(const std::vector<hwlmLiteral> &lits,
+                     const EngineDescription &eng);
+
+// Returns a (buffer, size) pair that the FDR compiler appends to the final
+// structure as its "link" data (and frees with aligned_free()).
+std::pair<u8 *, size_t>
+fdrBuildTableStreaming(const std::vector<hwlmLiteral> &lits,
+                       hwlmStreamingControl *stream_control);
+
+// Hint value meaning "no engine hint given": the builder then chooses an
+// engine itself instead of looking one up by hint.
+static constexpr u32 HINT_INVALID = 0xffffffff;
+
+// fdr_compile_util.cpp utilities
+// length of the longest literal (0 for an empty set)
+size_t maxLen(const std::vector<hwlmLiteral> &lits);
+// length of the shortest literal; *count receives how many literals have
+// that length. For an empty set returns (size_t)-1 with *count == 0.
+size_t minLenCount(const std::vector<hwlmLiteral> &lits, size_t *count);
+// |i - j| without unsigned wrap-around
+u32 absdiff(u32 i, u32 j);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr_compile_util.cpp
+++ b/src/fdr/fdr_compile_util.cpp
@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr_compile_internal.h"
+#include "hwlm/hwlm_literal.h"
+
+#include <algorithm>
+#include <vector>
+
+using namespace std;
+
+namespace ue2 {
+
+/** \brief Length of the longest literal in \p lits, or 0 if \p lits is
+ * empty. */
+size_t maxLen(const vector<hwlmLiteral> &lits) {
+    size_t longest = 0;
+    for (vector<hwlmLiteral>::const_iterator it = lits.begin();
+         it != lits.end(); ++it) {
+        const size_t len = it->s.size();
+        if (len > longest) {
+            longest = len;
+        }
+    }
+    return longest;
+}
+
+/**
+ * \brief Length of the shortest literal in \p lits.
+ *
+ * \param count receives the number of literals having that shortest length.
+ * \return the shortest length; (size_t)-1 with *count == 0 if \p lits is
+ *         empty.
+ */
+size_t minLenCount(const vector<hwlmLiteral> &lits, size_t *count) {
+    size_t shortest = (size_t)-1;
+    size_t num_shortest = 0;
+    for (size_t i = 0; i < lits.size(); i++) {
+        const size_t len = lits[i].s.size();
+        if (len > shortest) {
+            continue; // longer than the current minimum: irrelevant
+        }
+        if (len < shortest) {
+            // new minimum: restart the tally
+            shortest = len;
+            num_shortest = 1;
+        } else {
+            num_shortest++;
+        }
+    }
+    *count = num_shortest;
+    return shortest;
+}
+
+/** \brief Absolute difference of two unsigned values, always subtracting
+ * the smaller from the larger so the result never wraps. */
+u32 absdiff(u32 i, u32 j) {
+    if (i > j) {
+        return i - j;
+    }
+    return j - i;
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_confirm.h
+++ b/src/fdr/fdr_confirm.h
@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_CONFIRM_H
+#define FDR_CONFIRM_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm.h"
+
+/**
+ * \brief Multiplicative hash: mask \p lv with \p andmsk, multiply by
+ * \p mult (64-bit wrapping) and keep the top \p nBits bits of the product.
+ */
+static really_inline
+u32 mul_hash_64(u64a lv, u64a andmsk, u64a mult, u32 nBits) {
+    const u64a masked = lv & andmsk;
+    const u64a product = masked * mult;
+    return product >> (sizeof(u64a)*8 - nBits);
+}
+
+// data structures
+// TODO: fix this hard-coding
+#define CONF_TYPE u64a
+#define CONF_HASH_CALL mul_hash_64
+
+/**
+ * \brief Flag values for LitInfo::flags.
+ *
+ * The non-zero values are distinct single bits, so they can be combined.
+ * NOTE(review): per-flag meanings are inferred from the names only; confirm
+ * against the confirm compiler/runtime.
+ */
+typedef enum LitInfoFlags {
+    NoFlags = 0,
+    Caseless = 1,
+    NoRepeat = 2,
+    ComplexConfirm = 4
+} LitInfoFlags;
+
+/**
+ * \brief Structure describing a literal, linked to by FDRConfirm.
+ *
+ * This structure is followed in memory by a variable-sized string prefix at
+ * LitInfo::s, for strings that are longer than CONF_TYPE.
+ *
+ * The field order and types define the in-memory layout; do not reorder.
+ */
+struct LitInfo {
+    CONF_TYPE v; // NOTE(review): presumably the value compared under msk
+    CONF_TYPE msk; // NOTE(review): presumably the compare mask for v
+    hwlm_group_t groups;
+    u32 size;
+    u32 id; // literal ID as passed in
+    u8 flags; /* LitInfoFlags */
+    u8 next;
+    u8 extended_size;
+    u8 s[1]; // literal prefix, which continues "beyond" this struct.
+};
+
+#define FDRC_FLAG_NO_CONFIRM 1
+
+/**
+ * \brief FDR confirm header.
+ *
+ * This structure is followed in memory by:
+ *
+ * -# lit index mapping (array of u32)
+ * -# list of LitInfo structures
+ *
+ * The lit index mapping starts at sizeof(FDRConfirm) rounded up to
+ * alignof(u32); see getConfirmLitIndex().
+ */
+struct FDRConfirm {
+    CONF_TYPE andmsk; // NOTE(review): presumably the andmsk fed to CONF_HASH_CALL
+    CONF_TYPE mult; // NOTE(review): presumably the multiplier fed to CONF_HASH_CALL
+    u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID
+    u32 flags;  // sole meaning is 'non-zero means no-confirm' (that is all)
+    hwlm_group_t groups;
+    u32 soleLitSize;
+    u32 soleLitCmp;
+    u32 soleLitMsk;
+};
+
+/**
+ * \brief Locate the lit index mapping (array of u32) that follows the
+ * FDRConfirm header, at sizeof(*fdrc) rounded up to the alignment of u32.
+ */
+static really_inline
+const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) {
+    const size_t header_size = ROUNDUP_N(sizeof(*fdrc), alignof(u32));
+    const u32 *litIndex = (const u32 *)((const u8 *)fdrc + header_size);
+    assert(ISALIGNED(litIndex));
+    return litIndex;
+}
+
+#endif // FDR_CONFIRM_H
--- a/src/fdr/fdr_confirm_compile.cpp
+++ b/src/fdr/fdr_confirm_compile.cpp
@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_confirm.h"
+#include "engine_description.h"
+#include "teddy_engine_description.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/compare.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cstring>
+#include <set>
+
+using namespace std;
+
+namespace ue2 {
+
+// Value of the top-level confirm split that a literal is filed under.
+typedef u8 ConfSplitType;
+// Key identifying one confirm structure: (bucket, split value).
+typedef pair<BucketIndex, ConfSplitType> BucketSplitPair;
+// Maps each (bucket, split) key to its built FDRConfirm and that structure's
+// size in bytes.
+typedef map<BucketSplitPair, pair<FDRConfirm *, size_t> > BC2CONF;
+
+// Sum, over every literal longer than threshold, of its excess length rounded
+// up to a multiple of 8 bytes.
+static
+size_t thresholdedSize(const vector<hwlmLiteral> &lits, size_t threshold) {
+    size_t excess_bytes = 0;
+    for (vector<hwlmLiteral>::const_iterator it = lits.begin(),
+         ite = lits.end(); it != ite; ++it) {
+        const size_t len = it->s.size();
+        if (len <= threshold) {
+            continue; // fits within the threshold, contributes nothing
+        }
+        excess_bytes += ROUNDUP_N(len - threshold, 8);
+    }
+    return excess_bytes;
+}
+
+/**
+ * Pack a byte vector of at most sizeof(u64a) entries into a u64a so that the
+ * vector's final byte lands in the last byte (in memory order) of the result.
+ * Leading bytes not covered by the vector are left as zero.
+ *
+ * An empty vector yields zero. Throws std::exception if the vector is longer
+ * than sizeof(u64a).
+ */
+static
+u64a make_u64a_mask(const vector<u8> &v) {
+    assert(v.size() <= sizeof(u64a));
+    if (v.size() > sizeof(u64a)) {
+        throw std::exception();
+    }
+
+    u64a mask = 0;
+    if (v.empty()) {
+        // Nothing to copy. Returning here also avoids forming a reference to
+        // element 0 of an empty vector, which is undefined behaviour.
+        return mask;
+    }
+
+    const size_t len = v.size(); // known to be <= sizeof(mask) at this point
+    unsigned char *m = (unsigned char *)&mask;
+    memcpy(m + sizeof(mask) - len, v.data(), len);
+    return mask;
+}
+
+/**
+ * Populate tmpLitInfo with one LitInfo header per entry of lits, index for
+ * index. Only the fixed-size fields are written here; the literal strings are
+ * laid out later, once final positions are known, and lits[] remains the only
+ * source for them.
+ *
+ * On return andmsk holds the bitwise AND of every literal's final mask.
+ */
+static
+void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
+                 CONF_TYPE &andmsk) {
+    const CONF_TYPE all_set = ~(u64a)0;
+    const size_t conf_bytes = sizeof(CONF_TYPE);
+    andmsk = all_set; // narrowed below by each literal's mask
+
+    for (LiteralIndex n = 0; n < lits.size(); n++) {
+        const hwlmLiteral &lit = lits[n];
+        const size_t lit_len = lit.s.size();
+        LitInfo &out = tmpLitInfo[n];
+        memset(&out, 0, sizeof(out));
+
+        out.id = lit.id;
+
+        u8 flag_bits = NoFlags;
+        if (lit.nocase) {
+            flag_bits |= Caseless;
+        }
+        if (lit.noruns) {
+            flag_bits |= NoRepeat;
+        }
+        if (lit.msk.size() > lit_len) {
+            // the supplied mask reaches further back than the string does
+            flag_bits |= ComplexConfirm;
+            out.extended_size = verify_u8(lit.msk.size());
+        }
+        out.flags = flag_bits;
+        out.size = verify_u32(lit_len);
+        out.groups = lit.groups;
+
+        // Byte j counts back from the end of the literal and is placed j
+        // bytes below the top of the word (this assumes a LE machine).
+        CONF_TYPE lit_msk = all_set;
+        CONF_TYPE lit_val = 0;
+        const u32 covered = (u32)std::min(lit_len, conf_bytes);
+
+        // positions backed by a literal character
+        for (u32 j = 0; j < covered; j++) {
+            const u32 shift = (u32)(conf_bytes - j - 1) * 8;
+            const u8 ch = lit.s[lit_len - j - 1];
+            if (lit.nocase && ourisalpha(ch)) {
+                // ignore the case bit and compare against the cleared form
+                lit_msk &= ~((CONF_TYPE)CASE_BIT << shift);
+                lit_val |= (CONF_TYPE)(ch & CASE_CLEAR) << shift;
+            } else {
+                lit_val |= (CONF_TYPE)ch << shift;
+            }
+        }
+
+        // positions beyond the start of a short literal: don't care
+        for (u32 j = covered; j < conf_bytes; j++) {
+            const u32 shift = (u32)(conf_bytes - j - 1) * 8;
+            lit_msk &= ~((CONF_TYPE)0xff << shift);
+        }
+
+        out.v = lit_val;
+        out.msk = lit_msk;
+
+        if (!lit.msk.empty()) {
+            const u64a user_msk = make_u64a_mask(lit.msk);
+            const u64a user_cmp = make_u64a_mask(lit.cmp);
+
+            // where the two masks overlap, both sources must agree on the
+            // value being compared against
+            UNUSED u64a overlap = user_msk & out.msk;
+            assert((out.v & overlap) == (user_cmp & overlap));
+
+            // fold the supplied mask/compare pair into the final words
+            out.msk |= user_msk;
+            out.v |= user_cmp;
+        }
+
+        andmsk &= out.msk;
+    }
+}
+
+//#define FDR_CONFIRM_DUMP 1
+
+/**
+ * Build a single FDRConfirm structure for the given literals.
+ *
+ * The result is allocated with aligned_zmalloc() and handed back through
+ * fdrc_p. Its layout is: the FDRConfirm header, then the lit index array of
+ * (1 << nBits) u32 offsets, then the LitInfo entries grouped by hash value,
+ * each followed by its literal prefix when the literal is longer than
+ * CONF_TYPE.
+ *
+ * applyOneCharOpt: permits the no-confirm form for a lone literal that has an
+ *     empty string and no mask.
+ * make_small: selects the smaller of the two hash-width heuristics.
+ * make_confirm: when false, the no-confirm form is always produced.
+ *
+ * Returns the number of bytes actually used, rounded up to FDRConfirm
+ * alignment.
+ */
+static
+size_t getFDRConfirm(const vector<hwlmLiteral> &lits, FDRConfirm **fdrc_p,
+                     bool applyOneCharOpt, bool make_small, bool make_confirm) {
+    vector<LitInfo> tmpLitInfo(lits.size());
+    CONF_TYPE andmsk;
+    fillLitInfo(lits, tmpLitInfo, andmsk);
+
+#ifdef FDR_CONFIRM_DUMP
+    printf("-------------------\n");
+#endif
+
+    // just magic numbers and crude measures for now
+    u32 nBits;
+    if (make_small) {
+        nBits = min(10U, lg2(lits.size()) + 1);
+    } else {
+        nBits = min(13U, lg2(lits.size()) + 4);
+    }
+
+    CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
+    u32 flags = 0;
+    // we use next three variables for 'confirmless' case to speed-up
+    // confirmation process
+    u32 soleLitSize = 0;
+    u32 soleLitCmp = 0;
+    u32 soleLitMsk = 0;
+
+    if ((applyOneCharOpt && lits.size() == 1 && lits[0].s.size() == 0 &&
+            lits[0].msk.empty()) || make_confirm == false) {
+        flags = FDRC_FLAG_NO_CONFIRM;
+        if (lits[0].noruns) {
+            flags |= NoRepeat; // messy - need to clean this up later as flags is sorta kinda obsoleted
+        }
+        mult = 0;
+        // NOTE(review): this wraps around when the literal string is empty
+        // (the applyOneCharOpt case above) - confirm soleLitSize is never
+        // read for such structures.
+        soleLitSize = lits[0].s.size() - 1;
+        // we can get to this point only in confirmless case;
+        // it means that we have only one literal per FDRConfirm (no packing),
+        // with no literal mask and size of literal is less or equal
+        // to the number of masks of Teddy engine;
+        // maximum number of masks for Teddy is 4, so the size of
+        // literal is definitely less or equal to size of u32
+        assert(lits[0].s.size() <= sizeof(u32));
+        // pack the literal from its last character down, starting at the top
+        // byte of the u32 compare/mask words
+        for (u32 i = 0; i < lits[0].s.size(); i++) {
+            u32 shiftLoc = (sizeof(u32) - i - 1) * 8;
+            u8 c = lits[0].s[lits[0].s.size() - i - 1];
+            if (lits[0].nocase && ourisalpha(c)) {
+                soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc;
+                soleLitMsk |= (u32)CASE_CLEAR << shiftLoc;
+            }
+            else {
+                soleLitCmp |= (u32)c << shiftLoc;
+                soleLitMsk |= (u32)0xff << shiftLoc;
+            }
+        }
+    }
+
+    // we can walk the vector and assign elements from the vectors to a
+    // map by hash value
+    map<u32, vector<LiteralIndex> > res2lits;
+    hwlm_group_t gm = 0; // union of the groups of all literals
+    for (LiteralIndex i = 0; i < lits.size(); i++) {
+        LitInfo & li = tmpLitInfo[i];
+        u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
+        DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
+        res2lits[hash].push_back(i);
+        gm |= li.groups;
+    }
+
+#ifdef FDR_CONFIRM_DUMP
+    // print out the literals reversed - makes it easier to line up analyses
+    // that are end-offset based
+    for (map<u32, vector<LiteralIndex> >::iterator i = res2lits.begin(),
+         e = res2lits.end(); i != e; ++i) {
+        u32 hash = i->first;
+        vector<LiteralIndex> & vlidx = i->second;
+        if (vlidx.size() > 1) {
+            printf("%x -> %zu literals\n", hash, vlidx.size());
+            u32 min_len = lits[vlidx.front()].s.size();
+            vector<set<u8> > vsl; // contains the set of chars at each location
+                                  // reversed from the end
+            vsl.resize(1024);
+            u32 total_string_size = 0;
+            for (vector<LiteralIndex>::iterator i2 = vlidx.begin(),
+                 e2 = vlidx.end(); i2 != e2; ++i2) {
+                LiteralIndex litIdx = *i2;
+                total_string_size += lits[litIdx].s.size();
+                for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) {
+                    vsl[lits[litIdx].s.size()-j].insert(lits[litIdx].s.c_str()[j - 1]);
+                }
+                min_len = MIN(min_len, lits[litIdx].s.size());
+            }
+            printf("common     ");
+            for (u32 j = 0; j < min_len; j++) {
+                if (vsl[j].size() == 1) {
+                    printf("%02x", (u32)*vsl[j].begin());
+                } else {
+                    printf("__");
+                }
+            }
+            printf("\n");
+            for (vector<LiteralIndex>::iterator i2 = vlidx.begin(),
+                 e2 = vlidx.end(); i2 != e2; ++i2) {
+                LiteralIndex litIdx = *i2;
+                printf("%8x  %c", lits[litIdx].id, lits[litIdx].nocase ? '!' : ' ');
+                for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) {
+                    u32 dist_from_end = lits[litIdx].s.size() - j;
+                    if (dist_from_end < min_len && vsl[dist_from_end].size() == 1) {
+                        printf("__");
+                    } else {
+                        printf("%02x", (u32)lits[litIdx].s.c_str()[j-1]);
+                    }
+                }
+                printf("\n");
+            }
+            u32 total_compares = 0;
+            for (u32 j = 0; j < 1024; j++) { // naughty
+                total_compares += vsl[j].size();
+            }
+            printf("Total compare load: %d Total string size: %d\n\n", total_compares, total_string_size);
+        }
+    }
+#endif
+
+    const size_t bitsToLitIndexSize = (1U << nBits) * sizeof(u32);
+    const size_t totalLitSize = thresholdedSize(lits, sizeof(CONF_TYPE));
+
+    // this size can now be a worst-case as we can always be a bit smaller
+    size_t size = ROUNDUP_N(sizeof(FDRConfirm), alignof(u32)) +
+                  ROUNDUP_N(bitsToLitIndexSize, alignof(LitInfo)) +
+                  sizeof(LitInfo) * lits.size() + totalLitSize;
+    size = ROUNDUP_N(size, alignof(FDRConfirm));
+
+    FDRConfirm *fdrc = (FDRConfirm *)aligned_zmalloc(size);
+    assert(fdrc); // otherwise would have thrown std::bad_alloc
+
+    fdrc->andmsk = andmsk;
+    fdrc->mult = mult;
+    fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits;
+    fdrc->flags = flags;
+    fdrc->soleLitSize = soleLitSize;
+    fdrc->soleLitCmp = soleLitCmp;
+    fdrc->soleLitMsk = soleLitMsk;
+
+    fdrc->groups = gm;
+
+    // After the FDRConfirm, we have the lit index array.
+    u8 *fdrc_base = (u8 *)fdrc;
+    u8 *ptr = fdrc_base + sizeof(*fdrc);
+    ptr = ROUNDUP_PTR(ptr, alignof(u32));
+    u32 *bitsToLitIndex = (u32 *)ptr;
+    ptr += bitsToLitIndexSize;
+
+    // After the lit index array, we have the LitInfo structures themselves,
+    // which vary in size (as each may have a variable-length string after it).
+    ptr = ROUNDUP_PTR(ptr, alignof(LitInfo));
+
+    // Walk the map by hash value assigning indexes and laying out the
+    // elements (and their associated string confirm material) in memory.
+    for (std::map<u32, vector<LiteralIndex> >::const_iterator
+             i = res2lits.begin(), e = res2lits.end(); i != e; ++i) {
+        const u32 hash = i->first;
+        const vector<LiteralIndex> &vlidx = i->second;
+        // index entry: offset of this hash's first LitInfo from the header
+        bitsToLitIndex[hash] = verify_u32(ptr - (u8 *)fdrc);
+        for (vector<LiteralIndex>::const_iterator i2 = vlidx.begin(),
+             e2 = vlidx.end(); i2 != e2; ++i2) {
+            LiteralIndex litIdx = *i2;
+
+            // Write LitInfo header.
+            u8 *oldPtr = ptr;
+            LitInfo &finalLI = *(LitInfo *)ptr;
+            finalLI = tmpLitInfo[litIdx];
+
+            ptr += sizeof(LitInfo); // String starts directly after LitInfo.
+
+            // Write literal prefix (everything before the last N characters,
+            // as the last N are already confirmed).
+            const string &t = lits[litIdx].s;
+            if (t.size() > sizeof(CONF_TYPE)) {
+                size_t prefix_len = t.size() - sizeof(CONF_TYPE);
+                memcpy(&finalLI.s[0], t.c_str(), prefix_len);
+                ptr = &finalLI.s[0] + prefix_len;
+            }
+
+            ptr = ROUNDUP_PTR(ptr, alignof(LitInfo));
+            if (i2 + 1 == e2) {
+                finalLI.next = 0x0; // end of this hash value's chain
+            } else {
+                // our next field represents an adjustment on top of
+                // current address + the actual size of the literal
+                // so we track any rounding up done for alignment and
+                // add this in - that way we don't have to use bigger
+                // than a u8 (for now)
+                assert((size_t)(ptr - oldPtr) > t.size());
+                finalLI.next = verify_u8(ptr - oldPtr - t.size());
+            }
+        }
+        assert((size_t)(ptr - fdrc_base) <= size);
+    }
+
+    *fdrc_p = fdrc;
+
+    // Return actual used size, not worst-case size. Must be rounded up to
+    // FDRConfirm alignment so that the caller can lay out a sequence of these.
+    size_t actual_size = ROUNDUP_N((size_t)(ptr - fdrc_base),
+                                   alignof(FDRConfirm));
+    assert(actual_size <= size);
+    return actual_size;
+}
+
+/**
+ * Build one FDRConfirm for every non-empty (bucket, split value) pair and
+ * record it, together with its size, in bc2Conf.
+ *
+ * Within a bucket each literal is filed under the split value derived from
+ * its last character; a caseless alphabetic character is filed under both
+ * its upper- and lower-case forms when the split mask covers the case bit.
+ * When the engine pulls back, the literal (and its mask/compare vectors) is
+ * shortened by the pull-back distance before being handed to getFDRConfirm.
+ *
+ * Returns the sum of the sizes of all structures built.
+ */
+static
+u32 setupMultiConfirms(const vector<hwlmLiteral> &lits,
+                       const EngineDescription &eng, BC2CONF &bc2Conf,
+                       map<BucketIndex, vector<LiteralIndex> > &bucketToLits,
+                       bool make_small) {
+    const u32 pullBack = eng.getConfirmPullBackDistance();
+    const u32 splitMask = eng.getConfirmTopLevelSplit() - 1;
+    const bool splitHasCase = splitMask & 0x20;
+
+    // A Teddy engine gets to say whether these literals need confirming.
+    bool makeConfirm = true;
+    unique_ptr<TeddyEngineDescription> teddyDescr =
+        getTeddyDescription(eng.getID());
+    if (teddyDescr) {
+        makeConfirm = teddyDescr->needConfirm(lits);
+    }
+
+    u32 totalConfirmSize = 0;
+    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
+        const vector<LiteralIndex> &bucket = bucketToLits[b];
+        if (bucket.empty()) {
+            continue;
+        }
+
+        // literals of this bucket, partitioned by split value
+        vector<vector<hwlmLiteral> > vl(eng.getConfirmTopLevelSplit());
+        for (const LiteralIndex &lit_idx : bucket) {
+            hwlmLiteral lit = lits[lit_idx]; // copy
+            // c is last char of this literal
+            u8 c = *(lit.s.rbegin());
+
+            bool suppressSplit = false;
+            if (pullBack) {
+                // make a shorter string to work over if we're pulling back
+                // getFDRConfirm doesn't know about that stuff
+                assert(lit.s.size() >= pullBack);
+                lit.s.resize(lit.s.size() - pullBack);
+
+                u8 c_sub = 0;
+                u8 c_sub_msk = 0;
+                if (!lit.msk.empty()) {
+                    c_sub = *(lit.cmp.rbegin());
+                    c_sub_msk = *(lit.msk.rbegin());
+                    size_t len = lit.msk.size() -
+                                 min(lit.msk.size(), (size_t)pullBack);
+                    lit.msk.resize(len);
+                    lit.cmp.resize(len);
+                }
+
+                // if c_sub_msk is 0xff and lit.nocase
+                // resteer 'c' to an exact value and set suppressSplit
+                if ((c_sub_msk == 0xff) && (lit.nocase)) {
+                    suppressSplit = true;
+                    c = c_sub;
+                }
+            }
+
+            const bool bothCases = !suppressSplit && splitHasCase &&
+                                   lit.nocase && ourisalpha(c);
+            if (bothCases) {
+                vl[(u8)(mytoupper(c) & splitMask)].push_back(lit);
+                vl[(u8)(mytolower(c) & splitMask)].push_back(lit);
+            } else {
+                vl[c & splitMask].push_back(lit);
+            }
+        }
+
+        for (u32 c = 0; c < eng.getConfirmTopLevelSplit(); c++) {
+            if (vl[c].empty()) {
+                continue;
+            }
+            DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size());
+            FDRConfirm *fdrc;
+            size_t size = getFDRConfirm(vl[c], &fdrc,
+                                        eng.typicallyHoldsOneCharLits(),
+                                        make_small, makeConfirm);
+            const BucketSplitPair key = make_pair(b, c);
+            bc2Conf[key] = make_pair(fdrc, size);
+            totalConfirmSize += size;
+        }
+    }
+    return totalConfirmSize;
+}
+
+/**
+ * Build every confirm structure for the engine and pack them into a single
+ * aligned_zmalloc()'d buffer.
+ *
+ * The buffer begins with a table of (top-level split * number of buckets)
+ * u32 offsets, indexed by split value * nBuckets + bucket; each populated
+ * entry holds the offset from the start of the buffer of the matching
+ * FDRConfirm. The structures themselves follow the table. The individual
+ * FDRConfirm allocations are freed once copied.
+ *
+ * Returns the buffer and its total size in bytes.
+ */
+pair<u8 *, size_t> setupFullMultiConfs(const vector<hwlmLiteral> &lits,
+        const EngineDescription &eng,
+        map<BucketIndex, vector<LiteralIndex> > &bucketToLits,
+        bool make_small) {
+    BC2CONF bc2Conf;
+    const u32 totalConfirmSize =
+        setupMultiConfirms(lits, eng, bc2Conf, bucketToLits, make_small);
+
+    const u32 primarySwitch = eng.getConfirmTopLevelSplit();
+    const u32 nBuckets = eng.getNumBuckets();
+    const u32 totalConfSwitchSize = primarySwitch * nBuckets * sizeof(u32);
+    const u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize);
+
+    u8 *buf = (u8 *)aligned_zmalloc(totalSize);
+    assert(buf); // otherwise would have thrown std::bad_alloc
+
+    u32 *confBase = (u32 *)buf;
+    u8 *ptr = buf + totalConfSwitchSize; // first structure follows the table
+
+    for (const auto &entry : bc2Conf) {
+        FDRConfirm *fdrc = entry.second.first;
+        const size_t fdrc_size = entry.second.second;
+        const BucketIndex b = entry.first.first;
+        const u8 c = entry.first.second;
+
+        // confirm offset is relative to the base of this structure, now
+        const u32 confirm_offset = verify_u32(ptr - (u8 *)buf);
+        memcpy(ptr, fdrc, fdrc_size);
+        ptr += fdrc_size;
+        aligned_free(fdrc);
+
+        confBase[c * nBuckets + b] = confirm_offset;
+    }
+    return make_pair(buf, totalSize);
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_confirm_runtime.h
+++ b/src/fdr/fdr_confirm_runtime.h
@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_CONFIRM_RUNTIME_H
+#define FDR_CONFIRM_RUNTIME_H
+
+#include "fdr_internal.h"
+#include "fdr_loadval.h"
+#include "hwlm/hwlm.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/compare.h"
+
+// Loaders used to fetch the CONF_TYPE-sized confirm value from the scan
+// buffer; the _CAUTIOUS variant is the one used when the load would start
+// before the beginning of the buffer.
+#define CONF_LOADVAL_CALL lv_u64a
+#define CONF_LOADVAL_CALL_CAUTIOUS lv_u64a_ce
+
+// this is ordinary confirmation function which runs through
+// the whole confirmation procedure
+//
+// It loads the bytes ending at position i (less pullBackAmount), hashes them
+// to pick a chain of LitInfo entries and walks that chain. For every entry
+// that passes all checks it records the literal ID in *last_match and invokes
+// the match callback, storing the callback's return value in *control.
+static really_inline
+void confWithBit(const struct FDRConfirm * fdrc,
+                 const struct FDR_Runtime_Args * a,
+                 size_t i,
+                 CautionReason r,
+                 u32 pullBackAmount,
+                 hwlmcb_rv_t *control,
+                 u32 * last_match) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    const u8 * buf = a->buf;
+    const size_t len = a->len;
+
+    CONF_TYPE v;
+    // start of the 8-byte window whose last byte is at i - pullBackAmount
+    const u8 * confirm_loc = buf + i - pullBackAmount - 7;
+    if (likely(r == NOT_CAUTIOUS || confirm_loc >= buf)) {
+        v = CONF_LOADVAL_CALL(confirm_loc, buf, buf + len);
+    } else { // r == VECTORING, confirm_loc < buf
+        u64a histBytes = a->histBytes;
+        v = CONF_LOADVAL_CALL_CAUTIOUS(confirm_loc, buf, buf + len);
+        // stitch together v (which doesn't move) and history (which does)
+        u32 overhang = buf - confirm_loc;
+        histBytes >>= 64 - (overhang * 8);
+        v |= histBytes;
+    }
+
+    u32 c = CONF_HASH_CALL(v, fdrc->andmsk, fdrc->mult, fdrc->nBitsOrSoleID);
+    u32 start = getConfirmLitIndex(fdrc)[c];
+    if (P0(start)) {
+        // start is the offset of the chain's first LitInfo from fdrc
+        const struct LitInfo *l =
+            (const struct LitInfo *)((const u8 *)fdrc + start);
+
+        u8 oldNext; // initialized in loop
+        do {
+            assert(ISALIGNED(l));
+
+            // cheap check on the trailing bytes first
+            if (P0( (v & l->msk) != l->v)) {
+                goto out;
+            }
+
+            // NoRepeat literals are not reported twice in a row
+            if ((*last_match == l->id) && (l->flags & NoRepeat)) {
+                goto out;
+            }
+
+            // where the literal would have to start for it to end here
+            const u8 * loc = buf + i - l->size + 1 - pullBackAmount;
+
+            u8 caseless = l->flags & Caseless;
+            if (loc < buf) {
+                // literal starts before the buffer: part of it is in history
+                u32 full_overhang = buf - loc;
+
+                const u8 * history = (caseless) ?
+                                      a->buf_history_nocase : a->buf_history;
+                size_t len_history = (caseless) ?
+                                      a->len_history_nocase : a->len_history;
+
+                // can't do a vectored confirm either if we don't have
+                // the bytes
+                if (full_overhang > len_history) {
+                    goto out;
+                }
+
+                // as for the regular case, no need to do a full confirm if
+                // we're a short literal
+                if (unlikely(l->size > sizeof(CONF_TYPE))) {
+                    // s1/loc1: the part of the prefix that lies in history;
+                    // s2/loc2: the remainder, at the start of the buffer
+                    const u8 * s1 = l->s;
+                    const u8 * s2 = s1 + full_overhang;
+                    const u8 * loc1 = history + len_history - full_overhang;
+                    const u8 * loc2 = buf;
+                    size_t size1 = MIN(full_overhang,
+                                       l->size - sizeof(CONF_TYPE));
+                    size_t wind_size2_back = sizeof(CONF_TYPE) +
+                                             full_overhang;
+                    size_t size2 = wind_size2_back > l->size ?
+                                   0 : l->size - wind_size2_back;
+
+                    if (cmpForward(loc1, s1, size1, caseless)) {
+                        goto out;
+                    }
+                    if (cmpForward(loc2, s2, size2, caseless)) {
+                        goto out;
+                    }
+                }
+            } else { // NON-VECTORING PATH
+
+                // if string < conf_type we don't need regular string cmp
+                if (unlikely(l->size > sizeof(CONF_TYPE))) {
+                    if (cmpForward(loc, l->s, l->size - sizeof(CONF_TYPE), caseless)) {
+                        goto out;
+                    }
+                }
+            }
+
+            // skip literals none of whose groups are set in the control word
+            if (P0(!(l->groups & *control))) {
+                goto out;
+            }
+
+            if (unlikely(l->flags & ComplexConfirm)) {
+                // the mask reaches back extended_size bytes; make sure that
+                // many bytes exist once history is taken into account
+                const u8 * loc2 = buf + i - l->extended_size + 1 - pullBackAmount;
+                if (loc2 < buf) {
+                    u32 full_overhang = buf - loc2;
+                    size_t len_history = (caseless) ?
+                                          a->len_history_nocase : a->len_history;
+                    if (full_overhang > len_history) {
+                        goto out;
+                    }
+                }
+            }
+
+            *last_match = l->id;
+            *control = a->cb(loc - buf, i, l->id, a->ctxt);
+out:
+            // step to the next entry of the chain (if any)
+            oldNext = l->next; // oldNext is either 0 or an 'adjust' value
+            l = (const struct LitInfo*)((const u8 *)l + oldNext + l->size);
+        } while (oldNext);
+    }
+}
+
+// Light-weight confirm used by 1-mask Teddy.
+// A structure with a non-zero multiplier goes through the full confWithBit
+// procedure; otherwise ('confirmless' case) the sole literal is reported
+// straight to the callback, subject only to the NoRepeat check.
+static really_inline
+void confWithBit1(const struct FDRConfirm * fdrc,
+                  const struct FDR_Runtime_Args * a,
+                  size_t i,
+                  CautionReason r,
+                  hwlmcb_rv_t *control,
+                  u32 * last_match) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    if (unlikely(fdrc->mult)) {
+        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        return;
+    }
+
+    const u32 sole_id = fdrc->nBitsOrSoleID;
+    if ((fdrc->flags & NoRepeat) && (*last_match == sole_id)) {
+        return; // same literal as the previous match, and repeats are off
+    }
+
+    *last_match = sole_id;
+    *control = a->cb(i, i, sole_id, a->ctxt);
+}
+
+// Light-weight confirm used by 2-3-4-mask Teddy.
+// A structure with a non-zero multiplier goes through the full confWithBit
+// procedure. Otherwise ('confirmless' case) the sole literal is reported
+// directly; only when vectoring near the start of the scanned region is a
+// 32-bit compare of the trailing bytes performed first.
+static really_inline
+void confWithBitMany(const struct FDRConfirm * fdrc,
+                     const struct FDR_Runtime_Args * a,
+                     size_t i,
+                     CautionReason r,
+                     hwlmcb_rv_t *control,
+                     u32 * last_match) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    if (i < a->start_offset) {
+        return;
+    }
+
+    if (unlikely(fdrc->mult)) {
+        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        return;
+    }
+
+    const u32 id = fdrc->nBitsOrSoleID;
+    const u32 len = fdrc->soleLitSize;
+
+    if ((*last_match == id) && (fdrc->flags & NoRepeat)) {
+        return;
+    }
+
+    if (r == VECTORING && len > i - a->start_offset) {
+        if (len > (i + a->len_history)) {
+            return; // not enough bytes, even counting history
+        }
+
+        // number of preceding bytes that can be taken from the buffer itself
+        const u32 from_buf = (len <= i) ? len : (u32)i;
+
+        // assemble the bytes ending at i, last byte in the top position
+        u32 cmp = (u32)a->buf[i] << 24;
+        for (u32 j = 1; j <= from_buf; j++) {
+            cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
+        }
+        if (len > i) {
+            // the remaining bytes come from the history word
+            cmp |= (u32)(a->histBytes >> (40 + i * 8));
+        }
+
+        if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) {
+            return;
+        }
+    }
+
+    *last_match = id;
+    *control = a->cb(i - len, i, id, a->ctxt);
+}
+
+#endif
--- a/src/fdr/fdr_dump.cpp
+++ b/src/fdr/fdr_dump.cpp
@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_dump.h"
+#include "fdr_engine_description.h"
+#include "teddy_engine_description.h"
+#include "ue2common.h"
+
+#include <cstdio>
+#include <memory>
+
+#ifndef DUMP_SUPPORT
+#error No dump support!
+#endif
+
+using std::unique_ptr;
+
+namespace ue2 {
+
+/**
+ * Report whether the engine is a Teddy engine, judged by the absence of an
+ * FDR engine description for its engine ID.
+ */
+static
+bool fdrIsTeddy(const FDR *fdr) {
+    assert(fdr);
+    const u32 engine_id = fdr->engineID;
+
+    /* teddys don't have an fdr engine description (which is why the dump code
+     * is so broken). */
+    unique_ptr<FDREngineDescription> fdr_des = getFdrDescription(engine_id);
+    return !fdr_des;
+}
+
+/**
+ * Print a short summary of the engine to f: the engine kind and ID, the
+ * engine description's parameters when one is available, then the size,
+ * maximum string length and flood offset taken from the FDR structure.
+ */
+void fdrPrintStats(const FDR *fdr, FILE *f) {
+    if (fdrIsTeddy(fdr)) {
+        fprintf(f, "TEDDY:         %u\n", fdr->engineID);
+
+        unique_ptr<TeddyEngineDescription> teddy =
+            getTeddyDescription(fdr->engineID);
+        if (!teddy) {
+            fprintf(f, "   <unknown engine>\n");
+        } else {
+            fprintf(f, "    masks      %u\n", teddy->numMasks);
+            fprintf(f, "    buckets    %u\n", teddy->getNumBuckets());
+            fprintf(f, "    packed     %s\n", teddy->packed ? "true" : "false");
+        }
+    } else {
+        fprintf(f, "FDR:           %u\n", fdr->engineID);
+
+        unique_ptr<FDREngineDescription> fdr_des =
+            getFdrDescription(fdr->engineID);
+        if (!fdr_des) {
+            fprintf(f, "   <unknown engine>\n");
+        } else {
+            fprintf(f, "    stride     %u\n", fdr_des->stride);
+            fprintf(f, "    buckets    %u\n", fdr_des->getNumBuckets());
+            fprintf(f, "    width      %u\n", fdr_des->schemeWidth);
+        }
+    }
+
+    // fields common to both engine kinds
+    fprintf(f, "    strings    ???\n");
+    fprintf(f, "    size       %zu bytes\n", fdrSize(fdr));
+    fprintf(f, "    max length %u\n", fdr->maxStringLen);
+    fprintf(f, "    floodoff   %u (%x)\n", fdr->floodOffset, fdr->floodOffset);
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_dump.h
+++ b/src/fdr/fdr_dump.h
@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: dump API.
+ */
+
+#ifndef FDR_DUMP_H
+#define FDR_DUMP_H
+
+#if defined(DUMP_SUPPORT)
+
+#include <cstdio>
+
+struct FDR;
+
+namespace ue2 {
+
+/** \brief Print a textual summary of the given FDR/Teddy structure (engine
+ * kind and ID, engine parameters, size, max string length, flood offset) to
+ * the stream \p f. Only available when DUMP_SUPPORT is defined. */
+void fdrPrintStats(const struct FDR *fdr, FILE *f);
+
+} // namespace ue2
+
+#endif // DUMP_SUPPORT
+#endif // FDR_DUMP_H
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr_compile_internal.h"
+#include "fdr_engine_description.h"
+#include "hs_compile.h"
+#include "util/target_info.h"
+#include "util/compare.h" // for ourisalpha()
+#include "util/make_unique.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <map>
+#include <string>
+
+using namespace std;
+
+namespace ue2 {
+
+#include "fdr_autogen_compiler.cpp"
+
+/** \brief Builds an engine description from a definition record: the common
+ * fields are forwarded to EngineDescription (with the CPU feature mask turned
+ * into a target via targetByArchFeatures()); the FDR-specific fields are
+ * copied across. */
+FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
+    : EngineDescription(def.id,
+                        targetByArchFeatures(def.cpu_features),
+                        def.numBuckets,
+                        def.confirmPullBackDistance,
+                        def.confirmTopLevelSplit),
+      schemeWidth(def.schemeWidth),
+      stride(def.stride),
+      bits(def.bits) {
+}
+
+/** \brief Default flood suffix length: ceil(scheme width / buckets) + 1. */
+u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
+    const u32 width = getSchemeWidth();
+    const u32 buckets = getNumBuckets();
+
+    // Round up, so that scheme width 32 with 6 buckets gives 6, not 5.
+    const u32 perBucket = (width + buckets - 1) / buckets;
+
+    // The extra one avoids pain due to various reach choices.
+    return perBucket + 1;
+}
+
+/** \brief Heuristically picks the stride we would like an engine to use.
+ *
+ * \param num_lits number of literals in the set
+ * \param min_len length of the shortest literal
+ * \param min_len_count number of literals having that shortest length
+ * \return the desired stride; 1 is the safe fallback.
+ */
+static
+u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
+    u32 stride = 1; // always our safe fallback
+
+    if (min_len > 1 && num_lits < 5000) {
+        if (num_lits < 250) {
+            // small cases: just use the full minimum length
+            stride = min_len;
+        } else if (num_lits < 800) {
+            // intermediate cases: back off by one
+            stride = min_len - 1;
+        } else {
+            // larger but not huge sets: stride 2 only if min length is >= 3
+            stride = MIN(min_len - 1, 2);
+        }
+    }
+
+    // patch if count is quite large - a ton of length 2 literals can
+    // break things
+#ifdef TRY_THIS_LATER
+    if ((min_len == 2) && (stride == 2) && (min_len_count > 20)) {
+        stride = 1;
+    }
+#endif
+
+    // The stride 4 case is far more fragile: even a few length 4 literals can
+    // break things, so don't let min_len=4 with stride=4 through.
+    const bool fragileStride4 =
+        (min_len == 4) && (stride == 4) && (min_len_count > 2);
+    if (fragileStride4) {
+        stride = 2;
+    }
+
+    return stride;
+}
+
+/** \brief Scores every FDR engine that is valid on \p target and whose stride
+ * does not exceed the shortest literal length, and returns a copy of the
+ * best-scoring one.
+ *
+ * The score starts at 100 and is penalised by the distance between the
+ * engine's stride and the desired stride, and by the distance between the
+ * engine's table bits and an "ideal" bit count derived from the literal count,
+ * \p make_small and the target class.
+ *
+ * \return the chosen engine description, or nullptr if no engine qualifies.
+ */
+unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
+                                              const vector<hwlmLiteral> &vl,
+                                              bool make_small) {
+    vector<FDREngineDescription> candidates;
+    getFdrDescriptions(&candidates);
+
+    // find desired stride
+    size_t count;
+    size_t msl = minLenCount(vl, &count);
+    u32 desiredStride = findDesiredStride(vl.size(), msl, count);
+
+    DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
+                 desiredStride);
+
+    const u32 effLits = vl.size();
+
+    const FDREngineDescription *winner = nullptr;
+    u32 winner_score = 0;
+
+    for (const FDREngineDescription &eng : candidates) {
+        // Skip engines we cannot run, or whose stride is longer than our
+        // shortest literal.
+        if (!eng.isValidOnTarget(target) || msl < eng.stride) {
+            continue;
+        }
+
+        u32 score = 100;
+        score -= absdiff(desiredStride, eng.stride);
+        if (eng.stride <= desiredStride) {
+            score += eng.stride;
+        }
+
+        // Base "ideal" table bits from the number of literals.
+        u32 ideal;
+        if (effLits < eng.getNumBuckets()) {
+            ideal = (eng.stride == 1) ? 8 : 10;
+        } else if (effLits < 20) {
+            ideal = 10;
+        } else if (effLits < 100) {
+            ideal = 11;
+        } else if (effLits < 1000) {
+            ideal = 12;
+        } else if (effLits < 10000) {
+            ideal = 13;
+        } else {
+            ideal = 15;
+        }
+
+        // Adjustments; the first one depends on the unadjusted base value.
+        if (ideal != 8 && eng.schemeWidth == 32) {
+            ideal += 1;
+        }
+        if (make_small) {
+            ideal -= 2;
+        }
+        if (eng.stride > 1) {
+            ideal++;
+        }
+
+        DEBUG_PRINTF("effLits %u\n", effLits);
+
+        if (target.is_atom_class() && !make_small && effLits < 4000) {
+            /* Unless it is a very heavy case, we want to build smaller tables
+             * on lightweight machines due to their small caches. */
+            ideal -= 2;
+        }
+
+        score -= absdiff(ideal, eng.bits);
+
+        DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
+                     "-> score=%u\n",
+                     eng.getID(), eng.schemeWidth, eng.bits,
+                     eng.getNumBuckets(), eng.stride, score);
+
+        // Strictly-greater comparison: the earliest engine wins ties.
+        if (!winner || score > winner_score) {
+            winner = &eng;
+            winner_score = score;
+        }
+    }
+
+    if (!winner) {
+        DEBUG_PRINTF("failed to find engine\n");
+        return nullptr;
+    }
+
+    DEBUG_PRINTF("using engine %u\n", winner->getID());
+    return ue2::make_unique<FDREngineDescription>(*winner);
+}
+
+/** \brief Maps position \p p within bucket \p b to its scheme bit index;
+ * positions are interleaved across buckets (index = p * numBuckets + b). */
+SchemeBitIndex FDREngineDescription::getSchemeBit(BucketIndex b,
+                                                  PositionInBucket p) const {
+    assert(p < getBucketWidth(b));
+
+    const SchemeBitIndex bit = p * getNumBuckets() + b;
+
+    assert(bit < getSchemeWidth());
+    return bit;
+}
+
+/** \brief Width of a bucket: the scheme width split evenly over all buckets
+ * (the bucket index itself is ignored). */
+u32 FDREngineDescription::getBucketWidth(BucketIndex) const {
+    const u32 width = getSchemeWidth();
+    const u32 buckets = getNumBuckets();
+
+    // The scheme width must divide evenly between the buckets.
+    assert(width % buckets == 0);
+    return width / buckets;
+}
+
+/** \brief Returns a copy of the FDR engine description stored at index
+ * \p engineID in the list produced by getFdrDescriptions(), or nullptr when
+ * the ID is out of range. */
+unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID) {
+    vector<FDREngineDescription> descs;
+    getFdrDescriptions(&descs);
+
+    if (engineID < descs.size()) {
+        return ue2::make_unique<FDREngineDescription>(descs[engineID]);
+    }
+
+    return nullptr;
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_engine_description.h
+++ b/src/fdr/fdr_engine_description.h
@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_ENGINE_DESCRIPTION_H
+#define FDR_ENGINE_DESCRIPTION_H
+
+#include "engine_description.h"
+#include "util/ue2_containers.h"
+
+#include <map>
+#include <memory>
+#include <vector>
+
+namespace ue2 {
+
+/** \brief Plain record describing one FDR engine variant; consumed by the
+ * FDREngineDescription constructor. */
+struct FDREngineDef {
+    u32 id; //!< engine ID
+    u32 schemeWidth; //!< copied to FDREngineDescription::schemeWidth
+    u32 numBuckets; //!< number of buckets
+    u32 stride; //!< copied to FDREngineDescription::stride
+    u32 bits; //!< copied to FDREngineDescription::bits
+    u64a cpu_features; //!< feature mask, passed to targetByArchFeatures()
+    u32 confirmPullBackDistance; //!< forwarded to EngineDescription
+    u32 confirmTopLevelSplit; //!< forwarded to EngineDescription
+};
+
+/** \brief Description of a single FDR engine variant, constructed from an
+ * FDREngineDef. */
+class FDREngineDescription : public EngineDescription {
+public:
+    u32 schemeWidth; //!< scheme width in bits (see getTabSizeBytes())
+    u32 stride; //!< engine stride
+    u32 bits; //!< log2 of the number of table entries
+
+    u32 getSchemeWidth() const { return schemeWidth; }
+
+    /** \brief Width of bucket \p b: scheme width divided by bucket count. */
+    u32 getBucketWidth(BucketIndex b) const;
+
+    /** \brief Scheme bit index for position \p p in bucket \p b. */
+    SchemeBitIndex getSchemeBit(BucketIndex b, PositionInBucket p) const;
+
+    /** \brief Number of table entries, 2^bits. The shift is done on an
+     * unsigned operand so that large bit counts cannot overflow a signed
+     * int. */
+    u32 getNumTableEntries() const { return 1U << bits; }
+
+    /** \brief Table size in bytes: (schemeWidth / 8) bytes per entry. */
+    u32 getTabSizeBytes() const {
+        return schemeWidth / 8 * getNumTableEntries();
+    }
+
+    explicit FDREngineDescription(const FDREngineDef &def);
+
+    u32 getDefaultFloodSuffixLength() const override;
+
+    /** \brief True only for stride-1 engines. */
+    bool typicallyHoldsOneCharLits() const override { return stride == 1; }
+};
+
+/** \brief Choose the best-scoring FDR engine for the literal set \p vl that is
+ * valid on \p target; \p make_small biases the choice towards smaller tables.
+ * Returns nullptr if no engine qualifies. */
+std::unique_ptr<FDREngineDescription>
+chooseEngine(const target_t &target, const std::vector<hwlmLiteral> &vl,
+             bool make_small);
+
+/** \brief Returns a copy of the description for \p engineID, or nullptr if the
+ * ID is not a valid index into the list from getFdrDescriptions(). */
+std::unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID);
+
+/** \brief Fills \p out with the available FDR engine descriptions.
+ * NOTE(review): the definition is not in this header; callers index the
+ * result by engine ID -- confirm the list is ordered that way. */
+void getFdrDescriptions(std::vector<FDREngineDescription> *out);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr_internal.h
+++ b/src/fdr/fdr_internal.h
@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: data structures.
+ */
+
+#ifndef FDR_INTERNAL_H
+#define FDR_INTERNAL_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm.h" // for hwlm_group_t, HWLMCallback
+
+/** \brief Caution classification.
+ * NOTE(review): the consumers of this enum are not visible in this header;
+ * confirm the intended semantics against the FDR runtime. */
+typedef enum {
+    NOT_CAUTIOUS, //!< not near a boundary (quantify?)
+    VECTORING     //!< potentially vectoring
+} CautionReason;
+
+/** \brief number of different ids that can be triggered by floods of any given
+ * character. */
+#define FDR_FLOOD_MAX_IDS 16
+
+/** \brief Flood record: up to FDR_FLOOD_MAX_IDS literal ids, each with its
+ * groups and length, plus the OR of all of those groups.
+ * NOTE(review): presumably one record per flood character -- confirm. */
+struct FDRFlood {
+    hwlm_group_t allGroups; //!< all the groups or'd together
+    u32 suffix; //!< NOTE(review): presumably the flood suffix length -- confirm
+
+    /** \brief 0 to FDR_FLOOD_MAX_IDS-1 ids that are generated once per char on
+     * a flood.
+     * If larger we won't handle this through the flood path at all. */
+    u16 idCount;
+
+    u32 ids[FDR_FLOOD_MAX_IDS]; //!< the ids
+    hwlm_group_t groups[FDR_FLOOD_MAX_IDS]; //!< group ids to go with string ids
+    u32 len[FDR_FLOOD_MAX_IDS]; //!< lengths to go with the string ids
+};
+
+/** \brief FDR structure.
+ *
+ * 1. struct as-is
+ * 2. primary matching table
+ * 3. confirm stuff
+ *
+ * The field order and padding define the in-memory layout, so fields must not
+ * be reordered.
+ */
+struct FDR {
+    u32 engineID; //!< ID of the engine (FDR or Teddy) this structure is for
+    u32 size; //!< NOTE(review): presumably total size in bytes -- confirm
+    u32 maxStringLen; //!< maximum string length
+    u32 floodOffset; //!< NOTE(review): presumably offset of flood data -- confirm
+
+    /** link is the relative offset of a secondary included FDR table for
+     * stream handling if we're a primary FDR table or the subsidiary tertiary
+     * structures (spillover strings and hash table) if we're a secondary
+     * structure. */
+    u32 link;
+    u32 pad1; //!< padding
+    u32 pad2; //!< padding
+    u32 pad3; //!< padding
+
+    /* NOTE(review): presumably the engine's initial state, viewed at whichever
+     * width the engine uses -- confirm. */
+    union {
+        u32 s_u32;
+        u64a s_u64a;
+        m128 s_m128;
+    } start;
+};
+
+/** \brief FDR runtime arguments.
+ *
+ * This structure handles read-only things that are passed extensively around
+ * the FDR run-time functions. They are set by the API, passed by value into
+ * the main function, then a pointer is passed around to all the various
+ * sub-functions (confirm & flood).
+ *
+ * NOTE(review): the per-field notes below are inferred from the field names;
+ * confirm against the runtime code that fills this structure in. */
+struct FDR_Runtime_Args {
+    const u8 *buf; //!< presumably the buffer being scanned
+    size_t len; //!< presumably the length of buf
+    const u8 *buf_history; //!< presumably the history buffer
+    size_t len_history; //!< presumably the length of buf_history
+    const u8 *buf_history_nocase; //!< presumably a caseless copy of the history
+    size_t len_history_nocase; //!< presumably the length of buf_history_nocase
+    size_t start_offset;
+    HWLMCallback cb; //!< callback (HWLMCallback type from hwlm/hwlm.h)
+    void *ctxt; //!< presumably the opaque context handed to cb
+    hwlm_group_t *groups;
+    const u8 *firstFloodDetect;
+    const u64a histBytes;
+};
+
+#endif
--- a/src/fdr/fdr_loadval.h
+++ b/src/fdr/fdr_loadval.h
@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_LOADVAL_H
+#define FDR_LOADVAL_H
+
+#include "fdr_internal.h"
+#include "ue2common.h"
+#include "util/unaligned.h"
+#include "util/simd_utils.h"
+
+// Declares a load function `name` returning `type`. Every load function takes
+// the pointer to read from plus the [lo, hi) bounds of the readable region.
+#define MAKE_LOADVAL(type, name) \
+    static really_inline type name (const u8 * ptr, UNUSED const u8 * lo, UNUSED const u8 * hi)
+
+// Assertions on the bounds contract of each load flavour.
+#define NORMAL_SAFE(type)            assert(ptr >= lo && (ptr + sizeof(type) - 1) < hi)
+#define ALIGNED_SAFE(type)           NORMAL_SAFE(type); assert(((size_t)ptr % sizeof(type)) == 0);
+// these ones need asserts to test the property that we're not handling dynamically
+#define CAUTIOUS_FORWARD_SAFE(type)  assert(ptr >= lo)
+#define CAUTIOUS_BACKWARD_SAFE(type) assert((ptr + sizeof(type) - 1) < hi)
+
+// Per-byte bounds checks used inside MAKE_LOOP (they reference the loop index
+// `i`). Each expands to a single fully parenthesized expression so that it can
+// be combined safely with other operators at the expansion site.
+#define CF_INDEX_CHECK     (ptr + i < hi)
+#define CB_INDEX_CHECK     (lo <= ptr + i)
+#define CE_INDEX_CHECK     ((lo <= ptr + i) && (ptr + i < hi))
+
+// Function body that assembles a TYPE value one byte at a time: byte i is
+// included only when COND holds and is shifted into place by SHIFT_FIDDLE
+// bytes; bytes failing COND contribute zero.
+#define MAKE_LOOP(TYPE, COND, SHIFT_FIDDLE)                                    \
+    TYPE v = 0;                                                                \
+    for (TYPE i = 0; i < sizeof(TYPE); i++) {                                  \
+        if (COND) {                                                            \
+            v += (TYPE)ptr[i] << ((SHIFT_FIDDLE)*8);                           \
+        }                                                                      \
+    }                                                                          \
+    return v;
+
+// Big-endian assembly: byte 0 lands in the most significant position.
+#define MAKE_LOOP_BE(TYPE, COND) \
+    MAKE_LOOP(TYPE, COND, sizeof(TYPE)-i-1)
+
+// Little-endian assembly: byte 0 lands in the least significant position.
+#define MAKE_LOOP_LE(TYPE, COND) \
+    MAKE_LOOP(TYPE, COND, i)
+
+
+#define MAKE_LOOP_BE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE);  MAKE_LOOP_BE(TYPE, CF_INDEX_CHECK)
+#define MAKE_LOOP_BE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_BE(TYPE, CB_INDEX_CHECK)
+#define MAKE_LOOP_BE_CE(TYPE)                               MAKE_LOOP_BE(TYPE, CE_INDEX_CHECK)
+#define MAKE_LOOP_LE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE);  MAKE_LOOP_LE(TYPE, CF_INDEX_CHECK)
+#define MAKE_LOOP_LE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_LE(TYPE, CB_INDEX_CHECK)
+#define MAKE_LOOP_LE_CE(TYPE)                               MAKE_LOOP_LE(TYPE, CE_INDEX_CHECK)
+
+// no suffix = normal (unaligned)
+// _a        = aligned
+// _cf       = cautious forwards, base is always in bounds, but may read over the end of the buffer (test against hi)
+// _cb       = cautious backwards, final byte is always in bounds, but may read over the start of the buffer (test against lo)
+// _ce       = cautious everywhere (in both directions); test against hi and lo
+
+// u8 loadvals
+// Plain byte load; ptr must lie within [lo, hi).
+MAKE_LOADVAL(u8, lv_u8) {
+    NORMAL_SAFE(u8);
+    return ptr[0];
+}
+
+// Cautious-forward byte load: yields 0 when ptr is at or beyond hi.
+MAKE_LOADVAL(u8, lv_u8_cf) {
+    CAUTIOUS_FORWARD_SAFE(u8);
+    if (ptr >= hi) {
+        return 0;
+    }
+    return *ptr;
+}
+
+// Cautious-backward byte load: yields 0 when ptr is before lo.
+MAKE_LOADVAL(u8, lv_u8_cb) {
+    CAUTIOUS_BACKWARD_SAFE(u8);
+    if (ptr < lo) {
+        return 0;
+    }
+    return *ptr;
+}
+
+// Cautious-everywhere byte load: yields 0 unless ptr lies within [lo, hi).
+MAKE_LOADVAL(u8, lv_u8_ce) {
+    if ((ptr < lo) || (ptr >= hi)) {
+        return 0;
+    }
+    return *ptr;
+}
+
+// Unaligned 16-bit load; both bytes must lie within [lo, hi).
+MAKE_LOADVAL(u16, lv_u16) {
+    NORMAL_SAFE(u16);
+    const u16 val = unaligned_load_u16(ptr);
+    return val;
+}
+
+// Aligned 16-bit load; ptr must be in bounds and 2-byte aligned.
+MAKE_LOADVAL(u16, lv_u16_a) {
+    ALIGNED_SAFE(u16);
+    const u16 *src = (const u16 *)ptr;
+    return *src;
+}
+
+// Unaligned 32-bit load; all four bytes must lie within [lo, hi).
+MAKE_LOADVAL(u32, lv_u32) {
+    NORMAL_SAFE(u32);
+    const u32 val = unaligned_load_u32(ptr);
+    return val;
+}
+
+// Aligned 32-bit load; ptr must be in bounds and 4-byte aligned.
+MAKE_LOADVAL(u32, lv_u32_a) {
+    ALIGNED_SAFE(u32);
+    const u32 *src = (const u32 *)ptr;
+    return *src;
+}
+
+// Unaligned 64-bit load; all eight bytes must lie within [lo, hi). The bounds
+// assertion is sized by the type actually loaded (u64a), so that a read
+// running past hi is caught.
+MAKE_LOADVAL(u64a, lv_u64a) {
+    NORMAL_SAFE(u64a);
+    return unaligned_load_u64a(ptr);
+}
+
+// Aligned 64-bit load; ptr must be in bounds and 8-byte aligned.
+MAKE_LOADVAL(u64a, lv_u64a_a) {
+    ALIGNED_SAFE(u64a);
+    const u64a *src = (const u64a *)ptr;
+    return *src;
+}
+
+// Cautious 16-bit little-endian loads; out-of-bounds bytes read as zero.
+MAKE_LOADVAL(u16, lv_u16_cf) {
+    MAKE_LOOP_LE_CF(u16);
+}
+
+MAKE_LOADVAL(u16, lv_u16_cb) {
+    MAKE_LOOP_LE_CB(u16);
+}
+
+MAKE_LOADVAL(u16, lv_u16_ce) {
+    MAKE_LOOP_LE_CE(u16);
+}
+
+// Cautious 32-bit little-endian loads; out-of-bounds bytes read as zero.
+MAKE_LOADVAL(u32, lv_u32_cf) {
+    MAKE_LOOP_LE_CF(u32);
+}
+
+MAKE_LOADVAL(u32, lv_u32_cb) {
+    MAKE_LOOP_LE_CB(u32);
+}
+
+MAKE_LOADVAL(u32, lv_u32_ce) {
+    MAKE_LOOP_LE_CE(u32);
+}
+
+// Cautious 64-bit little-endian loads; out-of-bounds bytes read as zero.
+MAKE_LOADVAL(u64a, lv_u64a_cf) {
+    MAKE_LOOP_LE_CF(u64a);
+}
+
+MAKE_LOADVAL(u64a, lv_u64a_cb) {
+    MAKE_LOOP_LE_CB(u64a);
+}
+
+MAKE_LOADVAL(u64a, lv_u64a_ce) {
+    MAKE_LOOP_LE_CE(u64a);
+}
+
+// Unaligned 128-bit load; all sixteen bytes must lie within [lo, hi).
+MAKE_LOADVAL(m128, lv_m128) {
+    NORMAL_SAFE(m128);
+    const m128 val = loadu128(ptr);
+    return val;
+}
+
+// Aligned 128-bit load; ptr must be in bounds and 16-byte aligned. Both
+// properties are asserted by ALIGNED_SAFE, so no separate alignment check is
+// needed here.
+MAKE_LOADVAL(m128, lv_m128_a) {
+    ALIGNED_SAFE(m128);
+    return *(const m128 *)ptr;
+}
+
+// m128 cases need to be manually created
+
+// Cautious-forward 128-bit load: bytes at or beyond hi read as zero.
+MAKE_LOADVAL(m128, lv_m128_cf) {
+    CAUTIOUS_FORWARD_SAFE(m128);
+    union {
+        u8 val8[16];
+        m128 val128;
+    } u;
+
+    for (u32 i = 0; i < 16; i++) {
+        u.val8[i] = (ptr + i < hi) ? ptr[i] : 0;
+    }
+    return u.val128;
+}
+
+// Cautious-backward 128-bit load: bytes before lo read as zero.
+MAKE_LOADVAL(m128, lv_m128_cb) {
+    CAUTIOUS_BACKWARD_SAFE(m128);
+    union {
+        u8 val8[16];
+        m128 val128;
+    } u;
+
+    for (u32 i = 0; i < 16; i++) {
+        u.val8[i] = (lo <= ptr + i) ? ptr[i] : 0;
+    }
+    return u.val128;
+}
+
+// Cautious-everywhere 128-bit load: bytes outside [lo, hi) read as zero.
+MAKE_LOADVAL(m128, lv_m128_ce) {
+    union {
+        u8 val8[16];
+        m128 val128;
+    } u;
+
+    for (u32 i = 0; i < 16; i++) {
+        const u8 *p = ptr + i;
+        u.val8[i] = ((lo <= p) && (p < hi)) ? ptr[i] : 0;
+    }
+    return u.val128;
+}
+
+#endif
--- a/src/fdr/fdr_streaming_compile.cpp
+++ b/src/fdr/fdr_streaming_compile.cpp
@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_streaming_internal.h"
+#include "fdr_compile_internal.h"
+#include "hwlm/hwlm_build.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/target_info.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <deque>
+#include <set>
+
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+using boost::dynamic_bitset;
+
+namespace ue2 {
+
+namespace {
+/** \brief Strict weak ordering for literals: caseful literals sort before
+ * caseless ones; within each class, ordering is by string comparison. */
+struct LongLitOrder {
+    bool operator()(const hwlmLiteral &a, const hwlmLiteral &b) const {
+        if (a.nocase == b.nocase) {
+            return a.s < b.s;
+        }
+        return a.nocase < b.nocase;
+    }
+};
+}
+
+/** \brief True if both literals have the same case sensitivity and the same
+ * string; other fields are ignored. */
+static
+bool hwlmLitEqual(const hwlmLiteral &l1, const hwlmLiteral &l2) {
+    if (l1.nocase != l2.nocase) {
+        return false;
+    }
+    return l1.s == l2.s;
+}
+
+/** \brief Rounds \p x up to a power of two by smearing the top set bit of
+ * (x - 1) into every lower bit position and adding one. */
+static
+u32 roundUpToPowerOfTwo(u32 x) {
+    x -= 1;
+    // shifts of 1, 2, 4, 8 and 16 propagate the top set bit downwards
+    for (u32 shift = 1; shift < 32; shift <<= 1) {
+        x |= (x >> shift);
+    }
+    return x + 1;
+}
+
+/**
+ * \brief Builds the vector of long literals: every literal in \p lits whose
+ * length exceeds \p max_len.
+ *
+ * Each long literal is stored without its final character, as only partial
+ * matches are of interest here, and with its groups cleared.
+ *
+ * The result is sorted (caseful before caseless, then by string) and
+ * de-duplicated, and each literal's ID is set to its index in \p long_lits.
+ *
+ * \return False if there aren't any long literals.
+ */
+static
+bool setupLongLits(const vector<hwlmLiteral> &lits,
+                   vector<hwlmLiteral> &long_lits, size_t max_len) {
+    long_lits.reserve(lits.size());
+
+    for (const hwlmLiteral &lit : lits) {
+        if (lit.s.length() <= max_len) {
+            continue; // not a long literal
+        }
+        hwlmLiteral trimmed = lit; // copy
+        trimmed.s.resize(trimmed.s.size() - 1); // drop the last char
+        trimmed.id = 0; // recalculated below
+        trimmed.groups = 0; // filled in later by hash bucket(s)
+        long_lits.push_back(trimmed);
+    }
+
+    if (long_lits.empty()) {
+        return false;
+    }
+
+    // Order by caseful/caseless then lexicographically, and drop duplicates.
+    stable_sort(long_lits.begin(), long_lits.end(), LongLitOrder());
+    long_lits.erase(unique(long_lits.begin(), long_lits.end(), hwlmLitEqual),
+                    long_lits.end());
+
+    // fill in ids; not currently used
+    u32 next_id = 0;
+    for (hwlmLiteral &lit : long_lits) {
+        lit.id = next_id++;
+    }
+    return true;
+}
+
+// Gathers per-mode (caseful/caseless) statistics over the long literals.
+//
+// boundaries are the 'start' boundaries for each 'mode': boundaries[CASEFUL]
+// is the index of the first caseless literal (i.e. one above the largest
+// caseful index), and defaults to the number of literals.
+// positions[CASEFUL] is the # of positions in caseful strings (stream), i.e.
+// the sum of the caseful string lengths.
+// hashedPositions[CASEFUL] is the # of hashed positions in caseful strings
+//                          (not returned - a temporary); a literal of length
+//                          len contributes (len - max_len) positions.
+// hashEntries[CASEFUL] is the # of hash table entries for caseful strings:
+//                    zero if nothing is hashed, otherwise at least 4096 and
+//                    rounded up to the nearest power of two.
+// The same holds for the CASELESS entries of each array.
+static
+void analyzeLits(const vector<hwlmLiteral> &long_lits, size_t max_len,
+                 u32 *boundaries, u32 *positions, u32 *hashEntries) {
+    u32 hashedPositions[MAX_MODES];
+
+    for (u32 m = CASEFUL; m < MAX_MODES; ++m) {
+        boundaries[m] = verify_u32(long_lits.size());
+        positions[m] = 0;
+        hashedPositions[m] = 0;
+    }
+
+    // The caseful boundary is the index of the first caseless literal.
+    for (size_t idx = 0; idx < long_lits.size(); ++idx) {
+        if (long_lits[idx].nocase) {
+            boundaries[CASEFUL] = verify_u32(idx);
+            break;
+        }
+    }
+
+    for (const hwlmLiteral &lit : long_lits) {
+        const MODES m = lit.nocase ? CASELESS : CASEFUL;
+        const size_t len = lit.s.size();
+        // Offsets 1 .. (len - max_len) are hashed. The guard keeps a literal
+        // that is not longer than max_len from underflowing the subtraction.
+        if (len > max_len) {
+            hashedPositions[m] += verify_u32(len - max_len);
+        }
+        positions[m] += len;
+    }
+
+    for (u32 m = CASEFUL; m < MAX_MODES; m++) {
+        hashEntries[m] = hashedPositions[m]
+                ? roundUpToPowerOfTwo(MAX(4096, hashedPositions[m]))
+                : 0;
+    }
+
+#ifdef DEBUG_COMPILE
+    printf("analyzeLits:\n");
+    // Iterate with an integer: an enum MODES value cannot be incremented in
+    // C++. All printed values are u32, hence %u.
+    for (u32 m = CASEFUL; m < MAX_MODES; m++) {
+        printf("mode %s boundary %u positions %u hashedPositions %u "
+               "hashEntries %u\n",
+               (m == CASEFUL) ? "caseful" : "caseless", boundaries[m],
+               positions[m], hashedPositions[m], hashEntries[m]);
+    }
+    printf("\n");
+#endif
+}
+
+/** \brief Hashes the \p max_len bytes of literal \p l starting at \p offset,
+ * using streaming_hash() in mode \p m. */
+static
+u32 hashLit(const hwlmLiteral &l, u32 offset, size_t max_len, MODES m) {
+    const u8 *start = (const u8 *)l.s.c_str() + offset;
+    return streaming_hash(start, max_len, m);
+}
+
+// Orders (first, second) pairs by descending 'second' value; pairs with equal
+// 'second' are ordered by ascending 'first' value.
+namespace {
+struct OffsetIDFromEndOrder {
+    const vector<hwlmLiteral> &lits; // not currently used
+    explicit OffsetIDFromEndOrder(const vector<hwlmLiteral> &lits_in)
+        : lits(lits_in) {}
+    bool operator()(const pair<u32, u32> &a, const pair<u32, u32> &b) const {
+        if (a.second == b.second) {
+            return a.first < b.first;
+        }
+        // longest is 'first', so > not <
+        return a.second > b.second;
+    }
+};
+}
+
+// Fills one mode's hash table for the long literals.
+//
+// For every literal in long_lits whose case-sensitivity matches mode m, each
+// start offset j in [1, len - max_len] is hashed (see hashLit). The low
+// lg2(numEntries) bits of the hash pick a bucket in tab and the following six
+// bits pick a bit in that bucket's 64-bit bitfield. The (literal id, offset)
+// pairs that land in a bucket are then written out as a chain of entries,
+// linked through FDRSHashEntry::link, starting at the bucket itself and
+// continuing in nearby slots that no hash maps to and no chain has used.
+//
+//  long_lits       literals to index
+//  max_len         forwarded to hashLit and added into each stored state
+//  tab             hash table to fill (numEntries entries)
+//  numEntries      size of tab; lg2(numEntries) hash bits select the bucket
+//  m               mode (CASEFUL/CASELESS) being built
+//  litToOffsetVal  per-literal base value added into each entry's state
+//
+// NOTE(review): litToOffsetVal is indexed with hwlmLiteral::id via operator[],
+// so an id missing from the map would silently contribute 0 - confirm that
+// ids always match the keys the caller inserted.
+static
+void fillHashes(const vector<hwlmLiteral> &long_lits, size_t max_len,
+                FDRSHashEntry *tab, size_t numEntries, MODES m,
+                map<u32, u32> &litToOffsetVal) {
+    const u32 nbits = lg2(numEntries);
+    // bucket -> (literal id, offset) pairs whose hash selects that bucket
+    map<u32, deque<pair<u32, u32> > > bucketToLitOffPairs;
+    // bucket -> bitfield of the six hash bits above the bucket index
+    map<u32, u64a> bucketToBitfield;
+
+    for (vector<hwlmLiteral>::const_iterator i = long_lits.begin(),
+                                             e = long_lits.end();
+         i != e; ++i) {
+        const hwlmLiteral &l = *i;
+        // only literals of the mode being built belong in this table
+        if ((m == CASELESS) != i->nocase) {
+            continue;
+        }
+        // offsets start at 1; offset 0 is never indexed.
+        // assumes i->s.size() >= max_len, otherwise the unsigned bound wraps
+        // - TODO confirm long_lits guarantees this
+        for (u32 j = 1; j < i->s.size() - max_len + 1; j++) {
+            u32 h = hashLit(l, j, max_len, m);
+            u32 h_ent = h & ((1U << nbits) - 1);
+            u32 h_low = (h >> nbits) & 63;
+            bucketToLitOffPairs[h_ent].push_back(make_pair(i->id, j));
+            bucketToBitfield[h_ent] |= (1ULL << h_low);
+        }
+    }
+
+    // this used to be a set<u32>, but a bitset is much much faster given that
+    // we're using it only for membership testing.
+    dynamic_bitset<> filledBuckets(numEntries); // all bits zero by default.
+
+    // sweep out bitfield entries and save the results swapped accordingly
+    // also, anything with bitfield entries is put in filledBuckets
+    // (marking every hashed bucket before any chain is laid out keeps chain
+    // overflow entries out of buckets that a hash can select)
+    for (map<u32, u64a>::const_iterator i = bucketToBitfield.begin(),
+                                        e = bucketToBitfield.end();
+         i != e; ++i) {
+        u32 bucket = i->first;
+        u64a contents = i->second;
+        tab[bucket].bitfield = contents;
+        filledBuckets.set(bucket);
+    }
+
+    // store out all our chains based on free values in our hash table.
+    // find nearest free locations that are empty (there will always be more
+    // entries than strings, at present)
+    for (map<u32, deque<pair<u32, u32> > >::iterator
+             i = bucketToLitOffPairs.begin(),
+             e = bucketToLitOffPairs.end();
+         i != e; ++i) {
+        u32 bucket = i->first;
+        deque<pair<u32, u32> > &d = i->second;
+
+        // sort d by distance of the residual string (len minus our depth into
+        // the string). We need to put the 'furthest back' string first...
+        stable_sort(d.begin(), d.end(), OffsetIDFromEndOrder(long_lits));
+
+        // each pass writes the pair at the front of d into 'bucket', then
+        // picks the slot for the next pair and links to it
+        while (1) {
+            // first time through is always at bucket, then we fill in links
+            filledBuckets.set(bucket);
+            FDRSHashEntry *ent = &tab[bucket];
+            u32 lit_id = d.front().first;
+            u32 offset = d.front().second;
+
+            // state = literal's base value + offset + max_len
+            ent->state = verify_u32(litToOffsetVal[lit_id] + offset + max_len);
+            // end of chain unless a successor is linked in below
+            ent->link = (u32)LINK_INVALID;
+
+            d.pop_front();
+            if (d.empty()) {
+                break;
+            }
+            // now, if there is another value
+            // find a bucket for it and put in 'bucket' and repeat
+            // all we really need to do is find something not in filledBuckets,
+            // ideally something close to bucket
+            // we search backward and forward from bucket, trying to stay as
+            // close as possible.
+            UNUSED bool found = false;
+            int bucket_candidate = 0;
+            // probe order: bucket (k = 1, already filled), bucket - 1,
+            // bucket + 1, bucket - 2, bucket + 2, ...
+            for (u32 k = 1; k < numEntries * 2; k++) {
+                bucket_candidate = bucket + (((k & 1) == 0)
+                        ? (-(int)k / 2) : (k / 2));
+                // skip candidates that fall off either end of the table
+                if (bucket_candidate < 0 ||
+                    (size_t)bucket_candidate >= numEntries) {
+                    continue;
+                }
+                if (!filledBuckets.test(bucket_candidate)) {
+                    found = true;
+                    break;
+                }
+            }
+
+            // NOTE(review): only checked in assert-enabled builds; with
+            // asserts disabled a full table would leave bucket_candidate at
+            // the last value probed
+            assert(found);
+            bucket = bucket_candidate;
+            // link the entry just written to the slot chosen for the next one
+            ent->link = bucket;
+        }
+    }
+}
+
+// Returns the length of the longest mask attached to any literal in lits, or
+// zero when lits is empty.
+static
+size_t maxMaskLen(const vector<hwlmLiteral> &lits) {
+    size_t longest = 0;
+    for (const auto &lit : lits) {
+        if (lit.msk.size() > longest) {
+            longest = lit.msk.size();
+        }
+    }
+    return longest;
+}
+
+// Builds the streaming table used to track long literals across stream
+// writes.
+//
+// Layout of the returned buffer: FDRSTableHeader, then long_lits.size() + 1
+// FDRSLiteral entries, then the packed literal strings, then the caseful and
+// caseless hash tables.
+//
+//  lits            all literals handed to the matcher
+//  stream_control  in: history_min / history_max limits;
+//                  out: literal_history_required and
+//                  literal_stream_state_required
+//
+// Returns (table, size in bytes). When setupLongLits() reports that there is
+// nothing to build, returns (nullptr, 0) and requests zero stream state.
+// Throws std::logic_error if history_max is below the 32 bytes required.
+pair<u8 *, size_t>
+fdrBuildTableStreaming(const vector<hwlmLiteral> &lits,
+                       hwlmStreamingControl *stream_control) {
+    // refuse to compile if we are forced to have smaller than minimum
+    // history required for long-literal support, full stop
+    // otherwise, choose the maximum of the preferred history quantity
+    // (currently a fairly extravagant 32) or the already used history
+    // quantity - subject to the limitation of stream_control->history_max
+
+    const size_t MIN_HISTORY_REQUIRED = 32;
+
+    if (MIN_HISTORY_REQUIRED > stream_control->history_max) {
+        throw std::logic_error("Cannot set history to minimum history required");
+    }
+
+    size_t max_len =
+        MIN(stream_control->history_max,
+            MAX(MIN_HISTORY_REQUIRED, stream_control->history_min));
+    assert(max_len >= MIN_HISTORY_REQUIRED);
+    size_t max_mask_len = maxMaskLen(lits);
+
+    vector<hwlmLiteral> long_lits;
+    if (!setupLongLits(lits, long_lits, max_len)) {
+        // "Don't need to do anything" path, not really a fail
+        DEBUG_PRINTF("Streaming literal path produces no table\n");
+
+        // we want enough history to manage the longest literal and the longest
+        // mask.
+        stream_control->literal_history_required =
+                    max(maxLen(lits), max_mask_len) - 1;
+        stream_control->literal_stream_state_required = 0;
+        return make_pair(nullptr, size_t{0});
+    }
+
+    // Ensure that we have enough room for the longest mask.
+    if (max_mask_len) {
+        max_len = max(max_len, max_mask_len - 1);
+    }
+
+    // per-mode results of the literal analysis
+    u32 boundary[MAX_MODES];
+    u32 positions[MAX_MODES];
+    u32 hashEntries[MAX_MODES];
+
+    analyzeLits(long_lits, max_len, boundary, positions, hashEntries);
+
+    // first work out the offset and size of every section of the table
+    size_t headerSize = ROUNDUP_16(sizeof(FDRSTableHeader));
+
+    size_t litTabOffset = headerSize;
+
+    // one extra entry: the final FDRSLiteral holds the end offset
+    size_t litTabNumEntries = long_lits.size() + 1;
+    size_t litTabSize = ROUNDUP_16(litTabNumEntries * sizeof(FDRSLiteral));
+
+    size_t wholeLitTabOffset = litTabOffset + litTabSize;
+    size_t totalWholeLitTabSize = ROUNDUP_16(positions[CASEFUL] +
+                                             positions[CASELESS]);
+
+    size_t htOffset[MAX_MODES];
+    size_t htSize[MAX_MODES];
+
+    // the caseless hash table follows directly after the caseful one
+    htOffset[CASEFUL] = wholeLitTabOffset + totalWholeLitTabSize;
+    htSize[CASEFUL] = hashEntries[CASEFUL] * sizeof(FDRSHashEntry);
+    htOffset[CASELESS] = htOffset[CASEFUL] + htSize[CASEFUL];
+    htSize[CASELESS] = hashEntries[CASELESS] * sizeof(FDRSHashEntry);
+
+    size_t tabSize = ROUNDUP_16(htOffset[CASELESS] + htSize[CASELESS]);
+
+    // need to add +2 to both of these to allow space for the actual largest
+    // value as well as handling the fact that we add one to the space when
+    // storing out a position to allow zero to mean "no stream state value"
+    u8 streamBits[MAX_MODES];
+    streamBits[CASEFUL] = lg2(roundUpToPowerOfTwo(positions[CASEFUL] + 2));
+    streamBits[CASELESS] = lg2(roundUpToPowerOfTwo(positions[CASELESS] + 2));
+    u32 tot_state_bytes = (streamBits[CASEFUL] + streamBits[CASELESS] + 7) / 8;
+
+    u8 * secondaryTable = (u8 *)aligned_zmalloc(tabSize);
+    assert(secondaryTable); // otherwise would have thrown std::bad_alloc
+
+    // then fill it in
+    u8 * ptr = secondaryTable;
+    FDRSTableHeader * header = (FDRSTableHeader *)ptr;
+    // fill in header
+    header->pseudoEngineID = (u32)0xffffffff;
+    // NOTE(review): N is stored in a u8 while the runtime's
+    // fdrFindStreamingHash stages at most 128 bytes - confirm max_len never
+    // exceeds that
+    header->N = verify_u8(max_len); // u8 so doesn't matter; won't go > 255
+    for (u32 m = CASEFUL; m < MAX_MODES; ++m) {
+        header->boundary[m] = boundary[m];
+        header->hashOffset[m] = verify_u32(htOffset[m]);
+        header->hashNBits[m] = lg2(hashEntries[m]);
+        header->streamStateBits[m] = streamBits[m];
+    }
+    assert(tot_state_bytes < sizeof(u64a));
+    header->streamStateBytes = verify_u8(tot_state_bytes); // u8
+
+    ptr += headerSize;
+
+    // now fill in the rest
+
+    FDRSLiteral * litTabPtr = (FDRSLiteral *)ptr;
+    ptr += litTabSize;
+
+    // index into long_lits -> offset of that literal's string in the table
+    map<u32, u32> litToOffsetVal;
+    for (vector<hwlmLiteral>::const_iterator i = long_lits.begin(),
+                                             e = long_lits.end();
+         i != e; ++i) {
+        u32 entry = verify_u32(i - long_lits.begin());
+        u32 offset = verify_u32(ptr - secondaryTable);
+
+        // point the table entry to the string location
+        litTabPtr[entry].offset = offset;
+
+        litToOffsetVal[entry] = offset;
+
+        // copy the string into the string location
+        memcpy(ptr, i->s.c_str(), i->s.size());
+
+        ptr += i->s.size(); // and the string location
+    }
+
+    // fill in final lit table entry with current ptr (serves as end value)
+    litTabPtr[long_lits.size()].offset = verify_u32(ptr - secondaryTable);
+
+    // fill hash tables
+    ptr = secondaryTable + htOffset[CASEFUL];
+    for (u32 m = CASEFUL; m < MAX_MODES; ++m) {
+        fillHashes(long_lits, max_len, (FDRSHashEntry *)ptr, hashEntries[m],
+                   (MODES)m, litToOffsetVal);
+        ptr += htSize[m];
+    }
+
+    // tell the world what we did
+    stream_control->literal_history_required = max_len;
+    stream_control->literal_stream_state_required = tot_state_bytes;
+    return make_pair(secondaryTable, tabSize);
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_streaming_internal.h
+++ b/src/fdr/fdr_streaming_internal.h
@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_STREAMING_INTERNAL_H
+#define FDR_STREAMING_INTERNAL_H
+
+#include "ue2common.h"
+#include "fdr_internal.h"
+#include "util/unaligned.h"
+
+// tertiary table:
+// a header (FDRSTableHeader)
+// long_lits.size()+1 entries holding an offset to the string in the
+//       'whole literal table' (FDRSLiteral structure)
+// the whole literal table - every string packed in (freeform)
+// hash table (caseful) (FDRSHashEntry)
+// hash table (caseless) (FDRSHashEntry)
+
+// Literal matching modes. The values are used as indices into the per-mode
+// arrays of FDRSTableHeader.
+typedef enum {
+    // literals compared case-sensitively
+    CASEFUL = 0,
+    // literals compared case-insensitively
+    CASELESS = 1,
+    // number of modes; used as an array bound
+    MAX_MODES = 2
+} MODES;
+
+// We have one of these structures hanging off the 'link' of our secondary
+// FDR table that handles streaming strings
+struct FDRSTableHeader {
+    u32 pseudoEngineID; // set to 0xffffffff to indicate this isn't an FDR
+
+    // string id one beyond the maximum entry for this type of literal
+    // boundary[CASEFUL] is the end of the caseful literals
+    // boundary[CASELESS] is the end of the caseless literals and one beyond
+    // the largest literal id (the size of the littab)
+    u32 boundary[MAX_MODES];
+
+    // offsets are 0 if no such table exists
+    // offset from the base of the tertiary structure to the hash table
+    // NOTE(review): the runtime (getEnt) tests hashNBits, not hashOffset, to
+    // decide whether a table exists - confirm the "0 if no table" claim
+    u32 hashOffset[MAX_MODES];
+    u32 hashNBits[MAX_MODES]; // lg2 of the size of the hash table
+
+    // number of bits each mode's value occupies in the packed stream state;
+    // the caseful value sits in the low bits, the caseless value above it
+    u8 streamStateBits[MAX_MODES];
+    u8 streamStateBytes; // total size of packed stream state in bytes
+    // number of trailing bytes of input passed to the streaming hash at
+    // runtime
+    u8 N; // prefix lengths
+    u16 pad;
+};
+
+// One of these structures per literal entry in our secondary FDR table.
+// The literal table holds one extra trailing entry whose offset marks the end
+// of the last literal's string.
+struct FDRSLiteral {
+    // offset of this literal's string from the base of the table header
+    u32 offset;
+    // potentially - another u32 to point to the 'next lesser included literal'
+    // which would be a literal that overlaps this one in such a way that a
+    // failure to match _this_ literal can leave us in a state that we might
+    // still match that literal. Offset information might also be called for,
+    // in which case we might be wanting to use a FDRSLiteralOffset
+};
+
+// An offset, relative to the base of the table header, to a position within
+// (or one past the end of) a literal's string.
+typedef u32 FDRSLiteralOffset;
+
+// Value of FDRSHashEntry::link that terminates a chain of entries.
+#define LINK_INVALID 0xffffffff
+
+// One of these structures per hash table entry in our secondary FDR table
+struct FDRSHashEntry {
+    // one bit for each value of the six hash bits that follow the bucket
+    // index; tested with has_bit() before the entry is used
+    u64a bitfield;
+    // table offset one past the end of the hashed portion of a literal
+    FDRSLiteralOffset state;
+    // index of the next entry in this chain, or LINK_INVALID
+    u32 link;
+};
+
+// Index of the first literal table entry belonging to mode m.
+static really_inline
+u32 get_start_lit_idx(const struct FDRSTableHeader * h, MODES m) {
+    // caseful literals come first; any other mode starts where the previous
+    // mode's boundary ends
+    if (m == CASEFUL) {
+        return 0;
+    }
+    return h->boundary[m - 1];
+}
+
+// Index one past the last literal table entry belonging to mode m.
+static really_inline
+u32 get_end_lit_idx(const struct FDRSTableHeader * h, MODES m) {
+    const u32 end_idx = h->boundary[m];
+    return end_idx;
+}
+
+// Returns the literal table, which starts at the first 16-byte boundary after
+// the header.
+static really_inline
+const struct FDRSLiteral * getLitTab(const struct FDRSTableHeader * h) {
+    const u8 *base = (const u8 *)h;
+    const u8 *lit_tab = base + ROUNDUP_16(sizeof(struct FDRSTableHeader));
+    return (const struct FDRSLiteral *)lit_tab;
+}
+
+// Table offset of the first literal string belonging to mode m.
+static really_inline
+u32 getBaseOffsetOfLits(const struct FDRSTableHeader * h, MODES m) {
+    const struct FDRSLiteral *lit_tab = getLitTab(h);
+    const u32 first_idx = get_start_lit_idx(h, m);
+    return lit_tab[first_idx].offset;
+}
+
+// Converts a table offset v into the value kept in stream state for mode m:
+// the offset relative to the mode's first literal, plus one.
+static really_inline
+u32 packStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) {
+    const u32 base = getBaseOffsetOfLits(h, m);
+    return v - base + 1;
+}
+
+// Inverse of packStateVal: turns a stored state value v back into a table
+// offset for mode m.
+static really_inline
+u32 unpackStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) {
+    const u32 base = getBaseOffsetOfLits(h, m);
+    return v + base - 1;
+}
+
+// Returns 1 if the given bit of the entry's bitfield is set, otherwise 0.
+static really_inline
+u32 has_bit(const struct FDRSHashEntry * ent, u32 bit) {
+    const u64a shifted = ent->bitfield >> bit;
+    return (u32)(shifted & 1);
+}
+
+// Hashes the 24 bytes at ptr into a 32-bit value. In CASELESS mode bit 0x20
+// of every byte is cleared first. len is only checked by the assertion.
+static really_inline
+u32 streaming_hash(const u8 *ptr, UNUSED size_t len, MODES mode) {
+    const u64a CASEMASK = 0xdfdfdfdfdfdfdfdfULL;
+    const u64a MULTIPLIER = 0x0b4e0ef37bc32127ULL;
+    assert(len >= 32);
+
+    // three consecutive unaligned 8-byte lanes
+    u64a lane0 = unaligned_load_u64a(ptr);
+    u64a lane1 = unaligned_load_u64a(ptr + 8);
+    u64a lane2 = unaligned_load_u64a(ptr + 16);
+
+    // an all-ones mask leaves caseful input untouched
+    const u64a fold = (mode == CASELESS) ? CASEMASK : ~0ULL;
+    lane0 &= fold;
+    lane1 &= fold;
+    lane2 &= fold;
+
+    // multiply each lane by a different power of MULTIPLIER and keep the top
+    // half of the product
+    lane0 = (lane0 * MULTIPLIER) >> 32;
+    lane1 = (lane1 * (MULTIPLIER*MULTIPLIER)) >> 32;
+    lane2 = (lane2 * (MULTIPLIER*MULTIPLIER*MULTIPLIER)) >> 32;
+
+    return (u32)(lane0 ^ lane1 ^ lane2);
+}
+
+#endif
--- a/src/fdr/fdr_streaming_runtime.h
+++ b/src/fdr/fdr_streaming_runtime.h
@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_STREAMING_RUNTIME_H
+#define FDR_STREAMING_RUNTIME_H
+
+#include "fdr_streaming_internal.h"
+#include "util/partial_store.h"
+
+// Returns the streaming table header located fdr->link bytes from the start
+// of the FDR structure.
+static really_inline
+const struct FDRSTableHeader * getSHDR(const struct FDR * fdr) {
+    assert(fdr);
+    const u8 * linkPtr = ((const u8 *)fdr) + fdr->link;
+    // check the pointer before the next assertion dereferences it
+    assert(linkPtr);
+    // test if it's not really a engineID, but a 'pseudo engine id'
+    assert(*(const u32 *)linkPtr == 0xffffffff);
+    return (const struct FDRSTableHeader *)linkPtr;
+}
+
+// Unpacks the per-mode state values from the packed stream state bytes:
+// table[CASEFUL] receives the low streamStateBits[CASEFUL] bits and
+// table[CASELESS] receives the bits above them.
+static really_inline
+void getStreamStates(const struct FDRSTableHeader * streamingTable,
+                     const u8 * stream_state, u32 * table) {
+    assert(streamingTable);
+    assert(stream_state);
+    assert(table);
+
+    u8 total_bytes = streamingTable->streamStateBytes;
+    u8 ful_bits = streamingTable->streamStateBits[CASEFUL];
+    UNUSED u8 less_bits = streamingTable->streamStateBits[CASELESS];
+    assert(total_bytes == (ful_bits + less_bits + 7) / 8);
+
+#if defined(ARCH_32_BIT)
+    // On 32-bit hosts, we may be able to avoid having to do any u64a
+    // manipulation at all.
+    if (total_bytes <= 4) {
+        u32 packed32 = partial_load_u32(stream_state, total_bytes);
+        table[CASEFUL] = (u32)(packed32 & ((1U << ful_bits) - 1));
+        table[CASELESS] = (u32)(packed32 >> ful_bits);
+        return;
+    }
+#endif
+
+    u64a packed = partial_load_u64a(stream_state, total_bytes);
+    table[CASEFUL] = (u32)(packed & ((1ULL << ful_bits) - 1));
+    table[CASELESS] = (u32)(packed >> ful_bits);
+}
+
+#ifndef NDEBUG
+// Debug-only helper (used in assert): returns 1 if either table value has
+// bits set outside the width reserved for its mode, otherwise 0.
+static really_inline UNUSED
+u32 streamingTableOverflow(u32 * table, u8 ssb, u8 ssb_nc) {
+    // the caseless mask is only evaluated when the caseful value fits
+    if ((table[CASEFUL] & ~(u32)((1ULL << (ssb)) - 1)) ||
+        (table[CASELESS] & ~(u32)((1ULL << (ssb_nc)) - 1))) {
+        return 1;
+    }
+    return 0;
+}
+#endif
+
+// Packs the per-mode state values from table into the stream state bytes:
+// the caseful value occupies the low bits and the caseless value is shifted
+// above it by streamStateBits[CASEFUL].
+static really_inline
+void setStreamStates(const struct FDRSTableHeader * streamingTable,
+                     u8 * stream_state, u32 * table) {
+    assert(streamingTable);
+    assert(stream_state);
+    assert(table);
+
+    u8 total_bytes = streamingTable->streamStateBytes;
+    u8 ful_bits = streamingTable->streamStateBits[CASEFUL];
+    UNUSED u8 less_bits = streamingTable->streamStateBits[CASELESS];
+    assert(total_bytes == (ful_bits + less_bits + 7) / 8);
+    assert(!streamingTableOverflow(table, ful_bits, less_bits));
+
+#if defined(ARCH_32_BIT)
+    // On 32-bit hosts, we may be able to avoid having to do any u64a
+    // manipulation at all.
+    if (total_bytes <= 4) {
+        u32 packed32 = table[CASEFUL] | (table[CASELESS] << ful_bits);
+        partial_store_u32(stream_state, packed32, total_bytes);
+        return;
+    }
+#endif
+
+    u64a packed = (u64a)table[CASELESS] << ((u64a)ful_bits);
+    packed |= (u64a)table[CASEFUL];
+    partial_store_u64a(stream_state, packed, total_bytes);
+}
+
+// Returns 1 if any byte of the packed stream state is non-zero, otherwise 0.
+// A NULL stream_state counts as inactive.
+u32 fdrStreamStateActive(const struct FDR * fdr, const u8 * stream_state) {
+    if (!stream_state) {
+        return 0;
+    }
+    const struct FDRSTableHeader * streamingTable = getSHDR(fdr);
+    const u8 state_bytes = streamingTable->streamStateBytes;
+
+    // We just care if there are any bits set, and the test below is faster
+    // than a partial_load_u64a (especially on 32-bit hosts).
+    for (u32 i = 0; i < state_bytes; i++) {
+        if (stream_state[i]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Binary search for the index of the literal whose string contains the byte
+// just before table offset stateValue, restricted to the literals of mode m.
+static really_inline
+u32 findLitTabEntry(const struct FDRSTableHeader * streamingTable,
+                    u32 stateValue, MODES m) {
+    const struct FDRSLiteral * litTab = getLitTab(streamingTable);
+    u32 lo = get_start_lit_idx(streamingTable, m);
+    u32 hi = get_end_lit_idx(streamingTable, m);
+
+    // Now move stateValue back by one so that we're looking for the
+    // litTab entry whose string includes it, not the one 'one past' it
+    stateValue -= 1;
+    assert(lo != hi);
+    assert(litTab[lo].offset <= stateValue);
+    assert(litTab[hi].offset > stateValue);
+
+    // binary search to find the entry e such that:
+    // litTab[e].offset <= stateValue < litTab[e+1].offset
+    while (lo + 1 < hi) {
+        // midpoint computed from the gap so that lo + hi cannot wrap
+        u32 mid = lo + (hi - lo) / 2;
+        if (litTab[mid].offset <= stateValue) {
+            lo = mid;
+        } else { // litTab[mid].offset > stateValue
+            hi = mid;
+        }
+    }
+    assert(litTab[lo].offset <= stateValue);
+    assert(litTab[hi].offset > stateValue);
+    return lo;
+}
+
+// Restores the history pointer and length for mode m from its stored state
+// value. A stored value of zero means there is no state, and the runtime
+// arguments are left untouched.
+static really_inline
+void fdrUnpackStateMode(struct FDR_Runtime_Args *a,
+                        const struct FDRSTableHeader *streamingTable,
+                        const struct FDRSLiteral * litTab,
+                        const u32 *state_table,
+                        const MODES m) {
+    const u32 packed = state_table[m];
+    if (packed == 0) {
+        return;
+    }
+
+    const u32 stateValue = unpackStateVal(streamingTable, m, packed);
+    const u32 idx = findLitTabEntry(streamingTable, stateValue, m);
+
+    // the history is the prefix of the literal's own string, as stored in the
+    // table, up to (not including) the unpacked position
+    const size_t lit_base = litTab[idx].offset;
+    const u8 * lit_bytes = (const u8 *)streamingTable + lit_base;
+    const size_t prefix_len = stateValue - lit_base;
+
+    if (m != CASEFUL) {
+        a->buf_history_nocase = lit_bytes;
+        a->len_history_nocase = prefix_len;
+    } else {
+        a->buf_history = lit_bytes;
+        a->len_history = prefix_len;
+    }
+}
+
+// Reads the packed stream state and restores the caseful and caseless
+// history fields of the runtime arguments from it. Does nothing when
+// stream_state is NULL.
+static really_inline
+void fdrUnpackState(const struct FDR * fdr, struct FDR_Runtime_Args * a,
+                    const u8 * stream_state) {
+    if (stream_state) {
+        const struct FDRSTableHeader * streamingTable = getSHDR(fdr);
+        const struct FDRSLiteral * litTab = getLitTab(streamingTable);
+
+        u32 unpacked[MAX_MODES];
+        getStreamStates(streamingTable, stream_state, unpacked);
+
+        fdrUnpackStateMode(a, streamingTable, litTab, unpacked, CASEFUL);
+        fdrUnpackStateMode(a, streamingTable, litTab, unpacked, CASELESS);
+    }
+}
+
+// Checks whether the literal prefix that ends at table offset hashState
+// matches the tail of the data seen so far (the current buffer, preceded by
+// the mode's history where the buffer alone is too short).
+// Returns hashState on a match and 0 otherwise.
+static really_inline
+u32 do_single_confirm(const struct FDRSTableHeader * streamingTable,
+                      const struct FDR_Runtime_Args * a, u32 hashState, MODES m) {
+    const struct FDRSLiteral * litTab = getLitTab(streamingTable);
+    const u32 idx = findLitTabEntry(streamingTable, hashState, m);
+    const size_t lit_base = litTab[idx].offset;
+    const u8 * lit_bytes = (const u8 *)streamingTable + lit_base;
+    assert(hashState > lit_base);
+    size_t remaining = hashState - lit_base;
+    const size_t len = a->len;
+    const char nocase = m != CASEFUL;
+
+    if (remaining > len) {
+        // the prefix reaches back beyond the current buffer: its leading
+        // bytes have to match the tail of the history
+        const u8 * hist;
+        size_t hist_len;
+        if (nocase) {
+            hist = a->buf_history_nocase;
+            hist_len = a->len_history_nocase;
+        } else {
+            hist = a->buf_history;
+            hist_len = a->len_history;
+        }
+
+        if (remaining > len+hist_len) {
+            return 0; // Break out - not enough total history
+        }
+
+        const size_t overhang = remaining - len;
+        assert(overhang <= hist_len);
+
+        if (cmpForward(hist + hist_len - overhang, lit_bytes, overhang,
+                       nocase)) {
+            return 0;
+        }
+        lit_bytes += overhang;
+        remaining -= overhang;
+    }
+    // whatever is left to compare now fits inside the current buffer
+    assert(remaining <= len);
+
+    if (cmpForward(a->buf + len - remaining, lit_bytes, remaining, nocase)) {
+        return 0;
+    }
+    return hashState; // our new state
+}
+
+// Computes the streaming hash of the last hash_len bytes of data for every
+// mode that has a hash table, writing the results to hashes[]. When the
+// current buffer is shorter than hash_len the window is assembled from the
+// tail of a->buf_history followed by the whole buffer.
+// NOTE(review): the staging buffer holds 128 bytes and hash_len is only
+// checked by an assertion - confirm the builder never emits N > 128.
+static really_inline
+void fdrFindStreamingHash(const struct FDR_Runtime_Args *a,
+                          const struct FDRSTableHeader *streamingTable,
+                          u8 hash_len, u32 *hashes) {
+    u8 staging[128];
+    const u8 *window;
+    if (hash_len <= a->len) {
+        // the whole window lies inside the current buffer
+        window = a->buf + a->len - hash_len;
+    } else {
+        assert(hash_len <= 128);
+        const size_t from_history = hash_len - a->len;
+        assert(from_history <= a->len_history);
+        memcpy(staging, a->buf_history + a->len_history - from_history,
+               from_history);
+        memcpy(staging + from_history, a->buf, a->len);
+        window = staging;
+    }
+
+    if (streamingTable->hashNBits[CASEFUL]) {
+        hashes[CASEFUL] = streaming_hash(window, hash_len, CASEFUL);
+    }
+    if (streamingTable->hashNBits[CASELESS]) {
+        hashes[CASELESS] = streaming_hash(window, hash_len, CASELESS);
+    }
+}
+
+// Looks up hash h in mode m's hash table.
+// Returns the entry at the bucket selected by the low hashNBits[m] bits of h
+// if the bucket's bitfield has the bit chosen by the next six hash bits set.
+// Returns NULL if that bit is clear or if the mode has no table
+// (hashNBits[m] == 0).
+static really_inline
+const struct FDRSHashEntry *getEnt(const struct FDRSTableHeader *streamingTable,
+                                   u32 h, const MODES m) {
+    u32 nbits = streamingTable->hashNBits[m];
+    if (!nbits) {
+        return NULL;
+    }
+
+    // unsigned shift: matches the mask the table builder uses and stays
+    // defined when nbits is 31
+    u32 h_ent = h & ((1U << nbits) - 1);
+    u32 h_low = (h >> nbits) & 63;
+
+    const struct FDRSHashEntry *tab =
+        (const struct FDRSHashEntry *)((const u8 *)streamingTable
+                                       + streamingTable->hashOffset[m]);
+    const struct FDRSHashEntry *ent = tab + h_ent;
+
+    if (!has_bit(ent, h_low)) {
+        return NULL;
+    }
+
+    return ent;
+}
+
+// Walks the chain of hash entries starting at ent and stores the packed
+// state of the first entry that confirms against the data into
+// state_table[m]. state_table[m] is left unchanged if no entry confirms.
+static really_inline
+void fdrPackStateMode(u32 *state_table, const struct FDR_Runtime_Args *a,
+                      const struct FDRSTableHeader *streamingTable,
+                      const struct FDRSHashEntry *ent, const MODES m) {
+    assert(ent);
+    assert(streamingTable->hashNBits[m]);
+
+    const struct FDRSHashEntry *tab =
+        (const struct FDRSHashEntry *)((const u8 *)streamingTable
+                                       + streamingTable->hashOffset[m]);
+
+    for (;;) {
+        const u32 confirmed = do_single_confirm(streamingTable, a, ent->state,
+                                                m);
+        if (confirmed) {
+            state_table[m] = packStateVal(streamingTable, m, confirmed);
+            return;
+        }
+        if (ent->link == LINK_INVALID) {
+            return; // end of chain, nothing confirmed
+        }
+        ent = tab + ent->link;
+    }
+}
+
+// Computes the long-literal state for both modes from the data seen so far
+// and writes it to stream_state. Does nothing when stream_state is NULL.
+static really_inline
+void fdrPackState(const struct FDR *fdr, const struct FDR_Runtime_Args *a,
+                  u8 *stream_state) {
+    // nothing to do if there's no stream state for the case
+    if (!stream_state) {
+        return;
+    }
+
+    // get pointers to the streamer FDR and the tertiary structure
+    const struct FDRSTableHeader *streamingTable = getSHDR(fdr);
+
+    assert(streamingTable->N);
+
+    u32 state_table[MAX_MODES] = {0, 0};
+
+    // hashing needs at least N bytes of buffer plus history; with less, both
+    // states stay zero
+    if (a->len + a->len_history >= streamingTable->N) {
+        u32 hashes[MAX_MODES] = {0, 0};
+
+        fdrFindStreamingHash(a, streamingTable, streamingTable->N, hashes);
+
+        const struct FDRSHashEntry *ent_ful = getEnt(streamingTable,
+                                                    hashes[CASEFUL], CASEFUL);
+        if (ent_ful) {
+            fdrPackStateMode(state_table, a, streamingTable, ent_ful,
+                             CASEFUL);
+        }
+
+        const struct FDRSHashEntry *ent_less = getEnt(streamingTable,
+                                                    hashes[CASELESS], CASELESS);
+        if (ent_less) {
+            fdrPackStateMode(state_table, a, streamingTable, ent_less,
+                             CASELESS);
+        }
+    }
+
+    setStreamStates(streamingTable, stream_state, state_table);
+}
+
+#endif
--- a/src/fdr/flood_compile.cpp
+++ b/src/fdr/flood_compile.cpp
@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_confirm.h"
+#include "fdr_compile_internal.h"
+#include "fdr_engine_description.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/charreach.h"
+#include "util/compare.h"
+#include "util/ue2string.h"
+#include "util/verify_types.h"
+
+#include <cstring>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace ue2 {
+
+namespace {
+// Strict weak ordering over FDRFlood structures based on their raw bytes;
+// used as the comparator of a map keyed by FDRFlood.
+struct FloodComparator {
+    bool operator()(const FDRFlood &f1, const FDRFlood &f2) const {
+        const int order = std::memcmp(&f1, &f2, sizeof(FDRFlood));
+        return order < 0;
+    }
+};
+}
+
+// Returns true if the two characters differ; when caseless is set they are
+// compared after lower-casing.
+static
+bool isDifferent(u8 oldC, u8 c, bool caseless) {
+    if (!caseless) {
+        return oldC != c;
+    }
+    return mytolower(oldC) != mytolower(c);
+}
+
+// Raises the flood suffix recorded for character c to at least suffix + 1.
+static
+void updateFloodSuffix(vector<FDRFlood> &tmpFlood, u8 c, u32 suffix) {
+    FDRFlood &flood = tmpFlood[c];
+    if (!(flood.suffix > suffix + 1)) {
+        flood.suffix = suffix + 1;
+    }
+    DEBUG_PRINTF("Updated Flood Suffix for char '%c' to %u\n", c, flood.suffix);
+}
+
+// Records literal lit as matching inside a flood of character c: raises the
+// flood suffix to at least suffix + 1 and, while fewer than
+// FDR_FLOOD_MAX_IDS ids are stored, appends the literal's id, groups and
+// length.
+static
+void addFlood(vector<FDRFlood> &tmpFlood, u8 c, const hwlmLiteral &lit,
+              u32 suffix) {
+    FDRFlood &flood = tmpFlood[c];
+    if (!(flood.suffix > suffix + 1)) {
+        flood.suffix = suffix + 1;
+    }
+
+    if (flood.idCount >= FDR_FLOOD_MAX_IDS) {
+        // the id list is already full; only the suffix is updated
+        return;
+    }
+
+    flood.allGroups |= lit.groups;
+    flood.ids[flood.idCount] = lit.id;
+    flood.groups[flood.idCount] = lit.groups;
+    flood.len[flood.idCount] = suffix;
+    // when idCount gets to max_ids this flood no longer happens
+    // only incremented one more time to avoid arithmetic overflow
+    DEBUG_PRINTF("Added Flood for char '%c' suffix=%u len[%hu]=%u\n",
+                 c, flood.suffix, flood.idCount, suffix);
+    flood.idCount++;
+}
+
+// Builds the flood control table for a set of literals.
+//
+// For each literal, the tail of the literal (and of its mask, if any) is
+// scanned backwards from the final character c. If a differing position is
+// found, only the suffix recorded for c is raised (updateFloodSuffix);
+// if the whole literal and mask are consistent with a run of c, the literal
+// is registered against c (addFlood). Caseless alphabetic final characters
+// are tracked separately for their upper- and lower-case forms.
+//
+// The returned buffer holds N_CHARS u32 indices followed by the distinct
+// FDRFlood structures; the index stored for a character selects the FDRFlood
+// that describes it.
+//
+//  lits  literals to analyse
+//  eng   engine description; supplies the default flood suffix length
+//
+// Returns (buffer from aligned_zmalloc, total size in bytes).
+// NOTE(review): lit.s[litSize - 1] assumes every literal is non-empty -
+// confirm against the callers.
+pair<u8 *, size_t> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
+                                        const EngineDescription &eng) {
+    vector<FDRFlood> tmpFlood(N_CHARS);
+    u32 default_suffix = eng.getDefaultFloodSuffixLength();
+
+    // zero everything to avoid spurious distinctions in the compares
+    memset(&tmpFlood[0], 0, N_CHARS * sizeof(FDRFlood));
+
+    for (u32 c = 0; c < N_CHARS; c++) {
+        tmpFlood[c].suffix = default_suffix;
+    }
+
+    for (const auto &lit : lits) {
+        DEBUG_PRINTF("lit: '%s'%s\n", escapeString(lit.s).c_str(),
+                     lit.nocase ? " (nocase)" : "");
+        u32 litSize = verify_u32(lit.s.size());
+        u32 maskSize = (u32)lit.msk.size();
+        // c is the final character of the literal: the candidate flood char
+        u8 c = lit.s[litSize - 1];
+        // case-insensitivity only matters when the final char is a letter
+        bool nocase = ourisalpha(c) ? lit.nocase : false;
+
+        // if the mask checks the case bit of the final byte, the case is
+        // fixed by the mask: pick that case and treat the char as caseful
+        if (nocase && maskSize && (lit.msk[maskSize - 1] & CASE_BIT)) {
+            c = (lit.cmp[maskSize - 1] & CASE_BIT) ? mytolower(c) : mytoupper(c);
+            nocase = false;
+        }
+
+        u32 iEnd = MAX(litSize, maskSize);
+        u32 upSuffix = iEnd; // upSuffix is used as an upper case suffix length
+                             // for case-less, or as a suffix length for case-sensitive;
+        u32 loSuffix = iEnd; // loSuffix used only for case-less as a lower case suffix
+                             // length;
+
+        // i counts positions back from the end of the literal / mask
+        for (u32 i = 0; i < iEnd; i++) {
+            if (i < litSize) {
+                if (isDifferent(c, lit.s[litSize - i - 1], lit.nocase)) {
+                    DEBUG_PRINTF("non-flood char in literal[%u] %c != %c\n",
+                                                i, c, lit.s[litSize - i - 1]);
+                    upSuffix = MIN(upSuffix, i);
+                    loSuffix = MIN(loSuffix, i); // makes sense only for case-less
+                    break;
+                }
+            }
+            if (i < maskSize) {
+                u8 m = lit.msk[maskSize - i - 1];
+                u8 cm = lit.cmp[maskSize - i - 1] & m;
+                if(nocase) {
+                    // test the upper- and lower-case forms independently
+                    if ((mytoupper(c) & m) != cm) {
+                        DEBUG_PRINTF("non-flood char in mask[%u] %c != %c\n",
+                                                            i, mytoupper(c), cm);
+                        upSuffix = MIN(upSuffix, i);
+                    }
+                    if ((mytolower(c) & m) != cm) {
+                        DEBUG_PRINTF("non-flood char in mask[%u] %c != %c\n",
+                                                            i, mytolower(c), cm);
+                        loSuffix = MIN(loSuffix, i);
+                    }
+                    // stop once both forms have hit a mismatch
+                    if (loSuffix != iEnd && upSuffix != iEnd) {
+                        break;
+                    }
+                } else if ((c & m) != cm) {
+                    DEBUG_PRINTF("non-flood char in mask[%u] %c != %c\n", i, c, cm);
+                    upSuffix = MIN(upSuffix, i);
+                    break;
+                }
+            }
+        }
+        // upSuffix == iEnd means no mismatch was found for this form
+        if(upSuffix != iEnd) {
+            updateFloodSuffix(tmpFlood, nocase ? mytoupper(c) : c, upSuffix);
+        } else {
+            addFlood(tmpFlood, nocase ? mytoupper(c) : c, lit, upSuffix);
+        }
+        if (nocase) {
+            if(loSuffix != iEnd) {
+                updateFloodSuffix(tmpFlood, mytolower(c), loSuffix);
+            } else {
+                addFlood(tmpFlood, mytolower(c), lit, loSuffix);
+            }
+        }
+    }
+
+#ifdef DEBUG
+    // dump every flood that has at least one literal id attached
+    for (u32 i = 0; i < N_CHARS; i++) {
+        FDRFlood &fl = tmpFlood[i];
+        if (!fl.idCount) {
+            continue;
+        }
+
+        printf("i is %02x fl->idCount is %hd fl->suffix is %d fl->allGroups is "
+               "%016llx\n", i, fl.idCount, fl.suffix, fl.allGroups);
+        for (u32 j = 0; j < fl.idCount; j++) {
+            printf("j is %d fl.groups[j] %016llx fl.len[j] %d \n", j,
+                   fl.groups[j], fl.len[j]);
+        }
+    }
+#endif
+
+    // group characters whose flood structures are bytewise identical so that
+    // each distinct structure is stored only once
+    map<FDRFlood, CharReach, FloodComparator> flood2chars;
+    for (u32 i = 0; i < N_CHARS; i++) {
+        FDRFlood fl = tmpFlood[i];
+        flood2chars[fl].set(i);
+    }
+
+    u32 nDistinctFloods = flood2chars.size();
+    size_t floodHeaderSize = sizeof(u32) * N_CHARS;
+    size_t floodStructSize = sizeof(FDRFlood) * nDistinctFloods;
+    size_t totalSize = ROUNDUP_16(floodHeaderSize + floodStructSize);
+    u8 *buf = (u8 *)aligned_zmalloc(totalSize);
+    assert(buf); // otherwise would have thrown std::bad_alloc
+
+    // layout: N_CHARS indices first, the distinct FDRFlood structures after
+    u32 *floodHeader = (u32 *)buf;
+    FDRFlood *layoutFlood = (FDRFlood * )(buf + floodHeaderSize);
+
+    u32 currentFloodIndex = 0;
+    for (const auto &m : flood2chars) {
+        const FDRFlood &fl = m.first;
+        const CharReach &cr = m.second;
+        layoutFlood[currentFloodIndex] = fl;
+        // point every character that shares this structure at its index
+        for (size_t c = cr.find_first(); c != cr.npos; c = cr.find_next(c)) {
+            floodHeader[c] = currentFloodIndex;
+        }
+        currentFloodIndex++;
+    }
+
+    DEBUG_PRINTF("made a flood structure with %zu + %zu = %zu\n",
+                 floodHeaderSize, floodStructSize, totalSize);
+
+    return make_pair((u8 *)buf, totalSize);
+}
+
+} // namespace ue2
--- a/src/fdr/flood_runtime.h
+++ b/src/fdr/flood_runtime.h
@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLOOD_RUNTIME
+#define FLOOD_RUNTIME
+
+#if defined(ARCH_64_BIT) // selects the probe word size used by the routines in this header
+#define FLOOD_64 // 64-bit build: the u64a (8-byte) paths are compiled (only FLOOD_32 is actually tested below)
+#else
+#define FLOOD_32 // otherwise: the u32 (4-byte) paths are compiled
+#endif
+#define FLOOD_MINIMUM_SIZE 256 // nextFloodDetect() skips flood detection entirely for buffers shorter than this
+#define FLOOD_BACKOFF_START 32 // presumably the initial floodBackoff value; not referenced in this header -- TODO confirm
+
+/*
+ * Pick the position at which flood detection should first be attempted for
+ * the buffer [buf, buf + len).
+ *
+ * Three sites are sampled: the start, the middle, and just short of the end
+ * of the buffer. At each site two consecutive aligned machine words are
+ * compared (8-byte words unless FLOOD_32 is defined, 4-byte words otherwise).
+ * If any pair is equal the function returns buf + floodBackoff; otherwise,
+ * or when len is below FLOOD_MINIMUM_SIZE, it returns buf + len.
+ */
+static really_inline
+const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
+#ifndef FLOOD_32
+    typedef u64a flood_probe_t;
+    enum { FLOOD_PROBE_WIDTH = 8 };
+#else
+    typedef u32 flood_probe_t;
+    enum { FLOOD_PROBE_WIDTH = 4 };
+#endif
+    const u8 * const bufEnd = buf + len;
+
+    // very small buffers are not worth probing at all
+    if (len < FLOOD_MINIMUM_SIZE) {
+        return bufEnd;
+    }
+
+    /* entry points in runtime.c prefetch relevant data */
+    const u8 * const probeSites[3] = {
+        buf,                                /* start of the buffer */
+        buf + len/2,                        /* middle */
+        buf + len - 3 * FLOOD_PROBE_WIDTH   /* three words before the end */
+    };
+
+    for (u32 s = 0; s < 3; s++) {
+        const u8 * site = probeSites[s];
+        // both reads are rounded up to a word boundary before dereferencing
+        flood_probe_t lead = *(const flood_probe_t *)
+            ROUNDUP_PTR(site, FLOOD_PROBE_WIDTH);
+        flood_probe_t follow = *(const flood_probe_t *)
+            ROUNDUP_PTR(site + FLOOD_PROBE_WIDTH, FLOOD_PROBE_WIDTH);
+        if (lead == follow) {
+            return buf + floodBackoff;
+        }
+    }
+
+    // no repeated words at any sampled site
+    return bufEnd;
+}
+
+/*
+ * Test whether the input at *ptrPtr begins a "flood" -- a long run of one
+ * repeated byte -- and, if it does, report the matches for the run directly
+ * through the match callback and advance *ptrPtr past the part handled here.
+ *
+ *  fdr              engine bytecode. The flood table sits at fdr->floodOffset:
+ *                   256 u32 indices (one per byte value) followed by the
+ *                   FDRFlood entries those indices select.
+ *  a                scan arguments; buf, len, cb and ctxt are read.
+ *  ptrPtr           in/out: current scan position. Advanced by floodSize, a
+ *                   multiple of iterBytes, when a flood is handled.
+ *  tryFloodDetect   only used for the debug print on entry; the value the
+ *                   caller needs is the return value.
+ *  floodBackoffPtr  in/out: distance to the next attempt; doubled whenever
+ *                   this attempt does not find a usable flood.
+ *  control          in/out: group mask as returned by the most recent
+ *                   callback. A literal is only reported while
+ *                   (*control & its group) is non-zero.
+ *  iterBytes        presumably the number of bytes the caller's main loop
+ *                   consumes per iteration -- confirm against the callers.
+ *
+ * Returns the position at which the caller should next attempt flood
+ * detection. buf + mainLoopLen is returned to disable further attempts.
+ */
+static really_inline
+const u8 * floodDetect(const struct FDR * fdr,
+                       const struct FDR_Runtime_Args * a,
+                       const u8 ** ptrPtr,
+                       const u8 * tryFloodDetect,
+                       u32 * floodBackoffPtr,
+                       hwlmcb_rv_t * control,
+                       u32 iterBytes) {
+    DEBUG_PRINTF("attempting flood detection at %p\n", tryFloodDetect);
+    const u8 * buf = a->buf;
+    const size_t len = a->len;
+    HWLMCallback cb = a->cb;
+    void * ctxt = a->ctxt;
+
+    const u8 * ptr = *ptrPtr;
+    // tryFloodDetect is never placed where the unconditional reads made here,
+    // a short distance forward or backward of it, would be a problem (this is
+    // relied upon, not checked)
+    // TODO: rationale for this line needs to be rediscovered!!
+    size_t mainLoopLen = len > iterBytes ? len - iterBytes : 0;
+    const u32 i = ptr - buf;
+    u32 j = i;
+
+    // go from c to our FDRFlood structure
+    u8 c = buf[i];
+    const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
+    u32 fIdx = ((const u32 *)fBase)[c];
+    const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
+    const struct FDRFlood * fl = &fsb[fIdx];
+
+    // cmpVal is c replicated into every byte of a machine word; probe is the
+    // first aligned word at or after the current position
+#ifndef FLOOD_32
+    u64a cmpVal = c;
+    cmpVal |= cmpVal << 8;
+    cmpVal |= cmpVal << 16;
+    cmpVal |= cmpVal << 32;
+    u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8);
+#else
+    u32 cmpVal = c;
+    cmpVal |= cmpVal << 8;
+    cmpVal |= cmpVal << 16;
+    u32 probe = *(const u32 *)ROUNDUP_PTR(buf+i, 4);
+#endif
+
+    // not a run of c, or too many literals for this flood entry: back off
+    if ((probe != cmpVal) || (fl->idCount >= FDR_FLOOD_MAX_IDS)) {
+        *floodBackoffPtr *= 2;
+        goto floodout;
+    }
+
+    // j below is moved back by fl->suffix and then by up to 7 more bytes for
+    // alignment; make sure that cannot take it below zero
+    if (i < fl->suffix + 7) {
+        *floodBackoffPtr *= 2;
+        goto floodout;
+    }
+
+    j = i - fl->suffix;
+
+    // walk j forward while the input is still all c: first four words at a
+    // time, then one word at a time, then (after the #endif) byte by byte
+#ifndef FLOOD_32
+    j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
+    for (; j + 32 < mainLoopLen; j += 32) {
+        u64a v = *(const u64a *)(buf + j);
+        u64a v2 = *(const u64a *)(buf + j + 8);
+        u64a v3 = *(const u64a *)(buf + j + 16);
+        u64a v4 = *(const u64a *)(buf + j + 24);
+        if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
+            break;
+        }
+    }
+    for (; j + 8 < mainLoopLen; j += 8) {
+        u64a v = *(const u64a *)(buf + j);
+        if (v != cmpVal) {
+            break;
+        }
+    }
+#else
+    j -= (u32)((size_t)buf + j) & 0x3; // push j back to yield 4-aligned addrs
+    for (; j + 16 < mainLoopLen; j += 16) {
+        u32 v = *(const u32 *)(buf + j);
+        u32 v2 = *(const u32 *)(buf + j + 4);
+        u32 v3 = *(const u32 *)(buf + j + 8);
+        u32 v4 = *(const u32 *)(buf + j + 12);
+        if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
+            break;
+        }
+    }
+    for (; j + 4 < mainLoopLen; j += 4) {
+        u32 v = *(const u32 *)(buf + j);
+        if (v != cmpVal) {
+            break;
+        }
+    }
+#endif
+    for (; j < mainLoopLen; j++) {
+        u8 v = *(const u8 *)(buf + j);
+        if (v != c) {
+            break;
+        }
+    }
+    if (j > i ) {
+        j--; // needed for some reaches
+        // only whole main-loop iterations are handled as flood
+        u32 itersAhead = (j-i)/iterBytes;
+        u32 floodSize = itersAhead*iterBytes;
+
+        DEBUG_PRINTF("flooding %u size j %u i %u fl->idCount %hu "
+                     "*control %016llx fl->allGroups %016llx\n",
+                     floodSize, j, i, fl->idCount, *control, fl->allGroups);
+        DEBUG_PRINTF("mainloopLen %zu mainStart ??? mainEnd ??? len %zu\n",
+                     mainLoopLen, len);
+
+        // Report every literal of this flood entry at every offset in
+        // [i, i + floodSize), re-checking *control before each callback.
+        // The unrolled cases step t by 4 or 2, which assumes floodSize is a
+        // multiple of that step -- TODO confirm for all iterBytes values.
+        if (fl->idCount && (*control & fl->allGroups)) {
+            switch (fl->idCount) {
+#if !defined(FLOOD_DEBUG)
+            // Carefully unrolled code
+            case 1:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups);
+                     t += 4) {
+                    DEBUG_PRINTF("aaa %u %llx\n", t, fl->groups[0]);
+                    u32 len0 = fl->len[0] - 1;
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 0 - len0, i + t + 0, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 3 - len0, i + t + 3, fl->ids[0], ctxt);
+                    }
+                }
+                break;
+            case 2:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 4) {
+                    u32 len0 = fl->len[0] - 1;
+                    u32 len1 = fl->len[1] - 1;
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control =
+                            cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 2 - len1, i + t + 2, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 3 - len0, i + t + 3, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 3 - len1, i + t + 3, fl->ids[1], ctxt);
+                    }
+                }
+                break;
+            case 3:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) {
+                    u32 len0 = fl->len[0] - 1;
+                    u32 len1 = fl->len[1] - 1;
+                    u32 len2 = fl->len[2] - 1;
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t - len2, i + t, fl->ids[2], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt);
+                    }
+                }
+                break;
+            default:
+                // slow generalized loop
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) {
+                    u32 len0 = fl->len[0] - 1;
+                    u32 len1 = fl->len[1] - 1;
+                    u32 len2 = fl->len[2] - 1;
+                    u32 len3 = fl->len[3] - 1;
+
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t - len2, i + t, fl->ids[2], ctxt);
+                    }
+                    if (*control & fl->groups[3]) {
+                        *control = cb(i + t - len3, i + t, fl->ids[3], ctxt);
+                    }
+
+                    for (u32 t2 = 4; t2 < fl->idCount; t2++) {
+                        if (*control & fl->groups[t2]) {
+                            *control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt);
+                        }
+                    }
+
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt);
+                    }
+                    if (*control & fl->groups[3]) {
+                        *control = cb(i + t + 1 - len3, i + t + 1, fl->ids[3], ctxt);
+                    }
+
+                    for (u32 t2 = 4; t2 < fl->idCount; t2++) {
+                        if (*control & fl->groups[t2]) {
+                            *control = cb(i + t + 1 - (fl->len[t2] - 1), i + t + 1, fl->ids[t2], ctxt);
+                        }
+                    }
+                }
+                break;
+#else
+            // Fallback for debugging
+            default:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t++) {
+                    for (u32 t2 = 0; t2 < fl->idCount; t2++) {
+                        if (*control & fl->groups[t2]) {
+                            *control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt);
+                        }
+                    }
+                }
+#endif
+            }
+        }
+        ptr += floodSize;
+    } else {
+        *floodBackoffPtr *= 2;
+    }
+
+floodout:
+    // mainLoopLen is unsigned: without the >= 128 guard, "mainLoopLen - 128"
+    // would wrap to a huge value for short buffers and another attempt would
+    // be scheduled instead of disabling flood detection
+    if (mainLoopLen >= 128 && j + *floodBackoffPtr < mainLoopLen - 128) {
+        tryFloodDetect = buf + MAX(i,j) + *floodBackoffPtr;
+    } else {
+        tryFloodDetect = buf + mainLoopLen; // set so we never do another flood detect
+    }
+    *ptrPtr = ptr;
+    DEBUG_PRINTF("finished flood detection at %p (next check %p)\n",
+                 ptr, tryFloodDetect);
+    return tryFloodDetect;
+}
+
+#endif
--- a/src/fdr/teddy.c
+++ b/src/fdr/teddy.c
@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "util/simd_utils.h"
+#include "util/simd_utils_ssse3.h"
+
+static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { /*
+     * Source rows for the 16-byte p_mask produced by vectoredLoad128().
+     * Row n is 16 zero bytes, then n 0xff bytes, then zero padding up to 32
+     * bytes. A 16-byte load from offset (16 - s) of row n therefore yields
+     * s zero bytes, followed by n 0xff bytes, followed by zeroes.
+     * vectoredLoad128() loads with s == 0 or s == start, and s + n never
+     * exceeds 16 there.
+     */
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+};
+
+// Note: p_mask is an output param that initialises a poison mask.
+//
+// Assembles the 16-byte window that starts at ptr, for a buffer occupying
+// [lo, hi). Bytes of the window that are not filled in below stay zero.
+//
+//  - ptr >= lo with at least 16 bytes left: plain unaligned load; *p_mask is
+//    all 0xff.
+//  - ptr >= lo with fewer than 16 bytes left ("avail"): only those bytes are
+//    copied; *p_mask has 0xff in its first avail bytes.
+//  - ptr < lo: the window starts (lo - ptr) bytes before the buffer. Up to
+//    nMasks - 1 of the missing bytes (limited by len_history) are taken from
+//    the tail of buf_history, the rest of the window comes from the buffer,
+//    and *p_mask has 0xff only in the positions that came from the buffer.
+UNUSED static really_inline
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union {
+        u8 val8[16];
+        m128 val128;
+    } u;
+    u.val128 = zeroes128();
+
+    if (ptr < lo) {
+        // window straddles the start of the buffer
+        u32 start = (u32)(lo - ptr);
+        u32 need = MIN(start, MIN(len_history, nMasks - 1));
+        u32 end = MIN(16, (u32)(hi - ptr));
+
+        // history bytes land immediately before position 'start'
+        u32 pos = start - need;
+        while (ptr + pos < lo) {
+            u.val8[pos] = buf_history[len_history - (lo - (ptr + pos))];
+            pos++;
+        }
+
+        *p_mask = loadu128((const void*)(p_mask_arr[end - start] + 16 - start));
+
+        // then the bytes that really are in the buffer
+        while (pos < end) {
+            u.val8[pos] = ptr[pos];
+            pos++;
+        }
+        return u.val128;
+    }
+
+    u32 avail = (u32)(hi - ptr);
+    if (avail >= 16) {
+        // a whole vector is available: no byte-wise copy needed
+        *p_mask = load128((const void*)(p_mask_arr[16] + 16));
+        return loadu128(ptr);
+    }
+
+    // short tail: copy what there is, leave the rest zero
+    *p_mask = load128((const void*)(p_mask_arr[avail] + 16));
+    for (u32 k = 0; k < avail; k++) {
+        u.val8[k] = ptr[k];
+    }
+    return u.val128;
+}
+
+
+#if defined(__AVX2__)
+
+// 256-bit wrapper around vectoredLoad128(): performs the same 16-byte load
+// and passes both the data and the poison mask through set2x128()
+// (presumably duplicating the 128-bit value into both halves -- confirm in
+// simd_utils.h). p_mask is an output param, as for vectoredLoad128().
+UNUSED static really_inline
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    m128 poison128;
+    m128 data128 = vectoredLoad128(&poison128, ptr, lo, hi, buf_history,
+                                   len_history, nMasks);
+    m256 data256 = set2x128(data128);
+    *p_mask = set2x128(poison128);
+    return data256;
+}
+
+static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+};
+
+
+// 32-byte analogue of vectoredLoad128(), without the nMasks argument.
+// Assembles the 32-byte window that starts at ptr, for a buffer occupying
+// [lo, hi). p_mask is an output param: it receives a mask with 0xff in the
+// window positions that were filled from the buffer and zero elsewhere.
+// Window bytes that are not filled from the buffer are returned as zero.
+UNUSED static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history) {
+    // Zero-fill up front, as vectoredLoad128() does, so that lanes which are
+    // not written below hold zeroes rather than indeterminate stack contents.
+    union {
+        u8 val8[32];
+        m256 val256;
+    } u = { { 0 } };
+
+    if (ptr >= lo) {
+        u32 avail = (u32)(hi - ptr);
+        if (avail >= 32) {
+            // a whole vector is available: plain unaligned load, full mask
+            *p_mask = load256((const void*)(p_mask_arr256[32] + 32));
+            return loadu256(ptr);
+        }
+        // short tail: mask and copy only the bytes that exist
+        *p_mask = load256((const void*)(p_mask_arr256[avail] + 32));
+        for (u32 i = 0; i < avail; i++) {
+            u.val8[i] = ptr[i];
+        }
+    } else {
+        // need contains "how many chars to pull from history"
+        // calculate based on what we need, what we have in the buffer
+        // and only what we need to make primary confirm work
+        //
+        // NOTE(review): no "need" is computed in this function. i starts at
+        // start == lo - ptr, so (ptr + i < lo) is already false and the loop
+        // below copies nothing from buf_history. vectoredLoad128() starts at
+        // (start - need) instead. Left as is because there is no nMasks
+        // argument here to derive need from -- confirm whether history is
+        // ever required by the callers of this function.
+        u32 start = (u32)(lo - ptr);
+        u32 i;
+        for (i = start; ptr + i < lo; i++) {
+            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        }
+        u32 end = MIN(32, (u32)(hi - ptr));
+        // 0xff exactly in window positions [start, end)
+        *p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start));
+        for (; i < end; i++) {
+            u.val8[i] = ptr[i];
+        }
+    }
+
+    return u.val256;
+}
+
+
+#endif // __AVX2__
+
+#define P0(cnd) unlikely(cnd) // wraps cnd in unlikely(); presumably consumed by the generated teddy_autogen.c included below -- TODO confirm
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+
+#include "fdr_loadval.h"
+#include "util/bitutils.h"
+#include "teddy_internal.h"
+
+#include "teddy_autogen.c"
--- a/src/fdr/teddy_autogen.py
+++ b/src/fdr/teddy_autogen.py
@ -0,0 +1,545 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from base_autogen import *
+from string import Template
+
+class MT(MatcherBase):
+    def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
+        """Print the C confirm step for one scalar chunk of a result vector.
+
+        iter     -- index of the 16-byte iteration the chunk belongs to
+        var_name -- name of the C scalar holding the candidate bits
+        offset   -- byte offset of the chunk within that iteration
+        bits     -- width of the scalar; 64 selects findAndClearLSB_64,
+                    anything else selects findAndClearLSB_32
+        cautious -- selects the caution argument passed to the confirm call
+                    ("VECTORING" when true, "NOT_CAUTIOUS" otherwise)
+
+        Packed matchers delegate to produce_confirm_base(). Otherwise the
+        emitted code clears set bits one at a time, maps each bit to a byte
+        position (bit / num_buckets) and a bucket (bit % num_buckets), and
+        calls confWithBit1 or confWithBitMany depending on num_masks.
+        """
+        if self.packed:
+            print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False)
+        else:
+            if self.num_masks == 1:
+                conf_func = "confWithBit1"
+            else:
+                conf_func = "confWithBitMany"
+
+            if cautious:
+                caution_string = "VECTORING"
+            else:
+                caution_string = "NOT_CAUTIOUS"
+
+            print "            if (P0(!!%s)) {" % var_name
+            print "                do  {"
+            if bits == 64:
+                print "                    bit = findAndClearLSB_64(&%s);" % (var_name)
+            else:
+                print "                    bit = findAndClearLSB_32(&%s);" % (var_name)
+            # byte position is relative to ptr: iteration base plus chunk offset
+            print "                    byte  = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
+            # "%%" yields a literal C modulo operator after formatting
+            print "                    idx  = bit %% %d;" % self.num_buckets
+            print "                    cf = confBase[idx];"
+            print "                    fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
+            # skip buckets whose groups do not intersect *control
+            print "                    if (!(fdrc->groups & *control))"
+            print "                        continue;"
+            print "                    %s(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % (conf_func, caution_string)
+            print "                } while(P0(!!%s));" % var_name
+            # the termination check is emitted once, after all bits are handled
+            print "                if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
+            print "                    *a->groups = controlVal;"
+            print "                    return HWLM_TERMINATED;"
+            print "                }"
+            print "            }"
+
+    def produce_needed_temporaries(self, max_iterations):
+        print "        m128 p_mask;"
+        for iter in range(0, max_iterations):
+            print "        m128 val_%d;" % iter
+            print "        m128 val_%d_lo;" % iter
+            print "        m128 val_%d_hi;" % iter
+            for x in range(self.num_masks):
+                print "        m128 res_%d_%d;" % (iter, x)
+                if x != 0:
+                    print "        m128 res_shifted_%d_%d;" % (iter, x)
+            print "        m128 r_%d;" % iter
+            print "#ifdef ARCH_64_BIT"
+            print "            u64a r_%d_lopart;" % iter
+            print "            u64a r_%d_hipart;" % iter
+            print "#else"
+            print "            u32 r_%d_part1;" % iter
+            print "            u32 r_%d_part2;" % iter
+            print "            u32 r_%d_part3;" % iter
+            print "            u32 r_%d_part4;" % iter
+            print "#endif"
+
+    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
+                                         cautious, save_old):
+        """Print the C code that computes result vector r_<iter>.
+
+        iter                     -- iteration index; the input is read from
+                                    ptr + iter*16
+        effective_num_iterations -- number of iterations in the enclosing
+                                    emitted block; the last one may save state
+        cautious                 -- use vectoredLoad128 (which also fills
+                                    p_mask) instead of load128, and AND the
+                                    first mask result with p_mask
+        save_old                 -- on the last iteration, copy res_<iter>_<x>
+                                    into res_old_<x> for each mask x != 0
+        """
+        if cautious:
+            print "        val_%d = vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
+        else:
+            print "        val_%d = load128(ptr + %d);" % (iter, iter*16)
+        # split every input byte into its low and high nibble
+        print "        val_%d_lo = and128(val_%d, lomask);" % (iter, iter)
+        print "        val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter)
+        print "        val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter)
+        print
+        for x in range(self.num_masks):
+            # mask x uses table entries 2x (low nibble) and 2x+1 (high nibble)
+            print Template("""
+        res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2]  , val_${ITER}_lo),
+                                  pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
+            if x != 0:
+                # combine with the previous block's result for the same mask:
+                # res_old_<x> for the first iteration, res_<iter-1>_<x> after
+                if iter == 0:
+                    print "        res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x,   iter, x,         x,   x)
+                else:
+                    print "        res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x,    iter, x, iter-1, x,   x)
+            if x != 0 and iter == effective_num_iterations - 1 and save_old:
+                print "        res_old_%d = res_%d_%d;" % (x, iter, x)
+        print
+        if cautious:
+            print "        r_%d = and128(res_%d_0, p_mask);" % (iter, iter)
+        else:
+            print "        r_%d = res_%d_0;" % (iter, iter)
+        # the final result is the AND of mask 0 with every shifted mask result
+        for x in range(1, self.num_masks):
+            print "        r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
+        print
+
+    def produce_one_iteration_confirm(self, iter, confirmCautious):
+        setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter),
+                    (8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ]
+
+        setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter),
+                    (4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter),
+                    (8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter),
+                    (12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ]
+
+        print "        if (P0(isnonzero128(r_%d))) {" % (iter)
+        print "#ifdef ARCH_64_BIT"
+        for (off, val, init) in setup64:
+            print "            %s = %s;" % (val, init)
+        for (off, val, init) in setup64:
+            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
+        print "#else"
+        for (off, val, init) in setup32:
+            print "            %s = %s;" % (val, init)
+        for (off, val, init) in setup32:
+            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
+        print "#endif"
+        print "        }"
+
+    def produce_one_iteration(self, iter, effective_num_iterations, cautious = False,
+                              confirmCautious = True, save_old = True):
+        self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old)
+        self.produce_one_iteration_confirm(iter, confirmCautious)
+
+    def produce_code(self):
+        """Print the complete C function for this matcher.
+
+        The emitted body has four phases: a cautious iteration for a
+        start that is not 16-byte aligned, one single 16-byte iteration, the
+        main loop handling iterBytes per pass, and a cautious tail loop.
+        """
+        print self.produce_header(visible = True, header_only = False)
+        print self.produce_common_declarations()
+        print
+
+        self.produce_needed_temporaries(self.num_iterations)
+        print
+
+        # the mask table directly follows struct Teddy; the confirm offsets
+        # follow num_masks * 32 bytes of masks
+        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
+        print "    const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
+        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks
+        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
+        print "    const size_t iterBytes = %d;" % (self.num_iterations * 16)
+
+        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
+                                ' buf, len, a->start_offset);'
+        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
+                                ' mainStart);'
+
+        # carried-over results for masks after the first start as all ones
+        for x in range(self.num_masks):
+            if (x != 0):
+                print "    m128 res_old_%d = ones128();" % x
+        print "    m128 lomask = set16x8(0xf);"
+
+        # phase 1: unaligned start - step back to the previous 16-byte
+        # boundary and run one cautious iteration
+        print "    if (ptr < mainStart) {"
+        print "         ptr = mainStart - 16;"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        # phase 2: one plain-load iteration with a cautious confirm
+        print "    if (ptr + 16 < buf + len) {"
+        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        # phase 3: main loop, num_iterations blocks per pass, no caution
+        print "    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
+        print "        __builtin_prefetch(ptr + (iterBytes*4));"
+        print self.produce_flood_check()
+
+        for iter in range(self.num_iterations):
+            self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)
+
+        print "    }"
+
+        # phase 4: cautious tail, 16 bytes at a time
+        print "    for (; ptr < buf + len; ptr += 16) {"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "    }"
+
+        print self.produce_footer()
+
+    def produce_compile_call(self):
+        packed_str = { False : "false", True : "true"}[self.packed]
+        print "        { %d, %s, %d, %d, %s, %d, %d }," % (
+            self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
+            self.conf_pull_back, self.conf_top_level_split)
+
+    def get_name(self):
+        if self.packed:
+            pck_string = "_pck"
+        else:
+            pck_string = ""
+
+        if self.num_buckets == 16:
+            type_string = "_fat"
+        else:
+            type_string = ""
+
+        return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string)
+
+    def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8):
+        self.arch = arch
+        self.packed = packed
+        self.num_masks = num_masks
+        self.num_buckets = num_buckets
+        self.num_iterations = 2
+
+        if packed:
+            self.conf_top_level_split = 32
+        else:
+            self.conf_top_level_split = 1
+        self.conf_pull_back = 0
+
+class MTFat(MT):
+    def produce_needed_temporaries(self, max_iterations):
+        print "        m256 p_mask;"
+        for iter in range(0, max_iterations):
+            print "        m256 val_%d;" % iter
+            print "        m256 val_%d_lo;" % iter
+            print "        m256 val_%d_hi;" % iter
+            for x in range(self.num_masks):
+                print "        m256 res_%d_%d;" % (iter, x)
+                if x != 0:
+                    print "        m256 res_shifted_%d_%d;" % (iter, x)
+            print "        m256 r_%d;" % iter
+            print "#ifdef ARCH_64_BIT"
+            print "            u64a r_%d_part1;" % iter
+            print "            u64a r_%d_part2;" % iter
+            print "            u64a r_%d_part3;" % iter
+            print "            u64a r_%d_part4;" % iter
+            print "#else"
+            print "            u32 r_%d_part1;" % iter
+            print "            u32 r_%d_part2;" % iter
+            print "            u32 r_%d_part3;" % iter
+            print "            u32 r_%d_part4;" % iter
+            print "            u32 r_%d_part5;" % iter
+            print "            u32 r_%d_part6;" % iter
+            print "            u32 r_%d_part7;" % iter
+            print "            u32 r_%d_part8;" % iter
+            print "#endif"
+
+    def produce_code(self):
+        """Print the complete C function for the fat (m256) matcher.
+
+        Same four phases as MT.produce_code - cautious unaligned start,
+        one single iteration, main loop, cautious tail - using m256 types and
+        a mask table of num_masks * 64 bytes.
+        """
+        print self.produce_header(visible = True, header_only = False)
+        print self.produce_common_declarations()
+        print
+
+        self.produce_needed_temporaries(self.num_iterations)
+        print
+
+        # the mask table directly follows struct Teddy; the confirm offsets
+        # follow num_masks * 32 * 2 bytes of masks
+        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
+        print "    const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));"
+        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks
+        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
+        print "    const size_t iterBytes = %d;" % (self.num_iterations * 16)
+
+        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
+                                ' buf, len, a->start_offset);'
+        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
+                                ' mainStart);'
+
+        # carried-over results for masks after the first start as all ones
+        for x in range(self.num_masks):
+            if (x != 0):
+                print "    m256 res_old_%d = ones256();" % x
+        print "    m256 lomask = set32x8(0xf);"
+
+        # phase 1: unaligned start - step back to the previous 16-byte
+        # boundary and run one cautious iteration
+        print "    if (ptr < mainStart) {"
+        print "         ptr = mainStart - 16;"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        # phase 2: one plain-load iteration with a cautious confirm
+        print "    if (ptr + 16 < buf + len) {"
+        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        # phase 3: main loop, num_iterations blocks per pass, no caution
+        print "    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
+        print "        __builtin_prefetch(ptr + (iterBytes*4));"
+        print self.produce_flood_check()
+
+        for iter in range(self.num_iterations):
+            self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)
+
+        print "    }"
+
+        # phase 4: cautious tail, 16 bytes at a time
+        print "    for (; ptr < buf + len; ptr += 16) {"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "    }"
+
+        print self.produce_footer()
+
+    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
+                                         cautious, save_old):
+        """Print the C code that computes the m256 result vector r_<iter>.
+
+        Mirrors the base-class version with 256-bit operations: the input is
+        loaded with vectoredLoad2x128 (cautious) or load2x128, looked up with
+        vpshufb and combined across masks with vpalignr. The input is still
+        read from ptr + iter*16.
+        """
+        if cautious:
+            print "        val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
+        else:
+            print "        val_%d = load2x128(ptr + %d);" % (iter, iter*16)
+        # split every input byte into its low and high nibble
+        print "        val_%d_lo = and256(val_%d, lomask);" % (iter, iter)
+        print "        val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter)
+        print "        val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter)
+        print
+        for x in range(self.num_masks):
+            # mask x uses table entries 2x (low nibble) and 2x+1 (high nibble)
+            print Template("""
+        res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2]  , val_${ITER}_lo),
+                                  vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
+            if x != 0:
+                # combine with the previous block's result for the same mask:
+                # res_old_<x> for the first iteration, res_<iter-1>_<x> after
+                if iter == 0:
+                    print "        res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x,   iter, x,         x,   x)
+                else:
+                    print "        res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x,    iter, x, iter-1, x,   x)
+            if x != 0 and iter == effective_num_iterations - 1 and save_old:
+                print "        res_old_%d = res_%d_%d;" % (x, iter, x)
+        print
+        if cautious:
+            print "        r_%d = and256(res_%d_0, p_mask);" % (iter, iter)
+        else:
+            print "        r_%d = res_%d_0;" % (iter, iter)
+        # the final result is the AND of mask 0 with every shifted mask result
+        for x in range(1, self.num_masks):
+            print "        r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
+        print
+
+    def produce_one_iteration_confirm(self, iter, confirmCautious):
+        """Print the C code that confirms the candidates held in m256 r_<iter>.
+
+        The emitted code first builds r = interleave256lo(r_<iter>, r_swap),
+        where r_swap is swap128in256(r_<iter>), and extracts the first half of
+        the scalar parts from it. The init string of the last part in that
+        half carries an embedded second statement that reassigns r to
+        interleave256hi(...), so the remaining parts read from the high
+        interleave.
+
+        Each tuple is (byte offset passed to produce_confirm, scalar name,
+        extraction expression).
+        """
+        setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"),
+                    # init string deliberately ends with a second statement
+                    (4, "r_%d_part2" % iter, "extract64from256(r, 1);\n            r = interleave256hi(r_%d, r_swap)" % (iter)),
+                    (8, "r_%d_part3" % iter, "extractlow64from256(r)"),
+                    (12, "r_%d_part4" % iter, "extract64from256(r, 1)") ]
+
+        setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"),
+                    (2, "r_%d_part2" % iter, "extract32from256(r, 1)"),
+                    (4, "r_%d_part3" % iter, "extract32from256(r, 2)"),
+                    # init string deliberately ends with a second statement
+                    (6, "r_%d_part4" % iter, "extract32from256(r, 3);\n            r = interleave256hi(r_%d, r_swap)" % (iter)),
+                    (8, "r_%d_part5" % iter, "extractlow32from256(r)"),
+                    (10, "r_%d_part6" % iter, "extract32from256(r, 1)"),
+                    (12, "r_%d_part7" % iter, "extract32from256(r, 2)"),
+                    (14, "r_%d_part8" % iter, "extract32from256(r, 3)") ]
+
+        print "        if (P0(isnonzero256(r_%d))) {" % (iter)
+        print "            m256 r_swap = swap128in256(r_%d);" % (iter)
+        print "            m256 r = interleave256lo(r_%d, r_swap);" % (iter)
+        print "#ifdef ARCH_64_BIT"
+        # all parts are extracted before any confirm code is emitted
+        for (off, val, init) in setup64:
+            print "            %s = %s;" % (val, init)
+
+        for (off, val, init) in setup64:
+            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
+        print "#else"
+        for (off, val, init) in setup32:
+            print "            %s = %s;" % (val, init)
+
+        for (off, val, init) in setup32:
+            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
+        print "#endif"
+        print "        }"
+
+class MTFast(MatcherBase):
+
+    def produce_confirm(self, cautious):
+        if cautious:
+            cautious_str = "VECTORING"
+        else:
+            cautious_str = "NOT_CAUTIOUS"
+
+        print "            for (u32 i = 0; i < arrCnt; i++) {"
+        print "                byte = bitArr[i] / 8;"
+        if self.packed:
+            conf_split_mask = IntegerType(32).constant_to_string(
+                                self.conf_top_level_split - 1)
+            print "                bitRem  = bitArr[i] % 8;"
+            print "                confSplit = *(ptr+byte) & 0x1f;"
+            print "                idx = confSplit * %d + bitRem;" % self.num_buckets
+            print "                cf = confBase[idx];"
+            print "                if (!cf)"
+            print "                    continue;"
+            print "                fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
+            print "                if (!(fdrc->groups & *control))"
+            print "                    continue;"
+            print "                confWithBit(fdrc, a, ptr - buf + byte, %s, 0, control, &last_match);" % cautious_str
+        else:
+            print "                cf = confBase[bitArr[i] % 8];"
+            print "                fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
+            print "                confWithBit1(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % cautious_str
+        print "                if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
+        print "                    *a->groups = controlVal;"
+        print "                    return HWLM_TERMINATED;"
+        print "                }"
+        print "            }"
+
+    def produce_needed_temporaries(self, max_iterations):
+        print "        u32 arrCnt;"
+        print "        u16 bitArr[512];"
+        print "        m256 p_mask;"
+        print "        m256 val_0;"
+        print "        m256 val_0_lo;"
+        print "        m256 val_0_hi;"
+        print "        m256 res_0;"
+        print "        m256 res_1;"
+        print "        m128 lo_part;"
+        print "        m128 hi_part;"
+        print "#ifdef ARCH_64_BIT"
+        print "        u64a r_0_part;"
+        print "#else"
+        print "        u32 r_0_part;"
+        print "#endif"
+
+    def produce_bit_scan(self, offset, bits):
+        print "                while (P0(!!r_0_part)) {"
+        if bits == 64:
+            print "                    bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset)
+        else:
+            print "                    bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset)
+        print "                }"
+
+    def produce_bit_check_128(self, var_name, offset):
+        print "            if (P0(isnonzero128(%s))) {" % (var_name)
+        print "#ifdef ARCH_64_BIT"
+        print "                r_0_part = movq(%s);" % (var_name)
+        self.produce_bit_scan(offset, 64)
+        print "                r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name)
+        self.produce_bit_scan(offset + 1, 64)
+        print "#else"
+        print "                r_0_part = movd(%s);" % (var_name)
+        self.produce_bit_scan(offset * 2, 32)
+        for step in range(1, 4):
+            print "                r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4)
+            self.produce_bit_scan(offset * 2 + step, 32)
+        print "#endif"
+        print "            }"
+
+    def produce_bit_check_256(self, iter, single_iter, cautious):
+        print "        if (P0(isnonzero256(res_%d))) {" % (iter)
+        if single_iter:
+            print "            arrCnt = 0;"
+        print "            lo_part = cast256to128(res_%d);" % (iter)
+        print "            hi_part = cast256to128(swap128in256(res_%d));" % (iter)
+        self.produce_bit_check_128("lo_part", iter * 4)
+        self.produce_bit_check_128("hi_part", iter * 4 + 2)
+        if single_iter:
+            self.produce_confirm(cautious)
+        print "        }"
+
+    def produce_one_iteration_state_calc(self, iter, cautious):
+        """Print the C code that computes res_<iter> for one 32-byte block.
+
+        iter     -- block index; the input is read from ptr + iter*32
+        cautious -- use vectoredLoad256 (which also fills p_mask) instead of
+                    load256, and AND the result with p_mask
+
+        Every iteration reuses the val_0 temporaries; only the result
+        variable is per-iteration.
+        """
+        if cautious:
+            # note: the lower bound passed here is buf + a->start_offset
+            print "        val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32)
+        else:
+            print "        val_0 = load256(ptr + %d);" % (iter * 32)
+        # split every input byte into its low and high nibble
+        print "        val_0_lo = and256(val_0, lomask);"
+        print "        val_0_hi = rshift4x64(val_0, 4);"
+        print "        val_0_hi = and256(val_0_hi, lomask);"
+        print "        res_%d = and256(vpshufb(maskLo  , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter)
+        if cautious:
+            print "        res_%d = and256(res_%d, p_mask);" % (iter, iter)
+
+    def produce_code(self):
+        """Print the complete C function for the fast (single mask) matcher.
+
+        The emitted body works on 32-byte blocks in four phases: a cautious
+        iteration for a start that is not 32-byte aligned, one single
+        iteration, the main loop handling iterBytes per pass, and a cautious
+        tail loop.
+        """
+        print self.produce_header(visible = True, header_only = False)
+        print self.produce_common_declarations()
+        print
+
+        self.produce_needed_temporaries(self.num_iterations)
+
+        # the two 128-bit masks following struct Teddy are each duplicated
+        # into both halves of an m256; confirm offsets follow 32 bytes of masks
+        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
+        print "    const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
+        print "    const m256 maskLo = set2x128(maskBase[0]);"
+        print "    const m256 maskHi = set2x128(maskBase[1]);"
+        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);"
+        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 32);"
+        print "    const size_t iterBytes = %d;" % (self.num_iterations * 32)
+
+        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
+                                ' buf, len, a->start_offset);'
+        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
+                                ' mainStart);'
+        print "    const m256 lomask = set32x8(0xf);"
+
+        # phase 1: unaligned start - step back to the previous 32-byte
+        # boundary and run one cautious iteration
+        print "    if (ptr < mainStart) {"
+        print "        ptr = mainStart - 32;"
+        self.produce_one_iteration_state_calc(iter = 0, cautious = True)
+        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
+        print "        ptr += 32;"
+        print "    }"
+
+        # phase 2: one plain-load iteration with a cautious confirm
+        print "    if (ptr + 32 < buf + len) {"
+        self.produce_one_iteration_state_calc(iter = 0, cautious = False)
+        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
+        print "        ptr += 32;"
+        print "    }"
+        # phase 3: main loop - compute every iteration's result first, then
+        # gather all set bits into bitArr and confirm them in one pass
+        print "    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
+        print "        __builtin_prefetch(ptr + (iterBytes*4));"
+        print self.produce_flood_check()
+        for iter in range (0, self.num_iterations):
+            self.produce_one_iteration_state_calc(iter = iter, cautious = False)
+        print "        arrCnt = 0;"
+        for iter in range (0, self.num_iterations):
+            self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False)
+        self.produce_confirm(cautious = False)
+        print "    }"
+
+        # phase 4: cautious tail, 32 bytes at a time
+        print "    for (; ptr < buf + len; ptr += 32) {"
+        self.produce_one_iteration_state_calc(iter = 0, cautious = True)
+        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
+        print "    }"
+
+        print self.produce_footer()
+
+    def get_name(self):
+        if self.packed:
+            pck_string = "_pck"
+        else:
+            pck_string = ""
+        return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, pck_string)
+
+    def produce_compile_call(self):
+        packed_str = { False : "false", True : "true"}[self.packed]
+        print "        { %d, %s, %d, %d, %s, %d, %d }," % (
+            self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
+            self.conf_pull_back, self.conf_top_level_split)
+
+    def __init__(self, arch, packed = False):
+        self.arch = arch
+        self.packed = packed
+        self.num_masks = 1
+        self.num_buckets = 8
+        self.num_iterations = 2
+
+        self.conf_top_level_split = 1
+        self.conf_pull_back = 0
+        if packed:
+            self.conf_top_level_split = 32
+        else:
+            self.conf_top_level_split = 1
+        self.conf_pull_back = 0
--- a/src/fdr/teddy_compile.cpp
+++ b/src/fdr/teddy_compile.cpp
@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_confirm.h"
+#include "fdr_engine_description.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/compare.h"
+#include "util/popcount.h"
+#include "util/target_info.h"
+#include "util/verify_types.h"
+
+#include "teddy_compile.h"
+#include "teddy_internal.h"
+#include "teddy_engine_description.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+using namespace std;
+
+namespace ue2 {
+
+namespace {
+
+//#define TEDDY_DEBUG
+
+/**
+ * \brief Builds the Teddy bytecode for one literal set and one engine
+ * description.
+ *
+ * Only references to the literal vector and the engine description are
+ * stored, so both must outlive the compiler object.
+ */
+class TeddyCompiler : boost::noncopyable {
+    const TeddyEngineDescription &eng;
+    const vector<hwlmLiteral> &lits;
+    bool make_small; // passed through to setupFullMultiConfs() by build()
+
+public:
+    TeddyCompiler(const vector<hwlmLiteral> &lits_in,
+                  const TeddyEngineDescription &eng_in, bool make_small_in)
+        : eng(eng_in), lits(lits_in), make_small(make_small_in) {}
+
+    // Lays out header, masks, confirm and flood-control data (plus the
+    // optional linked blob) in one allocation; returns nullptr if the
+    // literals cannot be handled by this engine.
+    aligned_unique_ptr<FDR> build(pair<u8 *, size_t> link);
+    // Groups the literals into at most eng.getNumBuckets() buckets, filling
+    // bucketToLits; returns false if that many buckets are not enough.
+    bool pack(map<BucketIndex, std::vector<LiteralIndex> > &bucketToLits);
+};
+
+/**
+ * \brief A candidate bucket: a group of literal ids together with the nibble
+ * sets a bucket holding all of them would have to accept.
+ *
+ * Sets are ordered by their literal-id sets (see operator<) so they can be
+ * stored in a std::set.
+ */
+class TeddySet {
+    const vector<hwlmLiteral> &lits;
+    u32 len;
+    // nibbleSets is a series of 16-bit bitfields, one bit per possible nibble
+    // value, describing which nibbles a shufti-style bucket mask would have
+    // to accept. Each of the 'len' mask positions contributes two entries
+    // (low nibble at index 2*i, high nibble at index 2*i+1), so for
+    // num_masks = 4 our strings are represented by 8 u16s in the vector.
+    // Position 0 corresponds to the last character of a literal.
+    vector<u16> nibbleSets;
+    set<u32> litIds;
+public:
+    TeddySet(const vector<hwlmLiteral> &lits_in, u32 len_in)
+        : lits(lits_in), len(len_in), nibbleSets(len_in * 2, 0) {}
+    const set<u32> & getLits() const { return litIds; }
+    size_t litCount() const { return litIds.size(); }
+
+    bool operator<(const TeddySet & s) const {
+        return litIds < s.litIds;
+    }
+
+#ifdef TEDDY_DEBUG
+    void dump() const {
+        printf("TS: ");
+        for (u32 i = 0; i < nibbleSets.size(); i++) {
+            printf("%04x ", (u32)nibbleSets[i]);
+        }
+        printf("\nnlits: %zu\n", litCount());
+        // cast so the argument always matches the %llu conversion
+        printf("Prob: %llu\n", (unsigned long long)probability());
+        // the label is printed immediately before the ids it describes
+        printf("Lit ids: ");
+        for (set<u32>::iterator i = litIds.begin(), e = litIds.end(); i != e; ++i) {
+            printf("%u ", *i);
+        }
+        printf("\n");
+        printf("Flood prone : %s\n", isRunProne()?"yes":"no");
+    }
+#endif
+
+    // true if both sets accept exactly the same nibbles at every position
+    bool identicalTail(const TeddySet & ts) const {
+        return nibbleSets == ts.nibbleSets;
+    }
+
+    // Adds literal lit_id (an index into lits) to this set. Nibbles are
+    // OR-ed into the existing sets so that adding several literals
+    // accumulates them, consistent with merge(); for a freshly constructed
+    // set the result is the same as plain assignment.
+    void addLiteral(u32 lit_id) {
+        const string &s = lits[lit_id].s;
+        for (u32 i = 0; i < len; i++) {
+            if (i < s.size()) {
+                // walk the literal backwards: position i is the i-th
+                // character from the end
+                u8 c = s[s.size() - i - 1];
+                u8 c_hi = (c >> 4) & 0xf;
+                u8 c_lo = c & 0xf;
+                nibbleSets[i*2] |= 1 << c_lo;
+                if (lits[lit_id].nocase && ourisalpha(c)) {
+                    // caseless letter: accept the high nibble both with and
+                    // without bit 0x2 (bit 0x20 of the full byte)
+                    nibbleSets[i*2+1] |= (1 << (c_hi&0xd)) | (1 << (c_hi|0x2));
+                } else {
+                    nibbleSets[i*2+1] |= 1 << c_hi;
+                }
+            } else {
+                // literal is shorter than the mask count: any byte matches
+                nibbleSets[i*2] = nibbleSets[i*2+1] = 0xffff;
+            }
+        }
+        litIds.insert(lit_id);
+    }
+
+    // Folds ts into this set: union of nibble sets and of literal ids.
+    // Both sets are expected to have been built with the same len.
+    void merge(const TeddySet &ts) {
+        for (u32 i = 0; i < nibbleSets.size(); i++) {
+            nibbleSets[i] |= ts.nibbleSets[i];
+        }
+        litIds.insert(ts.litIds.begin(), ts.litIds.end());
+    }
+
+    // return the product of the number of accepted nibbles at every
+    // position, i.e. how many of the 16^(2*len) possible nibble
+    // combinations this set accepts. The value is proportional to the
+    // likelihood of this TeddySet firing a first-stage accept if it was
+    // given a bucket of its own and random data were to be passed in.
+    // A set with an empty nibble set (e.g. one with no literals) yields 0.
+    u64a probability() const {
+        u64a val = 1;
+        for (size_t i = 0; i < nibbleSets.size(); i++) {
+            val *= popcount32((u32)nibbleSets[i]);
+        }
+        return val;
+    }
+
+    // return a score based around the chance of this hitting times
+    // a small fixed cost + the cost of traversing some sort of followup
+    // (assumption is that the followup is linear)
+    u64a heuristic() const {
+        return probability() * (2+litCount());
+    }
+
+    // true if some single nibble pair is accepted at every position, so a
+    // run of one repeated byte could pass all masks
+    bool isRunProne() const {
+        u16 lo_and = 0xffff;
+        u16 hi_and = 0xffff;
+        for (u32 i = 0; i < len; i++) {
+            lo_and &= nibbleSets[i*2];
+            hi_and &= nibbleSets[i*2+1];
+        }
+        // we're not flood-prone if there's no way to get
+        // through with a flood
+        if (!lo_and || !hi_and) {
+            return false;
+        }
+        return true;
+    }
+};
+
+/**
+ * Greedily merges per-literal TeddySets until no acceptable merge remains,
+ * then assigns each surviving set to a bucket.
+ *
+ * Each round scans pairs of sets. A pair whose merged heuristic is lower
+ * than the sum of the two separate heuristics is chosen immediately for
+ * that outer iteration; otherwise the pair with the smallest score
+ * increase is remembered, skipping merges that would turn a non-run-prone
+ * pair into a run-prone set. While the number of sets already fits in the
+ * available buckets, only sets with identical nibble sets are considered.
+ *
+ * Returns false if more sets remain than the engine has buckets; otherwise
+ * appends the literal ids of the n-th set to bucketToLits[n] and returns
+ * true.
+ */
+bool TeddyCompiler::pack(map<BucketIndex,
+                             std::vector<LiteralIndex> > &bucketToLits) {
+    // Start with one singleton set per literal.
+    set<TeddySet> sts;
+    for (u32 lit_idx = 0; lit_idx < lits.size(); lit_idx++) {
+        TeddySet single(lits, eng.numMasks);
+        single.addLiteral(lit_idx);
+        sts.insert(single);
+    }
+
+    for (;;) {
+#ifdef TEDDY_DEBUG
+        printf("Size %zu\n", sts.size());
+        for (set<TeddySet>::const_iterator i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) {
+            printf("\n"); i1->dump();
+        }
+        printf("\n===============================================\n");
+#endif
+
+        auto m1 = sts.end();
+        auto m2 = sts.end();
+        u64a best = 0xffffffffffffffffULL;
+
+        for (auto i1 = sts.begin(); i1 != sts.end(); ++i1) {
+            const TeddySet &s1 = *i1;
+            auto i2 = i1;
+            for (++i2; i2 != sts.end(); ++i2) {
+                const TeddySet &s2 = *i2;
+
+                // be more conservative if we don't absolutely need to
+                // keep packing
+                const bool fitsAlready = sts.size() <= eng.getNumBuckets();
+                if (fitsAlready && !s1.identicalTail(s2)) {
+                    continue;
+                }
+
+                TeddySet tmpSet(lits, eng.numMasks);
+                tmpSet.merge(s1);
+                tmpSet.merge(s2);
+                const u64a newScore = tmpSet.heuristic();
+                const u64a oldScore = s1.heuristic() + s2.heuristic();
+
+                // a strict improvement is taken straight away
+                if (newScore < oldScore) {
+                    m1 = i1;
+                    m2 = i2;
+                    break;
+                }
+
+                // never create a run-prone set out of a pair that was not
+                // already run-prone on both sides
+                const bool oldRunProne = s1.isRunProne() && s2.isRunProne();
+                const bool newRunProne = tmpSet.isRunProne();
+                if (newRunProne && !oldRunProne) {
+                    continue;
+                }
+
+                const u64a score = newScore - oldScore;
+                if (score < best) {
+                    best = score;
+                    m1 = i1;
+                    m2 = i2;
+                }
+            }
+        }
+        // if we didn't find a merge candidate, bail out
+        if (m1 == sts.end() || m2 == sts.end()) {
+            break;
+        }
+
+        // do the merge
+        TeddySet nts(lits, eng.numMasks);
+        nts.merge(*m1);
+        nts.merge(*m2);
+#ifdef TEDDY_DEBUG
+        printf("Merging\n");
+        printf("m1 = \n");
+        m1->dump();
+        printf("m2 = \n");
+        m2->dump();
+        printf("nts = \n");
+        nts.dump();
+        printf("\n===============================================\n");
+#endif
+        sts.erase(m1);
+        sts.erase(m2);
+        sts.insert(nts);
+    }
+
+    if (sts.size() > eng.getNumBuckets()) {
+        return false;
+    }
+
+    // one bucket per surviving set, numbered in set iteration order
+    u32 cnt = 0;
+    for (const TeddySet &ts : sts) {
+        for (u32 lit_id : ts.getLits()) {
+            bucketToLits[cnt].push_back(lit_id);
+        }
+        cnt++;
+    }
+    return true;
+}
+
+/**
+ * Builds the complete Teddy bytecode for the literals held by this compiler.
+ *
+ * Layout of the returned allocation, in order: the Teddy header, the nibble
+ * masks (maskLen bytes), the confirm structures, the flood-control
+ * structures and, if supplied, a copy of the linked blob.
+ *
+ * \param link (pointer, size) of a blob to append after the flood-control
+ *        data, or a null pointer for none. On success the blob is copied and
+ *        the original freed with aligned_free(); on the nullptr return paths
+ *        it is left untouched.
+ * \return the new engine, or nullptr if there are more literals than
+ *         getNumBuckets() * TEDDY_BUCKET_LOAD or if pack() cannot fit them
+ *         into the available buckets.
+ */
+aligned_unique_ptr<FDR> TeddyCompiler::build(pair<u8 *, size_t> link) {
+    if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
+        DEBUG_PRINTF("too many literals: %zu\n", lits.size());
+        return nullptr;
+    }
+
+#ifdef TEDDY_DEBUG
+    for (size_t i = 0; i < lits.size(); i++) {
+        printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
+               lits[i].nocase ? "caseless" : "caseful");
+        for (size_t j = 0; j < lits[i].s.size(); j++) {
+            printf("%02x", ((u32)lits[i].s[j])&0xff);
+        }
+        printf("\n");
+    }
+#endif
+
+    // Decide which literals share a bucket. Without confirm, needConfirm()
+    // has established that there are no more literals than buckets, so each
+    // literal simply gets a bucket of its own.
+    map<BucketIndex, std::vector<LiteralIndex> > bucketToLits;
+    if (eng.needConfirm(lits)) {
+        if (!pack(bucketToLits)) {
+            DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
+                         lits.size(), eng.getNumBuckets());
+            return nullptr;
+        }
+    } else {
+        for (u32 i = 0; i < lits.size(); i++) {
+            bucketToLits[i].push_back(i);
+        }
+    }
+    // one mask byte carries 8 buckets
+    u32 maskWidth = eng.getNumBuckets() / 8;
+
+    // per mask position: a low-nibble and a high-nibble table of 16 entries,
+    // each entry maskWidth bytes wide
+    size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
+
+    pair<u8 *, size_t> floodControlTmp = setupFDRFloodControl(lits, eng);
+    pair<u8 *, size_t> confirmTmp
+        = setupFullMultiConfs(lits, eng, bucketToLits, make_small);
+
+    size_t size = ROUNDUP_N(sizeof(Teddy) +
+                             maskLen +
+                             confirmTmp.second +
+                             floodControlTmp.second +
+                             link.second, 16 * maskWidth);
+
+    aligned_unique_ptr<FDR> fdr = aligned_zmalloc_unique<FDR>(size);
+    assert(fdr); // otherwise would have thrown std::bad_alloc
+    Teddy *teddy = (Teddy *)fdr.get(); // ugly
+    u8 *teddy_base = (u8 *)teddy;
+
+    // Teddy::size is a u32: convert explicitly through verify_u32(), like
+    // the other header fields below, rather than narrowing silently.
+    teddy->size = verify_u32(size);
+    teddy->engineID = eng.getID();
+    teddy->maxStringLen = verify_u32(maxLen(lits));
+
+    // confirm structures follow the masks
+    u8 *ptr = teddy_base + sizeof(Teddy) + maskLen;
+    memcpy(ptr, confirmTmp.first, confirmTmp.second);
+    ptr += confirmTmp.second;
+    aligned_free(confirmTmp.first);
+
+    // then flood control; its offset is recorded in the header
+    teddy->floodOffset = verify_u32(ptr - teddy_base);
+    memcpy(ptr, floodControlTmp.first, floodControlTmp.second);
+    ptr += floodControlTmp.second;
+    aligned_free(floodControlTmp.first);
+
+    // finally the optional linked blob; link == 0 means "none"
+    if (link.first) {
+        teddy->link = verify_u32(ptr - teddy_base);
+        memcpy(ptr, link.first, link.second);
+        aligned_free(link.first);
+    } else {
+        teddy->link = 0;
+    }
+
+    u8 *baseMsk = teddy_base + sizeof(Teddy);
+
+    for (map<BucketIndex, std::vector<LiteralIndex> >::const_iterator
+             i = bucketToLits.begin(),
+             e = bucketToLits.end();
+         i != e; ++i) {
+        const u32 bucket_id = i->first;
+        const vector<LiteralIndex> &ids = i->second;
+        // bit of this bucket within its mask byte
+        const u8 bmsk = 1U << (bucket_id % 8);
+
+        for (vector<LiteralIndex>::const_iterator i2 = ids.begin(),
+                                                  e2 = ids.end();
+             i2 != e2; ++i2) {
+            LiteralIndex lit_id = *i2;
+            const hwlmLiteral & l = lits[lit_id];
+            DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
+            const u32 sz = verify_u32(l.s.size());
+
+            // fill in masks; mask j covers the j-th character from the end
+            // of the literal
+            for (u32 j = 0; j < eng.numMasks; j++) {
+                u32 msk_id_lo = j * 2 * maskWidth + (bucket_id  / 8);
+                u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id  / 8);
+
+                // if we don't have a char at this position, fill in all 16
+                // locations in these masks with '1'
+                if (j >= sz) {
+                    for (u32 n = 0; n < 16; n++) {
+                        baseMsk[msk_id_lo * 16 + n] |= bmsk;
+                        baseMsk[msk_id_hi * 16 + n] |= bmsk;
+                    }
+                } else {
+                    u8 c = l.s[sz - 1 - j];
+                    // if we do have a char at this position
+                    const u32 hiShift = 4;
+                    u32 n_hi = (c >> hiShift) & 0xf;
+                    u32 n_lo = c & 0xf;
+
+                    if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
+                        // the literal carries a non-zero msk byte for this
+                        // position: accept every nibble value that agrees
+                        // with cmp on the bits selected by msk
+                        u8 m = l.msk[l.msk.size() - 1 - j];
+                        u8 m_hi = (m >> hiShift) & 0xf;
+                        u8 m_lo = m & 0xf;
+                        u8 cmp = l.cmp[l.msk.size() - 1 - j];
+                        u8 cmp_lo = cmp & 0xf;
+                        u8 cmp_hi = (cmp >> hiShift) & 0xf;
+
+                        for (u8 cm = 0; cm < 0x10; cm++) {
+                            if ((cm & m_lo) == (cmp_lo & m_lo)) {
+                                baseMsk[msk_id_lo * 16 + cm] |= bmsk;
+                            }
+                            if ((cm & m_hi) == (cmp_hi & m_hi)) {
+                                baseMsk[msk_id_hi * 16 + cm] |= bmsk;
+                            }
+                        }
+                    } else{
+                        if (l.nocase && ourisalpha(c)) {
+                            // caseless letter: accept the high nibble with
+                            // the 0x20 case bit both cleared and set
+                            u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
+                            u32 cmHalfSet   = (0x20 >> hiShift) & 0xf;
+                            baseMsk[msk_id_hi * 16 + (n_hi & cmHalfClear)] |= bmsk;
+                            baseMsk[msk_id_hi * 16 + (n_hi | cmHalfSet  )] |= bmsk;
+                        } else {
+                            baseMsk[msk_id_hi * 16 + n_hi] |= bmsk;
+                        }
+                        baseMsk[msk_id_lo * 16 + n_lo] |= bmsk;
+                    }
+                }
+            }
+        }
+    }
+
+
+#ifdef TEDDY_DEBUG
+    for (u32 i = 0; i < eng.numMasks * 2; i++) {
+        for (u32 j = 0; j < 16; j++) {
+            u8 val = baseMsk[i * 16 + j];
+            for (u32 k = 0; k < 8; k++) {
+                printf("%s", ((val >> k) & 0x1) ? "1" : "0");
+            }
+            printf(" ");
+        }
+        printf("\n");
+    }
+#endif
+
+    return fdr;
+}
+
+} // namespace
+
+/**
+ * Entry point for building a Teddy engine.
+ *
+ * With hint == HINT_INVALID the engine description is selected by
+ * chooseTeddyEngine(); otherwise the description whose ID equals the hint
+ * is used. Returns nullptr if no description is available or if the
+ * compiler cannot build the literal set with it.
+ */
+aligned_unique_ptr<FDR> teddyBuildTableHinted(const vector<hwlmLiteral> &lits,
+                                              bool make_small, u32 hint,
+                                              const target_t &target,
+                                              pair<u8 *, size_t> link) {
+    unique_ptr<TeddyEngineDescription> des =
+        (hint == HINT_INVALID) ? chooseTeddyEngine(target, lits)
+                               : getTeddyDescription(hint);
+    if (des) {
+        TeddyCompiler tc(lits, *des, make_small);
+        return tc.build(link);
+    }
+    return nullptr;
+}
+
+} // namespace ue2
--- a/src/fdr/teddy_compile.h
+++ b/src/fdr/teddy_compile.h
@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: Teddy build API.
+ */
+
+#ifndef TEDDY_COMPILE_H
+#define TEDDY_COMPILE_H
+
+#include "ue2common.h"
+#include "util/alloc.h"
+
+#include <vector>
+#include <utility> // std::pair
+
+struct FDR;
+struct target_t;
+
+namespace ue2 {
+
+struct hwlmLiteral;
+
+/**
+ * \brief Build a Teddy literal matcher for the given literals.
+ *
+ * \param lits literals to match.
+ * \param make_small forwarded to the confirm-structure builder.
+ * \param hint engine ID to use, or HINT_INVALID to let the implementation
+ *        choose an engine for \p target.
+ * \param target target platform used when choosing an engine.
+ * \param link (pointer, size) of a blob to append to the engine; the pointer
+ *        may be null. On success the blob is copied and the original freed.
+ * \return the built engine, or nullptr if no suitable Teddy engine exists or
+ *         the literals cannot be handled.
+ */
+ue2::aligned_unique_ptr<FDR>
+teddyBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
+                      u32 hint, const target_t &target,
+                      std::pair<u8 *, size_t> link);
+
+} // namespace ue2
+
+#endif // TEDDY_COMPILE_H
--- a/src/fdr/teddy_engine_description.cpp
+++ b/src/fdr/teddy_engine_description.cpp
@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_confirm.h"
+#include "ue2common.h"
+#include "hs_internal.h"
+#include "fdr_engine_description.h"
+#include "teddy_internal.h"
+#include "teddy_engine_description.h"
+#include "util/make_unique.h"
+
+#include <cmath>
+
+using namespace std;
+
+namespace ue2 {
+
+// Builds a description from a table entry: the CPU feature bits are turned
+// into a target via targetByArchFeatures(), and id, bucket count and the
+// confirm parameters are handed to the EngineDescription base. numMasks and
+// packed are kept as Teddy-specific members.
+TeddyEngineDescription::TeddyEngineDescription(const TeddyEngineDef &def)
+    : EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
+                        def.numBuckets, def.confirmPullBackDistance,
+                        def.confirmTopLevelSplit),
+      numMasks(def.numMasks), packed(def.packed) {}
+
+// For Teddy the default flood suffix length is the number of masks.
+u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
+    return numMasks;
+}
+
+// Confirm is needed when the engine is packed, when there are more literals
+// than buckets, or when any literal is longer than the number of masks or
+// carries a non-empty msk.
+bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const {
+    bool confirm = packed || lits.size() > getNumBuckets();
+    for (auto it = lits.begin(); !confirm && it != lits.end(); ++it) {
+        confirm = it->s.size() > numMasks || !it->msk.empty();
+    }
+    return confirm;
+}
+
+#include "teddy_autogen_compiler.cpp"
+
+// Returns, over all literals, the length of the longest run of identical
+// characters at the end of a literal (at least 1 for a non-empty literal,
+// 0 only for an empty input vector).
+static
+size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {
+    size_t longest = 0;
+    for (const auto &lit : vl) {
+        const string &s = lit.s;
+        assert(!s.empty());
+        const size_t n = s.length();
+        // extend the run backwards while characters equal the last one
+        size_t run = 1;
+        while (run < n && s[n - run - 1] == s[n - 1]) {
+            run++;
+        }
+        if (longest < run) {
+            longest = run;
+        }
+    }
+    return longest;
+}
+
+/**
+ * \brief Decides whether this Teddy engine may be used for this set of
+ * literals on this target.
+ *
+ * An engine is rejected if it is not valid on the target, if it is unpacked
+ * and has fewer buckets than there are literals, if the literal count
+ * exceeds buckets * TEDDY_BUCKET_LOAD, if it has more masks than the longest
+ * literal has characters, or if (for more than 40 literals) over a fifth of
+ * the literals are shorter than the mask count.
+ */
+static
+bool isAllowed(const vector<hwlmLiteral> &vl, const TeddyEngineDescription &eng,
+               const size_t max_lit_len, const target_t &target) {
+    if (!eng.isValidOnTarget(target)) {
+        DEBUG_PRINTF("%u disallowed: not valid on target\n", eng.getID());
+        return false;
+    }
+
+    const size_t num_lits = vl.size();
+
+    if (!eng.packed && eng.getNumBuckets() < num_lits) {
+        DEBUG_PRINTF("%u disallowed: num buckets < num lits and not packed\n",
+                     eng.getID());
+        return false;
+    }
+    if (eng.getNumBuckets() * TEDDY_BUCKET_LOAD < num_lits) {
+        DEBUG_PRINTF("%u disallowed: too many lits for num buckets\n",
+                     eng.getID());
+        return false;
+    }
+    if (eng.numMasks > max_lit_len) {
+        DEBUG_PRINTF("%u disallowed: more masks than max lit len (%zu)\n",
+                     eng.getID(), max_lit_len);
+        return false;
+    }
+
+    // For larger literal sets, limit the share of literals that do not fill
+    // every mask.
+    if (num_lits > 40) {
+        u32 n_small_lits = 0;
+        for (auto it = vl.begin(); it != vl.end(); ++it) {
+            if (it->s.length() < eng.numMasks) {
+                ++n_small_lits;
+            }
+        }
+        if (n_small_lits * 5 > num_lits) {
+            DEBUG_PRINTF("too many short literals (%u)\n", n_small_lits);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Scores every Teddy engine that isAllowed() accepts for these literals and
+ * returns a copy of the highest-scoring description (the earliest one wins
+ * ties), or nullptr if no engine is allowed.
+ */
+unique_ptr<TeddyEngineDescription>
+chooseTeddyEngine(const target_t &target, const vector<hwlmLiteral> &vl) {
+    vector<TeddyEngineDescription> descs;
+    getTeddyDescriptions(&descs);
+
+    const size_t max_lit_len = maxLen(vl);
+    const size_t max_flood_tail = maxFloodTailLen(vl);
+    DEBUG_PRINTF("%zu lits, max_lit_len=%zu, max_flood_tail=%zu\n", vl.size(),
+                 max_lit_len, max_flood_tail);
+
+    const TeddyEngineDescription *best = nullptr;
+    u32 best_score = 0;
+
+    for (const TeddyEngineDescription &eng : descs) {
+        if (!isAllowed(vl, eng, max_lit_len, target)) {
+            continue;
+        }
+
+        // We prefer unpacked Teddy models.
+        u32 score = eng.packed ? 0 : 100;
+
+        // If we're heavily loaded, we prefer to have more masks; lightly
+        // loaded cases are great.
+        const bool heavy = vl.size() > 4 * eng.getNumBuckets();
+        score += heavy ? eng.numMasks * 4 : 100;
+
+        // We want enough masks to avoid becoming flood-prone.
+        if (eng.numMasks > max_flood_tail) {
+            score += 50;
+        }
+
+        // We prefer having 3 masks. 3 is just right.
+        score += 6 / (abs(3 - (int)eng.numMasks) + 1);
+
+        // We prefer cheaper, smaller Teddy models.
+        score += 16 / eng.getNumBuckets();
+
+        DEBUG_PRINTF("teddy %u: masks=%u, buckets=%u, packed=%u "
+                     "-> score=%u\n",
+                     eng.getID(), eng.numMasks, eng.getNumBuckets(),
+                     eng.packed ? 1U : 0U, score);
+
+        if (best == nullptr || score > best_score) {
+            best = &eng;
+            best_score = score;
+        }
+    }
+
+    if (best == nullptr) {
+        DEBUG_PRINTF("failed to find engine\n");
+        return nullptr;
+    }
+
+    DEBUG_PRINTF("using engine %u\n", best->getID());
+    return ue2::make_unique<TeddyEngineDescription>(*best);
+}
+
+// Returns a copy of the first Teddy description whose ID equals engineID, or
+// nullptr if there is none.
+unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
+    vector<TeddyEngineDescription> descs;
+    getTeddyDescriptions(&descs);
+
+    auto it = descs.begin();
+    while (it != descs.end() && it->getID() != engineID) {
+        ++it;
+    }
+    if (it == descs.end()) {
+        return nullptr;
+    }
+    return ue2::make_unique<TeddyEngineDescription>(*it);
+}
+
+} // namespace ue2
--- a/src/fdr/teddy_engine_description.h
+++ b/src/fdr/teddy_engine_description.h
@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TEDDY_ENGINE_DESCRIPTION_H
+#define TEDDY_ENGINE_DESCRIPTION_H
+
+#include "engine_description.h"
+#include "fdr_compile_internal.h"
+
+#include <memory>
+#include <vector>
+
+namespace ue2 {
+
+// Literal-count limit factor: a Teddy engine is only used for at most
+// getNumBuckets() * TEDDY_BUCKET_LOAD literals.
+#define TEDDY_BUCKET_LOAD 6
+
+// Plain description of one Teddy engine variant, consumed by the
+// TeddyEngineDescription constructor.
+struct TeddyEngineDef {
+    u32 id;                      // engine ID
+    u64a cpu_features;           // converted with targetByArchFeatures()
+    u32 numMasks;                // number of masks the engine uses
+    u32 numBuckets;              // number of buckets the engine provides
+    bool packed;                 // true for packed variants
+    u32 confirmPullBackDistance; // passed to the EngineDescription base
+    u32 confirmTopLevelSplit;    // passed to the EngineDescription base
+};
+
+// EngineDescription specialised for Teddy: adds the mask count and the
+// packed flag taken from a TeddyEngineDef.
+class TeddyEngineDescription : public EngineDescription {
+public:
+    u32 numMasks; // number of masks, copied from TeddyEngineDef::numMasks
+    bool packed;  // copied from TeddyEngineDef::packed
+
+    explicit TeddyEngineDescription(const TeddyEngineDef &def);
+
+    // Returns numMasks.
+    u32 getDefaultFloodSuffixLength() const override;
+    // True if matching these literals with this engine needs a confirm
+    // stage (packed engine, more literals than buckets, a literal longer
+    // than numMasks, or a literal with a non-empty msk).
+    bool needConfirm(const std::vector<hwlmLiteral> &lits) const;
+};
+
+// Picks the best-scoring Teddy engine allowed for these literals on this
+// target; returns nullptr if none is allowed.
+std::unique_ptr<TeddyEngineDescription>
+chooseTeddyEngine(const target_t &target, const std::vector<hwlmLiteral> &vl);
+// Returns the description with the given engine ID, or nullptr if unknown.
+std::unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID);
+// Fills *out with Teddy engine descriptions.
+void getTeddyDescriptions(std::vector<TeddyEngineDescription> *out);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/teddy_internal.h
+++ b/src/fdr/teddy_internal.h
@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TEDDY_INTERNAL_H
+#define TEDDY_INTERNAL_H
+
+#include "ue2common.h"
+
+// Header at the start of a Teddy engine; the nibble masks follow it
+// directly in the same allocation.
+// first part is compatible with an FDR
+struct Teddy {
+    u32 engineID;     // ID of the Teddy engine description used to build it
+    u32 size;         // total size in bytes of the engine allocation
+    u32 maxStringLen; // length of the longest literal
+    u32 floodOffset;  // byte offset from this header to the flood-control data
+    u32 link;         // byte offset to the linked blob, or 0 if there is none
+    u32 pad1;         // pad1..pad3: not written by the Teddy compiler
+    u32 pad2;
+    u32 pad3;
+};
+
+#endif
--- a/src/grey.cpp
+++ b/src/grey.cpp
@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "grey.h"
+#include "ue2common.h"
+
+#include <algorithm>
+#include <cstdlib> // exit
+#include <string>
+#include <vector>
+
+#define DEFAULT_MAX_HISTORY 60
+
+using namespace std;
+
+namespace ue2 {
+
+/**
+ * \brief Give every tunable its default value.
+ *
+ * This initialiser list is where the defaults live: the "help" output of
+ * applyGreyOverrides() (non-release builds) prints them by reading a
+ * default-constructed Grey.
+ *
+ * The entries are listed in the same order as the members are declared in
+ * grey.h. dumpPath is the one member not mentioned here; it is left as a
+ * default-constructed std::string.
+ */
+Grey::Grey(void) :
+                   optimiseComponentTree(true),
+                   performGraphSimplification(true),
+                   prefilterReductions(true),
+                   removeEdgeRedundancy(true),
+                   allowGough(true),
+                   allowHaigLit(true),
+                   allowLitHaig(true),
+                   allowLbr(true),
+                   allowMcClellan(true),
+                   allowPuff(true),
+                   allowRose(true),
+                   allowExtendedNFA(true), /* bounded repeats of course */
+                   allowLimExNFA(true),
+                   allowSidecar(true),
+                   allowAnchoredAcyclic(true),
+                   allowSmallLiteralSet(true),
+                   allowCastle(true),
+                   allowDecoratedLiteral(true),
+                   allowNoodle(true),
+                   fdrAllowTeddy(true),
+                   puffImproveHead(true),
+                   castleExclusive(true),
+                   mergeSEP(true), /* short exhaustible passthroughs */
+                   mergeRose(true), // roses inside rose
+                   mergeSuffixes(true), // suffix nfas inside rose
+                   mergeOutfixes(true),
+                   onlyOneOutfix(false),
+                   allowShermanStates(true),
+                   allowMcClellan8(true),
+                   highlanderPruneDFA(true),
+                   minimizeDFA(true),
+                   accelerateDFA(true),
+                   accelerateNFA(true),
+                   reverseAccelerate(true),
+                   squashNFA(true),
+                   compressNFAState(true),
+                   numberNFAStatesWrong(false), /* debugging only */
+                   highlanderSquash(true),
+                   allowZombies(true),
+                   floodAsPuffette(false),
+                   nfaForceSize(0),
+                   nfaForceShifts(0),
+                   maxHistoryAvailable(DEFAULT_MAX_HISTORY),
+                   minHistoryAvailable(0), /* debugging only */
+                   maxAnchoredRegion(63), /* for rose's atable to run over */
+                   minRoseLiteralLength(3),
+                   minRoseNetflowLiteralLength(2),
+                   maxRoseNetflowEdges(50000), /* otherwise no netflow pass. */
+                   minExtBoundedRepeatSize(32),
+                   goughCopyPropagate(true),
+                   goughRegisterAllocate(true),
+                   shortcutLiterals(true),
+                   roseGraphReduction(true),
+                   roseRoleAliasing(true),
+                   roseMasks(true),
+                   roseMaxBadLeafLength(5),
+                   roseConvertInfBadLeaves(true),
+                   roseConvertFloodProneSuffixes(true),
+                   roseMergeRosesDuringAliasing(true),
+                   roseMultiTopRoses(true),
+                   roseHamsterMasks(true),
+                   roseLookaroundMasks(true),
+                   roseMcClellanPrefix(1),
+                   roseMcClellanSuffix(1),
+                   roseMcClellanOutfix(2),
+                   roseTransformDelay(true),
+                   roseDesiredSplit(4),
+                   earlyMcClellanPrefix(true),
+                   earlyMcClellanInfix(true),
+                   earlyMcClellanSuffix(true),
+                   allowCountingMiracles(true),
+                   allowSomChain(true),
+                   somMaxRevNfaLength(126),
+                   hamsterAccelForward(true),
+                   hamsterAccelReverse(false),
+                   miracleHistoryBonus(16),
+                   equivalenceEnable(true),
+
+                   allowSmallWrite(true), // McClellan dfas for small patterns
+
+                   smallWriteLargestBuffer(70), // largest buffer that can be
+                                                // considered a small write
+                                                // all blocks larger than this
+                                                // are given to rose &co
+                   smallWriteLargestBufferBad(35),
+                   limitSmallWriteOutfixSize(1048576), // 1 MB
+                   dumpFlags(0),
+                   limitPatternCount(8000000), // 8M patterns
+                   limitPatternLength(16000),  // 16K bytes
+                   limitGraphVertices(500000), // 500K vertices
+                   limitGraphEdges(1000000), // 1M edges
+                   limitReportCount(4*8000000),
+                   limitLiteralCount(8000000), // 8M literals
+                   limitLiteralLength(16000),
+                   limitLiteralMatcherChars(1073741824), // 1 GB
+                   limitLiteralMatcherSize(1073741824), // 1 GB
+                   limitRoseRoleCount(4*8000000),
+                   limitRoseEngineCount(8000000), // 8M engines
+                   limitRoseAnchoredSize(1073741824), // 1 GB
+                   limitEngineSize(1073741824), // 1 GB
+                   limitDFASize(1073741824), // 1 GB
+                   limitNFASize(1048576), // 1 MB
+                   limitLBRSize(1048576) // 1 MB
+{
+    // Sanity-check the default itself; applyGreyOverrides() repeats this
+    // check after user overrides have been applied.
+    assert(maxAnchoredRegion < 64); /* a[lm]_log_sum have limited capacity */
+}
+
+} // namespace ue2
+
+#ifndef RELEASE_BUILD
+
+#include <boost/lexical_cast.hpp>
+using boost::lexical_cast;
+
+namespace ue2 {
+
+/**
+ * \brief Apply textual overrides to a Grey (non-release builds only).
+ *
+ * \param g  Grey structure to modify in place.
+ * \param s  Override string of the form "key:value;key:value;...". Each value
+ *           is parsed as an unsigned int and assigned to the member of \p g
+ *           named by the key. The strings "help" and "help:" print every
+ *           overridable key with its default and change nothing.
+ *
+ * Besides member names, four shorthand keys are accepted: "simple_som",
+ * "forceOutfixesNFA", "forceOutfixesDFA" and "forceOutfixes". They still need
+ * a ":value" part, but the value is ignored; each sets a fixed group of
+ * members.
+ *
+ * Parsing stops silently at the first entry that contains no ':'.
+ *
+ * If any key is unknown, or any value is not a valid unsigned int, a message
+ * is printed for each offender and, once the whole string has been processed,
+ * the help text is printed and the process exits with status 1.
+ */
+void applyGreyOverrides(Grey *g, const string &s) {
+    string::const_iterator p = s.begin();
+    string::const_iterator pe = s.end();
+    string help = "help:0";
+    bool invalid_key_seen = false;
+    Grey defaultg; // source of the default values shown by "help"
+
+    if (s == "help" || s == "help:") {
+        printf("Valid grey overrides:\n");
+        // Run the normal loop over a synthetic "help:0" entry so that every
+        // G_UPDATE line below prints its own help row.
+        p = help.begin();
+        pe = help.end();
+    }
+
+    while (p != pe) {
+        string::const_iterator ke = find(p, pe, ':');
+
+        if (ke == pe) {
+            break;
+        }
+
+        string key(p, ke);
+
+        string::const_iterator ve = find(ke, pe, ';');
+
+        // lexical_cast throws on empty or non-numeric text; treat that like
+        // an invalid key rather than letting the exception escape.
+        const string value_text(ke + 1, ve);
+        unsigned int value = 0;
+        try {
+            value = lexical_cast<unsigned int>(value_text);
+        } catch (const boost::bad_lexical_cast &) {
+            printf("Invalid grey override value %s:%s\n", key.c_str(),
+                   value_text.c_str());
+            invalid_key_seen = true;
+
+            // Skip this entry and carry on with the next one.
+            p = ve;
+            if (p != pe) {
+                ++p;
+            }
+            continue;
+        }
+        bool done = false;
+
+        /* surely there exists a nice template to go with this macro to make
+         * all the boring code disappear */
+#define G_UPDATE(k) do {                                                \
+            if (key == ""#k) { g->k = value; done = true; }             \
+            if (key == "help") {                                        \
+                printf("\t%-30s\tdefault: %s\n", #k,                    \
+                       lexical_cast<string>(defaultg.k).c_str());       \
+            }                                                           \
+        } while (0)
+
+        G_UPDATE(optimiseComponentTree);
+        G_UPDATE(performGraphSimplification);
+        G_UPDATE(prefilterReductions);
+        G_UPDATE(removeEdgeRedundancy);
+        G_UPDATE(allowGough);
+        G_UPDATE(allowHaigLit);
+        G_UPDATE(allowLitHaig);
+        G_UPDATE(allowLbr);
+        G_UPDATE(allowMcClellan);
+        G_UPDATE(allowPuff);
+        G_UPDATE(allowRose);
+        G_UPDATE(allowExtendedNFA);
+        G_UPDATE(allowLimExNFA);
+        G_UPDATE(allowSidecar);
+        G_UPDATE(allowAnchoredAcyclic);
+        G_UPDATE(allowSmallLiteralSet);
+        G_UPDATE(allowCastle);
+        G_UPDATE(allowDecoratedLiteral);
+        G_UPDATE(allowNoodle);
+        G_UPDATE(fdrAllowTeddy);
+        G_UPDATE(puffImproveHead);
+        G_UPDATE(castleExclusive);
+        G_UPDATE(mergeSEP);
+        G_UPDATE(mergeRose);
+        G_UPDATE(mergeSuffixes);
+        G_UPDATE(mergeOutfixes);
+        G_UPDATE(onlyOneOutfix);
+        G_UPDATE(allowShermanStates);
+        G_UPDATE(allowMcClellan8);
+        G_UPDATE(highlanderPruneDFA);
+        G_UPDATE(minimizeDFA);
+        G_UPDATE(accelerateDFA);
+        G_UPDATE(accelerateNFA);
+        G_UPDATE(reverseAccelerate);
+        G_UPDATE(squashNFA);
+        G_UPDATE(compressNFAState);
+        G_UPDATE(numberNFAStatesWrong);
+        G_UPDATE(allowZombies);
+        G_UPDATE(floodAsPuffette);
+        G_UPDATE(nfaForceSize);
+        G_UPDATE(nfaForceShifts);
+        G_UPDATE(highlanderSquash);
+        G_UPDATE(maxHistoryAvailable);
+        G_UPDATE(minHistoryAvailable);
+        G_UPDATE(maxAnchoredRegion);
+        G_UPDATE(minRoseLiteralLength);
+        G_UPDATE(minRoseNetflowLiteralLength);
+        G_UPDATE(maxRoseNetflowEdges);
+        G_UPDATE(minExtBoundedRepeatSize);
+        G_UPDATE(goughCopyPropagate);
+        G_UPDATE(goughRegisterAllocate);
+        G_UPDATE(shortcutLiterals);
+        G_UPDATE(roseGraphReduction);
+        G_UPDATE(roseRoleAliasing);
+        G_UPDATE(roseMasks);
+        G_UPDATE(roseMaxBadLeafLength);
+        G_UPDATE(roseConvertInfBadLeaves);
+        G_UPDATE(roseConvertFloodProneSuffixes);
+        G_UPDATE(roseMergeRosesDuringAliasing);
+        G_UPDATE(roseMultiTopRoses);
+        G_UPDATE(roseHamsterMasks);
+        G_UPDATE(roseLookaroundMasks);
+        G_UPDATE(roseMcClellanPrefix);
+        G_UPDATE(roseMcClellanSuffix);
+        G_UPDATE(roseMcClellanOutfix);
+        G_UPDATE(roseTransformDelay);
+        G_UPDATE(roseDesiredSplit);
+        G_UPDATE(earlyMcClellanPrefix);
+        G_UPDATE(earlyMcClellanInfix);
+        G_UPDATE(earlyMcClellanSuffix);
+        G_UPDATE(allowSomChain);
+        G_UPDATE(allowCountingMiracles);
+        G_UPDATE(somMaxRevNfaLength);
+        G_UPDATE(hamsterAccelForward);
+        G_UPDATE(hamsterAccelReverse);
+        G_UPDATE(miracleHistoryBonus);
+        G_UPDATE(equivalenceEnable);
+        G_UPDATE(allowSmallWrite);
+        G_UPDATE(smallWriteLargestBuffer);
+        G_UPDATE(smallWriteLargestBufferBad);
+        G_UPDATE(limitSmallWriteOutfixSize);
+        G_UPDATE(limitPatternCount);
+        G_UPDATE(limitPatternLength);
+        G_UPDATE(limitGraphVertices);
+        G_UPDATE(limitGraphEdges);
+        G_UPDATE(limitReportCount);
+        G_UPDATE(limitLiteralCount);
+        G_UPDATE(limitLiteralLength);
+        G_UPDATE(limitLiteralMatcherChars);
+        G_UPDATE(limitLiteralMatcherSize);
+        G_UPDATE(limitRoseRoleCount);
+        G_UPDATE(limitRoseEngineCount);
+        G_UPDATE(limitRoseAnchoredSize);
+        G_UPDATE(limitEngineSize);
+        G_UPDATE(limitDFASize);
+        G_UPDATE(limitNFASize);
+        G_UPDATE(limitLBRSize);
+
+#undef G_UPDATE
+        // Shorthand keys: the parsed value is ignored, each one sets a fixed
+        // group of members.
+        if (key == "simple_som") {
+            g->allowHaigLit = false;
+            g->allowLitHaig = false;
+            g->allowSomChain = false;
+            g->somMaxRevNfaLength = 0;
+            done = true;
+        }
+        if (key == "forceOutfixesNFA") {
+            g->allowAnchoredAcyclic = false;
+            g->allowCastle = false;
+            g->allowDecoratedLiteral = false;
+            g->allowGough = false;
+            g->allowHaigLit = false;
+            g->allowLbr = false;
+            g->allowLimExNFA = true;
+            g->allowLitHaig = false;
+            g->allowMcClellan = false;
+            g->allowPuff = false;
+            g->allowRose = false;
+            g->allowSmallLiteralSet = false;
+            g->roseMasks = false;
+            done = true;
+        }
+        if (key == "forceOutfixesDFA") {
+            g->allowAnchoredAcyclic = false;
+            g->allowCastle = false;
+            g->allowDecoratedLiteral = false;
+            g->allowGough = false;
+            g->allowHaigLit = false;
+            g->allowLbr = false;
+            g->allowLimExNFA = false;
+            g->allowLitHaig = false;
+            g->allowMcClellan = true;
+            g->allowPuff = false;
+            g->allowRose = false;
+            g->allowSmallLiteralSet = false;
+            g->roseMasks = false;
+            done = true;
+        }
+        if (key == "forceOutfixes") {
+            g->allowAnchoredAcyclic = false;
+            g->allowCastle = false;
+            g->allowDecoratedLiteral = false;
+            g->allowGough = true;
+            g->allowHaigLit = false;
+            g->allowLbr = false;
+            g->allowLimExNFA = true;
+            g->allowLitHaig = false;
+            g->allowMcClellan = true;
+            g->allowPuff = false;
+            g->allowRose = false;
+            g->allowSmallLiteralSet = false;
+            g->roseMasks = false;
+            done = true;
+        }
+
+        if (!done && key != "help") {
+            printf("Invalid grey override key %s:%u\n", key.c_str(), value);
+            invalid_key_seen = true;
+        }
+
+        // Advance past this entry and its ';' separator, if there is one.
+        p = ve;
+
+        if (p != pe) {
+            ++p;
+        }
+    }
+
+    if (invalid_key_seen) {
+        // Show what would have been accepted, then give up.
+        applyGreyOverrides(g, "help");
+        exit(1);
+    }
+
+    assert(g->maxAnchoredRegion < 64); /* a[lm]_log_sum have limited capacity */
+}
+
+} // namespace ue2
+
+#endif
--- a/src/grey.h
+++ b/src/grey.h
@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef GREY_H
+#define GREY_H
+
+#include <vector>
+#include <string>
+
+#include "ue2common.h"
+
+namespace ue2 {
+
+/**
+ * \brief Bundle of compile-time tuning switches and resource limits.
+ *
+ * Grey::Grey() (grey.cpp) assigns the default for every member except
+ * dumpPath. In non-release builds applyGreyOverrides() can overwrite most
+ * members by name from a "key:value;..." string.
+ */
+struct Grey {
+    Grey(void);
+
+    bool optimiseComponentTree;
+
+    bool performGraphSimplification;
+    bool prefilterReductions;
+    bool removeEdgeRedundancy;
+
+    bool allowGough;
+    bool allowHaigLit;
+    bool allowLitHaig;
+    bool allowLbr;
+    bool allowMcClellan;
+    bool allowPuff;
+    bool allowRose;
+    bool allowExtendedNFA; // bounded repeats
+    bool allowLimExNFA;
+    bool allowSidecar;
+    bool allowAnchoredAcyclic;
+    bool allowSmallLiteralSet;
+    bool allowCastle;
+    bool allowDecoratedLiteral;
+
+    bool allowNoodle;
+    bool fdrAllowTeddy;
+
+    bool puffImproveHead;
+    bool castleExclusive; // enable castle mutual exclusion analysis
+
+    bool mergeSEP; // short exhaustible passthroughs
+    bool mergeRose; // roses inside rose
+    bool mergeSuffixes; // suffix nfas inside rose
+    bool mergeOutfixes;
+    bool onlyOneOutfix; // if > 1 outfix, fail compile
+
+    bool allowShermanStates;
+    bool allowMcClellan8;
+    bool highlanderPruneDFA;
+    bool minimizeDFA;
+
+    bool accelerateDFA;
+    bool accelerateNFA;
+    bool reverseAccelerate;
+
+    bool squashNFA;
+    bool compressNFAState;
+    bool numberNFAStatesWrong; // debugging only
+    bool highlanderSquash;
+    bool allowZombies;
+    bool floodAsPuffette;
+
+    u32 nfaForceSize;
+    u32 nfaForceShifts;
+
+    u32 maxHistoryAvailable;
+    u32 minHistoryAvailable; // debugging only
+    u32 maxAnchoredRegion; // must be < 64 (asserted wherever it is set)
+    u32 minRoseLiteralLength;
+    u32 minRoseNetflowLiteralLength;
+    u32 maxRoseNetflowEdges; // above this, no netflow pass
+
+    u32 minExtBoundedRepeatSize; /* to be considered for ng_repeat */
+
+    bool goughCopyPropagate;
+    bool goughRegisterAllocate;
+
+    bool shortcutLiterals;
+
+    bool roseGraphReduction;
+    bool roseRoleAliasing;
+    bool roseMasks;
+    u32 roseMaxBadLeafLength;
+    bool roseConvertInfBadLeaves;
+    bool roseConvertFloodProneSuffixes;
+    bool roseMergeRosesDuringAliasing;
+    bool roseMultiTopRoses;
+    bool roseHamsterMasks;
+    bool roseLookaroundMasks;
+    u32 roseMcClellanPrefix; /* 0 = off, 1 = only if large nfa, 2 = always */
+    u32 roseMcClellanSuffix; /* 0 = off, 1 = only if very large nfa, 2 =
+                              * always */
+    u32 roseMcClellanOutfix; /* 0 = off, 1 = sometimes, 2 = almost always */
+    bool roseTransformDelay;
+    u32 roseDesiredSplit;
+
+    bool earlyMcClellanPrefix;
+    bool earlyMcClellanInfix;
+    bool earlyMcClellanSuffix;
+
+    bool allowCountingMiracles;
+
+    bool allowSomChain;
+    u32 somMaxRevNfaLength;
+
+    bool hamsterAccelForward;
+    bool hamsterAccelReverse; // currently not implemented
+
+    u32 miracleHistoryBonus; /* cheap hack to make miracles better, TODO
+                              * something dignified */
+
+    bool equivalenceEnable;
+
+    // SmallWrite engine
+    bool allowSmallWrite; // McClellan dfas for small patterns
+    u32 smallWriteLargestBuffer;  // largest buffer that can be small write;
+                                  // larger blocks are given to rose & co
+    u32 smallWriteLargestBufferBad;// largest buffer that can be small write
+                                   // NOTE(review): same wording as the field
+                                   // above; how the two differ is not visible
+                                   // here -- confirm and document
+    u32 limitSmallWriteOutfixSize; //!< max total size of outfix DFAs
+
+    // NOTE(review): presumably these values are OR-ed together into
+    // dumpFlags below -- confirm against the dump code.
+    enum DumpFlags {
+        DUMP_NONE       = 0,
+        DUMP_BASICS     = 1 << 0, // Dump basic textual data
+        DUMP_PARSE      = 1 << 1, // Dump component tree to .txt
+        DUMP_INT_GRAPH  = 1 << 2, // Dump non-implementation graphs
+        DUMP_IMPL       = 1 << 3  // Dump implementation graphs
+    };
+
+    u32 dumpFlags;
+    std::string dumpPath; // not set by the constructor: starts out empty
+
+    /* Resource limits. These are somewhat arbitrary, but are intended to bound
+     * the input to many of our internal structures. Exceeding one of these
+     * limits will cause an error to be returned to the user.
+     *
+     * NOTE: Raising these limits may cause smoke to come out of parts of
+     * the runtime. */
+
+    u32 limitPatternCount;  //!< max number of patterns
+    u32 limitPatternLength; //!< max number of characters in a regex
+    u32 limitGraphVertices; //!< max number of states in built NFA graph
+    u32 limitGraphEdges;    //!< max number of edges in built NFA graph
+    u32 limitReportCount;   //!< max number of ReportIDs allocated internally
+
+    // HWLM literal matcher limits.
+    u32 limitLiteralCount;        //!< max number of literals in an HWLM table
+    u32 limitLiteralLength;       //!< max number of characters in a literal
+    u32 limitLiteralMatcherChars; //!< max characters in an HWLM literal matcher
+    u32 limitLiteralMatcherSize;  //!< max size of an HWLM matcher (in bytes)
+
+    // Rose limits.
+    u32 limitRoseRoleCount;    //!< max number of Rose roles
+    u32 limitRoseEngineCount;  //!< max prefix/infix/suffix/outfix engines
+    u32 limitRoseAnchoredSize; //!< max total size of anchored DFAs (bytes)
+
+    // Engine (DFA/NFA/etc) limits.
+    u32 limitEngineSize; //!< max size of an engine (in bytes)
+    u32 limitDFASize;    //!< max size of a DFA (in bytes)
+    u32 limitNFASize;    //!< max size of an NFA (in bytes)
+    u32 limitLBRSize;    //!< max size of an LBR engine (in bytes)
+};
+
+#ifndef RELEASE_BUILD
+/**
+ * \brief Apply "key:value;key:value;..." overrides to \p g.
+ *
+ * Only available when RELEASE_BUILD is not defined. std::string comes from
+ * the <string> include at the top of this header; standard headers must not
+ * be included here, inside namespace ue2.
+ */
+void applyGreyOverrides(Grey *g, const std::string &overrides);
+#endif
+
+} // namespace ue2
+
+#endif
--- a/src/hs.cpp
+++ b/src/hs.cpp
@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Compiler front-end, including public API calls for compilation.
+ */
+#include "allocator.h"
+#include "ue2common.h"
+#include "grey.h"
+#include "hs_compile.h"
+#include "hs_internal.h"
+#include "database.h"
+#include "compiler/compiler.h"
+#include "compiler/error.h"
+#include "nfagraph/ng.h"
+#include "nfagraph/ng_expr_info.h"
+#include "parser/parse_error.h"
+#include "parser/Parser.h"
+#include "parser/prefilter.h"
+#include "util/compile_error.h"
+#include "util/cpuid_flags.h"
+#include "util/depth.h"
+#include "util/popcount.h"
+#include "util/target_info.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <limits.h>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace ue2;
+
+/** \brief True when \p mode has no bits set outside the known HS_MODE_ flags. */
+static
+bool validModeFlags(unsigned int mode) {
+    // Every mode bit this compiler front-end understands.
+    const unsigned knownBits = HS_MODE_BLOCK | HS_MODE_STREAM
+                             | HS_MODE_VECTORED | HS_MODE_SOM_HORIZON_LARGE
+                             | HS_MODE_SOM_HORIZON_MEDIUM
+                             | HS_MODE_SOM_HORIZON_SMALL;
+
+    const unsigned strayBits = mode & ~knownBits;
+    return strayBits == 0;
+}
+
+/**
+ * \brief Validate the mode flags passed to a compile call.
+ *
+ * Returns true if \p mode is acceptable. Otherwise stores a newly generated
+ * compile error in \p comp_error and returns false; \p comp_error is left
+ * untouched on success.
+ */
+static
+bool checkMode(unsigned int mode, hs_compile_error **comp_error) {
+    // Reject any bit we do not know about before looking at combinations.
+    if (!validModeFlags(mode)) {
+        *comp_error = generateCompileError(
+            "Invalid parameter: unrecognised mode flags.", -1);
+        return false;
+    }
+
+    // Exactly one of block / streaming / vectored has to be selected.
+    const unsigned scanBits =
+        mode & (HS_MODE_STREAM | HS_MODE_BLOCK | HS_MODE_VECTORED);
+    if (popcount32(scanBits) != 1) {
+        *comp_error = generateCompileError(
+            "Invalid parameter: mode must have one (and only one) of "
+            "HS_MODE_BLOCK, HS_MODE_STREAM or HS_MODE_VECTORED set.",
+            -1);
+        return false;
+    }
+
+    // SOM precision is optional, but it needs streaming mode and at most one
+    // horizon flag.
+    const unsigned horizonBits = mode & (HS_MODE_SOM_HORIZON_LARGE
+                                         | HS_MODE_SOM_HORIZON_MEDIUM
+                                         | HS_MODE_SOM_HORIZON_SMALL);
+    const bool streaming = (mode & HS_MODE_STREAM) != 0;
+
+    if (horizonBits != 0 && !streaming) {
+        *comp_error = generateCompileError(
+            "Invalid parameter: the HS_MODE_SOM_HORIZON_ mode flags may only "
+            "be set in streaming mode.", -1);
+        return false;
+    }
+
+    if (popcount32(horizonBits) > 1) {
+        *comp_error = generateCompileError(
+            "Invalid parameter: only one HS_MODE_SOM_HORIZON_ mode flag can "
+            "be set.", -1);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * \brief Validate optional platform information.
+ *
+ * A null \p p is accepted. Otherwise the cpu_features field may only contain
+ * bits from HS_CPU_FEATURES_ALL and the tune field may not exceed
+ * HS_TUNE_LAST. On failure a compile error is stored in \p comp_error and
+ * false is returned.
+ */
+static
+bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) {
+#define HS_TUNE_LAST HS_TUNE_FAMILY_BDW
+#define HS_CPU_FEATURES_ALL (HS_CPU_FEATURES_AVX2)
+
+    if (p == nullptr) {
+        return true; // nothing supplied, nothing to check
+    }
+
+    const bool unknownFeatures = (p->cpu_features & ~HS_CPU_FEATURES_ALL) != 0;
+    if (unknownFeatures) {
+        *comp_error = generateCompileError(
+            "Invalid cpu features specified in the platform information.",
+            -1);
+        return false;
+    }
+
+    const bool tuneOutOfRange = p->tune > HS_TUNE_LAST;
+    if (tuneOutOfRange) {
+        *comp_error = generateCompileError(
+            "Invalid tuning value specified in the platform information.",
+            -1);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * \brief Map the SOM-related mode flags to bytes of precision.
+ *
+ * Returns 8, 4 or 2, or 0 when neither vectored mode nor any horizon flag is
+ * set.
+ */
+static
+unsigned getSomPrecision(unsigned mode) {
+    // Vectored mode always gets full precision, and it is tested before the
+    // horizon flags; the large horizon asks for the same eight bytes.
+    if (mode & (HS_MODE_VECTORED | HS_MODE_SOM_HORIZON_LARGE)) {
+        return 8;
+    }
+    if (mode & HS_MODE_SOM_HORIZON_MEDIUM) {
+        return 4;
+    }
+    if (mode & HS_MODE_SOM_HORIZON_SMALL) {
+        return 2;
+    }
+    return 0;
+}
+
+namespace ue2 {
+
+/**
+ * \brief Shared implementation behind the public compile entry points.
+ *
+ * Validates the arguments, feeds each expression to the compiler and builds
+ * the database.
+ *
+ * \param expressions  Array of \p elements pattern strings; must not be null.
+ * \param flags        Per-expression flags, or null for 0 everywhere.
+ * \param ids          Per-expression ids, or null for 0 everywhere.
+ * \param ext          Per-expression extended parameters, or null for none.
+ * \param elements     Number of expressions; must be non-zero and no larger
+ *                     than g.limitPatternCount.
+ * \param mode         HS_MODE_ flags, checked by checkMode().
+ * \param platform     Target platform, or null to use the current target.
+ * \param db           Receives the built database; set to null on failure.
+ * \param comp_error   Receives the error on failure and null on success. If
+ *                     this pointer is itself null only the return code
+ *                     reports the failure.
+ * \param g            Tunables and limits to compile with.
+ *
+ * \return HS_SUCCESS, or HS_COMPILER_ERROR for every kind of failure. On an
+ * allocation failure or an unexpected exception \p comp_error is pointed at
+ * the existing hs_enomem / hs_einternal objects rather than a newly generated
+ * error.
+ */
+hs_error_t
+hs_compile_multi_int(const char *const *expressions, const unsigned *flags,
+                     const unsigned *ids, const hs_expr_ext *const *ext,
+                     unsigned elements, unsigned mode,
+                     const hs_platform_info_t *platform, hs_database_t **db,
+                     hs_compile_error_t **comp_error, const Grey &g) {
+    // Check the args: note that it's OK for flags, ids or ext to be null.
+    if (!comp_error) {
+        if (db) {
+            *db = nullptr;
+        }
+        // nowhere to write the string, but we can still report an error code
+        return HS_COMPILER_ERROR;
+    }
+    if (!db) {
+        *comp_error = generateCompileError("Invalid parameter: db is NULL", -1);
+        return HS_COMPILER_ERROR;
+    }
+    if (!expressions) {
+        *db = nullptr;
+        *comp_error
+            = generateCompileError("Invalid parameter: expressions is NULL",
+                                   -1);
+        return HS_COMPILER_ERROR;
+    }
+    if (elements == 0) {
+        *db = nullptr;
+        *comp_error
+            = generateCompileError("Invalid parameter: elements is zero", -1);
+        return HS_COMPILER_ERROR;
+    }
+
+    if (!checkMode(mode, comp_error)) {
+        *db = nullptr;
+        assert(*comp_error); // set by checkMode.
+        return HS_COMPILER_ERROR;
+    }
+
+    if (!checkPlatform(platform, comp_error)) {
+        *db = nullptr;
+        assert(*comp_error); // set by checkPlatform.
+        return HS_COMPILER_ERROR;
+    }
+
+    if (elements > g.limitPatternCount) {
+        *db = nullptr;
+        *comp_error = generateCompileError("Number of patterns too large", -1);
+        return HS_COMPILER_ERROR;
+    }
+
+    // This function is simply a wrapper around both the parser and compiler
+    bool isStreaming = mode & (HS_MODE_STREAM | HS_MODE_VECTORED);
+    bool isVectored = mode & HS_MODE_VECTORED;
+    unsigned somPrecision = getSomPrecision(mode);
+
+    target_t target_info = platform ? target_t(*platform)
+                                    : get_current_target();
+
+    CompileContext cc(isStreaming, isVectored, target_info, g);
+    NG ng(cc, somPrecision);
+
+    try {
+        for (unsigned int i = 0; i < elements; i++) {
+            // Add this expression to the compiler
+            try {
+                addExpression(ng, i, expressions[i], flags ? flags[i] : 0,
+                              ext ? ext[i] : nullptr, ids ? ids[i] : 0);
+            } catch (CompileError &e) {
+                /* Caught a parse error:
+                 * throw it upstream as a CompileError with a specific index */
+                e.setExpressionIndex(i);
+                throw; /* do not slice */
+            }
+        }
+
+        unsigned length = 0;
+        struct hs_database *out = build(ng, &length);
+
+        assert(out);    // should have thrown exception on error
+        assert(length);
+
+        *db = out;
+        *comp_error = nullptr;
+
+        return HS_SUCCESS;
+    }
+    catch (const CompileError &e) {
+        // Compiler error occurred
+        *db = nullptr;
+        *comp_error = generateCompileError(e.reason,
+                                           e.hasIndex ? (int)e.index : -1);
+        return HS_COMPILER_ERROR;
+    }
+    catch (const std::bad_alloc &) {
+        // Caught by reference: no copy of the exception object, and no
+        // slicing of derived exception types.
+        *db = nullptr;
+        *comp_error = const_cast<hs_compile_error_t *>(&hs_enomem);
+        return HS_COMPILER_ERROR;
+    }
+    catch (...) {
+        assert(!"Internal error, unexpected exception");
+        *db = nullptr;
+        *comp_error = const_cast<hs_compile_error_t *>(&hs_einternal);
+        return HS_COMPILER_ERROR;
+    }
+}
+
+} // namespace ue2
+
+/**
+ * \brief Public API: compile a single expression into a database.
+ *
+ * The expression is compiled with id 0, no extended parameters and the
+ * default Grey settings. A null \p expression is rejected here with
+ * HS_COMPILER_ERROR; all other argument checking is done by
+ * hs_compile_multi_int().
+ *
+ * \p db and \p error are only written through when they are non-null, so a
+ * caller that passes null for either still gets an error code back instead
+ * of a crash.
+ */
+extern "C" HS_PUBLIC_API
+hs_error_t hs_compile(const char *expression, unsigned flags, unsigned mode,
+                      const hs_platform_info_t *platform, hs_database_t **db,
+                      hs_compile_error_t **error) {
+    if (expression == nullptr) {
+        // The output pointers come from the caller and have not been
+        // validated yet, so guard each one before writing through it.
+        if (db) {
+            *db = nullptr;
+        }
+        if (error) {
+            *error = generateCompileError(
+                "Invalid parameter: expression is NULL", -1);
+        }
+        return HS_COMPILER_ERROR;
+    }
+
+    unsigned id = 0; // single expressions get zero as an ID
+    const hs_expr_ext * const *ext = nullptr; // unused for this call.
+
+    return hs_compile_multi_int(&expression, &flags, &id, ext, 1, mode,
+                                platform, db, error, Grey());
+}
+
+/**
+ * \brief Public API: compile several expressions into one database.
+ *
+ * Forwards to hs_compile_multi_int() with no extended parameters and the
+ * default Grey settings.
+ */
+extern "C" HS_PUBLIC_API
+hs_error_t hs_compile_multi(const char * const *expressions,
+                            const unsigned *flags, const unsigned *ids,
+                            unsigned elements, unsigned mode,
+                            const hs_platform_info_t *platform,
+                            hs_database_t **db, hs_compile_error_t **error) {
+    // This entry point has no way to supply extended parameters.
+    const hs_expr_ext * const *noExt = nullptr;
+    const Grey defaults;
+
+    return hs_compile_multi_int(expressions, flags, ids, noExt, elements,
+                                mode, platform, db, error, defaults);
+}
+
+/**
+ * Compile an array of expressions with per-expression extended parameters.
+ *
+ * Forwards every argument, including @a ext, unchanged to
+ * hs_compile_multi_int() together with a default-constructed Grey.
+ *
+ * @return The value returned by hs_compile_multi_int().
+ */
+extern "C" HS_PUBLIC_API
+hs_error_t hs_compile_ext_multi(const char * const *expressions,
+                                const unsigned *flags, const unsigned *ids,
+                                const hs_expr_ext * const *ext,
+                                unsigned elements, unsigned mode,
+                                const hs_platform_info_t *platform,
+                                hs_database_t **db,
+                                hs_compile_error_t **error) {
+    const hs_error_t rv = hs_compile_multi_int(expressions, flags, ids, ext,
+                                               elements, mode, platform, db,
+                                               error, Grey());
+    return rv;
+}
+
+/**
+ * Internal worker behind hs_expression_info().
+ *
+ * Parses @a expression, builds its graph wrapper and fills an hs_expr_info
+ * from it via fillExpressionInfo(). The result is copied into a block obtained
+ * from hs_misc_alloc().
+ *
+ * @param expression  NULL-terminated pattern to analyse.
+ * @param flags       Pattern flags handed to the parser.
+ * @param mode        Only consulted to decide whether the compile context is
+ *                    streaming and/or vectored.
+ * @param info        Receives the allocated result on success. Once it has
+ *                    been validated as non-NULL, *info is set to NULL on every
+ *                    failure path.
+ * @param error       Receives NULL on success, or a compile error describing
+ *                    the failure.
+ *
+ * @return HS_SUCCESS on success, HS_COMPILER_ERROR otherwise (including when
+ *         @a error itself is NULL and no message can be reported).
+ */
+static
+hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
+                                  unsigned int mode, hs_expr_info_t **info,
+                                  hs_compile_error_t **error) {
+    if (!error) {
+        // nowhere to write an error, but we can still return an error code.
+        return HS_COMPILER_ERROR;
+    }
+
+    if (!info) {
+        *error = generateCompileError("Invalid parameter: info is NULL", -1);
+        return HS_COMPILER_ERROR;
+    }
+
+    // Clear the output before any further validation so that no failure path
+    // below can leave stale caller data in *info.
+    *info = nullptr;
+
+    if (!expression) {
+        *error = generateCompileError("Invalid parameter: expression is NULL",
+                                      -1);
+        return HS_COMPILER_ERROR;
+    }
+
+    *error = nullptr;
+
+    hs_expr_info local_info;
+    memset(&local_info, 0, sizeof(local_info));
+
+    try {
+        bool isStreaming = mode & (HS_MODE_STREAM | HS_MODE_VECTORED);
+        bool isVectored = mode & HS_MODE_VECTORED;
+
+        CompileContext cc(isStreaming, isVectored, get_current_target(),
+                          Grey());
+
+        // Ensure that our pattern isn't too long (in characters).
+        if (strlen(expression) > cc.grey.limitPatternLength) {
+            throw ParseError("Pattern length exceeds limit.");
+        }
+
+        ReportManager rm(cc.grey);
+        ParsedExpression pe(0, expression, flags, 0);
+        assert(pe.component);
+
+        // Apply prefiltering transformations if desired.
+        if (pe.prefilter) {
+            prefilterTree(pe.component, ParseMode(flags));
+        }
+
+        unique_ptr<NGWrapper> g = buildWrapper(rm, cc, pe);
+
+        if (!g) {
+            DEBUG_PRINTF("NFA build failed, but no exception was thrown.\n");
+            throw ParseError("Internal error.");
+        }
+
+        fillExpressionInfo(rm, *g, &local_info);
+    }
+    catch (const CompileError &e) {
+        // Compiler error occurred
+        *error = generateCompileError(e);
+        return HS_COMPILER_ERROR;
+    }
+    catch (const std::bad_alloc &) {
+        // Caught by reference, like CompileError above, to avoid copying and
+        // slicing the exception object. The statically allocated error is
+        // used so that reporting does not need a further allocation.
+        *error = const_cast<hs_compile_error_t *>(&hs_enomem);
+        return HS_COMPILER_ERROR;
+    }
+    catch (...) {
+        assert(!"Internal error, unexpected exception");
+        *error = const_cast<hs_compile_error_t *>(&hs_einternal);
+        return HS_COMPILER_ERROR;
+    }
+
+    // The analysis succeeded; only now allocate the block handed back to the
+    // caller, so the error paths above have nothing to release.
+    hs_expr_info *rv = (hs_expr_info *)hs_misc_alloc(sizeof(*rv));
+    if (!rv) {
+        *error = const_cast<hs_compile_error_t *>(&hs_enomem);
+        return HS_COMPILER_ERROR;
+    }
+
+    *rv = local_info;
+    *info = rv;
+    return HS_SUCCESS;
+}
+
+/**
+ * Public entry point for expression analysis.
+ *
+ * Delegates to hs_expression_info_int(), always requesting HS_MODE_BLOCK.
+ *
+ * @return The value returned by hs_expression_info_int().
+ */
+extern "C" HS_PUBLIC_API
+hs_error_t hs_expression_info(const char *expression, unsigned int flags,
+                              hs_expr_info_t **info,
+                              hs_compile_error_t **error) {
+    // Callers cannot choose a mode here; block mode is used unconditionally.
+    const unsigned int block_mode = HS_MODE_BLOCK;
+
+    return hs_expression_info_int(expression, flags, block_mode, info, error);
+}
+
+/**
+ * Fill in @a platform with the values reported by cpuid_flags() and
+ * cpuid_tune().
+ *
+ * @param platform  Structure to populate; it is zeroed in full before the
+ *                  two fields are written.
+ *
+ * @return HS_INVALID if @a platform is NULL, HS_SUCCESS otherwise.
+ */
+extern "C" HS_PUBLIC_API
+hs_error_t hs_populate_platform(hs_platform_info_t *platform) {
+    if (platform == nullptr) {
+        return HS_INVALID;
+    }
+
+    // Zero the whole structure first so that every field not assigned below
+    // (the reserved ones) ends up as zero.
+    memset(platform, 0, sizeof(hs_platform_info_t));
+
+    hs_platform_info_t &target = *platform;
+    target.cpu_features = cpuid_flags();
+    target.tune = cpuid_tune();
+
+    return HS_SUCCESS;
+}
+
+/**
+ * Release a compile error.
+ *
+ * Hands @a error to freeCompileError() and unconditionally reports
+ * HS_SUCCESS.
+ */
+extern "C" HS_PUBLIC_API
+hs_error_t hs_free_compile_error(hs_compile_error_t *error) {
+    freeCompileError(error);
+
+    const hs_error_t status = HS_SUCCESS;
+    return status;
+}
--- a/src/hs.h
+++ b/src/hs.h
@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HS_H_
+#define HS_H_
+
+/**
+ * @file
+ * @brief The complete Hyperscan API definition.
+ *
+ * Hyperscan is a high speed regular expression engine.
+ *
+ * This header includes both the Hyperscan compiler and runtime components. See
+ * the individual component headers for documentation.
+ */
+
+#include "hs_compile.h"
+#include "hs_runtime.h"
+
+#endif /* HS_H_ */
--- a/src/hs_common.h
+++ b/src/hs_common.h
@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HS_COMMON_H_
+#define HS_COMMON_H_
+
+#include <stdlib.h>
+
+/**
+ * @file
+ * @brief The Hyperscan common API definition.
+ *
+ * Hyperscan is a high speed regular expression engine.
+ *
+ * This header contains functions available to both the Hyperscan compiler and
+ * runtime.
+ */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct hs_database;
+
+/**
+ * A Hyperscan pattern database.
+ *
+ * Generated by one of the Hyperscan compiler functions:
+ *  - @ref hs_compile()
+ *  - @ref hs_compile_multi()
+ *  - @ref hs_compile_ext_multi()
+ */
+typedef struct hs_database hs_database_t;
+
+/**
+ * A type for errors returned by Hyperscan functions.
+ */
+typedef int hs_error_t;
+
+/**
+ * Free a compiled pattern database.
+ *
+ * The free callback set by @ref hs_set_database_allocator() (or @ref
+ * hs_set_allocator()) will be used by this function.
+ *
+ * @param db
+ *      A compiled pattern database. NULL may also be safely provided, in which
+ *      case the function does nothing.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_free_database(hs_database_t *db);
+
+/**
+ * Serialize a pattern database to a stream of bytes.
+ *
+ * The allocator callback set by @ref hs_set_misc_allocator() (or @ref
+ * hs_set_allocator()) will be used by this function.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param bytes
+ *      On success, a pointer to an array of bytes will be returned here.
+ *      These bytes can be subsequently relocated or written to disk. The
+ *      caller is responsible for freeing this block.
+ *
+ * @param length
+ *      On success, the number of bytes in the generated byte array will be
+ *      returned here.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, @ref HS_NOMEM if the byte array cannot be
+ *      allocated; other values may be returned if errors are detected.
+ */
+hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes,
+                                 size_t *length);
+
+/**
+ * Reconstruct a pattern database from a stream of bytes previously generated
+ * by @ref hs_serialize_database().
+ *
+ * This function will allocate sufficient space for the database using the
+ * allocator set with @ref hs_set_database_allocator() (or @ref
+ * hs_set_allocator()); to use a pre-allocated region of memory, use the @ref
+ * hs_deserialize_database_at() function.
+ *
+ * @param bytes
+ *      A byte array generated by @ref hs_serialize_database() representing a
+ *      compiled pattern database.
+ *
+ * @param length
+ *      The length of the byte array generated by @ref hs_serialize_database().
+ *      This should be the same value as that returned by @ref
+ *      hs_serialize_database().
+ *
+ * @param db
+ *      On success, a pointer to a newly allocated @ref hs_database_t will be
+ *      returned here. This database can then be used for scanning, and
+ *      eventually freed by the caller using @ref hs_free_database().
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_deserialize_database(const char *bytes, const size_t length,
+                                   hs_database_t **db);
+
+/**
+ * Reconstruct a pattern database from a stream of bytes previously generated
+ * by @ref hs_serialize_database() at a given memory location.
+ *
+ * This function (unlike @ref hs_deserialize_database()) will write the
+ * reconstructed database to the memory location given in the @a db parameter.
+ * The amount of space required at this location can be determined with the
+ * @ref hs_serialized_database_size() function.
+ *
+ * @param bytes
+ *      A byte array generated by @ref hs_serialize_database() representing a
+ *      compiled pattern database.
+ *
+ * @param length
+ *      The length of the byte array generated by @ref hs_serialize_database().
+ *      This should be the same value as that returned by @ref
+ *      hs_serialize_database().
+ *
+ * @param db
+ *      Pointer to an 8-byte aligned block of memory of sufficient size to hold
+ *      the deserialized database. On success, the reconstructed database will
+ *      be written to this location. This database can then be used for pattern
+ *      matching. The user is responsible for freeing this memory; the @ref
+ *      hs_free_database() call should not be used.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length,
+                                      hs_database_t *db);
+
+/**
+ * Provides the size of the stream state allocated by a single stream opened
+ * against the given database.
+ *
+ * @param database
+ *      Pointer to a compiled (streaming mode) pattern database.
+ *
+ * @param stream_size
+ *      On success, the size in bytes of an individual stream opened against the
+ *      given database is placed in this parameter.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_stream_size(const hs_database_t *database, size_t *stream_size);
+
+/**
+ * Provides the size of the given database in bytes.
+ *
+ * @param database
+ *      Pointer to compiled pattern database.
+ *
+ * @param database_size
+ *      On success, the size of the compiled database in bytes is placed in this
+ *      parameter.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_database_size(const hs_database_t *database,
+                            size_t *database_size);
+
+/**
+ * Utility function for reporting the size that would be required by a
+ * database if it were deserialized.
+ *
+ * This can be used to allocate a shared memory region or other "special"
+ * allocation prior to deserializing with the @ref hs_deserialize_database_at()
+ * function.
+ *
+ * @param bytes
+ *      Pointer to a byte array generated by @ref hs_serialize_database()
+ *      representing a compiled pattern database.
+ *
+ * @param length
+ *      The length of the byte array generated by @ref hs_serialize_database().
+ *      This should be the same value as that returned by @ref
+ *      hs_serialize_database().
+ *
+ * @param deserialized_size
+ *      On success, the size of the compiled database that would be generated
+ *      by @ref hs_deserialize_database_at() is returned here.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_serialized_database_size(const char *bytes, const size_t length,
+                                       size_t *deserialized_size);
+
+/**
+ * Utility function providing information about a database.
+ *
+ * @param database
+ *      Pointer to a compiled database.
+ *
+ * @param info
+ *      On success, a string containing the version and platform information for
+ *      the supplied database is placed in this parameter. The string is
+ *      allocated using the allocator supplied in @ref hs_set_misc_allocator()
+ *      (or malloc() if no allocator was set) and should be freed by the caller.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_database_info(const hs_database_t *database, char **info);
+
+/**
+ * Utility function providing information about a serialized database.
+ *
+ * @param bytes
+ *      Pointer to a serialized database.
+ *
+ * @param length
+ *      Length in bytes of the serialized database.
+ *
+ * @param info
+ *      On success, a string containing the version and platform information
+ *      for the supplied serialized database is placed in this parameter. The
+ *      string is allocated using the allocator supplied in @ref
+ *      hs_set_misc_allocator() (or malloc() if no allocator was set) and
+ *      should be freed by the caller.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_serialized_database_info(const char *bytes, size_t length,
+                                       char **info);
+
+/**
+ * The type of the callback function that will be used by Hyperscan to allocate
+ * more memory at runtime as required, for example in @ref hs_open_stream() to
+ * allocate stream state.
+ *
+ * If Hyperscan is to be used in a multi-threaded, or similarly concurrent
+ * environment, the allocation function will need to be re-entrant, or
+ * similarly safe for concurrent use.
+ *
+ * @param size
+ *      The number of bytes to allocate.
+ * @return
+ *      A pointer to the region of memory allocated, or NULL on error.
+ */
+typedef void *(*hs_alloc_t)(size_t size);
+
+/**
+ * The type of the callback function that will be used by Hyperscan to free
+ * memory regions previously allocated using the @ref hs_alloc_t function.
+ *
+ * @param ptr
+ *      The region of memory to be freed.
+ */
+typedef void (*hs_free_t)(void *ptr);
+
+/**
+ * Set the allocate and free functions used by Hyperscan for allocating
+ * memory at runtime for stream state, scratch space, database bytecode,
+ * and various other data structures returned by the Hyperscan API.
+ *
+ * The function is equivalent to calling @ref hs_set_stream_allocator(),
+ * @ref hs_set_scratch_allocator(), @ref hs_set_database_allocator() and
+ * @ref hs_set_misc_allocator() with the provided parameters.
+ *
+ * This call will override any previous allocators that have been set.
+ *
+ * Note: there is no way to change the allocator used for temporary objects
+ * created during the various compile calls (@ref hs_compile(), @ref
+ * hs_compile_multi(), @ref hs_compile_ext_multi()).
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory. This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_set_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
+
+/**
+ * Set the allocate and free functions used by Hyperscan for allocating memory
+ * for database bytecode produced by the compile calls (@ref hs_compile(), @ref
+ * hs_compile_multi(), @ref hs_compile_ext_multi()) and by database
+ * deserialization (@ref hs_deserialize_database()).
+ *
+ * If no database allocation functions are set, or if NULL is used in place of
+ * both parameters, then memory allocation will default to standard methods
+ * (such as the system malloc() and free() calls).
+ *
+ * This call will override any previous database allocators that have been set.
+ *
+ * Note: the database allocator may also be set by calling @ref
+ * hs_set_allocator().
+ *
+ * Note: there is no way to change how temporary objects created during the
+ * various compile calls (@ref hs_compile(), @ref hs_compile_multi(), @ref
+ * hs_compile_ext_multi()) are allocated.
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory. This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_set_database_allocator(hs_alloc_t alloc_func,
+                                     hs_free_t free_func);
+
+/**
+ * Set the allocate and free functions used by Hyperscan for allocating memory
+ * for items returned by the Hyperscan API such as @ref hs_compile_error_t, @ref
+ * hs_expr_info_t and serialized databases.
+ *
+ * If no misc allocation functions are set, or if NULL is used in place of both
+ * parameters, then memory allocation will default to standard methods (such as
+ * the system malloc() and free() calls).
+ *
+ * This call will override any previous misc allocators that have been set.
+ *
+ * Note: the misc allocator may also be set by calling @ref hs_set_allocator().
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory. This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_set_misc_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
+
+/**
+ * Set the allocate and free functions used by Hyperscan for allocating memory
+ * for scratch space by @ref hs_alloc_scratch() and @ref hs_clone_scratch().
+ *
+ * If no scratch allocation functions are set, or if NULL is used in place of
+ * both parameters, then memory allocation will default to standard methods
+ * (such as the system malloc() and free() calls).
+ *
+ * This call will override any previous scratch allocators that have been set.
+ *
+ * Note: the scratch allocator may also be set by calling @ref
+ * hs_set_allocator().
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory. This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_set_scratch_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
+
+/**
+ * Set the allocate and free functions used by Hyperscan for allocating memory
+ * for stream state by @ref hs_open_stream().
+ *
+ * If no stream allocation functions are set, or if NULL is used in place of
+ * both parameters, then memory allocation will default to standard methods
+ * (such as the system malloc() and free() calls).
+ *
+ * This call will override any previous stream allocators that have been set.
+ *
+ * Note: the stream allocator may also be set by calling @ref
+ * hs_set_allocator().
+ *
+ * @param alloc_func
+ *      A callback function pointer that allocates memory. This function must
+ *      return memory suitably aligned for the largest representable data type
+ *      on this platform.
+ *
+ * @param free_func
+ *      A callback function pointer that frees allocated memory.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_set_stream_allocator(hs_alloc_t alloc_func, hs_free_t free_func);
+
+/**
+ * Utility function for identifying this release version.
+ *
+ * @return
+ *      A string containing the version number of this release build and the
+ *      date of the build. It is allocated statically, so it does not need to
+ *      be freed by the caller.
+ */
+const char *hs_version(void);
+
+/**
+ * @defgroup HS_ERROR hs_error_t values
+ *
+ * @{
+ */
+
+/**
+ * The engine completed normally.
+ */
+#define HS_SUCCESS              0
+
+/**
+ * A parameter passed to this function was invalid.
+ */
+#define HS_INVALID              (-1)
+
+/**
+ * A memory allocation failed.
+ */
+#define HS_NOMEM                (-2)
+
+/**
+ * The engine was terminated by callback.
+ *
+ * This return value indicates that the target buffer was partially scanned,
+ * but that the callback function requested that scanning cease after a match
+ * was located.
+ */
+#define HS_SCAN_TERMINATED      (-3)
+
+/**
+ * The pattern compiler failed, and the @ref hs_compile_error_t should be
+ * inspected for more detail.
+ */
+#define HS_COMPILER_ERROR       (-4)
+
+/**
+ * The given database was built for a different version of Hyperscan.
+ */
+#define HS_DB_VERSION_ERROR     (-5)
+
+/**
+ * The given database was built for a different platform (i.e., CPU type).
+ */
+#define HS_DB_PLATFORM_ERROR    (-6)
+
+/**
+ * The given database was built for a different mode of operation. This error
+ * is returned when streaming calls are used with a block or vectored database
+ * and vice versa.
+ */
+#define HS_DB_MODE_ERROR        (-7)
+
+/**
+ * A parameter passed to this function was not correctly aligned.
+ */
+#define HS_BAD_ALIGN            (-8)
+
+/**
+ * The memory allocator (either malloc() or the allocator set with @ref
+ * hs_set_allocator()) did not correctly return memory suitably aligned for the
+ * largest representable data type on this platform.
+ */
+#define HS_BAD_ALLOC            (-9)
+
+/** @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* HS_COMMON_H_ */
--- a/src/hs_compile.h
+++ b/src/hs_compile.h
@ -0,0 +1,848 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HS_COMPILE_H_
+#define HS_COMPILE_H_
+
+/**
+ * @file
+ * @brief The Hyperscan compiler API definition.
+ *
+ * Hyperscan is a high speed regular expression engine.
+ *
+ * This header contains functions for compiling regular expressions into
+ * Hyperscan databases that can be used by the Hyperscan runtime.
+ */
+
+#include "hs_common.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * A type containing error details that is returned by the compile calls (@ref
+ * hs_compile(), @ref hs_compile_multi() and @ref hs_compile_ext_multi()) on
+ * failure. The caller may inspect the values returned in this type to
+ * determine the cause of failure.
+ *
+ * Common errors generated during the compile process include:
+ *
+ *    - *Invalid parameter*
+ *
+ *      An invalid argument was specified in the compile call.
+ *
+ *    - *Unrecognised flag*
+ *
+ *      An unrecognised value was passed in the flags argument.
+ *
+ *    - *Pattern matches empty buffer*
+ *
+ *      By default, Hyperscan only supports patterns that will *always*
+ *      consume at least one byte of input. Patterns that do not have this
+ *      property (such as `/(abc)?/`) will produce this error unless
+ *      the @ref HS_FLAG_ALLOWEMPTY flag is supplied. Note that such
+ *      patterns will produce a match for *every* byte when scanned.
+ *
+ *    - *Embedded anchors not supported*
+ *
+ *      Hyperscan only supports the use of anchor meta-characters (such as
+ *      `^` and `$`) in patterns where they could *only* match
+ *      at the start or end of a buffer. A pattern containing an embedded
+ *      anchor, such as `/abc^def/`, can never match, as there is no
+ *      way for `abc` to precede the start of the data stream.
+ *
+ *    - *Bounded repeat is too large*
+ *
+ *      The pattern contains a repeated construct with very large finite
+ *      bounds.
+ *
+ *    - *Unsupported component type*
+ *
+ *      An unsupported PCRE construct was used in the pattern.
+ *
+ *    - *Unable to generate bytecode*
+ *
+ *      This error indicates that Hyperscan was unable to compile a pattern
+ *      that is syntactically valid. The most common cause is a pattern that is
+ *      very long and complex or contains a large repeated subpattern.
+ *
+ *    - *Unable to allocate memory*
+ *
+ *      The library was unable to allocate temporary storage used during
+ *      compilation.
+ *
+ *    - *Internal error*
+ *
+ *      An unexpected error occurred: if this error is reported, please contact
+ *      the Hyperscan team with a description of the situation.
+ */
+typedef struct hs_compile_error {
+    /**
+     * A human-readable error message describing the error.
+     *
+     * NOTE(review): presumably owned by the library and released together
+     * with the enclosing structure by hs_free_compile_error(); confirm
+     * before freeing it directly.
+     */
+    char *message;
+
+    /**
+     * The zero-based number of the expression that caused the error (if this
+     * can be determined). If the error is not specific to an expression, then
+     * this value will be less than zero.
+     *
+     * Always test for a negative value before using this field as an index
+     * into the array of expressions that was passed to the compile call.
+     */
+    int expression;
+} hs_compile_error_t;
+
+/**
+ * A type containing information on the target platform which may optionally be
+ * provided to the compile calls (@ref hs_compile(), @ref hs_compile_multi(),
+ * @ref hs_compile_ext_multi()).
+ *
+ * A hs_platform_info structure may be populated for the current platform by
+ * using the @ref hs_populate_platform() call.
+ */
+typedef struct hs_platform_info {
+    /**
+     * Information about the target platform which may be used to guide the
+     * optimisation process of the compile.
+     *
+     * Use of this field does not limit the processors that the resulting
+     * database can run on, but may impact the performance of the resulting
+     * database.
+     */
+    unsigned int tune;
+
+    /**
+     * Relevant CPU features available on the target platform.
+     *
+     * This value may be produced by combining HS_CPU_FEATURES_* flags (such as
+     * @ref HS_CPU_FEATURES_AVX2). Multiple CPU features may be or'ed together
+     * to produce the value.
+     */
+    unsigned long long cpu_features;
+
+    /**
+     * Reserved for future use.
+     *
+     * @ref hs_populate_platform() zeroes the whole structure before filling
+     * in the fields above, so this field is left as zero by that call.
+     */
+    unsigned long long reserved1;
+
+    /**
+     * Reserved for future use.
+     *
+     * Left as zero by @ref hs_populate_platform(), as for reserved1.
+     */
+    unsigned long long reserved2;
+} hs_platform_info_t;
+
+/**
+ * A type containing information related to an expression that is returned by
+ * @ref hs_expression_info().
+ *
+ * The structure handed back by @ref hs_expression_info() is allocated with
+ * the misc allocator (see @ref hs_set_misc_allocator()).
+ * NOTE(review): presumably the caller releases it with the matching free
+ * function; confirm against the @ref hs_expression_info() documentation.
+ */
+typedef struct hs_expr_info {
+    /**
+     * The minimum length in bytes of a match for the pattern.
+     */
+    unsigned int min_width;
+
+    /**
+     * The maximum length in bytes of a match for the pattern. If the pattern
+     * has an unbounded maximum width, this will be set to the maximum value of
+     * an unsigned int (UINT_MAX).
+     */
+    unsigned int max_width;
+
+    /**
+     * Whether this expression can produce matches that are not returned in
+     * order, such as those produced by assertions. Zero if false, non-zero if
+     * true.
+     */
+    char unordered_matches;
+
+    /**
+     * Whether this expression can produce matches at end of data (EOD). In
+     * streaming mode, EOD matches are raised during @ref hs_close_stream(),
+     * since it is only when @ref hs_close_stream() is called that the EOD
+     * location is known. Zero if false, non-zero if true.
+     *
+     * Note: trailing `\b` word boundary assertions may also result in EOD
+     * matches as end-of-data can act as a word boundary.
+     */
+    char matches_at_eod;
+
+    /**
+     * Whether this expression can *only* produce matches at end of data (EOD).
+     * In streaming mode, all matches for this expression are raised during
+     * @ref hs_close_stream(). Zero if false, non-zero if true.
+     */
+    char matches_only_at_eod;
+} hs_expr_info_t;
+
+/**
+ * A structure containing additional parameters related to an expression,
+ * passed in at build time to @ref hs_compile_ext_multi().
+ *
+ * These parameters allow the set of matches produced by a pattern to be
+ * constrained at compile time, rather than relying on the application to
+ * process unwanted matches at runtime.
+ *
+ * Each of the offset/length fields below only takes effect when its
+ * corresponding bit is set in hs_expr_ext::flags.
+ */
+typedef struct hs_expr_ext {
+    /**
+     * Flags governing which parts of this structure are to be used by the
+     * compiler. See @ref HS_EXT_FLAG.
+     *
+     * The HS_EXT_FLAG_* values are distinct bits (1, 2 and 4), so several of
+     * them may be or'ed together to enable more than one field.
+     */
+    unsigned long long flags;
+
+    /**
+     * The minimum end offset in the data stream at which this expression
+     * should match successfully. To use this parameter, set the
+     * @ref HS_EXT_FLAG_MIN_OFFSET flag in the hs_expr_ext::flags field.
+     */
+    unsigned long long min_offset;
+
+    /**
+     * The maximum end offset in the data stream at which this expression
+     * should match successfully. To use this parameter, set the
+     * @ref HS_EXT_FLAG_MAX_OFFSET flag in the hs_expr_ext::flags field.
+     */
+    unsigned long long max_offset;
+
+    /**
+     * The minimum match length (from start to end) required to successfully
+     * match this expression. To use this parameter, set the
+     * @ref HS_EXT_FLAG_MIN_LENGTH flag in the hs_expr_ext::flags field.
+     */
+    unsigned long long min_length;
+} hs_expr_ext_t;
+
+/**
+ * @defgroup HS_EXT_FLAG hs_expr_ext_t flags
+ *
+ * These flags are used in @ref hs_expr_ext::flags to indicate which fields
+ * are used.
+ *
+ * @{
+ */
+
+/** Flag indicating that the hs_expr_ext::min_offset field is used. */
+#define HS_EXT_FLAG_MIN_OFFSET      1ULL
+
+/** Flag indicating that the hs_expr_ext::max_offset field is used. */
+#define HS_EXT_FLAG_MAX_OFFSET      2ULL
+
+/** Flag indicating that the hs_expr_ext::min_length field is used. */
+#define HS_EXT_FLAG_MIN_LENGTH      4ULL
+
+/** @} */
+
+/**
+ * The basic regular expression compiler.
+ *
+ * This is the function call with which an expression is compiled into a
+ * Hyperscan database which can be passed to the runtime functions (such as
+ * @ref hs_scan(), @ref hs_open_stream(), etc.)
+ *
+ * @param expression
+ *      The NULL-terminated expression to parse. Note that this string must
+ *      represent ONLY the pattern to be matched, with no delimiters or flags;
+ *      any global flags should be specified with the @a flags argument. For
+ *      example, the expression `/abc?def/i` should be compiled by providing
+ *      `abc?def` as the @a expression, and @ref HS_FLAG_CASELESS as the @a
+ *      flags.
+ *
+ * @param flags
+ *      Flags which modify the behaviour of the expression. Multiple flags may
+ *      be used by ORing them together. Valid values are:
+ *       - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
+ *       - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
+ *       - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
+ *       - HS_FLAG_SINGLEMATCH - Only one match will be generated for the
+ *                               expression per stream.
+ *       - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
+ *                              empty string, such as `.*`.
+ *       - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
+ *       - HS_FLAG_UCP - Use Unicode properties for character classes.
+ *       - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
+ *       - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
+ *                                when a match is found.
+ *
+ * @param mode
+ *      Compiler mode flags that affect the database as a whole. One of @ref
+ *      HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
+ *      supplied, to select between the generation of a streaming, block or
+ *      vectored database. In addition, other flags (beginning with HS_MODE_)
+ *      may be supplied to enable specific features. See @ref HS_MODE_FLAG for
+ *      more details.
+ *
+ * @param platform
+ *      If not NULL, the platform structure is used to determine the target
+ *      platform for the database. If NULL, a database suitable for running
+ *      on the current host platform is produced.
+ *
+ * @param db
+ *      On success, a pointer to the generated database will be returned in
+ *      this parameter, or NULL on failure. The caller is responsible for
+ *      deallocating the buffer using the @ref hs_free_database() function.
+ *
+ * @param error
+ *      If the compile fails, a pointer to a @ref hs_compile_error_t will be
+ *      returned, providing details of the error condition. The caller is
+ *      responsible for deallocating the buffer using the @ref
+ *      hs_free_compile_error() function.
+ *
+ * @return
+ *      @ref HS_SUCCESS is returned on successful compilation; @ref
+ *      HS_COMPILER_ERROR on failure, with details provided in the @a error
+ *      parameter.
+ */
+hs_error_t hs_compile(const char *expression, unsigned int flags,
+                      unsigned int mode, const hs_platform_info_t *platform,
+                      hs_database_t **db, hs_compile_error_t **error);
+
+/**
+ * The multiple regular expression compiler.
+ *
+ * This is the function call with which a set of expressions is compiled into a
+ * database which can be passed to the runtime functions (such as @ref
+ * hs_scan(), @ref hs_open_stream(), etc.) Each expression can be labelled with
+ * a unique integer which is passed into the match callback to identify the
+ * pattern that has matched.
+ *
+ * @param expressions
+ *      Array of NULL-terminated expressions to compile. Note that (as for @ref
+ *      hs_compile()) these strings must contain only the pattern to be
+ *      matched, with no delimiters or flags. For example, the expression
+ *      `/abc?def/i` should be compiled by providing `abc?def` as the first
+ *      string in the @a expressions array, and @ref HS_FLAG_CASELESS as the
+ *      first value in the @a flags array.
+ *
+ * @param flags
+ *      Array of flags which modify the behaviour of each expression. Multiple
+ *      flags may be used by ORing them together.  Specifying the NULL pointer
+ *      in place of an array will set the flags value for all patterns to zero.
+ *      Valid values are:
+ *       - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
+ *       - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
+ *       - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
+ *       - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns
+ *                               with this match id per stream.
+ *       - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
+ *                              empty string, such as `.*`.
+ *       - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
+ *       - HS_FLAG_UCP - Use Unicode properties for character classes.
+ *       - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
+ *       - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
+ *                                when a match is found.
+ *
+ * @param ids
+ *      An array of integers specifying the ID number to be associated with the
+ *      corresponding pattern in the expressions array. Specifying the NULL
+ *      pointer in place of an array will set the ID value for all patterns to
+ *      zero.
+ *
+ * @param elements
+ *      The number of elements in the input arrays.
+ *
+ * @param mode
+ *      Compiler mode flags that affect the database as a whole. One of @ref
+ *      HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
+ *      supplied, to select between the generation of a streaming, block or
+ *      vectored database. In addition, other flags (beginning with HS_MODE_)
+ *      may be supplied to enable specific features. See @ref HS_MODE_FLAG for
+ *      more details.
+ *
+ * @param platform
+ *      If not NULL, the platform structure is used to determine the target
+ *      platform for the database. If NULL, a database suitable for running
+ *      on the current host platform is produced.
+ *
+ * @param db
+ *      On success, a pointer to the generated database will be returned in
+ *      this parameter, or NULL on failure. The caller is responsible for
+ *      deallocating the buffer using the @ref hs_free_database() function.
+ *
+ * @param error
+ *      If the compile fails, a pointer to a @ref hs_compile_error_t will be
+ *      returned, providing details of the error condition. The caller is
+ *      responsible for deallocating the buffer using the @ref
+ *      hs_free_compile_error() function.
+ *
+ * @return
+ *      @ref HS_SUCCESS is returned on successful compilation; @ref
+ *      HS_COMPILER_ERROR on failure, with details provided in the @a error
+ *      parameter.
+ *
+ */
+hs_error_t hs_compile_multi(const char *const *expressions,
+                            const unsigned int *flags, const unsigned int *ids,
+                            unsigned int elements, unsigned int mode,
+                            const hs_platform_info_t *platform,
+                            hs_database_t **db, hs_compile_error_t **error);
+
+/**
+ * The multiple regular expression compiler with extended pattern support.
+ *
+ * This function call compiles a group of expressions into a database in the
+ * same way as @ref hs_compile_multi(), but allows additional parameters to be
+ * specified via an @ref hs_expr_ext_t structure per expression.
+ *
+ * @param expressions
+ *      Array of NULL-terminated expressions to compile. Note that (as for @ref
+ *      hs_compile()) these strings must contain only the pattern to be
+ *      matched, with no delimiters or flags. For example, the expression
+ *      `/abc?def/i` should be compiled by providing `abc?def` as the first
+ *      string in the @a expressions array, and @ref HS_FLAG_CASELESS as the
+ *      first value in the @a flags array.
+ *
+ * @param flags
+ *      Array of flags which modify the behaviour of each expression. Multiple
+ *      flags may be used by ORing them together. Specifying the NULL pointer
+ *      in place of an array will set the flags value for all patterns to zero.
+ *      Valid values are:
+ *       - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
+ *       - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
+ *       - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
+ *       - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns
+ *                               with this match id per stream.
+ *       - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
+ *                              empty string, such as `.*`.
+ *       - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
+ *       - HS_FLAG_UCP - Use Unicode properties for character classes.
+ *       - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
+ *       - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
+ *                                when a match is found.
+ *
+ * @param ids
+ *      An array of integers specifying the ID number to be associated with the
+ *      corresponding pattern in the expressions array. Specifying the NULL
+ *      pointer in place of an array will set the ID value for all patterns to
+ *      zero.
+ *
+ * @param ext
+ *      An array of pointers to filled @ref hs_expr_ext_t structures that
+ *      define extended behaviour for each pattern. NULL may be specified if no
+ *      extended behaviour is needed for an individual pattern, or in place of
+ *      the whole array if it is not needed for any expressions. Memory used by
+ *      these structures must be both allocated and freed by the caller.
+ *
+ * @param elements
+ *      The number of elements in the input arrays.
+ *
+ * @param mode
+ *      Compiler mode flags that affect the database as a whole. One of @ref
+ *      HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be
+ *      supplied, to select between the generation of a streaming, block or
+ *      vectored database. In addition, other flags (beginning with HS_MODE_)
+ *      may be supplied to enable specific features. See @ref HS_MODE_FLAG for
+ *      more details.
+ *
+ * @param platform
+ *      If not NULL, the platform structure is used to determine the target
+ *      platform for the database. If NULL, a database suitable for running
+ *      on the current host platform is produced.
+ *
+ * @param db
+ *      On success, a pointer to the generated database will be returned in
+ *      this parameter, or NULL on failure. The caller is responsible for
+ *      deallocating the buffer using the @ref hs_free_database() function.
+ *
+ * @param error
+ *      If the compile fails, a pointer to a @ref hs_compile_error_t will be
+ *      returned, providing details of the error condition. The caller is
+ *      responsible for deallocating the buffer using the @ref
+ *      hs_free_compile_error() function.
+ *
+ * @return
+ *      @ref HS_SUCCESS is returned on successful compilation; @ref
+ *      HS_COMPILER_ERROR on failure, with details provided in the @a error
+ *      parameter.
+ *
+ */
+hs_error_t hs_compile_ext_multi(const char *const *expressions,
+                                const unsigned int *flags,
+                                const unsigned int *ids,
+                                const hs_expr_ext_t *const *ext,
+                                unsigned int elements, unsigned int mode,
+                                const hs_platform_info_t *platform,
+                                hs_database_t **db, hs_compile_error_t **error);
+
+/**
+ * Free an error structure generated by @ref hs_compile(), @ref
+ * hs_compile_multi() or @ref hs_compile_ext_multi().
+ *
+ * @param error
+ *      The @ref hs_compile_error_t to be freed. NULL may also be safely
+ *      provided.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_free_compile_error(hs_compile_error_t *error);
+
+/**
+ * Utility function providing information about a regular expression. The
+ * information provided in @ref hs_expr_info_t includes the minimum and maximum
+ * width of a pattern match.
+ *
+ * @param expression
+ *      The NULL-terminated expression to parse. Note that this string must
+ *      represent ONLY the pattern to be matched, with no delimiters or flags;
+ *      any global flags should be specified with the @a flags argument. For
+ *      example, the expression `/abc?def/i` should be analysed by providing
+ *      `abc?def` as the @a expression, and @ref HS_FLAG_CASELESS as the @a
+ *      flags.
+ *
+ * @param flags
+ *      Flags which modify the behaviour of the expression. Multiple flags may
+ *      be used by ORing them together. Valid values are:
+ *       - HS_FLAG_CASELESS - Matching will be performed case-insensitively.
+ *       - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines.
+ *       - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
+ *       - HS_FLAG_SINGLEMATCH - Only one match will be generated by the
+ *                               expression per stream.
+ *       - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an
+ *                              empty string, such as `.*`.
+ *       - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
+ *       - HS_FLAG_UCP - Use Unicode properties for character classes.
+ *       - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
+ *       - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
+ *                                when a match is found.
+ *
+ * @param info
+ *      On success, a pointer to the pattern information will be returned in
+ *      this parameter, or NULL on failure. This structure is allocated using
+ *      the allocator supplied in @ref hs_set_allocator() (or malloc() if no
+ *      allocator was set) and should be freed by the caller.
+ *
+ * @param error
+ *      If the call fails, a pointer to a @ref hs_compile_error_t will be
+ *      returned, providing details of the error condition. The caller is
+ *      responsible for deallocating the buffer using the @ref
+ *      hs_free_compile_error() function.
+ *
+ * @return
+ *      @ref HS_SUCCESS is returned if the information was generated
+ *      successfully; @ref HS_COMPILER_ERROR on failure, with details provided
+ *      in the @a error parameter.
+ */
+hs_error_t hs_expression_info(const char *expression, unsigned int flags,
+                              hs_expr_info_t **info,
+                              hs_compile_error_t **error);
+
+/**
+ * Populates the platform information based on the current host.
+ *
+ * @param platform
+ *      On success, the pointed to structure is populated based on the current
+ *      host.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_populate_platform(hs_platform_info_t *platform);
+
+/**
+ * @defgroup HS_PATTERN_FLAG Pattern flags
+ *
+ * @{
+ */
+
+/**
+ * Compile flag: Set case-insensitive matching.
+ *
+ * This flag sets the expression to be matched case-insensitively by default.
+ * The expression may still use PCRE tokens (notably `(?i)` and
+ * `(?-i)`) to switch case-insensitive matching on and off.
+ */
+#define HS_FLAG_CASELESS        1
+
+/**
+ * Compile flag: Matching a `.` will not exclude newlines.
+ *
+ * This flag sets any instances of the `.` token to match newline characters as
+ * well as all other characters. The PCRE specification states that the `.`
+ * token does not match newline characters by default, so without this flag the
+ * `.` token will not cross line boundaries.
+ */
+#define HS_FLAG_DOTALL          2
+
+/**
+ * Compile flag: Set multi-line anchoring.
+ *
+ * This flag instructs the expression to make the `^` and `$` tokens match at
+ * newlines (just after and just before them, respectively) as well as at the
+ * start and end of the stream. If this flag is not specified, `^` will only
+ * ever match at the start of a stream, and `$` will only ever match at the
+ * end of a stream, within the guidelines of the PCRE specification.
+ */
+#define HS_FLAG_MULTILINE       4
+
+/**
+ * Compile flag: Set single-match only mode.
+ *
+ * This flag sets the expression's match ID to match at most once. In streaming
+ * mode, this means that the expression will return only a single match over
+ * the lifetime of the stream, rather than reporting every match as per
+ * standard Hyperscan semantics. In block mode or vectored mode, only the first
+ * match for each invocation of @ref hs_scan() or @ref hs_scan_vector() will be
+ * returned.
+ *
+ * If multiple expressions in the database share the same match ID, then
+ * either all of them must specify @ref HS_FLAG_SINGLEMATCH or none of them
+ * may specify it. If a group of expressions sharing a match ID specify the
+ * flag, then at most one match with that match ID will be generated per
+ * stream.
+ *
+ * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST
+ * is not currently supported.
+ */
+#define HS_FLAG_SINGLEMATCH     8
+
+/**
+ * Compile flag: Allow expressions that can match against empty buffers.
+ *
+ * This flag instructs the compiler to allow expressions that can match against
+ * empty buffers, such as `.?`, `.*`, `(a|)`. Since Hyperscan can return every
+ * possible match for an expression, such expressions generally execute very
+ * slowly; the default behaviour is to return an error when an attempt to
+ * compile one is made. Using this flag will force the compiler to allow such
+ * an expression.
+ */
+#define HS_FLAG_ALLOWEMPTY      16
+
+/**
+ * Compile flag: Enable UTF-8 mode for this expression.
+ *
+ * This flag instructs Hyperscan to treat the pattern as a sequence of UTF-8
+ * characters. The results of scanning invalid UTF-8 sequences with a Hyperscan
+ * database that has been compiled from one or more patterns using this flag
+ * are undefined.
+ */
+#define HS_FLAG_UTF8            32
+
+/**
+ * Compile flag: Enable Unicode property support for this expression.
+ *
+ * This flag instructs Hyperscan to use Unicode properties, rather than the
+ * default ASCII interpretations, for character mnemonics like `\w` and `\s` as
+ * well as the POSIX character classes. It is only meaningful in conjunction
+ * with @ref HS_FLAG_UTF8.
+ */
+#define HS_FLAG_UCP             64
+
+/**
+ * Compile flag: Enable prefiltering mode for this expression.
+ *
+ * This flag instructs Hyperscan to compile an "approximate" version of this
+ * pattern for use in a prefiltering application, even if Hyperscan does not
+ * support the pattern in normal operation.
+ *
+ * The set of matches returned when this flag is used is guaranteed to be a
+ * superset of the matches specified by the non-prefiltering expression.
+ *
+ * If the pattern contains pattern constructs not supported by Hyperscan (such
+ * as zero-width assertions, back-references or conditional references) these
+ * constructs will be replaced internally with broader constructs that may
+ * match more often.
+ *
+ * Furthermore, in prefiltering mode Hyperscan may simplify a pattern that
+ * would otherwise return a "Pattern too large" error at compile time, or for
+ * performance reasons (subject to the matching guarantee above).
+ *
+ * It is generally expected that the application will subsequently confirm
+ * prefilter matches with another regular expression matcher that can provide
+ * exact matches for the pattern.
+ *
+ * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST
+ * is not currently supported.
+ */
+#define HS_FLAG_PREFILTER       128
+
+/**
+ * Compile flag: Enable leftmost start of match reporting.
+ *
+ * This flag instructs Hyperscan to report the leftmost possible start of match
+ * offset when a match is reported for this expression. (By default, no start
+ * of match is returned.)
+ *
+ * Enabling this behaviour may reduce performance and increase stream state
+ * requirements in streaming mode.
+ */
+#define HS_FLAG_SOM_LEFTMOST    256
+
+/** @} */
+
+/**
+ * @defgroup HS_CPU_FEATURES_FLAG CPU feature support flags
+ *
+ * @{
+ */
+
+/**
+ * CPU features flag - Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
+ *
+ * Setting this flag indicates that the target platform supports AVX2
+ * instructions.
+ */
+#define HS_CPU_FEATURES_AVX2             (1ULL << 2)
+
+/** @} */
+
+/**
+ * @defgroup HS_TUNE_FLAG Tuning flags
+ *
+ * @{
+ */
+
+/**
+ * Tuning Parameter - Generic
+ *
+ * This indicates that the compiled database should not be tuned for any
+ * particular target platform.
+ */
+#define HS_TUNE_FAMILY_GENERIC 0
+
+/**
+ * Tuning Parameter - Intel(R) microarchitecture code name Sandy Bridge
+ *
+ * This indicates that the compiled database should be tuned for the
+ * Sandy Bridge microarchitecture.
+ */
+#define HS_TUNE_FAMILY_SNB 1
+
+/**
+ * Tuning Parameter - Intel(R) microarchitecture code name Ivy Bridge
+ *
+ * This indicates that the compiled database should be tuned for the
+ * Ivy Bridge microarchitecture.
+ */
+#define HS_TUNE_FAMILY_IVB 2
+
+/**
+ * Tuning Parameter - Intel(R) microarchitecture code name Haswell
+ *
+ * This indicates that the compiled database should be tuned for the
+ * Haswell microarchitecture.
+ */
+#define HS_TUNE_FAMILY_HSW 3
+
+/**
+ * Tuning Parameter - Intel(R) microarchitecture code name Silvermont
+ *
+ * This indicates that the compiled database should be tuned for the
+ * Silvermont microarchitecture.
+ */
+#define HS_TUNE_FAMILY_SLM 4
+
+/**
+ * Tuning Parameter - Intel(R) microarchitecture code name Broadwell
+ *
+ * This indicates that the compiled database should be tuned for the
+ * Broadwell microarchitecture.
+ */
+#define HS_TUNE_FAMILY_BDW 5
+
+/** @} */
+
+/**
+ * @defgroup HS_MODE_FLAG Compile mode flags
+ *
+ * The mode flags are used as values for the mode parameter of the various
+ * compile calls (@ref hs_compile(), @ref hs_compile_multi() and @ref
+ * hs_compile_ext_multi()).
+ *
+ * A mode value can be built by ORing these flag values together; the only
+ * required flag is one of @ref HS_MODE_BLOCK, @ref HS_MODE_STREAM or @ref
+ * HS_MODE_VECTORED. Other flags may be added to enable support for additional
+ * features.
+ *
+ * @{
+ */
+
+/**
+ * Compiler mode flag: Block scan (non-streaming) database.
+ */
+#define HS_MODE_BLOCK           1
+
+/**
+ * Compiler mode flag: Alias for @ref HS_MODE_BLOCK.
+ */
+#define HS_MODE_NOSTREAM        1
+
+/**
+ * Compiler mode flag: Streaming database.
+ */
+#define HS_MODE_STREAM          2
+
+/**
+ * Compiler mode flag: Vectored scanning database.
+ */
+#define HS_MODE_VECTORED        4
+
+/**
+ * Compiler mode flag: use full precision to track start of match offsets in
+ * stream state.
+ *
+ * This mode will use the most stream state per pattern, but will always return
+ * an accurate start of match offset regardless of how far back in the past it
+ * was found.
+ *
+ * One of the SOM_HORIZON modes must be selected to use the @ref
+ * HS_FLAG_SOM_LEFTMOST expression flag.
+ */
+#define HS_MODE_SOM_HORIZON_LARGE   (1U << 24)
+
+/**
+ * Compiler mode flag: use medium precision to track start of match offsets in
+ * stream state.
+ *
+ * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and
+ * will limit start of match accuracy to offsets within 2^32 bytes of the
+ * end of match offset reported.
+ *
+ * One of the SOM_HORIZON modes must be selected to use the @ref
+ * HS_FLAG_SOM_LEFTMOST expression flag.
+ */
+#define HS_MODE_SOM_HORIZON_MEDIUM  (1U << 25)
+
+/**
+ * Compiler mode flag: use limited precision to track start of match offsets in
+ * stream state.
+ *
+ * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and
+ * will limit start of match accuracy to offsets within 2^16 bytes of the
+ * end of match offset reported.
+ *
+ * One of the SOM_HORIZON modes must be selected to use the @ref
+ * HS_FLAG_SOM_LEFTMOST expression flag.
+ */
+#define HS_MODE_SOM_HORIZON_SMALL   (1U << 26)
+
+/** @} */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* HS_COMPILE_H_ */
--- a/src/hs_internal.h
+++ b/src/hs_internal.h
@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Internal-use only definitions. Available to internal tools.
+ */
+
+#ifndef HS_INTERNAL_H
+#define HS_INTERNAL_H
+
+#include "ue2common.h"
+#include "hs.h"
+
+#ifdef __cplusplus
+
+namespace ue2 {
+
+struct Grey;
+
+/** \brief Internal use only: same parameters as hs_compile_ext_multi(), plus
+ * a Grey argument so that we can use it in tools. */
+hs_error_t hs_compile_multi_int(const char *const *expressions,
+                                const unsigned *flags, const unsigned *ids,
+                                const hs_expr_ext *const *ext,
+                                unsigned elements, unsigned mode,
+                                const hs_platform_info_t *platform,
+                                hs_database_t **db,
+                                hs_compile_error_t **comp_error, const Grey &g);
+
+} // namespace ue2
+
+extern "C"
+{
+#endif
+
+#define HS_MATCH_FLAG_ADJUSTED  1U
+
+/** \brief Bitmask of all valid Hyperscan flags. */
+#define HS_FLAG_ALL ( HS_FLAG_CASELESS \
+                    | HS_FLAG_DOTALL \
+                    | HS_FLAG_MULTILINE \
+                    | HS_FLAG_UTF8 \
+                    | HS_FLAG_UCP \
+                    | HS_FLAG_PREFILTER \
+                    | HS_FLAG_SINGLEMATCH \
+                    | HS_FLAG_ALLOWEMPTY \
+                    | HS_FLAG_SOM_LEFTMOST)
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* HS_INTERNAL_H */
--- a/src/hs_runtime.h
+++ b/src/hs_runtime.h
@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HS_RUNTIME_H_
+#define HS_RUNTIME_H_
+
+#include <stdlib.h>
+
+/**
+ * @file
+ * @brief The Hyperscan runtime API definition.
+ *
+ * Hyperscan is a high speed regular expression engine.
+ *
+ * This header contains functions for using compiled Hyperscan databases for
+ * scanning data at runtime.
+ */
+
+#include "hs_common.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * Definition of the stream identifier type.
+ */
+struct hs_stream;
+
+/**
+ * The stream identifier returned by @ref hs_open_stream().
+ */
+typedef struct hs_stream hs_stream_t;
+
+struct hs_scratch;
+
+/**
+ * A Hyperscan scratch space.
+ */
+typedef struct hs_scratch hs_scratch_t;
+
+/**
+ * Definition of the match event callback function type.
+ *
+ * A callback function matching the defined type must be provided by the
+ * application calling the @ref hs_scan(), @ref hs_scan_vector() or @ref
+ * hs_scan_stream() functions (or other streaming calls which can produce
+ * matches).
+ *
+ * This callback function will be invoked whenever a match is located in the
+ * target data during the execution of a scan. The details of the match are
+ * passed in as parameters to the callback function, and the callback function
+ * should return a value indicating whether or not matching should continue on
+ * the target data. If no callbacks are desired from a scan call, NULL may be
+ * provided in order to suppress match production.
+ *
+ * This callback function should not attempt to call Hyperscan API functions on
+ * the same stream nor should it attempt to reuse the scratch space allocated
+ * for the API calls that caused it to be triggered. Making another call to the
+ * Hyperscan library with completely independent parameters should work (for
+ * example, scanning a different database in a new stream and with new scratch
+ * space), but reusing data structures like stream state and/or scratch space
+ * will produce undefined behavior.
+ *
+ * @param id
+ *      The ID number of the expression that matched. If the expression was a
+ *      single expression compiled with @ref hs_compile(), this value will be
+ *      zero.
+ *
+ * @param from
+ *      - If a start of match flag is enabled for the current pattern, this
+ *        argument will be set to the start of match for the pattern assuming
+ *        that that start of match value lies within the current 'start of match
+ *        horizon' chosen by one of the SOM_HORIZON mode flags.
+ *
+ *      - If the start of match value lies outside this horizon (possible only
+ *        when the SOM_HORIZON value is not @ref HS_MODE_SOM_HORIZON_LARGE),
+ *        the @a from value will be set to @ref HS_OFFSET_PAST_HORIZON.
+ *
+ *      - This argument will be set to zero if the Start of Match flag is not
+ *        enabled for the given pattern.
+ *
+ * @param to
+ *      The offset after the last byte that matches the expression.
+ *
+ * @param flags
+ *      This is provided for future use and is unused at present.
+ *
+ * @param context
+ *      The pointer supplied by the user to the @ref hs_scan(), @ref
+ *      hs_scan_vector() or @ref hs_scan_stream() function.
+ *
+ * @return
+ *      Non-zero if the matching should cease, else zero. If scanning is
+ *      performed in streaming mode and a non-zero value is returned, any
+ *      subsequent calls to @ref hs_scan_stream() for that stream will
+ *      immediately return with @ref HS_SCAN_TERMINATED.
+ */
+typedef int (*match_event_handler)(unsigned int id,
+                                   unsigned long long from,
+                                   unsigned long long to,
+                                   unsigned int flags,
+                                   void *context);
+
+/**
+ * Open and initialise a stream.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param flags
+ *      Flags modifying the behaviour of the stream. This parameter is provided
+ *      for future use and is unused at present.
+ *
+ * @param stream
+ *      On success, a pointer to the generated @ref hs_stream_t will be
+ *      returned; NULL on failure.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_open_stream(const hs_database_t *db, unsigned int flags,
+                          hs_stream_t **stream);
+
+/**
+ * Write data to be scanned to the opened stream.
+ *
+ * This is the function call in which the actual pattern matching takes place
+ * as data is written to the stream. Matches will be returned via the @ref
+ * match_event_handler callback supplied.
+ *
+ * @param id
+ *      The stream ID (returned by @ref hs_open_stream()) to which the data
+ *      will be written.
+ *
+ * @param data
+ *      Pointer to the data to be scanned.
+ *
+ * @param length
+ *      The number of bytes to scan.
+ *
+ * @param flags
+ *      Flags modifying the behaviour of the stream. This parameter is provided
+ *      for future use and is unused at present.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch().
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param ctxt
+ *      The user defined pointer which will be passed to the callback function
+ *      when a match occurs.
+ *
+ * @return
+ *      Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
+ *      match callback indicated that scanning should stop; other values on
+ *      error.
+ */
+hs_error_t hs_scan_stream(hs_stream_t *id, const char *data,
+                          unsigned int length, unsigned int flags,
+                          hs_scratch_t *scratch, match_event_handler onEvent,
+                          void *ctxt);
+
+/**
+ * Close a stream.
+ *
+ * This function must be called for any stream created with @ref
+ * hs_open_stream(), even if scanning has been terminated by a non-zero return
+ * from the match callback function.
+ *
+ * Note: This operation may result in matches being returned (via calls to the
+ * match event callback) for expressions anchored to the end of the data stream
+ * (for example, via the use of the `$` meta-character). If these matches are
+ * not desired, NULL may be provided as the @ref match_event_handler callback.
+ *
+ * If NULL is provided as the @ref match_event_handler callback, it is
+ * permissible to provide a NULL scratch.
+ *
+ * @param id
+ *      The stream ID returned by @ref hs_open_stream().
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is
+ *      allowed to be NULL only if the @a onEvent callback is also NULL.
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param ctxt
+ *      The user defined pointer which will be passed to the callback function
+ *      when a match occurs.
+ *
+ * @return
+ *      Returns @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
+                           match_event_handler onEvent, void *ctxt);
+
+/**
+ * Reset a stream to an initial state.
+ *
+ * Conceptually, this is equivalent to performing @ref hs_close_stream() on the
+ * given stream, followed by a @ref hs_open_stream(). This new stream replaces
+ * the original stream in memory, avoiding the overhead of freeing the old
+ * stream and allocating the new one.
+ *
+ * Note: This operation may result in matches being returned (via calls to the
+ * match event callback) for expressions anchored to the end of the original
+ * data stream (for example, via the use of the `$` meta-character). If these
+ * matches are not desired, NULL may be provided as the @ref match_event_handler
+ * callback.
+ *
+ * Note: the stream will also be tied to the same database.
+ *
+ * @param id
+ *      The stream (as created by @ref hs_open_stream()) to be replaced.
+ *
+ * @param flags
+ *      Flags modifying the behaviour of the stream. This parameter is provided
+ *      for future use and is unused at present.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch().
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param context
+ *      The user defined pointer which will be passed to the callback function
+ *      when a match occurs.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_reset_stream(hs_stream_t *id, unsigned int flags,
+                           hs_scratch_t *scratch, match_event_handler onEvent,
+                           void *context);
+
+/**
+ * Duplicate the given stream. The new stream will have the same state as the
+ * original including the current stream offset.
+ *
+ * @param to_id
+ *      On success, a pointer to the new, copied @ref hs_stream_t will be
+ *      returned; NULL on failure.
+ *
+ * @param from_id
+ *      The stream (as created by @ref hs_open_stream()) to be copied.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id);
+
+/**
+ * Duplicate the given 'from' stream state onto the 'to' stream. The 'to' stream
+ * will first be reset (reporting any EOD matches if a non-NULL @a onEvent
+ * callback handler is provided).
+ *
+ * Note: the 'to' stream and the 'from' stream must be open against the same
+ * database.
+ *
+ * @param to_id
+ *      The stream (as created by @ref hs_open_stream()) to be reset and then
+ *      overwritten with the state of the @a from_id stream.
+ *
+ * @param from_id
+ *      The stream (as created by @ref hs_open_stream()) to be copied.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch().
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param context
+ *      The user defined pointer which will be passed to the callback function
+ *      when a match occurs.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_reset_and_copy_stream(hs_stream_t *to_id,
+                                    const hs_stream_t *from_id,
+                                    hs_scratch_t *scratch,
+                                    match_event_handler onEvent,
+                                    void *context);
+
+/**
+ * The block (non-streaming) regular expression scanner.
+ *
+ * This is the function call in which the actual pattern matching takes place
+ * for block-mode pattern databases.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param data
+ *      Pointer to the data to be scanned.
+ *
+ * @param length
+ *      The number of bytes to scan.
+ *
+ * @param flags
+ *      Flags modifying the behaviour of this function. This parameter is
+ *      provided for future use and is unused at present.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch() for this
+ *      database.
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param context
+ *      The user defined pointer which will be passed to the callback function.
+ *
+ * @return
+ *      Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
+ *      match callback indicated that scanning should stop; other values on
+ *      error.
+ */
+hs_error_t hs_scan(const hs_database_t *db, const char *data,
+                   unsigned int length, unsigned int flags,
+                   hs_scratch_t *scratch, match_event_handler onEvent,
+                   void *context);
+
+/**
+ * The vectored regular expression scanner.
+ *
+ * This is the function call in which the actual pattern matching takes place
+ * for vectoring-mode pattern databases.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param data
+ *      An array of pointers to the data blocks to be scanned.
+ *
+ * @param length
+ *      An array of lengths (in bytes) of each data block to scan.
+ *
+ * @param count
+ *      Number of data blocks to scan. This should correspond to the size of
+ *      the @a data and @a length arrays.
+ *
+ * @param flags
+ *      Flags modifying the behaviour of this function. This parameter is
+ *      provided for future use and is unused at present.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch() for
+ *      this database.
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param context
+ *      The user defined pointer which will be passed to the callback function.
+ *
+ * @return
+ *      Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the match
+ *      callback indicated that scanning should stop; other values on error.
+ */
+hs_error_t hs_scan_vector(const hs_database_t *db, const char *const *data,
+                          const unsigned int *length, unsigned int count,
+                          unsigned int flags, hs_scratch_t *scratch,
+                          match_event_handler onEvent, void *context);
+
+/**
+ * Allocate a "scratch" space for use by Hyperscan.
+ *
+ * This is required for runtime use, and one scratch space per thread, or
+ * concurrent caller, is required. Any allocator callback set by @ref
+ * hs_set_scratch_allocator() or @ref hs_set_allocator() will be used by this
+ * function.
+ *
+ * @param db
+ *      The database, as produced by @ref hs_compile().
+ *
+ * @param scratch
+ *      On first allocation, a pointer to NULL should be provided so a new
+ *      scratch can be allocated. If a scratch block has been previously
+ *      allocated, then a pointer to it should be passed back in to see if it
+ *      is valid for this database block. If a new scratch block is required,
+ *      the original will be freed and the new one returned, otherwise the
+ *      previous scratch block will be returned. On success, the scratch block
+ *      will be suitable for use with the provided database in addition to any
+ *      databases that the original scratch space was suitable for.
+ *
+ * @return
+ *      @ref HS_SUCCESS on successful allocation; @ref HS_NOMEM if the
+ *      allocation fails.  Other errors may be returned if invalid parameters
+ *      are specified.
+ */
+hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch);
+
+/**
+ * Allocate a scratch space that is a clone of an existing scratch space.
+ *
+ * This is useful when multiple concurrent threads will be using the same set
+ * of compiled databases, and another scratch space is required. Any allocator
+ * callback set by @ref hs_set_scratch_allocator() or @ref hs_set_allocator()
+ * will be used by this function.
+ *
+ * @param src
+ *      The existing @ref hs_scratch_t to be cloned.
+ *
+ * @param dest
+ *      A pointer to the new scratch space will be returned here.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success; @ref HS_NOMEM if the allocation fails.
+ *      Other errors may be returned if invalid parameters are specified.
+ */
+hs_error_t hs_clone_scratch(const hs_scratch_t *src, hs_scratch_t **dest);
+
+/**
+ * Provides the size of the given scratch space.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch() or @ref
+ *      hs_clone_scratch().
+ *
+ * @param scratch_size
+ *      On success, the size of the scratch space in bytes is placed in this
+ *      parameter.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_scratch_size(const hs_scratch_t *scratch, size_t *scratch_size);
+
+/**
+ * Free a scratch block previously allocated by @ref hs_alloc_scratch() or @ref
+ * hs_clone_scratch().
+ *
+ * The free callback set by @ref hs_set_scratch_allocator() or @ref
+ * hs_set_allocator() will be used by this function.
+ *
+ * @param scratch
+ *      The scratch block to be freed. NULL may also be safely provided.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t hs_free_scratch(hs_scratch_t *scratch);
+
+/**
+ * Callback 'from' return value, indicating that the start of this match was
+ * too early to be tracked with the requested SOM_HORIZON precision.
+ */
+#define HS_OFFSET_PAST_HORIZON    (~0ULL)
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* HS_RUNTIME_H_ */
--- a/src/hs_version.c
+++ b/src/hs_version.c
@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ue2common.h"
+#include "hs_common.h"
+#include "hs_version.h"
+
+/* Public API: hand back the version string baked in at configure time via
+ * hs_version.h (HS_VERSION_STRING). The returned pointer refers to a string
+ * literal, so the caller must not modify or free it. */
+HS_PUBLIC_API
+const char *hs_version(void) {
+    const char *version = HS_VERSION_STRING;
+    return version;
+}
--- a/src/hs_version.h.in
+++ b/src/hs_version.h.in
@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HS_VERSION_H_C6428FAF8E3713
+#define HS_VERSION_H_C6428FAF8E3713
+
+/**
+ * A version string to identify this release of Hyperscan.
+ */
+#define HS_VERSION_STRING "@HS_VERSION@ @BUILD_DATE@"
+
+/**
+ * The release version packed into a single integer: the major version is
+ * shifted into bits 31-24, the minor version into bits 23-16 and the patch
+ * version into bits 15-8. The low byte is always zero. The layout assumes that
+ * each version component fits in eight bits.
+ */
+#define HS_VERSION_32BIT ((@HS_MAJOR_VERSION@ << 24) | (@HS_MINOR_VERSION@ << 16) | (@HS_PATCH_VERSION@ << 8) | 0)
+
+#endif /* HS_VERSION_H_C6428FAF8E3713 */
+
--- a/src/hwlm/hwlm.c
+++ b/src/hwlm/hwlm.c
@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: runtime.
+ */
+#include "hwlm.h"
+#include "hwlm_internal.h"
+#include "noodle_engine.h"
+#include "scratch.h"
+#include "ue2common.h"
+#include "fdr/fdr.h"
+#include "nfa/accel.h"
+#include "nfa/shufti.h"
+#include "nfa/vermicelli.h"
+#include <string.h>
+
+#define MIN_ACCEL_LEN_BLOCK  16
+#define MIN_ACCEL_LEN_STREAM 16
+
+/**
+ * \brief Run the acceleration scheme selected by \a aux over [ptr, end).
+ *
+ * Dispatches on aux->accel_type to the single vermicelli, double vermicelli or
+ * shufti scanner and returns whatever pointer that scanner produces. Every
+ * other accel type performs no scan and yields \a ptr unchanged.
+ */
+static really_inline
+const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr,
+                         const u8 *end) {
+    /* default: no acceleration, stay at the current position */
+    const u8 *rv = ptr;
+
+    switch (aux->accel_type) {
+    case ACCEL_VERM:
+        DEBUG_PRINTF("single vermicelli for 0x%02hhx\n", aux->verm.c);
+        rv = vermicelliExec(aux->verm.c, 0, ptr, end);
+        break;
+    case ACCEL_VERM_NOCASE:
+        DEBUG_PRINTF("single vermicelli-nocase for 0x%02hhx\n", aux->verm.c);
+        rv = vermicelliExec(aux->verm.c, 1, ptr, end);
+        break;
+    case ACCEL_DVERM:
+        DEBUG_PRINTF("double vermicelli for 0x%02hhx%02hhx\n", aux->dverm.c1,
+                     aux->dverm.c2);
+        rv = vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 0, ptr, end);
+        break;
+    case ACCEL_DVERM_NOCASE:
+        DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n",
+                     aux->dverm.c1, aux->dverm.c2);
+        rv = vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end);
+        break;
+    case ACCEL_SHUFTI:
+        DEBUG_PRINTF("single shufti\n");
+        rv = shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
+        break;
+    default:
+        break;
+    }
+
+    return rv;
+}
+
+/**
+ * \brief Block-mode acceleration: try to skip ahead in \a buf before the main
+ * literal matcher runs.
+ *
+ * Runs the accel scheme in \a aux from buf + *start to buf + len, backs the
+ * reported position off by aux->generic.offset bytes (clamped to the start of
+ * the buffer) and stores the result in \a *start.
+ *
+ * Nothing is done, and \a *start is left untouched, when fewer than
+ * MIN_ACCEL_LEN_BLOCK bytes remain after \a *start.
+ *
+ * \param aux    Acceleration scheme to run.
+ * \param buf    Start of the buffer being scanned.
+ * \param len    Length of \a buf in bytes.
+ * \param start  In/out: offset in \a buf at which scanning should begin.
+ */
+static really_inline
+void do_accel_block(const union AccelAux *aux, const u8 *buf, size_t len,
+                    size_t *start) {
+    /* The explicit *start > len test keeps the unsigned subtraction from
+     * wrapping around and defeating the minimum-length check. */
+    if (*start > len || len - *start < MIN_ACCEL_LEN_BLOCK) {
+        return;
+    }
+
+    const u8 *end = buf + len;
+    const u8 *ptr = run_hwlm_accel(aux, buf + *start, end);
+    assert(ptr >= buf);
+
+    /* Apply the back-off on offsets rather than on the pointer itself:
+     * forming a pointer that lies before buf is undefined behaviour, even if
+     * it is only compared and then clamped. */
+    const size_t found = (size_t)(ptr - buf);
+    const size_t offset = aux->generic.offset;
+
+    *start = found > offset ? found - offset : 0;
+}
+
+/**
+ * \brief Returns non-zero for accel types which don't always catch up to the
+ * boundary of the scanned region.
+ *
+ * Only the two double vermicelli variants are reported. DSHUFTI is also
+ * inaccurate but it is not used by the hamsters, so it is not listed.
+ */
+static really_inline
+int inaccurate_accel(u8 type) {
+    switch (type) {
+    case ACCEL_DVERM:
+    case ACCEL_DVERM_NOCASE:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+/**
+ * \brief Streaming-mode acceleration: try to advance \a *start within \a buf
+ * before the main literal matcher runs.
+ *
+ * When \a *start is zero and there is history, the history buffer is checked
+ * first; \a *start is only advanced if the accel scheme gets all the way
+ * through the history. Nothing is done when the accel type is ACCEL_NONE or
+ * when fewer than MIN_ACCEL_LEN_STREAM bytes remain after \a *start.
+ *
+ * \param aux    Acceleration scheme to run.
+ * \param hbuf   History buffer (data preceding \a buf in the stream).
+ * \param hlen   Length of \a hbuf in bytes; may be zero.
+ * \param buf    Current buffer being scanned.
+ * \param len    Length of \a buf in bytes.
+ * \param start  In/out: offset in \a buf at which scanning should begin. It is
+ *               only ever moved forwards by this function.
+ */
+static never_inline
+void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen,
+                        const u8 *buf, size_t len, size_t *start) {
+    if (aux->accel_type == ACCEL_NONE || len - *start < MIN_ACCEL_LEN_STREAM) {
+        return;
+    }
+
+    const u8 offset = aux->generic.offset;
+
+    DEBUG_PRINTF("using accel %hhu offset %hhu\n", aux->accel_type, offset);
+
+    // Scan history buffer, but only if the start offset (which always refers to
+    // buf) is zero.
+
+    if (!*start && hlen) {
+        const u8 *ptr1 = hbuf;
+        const u8 *end1 = hbuf + hlen;
+        // The accel scanners are only run directly on the history when it
+        // holds at least 16 bytes.
+        if (hlen >= 16) {
+            ptr1 = run_hwlm_accel(aux, ptr1, end1);
+        }
+
+        // If the history is short, or the scheme may stop before the end of
+        // its input, and between 1 and 16 bytes are still unscanned, retry on
+        // a padded copy of that tail.
+        if ((hlen <= 16 || inaccurate_accel(aux->accel_type))
+            && end1 != ptr1 && end1 - ptr1 <= 16) {
+            DEBUG_PRINTF("already scanned %zu/%zu\n", ptr1 - hbuf, hlen);
+            /* see if we can finish off the history buffer completely */
+            u8 ALIGN_DIRECTIVE temp[17];
+            ptrdiff_t tlen = end1 - ptr1;
+            // temp = remaining history bytes followed by zero padding (17
+            // bytes in total, so there is always at least one byte of padding).
+            memcpy(temp, ptr1, tlen);
+            memset(temp + tlen, 0, 17 - tlen);
+            // Place the first byte of the main buffer directly after the
+            // history tail so that a two-byte scheme can see a pair that
+            // straddles the history/buffer boundary.
+            if (len) { /* for dverm */
+                temp[end1 - ptr1] = *buf;
+            }
+
+            const u8 *tempp = run_hwlm_accel(aux, temp, temp + 17);
+
+            // The scanner got past every copied history byte, so the history
+            // can be treated as fully consumed.
+            if (tempp - temp >= tlen) {
+                ptr1 = end1;
+            }
+            DEBUG_PRINTF("got %zu\n", tempp - temp);
+        }
+
+        // The scheme stopped somewhere inside the history: leave *start alone
+        // and let the caller scan from the beginning of buf.
+        if (ptr1 != end1) {
+            DEBUG_PRINTF("bailing in history\n");
+            return;
+        }
+    }
+
+    DEBUG_PRINTF("scanning main buffer, start=%zu, len=%zu\n", *start, len);
+
+    const u8 *ptr2 = buf + *start;
+    const u8 *end2 = buf + len;
+
+    const u8 *found = run_hwlm_accel(aux, ptr2, end2);
+
+    // Back the reported position off by 'offset' bytes. *start is advanced
+    // only when the backed-off position is still at or after the original
+    // start; otherwise it is left unchanged and the shortfall is merely
+    // logged (and only when there is history).
+    if (found >= ptr2 + offset) {
+        size_t delta = found - offset - ptr2;
+        DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len);
+        *start += delta;
+    } else if (hlen) {
+        UNUSED size_t remaining = offset + ptr2 - found;
+        DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", remaining, hlen);
+    }
+}
+
+/* Block-mode entry point. Returns HWLM_SUCCESS straight away when no groups
+ * are switched on; otherwise hands the scan to the noodle engine or, after an
+ * optional acceleration pass that may move 'start' forward, to FDR, and
+ * returns that engine's result. */
+hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len,
+                      size_t start, HWLMCallback cb, void *ctxt,
+                      hwlm_group_t groups) {
+    DEBUG_PRINTF("buf len=%zu, start=%zu, groups=%llx\n", len, start, groups);
+    if (!groups) {
+        DEBUG_PRINTF("groups all off\n");
+        return HWLM_SUCCESS;
+    }
+
+    assert(start < len);
+
+    if (t->type == HWLM_ENGINE_NOOD) {
+        /* noodle only sees the region from 'start' onwards */
+        DEBUG_PRINTF("calling noodExec\n");
+        return noodExec(HWLM_C_DATA(t), buf + start, len - start, start, cb,
+                        ctxt);
+    }
+
+    assert(t->type == HWLM_ENGINE_FDR);
+
+    /* accel1 is chosen when every active group lies within accel1_groups;
+     * otherwise accel0 is used. */
+    const union AccelAux *accel;
+    if (groups & ~t->accel1_groups) {
+        accel = &t->accel0;
+    } else {
+        DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type);
+        accel = &t->accel1;
+    }
+
+    do_accel_block(accel, buf, len, &start);
+    DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups,
+                 start);
+    return fdrExec(HWLM_C_DATA(t), buf, len, start, cb, ctxt, groups);
+}
+
+/* Streaming-mode entry point. The history buffer and the current buffer are
+ * taken from scratch->core_info; 'len' and 'start' refer to the current
+ * buffer. Returns HWLM_SUCCESS straight away when no groups are switched on;
+ * otherwise returns the result of the noodle or FDR engine. */
+hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch,
+                               size_t len, size_t start, HWLMCallback cb,
+                               void *ctxt, hwlm_group_t groups,
+                               u8 *stream_state) {
+    const u8 *hbuf = scratch->core_info.hbuf;
+    const size_t hlen = scratch->core_info.hlen;
+    const u8 *buf = scratch->core_info.buf;
+
+    DEBUG_PRINTF("hbuf len=%zu, buf len=%zu, start=%zu, groups=%llx\n", hlen,
+                 len, start, groups);
+
+    if (!groups) {
+        DEBUG_PRINTF("groups all off\n");
+        return HWLM_SUCCESS;
+    }
+
+    assert(start < len);
+
+    if (t->type == HWLM_ENGINE_NOOD) {
+        DEBUG_PRINTF("calling noodExec\n");
+        // If we've been handed a start offset, we can use a block mode scan at
+        // that offset.
+        if (start) {
+            return noodExec(HWLM_C_DATA(t), buf + start, len - start, start,
+                            cb, ctxt);
+        } else {
+            return noodExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, cb,
+                                     ctxt, scratch->fdr_temp_buf,
+                                     FDR_TEMP_BUF_SIZE);
+        }
+    } else {
+        // Mirror hwlmExec: any engine other than noodle must be FDR, so an
+        // unexpected engine type is caught in debug builds rather than being
+        // silently run through the FDR path.
+        assert(t->type == HWLM_ENGINE_FDR);
+        // accel1 is chosen when every active group lies within accel1_groups;
+        // otherwise accel0 is used.
+        const union AccelAux *aa = &t->accel0;
+        if ((groups & ~t->accel1_groups) == 0) {
+            DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type);
+            aa = &t->accel1;
+        }
+        // if no active stream state, use acceleration
+        if (!fdrStreamStateActive(HWLM_C_DATA(t), stream_state)) {
+            do_accel_streaming(aa, hbuf, hlen, buf, len, &start);
+        }
+        DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups,
+                     start);
+        return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len,
+                                start, cb, ctxt, groups, stream_state);
+    }
+}
--- a/src/hwlm/hwlm.h
+++ b/src/hwlm/hwlm.h
@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: runtime API.
+ */
+
+#ifndef HWLM_H
+#define HWLM_H
+
+#include "ue2common.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** \brief Error return type for exec functions. */
+typedef int hwlm_error_t;
+
+/** \brief Type representing a set of groups as a bitmap. */
+typedef u64a hwlm_group_t;
+
+/** \brief HWLM callback return type. */
+typedef hwlm_group_t hwlmcb_rv_t;
+
+/** \brief Value representing all possible literal groups. */
+#define HWLM_ALL_GROUPS         ((hwlm_group_t)~0ULL)
+
+/** \brief Callback return value indicating that we should continue matching. */
+#define HWLM_CONTINUE_MATCHING  HWLM_ALL_GROUPS
+
+/** \brief Callback return value indicating that we should halt matching. */
+#define HWLM_TERMINATE_MATCHING 0
+
+/** \brief Matching finished without being terminated by the user. */
+#define HWLM_SUCCESS       0
+
+/** \brief The user terminated matching by returning HWLM_TERMINATE_MATCHING
+ * from the match callback. */
+#define HWLM_TERMINATED    1
+
+/** \brief An error occurred during matching.
+ *
+ * This should only be used if an unsupported engine was called (like one
+ * designed for a different architecture). */
+#define HWLM_ERROR_UNKNOWN 2
+
+struct hs_scratch;
+struct HWLM;
+
+/** \brief The type for an HWLM callback.
+ *
+ * This callback receives a start-of-match offset, an end-of-match offset, the
+ * ID of the match and the context pointer that was passed into \ref
+ * hwlmExec or \ref hwlmExecStreaming.
+ *
+ * A callback return of \ref HWLM_TERMINATE_MATCHING will stop matching.
+ *
+ * A callback return of \ref HWLM_CONTINUE_MATCHING continues matching.
+ *
+ * An arbitrary group mask may be given as the return value. This will be taken
+ * as a hint by the underlying engine that only literals with groups
+ * overlapping the provided mask need to be reported.
+ *
+ * The underlying engine may choose not to report a match if there is no group
+ * belonging to the literal which was active at the time the end match location
+ * was first reached.
+ */
+typedef hwlmcb_rv_t (*HWLMCallback)(size_t start, size_t end, u32 id,
+                                    void *context);
+
+/** \brief Match strings in table.
+ *
+ * If a match occurs, the callback function given will be called with the index
+ * of the last character in the string and the \p context (passed through
+ * without interpretation).
+ *
+ * Returns \ref HWLM_TERMINATED if scanning is cancelled due to the callback
+ * returning \ref HWLM_TERMINATE_MATCHING.
+ *
+ * \p start is the first offset at which a match may start.
+ *
+ * The underlying engine may choose not to report any match which starts before
+ * the first possible match of a literal which is in the initial group mask.
+ */
+hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len,
+                      size_t start, HWLMCallback callback, void *context,
+                      hwlm_group_t groups);
+
+/** \brief As for \ref hwlmExec, but a streaming case across two buffers.
+ *
+ * \p scratch is used to access fdr_temp_buf and to access the history buffer,
+ * history length and the main buffer.
+ *
+ * \p len is the length of the main buffer to be scanned.
+ *
+ * \p start is an advisory hint representing the first offset at which a match
+ * may start. Some underlying literal matchers may not respect it.
+ *
+ * Two buffers/lengths are provided. Matches that occur entirely within
+ * the history buffer will not be reported by this function. The offsets
+ * reported for the main buffer are relative to the start of that buffer (a
+ * match at byte 10 of the main buffer is reported as 10). Matches that start
+ * in the history buffer will have starts reported with 'negative' values.
+ */
+hwlm_error_t hwlmExecStreaming(const struct HWLM *tab,
+                               struct hs_scratch *scratch, size_t len,
+                               size_t start, HWLMCallback callback,
+                               void *context, hwlm_group_t groups,
+                               u8 *stream_state);
+
+#ifdef __cplusplus
+}       /* extern "C" */
+#endif
+
+#endif
--- a/src/hwlm/hwlm_build.cpp
+++ b/src/hwlm/hwlm_build.cpp
@ -0,0 +1,635 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: build code.
+ */
+#include "grey.h"
+#include "hwlm.h"
+#include "hwlm_build.h"
+#include "hwlm_internal.h"
+#include "noodle_engine.h"
+#include "noodle_build.h"
+#include "ue2common.h"
+#include "fdr/fdr_compile.h"
+#include "fdr/fdr.h"
+#include "nfa/shufticompile.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/charreach.h"
+#include "util/compare.h"
+#include "util/compile_context.h"
+#include "util/compile_error.h"
+#include "util/dump_charclass.h"
+#include "util/target_info.h"
+#include "util/ue2string.h"
+#include "util/verify_types.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+using namespace std;
+
+namespace ue2 {
+
+/** \brief Number of leading bytes of each literal that are examined when
+ * searching for a forward acceleration scheme. */
+static const unsigned int MAX_ACCEL_OFFSET = 16;
+/** \brief Largest character class (in characters) for which a shufti
+ * acceleration scheme will be attempted. */
+static const unsigned int MAX_SHUFTI_WIDTH = 240;
+
+/**
+ * \brief Try to build a dverm (two-byte pair) acceleration scheme.
+ *
+ * Every adjacent byte pair within the first MAX_ACCEL_OFFSET bytes of the
+ * first literal is treated as a candidate. A candidate survives only if the
+ * same pair (compared with the case bit masked off where required) also
+ * occurs within the first MAX_ACCEL_OFFSET bytes of every literal in \p lits.
+ * The best surviving candidate, as ranked by candidate::operator>, is written
+ * to \p aux.
+ *
+ * \param lits Literals to accelerate. Must be non-empty, as lits.front() is
+ *        dereferenced. NOTE(review): the "length() - 1" loop bounds assume
+ *        that no literal is empty -- confirm callers guarantee this.
+ * \param aux Receives an ACCEL_DVERM or ACCEL_DVERM_NOCASE scheme on success;
+ *        not written to on failure.
+ * \return true if a scheme was written to \p aux, false if no pair is common
+ *         to all literals.
+ */
+static
+bool findDVerm(const vector<const hwlmLiteral *> &lits, AccelAux *aux) {
+    const hwlmLiteral &first = *lits.front();
+
+    /* A candidate byte pair together with the properties used to rank it. */
+    struct candidate {
+        /* Placeholder that loses to any valid candidate. */
+        candidate(void)
+            : c1(0), c2(0), max_offset(0), b5insens(false), valid(false) {}
+        /* Candidate made from the pair at [offset, offset + 1] in base. */
+        candidate(const hwlmLiteral &base, u32 offset)
+            : c1(base.s[offset]), c2(base.s[offset + 1]), max_offset(0),
+              b5insens(false), valid(true) {}
+        char c1; /* first byte of the pair */
+        char c2; /* second byte of the pair */
+        u32 max_offset; /* largest first-occurrence offset over all literals */
+        bool b5insens; /* pair must be compared after masking with CASE_CLEAR */
+        bool valid; /* false only for the default-constructed placeholder */
+
+        /* Returns true if this candidate should replace other. Preference
+         * order: valid over invalid; pairs whose two bytes differ over pairs
+         * of equal bytes; case-sensitive over case-insensitive; smaller
+         * max_offset. Ties return true, so a later candidate of equal rank
+         * replaces an earlier one. */
+        bool operator>(const candidate &other) const {
+            if (!valid) {
+                return false;
+            }
+
+            if (!other.valid) {
+                return true;
+            }
+
+            if (other.cdiffers() && !cdiffers()) {
+                return false;
+            }
+
+            if (!other.cdiffers() && cdiffers()) {
+                return true;
+            }
+
+            if (!other.b5insens && b5insens) {
+                return false;
+            }
+
+            if (other.b5insens && !b5insens) {
+                return true;
+            }
+
+            if (max_offset > other.max_offset) {
+                return false;
+            }
+
+            return true;
+        }
+
+        /* True if the two bytes of the pair are distinct under the
+         * comparison (exact or CASE_CLEAR-masked) that will be used. */
+        bool cdiffers(void) const {
+            if (!b5insens) {
+                return c1 != c2;
+            }
+            return (c1 & CASE_CLEAR) != (c2 & CASE_CLEAR);
+        }
+    };
+
+    candidate best;
+
+    /* The bound is reduced by one because a pair needs a byte at i + 1. */
+    for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()) - 1; i++) {
+        candidate curr(first, i);
+
+        /* check to see if this pair appears in each string */
+        for (const auto &lit_ptr : lits) {
+            const hwlmLiteral &lit = *lit_ptr;
+            if (lit.nocase && (ourisalpha(curr.c1) || ourisalpha(curr.c2))) {
+                curr.b5insens = true; /* no choice but to be case insensitive */
+            }
+
+            /* found: match under the current comparison mode;
+             * found_nc: match after masking both sides with CASE_CLEAR. */
+            bool found = false;
+            bool found_nc = false;
+            for (u32 j = 0;
+                 !found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 1; j++) {
+                found |= curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1];
+                found_nc |= (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR)
+                    && (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR);
+
+                if (curr.b5insens) {
+                    found = found_nc;
+                }
+            }
+
+            /* Only a masked match exists in this literal: switch the
+             * candidate to case-insensitive mode rather than rejecting it. */
+            if (!curr.b5insens && !found && found_nc) {
+                curr.b5insens = true;
+                found = true;
+            }
+
+            if (!found) {
+                goto next_candidate;
+            }
+        }
+
+        /* check to find the max offset where this appears: the first
+         * occurrence in each literal is located and the largest is kept */
+        for (const auto &lit_ptr : lits) {
+            const hwlmLiteral &lit = *lit_ptr;
+            for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 1;
+                 j++) {
+                bool found = false;
+                if (curr.b5insens) {
+                    found = (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR)
+                     && (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR);
+                } else {
+                    found = curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1];
+                }
+
+                if (found) {
+                    curr.max_offset = MAX(curr.max_offset, j);
+                    break;
+                }
+            }
+        }
+
+        if (curr > best) {
+            best = curr;
+        }
+
+    next_candidate:;
+    }
+
+    /* No pair was common to every literal. */
+    if (!best.valid) {
+        return false;
+    }
+
+    aux->dverm.offset = verify_u8(best.max_offset);
+
+    if (!best.b5insens) {
+        aux->dverm.accel_type = ACCEL_DVERM;
+        aux->dverm.c1 = best.c1;
+        aux->dverm.c2 = best.c2;
+        DEBUG_PRINTF("built dverm for %02hhx%02hhx\n",
+                     aux->dverm.c1, aux->dverm.c2);
+    } else {
+        /* The no-case scheme stores the CASE_CLEAR-masked bytes. */
+        aux->dverm.accel_type = ACCEL_DVERM_NOCASE;
+        aux->dverm.c1 = best.c1 & CASE_CLEAR;
+        aux->dverm.c2 = best.c2 & CASE_CLEAR;
+        DEBUG_PRINTF("built dverm nc for %02hhx%02hhx\n",
+                     aux->dverm.c1, aux->dverm.c2);
+    }
+    return true;
+}
+
+/**
+ * \brief Try to build a verm (single byte) acceleration scheme.
+ *
+ * Every byte within the first MAX_ACCEL_OFFSET bytes of the first literal is
+ * treated as a candidate. A candidate survives only if the same byte
+ * (compared with the case bit masked off where required) also occurs within
+ * the first MAX_ACCEL_OFFSET bytes of every literal in \p lits. The best
+ * surviving candidate, as ranked by candidate::operator>, is written to
+ * \p aux.
+ *
+ * \param lits Literals to accelerate. Must be non-empty, as lits.front() is
+ *        dereferenced.
+ * \param aux Receives an ACCEL_VERM or ACCEL_VERM_NOCASE scheme on success;
+ *        not written to on failure.
+ * \return true if a scheme was written to \p aux, false if no byte is common
+ *         to all literals.
+ */
+static
+bool findSVerm(const vector<const hwlmLiteral *> &lits, AccelAux *aux) {
+    const hwlmLiteral &first = *lits.front();
+
+    /* A candidate byte together with the properties used to rank it. */
+    struct candidate {
+        /* Placeholder that loses to any valid candidate. */
+        candidate(void)
+            : c(0), max_offset(0), b5insens(false), valid(false) {}
+        /* Candidate made from the byte at the given offset in base. */
+        candidate(const hwlmLiteral &base, u32 offset)
+            : c(base.s[offset]), max_offset(0),
+              b5insens(false), valid(true) {}
+        char c; /* the candidate byte */
+        u32 max_offset; /* largest first-occurrence offset over all literals */
+        bool b5insens; /* byte must be compared after masking with CASE_CLEAR */
+        bool valid; /* false only for the default-constructed placeholder */
+
+        /* Returns true if this candidate should replace other. Preference
+         * order: valid over invalid; case-sensitive over case-insensitive;
+         * smaller max_offset. Ties return true, so a later candidate of equal
+         * rank replaces an earlier one. */
+        bool operator>(const candidate &other) const {
+            if (!valid) {
+                return false;
+            }
+
+            if (!other.valid) {
+                return true;
+            }
+
+            if (!other.b5insens && b5insens) {
+                return false;
+            }
+
+            if (other.b5insens && !b5insens) {
+                return true;
+            }
+
+            if (max_offset > other.max_offset) {
+                return false;
+            }
+
+            return true;
+        }
+    };
+
+    candidate best;
+
+    for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()); i++) {
+        candidate curr(first, i);
+
+        /* check to see if this character appears in each string */
+        for (const auto &lit_ptr : lits) {
+            const hwlmLiteral &lit = *lit_ptr;
+            if (lit.nocase && ourisalpha(curr.c)) {
+                curr.b5insens = true; /* no choice but to be case insensitive */
+            }
+
+            /* found: match under the current comparison mode;
+             * found_nc: match after masking both sides with CASE_CLEAR. */
+            bool found = false;
+            bool found_nc = false;
+            for (u32 j = 0;
+                 !found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) {
+                found |= curr.c == lit.s[j];
+                found_nc |= (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR);
+
+                if (curr.b5insens) {
+                    found = found_nc;
+                }
+            }
+
+            /* Only a masked match exists in this literal: switch the
+             * candidate to case-insensitive mode rather than rejecting it. */
+            if (!curr.b5insens && !found && found_nc) {
+                curr.b5insens = true;
+                found = true;
+            }
+
+            if (!found) {
+                goto next_candidate;
+            }
+        }
+
+        /* check to find the max offset where this appears: the first
+         * occurrence in each literal is located and the largest is kept */
+        for (const auto &lit_ptr : lits) {
+            const hwlmLiteral &lit = *lit_ptr;
+            for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) {
+                bool found = false;
+                if (curr.b5insens) {
+                    found = (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR);
+                } else {
+                    found = curr.c == lit.s[j];
+                }
+
+                if (found) {
+                    curr.max_offset = MAX(curr.max_offset, j);
+                    break;
+                }
+            }
+        }
+
+        if (curr > best) {
+            best = curr;
+        }
+
+    next_candidate:;
+    }
+
+    /* No byte was common to every literal. */
+    if (!best.valid) {
+        return false;
+    }
+
+    if (!best.b5insens) {
+        aux->verm.accel_type = ACCEL_VERM;
+        aux->verm.c = best.c;
+        DEBUG_PRINTF("built verm for %02hhx\n", aux->verm.c);
+    } else {
+        /* The no-case scheme stores the CASE_CLEAR-masked byte. */
+        aux->verm.accel_type = ACCEL_VERM_NOCASE;
+        aux->verm.c = best.c & CASE_CLEAR;
+        DEBUG_PRINTF("built verm nc for %02hhx\n", aux->verm.c);
+    }
+    aux->verm.offset = verify_u8(best.max_offset);
+
+    return true;
+}
+
+/**
+ * \brief Select the literals that share at least one group with
+ * \p expected_groups.
+ *
+ * \param lits Literals to choose from.
+ * \param expected_groups Group mask a literal must overlap to be selected.
+ * \param filtered_lits Pointers to the selected literals are appended here,
+ *        in the order they appear in \p lits.
+ * \param min_len Set to the length of the shortest selected literal, capped
+ *        at MAX_ACCEL_OFFSET.
+ */
+static
+void filterLits(const vector<hwlmLiteral> &lits, hwlm_group_t expected_groups,
+                vector<const hwlmLiteral *> *filtered_lits, u32 *min_len) {
+    *min_len = MAX_ACCEL_OFFSET;
+
+    for (size_t idx = 0; idx < lits.size(); idx++) {
+        const hwlmLiteral &candidate = lits[idx];
+        const bool in_expected = (candidate.groups & expected_groups) != 0;
+        if (!in_expected) {
+            continue;
+        }
+
+        // Track the shortest selected literal seen so far.
+        const size_t length = candidate.s.length();
+        if (*min_len > length) {
+            *min_len = verify_u32(length);
+        }
+
+        filtered_lits->push_back(&candidate);
+
+#ifdef DEBUG
+        DEBUG_PRINTF("lit:");
+        for (u32 pos = 0; pos < candidate.s.length(); pos++) {
+            printf("%02hhx", candidate.s[pos]);
+        }
+        printf("\n");
+#endif
+    }
+}
+
+/**
+ * \brief Choose a forward acceleration scheme for the literals in \p lits
+ * that belong to at least one group in \p expected_groups.
+ *
+ * Schemes are attempted in order: dverm (byte pair), verm (single byte) and
+ * finally shufti over the narrowest character class found at any offset that
+ * is within the length of every selected literal.
+ *
+ * \param lits All literals in the matcher.
+ * \param expected_groups Only literals overlapping this mask are considered.
+ * \param aux Receives the chosen scheme. Its accel_type is only set when a
+ *        scheme was successfully built.
+ */
+static
+void findForwardAccelScheme(const vector<hwlmLiteral> &lits,
+                            hwlm_group_t expected_groups, AccelAux *aux) {
+    DEBUG_PRINTF("building accel expected=%016llx\n", expected_groups);
+    u32 min_len = MAX_ACCEL_OFFSET;
+    vector<const hwlmLiteral *> filtered_lits;
+
+    filterLits(lits, expected_groups, &filtered_lits, &min_len);
+    if (filtered_lits.empty()) {
+        return;
+    }
+
+    if (findDVerm(filtered_lits, aux)
+        || findSVerm(filtered_lits, aux)) {
+        return;
+    }
+
+    // Build, for each offset, the set of characters that any selected literal
+    // can have there. The already-filtered list is reused so that the group
+    // predicate lives in exactly one place (filterLits).
+    vector<CharReach> reach(MAX_ACCEL_OFFSET, CharReach());
+    for (const hwlmLiteral *lit_ptr : filtered_lits) {
+        const hwlmLiteral &lit = *lit_ptr;
+        for (u32 i = 0; i < MAX_ACCEL_OFFSET && i < lit.s.length(); i++) {
+            unsigned char c = lit.s[i];
+            if (lit.nocase) {
+                DEBUG_PRINTF("adding %02hhx to %u\n", mytoupper(c), i);
+                DEBUG_PRINTF("adding %02hhx to %u\n", mytolower(c), i);
+                reach[i].set(mytoupper(c));
+                reach[i].set(mytolower(c));
+            } else {
+                DEBUG_PRINTF("adding %02hhx to %u\n", c, i);
+                reach[i].set(c);
+            }
+        }
+    }
+
+    // Pick the offset with the narrowest reach. Only offsets below min_len
+    // are considered, as those exist in every selected literal.
+    u32 min_count = ~0U;
+    u32 min_offset = ~0U;
+    for (u32 i = 0; i < min_len; i++) {
+        size_t count = reach[i].count();
+        DEBUG_PRINTF("offset %u is %s (reach %zu)\n", i,
+                     describeClass(reach[i]).c_str(), count);
+        if (count < min_count) {
+            min_count = (u32)count;
+            min_offset = i;
+        }
+    }
+
+    // min_offset must be a real index below min_len. It is still ~0U if the
+    // loop above never ran (min_len == 0); never index reach[] with that,
+    // even in builds where assertions are compiled out.
+    assert(min_offset < min_len);
+    if (min_offset >= min_len) {
+        DEBUG_PRINTF("no usable offset\n");
+        return;
+    }
+
+    if (min_count > MAX_SHUFTI_WIDTH) {
+        DEBUG_PRINTF("min shufti with %u chars is too wide\n", min_count);
+        return;
+    }
+
+    const CharReach &cr = reach[min_offset];
+    if (shuftiBuildMasks(cr, &aux->shufti.lo, &aux->shufti.hi) != -1) {
+        DEBUG_PRINTF("built shufti for %s (%zu chars, offset %u)\n",
+                     describeClass(cr).c_str(), cr.count(), min_offset);
+        aux->shufti.accel_type = ACCEL_SHUFTI;
+        aux->shufti.offset = verify_u8(min_offset);
+        return;
+    }
+
+    DEBUG_PRINTF("fail\n");
+}
+
+/**
+ * \brief Fill in both forward acceleration schemes of \p h.
+ *
+ * accel1 is built from the literals overlapping \p expected_groups, and
+ * accel0 from the literals overlapping any group. The mask that accel1 was
+ * built for is recorded in accel1_groups.
+ */
+static
+void buildForwardAccel(HWLM *h, const vector<hwlmLiteral> &lits,
+                       hwlm_group_t expected_groups) {
+    AccelAux *const restricted_scheme = &h->accel1;
+    AccelAux *const general_scheme = &h->accel0;
+
+    findForwardAccelScheme(lits, expected_groups, restricted_scheme);
+    findForwardAccelScheme(lits, HWLM_ALL_GROUPS, general_scheme);
+
+    h->accel1_groups = expected_groups;
+}
+
+/** \brief In DEBUG builds, print the id, groups, escaped string and case
+ * sensitivity of every literal; otherwise a no-op. */
+static
+void dumpLits(UNUSED const vector<hwlmLiteral> &lits) {
+#ifdef DEBUG
+    DEBUG_PRINTF("building lit table for:\n");
+    for (size_t idx = 0; idx < lits.size(); idx++) {
+        const hwlmLiteral &entry = lits[idx];
+        const char *nocase_tag = entry.nocase ? " (nc)" : "";
+        printf("\t%u:%016llx %s%s\n", entry.id, entry.groups,
+               escapeString(entry.s).c_str(), nocase_tag);
+    }
+#endif
+}
+
+#ifndef NDEBUG
+// Assertion helper: true when no literal has an empty group mask.
+static
+bool everyoneHasGroups(const vector<hwlmLiteral> &lits) {
+    for (size_t idx = 0; idx < lits.size(); idx++) {
+        if (lits[idx].groups == 0) {
+            return false;
+        }
+    }
+    return true;
+}
+#endif
+
+/**
+ * \brief Decide whether the literal set can be handled by the noodle engine.
+ *
+ * Noodle is used only when it is enabled in the grey box, there is exactly
+ * one literal, that literal has no supplementary mask and (in streaming mode)
+ * its length plus one does not exceed the maximum history.
+ *
+ * \param lits The literal set.
+ * \param stream_control Streaming parameters, or nullptr in block mode.
+ * \param cc Compile context supplying the grey box settings.
+ */
+static
+bool isNoodleable(const vector<hwlmLiteral> &lits,
+                  const hwlmStreamingControl *stream_control,
+                  const CompileContext &cc) {
+    if (!cc.grey.allowNoodle) {
+        return false;
+    }
+
+    if (lits.size() != 1) {
+        DEBUG_PRINTF("too many literals for noodle\n");
+        return false;
+    }
+
+    // Exactly one literal from here on.
+    const hwlmLiteral &only = lits.front();
+
+    // stream_control is nullptr in block mode, where history is irrelevant.
+    if (stream_control
+        && only.s.length() + 1 > stream_control->history_max) {
+        DEBUG_PRINTF("length of %zu too long for history max %zu\n",
+                     only.s.length(),
+                     stream_control->history_max);
+        return false;
+    }
+
+    if (!only.msk.empty()) {
+        DEBUG_PRINTF("noodle can't handle supplementary masks\n");
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Builds the HWLM runtime structure: validates the literal set against the
+ * grey box resource limits, builds either a noodle or an FDR engine, copies
+ * the engine after a cacheline-rounded HWLM header and, for FDR, optionally
+ * adds forward acceleration schemes.
+ *
+ * Throws ResourceLimitError when a literal count, length, total character or
+ * engine size limit is exceeded, and CompileError if a literal uses the
+ * reserved id 0xffffffff. Returns nullptr if the engine builder produced no
+ * engine.
+ */
+aligned_unique_ptr<HWLM> hwlmBuild(const vector<hwlmLiteral> &lits,
+                                   hwlmStreamingControl *stream_control,
+                                   bool make_small, const CompileContext &cc,
+                                   hwlm_group_t expected_groups) {
+    assert(!lits.empty());
+    dumpLits(lits);
+
+    if (stream_control) {
+        assert(stream_control->history_min <= stream_control->history_max);
+    }
+
+    // Check that we haven't exceeded the maximum number of literals.
+    if (lits.size() > cc.grey.limitLiteralCount) {
+        throw ResourceLimitError();
+    }
+
+    // Safety and resource limit checks.
+    u64a total_chars = 0;
+    for (const auto &lit : lits) {
+        assert(!lit.s.empty());
+
+        if (lit.s.length() > cc.grey.limitLiteralLength) {
+            throw ResourceLimitError();
+        }
+        total_chars += lit.s.length();
+        if (total_chars > cc.grey.limitLiteralMatcherChars) {
+            throw ResourceLimitError();
+        }
+
+        // We do not allow the all-ones ID, as we reserve that for internal use
+        // within literal matchers.
+        if (lit.id == 0xffffffffu) {
+            assert(!"reserved id 0xffffffff used");
+            throw CompileError("Internal error.");
+        }
+    }
+
+    // Engine type, size in bytes and the built engine itself; filled in by
+    // whichever branch below is taken.
+    u8 engType = 0;
+    size_t engSize = 0;
+    shared_ptr<void> eng;
+
+    DEBUG_PRINTF("building table with %zu strings\n", lits.size());
+
+    assert(everyoneHasGroups(lits));
+
+    if (isNoodleable(lits, stream_control, cc)) {
+        DEBUG_PRINTF("build noodle table\n");
+        engType = HWLM_ENGINE_NOOD;
+        const hwlmLiteral &lit = lits.front();
+        auto noodle = noodBuildTable((const u8 *)lit.s.c_str(), lit.s.length(),
+                                     lit.nocase, lit.id);
+        if (noodle) {
+            engSize = noodSize(noodle.get());
+        }
+        if (stream_control) {
+            // For now, a single literal still goes to noodle and asks
+            // for a great big history
+            stream_control->literal_history_required = lit.s.length() - 1;
+            assert(stream_control->literal_history_required
+                   <= stream_control->history_max);
+            stream_control->literal_stream_state_required = 0;
+        }
+        eng = move(noodle);
+    } else {
+        DEBUG_PRINTF("building a new deal\n");
+        engType = HWLM_ENGINE_FDR;
+        // stream_control is handed to the FDR builder, which is presumably
+        // responsible for filling in its OUT fields -- TODO confirm.
+        auto fdr = fdrBuildTable(lits, make_small, cc.target_info, cc.grey,
+                            stream_control);
+        if (fdr) {
+            engSize = fdrSize(fdr.get());
+        }
+        eng = move(fdr);
+    }
+
+    // The engine builder failed to produce a table.
+    if (!eng) {
+        return nullptr;
+    }
+
+    assert(engSize);
+    if (engSize > cc.grey.limitLiteralMatcherSize) {
+        throw ResourceLimitError();
+    }
+
+    // Layout: HWLM header rounded up by ROUNDUP_CL, followed by the engine.
+    // The allocation is zeroed, so the accel fields start out as zero.
+    auto h = aligned_zmalloc_unique<HWLM>(ROUNDUP_CL(sizeof(HWLM)) + engSize);
+
+    h->type = engType;
+    memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
+
+    // Forward acceleration is only built for FDR, and only when enabled.
+    if (engType == HWLM_ENGINE_FDR && cc.grey.hamsterAccelForward) {
+        buildForwardAccel(h.get(), lits, expected_groups);
+    }
+
+    if (stream_control) {
+        DEBUG_PRINTF("requires %zu (of max %zu) bytes of history\n",
+                     stream_control->literal_history_required,
+                     stream_control->history_max);
+        assert(stream_control->literal_history_required
+                    <= stream_control->history_max);
+    }
+
+    return h;
+}
+
+/* Total size in bytes of an HWLM structure: the cacheline-rounded header plus
+ * the embedded engine. Returns 0 if the engine type is not recognised or the
+ * engine reports a size of zero. */
+size_t hwlmSize(const HWLM *h) {
+    size_t payload;
+
+    if (h->type == HWLM_ENGINE_NOOD) {
+        payload = noodSize((const noodTable *)HWLM_C_DATA(h));
+    } else if (h->type == HWLM_ENGINE_FDR) {
+        payload = fdrSize((const FDR *)HWLM_C_DATA(h));
+    } else {
+        payload = 0;
+    }
+
+    return payload ? payload + ROUNDUP_CL(sizeof(*h)) : 0;
+}
+
+size_t hwlmFloodProneSuffixLen(size_t numLiterals, const CompileContext &cc) {
+    // Returned when no suffix length is treated as flood-prone.
+    const size_t NO_LIMIT = ~(size_t)0;
+    // Returned for every engine other than noodle.
+    const size_t CONSERVATIVE_LEN = 3;
+
+    // NOTE: the thresholds below are conservative estimates of
+    // flood-proneness, derived from internal details of the literal engines
+    // that sit behind HWLM. Changes to those engines may require changes
+    // here as well.
+
+    DEBUG_PRINTF("%zu literals\n", numLiterals);
+
+    if (cc.grey.allowNoodle && numLiterals <= 1) {
+        DEBUG_PRINTF("noodle\n");
+        return NO_LIMIT;
+    }
+
+    if (cc.grey.fdrAllowTeddy && numLiterals <= 48) {
+        DEBUG_PRINTF("teddy\n");
+        return CONSERVATIVE_LEN;
+    }
+
+    if (cc.grey.fdrAllowTeddy && cc.target_info.has_avx2()
+        && numLiterals <= 96) {
+        DEBUG_PRINTF("avx2 teddy\n");
+        return CONSERVATIVE_LEN;
+    }
+
+    // TODO: raising this to 9 was considered, but that appears to hurt
+    // performance on floods in some FDR models, so it stays
+    // super-conservative for now.
+    DEBUG_PRINTF("fdr\n");
+    return CONSERVATIVE_LEN;
+}
+
+} // namespace ue2
--- a/src/hwlm/hwlm_build.h
+++ b/src/hwlm/hwlm_build.h
@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: build API.
+ */
+
+#ifndef HWLM_BUILD_H
+#define HWLM_BUILD_H
+
+#include "hwlm.h"
+#include "hwlm_literal.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+
+#include <memory>
+#include <vector>
+
+struct HWLM;
+
+namespace ue2 {
+
+struct CompileContext;
+struct Grey;
+struct target_t;
+
+/** \brief Structure gathering together the input/output parameters related to
+ * streaming mode operation. */
+struct hwlmStreamingControl {
+    /** \brief IN parameter: Upper limit on the amount of history that can be
+     * requested. */
+    size_t history_max;
+
+    /** \brief IN parameter: History already known to be used before literal
+     * analysis. */
+    size_t history_min;
+
+    /** \brief OUT parameter: History required by the literal matcher to
+     * correctly match all literals. */
+    size_t literal_history_required;
+
+    /** \brief OUT parameter: Stream state required by literal matcher in
+     * bytes. Can be zero, and generally will be small (0-8 bytes). */
+    size_t literal_stream_state_required;
+};
+
+/** \brief Build an \ref HWLM literal matcher runtime structure for a group of
+ * literals.
+ *
+ * \param lits The group of literals.
+ * \param stream_control Streaming control parameters. If the matcher will
+ *        operate in non-streaming (block) mode, this pointer should be NULL.
+ * \param make_small Optimise matcher for small size.
+ * \param cc Compile context.
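+ * \param expected_groups Mask of the literal groups expected to be switched
+ *        on when scanning. When forward acceleration is built, an additional
+ *        acceleration scheme is derived from only those literals that belong
+ *        to at least one of these groups. Defaults to all groups.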
+ *
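+ * Build failures are generally a result of memory allocation failure. These
+ * may result in a nullptr return value, or a std::bad_alloc exception being
+ * thrown. Exceeding one of the configured resource limits results in a
+ * ResourceLimitError exception being thrown.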
+ */
+aligned_unique_ptr<HWLM>
+hwlmBuild(const std::vector<hwlmLiteral> &lits,
+          hwlmStreamingControl *stream_control, bool make_small,
+          const CompileContext &cc,
+          hwlm_group_t expected_groups = HWLM_ALL_GROUPS);
+
+/**
+ * Returns an estimate of the number of repeated characters on the end of a
+ * literal that will make a literal set of size \a numLiterals suffer
+ * performance degradation.
+ */
+size_t hwlmFloodProneSuffixLen(size_t numLiterals, const CompileContext &cc);
+
+/** \brief Return the size in bytes of an HWLM structure. */
+size_t hwlmSize(const HWLM *h);
+
+} // namespace
+
+#endif // HWLM_BUILD_H
--- a/src/hwlm/hwlm_dump.cpp
+++ b/src/hwlm/hwlm_dump.cpp
@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: dump code.
+ */
+
+#include "config.h"
+
+#include "hwlm_dump.h"
+#include "hwlm_internal.h"
+#include "noodle_build.h"
+#include "ue2common.h"
+#include "fdr/fdr_dump.h"
+#include "nfa/accel_dump.h"
+
+#include <cstdio>
+
+#ifndef DUMP_SUPPORT
+#error No dump support!
+#endif
+
+namespace ue2 {
+
+/* Writes the engine-specific statistics for h to f, followed by the accel1
+ * group mask and both acceleration schemes. */
+void hwlmPrintStats(const HWLM *h, FILE *f) {
+    if (h->type == HWLM_ENGINE_NOOD) {
+        noodPrintStats((const noodTable *)HWLM_C_DATA(h), f);
+    } else if (h->type == HWLM_ENGINE_FDR) {
+        fdrPrintStats((const FDR *)HWLM_C_DATA(h), f);
+    } else {
+        fprintf(f, "<unknown hwlm subengine>\n");
+    }
+
+    fprintf(f, "accel1_groups: %016llx\n", h->accel1_groups);
+
+    // Scheme restricted to accel1_groups first, then the general one.
+    fprintf(f, "accel1:");
+    dumpAccelInfo(f, h->accel1);
+
+    fprintf(f, "accel0:");
+    dumpAccelInfo(f, h->accel0);
+}
+
+} // namespace ue2
--- a/src/hwlm/hwlm_dump.h
+++ b/src/hwlm/hwlm_dump.h
@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: dump API.
+ */
+
+#ifndef HWLM_DUMP_H
+#define HWLM_DUMP_H
+
+#ifdef DUMP_SUPPORT
+
+#include <cstdio>
+
+struct HWLM;
+
+namespace ue2 {
+
+/** \brief Dump some information about the given HWLM structure. */
+void hwlmPrintStats(const HWLM *h, FILE *f);
+
+} // namespace ue2
+
+#endif
+#endif
--- a/src/hwlm/hwlm_internal.h
+++ b/src/hwlm/hwlm_internal.h
@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: data structures.
+ */
+
+#ifndef HWLM_INTERNAL_H
+#define HWLM_INTERNAL_H
+
+#include "hwlm.h"
+#include "ue2common.h"
+#include "nfa/accel.h"
+
+/** \brief Underlying engine is FDR. */
+#define HWLM_ENGINE_FDR     12
+
+/** \brief Underlying engine is Noodle. */
+#define HWLM_ENGINE_NOOD    16
+
+/** \brief Main Hamster Wheel Literal Matcher header. Followed by
+ * engine-specific structure.
+ *
+ * The engine-specific structure is reached through \ref HWLM_DATA or
+ * \ref HWLM_C_DATA, which add ROUNDUP_CL(sizeof(struct HWLM)) to the address
+ * of this header. */
+struct HWLM {
+    u8 type; /**< HWLM_ENGINE_NOOD or HWLM_ENGINE_FDR */
+    hwlm_group_t accel1_groups; /**< accelerable groups. */
+    union AccelAux accel1; /**< used if group mask is subset of accel1_groups */
+    union AccelAux accel0; /**< fallback accel scheme */
+};
+
+/** \brief Fetch a const pointer to the underlying engine.
+ *
+ * Evaluates to \p p advanced by ROUNDUP_CL(sizeof(struct HWLM)) bytes, as a
+ * <tt>const void *</tt>. ROUNDUP_CL is defined elsewhere; presumably it rounds
+ * up to a cache-line multiple -- TODO confirm. */
+#define HWLM_C_DATA(p) ((const void *)((const char *)(p)                  \
+                                       + ROUNDUP_CL(sizeof(struct HWLM))))
+
+/** \brief Fetch a pointer to the underlying engine.
+ *
+ * Non-const counterpart of \ref HWLM_C_DATA; same offset computation. */
+#define HWLM_DATA(p) ((void *)((char *)(p) + ROUNDUP_CL(sizeof(struct HWLM))))
+
+#endif
--- a/src/hwlm/hwlm_literal.cpp
+++ b/src/hwlm/hwlm_literal.cpp
@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: literal representation at build time.
+ */
+#include "hwlm_literal.h"
+#include "util/bitutils.h" // for CASE_BIT
+#include "util/compare.h" // for ourisalpha
+#include "util/ue2string.h" // for escapeString
+
+#include <iomanip>
+#include <sstream>
+
+#include <boost/algorithm/cxx11/all_of.hpp>
+
+using namespace std;
+using namespace boost::algorithm;
+
+namespace ue2 {
+
+#ifdef DEBUG
+/** \brief Render a byte vector as a string of two-digit lowercase hex values
+ * (debug builds only). */
+static UNUSED
+std::string dumpMask(const vector<u8> &v) {
+    ostringstream out;
+    // Fill character and numeric base stick to the stream once set; only the
+    // field width has to be re-applied before each value.
+    out << setfill('0') << hex;
+    for (size_t i = 0; i < v.size(); ++i) {
+        out << setw(2) << (unsigned int)v[i];
+    }
+    return out.str();
+}
+#endif
+
+/**
+ * \brief Check that a msk/cmp pair does not contradict the literal \p s.
+ *
+ * The string, mask and compare vectors are walked backwards from their last
+ * elements, so the final mask byte is applied to the final character of the
+ * string. The walk stops when either the string or the mask runs out.
+ *
+ * For a case-insensitive literal, the case bit is removed from both the mask
+ * and the compare value at alphabetic positions before testing.
+ *
+ * \param s      Literal string.
+ * \param nocase True if the literal is case-insensitive.
+ * \param msk    Mask bytes.
+ * \param cmp    Compare bytes; must cover every position that \p msk does.
+ * \return false if some character of \p s fails <tt>(c & m) == v</tt>, true
+ *         otherwise.
+ */
+bool maskIsConsistent(const std::string &s, bool nocase, const vector<u8> &msk,
+                      const vector<u8> &cmp) {
+    string::const_reverse_iterator si = s.rbegin();
+    vector<u8>::const_reverse_iterator mi = msk.rbegin(), ci = cmp.rbegin();
+
+    for (; si != s.rend() && mi != msk.rend(); ++si, ++mi, ++ci) {
+        // cmp is advanced in lock-step with msk but is not part of the loop
+        // condition, so validate the iterator BEFORE it is dereferenced.
+        assert(ci != cmp.rend());
+
+        u8 c = *si, m = *mi, v = *ci;
+        if (nocase && ourisalpha(c)) {
+            // Case must not take part in the comparison at this position.
+            m &= ~CASE_BIT;
+            v &= ~CASE_BIT;
+        }
+
+        if ((c & m) != v) {
+            DEBUG_PRINTF("c = %02hhx; *ci = %02hhx m =%02hhx\n", c, *ci, m);
+            DEBUG_PRINTF("s = %s; dist = %zd\n", s.c_str(), si - s.rbegin());
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/** \brief Complete constructor, takes group information and msk/cmp.
+ *
+ * Both \p msk_in and \p cmp_in must have the same length, which must not
+ * exceed \ref HWLM_MASKLEN, and together they must be consistent with the
+ * literal (see \ref maskIsConsistent). A mask made up entirely of zero bytes
+ * is dropped, along with its compare vector. */
+hwlmLiteral::hwlmLiteral(const std::string &s_in, bool nocase_in,
+                         bool noruns_in, u32 id_in, hwlm_group_t groups_in,
+                         const vector<u8> &msk_in, const vector<u8> &cmp_in)
+    : s(s_in), id(id_in), nocase(nocase_in), noruns(noruns_in),
+      groups(groups_in), msk(msk_in), cmp(cmp_in) {
+    // Size preconditions: bounded length, and msk/cmp of identical size.
+    assert(msk.size() <= HWLM_MASKLEN);
+    assert(msk.size() == cmp.size());
+
+    DEBUG_PRINTF("literal '%s', msk=%s, cmp=%s\n",
+                 escapeString(s).c_str(), dumpMask(msk).c_str(),
+                 dumpMask(cmp).c_str());
+
+    // The supplied msk/cmp must be applicable to s.
+    assert(maskIsConsistent(s, nocase, msk, cmp));
+
+    // Good hygiene: an all-zero mask adds no comparison, so store it as empty.
+    const bool mask_is_noop = all_of_equal(msk.begin(), msk.end(), 0);
+    if (mask_is_noop) {
+        msk.clear();
+        cmp.clear();
+    }
+}
+
+} // namespace ue2
--- a/src/hwlm/hwlm_literal.h
+++ b/src/hwlm/hwlm_literal.h
@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Hamster Wheel Literal Matcher: literal representation at build time.
+ */
+
+#ifndef HWLM_LITERAL_H
+#define HWLM_LITERAL_H
+
+#include "hwlm.h"
+#include "ue2common.h"
+
+#include <string>
+#include <vector>
+
+namespace ue2 {
+
+/** \brief Max length of the hwlmLiteral::msk and hwlmLiteral::cmp vectors. */
+#define HWLM_MASKLEN 8
+
+/** \brief Class representing a literal, fed to \ref hwlmBuild. */
+struct hwlmLiteral {
+    std::string s; //!< \brief The literal itself.
+
+    /** \brief The ID to pass to the callback if this literal matches.
+     *
+     * Note that the special value 0xFFFFFFFF is reserved for internal use and
+     * should not be used. */
+    u32 id;
+
+    bool nocase; //!< \brief True if literal is case-insensitive.
+
+    /** \brief Matches for runs of this literal can be quashed.
+     *
+     * Advisory flag meaning that there is no value in returning runs of
+     * additional matches for a literal after the first one, so such matches
+     * can be quashed by the literal matcher. */
+    bool noruns;
+
+    /** \brief Set of groups that literal belongs to.
+     *
+     * Use \ref HWLM_ALL_GROUPS for a literal that could match regardless of
+     * the groups that are switched on. */
+    hwlm_group_t groups;
+
+    /** \brief Supplementary comparison mask.
+     *
+     * These two values add a supplementary comparison that is done over the
+     * final 8 bytes of the string -- if v is those bytes, then the string must
+     * match as well as (v & msk) == cmp.
+     *
+     * An empty msk is the safe way of not adding any comparison to the
+     * string; unnecessarily filling in msk may turn off optimizations.
+     *
+     * The msk/cmp mechanism must NOT place a value into the literal that
+     * conflicts with the contents of the string, but can be allowed to add
+     * additional power within the string -- for example, to allow some case
+     * sensitivity within a case-insensitive string.
+     *
+     * Values are stored in memory order -- i.e. the last byte of the mask
+     * corresponds to the last byte of the string. Both vectors must be the
+     * same size, and must not exceed \ref HWLM_MASKLEN in length.
+     */
+    std::vector<u8> msk;
+
+    /** \brief Supplementary comparison value.
+     *
+     * See documentation for \ref msk.
+     */
+    std::vector<u8> cmp;
+
+    /** \brief Simple constructor: no group information, no msk/cmp.
+     *
+     * The literal is placed in \ref HWLM_ALL_GROUPS, \ref noruns is false and
+     * both msk and cmp are left empty. */
+    hwlmLiteral(const std::string &s_in, bool nocase_in, u32 id_in)
+        : s(s_in), id(id_in), nocase(nocase_in), noruns(false),
+          groups(HWLM_ALL_GROUPS), msk(0), cmp(0) {}
+
+    /** \brief Complete constructor, takes group information and msk/cmp.
+     *
+     * This constructor takes a msk/cmp pair. Both must be vectors of length <=
+     * \ref HWLM_MASKLEN. */
+    hwlmLiteral(const std::string &s_in, bool nocase_in, bool noruns_in,
+                u32 id_in, hwlm_group_t groups_in,
+                const std::vector<u8> &msk_in, const std::vector<u8> &cmp_in);
+};
+
+/**
+ * Consistency test; returns false if the given msk/cmp test can never match
+ * the literal string s.
+ *
+ * The mask is aligned with the end of the string: the last byte of msk/cmp is
+ * tested against the last character of s, and so on backwards. When nocase
+ * is set, the case bit is ignored at alphabetic positions.
+ */
+bool maskIsConsistent(const std::string &s, bool nocase,
+                      const std::vector<u8> &msk, const std::vector<u8> &cmp);
+
+} // namespace ue2
+
+#endif // HWLM_LITERAL_H
--- a/src/hwlm/noodle_build.cpp
+++ b/src/hwlm/noodle_build.cpp
@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Noodle literal matcher: build code.
+ */
+#include <cstring> // for memcpy
+
+#include "noodle_build.h"
+#include "noodle_internal.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/compare.h"
+#include "util/verify_types.h"
+
+namespace ue2 {
+
+/**
+ * \brief Pick the offset of the two-byte fragment used as the scan key.
+ *
+ * Returns the index of the first position whose byte differs from its
+ * successor (compared in upper case when \p nocase is set and the byte is
+ * alphabetic). If no adjacent pair differs, the index of the last pair
+ * (len - 2) is returned; literals shorter than two bytes yield 0.
+ */
+static
+size_t findNoodFragOffset(const u8 *lit, size_t len, bool nocase) {
+    if (len < 2) {
+        return 0; // no adjacent pair to inspect
+    }
+
+    const size_t last_pair = len - 2;
+    size_t pos = 0;
+    // The final pair needs no test: it is the answer whether or not its
+    // bytes differ.
+    while (pos < last_pair) {
+        const char cur = lit[pos];
+        const char next = lit[pos + 1];
+        const bool differs = (nocase && ourisalpha(cur))
+                                 ? (mytoupper(cur) != mytoupper(next))
+                                 : (cur != next);
+        if (differs) {
+            break;
+        }
+        pos++;
+    }
+    return pos;
+}
+
+/** \brief Construct a Noodle matcher for the given literal.
+ *
+ * Allocates a zeroed table large enough for the noodTable header plus
+ * \p len literal bytes, fills in the id, length, key fragment offset and
+ * case flag, and copies the literal into the table's \c str member.
+ *
+ * \param lit    Literal bytes.
+ * \param len    Number of bytes in \p lit.
+ * \param nocase True if the literal is case-insensitive.
+ * \param id     ID stored in the table.
+ */
+aligned_unique_ptr<noodTable> noodBuildTable(const u8 *lit, size_t len,
+                                             bool nocase, u32 id) {
+    const size_t total_bytes = sizeof(noodTable) + len;
+    aligned_unique_ptr<noodTable> table =
+        aligned_zmalloc_unique<noodTable>(total_bytes);
+    assert(table);
+
+    const size_t frag_offset = findNoodFragOffset(lit, len, nocase);
+
+    table->id = id;
+    table->nocase = nocase ? 1 : 0;
+    table->len = verify_u32(len);
+    table->key_offset = verify_u32(frag_offset);
+    memcpy(table->str, lit, len);
+
+    return table;
+}
+
+/** \brief Size in bytes of a Noodle table: the noodTable header plus the
+ * \c len literal bytes recorded in it. \p n must not be null. */
+size_t noodSize(const noodTable *n) {
+    assert(n); // shouldn't call with null
+    const size_t header_bytes = sizeof(noodTable);
+    return header_bytes + n->len;
+}
+
+} // namespace ue2
+
+#ifdef DUMP_SUPPORT
+#include <cctype>
+
+namespace ue2 {
+
+/**
+ * \brief Print a description of a Noodle table to \p f.
+ *
+ * Emits the literal length and key offset, then the literal itself. Bytes
+ * that are not graphic characters, and the backslash, are written as
+ * \c \\xNN hex escapes.
+ */
+void noodPrintStats(const noodTable *n, FILE *f) {
+    fprintf(f, "Noodle table\n");
+    fprintf(f, "Len: %u Key Offset: %u\n", n->len, n->key_offset);
+    fprintf(f, "String: ");
+    for (u32 idx = 0; idx != n->len; ++idx) {
+        const auto ch = n->str[idx];
+        const bool needs_escape = !isgraph(ch) || ch == '\\';
+        if (needs_escape) {
+            fprintf(f, "\\x%02hhx", ch);
+            continue;
+        }
+        fprintf(f, "%c", ch);
+    }
+    fprintf(f, "\n");
+}
+
+} // namespace ue2
+
+#endif
--- a/src/hwlm/noodle_build.h
+++ b/src/hwlm/noodle_build.h
@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Noodle literal matcher: build code.
+ */
+
+#ifndef NOODLE_BUILD_H_048A1A6D585A9A
+#define NOODLE_BUILD_H_048A1A6D585A9A
+
+#include "ue2common.h"
+#include "util/alloc.h"
+
+struct noodTable;
+
+namespace ue2 {
+
+/** \brief Construct a Noodle matcher for the given literal. */
+ue2::aligned_unique_ptr<noodTable> noodBuildTable(const u8 *lit, size_t len,
+                                                  bool nocase, u32 id);
+
+/** \brief Size in bytes of the given Noodle table: the noodTable header plus
+ * the length of the literal it holds. \p n must not be null. */
+size_t noodSize(const noodTable *n);
+
+} // namespace ue2
+
+#ifdef DUMP_SUPPORT
+
+#include <cstdio>
+
+namespace ue2 {
+
+/** \brief Print the length, key offset and (escaped) literal of the given
+ * Noodle table to \p f. */
+void noodPrintStats(const noodTable *n, FILE *f);
+
+} // namespace ue2
+
+#endif // DUMP_SUPPORT
+
+#endif /* NOODLE_BUILD_H_048A1A6D585A9A */
+
--- a/src/hwlm/noodle_engine.c
+++ b/src/hwlm/noodle_engine.c
@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Noodle literal matcher: runtime.
+ */
+#include "hwlm.h"
+#include "noodle_engine.h"
+#include "noodle_internal.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/compare.h"
+#include "util/masked_move.h"
+#include "util/simd_utils.h"
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <string.h>
+
+/** \brief Noodle runtime context.
+ *
+ * Bundles everything needed to report a match to the caller. */
+struct cb_info {
+    HWLMCallback cb; //!< callback function called on match
+    u32 id; //!< ID to pass to callback on match
+    void *ctx; //!< caller-supplied context to pass to callback
+    size_t offsetAdj; //!< added to match offsets before they are reported;
+                      //!< used in streaming mode
+};
+
+/* Return HWLM_TERMINATED from the enclosing function if x is
+ * HWLM_TERMINATED. */
+#define RETURN_IF_TERMINATED(x)                                                \
+    {                                                                          \
+        if ((x) == HWLM_TERMINATED) {                                          \
+            return HWLM_TERMINATED;                                            \
+        }                                                                      \
+    }
+
+/* Report every candidate recorded in the single-character match mask.
+ *
+ * Expands in a context that provides: z (32-bit match mask, consumed), d
+ * (pointer to the block the mask refers to), buf, len, key, noCase and cbi.
+ * Each set bit is passed to final() with is_double == 0, so it is reported
+ * without further comparison. Returns from the enclosing function if
+ * matching is terminated.
+ * NOTE(review): findAndClearLSB_32 presumably returns the index of the lowest
+ * set bit and clears it -- confirm in util/bitutils.h. */
+#define SINGLE_ZSCAN()                                                         \
+    do {                                                                       \
+        while (unlikely(z)) {                                                  \
+            u32 pos = findAndClearLSB_32(&z);                                  \
+            size_t matchPos = d - buf + pos;                                   \
+            hwlmcb_rv_t rv = final(buf, len, key, 1, 0, 0, noCase, cbi,        \
+                                   matchPos);                                  \
+            RETURN_IF_TERMINATED(rv);                                          \
+        }                                                                      \
+    } while (0)
+
+/* Confirm and report every candidate recorded in the two-character match
+ * mask.
+ *
+ * As SINGLE_ZSCAN, but additionally requires keyLen and keyOffset in scope.
+ * The candidate position is moved back by one byte before being handed to
+ * final() with is_double == 1, which compares the whole key before
+ * reporting. */
+#define DOUBLE_ZSCAN()                                                         \
+    do {                                                                       \
+        while (unlikely(z)) {                                                  \
+            u32 pos = findAndClearLSB_32(&z);                                  \
+            size_t matchPos = d - buf + pos - 1;                               \
+            hwlmcb_rv_t rv = final(buf, len, key, keyLen, keyOffset, 1,        \
+                                   noCase, cbi, matchPos);                     \
+            RETURN_IF_TERMINATED(rv);                                          \
+        }                                                                      \
+    } while (0)
+
+// Clear bit 0x20 of x when noCase is set; otherwise return x unchanged.
+static really_inline
+u8 caseClear8(u8 x, bool noCase) {
+    if (noCase) {
+        return (u8)(x & (u8)0xdf);
+    }
+    return x;
+}
+
+// Confirm a candidate and report it through the callback.
+//
+// pos is the position of the key fragment; subtracting keyOffset gives the
+// position at which the whole key would start. Candidates from the double
+// scanner are checked against the full key (and dropped if the key would run
+// past the end of the buffer). The single character scanner is used only for
+// single chars with case insensitivity used correctly, so its candidates go
+// straight to the callback.
+//
+// Returns HWLM_TERMINATED if the callback asks to stop matching, otherwise
+// HWLM_SUCCESS (including when the candidate is rejected).
+static really_inline
+hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
+                   size_t keyOffset, bool is_double, bool noCase,
+                   const struct cb_info *cbi, size_t pos) {
+    size_t start = pos - keyOffset;
+
+    // cmpForward returns 1 on mismatch.
+    if (is_double
+        && (start + keyLen > len
+            || cmpForward(buf + start, key, keyLen, noCase))) {
+        return HWLM_SUCCESS;
+    }
+
+    start += cbi->offsetAdj;
+    const size_t last = start + keyLen - 1;
+    DEBUG_PRINTF("match @ %zu->%zu\n", start, last);
+
+    const hwlmcb_rv_t rv = cbi->cb(start, last, cbi->id, cbi->ctx);
+    if (rv == HWLM_TERMINATE_MATCHING) {
+        return HWLM_TERMINATED;
+    }
+    return HWLM_SUCCESS;
+}
+
+// Select the vector implementation at compile time. CHUNKSIZE is the block
+// size in bytes used by the scan routines below and MASK_TYPE the matching
+// vector type; the included file supplies the helpers used below (the
+// scanSingle*/scanDouble* Short, Unaligned and Fast routines, presumably
+// also getMask and getCaseMask -- confirm in the included file).
+#if defined(__AVX2__)
+#define CHUNKSIZE 32
+#define MASK_TYPE m256
+#include "noodle_engine_avx2.c"
+#else
+#define CHUNKSIZE 16
+#define MASK_TYPE m128
+#include "noodle_engine_sse.c"
+#endif
+
+// Scan buf[0, len) for the single byte key[0].
+//
+// Buffers shorter than CHUNKSIZE go to scanSingleShort, and buffers of exactly
+// CHUNKSIZE bytes to a single scanSingleUnaligned call. Longer buffers are
+// handled in up to three stages: an unaligned scan up to s2Start, a fast scan
+// over [s2Start, s2End), and an unaligned scan of the tail up to the end of
+// the buffer. Returns as soon as a stage reports HWLM_TERMINATED.
+static really_inline
+hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key,
+                            bool noCase, const struct cb_info *cbi) {
+    hwlm_error_t rv;
+    size_t end = len;
+
+    const MASK_TYPE mask1 = getMask(key[0], noCase);
+    const MASK_TYPE caseMask = getCaseMask();
+
+    if (len < CHUNKSIZE) {
+        rv = scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, len);
+        return rv;
+    }
+
+    if (len == CHUNKSIZE) {
+        rv = scanSingleUnaligned(buf, len, 0, key, noCase, caseMask, mask1, cbi,
+                                 0, len);
+        return rv;
+    }
+
+    // All stage boundaries below are offsets relative to buf.
+    // NOTE(review): ROUNDUP_N / ROUNDDOWN_N presumably round to a multiple of
+    // CHUNKSIZE, making s2Start/s2End the CHUNKSIZE-aligned addresses that
+    // bracket the fast stage -- confirm against their definitions.
+    uintptr_t data = (uintptr_t)buf;
+    uintptr_t s2Start = ROUNDUP_N(data, CHUNKSIZE) - data;
+    uintptr_t last = data + end;
+    uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
+    // Offset of the final CHUNKSIZE bytes of the buffer.
+    uintptr_t s3Start = len - CHUNKSIZE;
+
+    if (s2Start) {
+        // first scan out to the fast scan starting point
+        DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
+        rv = scanSingleUnaligned(buf, len, 0, key, noCase, caseMask, mask1, cbi,
+                                 0, s2Start);
+        RETURN_IF_TERMINATED(rv);
+    }
+
+    if (likely(s2Start != s2End)) {
+        // scan as far as we can, bounded by the last point this key can
+        // possibly match
+        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End);
+        rv = scanSingleFast(buf, len, key, noCase, caseMask, mask1, cbi,
+                            s2Start, s2End);
+        RETURN_IF_TERMINATED(rv);
+    }
+
+    // if we are done bail out
+    if (s2End == end) {
+        return HWLM_SUCCESS;
+    }
+
+    // Tail: the third argument (s3Start) differs from the reported range
+    // [s2End, end); presumably the block is read from s3Start while only
+    // matches in [s2End, end) are reported -- confirm in scanSingleUnaligned.
+    DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, end);
+    rv = scanSingleUnaligned(buf, len, s3Start, key, noCase, caseMask, mask1,
+                             cbi, s2End, end);
+
+    return rv;
+}
+
+// Scan buf for the two-byte fragment key[keyOffset], key[keyOffset + 1];
+// candidates are confirmed against the whole key via DOUBLE_ZSCAN / final().
+//
+// The fragment scan covers [keyOffset, end). Short ranges go to
+// scanDoubleShort, ranges of exactly CHUNKSIZE bytes to a single
+// scanDoubleUnaligned call. Longer ranges are handled in up to three stages:
+// an unaligned scan up to s1End, a fast scan over [s2Start, s2End), and an
+// unaligned scan of the tail. Returns as soon as a stage reports
+// HWLM_TERMINATED.
+static really_inline
+hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key,
+                            size_t keyLen, size_t keyOffset, bool noCase,
+                            const struct cb_info *cbi) {
+    hwlm_error_t rv;
+    // we stop scanning for the key-fragment when the rest of the key can't
+    // possibly fit in the remaining buffer
+    size_t end = len - keyLen + keyOffset + 2;
+
+    const MASK_TYPE caseMask = getCaseMask();
+    const MASK_TYPE mask1 = getMask(key[keyOffset + 0], noCase);
+    const MASK_TYPE mask2 = getMask(key[keyOffset + 1], noCase);
+
+    if (end - keyOffset < CHUNKSIZE) {
+        rv = scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask,
+                             mask1, mask2, cbi, keyOffset, end);
+        return rv;
+    }
+    if (end - keyOffset == CHUNKSIZE) {
+        rv = scanDoubleUnaligned(buf, len, keyOffset, key, keyLen, keyOffset,
+                                 noCase, caseMask, mask1, mask2, cbi, keyOffset,
+                                 end);
+        return rv;
+    }
+
+    // All stage boundaries below are offsets relative to buf.
+    // NOTE(review): ROUNDUP_N / ROUNDDOWN_N presumably round to a multiple of
+    // CHUNKSIZE -- confirm against their definitions.
+    uintptr_t data = (uintptr_t)buf;
+    uintptr_t s2Start = ROUNDUP_N(data + keyOffset, CHUNKSIZE) - data;
+    uintptr_t s1End = s2Start + 1;
+    uintptr_t last = data + end;
+    uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
+    // Offset of the final CHUNKSIZE bytes of the fragment scan range.
+    uintptr_t s3Start = end - CHUNKSIZE;
+    // Start of the range still to be covered by an unaligned scan.
+    uintptr_t off = keyOffset;
+
+    if (s2Start != keyOffset) {
+        // first scan out to the fast scan starting point plus one char past to
+        // catch the key on the overlap
+        DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
+        rv = scanDoubleUnaligned(buf, len, keyOffset, key, keyLen, keyOffset,
+                                 noCase, caseMask, mask1, mask2, cbi, off,
+                                 s1End);
+        RETURN_IF_TERMINATED(rv);
+    }
+    off = s1End;
+
+    if (s2Start >= end) {
+        DEBUG_PRINTF("s2 == mL %zu\n", end);
+        return HWLM_SUCCESS;
+    }
+
+    if (likely(s2Start != s2End)) {
+        // scan as far as we can, bounded by the last point this key can
+        // possibly match
+        // NOTE(review): this message prints s3Start, but the scan below is
+        // bounded by s2End -- confirm which value was meant to be logged.
+        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start);
+        rv = scanDoubleFast(buf, len, key, keyLen, keyOffset, noCase, caseMask,
+                            mask1, mask2, cbi, s2Start, s2End);
+        RETURN_IF_TERMINATED(rv);
+        off = s2End;
+    }
+
+    // if there isn't enough data left to match the key, bail out
+    if (s2End == end) {
+        return HWLM_SUCCESS;
+    }
+
+    // Tail: scanDoubleUnaligned is given s3Start as its third argument and
+    // [off, end) as the range still to be covered.
+    DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end);
+    rv = scanDoubleUnaligned(buf, len, s3Start, key, keyLen, keyOffset, noCase,
+                             caseMask, mask1, mask2, cbi, off, end);
+
+    return rv;
+}
+
+
+// Case-insensitive single-character scan: scanSingleMain with noCase fixed
+// to a compile-time constant.
+static really_inline
+hwlm_error_t scanSingleNoCase(const u8 *buf, size_t len, const u8 *key,
+                              const struct cb_info *cbi) {
+    return scanSingleMain(buf, len, key, true, cbi);
+}
+
+// Case-sensitive single-character scan: scanSingleMain with noCase fixed
+// to a compile-time constant.
+static really_inline
+hwlm_error_t scanSingleCase(const u8 *buf, size_t len, const u8 *key,
+                            const struct cb_info *cbi) {
+    return scanSingleMain(buf, len, key, false, cbi);
+}
+
+// Single-character specialisation, used when keyLen = 1.
+//
+// Case-insensitivity only applies when the key is an alphabetic char; for
+// any other byte the case-sensitive scanner is used regardless of noCase.
+static really_inline
+hwlm_error_t scanSingle(const u8 *buf, size_t len, const u8 *key, bool noCase,
+                        const struct cb_info *cbi) {
+    const bool caseless = ourisalpha(key[0]) && noCase;
+
+    // kinda ugly, but this forces constant propagation
+    if (!caseless) {
+        return scanSingleCase(buf, len, key, cbi);
+    }
+    return scanSingleNoCase(buf, len, key, cbi);
+}
+
+
+// Case-insensitive two-character scan: scanDoubleMain with noCase fixed to a
+// compile-time constant.
+static really_inline
+hwlm_error_t scanDoubleNoCase(const u8 *buf, size_t len, const u8 *key,
+                              size_t keyLen, size_t keyOffset,
+                              const struct cb_info *cbi) {
+    return scanDoubleMain(buf, len, key, keyLen, keyOffset, true, cbi);
+}
+
+// Case-sensitive two-character scan: scanDoubleMain with noCase fixed to a
+// compile-time constant.
+static really_inline
+hwlm_error_t scanDoubleCase(const u8 *buf, size_t len, const u8 *key,
+                            size_t keyLen, size_t keyOffset,
+                            const struct cb_info *cbi) {
+    return scanDoubleMain(buf, len, key, keyLen, keyOffset, false, cbi);
+}
+
+
+// Two-character fragment scan, used for every key longer than one byte.
+// Dispatches on noCase to a wrapper that passes it on as a constant.
+static really_inline
+hwlm_error_t scanDouble(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
+                        size_t keyOffset, bool noCase,
+                        const struct cb_info *cbi) {
+    // kinda ugly, but this forces constant propagation
+    if (!noCase) {
+        return scanDoubleCase(buf, len, key, keyLen, keyOffset, cbi);
+    }
+    return scanDoubleNoCase(buf, len, key, keyLen, keyOffset, cbi);
+}
+
+/**
+ * \brief Main entry point for the scan code.
+ *
+ * Returns HWLM_SUCCESS without scanning when the buffer is shorter than the
+ * key. Otherwise a one-byte key goes to scanSingle() (which requires
+ * keyOffset == 0) and every other key length goes to scanDouble().
+ */
+static really_inline
+hwlm_error_t scan(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
+                  size_t keyOffset, bool noCase, const struct cb_info *cbi) {
+    // a string of length keyLen cannot occur in a shorter buffer
+    if (keyLen > len) {
+        return HWLM_SUCCESS;
+    }
+
+    if (keyLen != 1) {
+        return scanDouble(buf, len, key, keyLen, keyOffset, noCase, cbi);
+    }
+
+    assert(keyOffset == 0);
+    return scanSingle(buf, len, key, noCase, cbi);
+}
+
+/**
+ * \brief Block-mode scanner.
+ *
+ * Scans \p buf (\p len bytes) for the literal described by \p n, using
+ * n->str, n->len, n->key_offset and n->nocase as the key parameters.
+ *
+ * \param n          Literal table; must be non-NULL.
+ * \param buf        Data to scan; must be non-NULL.
+ * \param len        Number of bytes in \p buf.
+ * \param offset_adj Stored as the last field of the cb_info handed to the
+ *                   scan (presumably an adjustment applied to reported match
+ *                   offsets -- confirm against the cb_info consumers).
+ * \param cb         Callback stored in the cb_info.
+ * \param ctxt       Opaque context stored in the cb_info alongside \p cb.
+ * \return Whatever scan() returns.
+ */
+hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len,
+                      size_t offset_adj, HWLMCallback cb, void *ctxt) {
+    assert(n && buf);
+
+    struct cb_info cbi = { cb, n->id, ctxt, offset_adj };
+    // Use a precision ("%.*s"), not a field width ("%*s"): the literal is
+    // delimited by n->len and nothing visible here guarantees that n->str is
+    // NUL-terminated. The '*' argument must be an int and %s expects a char
+    // pointer, hence the casts.
+    DEBUG_PRINTF("nood scan of %zu bytes for %.*s\n", len, (int)n->len,
+                 (const char *)n->str);
+    return scan(buf, len, n->str, n->len, n->key_offset, n->nocase, &cbi);
+}
+
+/**
+ * \brief Streaming-mode scanner.
+ *
+ * Scans the current block \p buf for the literal described by \p n. When
+ * history is supplied (\p hlen != 0) a stitched window is scanned first: the
+ * last min(n->len - 1, hlen) bytes of \p hbuf followed by the first
+ * min(n->len - 1, len) bytes of \p buf, staged in \p temp_buf. Each side
+ * contributes fewer than n->len bytes, so a full-length occurrence inside
+ * that window necessarily uses bytes from both buffers.
+ *
+ * \param n                Literal table; must be non-NULL.
+ * \param hbuf             History bytes; must be non-NULL when \p hlen != 0.
+ * \param hlen             Number of history bytes available.
+ * \param buf              Current block; must be non-NULL.
+ * \param len              Number of bytes in \p buf.
+ * \param cb               Callback stored in the cb_info.
+ * \param ctxt             Opaque context stored alongside \p cb.
+ * \param temp_buf         Scratch space for the stitched window.
+ * \param temp_buffer_size Capacity of \p temp_buf; only checked by an assert.
+ * \return HWLM_TERMINATED if the stitched-window scan returns it; otherwise
+ *         the result of scanning \p buf.
+ */
+hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf,
+                               size_t hlen, const u8 *buf, size_t len,
+                               HWLMCallback cb, void *ctxt, u8 *temp_buf,
+                               UNUSED size_t temp_buffer_size) {
+    assert(n);
+    // Validate buf before its first use: the history branch below already
+    // reads from it via memcpy().
+    assert(buf);
+
+    struct cb_info cbi = {cb, n->id, ctxt, 0};
+    hwlm_error_t rv;
+
+    if (hlen) {
+        assert(hbuf);
+
+        // Take at most n->len - 1 bytes from each side of the boundary.
+        size_t tl1 = MIN(n->len - 1, hlen);
+        size_t tl2 = MIN(n->len - 1, len);
+        size_t temp_len = tl1 + tl2;
+        assert(temp_len < temp_buffer_size);
+        memcpy(temp_buf, hbuf + hlen - tl1, tl1);
+        memcpy(temp_buf + tl1, buf, tl2);
+
+        // Negated history length: presumably this rebases offsets reported
+        // from the stitched window so that window position tl1 corresponds to
+        // offset 0 of buf -- confirm against the code that consumes
+        // cbi.offsetAdj.
+        cbi.offsetAdj = -tl1;
+        rv = scan(temp_buf, temp_len, n->str, n->len, n->key_offset, n->nocase,
+                  &cbi);
+        if (rv == HWLM_TERMINATED) {
+            return HWLM_TERMINATED;
+        }
+    }
+
+    // Main scan of the current block, with no offset adjustment.
+    cbi.offsetAdj = 0;
+    return scan(buf, len, n->str, n->len, n->key_offset, n->nocase, &cbi);
+}
--- a/Show More
+++ b/Show More