Initial commit of Hyperscan

2025-11-18 18:20:35 +03:00 · 2015-10-20 09:13:35 +11:00
commit 904e436f11
610 changed files with 213627 additions and 0 deletions
--- a/src/fdr/CMakeLists.txt
+++ b/src/fdr/CMakeLists.txt
@@ -0,0 +1,39 @@
+# The set of rules and other nastiness for generating FDR/Teddy source
+
+# The Python sources that make up the generator. They are listed explicitly
+# so that they can be passed as DEPENDS to the custom commands below, which
+# re-run the generator when any of these files changes.
+set(AUTOGEN_PY_FILES
+    arch.py
+    autogen.py
+    autogen_utils.py
+    base_autogen.py
+    fdr_autogen.py
+    teddy_autogen.py
+)
+
+# fdr_autogen(<type> <out>)
+#   Runs "autogen.py <type>" and captures its standard output in
+#   ${CMAKE_CURRENT_BINARY_DIR}/<out>. Also defines the target
+#   autogen_<type>, which depends on that generated file.
+function(fdr_autogen type out)
+    add_custom_command (
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${out}
+        COMMAND ${PYTHON} ${CMAKE_CURRENT_SOURCE_DIR}/autogen.py ${type} > ${CMAKE_CURRENT_BINARY_DIR}/${out}
+        DEPENDS ${AUTOGEN_PY_FILES}
+        COMMENT "AUTOGEN ${out}"
+        )
+    add_custom_target(autogen_${type} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${out})
+endfunction()
+
+# Declare one generation rule per output file.
+fdr_autogen(runtime fdr_autogen.c)
+fdr_autogen(compiler fdr_autogen_compiler.cpp)
+fdr_autogen(teddy_runtime teddy_autogen.c)
+fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
+
+# Define the list in THIS scope first: set(... PARENT_SCOPE) only sets the
+# variable in the parent scope and leaves it undefined here, which would make
+# the set_source_files_properties() call below operate on an empty list.
+# The paths use CMAKE_CURRENT_BINARY_DIR so that they match the OUTPUT paths
+# declared by fdr_autogen().
+set(fdr_GENERATED_SRC
+    ${CMAKE_CURRENT_BINARY_DIR}/fdr_autogen.c
+    ${CMAKE_CURRENT_BINARY_DIR}/fdr_autogen_compiler.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/teddy_autogen.c
+    ${CMAKE_CURRENT_BINARY_DIR}/teddy_autogen_compiler.cpp
+)
+
+set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
+
+# Export the list to the including directory.
+set(fdr_GENERATED_SRC ${fdr_GENERATED_SRC} PARENT_SCOPE)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+
--- a/src/fdr/arch.py
+++ b/src/fdr/arch.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import autogen_utils
+
+# wrapper for architectures
+
+class Arch:
+    def __init__(self, name, extensions = []):
+        self.name = name
+        self.extensions = extensions
+        self.target = None
+
+    def get_guard(self):
+        # these defines definitely fall into the "belt-and-suspenders"
+        # category of paranoia
+        if (self.guard_list == []):
+            return "#if 1"
+
+        return "#if " + " && ".join(self.guard_list)
+
+class X86Arch(Arch):
+    def __init__(self, name, extensions = []):
+        Arch.__init__(self, name, extensions)
+        self.guard_list = [ ]
+        self.target = "0"
+
+        if "AVX2" in extensions:
+            self.target += " | HS_CPU_FEATURES_AVX2"
+            self.guard_list += [ "defined(__AVX2__)" ]
+
+
+# Module-level architecture instances: one without extensions and one with
+# the "AVX2" extension enabled.
+arch_x86_64            = X86Arch("x86_64", extensions = [ ])
+arch_x86_64_avx2       = X86Arch("x86_64_avx2", extensions = [ "AVX2" ])
--- a/src/fdr/autogen.py
+++ b/src/fdr/autogen.py
@@ -0,0 +1,159 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from fdr_autogen import *
+from teddy_autogen import *
+from arch import *
+
+# FDR setup
+
+# these are either produced - if the guard succeeds, or #defined to zeroes.
+# either the function or the zero is fine in our array of function pointers
+
+def produce_fdr_runtimes(l):
+    for m in l:
+        m.produce_code()
+
+def produce_fdr_compiles(l):
+    print "void getFdrDescriptions(vector<FDREngineDescription> *out) {"
+    print "    static const FDREngineDef defns[] = {"
+    for m in l:
+        m.produce_compile_call()
+    print "    };"
+    print "    out->clear();"
+    print "    for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
+    print "        out->push_back(FDREngineDescription(defns[i]));"
+    print "    }"
+    print "}"
+
+def build_fdr_matchers():
+    all_matchers = [ ]
+    domains = [8, 10, 11, 12, 13]
+    big_domains = [ 14, 15 ]
+
+    common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
+    for d in domains:
+        all_matchers += [ M3(stride = 1, domain = d, **common) ]
+        all_matchers += [ M3(stride = 2, domain = d, **common) ]
+        all_matchers += [ M3(stride = 4, domain = d, **common) ]
+    for d in big_domains:
+        all_matchers += [ M3(stride = 1, domain = d, **common) ]
+
+    return all_matchers
+
+# teddy setup
+
+def build_teddy_matchers():
+    all_matchers = [ ]
+
+    # AVX2
+    all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ]
+    all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ]
+    for n_msk in range(1, 5):
+        all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ]
+        all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ]
+
+    # SSE/SSE2/SSSE3
+    for n_msk in range(1, 5):
+        all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ]
+        all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ]
+
+    return all_matchers
+
+def produce_teddy_compiles(l):
+    print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {"
+    print "    static const TeddyEngineDef defns[] = {"
+    for m in l:
+        m.produce_compile_call()
+    print "    };"
+    print "    out->clear();"
+    print "    for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
+    print "        out->push_back(TeddyEngineDescription(defns[i]));"
+    print "    }"
+    print "}"
+
+# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they
+# are linked. So we either generate the function or we don't - then at the point of the
+# header in fdr_autogen.c we either generate the header or we #define the zero.
+
+def produce_teddy_runtimes(l):
+    # Since we're using -Wmissing-prototypes, we need headers first.
+    for m in l:
+	m.produce_guard()
+        print m.produce_header(visible = True, header_only = True)
+	m.close_guard()
+
+    for m in l:
+	m.produce_guard()
+        m.produce_code()
+	m.close_guard()
+
+# see produce_teddy_runtimes() comment for the rationale
+
+def produce_teddy_headers(l):
+    for m in l:
+	m.produce_guard()
+        print m.produce_header(visible = True, header_only = True)
+	m.produce_zero_alternative()
+
+# general utilities
+
+def make_fdr_function_pointers(matcher_list):
+    print  """
+typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
+static FDRFUNCTYPE funcs[] = {
+"""
+    all_funcs = ",\n".join([ "    %s" % m.get_name() for m in matcher_list ])
+    print all_funcs
+    print """
+};
+"""
+
+def assign_ids(matcher_list, next_id):
+    for m in matcher_list:
+        m.id = next_id
+        next_id += 1
+    return next_id
+
+# Main entry point
+
+m = build_fdr_matchers()
+next_id = assign_ids(m, 0)
+tm = build_teddy_matchers()
+next_id = assign_ids(tm, next_id)
+if sys.argv[1] == "compiler":
+    produce_fdr_compiles(m)
+elif sys.argv[1] == "runtime":
+    produce_fdr_runtimes(m)
+    produce_teddy_headers(tm)
+    make_fdr_function_pointers(m+tm)
+elif sys.argv[1] == "teddy_runtime":
+    produce_teddy_runtimes(tm)
+elif sys.argv[1] == "teddy_compiler":
+    produce_teddy_compiles(tm)
--- a/src/fdr/autogen_utils.py
+++ b/src/fdr/autogen_utils.py
@@ -0,0 +1,285 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+def fail_out(msg = ""):
+    print >>sys.stderr, "Internal failure in autogen.py: " + msg
+    sys.exit(1)
+
+class IntegerType:
+    def __init__(self, size):
+        self.size = size
+
+    def get_name(self):
+        return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size]
+
+    def size_in_bytes(self):
+        return self.size / 8
+
+    def isSIMDOnIntel(self):
+        return False
+
+    def zero_expression(self):
+        return "0"
+
+    def constant_to_string(self, n):
+        if self.size == 64:
+            suffix = "ULL"
+        else:
+            suffix = ""
+        return "0x%x%s" % (n & ((1 << self.size) - 1), suffix)
+
+    def lowbits(self, n):
+        return (1 << n) - 1
+
+    def highbits(self, n):
+        return ~(self.lowbits(self.size - n))
+
+    def lowbit_mask(self, n):
+        return self.constant_to_string(self.lowbits(n))
+
+    def highbit_mask(self, n):
+        return self.constant_to_string(self.highbits(n))
+
+    def lowbit_extract_expr(self, expr_string, n):
+         return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
+
+    def highbit_extract_expr(self, expr_string, n):
+        return "(%s >> %d)" % (expr_string, self.size - n)
+
+    def flip_lowbits_expr(self, expr_string, n):
+         return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
+
+    def bit_extract_expr(self, expr_string, low, high):
+        lbm = self.lowbit_mask(high - low)
+        return "((%s >> %d) & %s)" % (expr_string, low, lbm)
+
+    # shifts are +ve if left and -ve if right
+    def shift_expr(self, expr_string, n):
+        if n <= -self.size or n >= self.size:
+            return self.zero_expression()
+        elif (n > 0):
+            return "(%s << %d)" % (expr_string, n)
+        elif (n < 0):
+            return "(%s >> %d)" % (expr_string, -n)
+        else:
+            return "(%s)" % (expr_string)
+
+    # code is:
+    # "normal" (always between buf and len) - the default
+    # "aligned" (means normal + aligned to a natural boundary)
+    # "cautious_forward" (means may go off the end of buf+len)
+    # "cautious_backwards" (means may go off the start of buf)
+    # "cautious_everywhere" (means may go off both)
+
+    def load_expr_data(self, offset = 0, code = "normal",
+                       base_string = "ptr", bounds_lo = "buf", bounds_hi = "buf + len"):
+        if code is "normal":
+            return "lv_%s(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "aligned":
+            if self.size is 8:
+                fail_out("no aligned byte loads")
+            return "lv_%s_a(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "cautious_forward":
+            return "lv_%s_cf(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "cautious_backward":
+            return "lv_%s_cb(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+        elif code is "cautious_everywhere":
+            return "lv_%s_ce(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
+
+
+class SIMDIntegerType(IntegerType):
+    def __init__(self, size):
+        IntegerType.__init__(self, size)
+
+    def isSIMDOnIntel(self):
+        return True
+
+    def zero_expression(self):
+        return "zeroes128()"
+
+    def lowbit_extract_expr(self, expr_string, n):
+        if (n <= 32):
+            tmpType = IntegerType(32)
+            tmpExpr = "movd(%s)" % expr_string
+        elif (32 < n <= 64):
+            tmpType = IntegerType(64)
+            tmpExpr = "movq(%s)" % expr_string
+        return tmpType.lowbit_extract_expr(tmpExpr, n)
+
+    def highbit_extract_expr(self, expr_string, n):
+        fail_out("Unimplemented high bit extract on m128")
+
+    def bit_extract_expr(self, expr_string, low, high, flip):
+        fail_out("Unimplemented bit extract on m128")
+
+    def shift_expr(self, expr_string, n):
+        if n % 8 != 0:
+            fail_out("Trying to shift a m128 by a bit granular value")
+
+        # should check that n is divisible by 8
+        if n <= -self.size or n >= self.size:
+            return self.zero_expression()
+        elif (n > 0):
+            return "_mm_slli_si128(%s, %s)" % (expr_string, n / 8)
+        elif (n < 0):
+            return "_mm_srli_si128(%s, %s)" % (expr_string, -n / 8)
+        else:
+            return "(%s)" % (expr_string)
+
+    def lowbit_mask(self, n):
+        if n % 8 != 0:
+            fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
+        return self.shift_expr("ones128()", -(128 - n))
+
+def getRequiredType(bits):
+    if bits == 128:
+        return SIMDIntegerType(bits)
+    for b in [ 8, 16, 32, 64]:
+        if (bits <= b):
+            return IntegerType(b)
+    return None
+
+class IntegerVariable:
+    def __init__(self, name, type):
+        self.name = name
+        self.type = type
+
+    def gen_initializer_stmt(self, initialization_string = None):
+        if initialization_string:
+            return "%s %s = %s;" % (self.type.get_name(), self.name, initialization_string)
+        else:
+            return "%s %s;" % (self.type.get_name(), self.name)
+
+
+class Step:
+    def __init__(self, context, offset = 0):
+        self.context = context
+        self.matcher = context.matcher
+        self.offset = offset
+        self.latency = 1
+        self.dependency_list = []
+        self.latest = None
+        self.context.add_step(self)
+
+    # return a string, complete with indentation
+    def emit(self):
+        indent = " " * (self.offset*2 + self.matcher.default_body_indent)
+        s = "\n".join( [ indent + line for line in self.val.split("\n")] )
+        if self.latest:
+            s += " // " + str(self.debug_step) + " L" + str(self.latency) + " LTST:%d" % self.latest
+            if self.dependency_list:
+                s += " Derps: "
+                for (d,l) in self.dependency_list:
+                    s += "%d/%d " % (d.debug_step,l)
+        return s
+
+    def add_dependency(self, step, anti_dependency = False, output_dependency = False):
+        if anti_dependency or output_dependency:
+            self.dependency_list += [ (step, 1) ]
+        else:
+            self.dependency_list += [ (step, step.latency) ]
+
+    def nv(self, type, var_name):
+        return self.context.new_var(self, type, var_name)
+
+    def gv(self, var_name, reader = True, writer = False):
+        return self.context.get_var(self, var_name, reader = reader, writer = writer)
+
+# utility steps, generic
+
+class LabelStep(Step):
+    def __init__(self, context, offset = 0, label_prefix = "off"):
+        Step.__init__(self, context, offset)
+        self.val = "%s%d: UNUSED;" % (label_prefix, offset)
+
+class OpenScopeStep(Step):
+    def __init__(self, context, offset = 0):
+        Step.__init__(self, context, offset)
+        self.val = "{"
+
+class CloseScopeStep(Step):
+    def __init__(self, context, offset = 0):
+        Step.__init__(self, context, offset)
+        self.val = "}"
+
+
+class CodeGenContext:
+    def __init__(self, matcher):
+        self.vars = {}
+        self.steps = []
+        self.ctr = 0
+        self.matcher = matcher
+        self.var_writer = {} # var to a single writer
+        self.var_readers = {} # var to a list of all the readers that read the last value
+
+    def new_var(self, step, type, var_name):
+        var = IntegerVariable(var_name, type)
+        self.vars[var_name] = var
+        self.var_writer[var_name] = step
+        return var
+
+    def get_var(self, step, var_name, reader = True, writer = False):
+        if reader:
+            writer_step = self.var_writer[var_name]
+            if writer_step:
+                step.add_dependency(writer_step)
+            self.var_readers.setdefault(var_name, []).append(step)
+        if writer and not reader:
+            if self.var_writer[var_name]:
+                step.add_dependency(self.var_writer[var_name], output_dependency = True)
+        if writer:
+            if self.var_readers.has_key(var_name):
+                for reader in [ r for r in self.var_readers[var_name] if r is not step ]:
+                    step.add_dependency(reader, anti_dependency = True)
+                self.var_readers[var_name] = []
+            self.var_writer[var_name] = step
+        return self.vars[var_name]
+
+    def add_step(self, step):
+        self.steps += [ step ]
+        step.debug_step = self.ctr
+        self.ctr += 1
+
+    def dontschedule(self, finals):
+        return "\n".join( [ s.emit() for s in self.steps ] )
+
+    def schedule(self, finals):
+        for f in finals:
+            f.latest = f.latency
+        worklist = finals
+        while worklist:
+            current = worklist[0]
+            worklist = worklist[1:]
+            for (dep, lat) in current.dependency_list:
+                if dep.latest is None or dep.latest < (current.latest + dep.latency):
+                    dep.latest = current.latest + lat
+                    if dep not in worklist:
+                        worklist += [ dep ]
+        self.steps.sort(reverse = True, key = lambda s : s.latest)
+        return "\n".join( [ s.emit() for s in self.steps ] )
--- a/src/fdr/base_autogen.py
+++ b/src/fdr/base_autogen.py
@@ -0,0 +1,167 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from base_autogen import *
+from string import Template
+
+class MatcherBase:
+
+    def __init__(self):
+        pass
+
+    def get_name(self):
+        return "fdr_exec_%03d" % self.id
+
+    def produce_header(self, visible, header_only = False):
+        s = ""
+        if not visible:
+            s += "static never_inline"
+        s += """
+hwlm_error_t %s(UNUSED const struct FDR *fdr,
+                UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
+        if header_only:
+            s += ";"
+        else:
+            s += "{"
+        s += "\n"
+        return s
+
+    def produce_guard(self):
+	print self.arch.get_guard()
+    
+    def produce_zero_alternative(self):
+	print """
+#else
+#define %s 0
+#endif
+""" % self.get_name()
+
+    # trivial function for documentation/modularity
+    def close_guard(self):
+	print "#endif"
+
+    def produce_common_declarations(self):
+        return """
+    const u8 * buf = a->buf;
+    const size_t len = a->len;
+    const u8 * ptr = buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t * control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 * tryFloodDetect = a->firstFloodDetect;
+    UNUSED u32 bit, bitRem, confSplit, idx;
+    u32 byte, cf;
+    const struct FDRConfirm *fdrc;
+    u32 last_match = (u32)-1;
+"""
+
+    def produce_continue_check(self):
+        return """if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
+    *a->groups = controlVal;
+    return HWLM_TERMINATED;
+}
+"""
+    def produce_flood_check(self):
+        return """
+        if (P0(ptr > tryFloodDetect)) {
+            tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
+            if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
+                *a->groups = controlVal;
+                return HWLM_TERMINATED;
+            }
+        }
+"""
+
+    def produce_footer(self):
+        return """
+    *a->groups = controlVal;
+    return HWLM_SUCCESS;
+}
+"""
+
+    def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
+        if cautious:
+            caution_string = "VECTORING"
+        else:
+            caution_string = "NOT_CAUTIOUS"
+        conf_split_mask = IntegerType(32).constant_to_string(
+                            self.conf_top_level_split - 1)
+        if enable_confirmless:
+            quick_check_string = """
+        if (!fdrc->mult) {
+            u32 id = fdrc->nBitsOrSoleID;
+            if ((last_match == id) && (fdrc->flags & NoRepeat))
+                continue;
+           last_match = id;
+           controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
+           continue;
+        } """
+        else:
+            quick_check_string = ""
+        if do_bailout:
+            bailout_string = """
+        if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
+        else:
+            bailout_string = ""
+
+        return Template("""
+if (P0(!!$CONFVAR)) {
+    do  {
+        bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
+        byte  = bit / $NUM_BUCKETS + $OFFSET;
+        bitRem  = bit % $NUM_BUCKETS;
+        $BAILOUT_STRING
+        confSplit = *(ptr+byte) & $SPLIT_MASK;
+        idx = confSplit * $NUM_BUCKETS + bitRem;
+        cf = confBase[idx];
+        if (!cf)
+            continue;
+        fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control))
+            continue;
+        $QUICK_CHECK_STRING
+        confWithBit(fdrc, a, ptr - buf + byte, $CAUTION_STRING, $CONF_PULL_BACK, control, &last_match);
+    } while(P0(!!$CONFVAR));
+    if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
+        *a->groups = controlVal;
+        return HWLM_TERMINATED;
+    }
+}""").substitute(CONFVAR = conf_var_name,
+                 CONFVAR_SIZE = conf_var_size,
+                 NUM_BUCKETS = self.num_buckets,
+                 OFFSET = offset,
+                 SPLIT_MASK = conf_split_mask,
+                 QUICK_CHECK_STRING = quick_check_string,
+                 BAILOUT_STRING = bailout_string,
+                 CAUTION_STRING = caution_string,
+                 CONF_PULL_BACK = self.conf_pull_back)
+
+
+def indent(block, depth):
+    return "\n".join([ (" " * (4*depth)) + line for line in block.splitlines() ] )
--- a/src/fdr/engine_description.cpp
+++ b/src/fdr/engine_description.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "engine_description.h"
+#include "hs_compile.h" // for hs_platform_info
+#include "util/target_info.h"
+
+namespace ue2 {
+
+// Out-of-line definition of the virtual destructor declared in the header.
+EngineDescription::~EngineDescription() {}
+
+// Asks target_in whether it can run code built for this engine's
+// code_target.
+bool EngineDescription::isValidOnTarget(const target_t &target_in) const {
+    const bool runnable = target_in.can_run_on_code_built_for(code_target);
+    return runnable;
+}
+
+// Builds a target_t from a platform description with generic tuning and the
+// given CPU feature bits.
+target_t targetByArchFeatures(u64a cpu_features) {
+    hs_platform_info platform;
+    platform.tune = HS_TUNE_FAMILY_GENERIC;
+    platform.cpu_features = cpu_features;
+
+    return target_t(platform);
+}
+
+} // namespace ue2
--- a/src/fdr/engine_description.h
+++ b/src/fdr/engine_description.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ENGINE_DESCRIPTION_H
+#define ENGINE_DESCRIPTION_H
+
+#include "ue2common.h"
+#include "util/target_info.h"
+
+namespace ue2 {
+
+/** Base description of a generated engine: its id, the target its code was
+ * built for, and its bucket and confirm parameters. */
+class EngineDescription {
+public:
+    EngineDescription(u32 id_in, const target_t &code_target_in,
+                      u32 numBuckets_in, u32 confirmPullBackDistance_in,
+                      u32 confirmTopLevelSplit_in)
+        : id(id_in),
+          code_target(code_target_in),
+          numBuckets(numBuckets_in),
+          confirmPullBackDistance(confirmPullBackDistance_in),
+          confirmTopLevelSplit(confirmTopLevelSplit_in) {}
+
+    virtual ~EngineDescription();
+
+    // Plain accessors for the values given at construction.
+    u32 getID() const { return id; }
+    u32 getNumBuckets() const { return numBuckets; }
+    u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
+    u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
+
+    bool isValidOnTarget(const target_t &target_in) const;
+
+    // Must be supplied by each concrete engine description.
+    virtual u32 getDefaultFloodSuffixLength() const = 0;
+
+    // Defaults to true; may be overridden.
+    virtual bool typicallyHoldsOneCharLits() const { return true; }
+
+private:
+    // Declaration order is kept as in the original so that member
+    // initialisation order is unchanged.
+    u32 id;
+    target_t code_target; // the target that we built this code for
+    u32 numBuckets;
+    u32 confirmPullBackDistance;
+    u32 confirmTopLevelSplit;
+};
+
+/** Returns a target given a CPU feature set value. */
+target_t targetByArchFeatures(u64a cpu_features);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "util/simd_utils.h"
+
+#define P0(cnd) unlikely(cnd)
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "teddy_internal.h"
+
+#include "flood_runtime.h"
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+#include "fdr_streaming_runtime.h"
+#include "fdr_loadval.h"
+
+/**
+ * \brief Build the numBits-wide value whose low byte is the byte immediately
+ * preceding the scan start.
+ *
+ * When the scan starts at offset 0 the low byte is the last byte of the
+ * history buffer, so the caller must ensure a->len_history is non-zero in
+ * that case. For numBits > 8 the second byte is taken from the scan buffer.
+ *
+ * \param a Runtime arguments (buffers, lengths and start offset).
+ * \param numBits Number of low bits to keep in the result.
+ * \return The assembled value masked to its low numBits bits.
+ */
+static really_inline UNUSED
+u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
+    u32 r = 0;
+    if (a->start_offset != 0) {
+        // the preceding byte lives in the scan buffer itself
+        const u8 *prev = a->buf + a->start_offset - 1;
+        if (numBits > 8) {
+            r = lv_u16(prev, a->buf, a->buf + a->len);
+        } else {
+            r = a->buf[a->start_offset - 1];
+        }
+    } else {
+        // the preceding byte is the final byte of history
+        r = a->buf_history[a->len_history - 1];
+        if (numBits > 8) {
+            r |= (a->buf[0] << 8);
+        }
+    }
+    return r & ((1 << numBits) - 1);
+}
+
+#include "fdr_autogen.c"
+
+// Stand-in history buffer for block-mode scans: fdrExec() passes it as both
+// history pointers together with a history length of zero. It has static
+// storage duration and no initializer, so its contents are all zero.
+#define FAKE_HISTORY_SIZE 16
+static const u8 fake_history[FAKE_HISTORY_SIZE];
+
+/*
+ * Block-mode scan entry point. Packs the arguments into an FDR_Runtime_Args
+ * (using fake_history with zero history length) and dispatches to the
+ * generated engine function selected by fdr->engineID. Returns HWLM_SUCCESS
+ * without scanning when the start offset is not inside the buffer.
+ */
+hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t start,
+                     HWLMCallback cb, void *ctxt, hwlm_group_t groups) {
+    const struct FDR_Runtime_Args a = {
+        buf,
+        len,
+        fake_history, // no real history in block mode
+        0,
+        fake_history, // nocase
+        0,
+        start,
+        cb,
+        ctxt,
+        &groups,
+        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
+        0
+    };
+
+    // nothing to do if the start offset is at or beyond the end of the buffer
+    if (unlikely(a.start_offset >= a.len)) {
+        return HWLM_SUCCESS;
+    }
+
+    assert(funcs[fdr->engineID]);
+    return funcs[fdr->engineID](fdr, &a);
+}
+
+/*
+ * Streaming-mode scan entry point. Packs the arguments (including the real
+ * history buffer) into an FDR_Runtime_Args, restores stream state with
+ * fdrUnpackState(), runs the generated engine function selected by
+ * fdr->engineID unless the start offset is outside the buffer, and then
+ * saves stream state with fdrPackState() before returning.
+ */
+hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
+                              size_t hlen, const u8 *buf, size_t len,
+                              size_t start, HWLMCallback cb, void *ctxt,
+                              hwlm_group_t groups, u8 * stream_state) {
+    struct FDR_Runtime_Args a = {
+        buf,
+        len,
+        hbuf,
+        hlen,
+        hbuf, // nocase - start same as caseful, override later if needed
+        hlen, // nocase
+        start,
+        cb,
+        ctxt,
+        &groups,
+        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
+        // last value: cautious load of the final 8 bytes of history, if any
+        hbuf ? CONF_LOADVAL_CALL_CAUTIOUS(hbuf + hlen - 8, hbuf, hbuf + hlen)
+             : (u64a)0
+
+    };
+    fdrUnpackState(fdr, &a, stream_state);
+
+    // stays HWLM_SUCCESS when there is nothing to scan
+    hwlm_error_t ret = HWLM_SUCCESS;
+    if (!unlikely(a.start_offset >= a.len)) {
+        assert(funcs[fdr->engineID]);
+        ret = funcs[fdr->engineID](fdr, &a);
+    }
+
+    // state is written back whether or not the engine ran
+    fdrPackState(fdr, &a, stream_state);
+    return ret;
+}
--- a/src/fdr/fdr.h
+++ b/src/fdr/fdr.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: runtime API.
+ */
+
+#ifndef FDR_H
+#define FDR_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm.h"
+
+// C linkage in the API
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \brief Forward declaration; this header only handles pointers to it. */
+struct FDR;
+
+/** \brief Returns size in bytes of the given FDR engine. */
+size_t fdrSize(const struct FDR *fdr);
+
+/** \brief Returns non-zero if the contents of the stream state indicate that
+ * there is active FDR history beyond the regularly used history. */
+u32 fdrStreamStateActive(const struct FDR *fdr, const u8 *stream_state);
+
+/**
+ * \brief Block-mode scan.
+ *
+ * \param fdr FDR matcher engine.
+ * \param buf Buffer to scan.
+ * \param len Length of buffer to scan.
+ * \param start First offset in buf at which a match may end.
+ * \param cb Callback to call when a match is found.
+ * \param ctxt Caller-provided context pointer supplied to callback on match.
+ * \param groups Initial groups mask.
+ * \return HWLM_SUCCESS immediately if \a start is not less than \a len;
+ *         otherwise the result of the engine's scan function.
+ */
+hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
+                     size_t start, HWLMCallback cb, void *ctxt,
+                     hwlm_group_t groups);
+
+/**
+ * \brief Streaming-mode scan.
+ *
+ * \param fdr FDR matcher engine.
+ * \param hbuf History buffer.
+ * \param hlen Length of history buffer (hbuf).
+ * \param buf Buffer to scan.
+ * \param len Length of buffer to scan (buf).
+ * \param start First offset in buf at which a match may end.
+ * \param cb Callback to call when a match is found.
+ * \param ctxt Caller-provided context pointer supplied to callback on match.
+ * \param groups Initial groups mask.
+ * \param stream_state Persistent stream state for use by FDR. It is read
+ *        before the scan and written back afterwards.
+ * \return HWLM_SUCCESS if the scan is skipped because the start offset is
+ *         not inside the buffer; otherwise the result of the engine's scan
+ *         function.
+ */
+hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
+                              size_t hlen, const u8 *buf, size_t len,
+                              size_t start, HWLMCallback cb, void *ctxt,
+                              hwlm_group_t groups, u8 *stream_state);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // FDR_H
--- a/src/fdr/fdr_autogen.py
+++ b/src/fdr/fdr_autogen.py
@@ -0,0 +1,574 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from base_autogen import *
+from string import Template
+
+class OrStep(Step):
+    def __init__(self, context, offset, width):
+        Step.__init__(self, context, offset)
+        s_var = self.gv("st%d" % offset)
+        if width < 128:
+            self.val = "s |= %s;" % s_var.name
+        else:
+            self.val = "s = or%d(s, %s);" % (width, s_var.name)
+
+class ShiftStateStep(Step):
+    def __init__(self, context, offset = 0, stride_used = 1):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        state = m.state_variable
+        shift_distance = -1 * stride_used * m.num_buckets
+        self.val = "%s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
+
+class BulkLoadStep(Step):
+    def __init__(self, context, offset, size, define_var = True, aligned = True):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        self.latency = 4
+        blt = m.bulk_load_type
+        if aligned:
+            init_string = blt.load_expr_data(self.offset, code = "aligned")
+        else:
+            init_string = blt.load_expr_data(self.offset)
+
+        var_name = "current_data_%d" % offset
+        if define_var:
+            lb_var = self.nv(blt, var_name)
+            self.val = lb_var.gen_initializer_stmt(init_string)
+        else:
+            lb_var = self.gv(var_name, reader = False, writer = True)
+            self.val = "%s = %s;" % (var_name, init_string)
+
+class ValueExtractStep(Step):
+    def __init__(self, context, offset, sub_load_cautious = False):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        self.latency = 2
+        dsb = m.datasize_bytes
+        modval = offset % dsb
+
+        if m.domain > 8 and modval == dsb - 1:
+            # Case 1: reading more than one byte over the end of the bulk load
+
+            self.latency = 4
+            if sub_load_cautious:
+                code_string = "cautious_forward" 
+            else:
+                code_string = "normal"
+            load_string = m.single_load_type.load_expr_data(self.offset, code_string)
+            temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
+        else:
+            # Case 2: reading a value that can be found entirely in the current register
+            if m.fdr2_force_naive_load:
+                load_string = m.single_load_type.load_expr_data(self.offset, "normal")
+                temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
+            else:
+                lb_var = self.gv("current_data_%d" % (offset - modval))
+                if modval == 0:
+                    # Case 2a: value is at LSB end of the register and must be left-
+                    # shifted into place if there is a "reach_shift_adjust" required
+                    temp_string = "(%s << %d)" % (lb_var.name, m.reach_shift_adjust)
+                else:
+                    # Case 2b: value is in the middle of the register and will be
+                    # right-shifted into place (adjusted by "reach_shift_adjust")
+                    temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
+
+
+        init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
+        v_var = self.nv(m.value_extract_type, "v%d" % offset)
+        self.val = v_var.gen_initializer_stmt(init_string)
+
+class TableLookupStep(Step):
+    def __init__(self, context, reach_multiplier, offset = 0):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        self.latency = 4
+        v_var = self.gv("v%d" % offset)
+        s_var = self.nv(m.state_type, "st%d" % offset)
+        init_string = "*(const %s *)(ft + %s*%dU)" % ( m.state_type.get_name(),
+                                                       v_var.name, reach_multiplier)
+        self.val = s_var.gen_initializer_stmt(init_string)
+
+class ShiftReachMaskStep(Step):
+    def __init__(self, context, offset):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        extr = m.extract_frequency
+        modval = offset % extr
+        s_var = self.gv("st%d" % offset, writer = True)
+        self.val = "%s = %s;" % (s_var.name, s_var.type.shift_expr(s_var.name, modval * m.num_buckets))
+
+class ConfExtractStep(Step):
+    def __init__(self, context, offset):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        if m.state_type.isSIMDOnIntel():
+            self.latency = 2
+        init_string = m.state_type.lowbit_extract_expr("s", m.extract_size)
+        extr_var = self.nv(m.extr_type, "extr%d" % offset)
+        self.val = extr_var.gen_initializer_stmt(init_string)
+
+class ConfAccumulateStep(Step):
+    def __init__(self, context, extract_offset, conf_offset, define_var = True):
+        Step.__init__(self, context, extract_offset)
+        m = self.matcher
+        extr_var = self.gv("extr%d" % extract_offset)
+        extr_var_cast = "((%s)%s)" % (m.conf_type.get_name(), extr_var.name)
+        if extract_offset == conf_offset:
+            # create conf_var as a straight copy of extr
+            if define_var:
+                conf_var = self.nv(m.conf_type, "conf%d" % conf_offset)
+                self.val = conf_var.gen_initializer_stmt(extr_var_cast)
+            else:
+                conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
+                self.val = "%s = %s;" % (conf_var.name, extr_var_cast)
+        else:
+            # shift extr_var and insert/OR it in conf_var
+            conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
+            shift_dist = (extract_offset - conf_offset) * m.num_buckets
+            self.val = "%s |= %s;" % (conf_var.name, m.conf_type.shift_expr(extr_var_cast, shift_dist))
+            self.latency = 2
+
+class ConfirmFlipStep(Step):
+    def __init__(self, context, offset):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        conf_var = self.gv("conf%d" % self.offset, writer = True)
+        self.val = "%s = %s;" % (conf_var.name,
+                       conf_var.type.flip_lowbits_expr(conf_var.name, self.matcher.confirm_frequency * m.num_buckets))
+
+class ConfirmStep(Step):
+    def __init__(self, context, offset, cautious = False):
+        Step.__init__(self, context, offset)
+        m = self.matcher
+        conf_var = self.gv("conf%d" % offset, writer = True)
+        self.val = m.produce_confirm_base(conf_var.name, conf_var.type.size, offset, cautious,
+                                          enable_confirmless = m.stride == 1, do_bailout = False)
+
+class M3(MatcherBase):
+    def get_hash_safety_parameters(self):
+        h_size = self.single_load_type.size_in_bytes()
+        return (0, h_size - 1)
+
+    def produce_compile_call(self):
+        print "    { %d, %d, %d, %d, %d, %s, %d, %d }," % (
+              self.id, self.state_width, self.num_buckets,
+              self.stride, self.domain,
+              self.arch.target, self.conf_pull_back, self.conf_top_level_split)
+
+    def produce_main_loop(self, switch_variant = False):
+        stride_offsets = xrange(0, self.loop_bytes, self.stride)
+        stride_offsetSet = set(stride_offsets)
+        so_steps_last_block = []
+        sh = None
+        last_confirm = None
+        ctxt = CodeGenContext(self)
+
+        if switch_variant:
+            print " ptr -= (iterBytes - dist);"
+            print " { " # need an extra scope around switch variant to stop its globals escaping
+        else:
+            print "    if (doMainLoop) {"
+            print "    for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
+            print self.produce_flood_check()
+            print "        __builtin_prefetch(ptr + (iterBytes*4));"
+            print "        assert(((size_t)ptr % START_MOD) == 0);"
+
+
+        # just do globally for now
+        if switch_variant:
+            subsidiary_load_cautious = True
+            confirm_cautious = True
+        else:
+            subsidiary_load_cautious = False
+            confirm_cautious = False
+
+        if not self.fdr2_force_naive_load:
+            bulk_load_steps = [ off for off in range(self.loop_bytes)
+                                if off % self.datasize_bytes == 0 and
+                                   (set(range(off, off + self.datasize_bytes - 1)) & stride_offsetSet)]
+        else:
+            bulk_load_steps = []
+
+        confirm_steps = [ off for off in range(self.loop_bytes) if off % self.confirm_frequency == 0 ]
+
+        for off in bulk_load_steps:
+            lb_var = ctxt.new_var(None, self.bulk_load_type, "current_data_%d" % off)
+            print "        " + lb_var.gen_initializer_stmt()
+
+
+        for off in confirm_steps:
+            var_name = "conf%d" % off
+            conf_def_var = ctxt.new_var(None, self.conf_type, var_name)
+            if switch_variant:
+                init_string = "(%s)-1" % self.conf_type.get_name()
+            else:
+                init_string = ""
+            print "        " + conf_def_var.gen_initializer_stmt(init_string)
+
+        if switch_variant:
+            print "        switch(iterBytes - dist) {"
+            for i in range(0, self.loop_bytes):
+                print "            case %d:" % i
+
+                # init and poison conf; over-precise but harmless
+                conf_id = (i / self.confirm_frequency) * self.confirm_frequency
+                if i % self.confirm_frequency:
+                    conf_fixup_bits = self.conf_type.size - (self.num_buckets * (i % self.confirm_frequency))
+                    print "                conf%d >>= %d;" % (conf_id, conf_fixup_bits)
+                else:
+                    print "                conf%d = 0;" % conf_id
+
+                # init state
+                state_fixup = i % self.extract_frequency
+                state = self.state_variable
+                shift_distance = self.num_buckets * state_fixup
+                if state_fixup:
+                    print "                %s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
+                    if self.state_width < 128:
+                        print "                %s |= %s;" % (state.name, state.type.lowbit_mask(shift_distance))
+                    else:
+                        print "                %s = or%d(%s, %s);" % (state.name, self.state_width, state.name, state.type.lowbit_mask(shift_distance))
+
+                if not self.fdr2_force_naive_load:
+                    # init current_data (could poison it in some cases)
+                    load_mod = i % self.datasize_bytes
+                    load_offset = i - load_mod
+                    if load_mod:
+                        # not coming in on an even boundary means having to do a load var
+                        # actually, there are a bunch of things we can do on this bulk load
+                        # to avoid having to be 'cautious_backwards' but I'm not completely
+                        # sure they are good ideas
+                        init_string = self.bulk_load_type.load_expr_data(load_offset,
+                                                                         code = "cautious_backward")
+                        var_name = "current_data_%d" % load_offset
+                        lb_var = ctxt.get_var(None, var_name, reader = False, writer = True)
+                        print "                %s = %s;" % (lb_var.name, init_string)
+
+                print "                goto off%d;" % i
+            print "            case %d: goto skipSwitch;" % self.loop_bytes
+            print "        }"
+            print "        {"
+
+
+        for off in range(self.loop_bytes):
+            # X_mod is the offset we're up to relative to the last X operation
+            # X_offset is which of the last X operations matches this iteration
+
+            if (switch_variant):
+                LabelStep(ctxt, off)
+
+            if off in bulk_load_steps:
+                if not self.fdr2_force_naive_load:
+                    BulkLoadStep(ctxt, off, self.datasize, define_var = False, aligned = not switch_variant)
+
+            if off in stride_offsets:
+                if switch_variant:
+                    OpenScopeStep(ctxt, off)
+                ValueExtractStep(ctxt, off, sub_load_cautious = subsidiary_load_cautious)
+                TableLookupStep(ctxt, self.reach_mult, off)
+                if off % self.extract_frequency:
+                    ShiftReachMaskStep(ctxt, off)
+                so = OrStep(ctxt, off, self.state_width)
+                if switch_variant:
+                    CloseScopeStep(ctxt, off)
+                if sh != None:
+                    so.add_dependency(sh)
+                so_steps_last_block += [ so ]
+
+            extract_mod = off % self.extract_frequency
+            extract_offset = off - extract_mod
+            extract_ready = extract_mod == self.extract_frequency - 1
+            if extract_ready:
+                if switch_variant:
+                    OpenScopeStep(ctxt, off)
+                ex = ConfExtractStep(ctxt, extract_offset)
+                ConfAccumulateStep(ctxt, extract_offset, confirm_offset, define_var = False)
+                for so_step in so_steps_last_block:
+                    ex.add_dependency(so_step)
+                if switch_variant:
+                    CloseScopeStep(ctxt, off)
+                so_steps_last_block = []
+                sh = ShiftStateStep(ctxt, extract_offset, stride_used = self.extract_frequency)
+                sh.add_dependency(ex)
+
+            confirm_mod = off % self.confirm_frequency
+            confirm_offset = off - confirm_mod
+            confirm_ready = confirm_mod == self.confirm_frequency - 1
+            if confirm_ready:
+                cflip = ConfirmFlipStep(ctxt, confirm_offset)
+                cf = ConfirmStep(ctxt, confirm_offset, cautious = confirm_cautious )
+                if last_confirm:
+                    cf.add_dependency(last_confirm)
+                last_confirm = cf
+
+
+        if not switch_variant:
+            print ctxt.schedule([ last_confirm, sh ])
+        else:
+            print ctxt.dontschedule([ last_confirm, sh ])
+
+        if switch_variant:
+            print "skipSwitch:;"
+            print "    ptr += iterBytes;"
+        print "    }" # close extra scope around switch variant
+        print "    }"
+
+
+    def produce_init_state(self):
+        state = self.state_variable
+        s_type = self.state_type
+        shift_distance = -1 * self.num_buckets
+        shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
+
+        s = Template("""
+            $TYPENAME s;
+            if (a->len_history) {
+                u32 tmp = getPreStartVal(a, $DOMAIN);
+                s = *((const $TYPENAME *)ft + tmp);
+                $SHIFT_EXPR;
+            } else {
+                s = *(const $TYPENAME *)&fdr->start;
+            }
+""").substitute(TYPENAME = s_type.get_name(),
+                ZERO_EXPR = s_type.zero_expression(),
+                DOMAIN = self.domain,
+                SHIFT_EXPR = shift_expr)
+        return s
+
+    def produce_code(self):
+        """Print the complete C source for this matcher to stdout.
+
+        Emits, in order: the header, a descriptive comment line, the common
+        declarations, table/confirm pointers, state initialisation, loop
+        constants, the outer 'while' loop containing first the switch variant
+        and then the main-loop variant, and the footer.
+
+        Also sets self.reach_mult, self.reach_shift_adjust and self.reach_mask,
+        which the code-generation steps read while the loops are produced.
+        """
+
+        # 'behind' is unpacked but only the read-ahead distance is emitted
+        (behind, ahead) = self.get_hash_safety_parameters()
+        loop_read_behind = behind
+        loop_read_ahead = self.loop_bytes + ahead
+
+        # we set up mask and shift stuff for extracting our masks from registers
+        #
+        # we have a choice as to whether to mask out the value early or
+        # extract the value (shift first) then mask it
+        #
+        # Intel has a free scaling factor from 1/2/4/8 so we want to combine
+        # the extra needed shift for SSE registers with the mask operation
+
+        ssb = self.state_type.size / 8 # state size in bytes
+
+        # Intel path
+        if ssb == 16 and self.domain == 16:
+            # obscure corner - we don't have the room in the register to
+            # do this for all values so we don't. domain==16 is pretty
+            # bad anyhow, of course
+            self.reach_mult = 8
+        else:
+            self.reach_mult = ssb
+
+        # log2 of the part of the table-entry scaling that is folded into the
+        # extracted value itself rather than into the lookup multiplier
+        shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
+        self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
+        self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
+
+        print self.produce_header(visible = False)
+
+        # single comment line summarising the configuration (the trailing
+        # commas keep the pieces on one output line)
+        print "// ",
+        print " Arch: " + self.arch.name,
+        print " State type: " + self.state_type.get_name(),
+        print " Num buckets: %d" % self.num_buckets,
+        print " Domain: %d" % self.domain,
+        print " Stride: %d" % self.stride
+
+        print self.produce_common_declarations()
+        print
+
+        print "\tconst size_t tabSize = %d;" % self.table_size
+        print """
+    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
+    const u32 * confBase = (const u32 *)(ft + tabSize);
+"""
+        print self.produce_init_state()
+        print "\tconst size_t iterBytes = %d;" % self.loop_bytes
+        print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
+        print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
+
+        print """
+    while (ptr < buf + len) {
+
+        u8 doMainLoop = 1;
+        size_t remaining = len - (ptr - buf);
+        size_t dist;
+        if (remaining <= iterBytes) {
+            dist = remaining; // once through the switch and we're done
+        } else if (remaining < 2 * iterBytes) {
+            // nibble some stuff off the front, skip the main loop,
+            // then come back here
+            dist = iterBytes;  // maybe could be cleverer
+        } else {
+            // now, we need to see if we can make it to a main loop iteration
+            // if so, we need to ensure that the main loop iteration is aligned
+            // to a START_MOD boundary and i >= 8 so we can read ptr + i - 8
+
+            // see if we can do it - if not, just switch the main loop off,
+            // eat iterBytes in cautious mode, and come back to this loop
+
+            const u8 * target = MAX(buf + 8, ptr);
+            target = ROUNDUP_PTR(target, START_MOD);
+            dist = target - ptr;
+            if (dist > iterBytes) {
+                doMainLoop = 0;
+                dist = iterBytes;
+            }
+        }
+"""
+        # the switch variant comes first in the output, then the main loop
+        self.produce_main_loop(switch_variant = True)
+        self.produce_main_loop(switch_variant = False)
+        print """
+    }
+"""
+        print self.produce_footer()
+
+    def get_name(self):
+        return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
+
+    def __init__(self, state_width, domain, stride,
+                 arch,
+                 table_state_width = None,
+                 num_buckets = 8,
+                 extract_frequency = None,
+                 confirm_frequency = None):
+
+        # First - set up the values that are fundamental to how this matcher will operate
+        self.arch = arch
+
+        # get the width of the state width on which we operate internally
+        if state_width not in [ 128 ]:
+            fail_out("Unknown state width: %d" % state_width)
+        self.state_width = state_width
+        self.state_type = getRequiredType(self.state_width)
+        self.state_variable = IntegerVariable("s", self.state_type)
+
+        table_state_width = state_width
+        self.table_state_width = state_width
+        self.table_state_type = getRequiredType(self.table_state_width)
+
+        # domain is the number of bits that we draw from our input to
+        # index our 'reach' table
+        if not 8 <= domain <= 16:
+            fail_out("Unsupported domain: %d" % domain)
+        self.domain = domain
+        # this is the load type required for this domain if we want to
+        # load it one at a time
+        self.single_load_type = getRequiredType(self.domain)
+
+        # table size
+        self.table_size = 2**domain * table_state_width // 8
+
+        # stride is the frequency with which we make data-driven
+        # accesses to our reach table
+        if stride not in [ 1, 2, 4, 8]:
+            fail_out("Unsupported stride: %d" % stride)
+        if stride * num_buckets > state_width:
+            fail_out("Stride %d is too big for the number of buckets %d given state width %d\n" % (stride, num_buckets, state_width))
+        self.stride = stride
+
+        if num_buckets != 8:
+            fail_out("Unsupported number of buckets: %d" % num_buckets)
+        if state_width % num_buckets and state_width == 128:
+            fail_out("Bucket scheme requires bit-shifts on m128 (failing)")
+        self.num_buckets = num_buckets
+
+        # Second - set up derived or optimization values - these can be
+        # overridden by arguments that are passed in
+
+        self.datasize = 64
+        self.bulk_load_type = IntegerType(self.datasize)
+        self.datasize_bytes = self.datasize/8
+
+        self.value_extract_type = IntegerType(self.datasize)
+
+        self.fdr2_force_naive_load = False # disable everywhere for trunk
+
+        # extract frequency is how frequently (in bytes) we destructively shift
+        # our state value after having pulled out that many bytes into a
+        # confirm register (of one sort or another).
+        # none means a default value - datasize, our biggest easily available GPR
+        if extract_frequency is None:
+            extract_frequency = self.datasize_bytes
+        self.extract_frequency = extract_frequency
+        self.extract_size = self.extract_frequency*self.num_buckets
+        if extract_frequency < stride:
+            fail_out("Can't extract at extract frequency %d with stride %d" % (extract_frequency, stride))
+        if extract_frequency not in [ None, 1, 2, 4, 8, 16]:
+            fail_out("Weird extract frequency: %d" % extract_frequency)
+
+        if self.extract_size <= 32:
+            self.extr_type = IntegerType(32)
+        elif self.extract_size <= 64:
+            self.extr_type = IntegerType(64)
+        else:
+            fail_out("Implausible size %d required for confirm extract step" % size)
+
+        # extract_frequency is how often we pull out our state and place
+        # it somewhere in a lossless fashion
+        # confirm_frequency, on the other hand, is how frequently we
+        # take the state extracted by extract_frequency and cobble it
+        # together into a matching loop
+        # confirm_frequency must be a multiple of extract_frequency
+        # and must fit into a fast register; for now; we're going to
+        # stay in the GPR domain
+        if confirm_frequency is None:
+            confirm_frequency = self.extract_frequency
+        self.confirm_frequency = confirm_frequency
+        if confirm_frequency % self.extract_frequency:
+            fail_out("Confirm frequency %d must be evenly divisible by extract_frequency %d" % (confirm_frequency, self.extract_frequency))
+
+        self.conf_size = self.confirm_frequency * self.num_buckets
+        if self.conf_size <= 32:
+            self.conf_type = IntegerType(32)
+        elif self.conf_size <= 64:
+            self.conf_type = IntegerType(64)
+        else:
+            fail_out("Implausible size %d required for confirm accumulate step" % self.conf_size)
+
+        # how many bytes in flight at once
+        self.loop_bytes = 16
+
+        # confirm configuration
+
+        # how many entries in the top-level confirm table - 256 means
+        # complete split on the last character
+        self.conf_top_level_split = 256
+
+        # how much we 'pull back' in confirm - this is obviously related
+        # to the first level conf but we will keep two separate paramters
+        # for this to avoid the risk of conflating these
+        self.conf_pull_back = 1
+
+        if self.conf_pull_back > 0 and self.conf_top_level_split < 256:
+            fail_out("Pull back distance %d not supported by top level split %d" % (self.conf_pull_back, self.conf_top_level_split))
+
+        # minor stuff
+        self.default_body_indent = 8
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: build API.
+ */
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile.h"
+#include "fdr_confirm.h"
+#include "fdr_compile_internal.h"
+#include "fdr_engine_description.h"
+#include "teddy_compile.h"
+#include "teddy_engine_description.h"
+#include "grey.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/compare.h"
+#include "util/dump_mask.h"
+#include "util/target_info.h"
+#include "util/ue2string.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+using namespace std;
+
+namespace ue2 {
+
+namespace {
+
+/**
+ * \brief Builds an FDR structure from a set of literals for one engine
+ * description.
+ *
+ * build() runs the three phases in order: literals are assigned to buckets,
+ * the lookup table is populated, and the final FDR structure is laid out.
+ */
+class FDRCompiler : boost::noncopyable {
+private:
+    // Engine description the table is built for (not owned).
+    const FDREngineDescription &eng;
+    // Lookup table; sized to eng.getTabSizeBytes() bytes by the constructor.
+    vector<u8> tab;
+    // Literals being compiled (not owned).
+    const vector<hwlmLiteral> &lits;
+    // Bucket index -> indices (into lits) of the literals in that bucket.
+    map<BucketIndex, std::vector<LiteralIndex> > bucketToLits;
+    // Passed through unchanged to setupFullMultiConfs().
+    bool make_small;
+
+    u8 *tabIndexToMask(u32 indexInTable);
+    void assignStringToBucket(LiteralIndex l, BucketIndex b);
+    void assignStringsToBuckets();
+#ifdef DEBUG
+    void dumpMasks(const u8 *defaultMask);
+#endif
+    void setupTab();
+    aligned_unique_ptr<FDR> setupFDR(pair<u8 *, size_t> link);
+    void createInitialState(FDR *fdr);
+
+public:
+    /** \brief Stores references to \a lits_in and \a eng_in; both must
+     * outlive this object. */
+    FDRCompiler(const vector<hwlmLiteral> &lits_in,
+                const FDREngineDescription &eng_in, bool make_small_in)
+        : eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(lits_in),
+          make_small(make_small_in) {}
+
+    /** \brief Runs bucket assignment, table setup and FDR layout, returning
+     * the finished structure. \a link is handed to setupFDR(). */
+    aligned_unique_ptr<FDR> build(pair<u8 *, size_t> link);
+};
+
+/** \brief Returns a pointer to the mask stored for table entry
+ * \a indexInTable; each entry occupies eng.getSchemeWidth() / 8 bytes of the
+ * table. */
+u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) {
+    assert(indexInTable < tab.size());
+    const auto entryBytes = eng.getSchemeWidth() / 8;
+    u8 *const tableBase = &tab[0];
+    return tableBase + (indexInTable * entryBytes);
+}
+
+/** \brief Sets bit number \a bit in the byte array \a msk (bit 0 is the least
+ * significant bit of msk[0]). */
+static
+void setbit(u8 *msk, u32 bit) {
+    const u32 byteIdx = bit / 8;
+    const u8 bitMask = (u8)(1U << (bit % 8));
+    msk[byteIdx] |= bitMask;
+}
+
+/** \brief Clears bit number \a bit in the byte array \a msk (bit 0 is the
+ * least significant bit of msk[0]). */
+static
+void clearbit(u8 *msk, u32 bit) {
+    const u32 byteIdx = bit / 8;
+    const u8 keepMask = (u8)~(1U << (bit % 8));
+    msk[byteIdx] &= keepMask;
+}
+
+/** \brief Writes the byte-wise AND of \a a and \a b (each \a num_bytes long)
+ * into \a dest. Each output byte depends only on the input bytes at the same
+ * index, so \a dest may be the same buffer as \a a or \a b. */
+static
+void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) {
+    const u8 *const aEnd = a + num_bytes;
+    while (a != aEnd) {
+        const u8 combined = *a & *b;
+        *dest = combined;
+        ++dest;
+        ++a;
+        ++b;
+    }
+}
+
+/**
+ * \brief Fills in the initial state stored at fdr->start.
+ *
+ * For every bucket, the scheme bits for positions 0 .. min_len - 2 are set,
+ * where min_len is the length of the shortest literal assigned to that
+ * bucket. Other bits are not touched here.
+ */
+void FDRCompiler::createInitialState(FDR *fdr) {
+    u8 *const state = (u8 *)&fdr->start;
+
+    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
+        // Shortest literal in this bucket; stays at ~0U for an empty bucket,
+        // in which case every position of the bucket gets its bit set.
+        u32 min_len = ~0U;
+        for (const LiteralIndex litIdx : bucketToLits[b]) {
+            const u32 len = verify_u32(lits[litIdx].s.length());
+            if (len < min_len) {
+                min_len = len;
+            }
+        }
+
+        DEBUG_PRINTF("bucket %u has min_len=%u\n", b, min_len);
+        assert(min_len);
+
+        // Positions strictly below min_len - 1 start out set.
+        const PositionInBucket width = eng.getBucketWidth(b);
+        const u32 setLimit = min_len - 1;
+        for (PositionInBucket pos = 0; pos < width && pos < setLimit; pos++) {
+            setbit(state, eng.getSchemeBit(b, pos));
+        }
+    }
+}
+
+/**
+ * \brief Allocates the final FDR structure and copies all components into it.
+ *
+ * The resulting layout is, in order: the FDR header (rounded up to 16 bytes),
+ * the lookup table, the confirm structures, the flood control structures and,
+ * if \a link.first is non-null, the linked table.
+ *
+ * \param link Optional (pointer, size) pair appended to the structure; the
+ *             buffer is released with aligned_free() once it has been copied.
+ * \return The newly built FDR structure.
+ */
+aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
+    size_t tabSize = eng.getTabSizeBytes();
+
+    // Both helpers return a (buffer, size) pair; the buffers are freed below
+    // with aligned_free() after being copied into the FDR structure.
+    pair<u8 *, size_t> floodControlTmp = setupFDRFloodControl(lits, eng);
+
+    pair<u8 *, size_t> confirmTmp =
+        setupFullMultiConfs(lits, eng, bucketToLits, make_small);
+
+    // Every component must be a multiple of 16 bytes so that each one starts
+    // on a 16-byte offset from the base of the structure.
+    assert(ISALIGNED_16(tabSize));
+    assert(ISALIGNED_16(confirmTmp.second));
+    assert(ISALIGNED_16(floodControlTmp.second));
+    assert(ISALIGNED_16(link.second));
+    size_t headerSize = ROUNDUP_16(sizeof(FDR));
+    size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.second +
+                             floodControlTmp.second + link.second);
+
+    DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu "
+                 "total=%zu\n",
+                 headerSize, tabSize, confirmTmp.second, floodControlTmp.second,
+                 size);
+
+    aligned_unique_ptr<FDR> fdr = aligned_zmalloc_unique<FDR>(size);
+    assert(fdr); // otherwise would have thrown std::bad_alloc
+
+    // Header fields.
+    fdr->size = size;
+    fdr->engineID = eng.getID();
+    fdr->maxStringLen = verify_u32(maxLen(lits));
+    createInitialState(fdr.get());
+
+    // The lookup table immediately follows the (rounded-up) header.
+    u8 *fdr_base = (u8 *)fdr.get();
+    u8 * ptr = fdr_base + ROUNDUP_16(sizeof(FDR));
+    copy(tab.begin(), tab.end(), ptr);
+    ptr += tabSize;
+
+    // Confirm structures follow the table.
+    memcpy(ptr, confirmTmp.first, confirmTmp.second);
+    ptr += confirmTmp.second;
+    aligned_free(confirmTmp.first);
+
+    // Flood control follows; its offset from the base is recorded.
+    fdr->floodOffset = verify_u32(ptr - fdr_base);
+    memcpy(ptr, floodControlTmp.first, floodControlTmp.second);
+    ptr += floodControlTmp.second;
+    aligned_free(floodControlTmp.first);
+
+    // Optional linked table goes last; a link offset of 0 means "none".
+    if (link.first) {
+        fdr->link = verify_u32(ptr - fdr_base);
+        memcpy(ptr, link.first, link.second);
+        aligned_free(link.first);
+    } else {
+        fdr->link = 0;
+    }
+
+    return fdr;
+}
+
+/** \brief Appends literal index \a l to the list kept for bucket \a b,
+ * creating that list if the bucket has no entry yet. */
+void FDRCompiler::assignStringToBucket(LiteralIndex l, BucketIndex b) {
+    vector<LiteralIndex> &bucket = bucketToLits[b];
+    bucket.push_back(l);
+}
+
+/**
+ * \brief Comparator over indices into a literal vector.
+ *
+ * Shorter literals sort first. Literals of equal length are ordered by the
+ * first byte at which they differ when both are read from the last character
+ * backwards; identical strings compare as not-less-than.
+ */
+struct LitOrder {
+    explicit LitOrder(const vector<hwlmLiteral> &vl_) : vl(vl_) {}
+    bool operator()(const u32 &i1, const u32 &i2) const {
+        const string &lhs = vl[i1].s;
+        const string &rhs = vl[i2].s;
+
+        if (lhs.size() != rhs.size()) {
+            return lhs.size() < rhs.size();
+        }
+
+        // Same length: scan both strings in reverse for the first mismatch.
+        const pair<string::const_reverse_iterator,
+                   string::const_reverse_iterator> diff =
+            std::mismatch(lhs.rbegin(), lhs.rend(), rhs.rbegin());
+        if (diff.first == lhs.rend()) {
+            return false; // no mismatch: the strings are identical
+        }
+        return *diff.first < *diff.second;
+    }
+
+private:
+    const vector<hwlmLiteral> &vl;
+};
+
+/**
+ * \brief Score for a bucket holding \a count literals whose shortest length
+ * is \a len.
+ *
+ * The score is count * (128^3 / min(len, 128)^3) using integer division, so
+ * short literals are penalised cubically. A length of zero yields the maximum
+ * u64a value.
+ */
+static u64a getScoreUtil(u32 len, u32 count) {
+    const u32 LEN_THRESH = 128;
+    if (!len) {
+        return (u64a)-1;
+    }
+
+    u32 elen = len;
+    if (elen > LEN_THRESH) {
+        elen = LEN_THRESH; // lengths beyond the threshold all score alike
+    }
+
+    const u32 threshCubed = LEN_THRESH * LEN_THRESH * LEN_THRESH;
+    const u32 elenCubed = elen * elen * elen;
+    const u64a lenScore = threshCubed / elenCubed;
+
+    // deemphasize count - possibly more than needed
+    // this might be overkill in the other direction
+    return count * lenScore;
+}
+
+//#define DEBUG_ASSIGNMENT
+/**
+ * \brief Distributes the literals over the engine's buckets.
+ *
+ * The literals are sorted with LitOrder (shortest first) and cut into at most
+ * CHUNK_MAX - 1 contiguous "chunks". A dynamic program over (chunk, number of
+ * buckets still available) then chooses where each bucket starts so that the
+ * sum of getScoreUtil() over the buckets is minimised, and the chosen
+ * boundaries are followed to fill bucketToLits.
+ *
+ * If no chunk is produced (for instance when there are no literals at all),
+ * nothing is assigned and bucketToLits is left unchanged.
+ */
+void FDRCompiler::assignStringsToBuckets() {
+    typedef u64a SCORE; // 'Score' type
+    const SCORE MAX_SCORE = (SCORE)-1;
+    const u32 CHUNK_MAX = 512;
+    const u32 BUCKET_MAX = 16;
+    typedef pair<SCORE, u32> SCORE_INDEX_PAIR;
+
+    u32 ls = verify_u32(lits.size());
+    // make a vector that contains our literals as pointers or u32 LiteralIndex values
+    vector<LiteralIndex> vli;
+    vli.resize(ls);
+    map<u32, u32> lenCounts;
+    for (LiteralIndex l = 0; l < ls; l++) {
+        vli[l] = l;
+        lenCounts[lits[l].s.size()]++;
+    }
+    // sort vector by literal length + if tied on length, 'magic' criteria of some kind (tbd)
+    stable_sort(vli.begin(), vli.end(), LitOrder(lits));
+
+#ifdef DEBUG_ASSIGNMENT
+    for (map<u32, u32>::iterator i = lenCounts.begin(), e = lenCounts.end();
+         i != e; ++i) {
+        printf("l<%d>:%d ", i->first, i->second);
+    }
+    printf("\n");
+#endif
+
+    // TODO: detailed early stage literal analysis for v. small cases (actually look at lits)
+    // yes - after we factor this out and merge in the Teddy style of building we can look
+    // at this, although the teddy merge modelling is quite different. It's still probably
+    // adaptable to some extent for this class of problem
+
+    u32 firstIds[CHUNK_MAX]; // index into vli of the first literal in this chunk
+                             // (the closing row holds the 'last' bound, ls)
+    u32 count[CHUNK_MAX]; // how many are in this chunk
+    u32 length[CHUNK_MAX]; // how long things in the chunk are
+
+    const u32 MAX_CONSIDERED_LENGTH = 16;
+    u32 currentChunk = 0;
+    u32 currentSize = 0;
+    u32 chunkStartID = 0;
+    u32 maxPerChunk  = ls/(CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1;
+
+    // Start a new chunk whenever the literal length changes (for lengths
+    // below MAX_CONSIDERED_LENGTH) or the current chunk is full; chunks of
+    // length-1 literals are never split on size.
+    for (u32 i = 0; i < ls && currentChunk < CHUNK_MAX - 1; i++) {
+        LiteralIndex l = vli[i];
+        if ((currentSize < MAX_CONSIDERED_LENGTH && (lits[l].s.size() != currentSize)) ||
+            (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) {
+            currentSize = lits[l].s.size();
+            if (currentChunk) {
+                count[currentChunk - 1 ] = i - chunkStartID;
+            }
+            chunkStartID = firstIds[currentChunk] = i;
+            length[currentChunk] = currentSize;
+            currentChunk++;
+        }
+    }
+    // Close the final chunk. If no chunk was ever opened there is nothing to
+    // close, and indexing count[currentChunk - 1] would wrap around and write
+    // far outside the array.
+    if (currentChunk) {
+        count[currentChunk - 1] = ls - chunkStartID;
+    }
+    // close off chunks with an empty row
+    firstIds[currentChunk] = ls;
+    length[currentChunk] = 0;
+    count[currentChunk] = 0;
+    u32 nChunks = currentChunk + 1;
+
+#ifdef DEBUG_ASSIGNMENT
+    for (u32 j = 0; j < nChunks; j++) {
+        printf("%d %d %d %d\n", j, firstIds[j], count[j], length[j]);
+    }
+#endif
+
+    // t[j][i]: best (score, next bucket's first chunk) when chunks j.. are
+    // spread over i + 1 buckets. A 'next' index of 0 means "everything from j
+    // to the end goes into one bucket".
+    SCORE_INDEX_PAIR t[CHUNK_MAX][BUCKET_MAX]; // pair of score, index
+    u32 nb = eng.getNumBuckets();
+    assert(nb <= BUCKET_MAX); // t has only BUCKET_MAX columns
+
+    // Base case: a single bucket takes every remaining chunk.
+    for (u32 j = 0; j < nChunks; j++) {
+        u32 cnt = 0;
+        for (u32 k = j; k < nChunks; ++k) {
+            cnt += count[k];
+        }
+        t[j][0] = make_pair(getScoreUtil(length[j], cnt), 0);
+    }
+
+    for (u32 i = 1; i < nb; i++) {
+        for (u32 j = 0; j < nChunks - 1; j++) { // don't process last, empty row
+            SCORE_INDEX_PAIR best = make_pair(MAX_SCORE, 0);
+            u32 cnt = count[j];
+            for (u32 k = j + 1; k < nChunks - 1; k++, cnt += count[k]) {
+                SCORE score = getScoreUtil(length[j], cnt);
+                if (score > best.first) {
+                    break; // if we're now worse locally than our best score, give up
+                }
+                score += t[k][i-1].first;
+                if (score < best.first) {
+                    best = make_pair(score, k);
+                }
+            }
+            t[j][i] = best;
+        }
+        t[nChunks - 1][i] = make_pair(0,0); // fill in empty final row for next iteration
+    }
+
+#ifdef DEBUG_ASSIGNMENT
+    for (u32 j = 0; j < nChunks; j++) {
+        for (u32 i = 0; i < nb; i++) {
+            SCORE_INDEX_PAIR v = t[j][i];
+            printf("<%7lld,%3d>", v.first, v.second);
+        }
+        printf("\n");
+    }
+#endif
+
+    // our best score is in t[0][nb - 1] and we can follow the links
+    // to find where our buckets should start and what goes into them
+    for (u32 i = 0, n = nb; n && (i != nChunks - 1); n--) {
+        u32 j = t[i][n - 1].second;
+        if (j == 0) {
+            j = nChunks - 1;
+        }
+        // put chunks in the range [i, j) into bucket nb - n
+#ifdef DEBUG_ASSIGNMENT
+        printf("placing from %d to %d in bucket %d\n", firstIds[i], firstIds[j],
+               nb - n);
+#endif
+        for (u32 k = firstIds[i]; k < firstIds[j]; k++) {
+            assignStringToBucket((LiteralIndex)vli[k], nb - n);
+        }
+        i = j;
+    }
+}
+
+#ifdef DEBUG
+/** \brief Debug helper: prints \a defaultMask followed by every table entry
+ * whose mask differs from it. */
+void FDRCompiler::dumpMasks(const u8 *defaultMask) {
+    const size_t width = eng.getSchemeWidth();
+    const size_t maskBytes = width / 8;
+
+    printf("default mask: %s\n", dumpMask(defaultMask, width).c_str());
+
+    for (u32 idx = 0; idx < eng.getNumTableEntries(); idx++) {
+        u8 *entry = tabIndexToMask(idx);
+        if (memcmp(entry, defaultMask, maskBytes) == 0) {
+            continue; // identical to the default: not worth printing
+        }
+        printf("tab %04x: %s\n", idx, dumpMask(entry, width).c_str());
+    }
+}
+#endif
+
+/**
+ * \brief Collects, for one suffix position, the table-index patterns produced
+ * by a set of literals.
+ *
+ * For each literal index in \a vl a (mask, dontCares) pair of eng.bits bits is
+ * assembled from the literal bytes at suffix positions pos, pos - 1, ...; the
+ * mask is inserted into \a m2 under its dontCares key.
+ *
+ * \param eng  Engine description; only eng.bits is read here.
+ * \param vl   Indices into \a lits of the literals to consider.
+ * \param lits The literal set.
+ * \param pos  Suffix position (0 is the last byte of the literal).
+ * \param m2   Output: dontCares value -> set of masks with that value.
+ * \return true as soon as some literal is entirely "don't care" at this
+ *         position (\a m2 may then be only partially filled); false otherwise.
+ */
+static
+bool getMultiEntriesAtPosition(const FDREngineDescription &eng,
+                               const vector<LiteralIndex> &vl,
+                               const vector<hwlmLiteral> &lits,
+                               SuffixPositionInString pos,
+                               std::map<u32, ue2::unordered_set<u32> > &m2) {
+    // Number of literal bytes examined per entry: 1, 2 or 4 depending on
+    // eng.bits. Stays 0 when eng.bits exceeds 32.
+    u32 distance = 0;
+    if (eng.bits <= 8) {
+        distance = 1;
+    } else if (eng.bits <= 16) {
+        distance = 2;
+    } else if (eng.bits <= 32) {
+        distance = 4;
+    }
+
+    for (vector<LiteralIndex>::const_iterator i = vl.begin(), e = vl.end();
+         i != e; ++i) {
+        // Prefetch the literal five iterations ahead, when there is one.
+        if (e - i > 5) {
+            __builtin_prefetch(&lits[*(i + 5)]);
+        }
+        const hwlmLiteral &lit = lits[*i];
+        const size_t sz = lit.s.size();
+        u32 mask = 0;
+        u32 dontCares = 0;
+        for (u32 cnt = 0; cnt < distance; cnt++) {
+            int newPos = pos - cnt;
+            u8 dontCareByte = 0x0;
+            u8 maskByte = 0x0;
+            if (newPos < 0 || ((u32)newPos >= sz)) {
+                // Position falls outside the literal: whole byte is unknown.
+                dontCareByte = 0xff;
+            } else {
+                // Byte newPos positions back from the end of the literal.
+                u8 c = lit.s[sz - newPos - 1];
+                maskByte = c;
+                // Bits of eng.bits not yet covered by earlier bytes.
+                u32 remainder = eng.bits - cnt * 8;
+                assert(remainder != 0);
+                if (remainder < 8) {
+                    // Only the low 'remainder' bits of this byte are used.
+                    u8 cmask = (1U << remainder) - 1;
+                    maskByte &= cmask;
+                    dontCareByte |= ~cmask;
+                }
+                if (lit.nocase && ourisalpha(c)) {
+                    // Caseless letter: bit 0x20 is a don't-care.
+                    maskByte &= 0xdf;
+                    dontCareByte |= 0x20;
+                }
+            }
+            // Byte cnt occupies bits [cnt * 8, cnt * 8 + 7] of the result.
+            u32 loc =  cnt * 8;
+            mask |= maskByte << loc;
+            dontCares |= dontCareByte << loc;
+        }
+
+        // truncate m and dc down to nBits
+        mask &= (1U << eng.bits) - 1;
+        dontCares &= (1U << eng.bits) - 1;
+        // Every bit is a don't-care: this literal matches every table index
+        // at this position, so the caller can stop here.
+        if (dontCares == ((1U << eng.bits) - 1)) {
+            return true;
+        }
+        m2[dontCares].insert(mask);
+    }
+    return false;
+}
+
+/**
+ * \brief Populates the lookup table from the bucket assignment.
+ *
+ * Every table entry starts as all ones. For each (bucket, position) pair the
+ * corresponding scheme bit is cleared in every entry whose index is
+ * compatible with some literal of the bucket at that position. Bits that must
+ * be clear in all entries are collected in a default mask, which is ANDed
+ * into every entry at the end.
+ */
+void FDRCompiler::setupTab() {
+    const size_t mask_size = eng.getSchemeWidth() / 8;
+    assert(mask_size);
+
+    // Start with every bit set in every table entry.
+    vector<u8> defaultMask(mask_size, 0xff);
+    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
+        memcpy(tabIndexToMask(i), &defaultMask[0], mask_size);
+    }
+
+    typedef std::map<u32, ue2::unordered_set<u32> > M2SET;
+
+    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
+        const vector<LiteralIndex> &vl = bucketToLits[b];
+        SuffixPositionInString pLimit = eng.getBucketWidth(b);
+        for (SuffixPositionInString pos = 0; pos < pLimit; pos++) {
+            u32 bit = eng.getSchemeBit(b, pos);
+            M2SET m2;
+            bool done = getMultiEntriesAtPosition(eng, vl, lits, pos, m2);
+            if (done) {
+                // Some literal is fully "don't care" here: the bit is cleared
+                // for all entries via the default mask.
+                clearbit(&defaultMask[0], bit);
+                continue;
+            }
+            for (M2SET::const_iterator i = m2.begin(), e = m2.end(); i != e;
+                 ++i) {
+                u32 dc = i->first;
+                const ue2::unordered_set<u32> &mskSet = i->second;
+                // Step b2 through every combination of the don't-care bits:
+                // v keeps all non-dc bits set so that adding the lowest dc bit
+                // carries across them; the loop ends when v wraps to ~dc.
+                u32 v = ~dc;
+                do {
+                    u32 b2 = v & dc;
+                    for (ue2::unordered_set<u32>::const_iterator
+                             i2 = mskSet.begin(),
+                             e2 = mskSet.end();
+                         i2 != e2; ++i2) {
+                        // Fixed bits from the mask, don't-care bits from b2.
+                        u32 val = (*i2 & ~dc) | b2;
+                        clearbit(tabIndexToMask(val), bit);
+                    }
+                    v = (v + (dc & -dc)) | ~dc;
+                } while (v != ~dc);
+            }
+        }
+    }
+
+    // Apply the bits cleared in the default mask to every entry.
+    for (u32 i = 0; i < eng.getNumTableEntries(); i++) {
+        u8 *m = tabIndexToMask(i);
+        andMask(m, m, &defaultMask[0], mask_size);
+    }
+#ifdef DEBUG
+    dumpMasks(&defaultMask[0]);
+#endif
+}
+
+/** \brief Runs the full build: bucket assignment, table setup, then layout of
+ * the final FDR structure (with \a link passed through to setupFDR()). */
+aligned_unique_ptr<FDR> FDRCompiler::build(pair<u8 *, size_t> link) {
+    // The phases depend on each other and must run in this order.
+    assignStringsToBuckets();
+    setupTab();
+
+    aligned_unique_ptr<FDR> fdr = setupFDR(link);
+    return fdr;
+}
+
+} // namespace
+
+/**
+ * \brief Shared implementation behind fdrBuildTable() and
+ * fdrBuildTableHinted().
+ *
+ * When \a stream_control is non-null a streaming table is built first and
+ * passed along as the link. If grey.fdrAllowTeddy is set, a Teddy build is
+ * attempted and its result returned on success. Otherwise an FDR engine is
+ * chosen (from \a hint, or automatically when \a hint is HINT_INVALID) and
+ * compiled.
+ *
+ * \return The built structure, or nullptr if no engine description was
+ *         obtained.
+ */
+static
+aligned_unique_ptr<FDR>
+fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
+                      const target_t &target, const Grey &grey, u32 hint,
+                      hwlmStreamingControl *stream_control) {
+    pair<u8 *, size_t> link =
+        stream_control ? fdrBuildTableStreaming(lits, stream_control)
+                       : pair<u8 *, size_t>(nullptr, 0);
+
+    DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2");
+
+    // Give Teddy the first shot when it is allowed.
+    if (grey.fdrAllowTeddy) {
+        aligned_unique_ptr<FDR> teddy =
+            teddyBuildTableHinted(lits, make_small, hint, target, link);
+        if (teddy) {
+            DEBUG_PRINTF("build with teddy succeeded\n");
+            return teddy;
+        }
+        DEBUG_PRINTF("build with teddy failed, will try with FDR\n");
+    }
+
+    // Pick the FDR engine: automatically, or as dictated by the hint.
+    unique_ptr<FDREngineDescription> des;
+    if (hint == HINT_INVALID) {
+        des = chooseEngine(target, lits, make_small);
+    } else {
+        des = getFdrDescription(hint);
+    }
+
+    if (!des) {
+        return nullptr;
+    }
+
+    FDRCompiler compiler(lits, *des, make_small);
+    return compiler.build(link);
+}
+
+/** \brief Builds a table for \a lits without an engine hint; forwards to
+ * fdrBuildTableInternal() with HINT_INVALID. */
+aligned_unique_ptr<FDR> fdrBuildTable(const vector<hwlmLiteral> &lits,
+                                      bool make_small, const target_t &target,
+                                      const Grey &grey,
+                                      hwlmStreamingControl *stream_control) {
+    const u32 noHint = HINT_INVALID;
+    return fdrBuildTableInternal(lits, make_small, target, grey, noHint,
+                                 stream_control);
+}
+
+#if !defined(RELEASE_BUILD)
+
+/**
+ * \brief Builds a table for \a lits using the engine identified by \a hint.
+ *
+ * Only available in non-release builds. Forwards to fdrBuildTableInternal(),
+ * which treats a \a hint of HINT_INVALID as "choose automatically".
+ */
+aligned_unique_ptr<FDR>
+fdrBuildTableHinted(const vector<hwlmLiteral> &lits, bool make_small, u32 hint,
+                    const target_t &target, const Grey &grey,
+                    hwlmStreamingControl *stream_control) {
+    return fdrBuildTableInternal(lits, make_small, target, grey, hint,
+                                 stream_control);
+}
+
+#endif
+
+} // namespace ue2
+
+// FIXME: should be compile-time only
+/** \brief Returns the size field stored in \a fdr, which must be non-null. */
+size_t fdrSize(const FDR *fdr) {
+    assert(fdr);
+    const size_t totalBytes = fdr->size;
+    return totalBytes;
+}
--- a/src/fdr/fdr_compile.h
+++ b/src/fdr/fdr_compile.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: build API.
+ */
+
+#ifndef FDR_COMPILE_H
+#define FDR_COMPILE_H
+
+#include "ue2common.h"
+#include "util/alloc.h"
+
+#include <vector>
+
+struct FDR;
+
+namespace ue2 {
+
+struct hwlmLiteral;
+struct hwlmStreamingControl;
+struct Grey;
+struct target_t;
+
+/**
+ * \brief Builds an FDR table for the given literals, choosing the engine
+ * automatically.
+ *
+ * \param lits           Literals to compile.
+ * \param make_small     Passed through to the engine selection and confirm
+ *                       construction.
+ * \param target         Target platform description.
+ * \param grey           Grey-box settings; fdrAllowTeddy enables an attempt at
+ *                       a Teddy build first.
+ * \param stream_control If non-null, a streaming table is built as well.
+ * \return The built structure; may be null if no engine could be selected.
+ */
+ue2::aligned_unique_ptr<FDR>
+fdrBuildTable(const std::vector<hwlmLiteral> &lits, bool make_small,
+              const target_t &target, const Grey &grey,
+              hwlmStreamingControl *stream_control = nullptr);
+
+#if !defined(RELEASE_BUILD)
+
+/**
+ * \brief As fdrBuildTable(), but with the engine selected by \a hint.
+ *
+ * Only declared when RELEASE_BUILD is not defined.
+ */
+ue2::aligned_unique_ptr<FDR>
+fdrBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
+                    u32 hint, const target_t &target, const Grey &grey,
+                    hwlmStreamingControl *stream_control = nullptr);
+
+#endif
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr_compile_internal.h
+++ b/src/fdr/fdr_compile_internal.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_COMPILE_INTERNAL_H
+#define FDR_COMPILE_INTERNAL_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm_literal.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+struct FDRConfirm;
+struct LitInfo;
+
+namespace ue2 {
+
+// a pile of decorative typedefs
+// good for documentation purposes more than anything else
+typedef u32 LiteralIndex;
+typedef u32 ConfirmIndex;
+typedef u32 SuffixPositionInString; // zero is last byte, counting back
+                                    // into the string
+typedef u32 BucketIndex;
+typedef u32 SchemeBitIndex;
+typedef u32 PositionInBucket;  // zero is "we are matching right now!",
+                               // counting towards future matches
+
+class EngineDescription;
+class FDREngineDescription;
+struct hwlmStreamingControl;
+
+size_t getFDRConfirm(const std::vector<hwlmLiteral> &lits, FDRConfirm **fdrc_p,
+                     bool make_small);
+
+std::pair<u8 *, size_t> setupFullMultiConfs(
+    const std::vector<hwlmLiteral> &lits, const EngineDescription &eng,
+    std::map<BucketIndex, std::vector<LiteralIndex> > &bucketToLits,
+    bool make_small);
+
+// all suffixes include an implicit max_bucket_width suffix to ensure that
+// we always read a full-scale flood "behind" us in terms of what's in our
+// state; if we don't have a flood that's long enough we won't be in the
+// right state yet to allow blindly advancing
+std::pair<u8 *, size_t>
+setupFDRFloodControl(const std::vector<hwlmLiteral> &lits,
+                     const EngineDescription &eng);
+
+std::pair<u8 *, size_t>
+fdrBuildTableStreaming(const std::vector<hwlmLiteral> &lits,
+                       hwlmStreamingControl *stream_control);
+
+static constexpr u32 HINT_INVALID = 0xffffffff;
+
+// fdr_compile_util.cpp utilities
+
+// Length of the longest literal in lits; 0 if lits is empty.
+size_t maxLen(const std::vector<hwlmLiteral> &lits);
+// Length of the shortest literal in lits, with *count set to the number of
+// literals of that length. For empty lits returns (size_t)-1 and *count is 0.
+size_t minLenCount(const std::vector<hwlmLiteral> &lits, size_t *count);
+// Absolute difference |i - j|.
+u32 absdiff(u32 i, u32 j);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr_compile_util.cpp
+++ b/src/fdr/fdr_compile_util.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr_compile_internal.h"
+#include "hwlm/hwlm_literal.h"
+
+#include <algorithm>
+#include <vector>
+
+using namespace std;
+
+namespace ue2 {
+
+/** \brief Returns the length of the longest literal in \a lits, or 0 when
+ * \a lits is empty. */
+size_t maxLen(const vector<hwlmLiteral> &lits) {
+    size_t longest = 0;
+    for (vector<hwlmLiteral>::const_iterator it = lits.begin();
+         it != lits.end(); ++it) {
+        const size_t len = it->s.size();
+        if (len > longest) {
+            longest = len;
+        }
+    }
+    return longest;
+}
+
+/**
+ * \brief Finds the shortest literal length in \a lits.
+ *
+ * \param lits  Literals to inspect.
+ * \param count Output: number of literals that have the shortest length
+ *              (0 when \a lits is empty).
+ * \return The shortest length, or (size_t)-1 when \a lits is empty.
+ */
+size_t minLenCount(const vector<hwlmLiteral> &lits, size_t *count) {
+    size_t shortest = (size_t)-1;
+    size_t ties = 0;
+    for (const auto &lit : lits) {
+        const size_t len = lit.s.size();
+        if (len > shortest) {
+            continue; // longer than the current minimum: irrelevant
+        }
+        if (len < shortest) {
+            // New minimum: restart the tally.
+            shortest = len;
+            ties = 0;
+        }
+        ties++;
+    }
+    *count = ties;
+    return shortest;
+}
+
+/** \brief Returns the absolute difference between \a i and \a j. */
+u32 absdiff(u32 i, u32 j) {
+    if (i > j) {
+        return i - j;
+    }
+    return j - i;
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_confirm.h
+++ b/src/fdr/fdr_confirm.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_CONFIRM_H
+#define FDR_CONFIRM_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm.h"
+
+/** \brief Multiplicative hash: masks \a lv with \a andmsk, multiplies by
+ * \a mult (64-bit wrap-around) and returns the top \a nBits bits of the
+ * product. */
+static really_inline
+u32 mul_hash_64(u64a lv, u64a andmsk, u64a mult, u32 nBits) {
+    const u64a masked = lv & andmsk;
+    const u64a product = masked * mult;
+    const u64a shift = sizeof(u64a) * 8 - nBits;
+    return (u32)(product >> shift);
+}
+
+// data structures
+// TODO: fix this hard-coding
+#define CONF_TYPE u64a
+#define CONF_HASH_CALL mul_hash_64
+
+/** \brief Flag values for LitInfo::flags. The non-zero values are distinct
+ * bits. */
+typedef enum LitInfoFlags {
+    NoFlags = 0,
+    Caseless = 1,
+    NoRepeat = 2,
+    ComplexConfirm = 4
+} LitInfoFlags;
+
+/**
+ * \brief Structure describing a literal, linked to by FDRConfirm.
+ *
+ * This structure is followed in memory by a variable-sized string prefix at
+ * LitInfo::s, for strings that are longer than CONF_TYPE.
+ */
+struct LitInfo {
+    // NOTE(review): v/msk are presumably the value and mask compared against
+    // the input during confirm - verify against the confirm runtime.
+    CONF_TYPE v;
+    CONF_TYPE msk;
+    hwlm_group_t groups;
+    u32 size;
+    u32 id; // literal ID as passed in
+    u8 flags; /* LitInfoFlags */
+    u8 next;
+    u8 extended_size;
+    u8 s[1]; // literal prefix, which continues "beyond" this struct.
+};
+
+#define FDRC_FLAG_NO_CONFIRM 1
+
+/**
+ * \brief FDR confirm header.
+ *
+ * This structure is followed in memory by:
+ *
+ * -# lit index mapping (array of u32)
+ * -# list of LitInfo structures
+ */
+struct FDRConfirm {
+    // NOTE(review): andmsk/mult presumably feed CONF_HASH_CALL (mul_hash_64),
+    // whose parameters carry the same names - confirm at the call site.
+    CONF_TYPE andmsk;
+    CONF_TYPE mult;
+    u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID
+    u32 flags;  // sole meaning is 'non-zero means no-confirm' (that is all)
+    hwlm_group_t groups;
+    u32 soleLitSize;
+    u32 soleLitCmp;
+    u32 soleLitMsk;
+};
+
+/** \brief Returns a pointer to the u32 array that follows the FDRConfirm
+ * header in memory, starting at sizeof(*fdrc) rounded up to the alignment of
+ * u32. */
+static really_inline
+const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) {
+    const u8 *afterHeader =
+        (const u8 *)fdrc + ROUNDUP_N(sizeof(*fdrc), alignof(u32));
+    const u32 *litIndex = (const u32 *)afterHeader;
+    assert(ISALIGNED(litIndex));
+    return litIndex;
+}
+
+#endif // FDR_CONFIRM_H
--- a/src/fdr/fdr_confirm_compile.cpp
+++ b/src/fdr/fdr_confirm_compile.cpp
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_confirm.h"
+#include "engine_description.h"
+#include "teddy_engine_description.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/compare.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cstring>
+#include <set>
+
+using namespace std;
+
+namespace ue2 {
+
+// Value of the (masked) literal character used for the top-level confirm
+// split.
+typedef u8 ConfSplitType;
+// Key identifying one confirm structure: (bucket, split value).
+typedef pair<BucketIndex, ConfSplitType> BucketSplitPair;
+// Maps each (bucket, split value) to its built FDRConfirm and its size in
+// bytes.
+typedef map<BucketSplitPair, pair<FDRConfirm *, size_t> > BC2CONF;
+
+// Sum, over all strings in lits, the number of bytes by which each string
+// exceeds the length threshold; each string's excess is rounded up to a
+// multiple of 8 before being added.
+static
+size_t thresholdedSize(const vector<hwlmLiteral> &lits, size_t threshold) {
+    size_t total = 0;
+    for (const hwlmLiteral &lit : lits) {
+        const size_t len = lit.s.size();
+        if (len <= threshold) {
+            continue; // nothing beyond the threshold for this literal
+        }
+        total += ROUNDUP_N(len - threshold, 8);
+    }
+    return total;
+}
+
+/**
+ * Pack the bytes of v into a u64a so that the last byte of v occupies the
+ * highest-addressed byte of the result and any unused leading bytes are zero.
+ *
+ * An empty vector yields zero.
+ *
+ * \throw std::exception if v holds more than sizeof(u64a) bytes.
+ */
+static
+u64a make_u64a_mask(const vector<u8> &v) {
+    assert(v.size() <= sizeof(u64a));
+    if (v.size() > sizeof(u64a)) {
+        throw std::exception();
+    }
+
+    u64a mask = 0;
+    const size_t len = v.size(); // known to be <= sizeof(mask) here
+    if (!len) {
+        // Nothing to copy; also avoids indexing into an empty vector.
+        return mask;
+    }
+
+    unsigned char *m = (unsigned char *)&mask;
+    memcpy(m + sizeof(mask) - len, v.data(), len);
+    return mask;
+}
+
+/**
+ * Build a temporary vector of LitInfo structures (without the corresponding
+ * pointers to the actual strings; these cannot be laid out yet). These
+ * stay in 1:1 correspondence with the lits[] vector as that's the only
+ * place we have to obtain our full strings.
+ *
+ * \param lits       literals to describe.
+ * \param tmpLitInfo output; entry i is overwritten with the description of
+ *                   lits[i], so it must hold at least lits.size() entries.
+ * \param andmsk     output; the 'and' of the final masks of all literals.
+ */
+static
+void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
+                 CONF_TYPE &andmsk) {
+    const CONF_TYPE all_ones = ~(u64a)0;
+    andmsk = all_ones; // fill in with 'and' of all literal masks
+
+    for (LiteralIndex i = 0; i < lits.size(); i++) {
+        const hwlmLiteral &lit = lits[i];
+        LitInfo &info = tmpLitInfo[i];
+        memset(&info, 0, sizeof(info));
+        info.id = lit.id;
+        u8 flags = NoFlags;
+        if (lit.nocase) {
+            flags |= Caseless;
+        }
+        if (lit.noruns) {
+            flags |= NoRepeat;
+        }
+        // A mask longer than the literal itself is flagged as ComplexConfirm
+        // and its length recorded.
+        if (lit.msk.size() > lit.s.size()) {
+            flags |= ComplexConfirm;
+            info.extended_size = verify_u8(lit.msk.size());
+        }
+        info.flags = flags;
+        info.size = verify_u32(lit.s.size());
+        info.groups = lit.groups;
+
+        // these are built up assuming a LE machine
+        // The last character of the literal goes into the most significant
+        // byte, the one before it into the next byte down, and so on.
+        CONF_TYPE msk = all_ones;
+        CONF_TYPE val = 0;
+        for (u32 j = 0; j < sizeof(CONF_TYPE); j++) {
+            u32 shiftLoc = (sizeof(CONF_TYPE) - j - 1) * 8;
+            if (j >= lit.s.size()) {
+                // literal is shorter than CONF_TYPE: ignore this byte
+                msk &= ~((CONF_TYPE)0xff << shiftLoc);
+            } else {
+                u8 c = lit.s[lit.s.size() - j - 1];
+                if (lit.nocase && ourisalpha(c)) {
+                    // caseless alpha: drop the case bit from both mask and
+                    // value so either case compares equal
+                    msk &= ~((CONF_TYPE)CASE_BIT << shiftLoc);
+                    val |= (CONF_TYPE)(c & CASE_CLEAR) << shiftLoc;
+                } else {
+                    val |= (CONF_TYPE)c << shiftLoc;
+                }
+            }
+        }
+
+        info.v = val;
+        info.msk = msk;
+        if (!lit.msk.empty()) {
+            u64a l_msk = make_u64a_mask(lit.msk);
+            u64a l_cmp = make_u64a_mask(lit.cmp);
+
+            // test for consistency - if there's intersection, then v and msk
+            // values must line up
+            UNUSED u64a intersection = l_msk & info.msk;
+            assert((info.v & intersection) == (l_cmp & intersection));
+
+            // incorporate lit.msk, lit.cmp into v and msk
+            info.msk |= l_msk;
+            info.v |= l_cmp;
+        }
+
+        andmsk &= info.msk;
+    }
+}
+
+//#define FDR_CONFIRM_DUMP 1
+
+/**
+ * Build one FDRConfirm structure (header, lit index mapping and LitInfo
+ * list) for the given literals.
+ *
+ * \param lits            literals to confirm; lits[0] is read
+ *                        unconditionally in the no-confirm path.
+ * \param fdrc_p          output; receives the buffer obtained from
+ *                        aligned_zmalloc.
+ * \param applyOneCharOpt allows the no-confirm form for a single, empty,
+ *                        mask-less literal.
+ * \param make_small      use fewer hash bits (capped at 10 rather than 13).
+ * \param make_confirm    if false, the no-confirm form is always used.
+ * \return number of bytes actually used, rounded up to alignof(FDRConfirm).
+ */
+static
+size_t getFDRConfirm(const vector<hwlmLiteral> &lits, FDRConfirm **fdrc_p,
+                     bool applyOneCharOpt, bool make_small, bool make_confirm) {
+    vector<LitInfo> tmpLitInfo(lits.size());
+    CONF_TYPE andmsk;
+    fillLitInfo(lits, tmpLitInfo, andmsk);
+
+#ifdef FDR_CONFIRM_DUMP
+    printf("-------------------\n");
+#endif
+
+    // just magic numbers and crude measures for now
+    u32 nBits;
+    if (make_small) {
+        nBits = min(10U, lg2(lits.size()) + 1);
+    } else {
+        nBits = min(13U, lg2(lits.size()) + 4);
+    }
+
+    CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
+    u32 flags = 0;
+    // we use next three variables for 'confirmless' case to speed-up
+    // confirmation process
+    u32 soleLitSize = 0;
+    u32 soleLitCmp = 0;
+    u32 soleLitMsk = 0;
+
+    // NOTE(review): this branch reads lits[0] and so assumes lits is
+    // non-empty -- confirm against callers. Also, when lits[0].s is empty,
+    // 'lits[0].s.size() - 1' below wraps around; verify that soleLitSize is
+    // never consumed in that case.
+    if ((applyOneCharOpt && lits.size() == 1 && lits[0].s.size() == 0 &&
+            lits[0].msk.empty()) || make_confirm == false) {
+        flags = FDRC_FLAG_NO_CONFIRM;
+        if (lits[0].noruns) {
+            flags |= NoRepeat; // messy - need to clean this up later as flags is sorta kinda obsoleted
+        }
+        mult = 0;
+        soleLitSize = lits[0].s.size() - 1;
+        // we can get to this point only in confirmless case;
+        // it means that we have only one literal per FDRConfirm (no packing),
+        // with no literal mask and size of literal is less or equal
+        // to the number of masks of Teddy engine;
+        // maximum number of masks for Teddy is 4, so the size of
+        // literal is definitely less or equal to size of u32
+        assert(lits[0].s.size() <= sizeof(u32));
+        // Pack the literal into soleLitCmp/soleLitMsk with its last
+        // character in the most significant byte.
+        for (u32 i = 0; i < lits[0].s.size(); i++) {
+            u32 shiftLoc = (sizeof(u32) - i - 1) * 8;
+            u8 c = lits[0].s[lits[0].s.size() - i - 1];
+            if (lits[0].nocase && ourisalpha(c)) {
+                soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc;
+                soleLitMsk |= (u32)CASE_CLEAR << shiftLoc;
+            }
+            else {
+                soleLitCmp |= (u32)c << shiftLoc;
+                soleLitMsk |= (u32)0xff << shiftLoc;
+            }
+        }
+    }
+
+    // we can walk the vector and assign elements from the vectors to a
+    // map by hash value
+    map<u32, vector<LiteralIndex> > res2lits;
+    hwlm_group_t gm = 0; // union of the groups of all literals
+    for (LiteralIndex i = 0; i < lits.size(); i++) {
+        LitInfo & li = tmpLitInfo[i];
+        u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
+        DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
+        res2lits[hash].push_back(i);
+        gm |= li.groups;
+    }
+
+#ifdef FDR_CONFIRM_DUMP
+    // print out the literals reversed - makes it easier to line up analyses
+    // that are end-offset based
+    for (map<u32, vector<LiteralIndex> >::iterator i = res2lits.begin(),
+         e = res2lits.end(); i != e; ++i) {
+        u32 hash = i->first;
+        vector<LiteralIndex> & vlidx = i->second;
+        if (vlidx.size() > 1) {
+            printf("%x -> %zu literals\n", hash, vlidx.size());
+            u32 min_len = lits[vlidx.front()].s.size();
+            vector<set<u8> > vsl; // contains the set of chars at each location
+                                  // reversed from the end
+            vsl.resize(1024);
+            u32 total_string_size = 0;
+            for (vector<LiteralIndex>::iterator i2 = vlidx.begin(),
+                 e2 = vlidx.end(); i2 != e2; ++i2) {
+                LiteralIndex litIdx = *i2;
+                total_string_size += lits[litIdx].s.size();
+                for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) {
+                    vsl[lits[litIdx].s.size()-j].insert(lits[litIdx].s.c_str()[j - 1]);
+                }
+                min_len = MIN(min_len, lits[litIdx].s.size());
+            }
+            printf("common     ");
+            for (u32 j = 0; j < min_len; j++) {
+                if (vsl[j].size() == 1) {
+                    printf("%02x", (u32)*vsl[j].begin());
+                } else {
+                    printf("__");
+                }
+            }
+            printf("\n");
+            for (vector<LiteralIndex>::iterator i2 = vlidx.begin(),
+                 e2 = vlidx.end(); i2 != e2; ++i2) {
+                LiteralIndex litIdx = *i2;
+                printf("%8x  %c", lits[litIdx].id, lits[litIdx].nocase ? '!' : ' ');
+                for (u32 j = lits[litIdx].s.size(); j != 0 ; j--) {
+                    u32 dist_from_end = lits[litIdx].s.size() - j;
+                    if (dist_from_end < min_len && vsl[dist_from_end].size() == 1) {
+                        printf("__");
+                    } else {
+                        printf("%02x", (u32)lits[litIdx].s.c_str()[j-1]);
+                    }
+                }
+                printf("\n");
+            }
+            u32 total_compares = 0;
+            for (u32 j = 0; j < 1024; j++) { // naughty
+                total_compares += vsl[j].size();
+            }
+            printf("Total compare load: %d Total string size: %d\n\n", total_compares, total_string_size);
+        }
+    }
+#endif
+
+    // One u32 slot per possible hash value.
+    const size_t bitsToLitIndexSize = (1U << nBits) * sizeof(u32);
+    const size_t totalLitSize = thresholdedSize(lits, sizeof(CONF_TYPE));
+
+    // this size can now be a worst-case as we can always be a bit smaller
+    size_t size = ROUNDUP_N(sizeof(FDRConfirm), alignof(u32)) +
+                  ROUNDUP_N(bitsToLitIndexSize, alignof(LitInfo)) +
+                  sizeof(LitInfo) * lits.size() + totalLitSize;
+    size = ROUNDUP_N(size, alignof(FDRConfirm));
+
+    FDRConfirm *fdrc = (FDRConfirm *)aligned_zmalloc(size);
+    assert(fdrc); // otherwise would have thrown std::bad_alloc
+
+    fdrc->andmsk = andmsk;
+    fdrc->mult = mult;
+    fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits;
+    fdrc->flags = flags;
+    fdrc->soleLitSize = soleLitSize;
+    fdrc->soleLitCmp = soleLitCmp;
+    fdrc->soleLitMsk = soleLitMsk;
+
+    fdrc->groups = gm;
+
+    // After the FDRConfirm, we have the lit index array.
+    u8 *fdrc_base = (u8 *)fdrc;
+    u8 *ptr = fdrc_base + sizeof(*fdrc);
+    ptr = ROUNDUP_PTR(ptr, alignof(u32));
+    u32 *bitsToLitIndex = (u32 *)ptr;
+    ptr += bitsToLitIndexSize;
+
+    // After the lit index array, we have the LitInfo structures themselves,
+    // which vary in size (as each may have a variable-length string after it).
+    ptr = ROUNDUP_PTR(ptr, alignof(LitInfo));
+
+    // Walk the map by hash value assigning indexes and laying out the
+    // elements (and their associated string confirm material) in memory.
+    // Slots for hash values with no literals are never written here
+    // (presumably left zero by aligned_zmalloc -- confirm).
+    for (std::map<u32, vector<LiteralIndex> >::const_iterator
+             i = res2lits.begin(), e = res2lits.end(); i != e; ++i) {
+        const u32 hash = i->first;
+        const vector<LiteralIndex> &vlidx = i->second;
+        // Index entries are byte offsets from the start of the FDRConfirm.
+        bitsToLitIndex[hash] = verify_u32(ptr - (u8 *)fdrc);
+        for (vector<LiteralIndex>::const_iterator i2 = vlidx.begin(),
+             e2 = vlidx.end(); i2 != e2; ++i2) {
+            LiteralIndex litIdx = *i2;
+
+            // Write LitInfo header.
+            u8 *oldPtr = ptr;
+            LitInfo &finalLI = *(LitInfo *)ptr;
+            finalLI = tmpLitInfo[litIdx];
+
+            ptr += sizeof(LitInfo); // String starts directly after LitInfo.
+
+            // Write literal prefix (everything before the last N characters,
+            // as the last N are already confirmed).
+            const string &t = lits[litIdx].s;
+            if (t.size() > sizeof(CONF_TYPE)) {
+                size_t prefix_len = t.size() - sizeof(CONF_TYPE);
+                memcpy(&finalLI.s[0], t.c_str(), prefix_len);
+                ptr = &finalLI.s[0] + prefix_len;
+            }
+
+            ptr = ROUNDUP_PTR(ptr, alignof(LitInfo));
+            if (i2 + 1 == e2) {
+                // last literal for this hash value: terminate the chain
+                finalLI.next = 0x0;
+            } else {
+                // our next field represents an adjustment on top of
+                // current address + the actual size of the literal
+                // so we track any rounding up done for alignment and
+                // add this in - that way we don't have to use bigger
+                // than a u8 (for now)
+                assert((size_t)(ptr - oldPtr) > t.size());
+                finalLI.next = verify_u8(ptr - oldPtr - t.size());
+            }
+        }
+        assert((size_t)(ptr - fdrc_base) <= size);
+    }
+
+    *fdrc_p = fdrc;
+
+    // Return actual used size, not worst-case size. Must be rounded up to
+    // FDRConfirm alignment so that the caller can lay out a sequence of these.
+    size_t actual_size = ROUNDUP_N((size_t)(ptr - fdrc_base),
+                                   alignof(FDRConfirm));
+    assert(actual_size <= size);
+    return actual_size;
+}
+
+/**
+ * Build an FDRConfirm for every non-empty (bucket, split value) combination.
+ *
+ * Literals in each bucket are partitioned by their last character masked
+ * with the engine's top-level split; each partition gets its own FDRConfirm
+ * via getFDRConfirm().
+ *
+ * \param lits         all literals.
+ * \param eng          engine description supplying pull-back distance, split
+ *                     and bucket count.
+ * \param bc2Conf      output; maps (bucket, split value) to the built
+ *                     FDRConfirm and its size.
+ * \param bucketToLits indices into lits for each bucket. Accessed with
+ *                     operator[], so missing buckets get an empty entry
+ *                     inserted.
+ * \param make_small   forwarded to getFDRConfirm().
+ * \return sum of the sizes of all FDRConfirm structures built.
+ */
+static
+u32 setupMultiConfirms(const vector<hwlmLiteral> &lits,
+                       const EngineDescription &eng, BC2CONF &bc2Conf,
+                       map<BucketIndex, vector<LiteralIndex> > &bucketToLits,
+                       bool make_small) {
+    u32 pullBack = eng.getConfirmPullBackDistance();
+    u32 splitMask = eng.getConfirmTopLevelSplit() - 1;
+    // true if the split distinguishes on bit 0x20 (the ASCII case bit)
+    bool splitHasCase = splitMask & 0x20;
+
+    // Confirm is always built unless a Teddy description for this engine ID
+    // exists and reports that it is not needed.
+    bool makeConfirm = true;
+    unique_ptr<TeddyEngineDescription> teddyDescr =
+        getTeddyDescription(eng.getID());
+    if (teddyDescr) {
+        makeConfirm = teddyDescr->needConfirm(lits);
+    }
+
+    u32 totalConfirmSize = 0;
+    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
+        if (!bucketToLits[b].empty()) {
+            // vl[x] collects the literals whose split value is x
+            vector<vector<hwlmLiteral> > vl(eng.getConfirmTopLevelSplit());
+            for (vector<LiteralIndex>::const_iterator
+                     i = bucketToLits[b].begin(),
+                     e = bucketToLits[b].end();
+                 i != e; ++i) {
+                hwlmLiteral lit = lits[*i]; // copy
+                // c is last char of this literal
+                // NOTE(review): assumes lit.s is non-empty -- confirm
+                u8 c = *(lit.s.rbegin());
+
+                bool suppressSplit = false;
+                if (pullBack) {
+                    // make a shorter string to work over if we're pulling back
+                    // getFDRConfirm doesn't know about that stuff
+                    assert(lit.s.size() >= pullBack);
+                    lit.s.resize(lit.s.size() - pullBack);
+
+                    u8 c_sub, c_sub_msk;
+                    if (lit.msk.empty()) {
+                        c_sub = 0;
+                        c_sub_msk = 0;
+                    } else {
+                        // remember the last cmp/msk byte, then trim the mask
+                        // by the pull-back distance as well
+                        c_sub = *(lit.cmp.rbegin());
+                        c_sub_msk = *(lit.msk.rbegin());
+                        size_t len = lit.msk.size() -
+                                     min(lit.msk.size(), (size_t)pullBack);
+                        lit.msk.resize(len);
+                        lit.cmp.resize(len);
+                    }
+
+                    // if c_sub_msk is 0xff and lit.nocase
+                    // resteer 'c' to an exact value and set suppressSplit
+                    if ((c_sub_msk == 0xff) && (lit.nocase)) {
+                        suppressSplit = true;
+                        c = c_sub;
+                    }
+                }
+
+                // A caseless alpha last character can appear in either case,
+                // so the literal is filed under both split values.
+                if (!suppressSplit && splitHasCase && lit.nocase &&
+                    ourisalpha(c)) {
+                    vl[(u8)(mytoupper(c) & splitMask)].push_back(lit);
+                    vl[(u8)(mytolower(c) & splitMask)].push_back(lit);
+                } else {
+                    vl[c & splitMask].push_back(lit);
+                }
+            }
+
+            for (u32 c = 0; c < eng.getConfirmTopLevelSplit(); c++) {
+                if (!vl[c].empty()) {
+                    DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size());
+                    FDRConfirm *fdrc;
+                    size_t size = getFDRConfirm(vl[c], &fdrc,
+                                                eng.typicallyHoldsOneCharLits(),
+                                                make_small, makeConfirm);
+                    BucketSplitPair p = make_pair(b, c);
+                    bc2Conf[p] = make_pair(fdrc, size);
+                    totalConfirmSize += size;
+                }
+            }
+        }
+    }
+    return totalConfirmSize;
+}
+
+/**
+ * Build the complete confirm region: a table of u32 offsets indexed by
+ * (split value * number of buckets + bucket), followed by the FDRConfirm
+ * structures themselves.
+ *
+ * Returns the buffer (obtained from aligned_zmalloc) together with its size
+ * in bytes. Offsets in the table are relative to the start of the buffer.
+ */
+pair<u8 *, size_t> setupFullMultiConfs(const vector<hwlmLiteral> &lits,
+        const EngineDescription &eng,
+        map<BucketIndex, vector<LiteralIndex> > &bucketToLits,
+        bool make_small) {
+    BC2CONF confirms;
+    u32 totalConfirmSize = setupMultiConfirms(lits, eng, confirms,
+                                              bucketToLits, make_small);
+
+    u32 primarySwitch = eng.getConfirmTopLevelSplit();
+    u32 nBuckets = eng.getNumBuckets();
+    u32 totalConfSwitchSize = primarySwitch * nBuckets * sizeof(u32);
+    u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize);
+
+    u8 *buf = (u8 *)aligned_zmalloc(totalSize);
+    assert(buf); // otherwise would have thrown std::bad_alloc
+
+    // The offset table sits at the front; confirm structures follow it.
+    u32 *offsetTable = (u32 *)buf;
+    u8 *out = buf + totalConfSwitchSize;
+
+    for (const auto &entry : confirms) {
+        FDRConfirm *fdrc = entry.second.first;
+        const size_t fdrcSize = entry.second.second;
+
+        // confirm offset is relative to the base of this structure, now
+        u32 confirm_offset = verify_u32(out - (u8 *)buf);
+
+        // Move the temporary structure into place and release it.
+        memcpy(out, fdrc, fdrcSize);
+        out += fdrcSize;
+        aligned_free(fdrc);
+
+        BucketIndex bucket = entry.first.first;
+        u8 split = entry.first.second;
+        u32 slot = split * nBuckets + bucket;
+        offsetTable[slot] = confirm_offset;
+    }
+    return make_pair(buf, totalSize);
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_confirm_runtime.h
+++ b/src/fdr/fdr_confirm_runtime.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_CONFIRM_RUNTIME_H
+#define FDR_CONFIRM_RUNTIME_H
+
+#include "fdr_internal.h"
+#include "fdr_loadval.h"
+#include "hwlm/hwlm.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/compare.h"
+
+// Loaders used by confWithBit() to fetch the confirm word: the plain form is
+// used when the load location is known to start inside the buffer (or no
+// caution is required), the _CAUTIOUS form when it starts before the buffer.
+#define CONF_LOADVAL_CALL lv_u64a
+#define CONF_LOADVAL_CALL_CAUTIOUS lv_u64a_ce
+
+// this is ordinary confirmation function which runs through
+// the whole confirmation procedure
+//
+// fdrc:           confirm structure to check against
+// a:              runtime arguments (buffer, history, callback, context)
+// i:              offset in a->buf of the candidate match end
+// r:              NOT_CAUTIOUS, or a reason why loads must be cautious
+// pullBackAmount: distance to step back from i before confirming
+// control:        in/out; group mask tested against each literal, and
+//                 overwritten with the callback's return value on a match
+// last_match:     in/out; ID of the most recent match, used for NoRepeat
+static really_inline
+void confWithBit(const struct FDRConfirm * fdrc,
+                 const struct FDR_Runtime_Args * a,
+                 size_t i,
+                 CautionReason r,
+                 u32 pullBackAmount,
+                 hwlmcb_rv_t *control,
+                 u32 * last_match) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    const u8 * buf = a->buf;
+    const size_t len = a->len;
+
+    // Load the confirm word whose final byte is at buf[i - pullBackAmount].
+    CONF_TYPE v;
+    const u8 * confirm_loc = buf + i - pullBackAmount - 7;
+    if (likely(r == NOT_CAUTIOUS || confirm_loc >= buf)) {
+        v = CONF_LOADVAL_CALL(confirm_loc, buf, buf + len);
+    } else { // r == VECTORING, confirm_loc < buf
+        u64a histBytes = a->histBytes;
+        v = CONF_LOADVAL_CALL_CAUTIOUS(confirm_loc, buf, buf + len);
+        // stitch together v (which doesn't move) and history (which does)
+        // NOTE(review): the shift below presumes overhang is in 1..8 --
+        // confirm that pullBackAmount cannot push it beyond that.
+        u32 overhang = buf - confirm_loc;
+        histBytes >>= 64 - (overhang * 8);
+        v |= histBytes;
+    }
+
+    // Hash the confirm word to find the start of the literal chain, stored
+    // as a byte offset from fdrc.
+    u32 c = CONF_HASH_CALL(v, fdrc->andmsk, fdrc->mult, fdrc->nBitsOrSoleID);
+    u32 start = getConfirmLitIndex(fdrc)[c];
+    if (P0(start)) {
+        const struct LitInfo *l =
+            (const struct LitInfo *)((const u8 *)fdrc + start);
+
+        u8 oldNext; // initialized in loop
+        do {
+            assert(ISALIGNED(l));
+
+            // cheap reject: masked confirm word must equal the literal's
+            if (P0( (v & l->msk) != l->v)) {
+                goto out;
+            }
+
+            if ((*last_match == l->id) && (l->flags & NoRepeat)) {
+                goto out;
+            }
+
+            // loc is where the literal would start in the buffer
+            const u8 * loc = buf + i - l->size + 1 - pullBackAmount;
+
+            u8 caseless = l->flags & Caseless;
+            if (loc < buf) {
+                // literal starts before the buffer: part of it must be
+                // matched against history
+                u32 full_overhang = buf - loc;
+
+                const u8 * history = (caseless) ?
+                                      a->buf_history_nocase : a->buf_history;
+                size_t len_history = (caseless) ?
+                                      a->len_history_nocase : a->len_history;
+
+                // can't do a vectored confirm either if we don't have
+                // the bytes
+                if (full_overhang > len_history) {
+                    goto out;
+                }
+
+                // as for the regular case, no need to do a full confirm if
+                // we're a short literal
+                if (unlikely(l->size > sizeof(CONF_TYPE))) {
+                    // s1/loc1/size1: portion of the prefix lying in history
+                    // s2/loc2/size2: portion of the prefix lying in buf
+                    const u8 * s1 = l->s;
+                    const u8 * s2 = s1 + full_overhang;
+                    const u8 * loc1 = history + len_history - full_overhang;
+                    const u8 * loc2 = buf;
+                    size_t size1 = MIN(full_overhang,
+                                       l->size - sizeof(CONF_TYPE));
+                    size_t wind_size2_back = sizeof(CONF_TYPE) +
+                                             full_overhang;
+                    size_t size2 = wind_size2_back > l->size ?
+                                   0 : l->size - wind_size2_back;
+
+                    if (cmpForward(loc1, s1, size1, caseless)) {
+                        goto out;
+                    }
+                    if (cmpForward(loc2, s2, size2, caseless)) {
+                        goto out;
+                    }
+                }
+            } else { // NON-VECTORING PATH
+
+                // if string < conf_type we don't need regular string cmp
+                if (unlikely(l->size > sizeof(CONF_TYPE))) {
+                    if (cmpForward(loc, l->s, l->size - sizeof(CONF_TYPE), caseless)) {
+                        goto out;
+                    }
+                }
+            }
+
+            // skip literals whose groups are all switched off
+            if (P0(!(l->groups & *control))) {
+                goto out;
+            }
+
+            // For a mask longer than the literal, only check that enough
+            // history exists to cover the mask's extent; no bytes are
+            // compared here.
+            if (unlikely(l->flags & ComplexConfirm)) {
+                const u8 * loc2 = buf + i - l->extended_size + 1 - pullBackAmount;
+                if (loc2 < buf) {
+                    u32 full_overhang = buf - loc2;
+                    size_t len_history = (caseless) ?
+                                          a->len_history_nocase : a->len_history;
+                    if (full_overhang > len_history) {
+                        goto out;
+                    }
+                }
+            }
+
+            *last_match = l->id;
+            *control = a->cb(loc - buf, i, l->id, a->ctxt);
+out:
+            // advance to the next literal in this hash chain, if any
+            oldNext = l->next; // oldNext is either 0 or an 'adjust' value
+            l = (const struct LitInfo*)((const u8 *)l + oldNext + l->size);
+        } while (oldNext);
+    }
+}
+
+// Light-weight confirm used by 1-mask Teddy.
+// A non-zero fdrc->mult means a full confirm is required, which is delegated
+// to confWithBit with no pull-back. Otherwise (the 'confirmless' case) the
+// sole literal's callback is invoked directly, unless it would repeat the
+// previous match and NoRepeat is set.
+static really_inline
+void confWithBit1(const struct FDRConfirm * fdrc,
+                  const struct FDR_Runtime_Args * a,
+                  size_t i,
+                  CautionReason r,
+                  hwlmcb_rv_t *control,
+                  u32 * last_match) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    if (unlikely(fdrc->mult)) {
+        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        return;
+    }
+
+    // confirmless: nBitsOrSoleID carries the sole literal's ID
+    const u32 soleId = fdrc->nBitsOrSoleID;
+    if ((fdrc->flags & NoRepeat) && (*last_match == soleId)) {
+        return;
+    }
+
+    *last_match = soleId;
+    *control = a->cb(i, i, soleId, a->ctxt);
+}
+
+// This is 'light-weight' confirmation function which is used by 2-3-4-mask Teddy
+// In the 'confirmless' case it makes fast 32-bit comparison,
+// otherwise it calls 'confWithBit' function for the full confirmation procedure
+//
+// Candidates ending before a->start_offset are ignored. In the confirmless
+// case the 32-bit comparison is only performed when r == VECTORING and the
+// literal would begin before a->start_offset; otherwise the callback is
+// invoked without further checks.
+static really_inline
+void confWithBitMany(const struct FDRConfirm * fdrc,
+                     const struct FDR_Runtime_Args * a,
+                     size_t i,
+                     CautionReason r,
+                     hwlmcb_rv_t *control,
+                     u32 * last_match) {
+    assert(i < a->len);
+    assert(ISALIGNED(fdrc));
+
+    if (i < a->start_offset) {
+        return;
+    }
+
+    if (unlikely(fdrc->mult)) {
+        confWithBit(fdrc, a, i, r, 0, control, last_match);
+        return;
+    } else {
+        const u32 id = fdrc->nBitsOrSoleID;
+        // soleLitSize is stored as (literal length - 1), i.e. the distance
+        // from the literal's first byte to its last
+        const u32 len = fdrc->soleLitSize;
+
+        if ((*last_match == id) && (fdrc->flags & NoRepeat)) {
+            return;
+        }
+
+        if (r == VECTORING && len > i - a->start_offset) {
+            // not enough buffer plus history to hold the literal at all
+            if (len > (i + a->len_history)) {
+                return;
+            }
+
+            // Assemble the candidate bytes with buf[i] in the most
+            // significant byte, matching the layout of soleLitCmp.
+            u32 cmp = (u32)a->buf[i] << 24;
+
+            if (len <= i) {
+                // whole literal lies within the buffer
+                for (u32 j = 1; j <= len; j++) {
+                    cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
+                }
+            } else {
+                // take what the buffer has, then fill the rest from history
+                for (u32 j = 1; j <= i; j++) {
+                    cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
+                }
+                cmp |= (u32)(a->histBytes >> (40 + i * 8));
+            }
+
+            if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) {
+               return;
+            }
+        }
+        *last_match = id;
+        *control = a->cb(i - len, i, id, a->ctxt);
+    }
+}
+
+#endif
--- a/src/fdr/fdr_dump.cpp
+++ b/src/fdr/fdr_dump.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_dump.h"
+#include "fdr_engine_description.h"
+#include "teddy_engine_description.h"
+#include "ue2common.h"
+
+#include <cstdio>
+#include <memory>
+
+#ifndef DUMP_SUPPORT
+#error No dump support!
+#endif
+
+using std::unique_ptr;
+
+namespace ue2 {
+
+/** \brief True if \a fdr's engine ID has no FDR engine description. */
+static
+bool fdrIsTeddy(const FDR *fdr) {
+    assert(fdr);
+
+    /* teddys don't have an fdr engine description (which is why the dump code
+     * is so broken). */
+    unique_ptr<FDREngineDescription> fdrDesc =
+        getFdrDescription(fdr->engineID);
+    return fdrDesc == nullptr;
+}
+
+/**
+ * \brief Write a human-readable summary of \a fdr to \a f: the engine kind
+ * and ID, engine-specific parameters when a description is available, and
+ * the size, maximum string length and flood offset.
+ */
+void fdrPrintStats(const FDR *fdr, FILE *f) {
+    const u32 engine = fdr->engineID;
+
+    if (fdrIsTeddy(fdr)) {
+        fprintf(f, "TEDDY:         %u\n", engine);
+
+        unique_ptr<TeddyEngineDescription> teddy = getTeddyDescription(engine);
+        if (!teddy) {
+            fprintf(f, "   <unknown engine>\n");
+        } else {
+            fprintf(f, "    masks      %u\n", teddy->numMasks);
+            fprintf(f, "    buckets    %u\n", teddy->getNumBuckets());
+            fprintf(f, "    packed     %s\n",
+                    teddy->packed ? "true" : "false");
+        }
+    } else {
+        fprintf(f, "FDR:           %u\n", engine);
+
+        unique_ptr<FDREngineDescription> desc = getFdrDescription(engine);
+        if (!desc) {
+            fprintf(f, "   <unknown engine>\n");
+        } else {
+            fprintf(f, "    stride     %u\n", desc->stride);
+            fprintf(f, "    buckets    %u\n", desc->getNumBuckets());
+            fprintf(f, "    width      %u\n", desc->schemeWidth);
+        }
+    }
+
+    // Common trailer, independent of engine kind.
+    fprintf(f, "    strings    ???\n");
+    fprintf(f, "    size       %zu bytes\n", fdrSize(fdr));
+    fprintf(f, "    max length %u\n", fdr->maxStringLen);
+    fprintf(f, "    floodoff   %u (%x)\n", fdr->floodOffset, fdr->floodOffset);
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_dump.h
+++ b/src/fdr/fdr_dump.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: dump API.
+ */
+
+#ifndef FDR_DUMP_H
+#define FDR_DUMP_H
+
+#if defined(DUMP_SUPPORT)
+
+#include <cstdio>
+
+struct FDR;
+
+namespace ue2 {
+
+/** \brief Print a textual summary of the FDR structure \a fdr (engine type
+ * and ID, engine parameters, size, maximum string length, flood offset) to
+ * the stream \a f. Only declared when DUMP_SUPPORT is defined. */
+void fdrPrintStats(const struct FDR *fdr, FILE *f);
+
+} // namespace ue2
+
+#endif // DUMP_SUPPORT
+#endif // FDR_DUMP_H
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr_compile_internal.h"
+#include "fdr_engine_description.h"
+#include "hs_compile.h"
+#include "util/target_info.h"
+#include "util/compare.h" // for ourisalpha()
+#include "util/make_unique.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <map>
+#include <string>
+
+using namespace std;
+
+namespace ue2 {
+
+#include "fdr_autogen_compiler.cpp"
+
+/** \brief Build an engine description from a static engine definition: the
+ * shared fields go to the EngineDescription base, the FDR-specific ones
+ * (scheme width, stride, table bits) are copied into this class. */
+FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
+    : EngineDescription(def.id,
+                        targetByArchFeatures(def.cpu_features),
+                        def.numBuckets,
+                        def.confirmPullBackDistance,
+                        def.confirmTopLevelSplit),
+      schemeWidth(def.schemeWidth),
+      stride(def.stride),
+      bits(def.bits) {
+}
+
+/** \brief Default flood suffix length: the scheme width divided by the
+ * number of buckets, rounded up, plus one. */
+u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
+    const u32 width = getSchemeWidth();
+    const u32 buckets = getNumBuckets();
+
+    // Ceiling division, so that scheme width 32 with 6 buckets gives 6 and
+    // not 5.
+    const u32 perBucket = (width + buckets - 1) / buckets;
+
+    // The extra one avoids pain due to various reach choices.
+    return perBucket + 1;
+}
+
+/**
+ * \brief Heuristically pick the stride we would like the engine to have.
+ *
+ * \param num_lits      Number of literals in the set.
+ * \param min_len       Length of the shortest literal.
+ * \param min_len_count Number of literals that have length \a min_len.
+ * \return Desired stride; 1 is the safe fallback.
+ */
+static
+u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
+    u32 stride = 1; // safe fallback, also used for 5000 or more literals
+
+    if (min_len > 1 && num_lits < 5000) {
+        if (num_lits >= 800) {
+            // larger but not huge sets: stride 2 only if the shortest
+            // literal has at least length 3
+            stride = MIN(min_len - 1, 2);
+        } else if (num_lits >= 250) {
+            // intermediate sets
+            stride = min_len - 1;
+        } else {
+            // small sets: just go for it
+            stride = min_len;
+        }
+    }
+
+    // A ton of length 2 literals can break things; this patch is currently
+    // disabled.
+#ifdef TRY_THIS_LATER
+    if ((min_len_count > 20) && (stride == 2) && (min_len == 2)) {
+        stride = 1;
+    }
+#endif
+
+    // The stride 4 case is far more fragile: even a few length 4 literals can
+    // break things, so do not let min_len=4 with stride=4 through.
+    if ((min_len_count > 2) && (stride == 4) && (min_len == 4)) {
+        stride = 2;
+    }
+
+    return stride;
+}
+
+/**
+ * \brief Choose the FDR engine description best suited to a literal set.
+ *
+ * Every engine that is valid on \a target and whose stride does not exceed
+ * the shortest literal is scored: it starts at 100, loses points for its
+ * distance from the desired stride and from the "ideal" number of table bits,
+ * and gains its stride if that does not exceed the desired stride. The
+ * highest score wins; on ties the earliest engine is kept.
+ *
+ * \param target     Target platform the engine must be valid on.
+ * \param vl         Literals to be matched.
+ * \param make_small If true, bias the choice towards smaller tables.
+ * \return A copy of the chosen description, or nullptr if no engine fits.
+ */
+unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
+                                              const vector<hwlmLiteral> &vl,
+                                              bool make_small) {
+    vector<FDREngineDescription> allDescs;
+    getFdrDescriptions(&allDescs);
+
+    // find desired stride
+    size_t count;
+    size_t msl = minLenCount(vl, &count);
+    u32 desiredStride = findDesiredStride(vl.size(), msl, count);
+
+    DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
+                 desiredStride);
+
+    const FDREngineDescription *best = nullptr;
+    // Scores are signed. The desired stride can be as large as the shortest
+    // literal, so the penalties below may exceed the starting score of 100;
+    // with an unsigned score the worst engines would wrap around to huge
+    // values and win the comparison.
+    int best_score = 0;
+
+    for (const FDREngineDescription &eng : allDescs) {
+        if (!eng.isValidOnTarget(target)) {
+            continue;
+        }
+        if (msl < eng.stride) {
+            continue;
+        }
+
+        int score = 100;
+
+        score -= (int)absdiff(desiredStride, eng.stride);
+
+        if (eng.stride <= desiredStride) {
+            score += (int)eng.stride;
+        }
+
+        u32 effLits = vl.size(); /* * desiredStride;*/
+        u32 ideal;
+        if (effLits < eng.getNumBuckets()) {
+            if (eng.stride == 1) {
+                ideal = 8;
+            } else {
+                ideal = 10;
+            }
+        } else if (effLits < 20) {
+            ideal = 10;
+        } else if (effLits < 100) {
+            ideal = 11;
+        } else if (effLits < 1000) {
+            ideal = 12;
+        } else if (effLits < 10000) {
+            ideal = 13;
+        } else {
+            ideal = 15;
+        }
+
+        if (ideal != 8 && eng.schemeWidth == 32) {
+            ideal += 1;
+        }
+
+        if (make_small) {
+            ideal -= 2;
+        }
+
+        if (eng.stride > 1) {
+            ideal++;
+        }
+
+        DEBUG_PRINTF("effLits %u\n", effLits);
+
+        if (target.is_atom_class() && !make_small && effLits < 4000) {
+            /* Unless it is a very heavy case, we want to build smaller tables
+             * on lightweight machines due to their small caches. */
+            ideal -= 2;
+        }
+
+        score -= (int)absdiff(ideal, eng.bits);
+
+        DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
+                     "-> score=%d\n",
+                     eng.getID(), eng.schemeWidth, eng.bits,
+                     eng.getNumBuckets(), eng.stride, score);
+
+        if (!best || score > best_score) {
+            best = &eng;
+            best_score = score;
+        }
+    }
+
+    if (!best) {
+        DEBUG_PRINTF("failed to find engine\n");
+        return nullptr;
+    }
+
+    DEBUG_PRINTF("using engine %u\n", best->getID());
+    return ue2::make_unique<FDREngineDescription>(*best);
+}
+
+/** \brief Map position \a p within bucket \a b to its bit index in the
+ * scheme: bits are interleaved, so consecutive positions of one bucket are
+ * getNumBuckets() bits apart. */
+SchemeBitIndex FDREngineDescription::getSchemeBit(BucketIndex b,
+                                                  PositionInBucket p) const {
+    assert(p < getBucketWidth(b));
+
+    const SchemeBitIndex bit = p * getNumBuckets() + b;
+
+    assert(bit < getSchemeWidth());
+    return bit;
+}
+
+/** \brief Number of scheme bits per bucket. The bucket argument is ignored:
+ * every bucket has the same width, and the scheme width is asserted to be a
+ * multiple of the bucket count. */
+u32 FDREngineDescription::getBucketWidth(BucketIndex) const {
+    const u32 width = getSchemeWidth();
+    const u32 buckets = getNumBuckets();
+    assert(width % buckets == 0);
+    return width / buckets;
+}
+
+/** \brief Look up the FDR engine description at index \a engineID in the
+ * list produced by getFdrDescriptions().
+ *
+ * \return A copy of the description, or nullptr if \a engineID is out of
+ * range. */
+unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID) {
+    vector<FDREngineDescription> descs;
+    getFdrDescriptions(&descs);
+
+    if (engineID < descs.size()) {
+        return ue2::make_unique<FDREngineDescription>(descs[engineID]);
+    }
+
+    return nullptr;
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_engine_description.h
+++ b/src/fdr/fdr_engine_description.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_ENGINE_DESCRIPTION_H
+#define FDR_ENGINE_DESCRIPTION_H
+
+#include "engine_description.h"
+#include "util/ue2_containers.h"
+
+#include <map>
+#include <memory>
+#include <vector>
+
+namespace ue2 {
+
+/** \brief Plain-data definition of one FDR engine variant; consumed by the
+ * FDREngineDescription constructor. */
+struct FDREngineDef {
+    u32 id; //!< engine ID, passed to the EngineDescription base
+    u32 schemeWidth; //!< copied to FDREngineDescription::schemeWidth
+    u32 numBuckets; //!< number of buckets, passed to the base
+    u32 stride; //!< copied to FDREngineDescription::stride
+    u32 bits; //!< copied to FDREngineDescription::bits
+    u64a cpu_features; //!< converted to a target via targetByArchFeatures()
+    u32 confirmPullBackDistance; //!< passed through to the base
+    u32 confirmTopLevelSplit; //!< passed through to the base
+};
+
+/** \brief Description of a single FDR engine variant: the generic
+ * EngineDescription fields plus the FDR-specific scheme width, stride and
+ * table size. */
+class FDREngineDescription : public EngineDescription {
+public:
+    u32 schemeWidth; //!< width of the scheme in bits (see getTabSizeBytes)
+    u32 stride; //!< engine stride; compared against literal lengths
+    u32 bits; //!< log2 of the number of table entries
+
+    /** \brief Scheme width in bits. */
+    u32 getSchemeWidth() const { return schemeWidth; }
+    /** \brief Scheme bits per bucket (scheme width / number of buckets). */
+    u32 getBucketWidth(BucketIndex b) const;
+    /** \brief Scheme bit index of position \a p in bucket \a b. */
+    SchemeBitIndex getSchemeBit(BucketIndex b, PositionInBucket p) const;
+    /** \brief Number of table entries, i.e. 2^bits. */
+    u32 getNumTableEntries() const { return 1 << bits; }
+    /** \brief Table size in bytes: one scheme-width entry (schemeWidth / 8
+     * bytes) per table entry. */
+    u32 getTabSizeBytes() const {
+        return schemeWidth / 8 * getNumTableEntries();
+    }
+
+    /** \brief Construct from a static engine definition. */
+    explicit FDREngineDescription(const FDREngineDef &def);
+
+    u32 getDefaultFloodSuffixLength() const override;
+    /** \brief True only for stride-1 engines. */
+    bool typicallyHoldsOneCharLits() const override { return stride == 1; }
+};
+
+/** \brief Pick the best-scoring FDR engine for the literals \a vl that is
+ * valid on \a target; \a make_small biases the choice towards smaller tables.
+ * Returns nullptr if no engine is suitable. */
+std::unique_ptr<FDREngineDescription>
+chooseEngine(const target_t &target, const std::vector<hwlmLiteral> &vl,
+             bool make_small);
+/** \brief Return a copy of the description with index \a engineID, or
+ * nullptr if the index is out of range. */
+std::unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID);
+/** \brief Fill \a out with the available FDR engine descriptions
+ * (NOTE(review): the definition is not in this header's translation unit;
+ * it appears to come from the autogenerated compiler source). */
+void getFdrDescriptions(std::vector<FDREngineDescription> *out);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/fdr_internal.h
+++ b/src/fdr/fdr_internal.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: data structures.
+ */
+
+#ifndef FDR_INTERNAL_H
+#define FDR_INTERNAL_H
+
+#include "ue2common.h"
+#include "hwlm/hwlm.h" // for hwlm_group_t, HWLMCallback
+
+/** \brief Why a load must be treated cautiously.
+ * NOTE(review): the users of this enum are outside this header; the meaning
+ * of each value is taken from the existing per-value comments only. */
+typedef enum {
+    NOT_CAUTIOUS, //!< not near a boundary (quantify?)
+    VECTORING     //!< potentially vectoring
+} CautionReason;
+
+/** \brief number of different ids that can be triggered by floods of any given
+ * character. */
+#define FDR_FLOOD_MAX_IDS 16
+
+/** \brief Flood-handling record: the ids (with their groups and lengths)
+ * that are raised once per character during a flood.
+ * NOTE(review): presumably there is one record per flood character; the
+ * table layout is not visible in this header. */
+struct FDRFlood {
+    hwlm_group_t allGroups; //!< all the groups or'd together
+    u32 suffix; //!< NOTE(review): presumably the flood suffix length - confirm
+
+    /** \brief 0 to FDR_FLOOD_MAX_IDS-1 ids that are generated once per char on
+     * a flood.
+     * If larger we won't handle this through the flood path at all. */
+    u16 idCount;
+
+    u32 ids[FDR_FLOOD_MAX_IDS]; //!< the ids
+    hwlm_group_t groups[FDR_FLOOD_MAX_IDS]; //!< group ids to go with string ids
+    u32 len[FDR_FLOOD_MAX_IDS]; //!< lengths to go with the string ids
+};
+
+/** \brief FDR structure.
+ *
+ * 1. struct as-is
+ * 2. primary matching table
+ * 3. confirm stuff
+ */
+struct FDR {
+    u32 engineID; //!< index used to look up the engine description
+    u32 size; //!< NOTE(review): presumably total bytecode size - confirm
+    u32 maxStringLen; //!< reported as "max length" by the dump code
+    u32 floodOffset; //!< reported as "floodoff" by the dump code
+
+    /** link is the relative offset of a secondary included FDR table for
+     * stream handling if we're a primary FDR table or the subsidiary tertiary
+     * structures (spillover strings and hash table) if we're a secondary
+     * structure. */
+    u32 link;
+    u32 pad1; //!< padding
+    u32 pad2; //!< padding
+    u32 pad3; //!< padding
+
+    /** Start value, overlaid at u32, u64a and m128 widths.
+     * NOTE(review): which member is used presumably depends on the engine's
+     * scheme width - confirm against the runtime. */
+    union {
+        u32 s_u32;
+        u64a s_u64a;
+        m128 s_m128;
+    } start;
+};
+
+/** \brief FDR runtime arguments.
+ *
+ * This structure handles read-only things that are passed extensively around
+ * the FDR run-time functions. They are set by the API, passed by value into
+ * the main function, then a pointer is passed around to all the various
+ * sub-functions (confirm & flood). */
+/* NOTE(review): the per-field notes below are inferred from the field names
+ * and types only; the runtime code that fills and reads them is not in this
+ * header. */
+struct FDR_Runtime_Args {
+    const u8 *buf; //!< presumably the buffer being scanned
+    size_t len; //!< presumably the length of buf in bytes
+    const u8 *buf_history; //!< presumably history preceding buf
+    size_t len_history; //!< presumably the length of buf_history
+    const u8 *buf_history_nocase; //!< presumably a caseless copy of history
+    size_t len_history_nocase; //!< presumably length of buf_history_nocase
+    size_t start_offset; //!< presumably the offset in buf to start from
+    HWLMCallback cb; //!< match callback
+    void *ctxt; //!< presumably the opaque context handed to cb
+    hwlm_group_t *groups; //!< pointer to the caller's group mask
+    const u8 *firstFloodDetect; //!< presumably where flood detection starts
+    const u64a histBytes; //!< const member: must be set at initialisation
+};
+
+#endif
--- a/src/fdr/fdr_loadval.h
+++ b/src/fdr/fdr_loadval.h
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_LOADVAL_H
+#define FDR_LOADVAL_H
+
+#include "fdr_internal.h"
+#include "ue2common.h"
+#include "util/unaligned.h"
+#include "util/simd_utils.h"
+
+// Emits the signature of a load function: every loader takes the address to
+// read (ptr) and the bounds of the valid region, [lo, hi).
+#define MAKE_LOADVAL(type, name) \
+    static really_inline type name (const u8 * ptr, UNUSED const u8 * lo, UNUSED const u8 * hi)
+
+// Asserts that all sizeof(type) bytes at ptr lie within [lo, hi).
+#define NORMAL_SAFE(type)            assert(ptr >= lo && (ptr + sizeof(type) - 1) < hi)
+// As NORMAL_SAFE, and also asserts that ptr is aligned to sizeof(type). Note
+// that this expands to two statements and carries its own trailing semicolon.
+#define ALIGNED_SAFE(type)           NORMAL_SAFE(type); assert(((size_t)ptr % sizeof(type)) == 0);
+// these ones need asserts to test the property that we're not handling dynamically
+#define CAUTIOUS_FORWARD_SAFE(type)  assert(ptr >= lo)
+#define CAUTIOUS_BACKWARD_SAFE(type) assert((ptr + sizeof(type) - 1) < hi)
+
+// Per-byte bounds checks used as the COND argument of MAKE_LOOP; they refer
+// to the loop index i and to the loader's ptr/lo/hi parameters.
+// CE_INDEX_CHECK has no outer parentheses, so it is only safe as the entire
+// condition of an if.
+#define CF_INDEX_CHECK                        (ptr + i < hi)
+#define CB_INDEX_CHECK     (lo <= ptr + i)
+#define CE_INDEX_CHECK     (lo <= ptr + i) && (ptr + i < hi)
+
+// Body of a cautious loader: assembles a TYPE one byte at a time, adding
+// ptr[i] shifted left by SHIFT_FIDDLE bytes whenever COND holds; bytes for
+// which COND fails contribute zero. Expands to statements ending in a return.
+#define MAKE_LOOP(TYPE, COND, SHIFT_FIDDLE)                                    \
+    TYPE v = 0;                                                                \
+    for (TYPE i = 0; i < sizeof(TYPE); i++) {                                  \
+        if (COND) {                                                            \
+            v += (TYPE)ptr[i] << ((SHIFT_FIDDLE)*8);                           \
+        }                                                                      \
+    }                                                                          \
+    return v;
+
+// Big-endian assembly: byte 0 lands in the most significant position.
+#define MAKE_LOOP_BE(TYPE, COND) \
+    MAKE_LOOP(TYPE, COND, sizeof(TYPE)-i-1)
+
+// Little-endian assembly: byte 0 lands in the least significant position.
+#define MAKE_LOOP_LE(TYPE, COND) \
+    MAKE_LOOP(TYPE, COND, i)
+
+
+// Endianness (BE/LE) x caution mode (CF/CB/CE) combinations. The CF and CB
+// forms first assert the bound that is not checked per byte.
+#define MAKE_LOOP_BE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE);  MAKE_LOOP_BE(TYPE, CF_INDEX_CHECK)
+#define MAKE_LOOP_BE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_BE(TYPE, CB_INDEX_CHECK)
+#define MAKE_LOOP_BE_CE(TYPE)                               MAKE_LOOP_BE(TYPE, CE_INDEX_CHECK)
+#define MAKE_LOOP_LE_CF(TYPE) CAUTIOUS_FORWARD_SAFE(TYPE);  MAKE_LOOP_LE(TYPE, CF_INDEX_CHECK)
+#define MAKE_LOOP_LE_CB(TYPE) CAUTIOUS_BACKWARD_SAFE(TYPE); MAKE_LOOP_LE(TYPE, CB_INDEX_CHECK)
+#define MAKE_LOOP_LE_CE(TYPE)                               MAKE_LOOP_LE(TYPE, CE_INDEX_CHECK)
+
+// no suffix = normal (unaligned)
+// _a        = aligned
+// _cf       = cautious forwards, base is always in bounds, but may read over the end of the buffer (test against hi)
+// _cb       = cautious backwards, final byte is always in bounds, but may read over the start of the buffer (test against lo)
+// _ce       = cautious everywhere (in both directions); test against hi and lo
+
+// u8 loadvals
+// normal load: ptr is asserted to be within [lo, hi)
+MAKE_LOADVAL(u8, lv_u8) {
+    NORMAL_SAFE(u8);
+    return ptr[0];
+}
+
+// cautious forwards: yields 0 if ptr is at or beyond hi
+MAKE_LOADVAL(u8, lv_u8_cf) {
+    CAUTIOUS_FORWARD_SAFE(u8);
+    return (ptr < hi) ? *ptr : 0;
+}
+
+// cautious backwards: yields 0 if ptr is before lo
+MAKE_LOADVAL(u8, lv_u8_cb) {
+    CAUTIOUS_BACKWARD_SAFE(u8);
+    return (ptr >= lo) ? *ptr : 0;
+}
+
+// cautious everywhere: yields 0 if ptr is outside [lo, hi)
+MAKE_LOADVAL(u8, lv_u8_ce) {
+    if ((ptr < lo) || (ptr >= hi)) {
+        return 0;
+    }
+    return *ptr;
+}
+
+// normal (unaligned) 16-bit load; both bytes asserted to be within [lo, hi)
+MAKE_LOADVAL(u16, lv_u16) {
+    NORMAL_SAFE(u16);
+    const u16 val = unaligned_load_u16(ptr);
+    return val;
+}
+
+// aligned 16-bit load; bounds and alignment are asserted
+MAKE_LOADVAL(u16, lv_u16_a) {
+    ALIGNED_SAFE(u16);
+    const u16 *src = (const u16 *)ptr;
+    return *src;
+}
+
+// normal (unaligned) 32-bit load; all bytes asserted to be within [lo, hi)
+MAKE_LOADVAL(u32, lv_u32) {
+    NORMAL_SAFE(u32);
+    const u32 val = unaligned_load_u32(ptr);
+    return val;
+}
+
+// aligned 32-bit load; bounds and alignment are asserted
+MAKE_LOADVAL(u32, lv_u32_a) {
+    ALIGNED_SAFE(u32);
+    const u32 *src = (const u32 *)ptr;
+    return *src;
+}
+
+// normal (unaligned) 64-bit load; all eight bytes asserted to be within
+// [lo, hi). The bounds check must use the loaded type, u64a: checking u32
+// would only cover the first half of the read.
+MAKE_LOADVAL(u64a, lv_u64a) {
+    NORMAL_SAFE(u64a);
+    return unaligned_load_u64a(ptr);
+}
+
+// aligned 64-bit load; bounds and alignment are asserted
+MAKE_LOADVAL(u64a, lv_u64a_a) {
+    ALIGNED_SAFE(u64a);
+    const u64a *src = (const u64a *)ptr;
+    return *src;
+}
+
+// Cautious multi-byte loaders. Each one assembles its value byte by byte in
+// little-endian order via MAKE_LOOP_LE_*, substituting zero for any byte that
+// fails the bounds check of its mode (cf: beyond hi, cb: before lo, ce:
+// either).
+MAKE_LOADVAL(u16, lv_u16_cf) { MAKE_LOOP_LE_CF(u16); }
+MAKE_LOADVAL(u16, lv_u16_cb) { MAKE_LOOP_LE_CB(u16); }
+MAKE_LOADVAL(u16, lv_u16_ce) { MAKE_LOOP_LE_CE(u16); }
+
+MAKE_LOADVAL(u32, lv_u32_cf) { MAKE_LOOP_LE_CF(u32); }
+MAKE_LOADVAL(u32, lv_u32_cb) { MAKE_LOOP_LE_CB(u32); }
+MAKE_LOADVAL(u32, lv_u32_ce) { MAKE_LOOP_LE_CE(u32); }
+
+MAKE_LOADVAL(u64a, lv_u64a_cf) { MAKE_LOOP_LE_CF(u64a); }
+MAKE_LOADVAL(u64a, lv_u64a_cb) { MAKE_LOOP_LE_CB(u64a); }
+MAKE_LOADVAL(u64a, lv_u64a_ce) { MAKE_LOOP_LE_CE(u64a); }
+
+// normal (unaligned) 128-bit load; all bytes asserted to be within [lo, hi)
+MAKE_LOADVAL(m128, lv_m128) {
+    NORMAL_SAFE(m128);
+    const m128 val = loadu128(ptr);
+    return val;
+}
+
+// aligned 128-bit load; bounds and alignment are asserted by ALIGNED_SAFE,
+// so no separate alignment assert is needed (matching the other _a loaders)
+MAKE_LOADVAL(m128, lv_m128_a) {
+    ALIGNED_SAFE(m128);
+    return *(const m128 *)ptr;
+}
+
+// m128 cases need to be manually created
+
+// cautious forwards 128-bit load: bytes at or beyond hi read as zero
+MAKE_LOADVAL(m128, lv_m128_cf) {
+    CAUTIOUS_FORWARD_SAFE(m128);
+    union {
+        u8 val8[16];
+        m128 val128;
+    } out;
+
+    for (u32 idx = 0; idx < 16; idx++) {
+        out.val8[idx] = (ptr + idx < hi) ? ptr[idx] : 0;
+    }
+    return out.val128;
+}
+
+// cautious backwards 128-bit load: bytes before lo read as zero
+MAKE_LOADVAL(m128, lv_m128_cb) {
+    CAUTIOUS_BACKWARD_SAFE(m128);
+    union {
+        u8 val8[16];
+        m128 val128;
+    } out;
+
+    for (u32 idx = 0; idx < 16; idx++) {
+        out.val8[idx] = (lo <= ptr + idx) ? ptr[idx] : 0;
+    }
+    return out.val128;
+}
+
+// cautious everywhere 128-bit load: bytes outside [lo, hi) read as zero
+MAKE_LOADVAL(m128, lv_m128_ce) {
+    union {
+        u8 val8[16];
+        m128 val128;
+    } out;
+
+    for (u32 idx = 0; idx < 16; idx++) {
+        const u8 *src = ptr + idx;
+        out.val8[idx] = ((lo <= src) && (src < hi)) ? ptr[idx] : 0;
+    }
+    return out.val128;
+}
+
+#endif
--- a/src/fdr/fdr_streaming_compile.cpp
+++ b/src/fdr/fdr_streaming_compile.cpp
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_streaming_internal.h"
+#include "fdr_compile_internal.h"
+#include "hwlm/hwlm_build.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/target_info.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <deque>
+#include <set>
+
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+using boost::dynamic_bitset;
+
+namespace ue2 {
+
+namespace {
+/** \brief Ordering for long literals: caseful literals (nocase false) sort
+ * before caseless ones; within each class, literals sort by string. */
+struct LongLitOrder {
+    bool operator()(const hwlmLiteral &lhs, const hwlmLiteral &rhs) const {
+        if (lhs.nocase == rhs.nocase) {
+            return lhs.s < rhs.s;
+        }
+        return lhs.nocase < rhs.nocase;
+    }
+};
+}
+
+/** \brief True if two literals have the same case sensitivity and the same
+ * string; ids and groups are not compared. */
+static
+bool hwlmLitEqual(const hwlmLiteral &l1, const hwlmLiteral &l2) {
+    if (l1.nocase != l2.nocase) {
+        return false;
+    }
+    return l1.s == l2.s;
+}
+
+/** \brief Round \a x up to the next power of two (a power of two maps to
+ * itself) by smearing the top set bit of x - 1 into all lower bits. */
+static
+u32 roundUpToPowerOfTwo(u32 x) {
+    u32 v = x - 1;
+    // shifts of 1, 2, 4, 8 and 16 cover all 32 bits
+    for (u32 shift = 1; shift < 32; shift <<= 1) {
+        v |= (v >> shift);
+    }
+    return v + 1;
+}
+
+/**
+ * \brief Collect every literal longer than \a max_len into \a long_lits.
+ *
+ * Each collected literal loses its last character, since we only care about
+ * partial matches and not full ones, and has its id and groups cleared.
+ *
+ * The result is sorted (caseful before caseless, then lexicographically),
+ * stripped of duplicates, and each literal's ID is then set to its position
+ * in the vector.
+ *
+ * \return False if there aren't any long literals.
+ */
+static
+bool setupLongLits(const vector<hwlmLiteral> &lits,
+                   vector<hwlmLiteral> &long_lits, size_t max_len) {
+    long_lits.reserve(lits.size());
+    for (const hwlmLiteral &lit : lits) {
+        if (lit.s.length() <= max_len) {
+            continue; // short enough, not our problem
+        }
+        long_lits.push_back(lit); // copy
+        hwlmLiteral &trimmed = long_lits.back();
+        trimmed.s.erase(trimmed.s.size() - 1, 1); // erase last char
+        trimmed.id = 0; // recalc later
+        trimmed.groups = 0; // filled in later by hash bucket(s)
+    }
+
+    if (long_lits.empty()) {
+        return false;
+    }
+
+    // order by caseful/caseless and then lexicographically, and drop the
+    // duplicates that this brings together
+    stable_sort(long_lits.begin(), long_lits.end(), LongLitOrder());
+    long_lits.erase(unique(long_lits.begin(), long_lits.end(), hwlmLitEqual),
+                    long_lits.end());
+
+    // fill in ids; not currently used
+    for (auto it = long_lits.begin(); it != long_lits.end(); ++it) {
+        it->id = it - long_lits.begin();
+    }
+    return true;
+}
+
+// Gathers per-mode statistics about the (sorted) long literals.
+//
+// boundaries are the 'start' boundaries for each 'mode'
+// so boundaries[CASEFUL] is the index one above the largest caseful index
+// positions[CASEFUL] is the # of positions in caseful strings (stream)
+// hashedPositions[CASEFUL] is the # of positions in caseful strings
+//                          (not returned - a temporary)
+// hashEntries[CASEFUL] is the # of positions hashed for caseful strings
+//                    (rounded up to the nearest power of two, with a
+//                    minimum of 4096; zero if nothing is hashed)
+//
+// boundaries, positions and hashEntries must each have room for MAX_MODES
+// entries. Every literal is expected to be at least max_len long.
+static
+void analyzeLits(const vector<hwlmLiteral> &long_lits, size_t max_len,
+                 u32 *boundaries, u32 *positions, u32 *hashEntries) {
+    u32 hashedPositions[MAX_MODES];
+
+    for (u32 m = CASEFUL; m < MAX_MODES; ++m) {
+        boundaries[m] = verify_u32(long_lits.size());
+        positions[m] = 0;
+        hashedPositions[m] = 0;
+    }
+
+    // caseful literals sort first, so the first caseless literal marks the
+    // end of the caseful range
+    for (size_t idx = 0; idx < long_lits.size(); idx++) {
+        if (long_lits[idx].nocase) {
+            boundaries[CASEFUL] = verify_u32(idx);
+            break;
+        }
+    }
+
+    for (const hwlmLiteral &lit : long_lits) {
+        const MODES m = lit.nocase ? CASELESS : CASEFUL;
+        const size_t len = lit.s.size();
+        // one hashed position for each offset in [1, len - max_len]; a
+        // shorter literal would make the subtraction wrap
+        assert(len >= max_len);
+        hashedPositions[m] += verify_u32(len - max_len);
+        positions[m] += len;
+    }
+
+    for (u32 m = CASEFUL; m < MAX_MODES; m++) {
+        hashEntries[m] = hashedPositions[m]
+                ? roundUpToPowerOfTwo(MAX(4096, hashedPositions[m]))
+                : 0;
+    }
+
+#ifdef DEBUG_COMPILE
+    printf("analyzeLits:\n");
+    // iterate with a u32 like the loops above: a MODES variable cannot be
+    // incremented in C++
+    for (u32 m = CASEFUL; m < MAX_MODES; m++) {
+        printf("mode %s boundary %u positions %u hashedPositions %u "
+               "hashEntries %u\n",
+               (m == CASEFUL) ? "caseful" : "caseless", boundaries[m],
+               positions[m], hashedPositions[m], hashEntries[m]);
+    }
+    printf("\n");
+#endif
+}
+
+/** \brief Streaming hash of the \a max_len bytes of literal \a l that start
+ * at \a offset, computed for mode \a m. */
+static
+u32 hashLit(const hwlmLiteral &l, u32 offset, size_t max_len, MODES m) {
+    const u8 *start = (const u8 *)l.s.c_str() + offset;
+    return streaming_hash(start, max_len, m);
+}
+
+// Orders (literal id, offset) pairs by 'distance from start': the pair with
+// the larger offset comes first, and ties are broken by the smaller id.
+namespace {
+struct OffsetIDFromEndOrder {
+    const vector<hwlmLiteral> &lits; // not currently used
+    explicit OffsetIDFromEndOrder(const vector<hwlmLiteral> &lits_in)
+        : lits(lits_in) {}
+    bool operator()(const pair<u32, u32> &lhs, const pair<u32, u32> &rhs) const {
+        if (lhs.second == rhs.second) {
+            return lhs.first < rhs.first;
+        }
+        // longest is 'first', so > not <
+        return lhs.second > rhs.second;
+    }
+};
+}
+
+// Populates one mode's hash table 'tab' (numEntries slots) from long_lits.
+//
+// For every literal whose nocase flag matches mode m, each max_len-byte window
+// starting at offsets 1 .. size - max_len is hashed. The low nbits of the hash
+// select a bucket and the following six bits select a bit in that bucket's
+// 64-bit bitfield. Windows that land in the same bucket are chained together
+// through 'link', with the extra chain members placed in slots that are not
+// otherwise in use. litToOffsetVal is looked up by literal id and supplies the
+// offset that is added to the window position to form the stored state value.
+static
+void fillHashes(const vector<hwlmLiteral> &long_lits, size_t max_len,
+                FDRSHashEntry *tab, size_t numEntries, MODES m,
+                map<u32, u32> &litToOffsetVal) {
+    const u32 nbits = lg2(numEntries);
+    // bucket -> (literal id, window offset) pairs hashing to that bucket
+    map<u32, deque<pair<u32, u32> > > bucketToLitOffPairs;
+    // bucket -> bitfield of the secondary hash bits seen for that bucket
+    map<u32, u64a> bucketToBitfield;
+
+    for (vector<hwlmLiteral>::const_iterator i = long_lits.begin(),
+                                             e = long_lits.end();
+         i != e; ++i) {
+        const hwlmLiteral &l = *i;
+        // only literals belonging to the mode being built are considered
+        if ((m == CASELESS) != i->nocase) {
+            continue;
+        }
+        // NOTE(review): the bound is computed in unsigned arithmetic, so this
+        // assumes every literal here is at least max_len bytes long - confirm
+        // against setupLongLits.
+        for (u32 j = 1; j < i->s.size() - max_len + 1; j++) {
+            u32 h = hashLit(l, j, max_len, m);
+            u32 h_ent = h & ((1U << nbits) - 1);
+            u32 h_low = (h >> nbits) & 63;
+            bucketToLitOffPairs[h_ent].push_back(make_pair(i->id, j));
+            bucketToBitfield[h_ent] |= (1ULL << h_low);
+        }
+    }
+
+    // this used to be a set<u32>, but a bitset is much much faster given that
+    // we're using it only for membership testing.
+    dynamic_bitset<> filledBuckets(numEntries); // all bits zero by default.
+
+    // sweep out bitfield entries and save the results swapped accordingly
+    // also, anything with bitfield entries is put in filledBuckets
+    for (map<u32, u64a>::const_iterator i = bucketToBitfield.begin(),
+                                        e = bucketToBitfield.end();
+         i != e; ++i) {
+        u32 bucket = i->first;
+        u64a contents = i->second;
+        tab[bucket].bitfield = contents;
+        filledBuckets.set(bucket);
+    }
+
+    // store out all our chains based on free values in our hash table.
+    // find nearest free locations that are empty (there will always be more
+    // entries than strings, at present)
+    for (map<u32, deque<pair<u32, u32> > >::iterator
+             i = bucketToLitOffPairs.begin(),
+             e = bucketToLitOffPairs.end();
+         i != e; ++i) {
+        u32 bucket = i->first;
+        deque<pair<u32, u32> > &d = i->second;
+
+        // sort d by distance of the residual string (len minus our depth into
+        // the string). We need to put the 'furthest back' string first...
+        stable_sort(d.begin(), d.end(), OffsetIDFromEndOrder(long_lits));
+
+        while (1) {
+            // first time through is always at bucket, then we fill in links
+            filledBuckets.set(bucket);
+            FDRSHashEntry *ent = &tab[bucket];
+            u32 lit_id = d.front().first;
+            u32 offset = d.front().second;
+
+            // state is the literal's base offset plus the position just past
+            // the hashed window
+            ent->state = verify_u32(litToOffsetVal[lit_id] + offset + max_len);
+            ent->link = (u32)LINK_INVALID;
+
+            d.pop_front();
+            if (d.empty()) {
+                break;
+            }
+            // now, if there is another value
+            // find a bucket for it and put in 'bucket' and repeat
+            // all we really need to do is find something not in filledBuckets,
+            // ideally something close to bucket
+            // we search backward and forward from bucket, trying to stay as
+            // close as possible.
+            UNUSED bool found = false;
+            int bucket_candidate = 0;
+            // even k probes k/2 slots below bucket, odd k probes k/2 slots
+            // above it; candidates outside [0, numEntries) are skipped
+            for (u32 k = 1; k < numEntries * 2; k++) {
+                bucket_candidate = bucket + (((k & 1) == 0)
+                        ? (-(int)k / 2) : (k / 2));
+                if (bucket_candidate < 0 ||
+                    (size_t)bucket_candidate >= numEntries) {
+                    continue;
+                }
+                if (!filledBuckets.test(bucket_candidate)) {
+                    found = true;
+                    break;
+                }
+            }
+
+            // NOTE(review): 'found' is only checked by this assert; builds
+            // without asserts proceed with the last candidate regardless.
+            assert(found);
+            bucket = bucket_candidate;
+            ent->link = bucket;
+        }
+    }
+}
+
+// Returns the length of the longest mask among 'lits' (0 if there are none).
+static
+size_t maxMaskLen(const vector<hwlmLiteral> &lits) {
+    size_t longest = 0;
+    for (const auto &lit : lits) {
+        longest = max(longest, lit.msk.size());
+    }
+    return longest;
+}
+
+// Builds the streaming ("long literal") table for 'lits'.
+//
+// The table is laid out as: a FDRSTableHeader, a literal table of
+// long_lits.size() + 1 FDRSLiteral entries, the literal strings packed back to
+// back, the caseful hash table and then the caseless hash table.
+//
+// Returns the table (allocated with aligned_zmalloc) and its size in bytes, or
+// (nullptr, 0) when setupLongLits() returns false. In both cases the history
+// and stream state requirements are written back into stream_control.
+// Throws std::logic_error if stream_control->history_max is below the minimum
+// history this scheme needs.
+pair<u8 *, size_t>
+fdrBuildTableStreaming(const vector<hwlmLiteral> &lits,
+                       hwlmStreamingControl *stream_control) {
+    // refuse to compile if we are forced to have smaller than minimum
+    // history required for long-literal support, full stop
+    // otherwise, choose the maximum of the preferred history quantity
+    // (currently a fairly extravagant 32) or the already used history
+    // quantity - subject to the limitation of stream_control->history_max
+
+    const size_t MIN_HISTORY_REQUIRED = 32;
+
+    if (MIN_HISTORY_REQUIRED > stream_control->history_max) {
+        throw std::logic_error("Cannot set history to minimum history required");
+    }
+
+    size_t max_len =
+        MIN(stream_control->history_max,
+            MAX(MIN_HISTORY_REQUIRED, stream_control->history_min));
+    assert(max_len >= MIN_HISTORY_REQUIRED);
+    size_t max_mask_len = maxMaskLen(lits);
+
+    vector<hwlmLiteral> long_lits;
+    // note: the trailing '|| false' has no effect on the condition
+    if (!setupLongLits(lits, long_lits, max_len) || false) {
+        // "Don't need to do anything" path, not really a fail
+        DEBUG_PRINTF("Streaming literal path produces no table\n");
+
+        // we want enough history to manage the longest literal and the longest
+        // mask.
+        stream_control->literal_history_required =
+                    max(maxLen(lits), max_mask_len) - 1;
+        stream_control->literal_stream_state_required = 0;
+        return make_pair(nullptr, size_t{0});
+    }
+
+    // Ensure that we have enough room for the longest mask.
+    if (max_mask_len) {
+        max_len = max(max_len, max_mask_len - 1);
+    }
+
+    // per-mode results filled in by analyzeLits
+    u32 boundary[MAX_MODES];
+    u32 positions[MAX_MODES];
+    u32 hashEntries[MAX_MODES];
+
+    analyzeLits(long_lits, max_len, boundary, positions, hashEntries);
+
+    // first assess the size and find our caseless threshold
+    size_t headerSize = ROUNDUP_16(sizeof(FDRSTableHeader));
+
+    size_t litTabOffset = headerSize;
+
+    // one extra entry: the final slot records where the strings end
+    size_t litTabNumEntries = long_lits.size() + 1;
+    size_t litTabSize = ROUNDUP_16(litTabNumEntries * sizeof(FDRSLiteral));
+
+    size_t wholeLitTabOffset = litTabOffset + litTabSize;
+    size_t totalWholeLitTabSize = ROUNDUP_16(positions[CASEFUL] +
+                                             positions[CASELESS]);
+
+    size_t htOffset[MAX_MODES];
+    size_t htSize[MAX_MODES];
+
+    // the caseless hash table directly follows the caseful one
+    htOffset[CASEFUL] = wholeLitTabOffset + totalWholeLitTabSize;
+    htSize[CASEFUL] = hashEntries[CASEFUL] * sizeof(FDRSHashEntry);
+    htOffset[CASELESS] = htOffset[CASEFUL] + htSize[CASEFUL];
+    htSize[CASELESS] = hashEntries[CASELESS] * sizeof(FDRSHashEntry);
+
+    size_t tabSize = ROUNDUP_16(htOffset[CASELESS] + htSize[CASELESS]);
+
+    // need to add +2 to both of these to allow space for the actual largest
+    // value as well as handling the fact that we add one to the space when
+    // storing out a position to allow zero to mean "no stream state value"
+    u8 streamBits[MAX_MODES];
+    streamBits[CASEFUL] = lg2(roundUpToPowerOfTwo(positions[CASEFUL] + 2));
+    streamBits[CASELESS] = lg2(roundUpToPowerOfTwo(positions[CASELESS] + 2));
+    // both modes' bits are packed together, rounded up to whole bytes
+    u32 tot_state_bytes = (streamBits[CASEFUL] + streamBits[CASELESS] + 7) / 8;
+
+    u8 * secondaryTable = (u8 *)aligned_zmalloc(tabSize);
+    assert(secondaryTable); // otherwise would have thrown std::bad_alloc
+
+    // then fill it in
+    u8 * ptr = secondaryTable;
+    FDRSTableHeader * header = (FDRSTableHeader *)ptr;
+    // fill in header
+    header->pseudoEngineID = (u32)0xffffffff;
+    header->N = verify_u8(max_len); // u8 so doesn't matter; won't go > 255
+    for (u32 m = CASEFUL; m < MAX_MODES; ++m) {
+        header->boundary[m] = boundary[m];
+        header->hashOffset[m] = verify_u32(htOffset[m]);
+        header->hashNBits[m] = lg2(hashEntries[m]);
+        header->streamStateBits[m] = streamBits[m];
+    }
+    assert(tot_state_bytes < sizeof(u64a));
+    header->streamStateBytes = verify_u8(tot_state_bytes); // u8
+
+    ptr += headerSize;
+
+    // now fill in the rest
+
+    FDRSLiteral * litTabPtr = (FDRSLiteral *)ptr;
+    ptr += litTabSize;
+
+    // index of each literal in long_lits -> offset of its string in the table
+    map<u32, u32> litToOffsetVal;
+    for (vector<hwlmLiteral>::const_iterator i = long_lits.begin(),
+                                             e = long_lits.end();
+         i != e; ++i) {
+        u32 entry = verify_u32(i - long_lits.begin());
+        u32 offset = verify_u32(ptr - secondaryTable);
+
+        // point the table entry to the string location
+        litTabPtr[entry].offset = offset;
+
+        litToOffsetVal[entry] = offset;
+
+        // copy the string into the string location
+        memcpy(ptr, i->s.c_str(), i->s.size());
+
+        ptr += i->s.size(); // and the string location
+    }
+
+    // fill in final lit table entry with current ptr (serves as end value)
+    litTabPtr[long_lits.size()].offset = verify_u32(ptr - secondaryTable);
+
+    // fill hash tables
+    ptr = secondaryTable + htOffset[CASEFUL];
+    for (u32 m = CASEFUL; m < MAX_MODES; ++m) {
+        fillHashes(long_lits, max_len, (FDRSHashEntry *)ptr, hashEntries[m],
+                   (MODES)m, litToOffsetVal);
+        ptr += htSize[m];
+    }
+
+    // tell the world what we did
+    stream_control->literal_history_required = max_len;
+    stream_control->literal_stream_state_required = tot_state_bytes;
+    return make_pair(secondaryTable, tabSize);
+}
+
+} // namespace ue2
--- a/src/fdr/fdr_streaming_internal.h
+++ b/src/fdr/fdr_streaming_internal.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_STREAMING_INTERNAL_H
+#define FDR_STREAMING_INTERNAL_H
+
+#include "ue2common.h"
+#include "fdr_internal.h"
+#include "util/unaligned.h"
+
+// tertiary table:
+// a header (FDRSTableHeader)
+// long_lits.size()+1 entries holding an offset to the string in the
+//       'whole literal table' (FDRSLiteral structure)
+// the whole literal table - every string packed in (freeform)
+// hash table (caseful) (FDRSHashEntry)
+// hash table (caseless) (FDRSHashEntry)
+
+// Case-sensitivity modes. The values are used as indices into the per-mode
+// arrays of FDRSTableHeader; MAX_MODES is the number of modes.
+typedef enum {
+    CASEFUL = 0,
+    CASELESS = 1,
+    MAX_MODES = 2
+} MODES;
+
+// We have one of these structures hanging off the 'link' of our secondary
+// FDR table that handles streaming strings
+struct FDRSTableHeader {
+    u32 pseudoEngineID; // set to 0xffffffff to indicate this isn't an FDR
+
+    // string id one beyond the maximum entry for this type of literal
+    // boundary[CASEFUL] is the end of the caseful literals
+    // boundary[CASELESS] is the end of the caseless literals and one beyond
+    // the largest literal id (the size of the littab)
+    u32 boundary[MAX_MODES];
+
+    // offsets are 0 if no such table exists
+    // offset from the base of the tertiary structure to the hash table
+    u32 hashOffset[MAX_MODES];
+    u32 hashNBits[MAX_MODES]; // lg2 of the size of the hash table
+
+    // number of bits each mode occupies in the packed stream state
+    u8 streamStateBits[MAX_MODES];
+    u8 streamStateBytes; // total size of packed stream state in bytes
+    u8 N; // prefix lengths
+    u16 pad; // not referenced by the code in this header
+};
+
+// One of these structures per literal entry in our secondary FDR table.
+struct FDRSLiteral {
+    // offset of the literal's string, measured from the start of the
+    // FDRSTableHeader
+    u32 offset;
+    // potentially - another u32 to point to the 'next lesser included literal'
+    // which would be a literal that overlaps this one in such a way that a
+    // failure to match _this_ literal can leave us in a state that we might
+    // still match that literal. Offset information might also be called for,
+    // in which case we might be wanting to use a FDRSLiteralOffset
+};
+
+// A position within the streaming table, as stored in FDRSHashEntry::state
+typedef u32 FDRSLiteralOffset;
+
+// 'link' value that marks the end of a hash chain
+#define LINK_INVALID 0xffffffff
+
+// One of these structures per hash table entry in our secondary FDR table
+struct FDRSHashEntry {
+    u64a bitfield; // one bit per secondary hash value present in this bucket
+    FDRSLiteralOffset state; // state value to confirm for this entry
+    u32 link; // index of the next entry in the chain, or LINK_INVALID
+};
+
+// Index of the first literal table entry belonging to mode m: zero for the
+// caseful mode, otherwise the boundary of the preceding mode.
+static really_inline
+u32 get_start_lit_idx(const struct FDRSTableHeader * h, MODES m) {
+    if (m == CASEFUL) {
+        return 0;
+    }
+    return h->boundary[m-1];
+}
+
+// Index one past the last literal table entry belonging to mode m.
+static really_inline
+u32 get_end_lit_idx(const struct FDRSTableHeader * h, MODES m) {
+    const u32 end_idx = h->boundary[m];
+    return end_idx;
+}
+
+// The literal table starts immediately after the header, which is padded up
+// to a multiple of 16 bytes.
+static really_inline
+const struct FDRSLiteral * getLitTab(const struct FDRSTableHeader * h) {
+    const u8 *base = (const u8 *)h;
+    const u8 *lit_tab = base + ROUNDUP_16(sizeof(struct FDRSTableHeader));
+    return (const struct FDRSLiteral *)lit_tab;
+}
+
+// Table offset of the first literal string belonging to mode m.
+static really_inline
+u32 getBaseOffsetOfLits(const struct FDRSTableHeader * h, MODES m) {
+    const struct FDRSLiteral *lit_tab = getLitTab(h);
+    const u32 first_idx = get_start_lit_idx(h, m);
+    return lit_tab[first_idx].offset;
+}
+
+// Converts a table offset into the value kept in stream state: the offset is
+// made relative to the mode's first literal and shifted up by one.
+static really_inline
+u32 packStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) {
+    const u32 base = getBaseOffsetOfLits(h, m);
+    return v - base + 1;
+}
+
+// Inverse of packStateVal: turns a stored stream state value back into an
+// offset within the table.
+static really_inline
+u32 unpackStateVal(const struct FDRSTableHeader * h, MODES m, u32 v) {
+    const u32 base = getBaseOffsetOfLits(h, m);
+    return v + base - 1;
+}
+
+// Returns 1 if bit number 'bit' of the entry's bitfield is set, else 0.
+static really_inline
+u32 has_bit(const struct FDRSHashEntry * ent, u32 bit) {
+    const u64a shifted = ent->bitfield >> bit;
+    return (u32)(shifted & 1ULL);
+}
+
+// Hashes the 24 bytes starting at ptr, read as three unaligned 64-bit words.
+// In CASELESS mode bit 0x20 of every byte is cleared first. Each word is
+// multiplied by a different power of MULTIPLIER, the top 32 bits of each
+// product are kept, and the three results are XORed together. 'len' is only
+// checked by the assert.
+static really_inline
+u32 streaming_hash(const u8 *ptr, UNUSED size_t len, MODES mode) {
+    const u64a CASEMASK = 0xdfdfdfdfdfdfdfdfULL;
+    const u64a MULTIPLIER = 0x0b4e0ef37bc32127ULL;
+    const u64a MULTIPLIER_SQ = MULTIPLIER * MULTIPLIER;
+    const u64a MULTIPLIER_CUBE = MULTIPLIER_SQ * MULTIPLIER;
+    assert(len >= 32);
+
+    u64a w0 = unaligned_load_u64a(ptr);
+    u64a w1 = unaligned_load_u64a(ptr + 8);
+    u64a w2 = unaligned_load_u64a(ptr + 16);
+    if (mode == CASELESS) {
+        w0 &= CASEMASK;
+        w1 &= CASEMASK;
+        w2 &= CASEMASK;
+    }
+
+    w0 = (w0 * MULTIPLIER) >> 32;
+    w1 = (w1 * MULTIPLIER_SQ) >> 32;
+    w2 = (w2 * MULTIPLIER_CUBE) >> 32;
+    return (u32)(w0 ^ w1 ^ w2);
+}
+
+#endif
--- a/src/fdr/fdr_streaming_runtime.h
+++ b/src/fdr/fdr_streaming_runtime.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_STREAMING_RUNTIME_H
+#define FDR_STREAMING_RUNTIME_H
+
+#include "fdr_streaming_internal.h"
+#include "util/partial_store.h"
+
+// Returns the streaming table header located 'fdr->link' bytes past 'fdr'.
+// In assert-enabled builds this also checks that the first u32 found there is
+// the pseudo engine id (0xffffffff) that marks a streaming table.
+static really_inline
+const struct FDRSTableHeader * getSHDR(const struct FDR * fdr) {
+    const u8 * linkPtr = ((const u8 *)fdr) + fdr->link;
+    // check the pointer before the id test below dereferences it
+    assert(linkPtr);
+    // test if it's not really a engineID, but a 'pseudo engine id'
+    assert(*(const u32 *)linkPtr == 0xffffffff);
+    return (const struct FDRSTableHeader *)linkPtr;
+}
+
+// Loads the packed stream state and splits it into table[CASEFUL] (the low
+// streamStateBits[CASEFUL] bits) and table[CASELESS] (the bits above those).
+static really_inline
+void getStreamStates(const struct FDRSTableHeader * streamingTable,
+                     const u8 * stream_state, u32 * table) {
+    assert(streamingTable);
+    assert(stream_state);
+    assert(table);
+
+    const u8 total_bytes = streamingTable->streamStateBytes;
+    const u8 caseful_bits = streamingTable->streamStateBits[CASEFUL];
+    UNUSED const u8 caseless_bits = streamingTable->streamStateBits[CASELESS];
+    assert(total_bytes == (caseful_bits + caseless_bits + 7) / 8);
+
+#if defined(ARCH_32_BIT)
+    // When the whole state fits in 32 bits, 32-bit hosts can skip u64a
+    // arithmetic entirely.
+    if (total_bytes <= 4) {
+        const u32 packed32 = partial_load_u32(stream_state, total_bytes);
+        const u32 caseful_mask32 = (1U << caseful_bits) - 1;
+        table[CASEFUL] = (u32)(packed32 & caseful_mask32);
+        table[CASELESS] = (u32)(packed32 >> caseful_bits);
+        return;
+    }
+#endif
+
+    const u64a packed = partial_load_u64a(stream_state, total_bytes);
+    const u64a caseful_mask = (1ULL << caseful_bits) - 1;
+    table[CASEFUL] = (u32)(packed & caseful_mask);
+    table[CASELESS] = (u32)(packed >> caseful_bits);
+}
+
+#ifndef NDEBUG
+// Debug-only sanity check (used in an assert): returns 1 if either table value
+// has bits set beyond the width allotted to its mode, 0 otherwise.
+static really_inline UNUSED
+u32 streamingTableOverflow(u32 * table, u8 ssb, u8 ssb_nc) {
+    const u32 caseful_mask = (u32)((1ULL << ssb) - 1);
+    const u32 caseless_mask = (u32)((1ULL << ssb_nc) - 1);
+    const u32 caseful_excess = table[CASEFUL] & ~caseful_mask;
+    const u32 caseless_excess = table[CASELESS] & ~caseless_mask;
+    return (caseful_excess || caseless_excess) ? 1 : 0;
+}
+#endif
+
+// Packs table[CASEFUL] and table[CASELESS] into a single value (caseful in the
+// low bits, caseless directly above) and stores it to the stream state.
+static really_inline
+void setStreamStates(const struct FDRSTableHeader * streamingTable,
+                     u8 * stream_state, u32 * table) {
+    assert(streamingTable);
+    assert(stream_state);
+    assert(table);
+
+    const u8 total_bytes = streamingTable->streamStateBytes;
+    const u8 caseful_bits = streamingTable->streamStateBits[CASEFUL];
+    UNUSED const u8 caseless_bits = streamingTable->streamStateBits[CASELESS];
+    assert(total_bytes == (caseful_bits + caseless_bits + 7) / 8);
+    assert(!streamingTableOverflow(table, caseful_bits, caseless_bits));
+
+#if defined(ARCH_32_BIT)
+    // When the whole state fits in 32 bits, 32-bit hosts can skip u64a
+    // arithmetic entirely.
+    if (total_bytes <= 4) {
+        const u32 packed32 = table[CASEFUL]
+                           | (table[CASELESS] << caseful_bits);
+        partial_store_u32(stream_state, packed32, total_bytes);
+        return;
+    }
+#endif
+
+    const u64a packed = (u64a)table[CASEFUL]
+                      | ((u64a)table[CASELESS] << caseful_bits);
+    partial_store_u64a(stream_state, packed, total_bytes);
+}
+
+// Returns 1 if any byte of the packed stream state is non-zero, 0 otherwise.
+// A NULL stream_state is reported as inactive.
+u32 fdrStreamStateActive(const struct FDR * fdr, const u8 * stream_state) {
+    if (!stream_state) {
+        return 0;
+    }
+    const u8 num_bytes = getSHDR(fdr)->streamStateBytes;
+
+    // Only "is any bit set" matters here, and a byte-wise scan is faster than
+    // a partial_load_u64a (especially on 32-bit hosts).
+    for (u32 idx = 0; idx < num_bytes; idx++) {
+        if (stream_state[idx]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// Binary search for the index of the literal table entry, within mode m's
+// range, whose string contains the byte just before stateValue.
+static really_inline
+u32 findLitTabEntry(const struct FDRSTableHeader * streamingTable,
+                    u32 stateValue, MODES m) {
+    const struct FDRSLiteral * litTab = getLitTab(streamingTable);
+    u32 lo = get_start_lit_idx(streamingTable, m);
+    u32 hi = get_end_lit_idx(streamingTable, m);
+
+    // Now move stateValue back by one so that we're looking for the litTab
+    // entry whose string includes it, not the entry 'one past' it
+    stateValue -= 1;
+    assert(lo != hi);
+    assert(litTab[lo].offset <= stateValue);
+    assert(litTab[hi].offset > stateValue);
+
+    // binary search to find the entry e such that:
+    // litTab[e].offset <= stateValue < litTab[e+1].offset
+    while (lo + 1 < hi) {
+        // midpoint computed from the gap so that lo + hi cannot wrap in u32
+        u32 mid = lo + (hi - lo) / 2;
+        if (litTab[mid].offset <= stateValue) {
+            lo = mid;
+        } else { //(litTab[mid].offset > stateValue) {
+            hi = mid;
+        }
+    }
+    assert(litTab[lo].offset <= stateValue);
+    assert(litTab[hi].offset > stateValue);
+    return lo;
+}
+
+// If mode m has a non-zero saved state, points that mode's history buffer in
+// 'a' at the literal-table bytes running from the start of the containing
+// literal up to the saved position.
+static really_inline
+void fdrUnpackStateMode(struct FDR_Runtime_Args *a,
+                        const struct FDRSTableHeader *streamingTable,
+                        const struct FDRSLiteral * litTab,
+                        const u32 *state_table,
+                        const MODES m) {
+    const u32 packed = state_table[m];
+    if (!packed) {
+        return;
+    }
+
+    const u32 stateValue = unpackStateVal(streamingTable, m, packed);
+    const u32 idx = findLitTabEntry(streamingTable, stateValue, m);
+    const size_t lit_start = litTab[idx].offset;
+    const u8 * hist_buf = (const u8 *)streamingTable + lit_start;
+    const size_t hist_len = stateValue - lit_start;
+
+    if (m != CASEFUL) {
+        a->buf_history_nocase = hist_buf;
+        a->len_history_nocase = hist_len;
+        return;
+    }
+    a->buf_history = hist_buf;
+    a->len_history = hist_len;
+}
+
+// Reads the packed stream state and applies it to the runtime args for both
+// the caseful and the caseless mode. Does nothing when stream_state is NULL.
+static really_inline
+void fdrUnpackState(const struct FDR * fdr, struct FDR_Runtime_Args * a,
+                    const u8 * stream_state) {
+    if (!stream_state) {
+        return; // no stream state, so nothing to restore
+    }
+
+    const struct FDRSTableHeader * hdr = getSHDR(fdr);
+
+    u32 unpacked[MAX_MODES];
+    getStreamStates(hdr, stream_state, unpacked);
+
+    const struct FDRSLiteral * lits = getLitTab(hdr);
+    fdrUnpackStateMode(a, hdr, lits, unpacked, CASEFUL);
+    fdrUnpackStateMode(a, hdr, lits, unpacked, CASELESS);
+}
+
+// Checks one candidate state against the data seen so far.
+//
+// hashState is a position inside a literal's string in the streaming table.
+// The bytes from the start of that literal up to hashState must equal the most
+// recent bytes of the scan: the tail of the current buffer, preceded by the
+// tail of the mode's history buffer when the buffer alone is too short.
+// Returns hashState when the comparison succeeds and 0 otherwise.
+static really_inline
+u32 do_single_confirm(const struct FDRSTableHeader * streamingTable,
+                      const struct FDR_Runtime_Args * a, u32 hashState, MODES m) {
+    const struct FDRSLiteral * litTab = getLitTab(streamingTable);
+    u32 idx = findLitTabEntry(streamingTable, hashState, m);
+    size_t found_offset = litTab[idx].offset;
+    // s1/l1: the portion of the literal that has to match
+    const u8 * s1 = found_offset + (const u8 *)streamingTable;
+    assert(hashState > found_offset);
+    size_t l1 = hashState - found_offset;
+    const u8 * buf = a->buf;
+    size_t len = a->len;
+    const char nocase = m != CASEFUL;
+
+    if (l1 > len) {
+        // the literal portion is longer than the buffer, so its leading bytes
+        // have to be checked against the end of this mode's history
+        const u8 * hist = nocase ? a->buf_history_nocase : a->buf_history;
+        size_t hist_len = nocase ? a->len_history_nocase : a->len_history;
+
+        if (l1 > len+hist_len) {
+            return 0; // Break out - not enough total history
+        }
+
+        size_t overhang = l1 - len;
+        assert(overhang <= hist_len);
+
+        // a non-zero result from cmpForward is treated as a mismatch
+        if (cmpForward(hist + hist_len - overhang, s1, overhang, nocase)) {
+            return 0;
+        }
+        // the history part matched; only the remainder is left to compare
+        s1 += overhang;
+        l1 -= overhang;
+    }
+    // if we got here, we don't need history or we compared ok out of history
+    assert(l1 <= len);
+
+    if (cmpForward(buf + len - l1, s1, l1, nocase)) {
+        return 0;
+    }
+    return hashState; // our new state
+}
+
+// Computes the streaming hash of the last hash_len bytes seen (history tail
+// followed by the current buffer) for every mode that has a hash table, and
+// writes the results to hashes[]. Entries for modes without a table are left
+// untouched.
+static really_inline
+void fdrFindStreamingHash(const struct FDR_Runtime_Args *a,
+                          const struct FDRSTableHeader *streamingTable,
+                          u8 hash_len, u32 *hashes) {
+    // NOTE(review): hash_len is a u8 but this buffer holds 128 bytes; the
+    // assert below is the only guard on that limit.
+    u8 tempbuf[128];
+    const u8 *base;
+    if (hash_len <= a->len) {
+        // the current buffer alone covers the window
+        base = a->buf + a->len - hash_len;
+    } else {
+        // stitch the end of history and the whole buffer into one window
+        assert(hash_len <= 128);
+        const size_t from_history = hash_len - a->len;
+        assert(from_history <= a->len_history);
+        memcpy(tempbuf, a->buf_history + a->len_history - from_history,
+               from_history);
+        memcpy(tempbuf + from_history, a->buf, a->len);
+        base = tempbuf;
+    }
+
+    for (u32 m = CASEFUL; m < MAX_MODES; ++m) {
+        if (streamingTable->hashNBits[m]) {
+            hashes[m] = streaming_hash(base, hash_len, (MODES)m);
+        }
+    }
+}
+
+// Looks up hash h in mode m's hash table. The low hashNBits[m] bits of h pick
+// the bucket and the next six bits pick the bit that must be set in that
+// bucket's bitfield. Returns the bucket entry, or NULL if the mode has no hash
+// table or the bit is not set.
+static really_inline
+const struct FDRSHashEntry *getEnt(const struct FDRSTableHeader *streamingTable,
+                                   u32 h, const MODES m) {
+    u32 nbits = streamingTable->hashNBits[m];
+    if (!nbits) {
+        return NULL;
+    }
+
+    // unsigned shift, matching the mask used when the table was filled; a
+    // signed '1 << nbits' is undefined for nbits == 31
+    u32 h_ent = h & ((1U << nbits) - 1);
+    u32 h_low = (h >> nbits) & 63;
+
+    const struct FDRSHashEntry *tab =
+        (const struct FDRSHashEntry *)((const u8 *)streamingTable
+                                       + streamingTable->hashOffset[m]);
+    const struct FDRSHashEntry *ent = tab + h_ent;
+
+    if (!has_bit(ent, h_low)) {
+        return NULL;
+    }
+
+    return ent;
+}
+
+// Walks the hash chain starting at 'ent' and records in state_table[m] the
+// packed form of the first entry whose state confirms against the scanned
+// data. state_table[m] is left unchanged if no entry in the chain confirms.
+static really_inline
+void fdrPackStateMode(u32 *state_table, const struct FDR_Runtime_Args *a,
+                      const struct FDRSTableHeader *streamingTable,
+                      const struct FDRSHashEntry *ent, const MODES m) {
+    assert(ent);
+    assert(streamingTable->hashNBits[m]);
+
+    const u8 *table_base = (const u8 *)streamingTable;
+    const struct FDRSHashEntry *tab =
+        (const struct FDRSHashEntry *)(table_base
+                                       + streamingTable->hashOffset[m]);
+
+    for (;;) {
+        const u32 confirmed =
+            do_single_confirm(streamingTable, a, ent->state, m);
+        if (confirmed) {
+            state_table[m] = packStateVal(streamingTable, m, confirmed);
+            return;
+        }
+        if (ent->link == LINK_INVALID) {
+            return; // end of chain
+        }
+        ent = tab + ent->link;
+    }
+}
+
+// Computes the stream state for both modes from the data just scanned and
+// writes it to stream_state. Does nothing when stream_state is NULL. When
+// fewer than N bytes of buffer plus history are available, both modes are
+// stored as zero.
+static really_inline
+void fdrPackState(const struct FDR *fdr, const struct FDR_Runtime_Args *a,
+                  u8 *stream_state) {
+    if (!stream_state) {
+        return; // no stream state to write
+    }
+
+    // the streaming table hangs off the FDR structure
+    const struct FDRSTableHeader *hdr = getSHDR(fdr);
+
+    assert(hdr->N);
+
+    u32 packed[MAX_MODES] = {0, 0};
+
+    // hashing needs at least N bytes across buffer and history
+    if (hdr->N <= a->len + a->len_history) {
+        u32 hashes[MAX_MODES] = {0, 0};
+
+        fdrFindStreamingHash(a, hdr, hdr->N, hashes);
+
+        const struct FDRSHashEntry *caseful_ent =
+            getEnt(hdr, hashes[CASEFUL], CASEFUL);
+        if (caseful_ent) {
+            fdrPackStateMode(packed, a, hdr, caseful_ent, CASEFUL);
+        }
+
+        const struct FDRSHashEntry *caseless_ent =
+            getEnt(hdr, hashes[CASELESS], CASELESS);
+        if (caseless_ent) {
+            fdrPackStateMode(packed, a, hdr, caseless_ent, CASELESS);
+        }
+    }
+
+    setStreamStates(hdr, stream_state, packed);
+}
+
+#endif
--- a/src/fdr/flood_compile.cpp
+++ b/src/fdr/flood_compile.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_confirm.h"
+#include "fdr_compile_internal.h"
+#include "fdr_engine_description.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/charreach.h"
+#include "util/compare.h"
+#include "util/ue2string.h"
+#include "util/verify_types.h"
+
+#include <cstring>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace ue2 {
+
+namespace {
+// Strict weak ordering over FDRFlood records based on their raw bytes.
+struct FloodComparator {
+    bool operator()(const FDRFlood &f1, const FDRFlood &f2) const {
+        const int byte_order = std::memcmp(&f1, &f2, sizeof(FDRFlood));
+        return byte_order < 0;
+    }
+};
+}
+
+// Returns true if the two characters differ; when 'caseless' is set they are
+// compared after lower-casing both.
+static
+bool isDifferent(u8 oldC, u8 c, bool caseless) {
+    return caseless ? mytolower(oldC) != mytolower(c) : oldC != c;
+}
+
+// Raises the flood suffix recorded for character c to at least suffix + 1.
+static
+void updateFloodSuffix(vector<FDRFlood> &tmpFlood, u8 c, u32 suffix) {
+    FDRFlood &entry = tmpFlood[c];
+    entry.suffix = MAX(entry.suffix, suffix + 1);
+    DEBUG_PRINTF("Updated Flood Suffix for char '%c' to %u\n", c,
+                 entry.suffix);
+}
+
+// Raises the flood suffix for character c to at least suffix + 1 and, while
+// fewer than FDR_FLOOD_MAX_IDS ids are recorded for c, appends the literal's
+// id, groups and suffix length to that character's flood record.
+static
+void addFlood(vector<FDRFlood> &tmpFlood, u8 c, const hwlmLiteral &lit,
+              u32 suffix) {
+    FDRFlood &entry = tmpFlood[c];
+    entry.suffix = MAX(entry.suffix, suffix + 1);
+    if (entry.idCount >= FDR_FLOOD_MAX_IDS) {
+        // record is full: idCount stops growing and nothing more is stored
+        return;
+    }
+
+    entry.ids[entry.idCount] = lit.id;
+    entry.allGroups |= lit.groups;
+    entry.groups[entry.idCount] = lit.groups;
+    entry.len[entry.idCount] = suffix;
+    DEBUG_PRINTF("Added Flood for char '%c' suffix=%u len[%hu]=%u\n",
+                 c, entry.suffix, entry.idCount, suffix);
+    entry.idCount++;
+}
+
+// Builds the flood control structure for 'lits'.
+//
+// For each literal, the run of its final character at the end of the literal
+// (also taking the literal's mask into account) is measured. Literals made up
+// entirely of that character are added to the character's flood record;
+// otherwise only the character's suffix length is raised. The result is a
+// buffer holding N_CHARS u32 indices (one per character) followed by the
+// distinct FDRFlood records those indices refer to.
+//
+// Returns the buffer (allocated with aligned_zmalloc) and its size in bytes.
+pair<u8 *, size_t> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
+                                        const EngineDescription &eng) {
+    vector<FDRFlood> tmpFlood(N_CHARS);
+    u32 default_suffix = eng.getDefaultFloodSuffixLength();
+
+    // zero everything to avoid spurious distinctions in the compares
+    memset(&tmpFlood[0], 0, N_CHARS * sizeof(FDRFlood));
+
+    for (u32 c = 0; c < N_CHARS; c++) {
+        tmpFlood[c].suffix = default_suffix;
+    }
+
+    for (const auto &lit : lits) {
+        DEBUG_PRINTF("lit: '%s'%s\n", escapeString(lit.s).c_str(),
+                     lit.nocase ? " (nocase)" : "");
+        u32 litSize = verify_u32(lit.s.size());
+        u32 maskSize = (u32)lit.msk.size();
+        // NOTE(review): indexing with litSize - 1 assumes a non-empty literal
+        u8 c = lit.s[litSize - 1];
+        // caselessness only matters when the final character is alphabetic
+        bool nocase = ourisalpha(c) ? lit.nocase : false;
+
+        // if the mask fixes the case of the final character, use that exact
+        // case and treat the literal as case-sensitive from here on
+        if (nocase && maskSize && (lit.msk[maskSize - 1] & CASE_BIT)) {
+            c = (lit.cmp[maskSize - 1] & CASE_BIT) ? mytolower(c) : mytoupper(c);
+            nocase = false;
+        }
+
+        u32 iEnd = MAX(litSize, maskSize);
+        u32 upSuffix = iEnd; // upSuffix is used as an upper case suffix length
+                             // for case-less, or as a suffix length for case-sensitive;
+        u32 loSuffix = iEnd; // loSuffix used only for case-less as a lower case suffix
+                             // length;
+
+        // walk backwards from the end of the literal / mask; i is the distance
+        // from the end
+        for (u32 i = 0; i < iEnd; i++) {
+            if (i < litSize) {
+                if (isDifferent(c, lit.s[litSize - i - 1], lit.nocase)) {
+                    DEBUG_PRINTF("non-flood char in literal[%u] %c != %c\n",
+                                                i, c, lit.s[litSize - i - 1]);
+                    upSuffix = MIN(upSuffix, i);
+                    loSuffix = MIN(loSuffix, i); // makes sense only for case-less
+                    break;
+                }
+            }
+            if (i < maskSize) {
+                u8 m = lit.msk[maskSize - i - 1];
+                u8 cm = lit.cmp[maskSize - i - 1] & m;
+                if(nocase) {
+                    // the upper and lower case forms are tracked separately
+                    if ((mytoupper(c) & m) != cm) {
+                        DEBUG_PRINTF("non-flood char in mask[%u] %c != %c\n",
+                                                            i, mytoupper(c), cm);
+                        upSuffix = MIN(upSuffix, i);
+                    }
+                    if ((mytolower(c) & m) != cm) {
+                        DEBUG_PRINTF("non-flood char in mask[%u] %c != %c\n",
+                                                            i, mytolower(c), cm);
+                        loSuffix = MIN(loSuffix, i);
+                    }
+                    // stop once both forms have been cut short
+                    if (loSuffix != iEnd && upSuffix != iEnd) {
+                        break;
+                    }
+                } else if ((c & m) != cm) {
+                    DEBUG_PRINTF("non-flood char in mask[%u] %c != %c\n", i, c, cm);
+                    upSuffix = MIN(upSuffix, i);
+                    break;
+                }
+            }
+        }
+        // a suffix still equal to iEnd means no non-flood position was found
+        if(upSuffix != iEnd) {
+            updateFloodSuffix(tmpFlood, nocase ? mytoupper(c) : c, upSuffix);
+        } else {
+            addFlood(tmpFlood, nocase ? mytoupper(c) : c, lit, upSuffix);
+        }
+        if (nocase) {
+            if(loSuffix != iEnd) {
+                updateFloodSuffix(tmpFlood, mytolower(c), loSuffix);
+            } else {
+                addFlood(tmpFlood, mytolower(c), lit, loSuffix);
+            }
+        }
+    }
+
+#ifdef DEBUG
+    for (u32 i = 0; i < N_CHARS; i++) {
+        FDRFlood &fl = tmpFlood[i];
+        if (!fl.idCount) {
+            continue;
+        }
+
+        printf("i is %02x fl->idCount is %hd fl->suffix is %d fl->allGroups is "
+               "%016llx\n", i, fl.idCount, fl.suffix, fl.allGroups);
+        for (u32 j = 0; j < fl.idCount; j++) {
+            printf("j is %d fl.groups[j] %016llx fl.len[j] %d \n", j,
+                   fl.groups[j], fl.len[j]);
+        }
+    }
+#endif
+
+    // group characters whose flood records are byte-for-byte identical so that
+    // each distinct record is stored only once
+    map<FDRFlood, CharReach, FloodComparator> flood2chars;
+    for (u32 i = 0; i < N_CHARS; i++) {
+        FDRFlood fl = tmpFlood[i];
+        flood2chars[fl].set(i);
+    }
+
+    u32 nDistinctFloods = flood2chars.size();
+    size_t floodHeaderSize = sizeof(u32) * N_CHARS;
+    size_t floodStructSize = sizeof(FDRFlood) * nDistinctFloods;
+    size_t totalSize = ROUNDUP_16(floodHeaderSize + floodStructSize);
+    u8 *buf = (u8 *)aligned_zmalloc(totalSize);
+    assert(buf); // otherwise would have thrown std::bad_alloc
+
+    // per-character index table first, flood records directly after it
+    u32 *floodHeader = (u32 *)buf;
+    FDRFlood *layoutFlood = (FDRFlood * )(buf + floodHeaderSize);
+
+    u32 currentFloodIndex = 0;
+    for (const auto &m : flood2chars) {
+        const FDRFlood &fl = m.first;
+        const CharReach &cr = m.second;
+        layoutFlood[currentFloodIndex] = fl;
+        // every character sharing this record points at the same index
+        for (size_t c = cr.find_first(); c != cr.npos; c = cr.find_next(c)) {
+            floodHeader[c] = currentFloodIndex;
+        }
+        currentFloodIndex++;
+    }
+
+    DEBUG_PRINTF("made a flood structure with %zu + %zu = %zu\n",
+                 floodHeaderSize, floodStructSize, totalSize);
+
+    return make_pair((u8 *)buf, totalSize);
+}
+
+} // namespace ue2
--- a/src/fdr/flood_runtime.h
+++ b/src/fdr/flood_runtime.h
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLOOD_RUNTIME
+#define FLOOD_RUNTIME
+
+// Select the word width used by the flood probes below: 64-bit words when
+// ARCH_64_BIT is defined, 32-bit words otherwise. The code in this header only
+// ever tests FLOOD_32 (via #ifndef / #else); FLOOD_64 itself is not tested here.
+#if defined(ARCH_64_BIT)
+#define FLOOD_64
+#else
+#define FLOOD_32
+#endif
+// Buffers shorter than this many bytes are never probed for floods
+// (see the length check at the top of nextFloodDetect).
+#define FLOOD_MINIMUM_SIZE 256
+// NOTE(review): presumably the initial flood backoff distance, in bytes, that
+// callers pass as floodBackoff -- it is not referenced in this header; confirm
+// against the callers.
+#define FLOOD_BACKOFF_START 32
+
+/*
+ * Cheap heuristic that picks the position of the first flood check.
+ *
+ * Three pairs of adjacent, aligned machine words are compared: one pair near
+ * the start of the buffer, one near the middle and one near the end. If any
+ * pair holds identical words the buffer may contain a flood and the function
+ * returns buf + floodBackoff. Otherwise (or when the buffer is shorter than
+ * FLOOD_MINIMUM_SIZE) it returns buf + len.
+ */
+static really_inline
+const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
+    // very small buffers are not worth probing at all
+    if (len < FLOOD_MINIMUM_SIZE) {
+        return buf + len;
+    }
+
+    /* entry points in runtime.c prefetch relevant data */
+#ifndef FLOOD_32
+    typedef u64a flood_word;
+#else
+    typedef u32 flood_word;
+#endif
+    enum { FLOOD_WORD_BYTES = sizeof(flood_word) };
+
+    // byte offsets of the three probes: start, middle, and three words before
+    // the end (so that the rounded-up second word still ends inside the buffer)
+    const size_t probeOffset[3] = { 0, len / 2, len - 3 * FLOOD_WORD_BYTES };
+
+    for (u32 k = 0; k < 3; k++) {
+        const u8 *at = buf + probeOffset[k];
+        flood_word first =
+            *(const flood_word *)ROUNDUP_PTR(at, FLOOD_WORD_BYTES);
+        flood_word second =
+            *(const flood_word *)ROUNDUP_PTR(at + FLOOD_WORD_BYTES,
+                                             FLOOD_WORD_BYTES);
+        if (first == second) {
+            return buf + floodBackoff;
+        }
+    }
+
+    return buf + len;
+}
+
+/*
+ * Checks whether the current scan position (*ptrPtr) lies inside a "flood",
+ * i.e. a run of one repeated byte, and if so handles the run in bulk.
+ *
+ * The byte at the current position selects an FDRFlood entry through the
+ * per-byte index table stored at fdr->floodOffset. When a run is confirmed,
+ * the callback a->cb is invoked for every position of the run that is skipped
+ * and for every id of that entry whose group bit is still set in *control;
+ * *control is replaced by each callback's return value.
+ *
+ * fdr             engine bytecode; supplies floodOffset
+ * a               runtime arguments; buf, len, cb and ctxt are read
+ * ptrPtr          in/out: current scan position, advanced past the handled
+ *                 part of the flood (a multiple of iterBytes)
+ * tryFloodDetect  position that triggered this check (only logged on entry)
+ * floodBackoffPtr in/out: backoff distance, doubled whenever no flood could
+ *                 be used
+ * control         in/out: group mask checked before / updated by callbacks
+ * iterBytes       bytes consumed per main-loop iteration of the caller
+ *
+ * Returns the position at which the next flood check should be attempted.
+ */
+static really_inline
+const u8 * floodDetect(const struct FDR * fdr,
+                       const struct FDR_Runtime_Args * a,
+                       const u8 ** ptrPtr,
+                       const u8 * tryFloodDetect,
+                       u32 * floodBackoffPtr,
+                       hwlmcb_rv_t * control,
+                       u32 iterBytes) {
+    DEBUG_PRINTF("attempting flood detection at %p\n", tryFloodDetect);
+    const u8 * buf = a->buf;
+    const size_t len = a->len;
+    HWLMCallback cb = a->cb;
+    void * ctxt = a->ctxt;
+
+    const u8 * ptr = *ptrPtr;
+    // tryFloodDetect is never put in places where unconditional
+    // reads a short distance forward or backward here
+    // TODO: rationale for this line needs to be rediscovered!!
+    // NOTE(review): the sentence above appears truncated; presumably it means
+    // such reads would be unsafe -- confirm with the original authors.
+    size_t mainLoopLen = len > iterBytes ? len - iterBytes : 0;
+    const u32 i = ptr - buf; // offset of the current position in buf
+    u32 j = i;               // scan cursor used to measure the run
+
+    // go from c to our FDRFlood structure
+    // Layout at fBase: 256 u32 indices (one per byte value) followed by the
+    // array of FDRFlood structures those indices refer to.
+    u8 c = buf[i];
+    const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
+    u32 fIdx = ((const u32 *)fBase)[c];
+    const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
+    const struct FDRFlood * fl = &fsb[fIdx];
+
+    // cmpVal is the byte c replicated into every byte of a machine word;
+    // probe is the aligned word at (or just after) the current position.
+#ifndef FLOOD_32
+    u64a cmpVal = c;
+    cmpVal |= cmpVal << 8;
+    cmpVal |= cmpVal << 16;
+    cmpVal |= cmpVal << 32;
+    u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8);
+#else
+    u32 cmpVal = c;
+    cmpVal |= cmpVal << 8;
+    cmpVal |= cmpVal << 16;
+    u32 probe = *(const u32 *)ROUNDUP_PTR(buf+i, 4);
+#endif
+
+    // Not a run of c here, or the entry has too many ids to handle: back off.
+    if ((probe != cmpVal) || (fl->idCount >= FDR_FLOOD_MAX_IDS)) {
+        *floodBackoffPtr *= 2;
+        goto floodout;
+    }
+
+    // Need at least suffix + 7 bytes before i so that stepping back by the
+    // suffix and then aligning downwards (up to 7 bytes) cannot underflow j.
+    if (i < fl->suffix + 7) {
+        *floodBackoffPtr *= 2;
+        goto floodout;
+    }
+
+    j = i - fl->suffix;
+
+    // Walk forward from the aligned start while the data is all c: first in
+    // blocks of four words, then single words, then single bytes. Every loop
+    // is bounded by mainLoopLen.
+#ifndef FLOOD_32
+    j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
+    for (; j + 32 < mainLoopLen; j += 32) {
+        u64a v = *(const u64a *)(buf + j);
+        u64a v2 = *(const u64a *)(buf + j + 8);
+        u64a v3 = *(const u64a *)(buf + j + 16);
+        u64a v4 = *(const u64a *)(buf + j + 24);
+        if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
+            break;
+        }
+    }
+    for (; j + 8 < mainLoopLen; j += 8) {
+        u64a v = *(const u64a *)(buf + j);
+        if (v != cmpVal) {
+            break;
+        }
+    }
+#else
+    j -= (u32)((size_t)buf + j) & 0x3; // push j back to yield 4-aligned addrs
+    for (; j + 16 < mainLoopLen; j += 16) {
+        u32 v = *(const u32 *)(buf + j);
+        u32 v2 = *(const u32 *)(buf + j + 4);
+        u32 v3 = *(const u32 *)(buf + j + 8);
+        u32 v4 = *(const u32 *)(buf + j + 12);
+        if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
+            break;
+        }
+    }
+    for (; j + 4 < mainLoopLen; j += 4) {
+        u32 v = *(const u32 *)(buf + j);
+        if (v != cmpVal) {
+            break;
+        }
+    }
+#endif
+    for (; j < mainLoopLen; j++) {
+        u8 v = *(const u8 *)(buf + j);
+        if (v != c) {
+            break;
+        }
+    }
+    // The run is only usable if it extends beyond the current position.
+    if (j > i ) {
+        j--; // needed for some reaches
+        // Only whole caller iterations are skipped, so floodSize is a
+        // multiple of iterBytes.
+        u32 itersAhead = (j-i)/iterBytes;
+        u32 floodSize = itersAhead*iterBytes;
+
+        DEBUG_PRINTF("flooding %u size j %u i %u fl->idCount %hu "
+                     "*control %016llx fl->allGroups %016llx\n",
+                     floodSize, j, i, fl->idCount, *control, fl->allGroups);
+        DEBUG_PRINTF("mainloopLen %zu mainStart ??? mainEnd ??? len %zu\n",
+                     mainLoopLen, len);
+
+        // Callbacks are only issued while at least one of the entry's groups
+        // is still enabled in *control. Each call passes
+        // (end - (len - 1), end, id, ctxt) for end positions i + t.
+        if (fl->idCount && (*control & fl->allGroups)) {
+            switch (fl->idCount) {
+#if !defined(FLOOD_DEBUG)
+            // Carefully unrolled code
+            // NOTE(review): the unrolled loops step t by 4 or 2 and emit that
+            // many positions per pass; this presumably relies on floodSize
+            // (a multiple of iterBytes) being a multiple of the step -- confirm.
+            case 1:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups);
+                     t += 4) {
+                    DEBUG_PRINTF("aaa %u %llx\n", t, fl->groups[0]);
+                    u32 len0 = fl->len[0] - 1;
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 0 - len0, i + t + 0, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 3 - len0, i + t + 3, fl->ids[0], ctxt);
+                    }
+                }
+                break;
+            case 2:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 4) {
+                    u32 len0 = fl->len[0] - 1;
+                    u32 len1 = fl->len[1] - 1;
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control =
+                            cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 2 - len0, i + t + 2, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 2 - len1, i + t + 2, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 3 - len0, i + t + 3, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 3 - len1, i + t + 3, fl->ids[1], ctxt);
+                    }
+                }
+                break;
+            case 3:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) {
+                    u32 len0 = fl->len[0] - 1;
+                    u32 len1 = fl->len[1] - 1;
+                    u32 len2 = fl->len[2] - 1;
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t - len2, i + t, fl->ids[2], ctxt);
+                    }
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt);
+                    }
+                }
+                break;
+            default:
+                // slow generalized loop
+                // (four or more ids: the first four are unrolled, the rest
+                // are handled by the inner t2 loops)
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) {
+                    u32 len0 = fl->len[0] - 1;
+                    u32 len1 = fl->len[1] - 1;
+                    u32 len2 = fl->len[2] - 1;
+                    u32 len3 = fl->len[3] - 1;
+
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t - len0, i + t, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t - len1, i + t, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t - len2, i + t, fl->ids[2], ctxt);
+                    }
+                    if (*control & fl->groups[3]) {
+                        *control = cb(i + t - len3, i + t, fl->ids[3], ctxt);
+                    }
+
+                    for (u32 t2 = 4; t2 < fl->idCount; t2++) {
+                        if (*control & fl->groups[t2]) {
+                            *control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt);
+                        }
+                    }
+
+                    if (*control & fl->groups[0]) {
+                        *control = cb(i + t + 1 - len0, i + t + 1, fl->ids[0], ctxt);
+                    }
+                    if (*control & fl->groups[1]) {
+                        *control = cb(i + t + 1 - len1, i + t + 1, fl->ids[1], ctxt);
+                    }
+                    if (*control & fl->groups[2]) {
+                        *control = cb(i + t + 1 - len2, i + t + 1, fl->ids[2], ctxt);
+                    }
+                    if (*control & fl->groups[3]) {
+                        *control = cb(i + t + 1 - len3, i + t + 1, fl->ids[3], ctxt);
+                    }
+
+                    for (u32 t2 = 4; t2 < fl->idCount; t2++) {
+                        if (*control & fl->groups[t2]) {
+                            *control = cb(i + t + 1 - (fl->len[t2] - 1), i + t + 1, fl->ids[t2], ctxt);
+                        }
+                    }
+                }
+                break;
+#else
+            // Fallback for debugging
+            // (one position per pass, all ids handled by a plain loop)
+            default:
+                for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t++) {
+                    for (u32 t2 = 0; t2 < fl->idCount; t2++) {
+                        if (*control & fl->groups[t2]) {
+                            *control = cb(i + t - (fl->len[t2] - 1), i + t, fl->ids[t2], ctxt);
+                        }
+                    }
+                }
+#endif
+            }
+        }
+        // Skip the handled part of the flood.
+        ptr += floodSize;
+    } else {
+        *floodBackoffPtr *= 2;
+    }
+
+floodout:
+    // Schedule the next check one backoff distance past the furthest point
+    // examined, unless that would land within the last 128 bytes of the main
+    // loop range.
+    // NOTE(review): mainLoopLen - 128 is an unsigned subtraction and would wrap
+    // if mainLoopLen < 128; presumably callers only get here with
+    // len >= FLOOD_MINIMUM_SIZE -- confirm.
+    if (j + *floodBackoffPtr < mainLoopLen - 128) {
+        tryFloodDetect = buf + MAX(i,j) + *floodBackoffPtr;
+    } else {
+        tryFloodDetect = buf + mainLoopLen; // set so we never do another flood detect
+    }
+    *ptrPtr = ptr;
+    DEBUG_PRINTF("finished flood detection at %p (next check %p)\n",
+                 ptr, tryFloodDetect);
+    return tryFloodDetect;
+}
+
+#endif
--- a/src/fdr/teddy.c
+++ b/src/fdr/teddy.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "util/simd_utils.h"
+#include "util/simd_utils_ssse3.h"
+
+// Lookup table for the 128-bit byte masks produced by vectoredLoad128.
+// Row n (0..16) is 32 bytes: 16 bytes of 0x00, then n bytes of 0xff, then
+// 0x00 padding. Reading 16 bytes at (row n) + 16 - s therefore yields a mask
+// with s leading 0x00 bytes, followed by n bytes of 0xff, followed by 0x00;
+// s == 0 gives a mask whose first n bytes are 0xff.
+static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+};
+
+// Note: p_mask is an output param that initialises a poison mask.
+//
+// Builds a 16-byte vector for the window starting at ptr, where [lo, hi) is
+// the readable buffer. Lanes that cannot be filled are left zero. *p_mask
+// receives 0xff in exactly the lanes that were copied from [lo, hi) and 0x00
+// elsewhere (including lanes filled from history).
+//
+// When the window starts before lo, up to (nMasks - 1) bytes immediately
+// preceding lo are taken from the tail of buf_history (limited by len_history
+// and by the distance lo - ptr).
+UNUSED static really_inline
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union {
+        u8 val8[16];
+        m128 val128;
+    } staged;
+    staged.val128 = zeroes128();
+
+    if (ptr < lo) {
+        // window begins before the buffer: history first, then buffer bytes
+        const u32 gap = (u32)(lo - ptr);
+        const u32 fromHistory = MIN(gap, MIN(len_history, nMasks - 1));
+        const u32 stop = MIN(16, (u32)(hi - ptr));
+
+        u32 k = gap - fromHistory;
+        while (ptr + k < lo) {
+            staged.val8[k] = buf_history[len_history - (lo - (ptr + k))];
+            k++;
+        }
+
+        *p_mask = loadu128((const void*)(p_mask_arr[stop - gap] + 16 - gap));
+
+        while (k < stop) {
+            staged.val8[k] = ptr[k];
+            k++;
+        }
+        return staged.val128;
+    }
+
+    // window begins inside the buffer
+    const u32 inBuf = (u32)(hi - ptr);
+    if (inBuf >= 16) {
+        // a full vector is available: plain unaligned load, all lanes valid
+        *p_mask = load128((const void*)(p_mask_arr[16] + 16));
+        return loadu128(ptr);
+    }
+
+    // partial tail: copy what is there, remaining lanes stay zero
+    *p_mask = load128((const void*)(p_mask_arr[inBuf] + 16));
+    for (u32 k = 0; k != inBuf; k++) {
+        staged.val8[k] = ptr[k];
+    }
+    return staged.val128;
+}
+
+
+#if defined(__AVX2__)
+
+// 256-bit wrapper around vectoredLoad128: performs the 128-bit load and passes
+// both the loaded value and its mask through set2x128 to form the 256-bit
+// results.
+UNUSED static really_inline
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                       const u8 *buf_history, size_t len_history,
+                       const u32 nMasks) {
+    m128 mask128;
+    const m128 val128 = vectoredLoad128(&mask128, ptr, lo, hi, buf_history,
+                                        len_history, nMasks);
+    const m256 widened = set2x128(val128);
+    *p_mask = set2x128(mask128);
+    return widened;
+}
+
+static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+};
+
+
+// Builds a 32-byte vector for the window starting at ptr, where [lo, hi) is
+// the readable buffer. *p_mask receives 0xff in exactly the lanes that were
+// copied from [lo, hi) and 0x00 elsewhere.
+//
+// Lanes that are not copied from the buffer are returned as zero. The staging
+// union is zero-initialised for this purpose, matching vectoredLoad128;
+// without it those lanes would carry indeterminate stack contents.
+UNUSED static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history) {
+    union {
+        u8 val8[32];
+        m256 val256;
+    } u = { { 0 } };
+
+    if (ptr >= lo) {
+        u32 avail = (u32)(hi - ptr);
+        if (avail >= 32) {
+            // a full vector is available: plain unaligned load, all lanes valid
+            *p_mask = load256((const void*)(p_mask_arr256[32] + 32));
+            return loadu256(ptr);
+        }
+        // partial tail: copy what is there, remaining lanes stay zero
+        *p_mask = load256((const void*)(p_mask_arr256[avail] + 32));
+        for (u32 i = 0; i < avail; i++) {
+            u.val8[i] = ptr[i];
+        }
+    } else {
+        // NOTE(review): unlike vectoredLoad128 there is no nMasks-based "need"
+        // computation here. i starts at 'start' (== lo - ptr), so the history
+        // loop below copies nothing as written; the lanes before lo stay zero
+        // and get a 0x00 mask byte. Presumably this variant is only used
+        // where no history bytes are required -- confirm against callers.
+        u32 start = (u32)(lo - ptr);
+        u32 i;
+        for (i = start; ptr + i < lo; i++) {
+            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        }
+        u32 end = MIN(32, (u32)(hi - ptr));
+        *p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start));
+        for (; i < end; i++) {
+            u.val8[i] = ptr[i];
+        }
+    }
+
+    return u.val256;
+}
+
+
+#endif // __AVX2__
+
+#define P0(cnd) unlikely(cnd)
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+
+#include "fdr_loadval.h"
+#include "util/bitutils.h"
+#include "teddy_internal.h"
+
+#include "teddy_autogen.c"
--- a/src/fdr/teddy_autogen.py
+++ b/src/fdr/teddy_autogen.py
@@ -0,0 +1,545 @@
+#!/usr/bin/python
+
+# Copyright (c) 2015, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+from autogen_utils import *
+from base_autogen import *
+from string import Template
+
+class MT(MatcherBase):
+    def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
+        if self.packed:
+            print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False)
+        else:
+            if self.num_masks == 1:
+                conf_func = "confWithBit1"
+            else:
+                conf_func = "confWithBitMany"
+
+            if cautious:
+                caution_string = "VECTORING"
+            else:
+                caution_string = "NOT_CAUTIOUS"
+
+            print "            if (P0(!!%s)) {" % var_name
+            print "                do  {"
+            if bits == 64:
+                print "                    bit = findAndClearLSB_64(&%s);" % (var_name)
+            else:
+                print "                    bit = findAndClearLSB_32(&%s);" % (var_name)
+            print "                    byte  = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
+            print "                    idx  = bit %% %d;" % self.num_buckets
+            print "                    cf = confBase[idx];"
+            print "                    fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
+            print "                    if (!(fdrc->groups & *control))"
+            print "                        continue;"
+            print "                    %s(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % (conf_func, caution_string)
+            print "                } while(P0(!!%s));" % var_name
+            print "                if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
+            print "                    *a->groups = controlVal;"
+            print "                    return HWLM_TERMINATED;"
+            print "                }"
+            print "            }"
+
+    def produce_needed_temporaries(self, max_iterations):
+        print "        m128 p_mask;"
+        for iter in range(0, max_iterations):
+            print "        m128 val_%d;" % iter
+            print "        m128 val_%d_lo;" % iter
+            print "        m128 val_%d_hi;" % iter
+            for x in range(self.num_masks):
+                print "        m128 res_%d_%d;" % (iter, x)
+                if x != 0:
+                    print "        m128 res_shifted_%d_%d;" % (iter, x)
+            print "        m128 r_%d;" % iter
+            print "#ifdef ARCH_64_BIT"
+            print "            u64a r_%d_lopart;" % iter
+            print "            u64a r_%d_hipart;" % iter
+            print "#else"
+            print "            u32 r_%d_part1;" % iter
+            print "            u32 r_%d_part2;" % iter
+            print "            u32 r_%d_part3;" % iter
+            print "            u32 r_%d_part4;" % iter
+            print "#endif"
+
+    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
+                                         cautious, save_old):
+        if cautious:
+            print "        val_%d = vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
+        else:
+            print "        val_%d = load128(ptr + %d);" % (iter, iter*16)
+        print "        val_%d_lo = and128(val_%d, lomask);" % (iter, iter)
+        print "        val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter)
+        print "        val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter)
+        print
+        for x in range(self.num_masks):
+            print Template("""
+        res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2]  , val_${ITER}_lo),
+                                  pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
+            if x != 0:
+                if iter == 0:
+                    print "        res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x,   iter, x,         x,   x)
+                else:
+                    print "        res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x,    iter, x, iter-1, x,   x)
+            if x != 0 and iter == effective_num_iterations - 1 and save_old:
+                print "        res_old_%d = res_%d_%d;" % (x, iter, x)
+        print
+        if cautious:
+            print "        r_%d = and128(res_%d_0, p_mask);" % (iter, iter)
+        else:
+            print "        r_%d = res_%d_0;" % (iter, iter)
+        for x in range(1, self.num_masks):
+            print "        r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
+        print
+
+    def produce_one_iteration_confirm(self, iter, confirmCautious):
+        setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter),
+                    (8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ]
+
+        setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter),
+                    (4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter),
+                    (8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter),
+                    (12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ]
+
+        print "        if (P0(isnonzero128(r_%d))) {" % (iter)
+        print "#ifdef ARCH_64_BIT"
+        for (off, val, init) in setup64:
+            print "            %s = %s;" % (val, init)
+        for (off, val, init) in setup64:
+            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
+        print "#else"
+        for (off, val, init) in setup32:
+            print "            %s = %s;" % (val, init)
+        for (off, val, init) in setup32:
+            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
+        print "#endif"
+        print "        }"
+
+    def produce_one_iteration(self, iter, effective_num_iterations, cautious = False,
+                              confirmCautious = True, save_old = True):
+        self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old)
+        self.produce_one_iteration_confirm(iter, confirmCautious)
+
+    def produce_code(self):
+        print self.produce_header(visible = True, header_only = False)
+        print self.produce_common_declarations()
+        print
+
+        self.produce_needed_temporaries(self.num_iterations)
+        print
+
+        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
+        print "    const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
+        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks
+        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
+        print "    const size_t iterBytes = %d;" % (self.num_iterations * 16)
+
+        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
+                                ' buf, len, a->start_offset);'
+        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
+                                ' mainStart);'
+
+        for x in range(self.num_masks):
+            if (x != 0):
+                print "    m128 res_old_%d = ones128();" % x
+        print "    m128 lomask = set16x8(0xf);"
+
+        print "    if (ptr < mainStart) {"
+        print "         ptr = mainStart - 16;"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        print "    if (ptr + 16 < buf + len) {"
+        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        print "    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
+        print "        __builtin_prefetch(ptr + (iterBytes*4));"
+        print self.produce_flood_check()
+
+        for iter in range(self.num_iterations):
+            self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)
+
+        print "    }"
+
+        print "    for (; ptr < buf + len; ptr += 16) {"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "    }"
+
+        print self.produce_footer()
+
+    def produce_compile_call(self):
+        packed_str = { False : "false", True : "true"}[self.packed]
+        print "        { %d, %s, %d, %d, %s, %d, %d }," % (
+            self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
+            self.conf_pull_back, self.conf_top_level_split)
+
+    def get_name(self):
+        if self.packed:
+            pck_string = "_pck"
+        else:
+            pck_string = ""
+
+        if self.num_buckets == 16:
+            type_string = "_fat"
+        else:
+            type_string = ""
+
+        return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string)
+
+    def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8):
+        self.arch = arch
+        self.packed = packed
+        self.num_masks = num_masks
+        self.num_buckets = num_buckets
+        self.num_iterations = 2
+
+        if packed:
+            self.conf_top_level_split = 32
+        else:
+            self.conf_top_level_split = 1
+        self.conf_pull_back = 0
+
+class MTFat(MT):
+    def produce_needed_temporaries(self, max_iterations):
+        print "        m256 p_mask;"
+        for iter in range(0, max_iterations):
+            print "        m256 val_%d;" % iter
+            print "        m256 val_%d_lo;" % iter
+            print "        m256 val_%d_hi;" % iter
+            for x in range(self.num_masks):
+                print "        m256 res_%d_%d;" % (iter, x)
+                if x != 0:
+                    print "        m256 res_shifted_%d_%d;" % (iter, x)
+            print "        m256 r_%d;" % iter
+            print "#ifdef ARCH_64_BIT"
+            print "            u64a r_%d_part1;" % iter
+            print "            u64a r_%d_part2;" % iter
+            print "            u64a r_%d_part3;" % iter
+            print "            u64a r_%d_part4;" % iter
+            print "#else"
+            print "            u32 r_%d_part1;" % iter
+            print "            u32 r_%d_part2;" % iter
+            print "            u32 r_%d_part3;" % iter
+            print "            u32 r_%d_part4;" % iter
+            print "            u32 r_%d_part5;" % iter
+            print "            u32 r_%d_part6;" % iter
+            print "            u32 r_%d_part7;" % iter
+            print "            u32 r_%d_part8;" % iter
+            print "#endif"
+
+    def produce_code(self):
+        print self.produce_header(visible = True, header_only = False)
+        print self.produce_common_declarations()
+        print
+
+        self.produce_needed_temporaries(self.num_iterations)
+        print
+
+        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
+        print "    const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));"
+        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks
+        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
+        print "    const size_t iterBytes = %d;" % (self.num_iterations * 16)
+
+        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
+                                ' buf, len, a->start_offset);'
+        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
+                                ' mainStart);'
+
+        for x in range(self.num_masks):
+            if (x != 0):
+                print "    m256 res_old_%d = ones256();" % x
+        print "    m256 lomask = set32x8(0xf);"
+
+        print "    if (ptr < mainStart) {"
+        print "         ptr = mainStart - 16;"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        print "    if (ptr + 16 < buf + len) {"
+        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
+        print "         ptr += 16;"
+        print "    }"
+
+        print "    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
+        print "        __builtin_prefetch(ptr + (iterBytes*4));"
+        print self.produce_flood_check()
+
+        for iter in range(self.num_iterations):
+            self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)
+
+        print "    }"
+
+        print "    for (; ptr < buf + len; ptr += 16) {"
+        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
+        print "    }"
+
+        print self.produce_footer()
+
+    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
+                                         cautious, save_old):
+        if cautious:
+            print "        val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
+        else:
+            print "        val_%d = load2x128(ptr + %d);" % (iter, iter*16)
+        print "        val_%d_lo = and256(val_%d, lomask);" % (iter, iter)
+        print "        val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter)
+        print "        val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter)
+        print
+        for x in range(self.num_masks):
+            print Template("""
+        res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2]  , val_${ITER}_lo),
+                                  vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
+            if x != 0:
+                if iter == 0:
+                    print "        res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x,   iter, x,         x,   x)
+                else:
+                    print "        res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x,    iter, x, iter-1, x,   x)
+            if x != 0 and iter == effective_num_iterations - 1 and save_old:
+                print "        res_old_%d = res_%d_%d;" % (x, iter, x)
+        print
+        if cautious:
+            print "        r_%d = and256(res_%d_0, p_mask);" % (iter, iter)
+        else:
+            print "        r_%d = res_%d_0;" % (iter, iter)
+        for x in range(1, self.num_masks):
+            print "        r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
+        print
+
+    def produce_one_iteration_confirm(self, iter, confirmCautious):
+        setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"),
+                    (4, "r_%d_part2" % iter, "extract64from256(r, 1);\n            r = interleave256hi(r_%d, r_swap)" % (iter)),
+                    (8, "r_%d_part3" % iter, "extractlow64from256(r)"),
+                    (12, "r_%d_part4" % iter, "extract64from256(r, 1)") ]
+
+        setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"),
+                    (2, "r_%d_part2" % iter, "extract32from256(r, 1)"),
+                    (4, "r_%d_part3" % iter, "extract32from256(r, 2)"),
+                    (6, "r_%d_part4" % iter, "extract32from256(r, 3);\n            r = interleave256hi(r_%d, r_swap)" % (iter)),
+                    (8, "r_%d_part5" % iter, "extractlow32from256(r)"),
+                    (10, "r_%d_part6" % iter, "extract32from256(r, 1)"),
+                    (12, "r_%d_part7" % iter, "extract32from256(r, 2)"),
+                    (14, "r_%d_part8" % iter, "extract32from256(r, 3)") ]
+
+        print "        if (P0(isnonzero256(r_%d))) {" % (iter)
+        print "            m256 r_swap = swap128in256(r_%d);" % (iter)
+        print "            m256 r = interleave256lo(r_%d, r_swap);" % (iter)
+        print "#ifdef ARCH_64_BIT"
+        for (off, val, init) in setup64:
+            print "            %s = %s;" % (val, init)
+
+        for (off, val, init) in setup64:
+            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
+        print "#else"
+        for (off, val, init) in setup32:
+            print "            %s = %s;" % (val, init)
+
+        for (off, val, init) in setup32:
+            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
+        print "#endif"
+        print "        }"
+
+class MTFast(MatcherBase):
+
+    def produce_confirm(self, cautious):
+        if cautious:
+            cautious_str = "VECTORING"
+        else:
+            cautious_str = "NOT_CAUTIOUS"
+
+        print "            for (u32 i = 0; i < arrCnt; i++) {"
+        print "                byte = bitArr[i] / 8;"
+        if self.packed:
+            conf_split_mask = IntegerType(32).constant_to_string(
+                                self.conf_top_level_split - 1)
+            print "                bitRem  = bitArr[i] % 8;"
+            print "                confSplit = *(ptr+byte) & 0x1f;"
+            print "                idx = confSplit * %d + bitRem;" % self.num_buckets
+            print "                cf = confBase[idx];"
+            print "                if (!cf)"
+            print "                    continue;"
+            print "                fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
+            print "                if (!(fdrc->groups & *control))"
+            print "                    continue;"
+            print "                confWithBit(fdrc, a, ptr - buf + byte, %s, 0, control, &last_match);" % cautious_str
+        else:
+            print "                cf = confBase[bitArr[i] % 8];"
+            print "                fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
+            print "                confWithBit1(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % cautious_str
+        print "                if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
+        print "                    *a->groups = controlVal;"
+        print "                    return HWLM_TERMINATED;"
+        print "                }"
+        print "            }"
+
+    def produce_needed_temporaries(self, max_iterations):
+        print "        u32 arrCnt;"
+        print "        u16 bitArr[512];"
+        print "        m256 p_mask;"
+        print "        m256 val_0;"
+        print "        m256 val_0_lo;"
+        print "        m256 val_0_hi;"
+        print "        m256 res_0;"
+        print "        m256 res_1;"
+        print "        m128 lo_part;"
+        print "        m128 hi_part;"
+        print "#ifdef ARCH_64_BIT"
+        print "        u64a r_0_part;"
+        print "#else"
+        print "        u32 r_0_part;"
+        print "#endif"
+
+    def produce_bit_scan(self, offset, bits):
+        print "                while (P0(!!r_0_part)) {"
+        if bits == 64:
+            print "                    bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset)
+        else:
+            print "                    bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset)
+        print "                }"
+
+    def produce_bit_check_128(self, var_name, offset):
+        print "            if (P0(isnonzero128(%s))) {" % (var_name)
+        print "#ifdef ARCH_64_BIT"
+        print "                r_0_part = movq(%s);" % (var_name)
+        self.produce_bit_scan(offset, 64)
+        print "                r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name)
+        self.produce_bit_scan(offset + 1, 64)
+        print "#else"
+        print "                r_0_part = movd(%s);" % (var_name)
+        self.produce_bit_scan(offset * 2, 32)
+        for step in range(1, 4):
+            print "                r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4)
+            self.produce_bit_scan(offset * 2 + step, 32)
+        print "#endif"
+        print "            }"
+
+    def produce_bit_check_256(self, iter, single_iter, cautious):
+        print "        if (P0(isnonzero256(res_%d))) {" % (iter)
+        if single_iter:
+            print "            arrCnt = 0;"
+        print "            lo_part = cast256to128(res_%d);" % (iter)
+        print "            hi_part = cast256to128(swap128in256(res_%d));" % (iter)
+        self.produce_bit_check_128("lo_part", iter * 4)
+        self.produce_bit_check_128("hi_part", iter * 4 + 2)
+        if single_iter:
+            self.produce_confirm(cautious)
+        print "        }"
+
+    def produce_one_iteration_state_calc(self, iter, cautious):
+        if cautious:
+            print "        val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32)
+        else:
+            print "        val_0 = load256(ptr + %d);" % (iter * 32)
+        print "        val_0_lo = and256(val_0, lomask);"
+        print "        val_0_hi = rshift4x64(val_0, 4);"
+        print "        val_0_hi = and256(val_0_hi, lomask);"
+        print "        res_%d = and256(vpshufb(maskLo  , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter)
+        if cautious:
+            print "        res_%d = and256(res_%d, p_mask);" % (iter, iter)
+
+    def produce_code(self):
+        print self.produce_header(visible = True, header_only = False)
+        print self.produce_common_declarations()
+        print
+
+        self.produce_needed_temporaries(self.num_iterations)
+
+        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
+        print "    const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
+        print "    const m256 maskLo = set2x128(maskBase[0]);"
+        print "    const m256 maskHi = set2x128(maskBase[1]);"
+        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);"
+        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 32);"
+        print "    const size_t iterBytes = %d;" % (self.num_iterations * 32)
+
+        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
+                                ' buf, len, a->start_offset);'
+        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
+                                ' mainStart);'
+        print "    const m256 lomask = set32x8(0xf);"
+
+        print "    if (ptr < mainStart) {"
+        print "        ptr = mainStart - 32;"
+        self.produce_one_iteration_state_calc(iter = 0, cautious = True)
+        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
+        print "        ptr += 32;"
+        print "    }"
+
+        print "    if (ptr + 32 < buf + len) {"
+        self.produce_one_iteration_state_calc(iter = 0, cautious = False)
+        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
+        print "        ptr += 32;"
+        print "    }"
+        print "    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
+        print "        __builtin_prefetch(ptr + (iterBytes*4));"
+        print self.produce_flood_check()
+        for iter in range (0, self.num_iterations):
+            self.produce_one_iteration_state_calc(iter = iter, cautious = False)
+        print "        arrCnt = 0;"
+        for iter in range (0, self.num_iterations):
+            self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False)
+        self.produce_confirm(cautious = False)
+        print "    }"
+
+        print "    for (; ptr < buf + len; ptr += 32) {"
+        self.produce_one_iteration_state_calc(iter = 0, cautious = True)
+        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
+        print "    }"
+
+        print self.produce_footer()
+
+    def get_name(self):
+        if self.packed:
+            pck_string = "_pck"
+        else:
+            pck_string = ""
+        return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, pck_string)
+
+    def produce_compile_call(self):
+        packed_str = { False : "false", True : "true"}[self.packed]
+        print "        { %d, %s, %d, %d, %s, %d, %d }," % (
+            self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
+            self.conf_pull_back, self.conf_top_level_split)
+
+    def __init__(self, arch, packed = False):
+        self.arch = arch
+        self.packed = packed
+        self.num_masks = 1
+        self.num_buckets = 8
+        self.num_iterations = 2
+
+        self.conf_top_level_split = 1
+        self.conf_pull_back = 0
+        if packed:
+            self.conf_top_level_split = 32
+        else:
+            self.conf_top_level_split = 1
+        self.conf_pull_back = 0
--- a/src/fdr/teddy_compile.cpp
+++ b/src/fdr/teddy_compile.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_confirm.h"
+#include "fdr_engine_description.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/compare.h"
+#include "util/popcount.h"
+#include "util/target_info.h"
+#include "util/verify_types.h"
+
+#include "teddy_compile.h"
+#include "teddy_internal.h"
+#include "teddy_engine_description.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <boost/core/noncopyable.hpp>
+
+using namespace std;
+
+namespace ue2 {
+
+namespace {
+
+//#define TEDDY_DEBUG
+
+/**
+ * Builds a Teddy table from a literal set for one engine description.
+ *
+ * Only references to the literal vector and the engine description are
+ * stored, so both must outlive the compiler object.
+ */
+class TeddyCompiler : boost::noncopyable {
+public:
+    TeddyCompiler(const vector<hwlmLiteral> &lits_in,
+                  const TeddyEngineDescription &eng_in, bool make_small_in)
+        : eng(eng_in), lits(lits_in), make_small(make_small_in) {}
+
+    // Construct the complete table; returns nullptr if it cannot be built.
+    aligned_unique_ptr<FDR> build(pair<u8 *, size_t> link);
+
+    // Assign literals to buckets; returns false if they do not fit.
+    bool pack(map<BucketIndex, std::vector<LiteralIndex> > &bucketToLits);
+
+private:
+    const TeddyEngineDescription &eng; // engine the table is built for
+    const vector<hwlmLiteral> &lits;   // literals to match
+    bool make_small;                   // forwarded to setupFullMultiConfs
+};
+
+/**
+ * A candidate bucket: a group of literal ids together with the nibbles the
+ * group accepts at each of the last `len` byte positions of its literals.
+ */
+class TeddySet {
+    const vector<hwlmLiteral> &lits;
+    u32 len;
+    // Two u16 bitfields per position, counted back from the end of the
+    // literal: entry 2*i has bit n set when low nibble n is accepted at
+    // position i, and entry 2*i+1 does the same for the high nibble. These
+    // describe what a shufti-style bucket would have to look like; with
+    // len = 4 there are 8 u16s.
+    vector<u16> nibbleSets;
+    set<u32> litIds;
+public:
+    TeddySet(const vector<hwlmLiteral> &lits_in, u32 len_in)
+        : lits(lits_in), len(len_in), nibbleSets(len_in * 2, 0) {}
+
+    const set<u32> & getLits() const { return litIds; }
+    size_t litCount() const { return litIds.size(); }
+
+    // Ordering for set<TeddySet>: only the literal id sets are compared.
+    bool operator<(const TeddySet & s) const {
+        return litIds < s.litIds;
+    }
+
+#ifdef TEDDY_DEBUG
+    void dump() const {
+        printf("TS: ");
+        for (const u16 nibbles : nibbleSets) {
+            printf("%04x ", (u32)nibbles);
+        }
+        printf("\nnlits: %zu\nLit ids: ", litCount());
+        printf("Prob: %llu\n", probability());
+        for (const u32 id : litIds) {
+            printf("%u ", id);
+        }
+        printf("\n");
+        printf("Flood prone : %s\n", isRunProne()?"yes":"no");
+    }
+#endif
+
+    // True when both sets accept exactly the same nibbles at every position.
+    bool identicalTail(const TeddySet & ts) const {
+        return nibbleSets == ts.nibbleSets;
+    }
+
+    // Record literal lit_id and overwrite the nibble sets with the ones
+    // derived from its trailing bytes.
+    void addLiteral(u32 lit_id) {
+        const hwlmLiteral &lit = lits[lit_id];
+        const string &s = lit.s;
+        const size_t slen = s.size();
+        for (u32 i = 0; i < len; i++) {
+            u16 &lo_set = nibbleSets[i*2];
+            u16 &hi_set = nibbleSets[i*2+1];
+            if (i >= slen) {
+                // the literal has no byte this far back: accept anything
+                lo_set = hi_set = 0xffff;
+                continue;
+            }
+            const u8 c = s[slen - i - 1];
+            const u8 c_hi = (c >> 4) & 0xf;
+            const u8 c_lo = c & 0xf;
+            lo_set = 1 << c_lo;
+            if (lit.nocase && ourisalpha(c)) {
+                // caseless letter: accept the high nibble with bit 1 both
+                // cleared and set
+                hi_set = (1 << (c_hi&0xd)) | (1 << (c_hi|0x2));
+            } else {
+                hi_set = 1 << c_hi;
+            }
+        }
+        litIds.insert(lit_id);
+    }
+
+    // Union the nibble sets and literal ids of ts into this set.
+    void merge(const TeddySet &ts) {
+        const size_t n = nibbleSets.size();
+        for (size_t i = 0; i < n; i++) {
+            nibbleSets[i] |= ts.nibbleSets[i];
+        }
+        litIds.insert(ts.litIds.begin(), ts.litIds.end());
+    }
+
+    // Unnormalised weight for how likely this set is to fire a first-stage
+    // accept on random data if given a bucket of its own: the product of
+    // the number of accepted nibbles over every nibble set.
+    u64a probability() const {
+        u64a product = 1;
+        for (const u16 nibbles : nibbleSets) {
+            product *= popcount32((u32)nibbles);
+        }
+        return product;
+    }
+
+    // Score: chance of a hit times (a small fixed cost plus the cost of a
+    // follow-up assumed to be linear in the number of literals).
+    u64a heuristic() const {
+        const u64a hit_weight = probability();
+        return hit_weight * (2+litCount());
+    }
+
+    // True when some low nibble and some high nibble are accepted at every
+    // position; otherwise there is no way to get through with a flood.
+    bool isRunProne() const {
+        u16 lo_common = 0xffff;
+        u16 hi_common = 0xffff;
+        for (u32 i = 0; i < len; i++) {
+            lo_common &= nibbleSets[i*2];
+            hi_common &= nibbleSets[i*2+1];
+        }
+        return lo_common && hi_common;
+    }
+};
+
+/**
+ * Greedily group the literals into at most eng.getNumBuckets() buckets.
+ *
+ * Starts with one TeddySet per literal and repeatedly merges a pair of sets
+ * until no merge candidate is left. On success, bucketToLits[n] receives the
+ * literal ids of the n-th remaining set (in set iteration order).
+ *
+ * \return false if more sets than buckets remain once merging stops.
+ */
+bool TeddyCompiler::pack(map<BucketIndex,
+                             std::vector<LiteralIndex> > &bucketToLits) {
+    set<TeddySet> sts;
+
+    // seed: every literal begins in a set of its own
+    for (u32 i = 0; i < lits.size(); i++) {
+        TeddySet ts(lits, eng.numMasks);
+        ts.addLiteral(i);
+        sts.insert(ts);
+    }
+
+    while (1) {
+#ifdef TEDDY_DEBUG
+        printf("Size %zu\n", sts.size());
+        for (set<TeddySet>::const_iterator i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) {
+            printf("\n"); i1->dump();
+        }
+        printf("\n===============================================\n");
+#endif
+
+        // m1/m2: the pair chosen for merging this round; best: the smallest
+        // score increase seen so far among non-improving merges
+        set<TeddySet>::iterator m1 = sts.end(), m2 = sts.end();
+        u64a best = 0xffffffffffffffffULL;
+
+        for (set<TeddySet>::iterator i1 = sts.begin(), e1 = sts.end(); i1 != e1; ++i1) {
+            set<TeddySet>::iterator i2 = i1;
+            ++i2;
+            const TeddySet &s1 = *i1;
+            for (set<TeddySet>::iterator e2 = sts.end(); i2 != e2; ++i2) {
+                const TeddySet &s2 = *i2;
+
+                // be more conservative if we don't absolutely need to
+                // keep packing
+                if ((sts.size() <= eng.getNumBuckets()) &&
+                    !s1.identicalTail(s2)) {
+                    continue;
+                }
+
+                // score the merged set against the two sets kept apart
+                TeddySet tmpSet(lits, eng.numMasks);
+                tmpSet.merge(s1);
+                tmpSet.merge(s2);
+                u64a newScore = tmpSet.heuristic();
+                u64a oldScore = s1.heuristic() + s2.heuristic();
+                if (newScore < oldScore) {
+                    // merging lowers the total score: take this pair
+                    // NOTE(review): this break only leaves the inner loop,
+                    // so a later i1 can still overwrite m1/m2 through the
+                    // "score < best" branch below - confirm this is intended
+                    m1 = i1;
+                    m2 = i2;
+                    break;
+                } else {
+                    u64a score = newScore - oldScore;
+                    // skip merges that produce a run-prone set unless both
+                    // inputs were already run-prone
+                    bool oldRunProne = s1.isRunProne() && s2.isRunProne();
+                    bool newRunProne = tmpSet.isRunProne();
+                    if (newRunProne && !oldRunProne) {
+                        continue;
+                    }
+                    if (score < best) {
+                        best = score;
+                        m1 = i1;
+                        m2 = i2;
+                    }
+                }
+            }
+        }
+        // if we didn't find a merge candidate, bail out
+        if ((m1 == sts.end()) || (m2 == sts.end())) {
+            break;
+        }
+
+        // do the merge
+        TeddySet nts(lits, eng.numMasks);
+        nts.merge(*m1);
+        nts.merge(*m2);
+#ifdef TEDDY_DEBUG
+        printf("Merging\n");
+        printf("m1 = \n");
+        m1->dump();
+        printf("m2 = \n");
+        m2->dump();
+        printf("nts = \n");
+        nts.dump();
+        printf("\n===============================================\n");
+#endif
+        // replace the two source sets with their union
+        sts.erase(m1);
+        sts.erase(m2);
+        sts.insert(nts);
+    }
+    u32 cnt = 0;
+
+    // merging stopped with more sets than buckets: packing failed
+    if (sts.size() > eng.getNumBuckets()) {
+        return false;
+    }
+
+    // each remaining set becomes one bucket, numbered from 0
+    for (set<TeddySet>::const_iterator i = sts.begin(), e = sts.end(); i != e;
+         ++i) {
+        for (set<u32>::const_iterator i2 = i->getLits().begin(),
+                                      e2 = i->getLits().end();
+             i2 != e2; ++i2) {
+            bucketToLits[cnt].push_back(*i2);
+        }
+        cnt++;
+    }
+    return true;
+}
+
+/**
+ * Build the Teddy table for the literal set.
+ *
+ * The allocation is laid out as: Teddy header, nibble masks (maskLen bytes),
+ * confirm structures, flood control data and, if supplied, a copy of the
+ * link blob.
+ *
+ * \param link (pointer, size) of a blob to append. On success link.first is
+ *             released with aligned_free(); on the nullptr return paths it
+ *             is left untouched.
+ * \return the table, or nullptr if there are more literals than
+ *         getNumBuckets() * TEDDY_BUCKET_LOAD or if pack() fails.
+ */
+aligned_unique_ptr<FDR> TeddyCompiler::build(pair<u8 *, size_t> link) {
+    if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
+        DEBUG_PRINTF("too many literals: %zu\n", lits.size());
+        return nullptr;
+    }
+
+#ifdef TEDDY_DEBUG
+    for (size_t i = 0; i < lits.size(); i++) {
+        printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
+               lits[i].nocase ? "caseless" : "caseful");
+        for (size_t j = 0; j < lits[i].s.size(); j++) {
+            printf("%02x", ((u32)lits[i].s[j])&0xff);
+        }
+        printf("\n");
+    }
+#endif
+
+    // choose a bucket for every literal: pack when the engine needs confirm
+    // for this set, otherwise give literal i bucket i
+    map<BucketIndex, std::vector<LiteralIndex> > bucketToLits;
+    if(eng.needConfirm(lits)) {
+        if (!pack(bucketToLits)) {
+            DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
+                         lits.size(), eng.getNumBuckets());
+            return nullptr;
+        }
+    } else {
+        for (u32 i = 0; i < lits.size(); i++) {
+            bucketToLits[i].push_back(i);
+        }
+    }
+    // bytes needed to hold one bit per bucket
+    u32 maskWidth = eng.getNumBuckets() / 8;
+
+    // per mask: a low-nibble and a high-nibble group, each made of maskWidth
+    // tables of 16 bytes
+    size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
+
+    pair<u8 *, size_t> floodControlTmp = setupFDRFloodControl(lits, eng);
+    pair<u8 *, size_t> confirmTmp
+        = setupFullMultiConfs(lits, eng, bucketToLits, make_small);
+
+    size_t size = ROUNDUP_N(sizeof(Teddy) +
+                             maskLen +
+                             confirmTmp.second +
+                             floodControlTmp.second +
+                             link.second, 16 * maskWidth);
+
+    aligned_unique_ptr<FDR> fdr = aligned_zmalloc_unique<FDR>(size);
+    assert(fdr); // otherwise would have thrown std::bad_alloc
+    Teddy *teddy = (Teddy *)fdr.get(); // ugly
+    u8 *teddy_base = (u8 *)teddy;
+
+    teddy->size = size;
+    teddy->engineID = eng.getID();
+    teddy->maxStringLen = verify_u32(maxLen(lits));
+
+    // confirm structures go straight after the masks
+    u8 *ptr = teddy_base + sizeof(Teddy) + maskLen;
+    memcpy(ptr, confirmTmp.first, confirmTmp.second);
+    ptr += confirmTmp.second;
+    aligned_free(confirmTmp.first);
+
+    // then the flood control data, located through floodOffset
+    teddy->floodOffset = verify_u32(ptr - teddy_base);
+    memcpy(ptr, floodControlTmp.first, floodControlTmp.second);
+    ptr += floodControlTmp.second;
+    aligned_free(floodControlTmp.first);
+
+    // finally the optional link blob; a link offset of 0 means "no link"
+    if (link.first) {
+        teddy->link = verify_u32(ptr - teddy_base);
+        memcpy(ptr, link.first, link.second);
+        aligned_free(link.first);
+    } else {
+        teddy->link = 0;
+    }
+
+    u8 *baseMsk = teddy_base + sizeof(Teddy);
+
+    for (map<BucketIndex, std::vector<LiteralIndex> >::const_iterator
+             i = bucketToLits.begin(),
+             e = bucketToLits.end();
+         i != e; ++i) {
+        const u32 bucket_id = i->first;
+        const vector<LiteralIndex> &ids = i->second;
+        // bit for this bucket within its byte; bucket_id / 8 selects the byte
+        const u8 bmsk = 1U << (bucket_id % 8);
+
+        for (vector<LiteralIndex>::const_iterator i2 = ids.begin(),
+                                                  e2 = ids.end();
+             i2 != e2; ++i2) {
+            LiteralIndex lit_id = *i2;
+            const hwlmLiteral & l = lits[lit_id];
+            DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
+            const u32 sz = verify_u32(l.s.size());
+
+            // fill in masks
+            // mask j describes the byte j positions back from the end of
+            // the literal
+            for (u32 j = 0; j < eng.numMasks; j++) {
+                u32 msk_id_lo = j * 2 * maskWidth + (bucket_id  / 8);
+                u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id  / 8);
+
+                // if we don't have a char at this position, fill in all 16
+                // locations in these masks with '1'
+                if (j >= sz) {
+                    for (u32 n = 0; n < 16; n++) {
+                        baseMsk[msk_id_lo * 16 + n] |= bmsk;
+                        baseMsk[msk_id_hi * 16 + n] |= bmsk;
+                    }
+                } else {
+                    u8 c = l.s[sz - 1 - j];
+                    // if we do have a char at this position
+                    const u32 hiShift = 4;
+                    u32 n_hi = (c >> hiShift) & 0xf;
+                    u32 n_lo = c & 0xf;
+
+                    // a non-zero msk byte at this position: accept every
+                    // nibble value that agrees with cmp under the mask
+                    if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
+                        u8 m = l.msk[l.msk.size() - 1 - j];
+                        u8 m_hi = (m >> hiShift) & 0xf;
+                        u8 m_lo = m & 0xf;
+                        u8 cmp = l.cmp[l.msk.size() - 1 - j];
+                        u8 cmp_lo = cmp & 0xf;
+                        u8 cmp_hi = (cmp >> hiShift) & 0xf;
+
+                        for (u8 cm = 0; cm < 0x10; cm++) {
+                            if ((cm & m_lo) == (cmp_lo & m_lo)) {
+                                baseMsk[msk_id_lo * 16 + cm] |= bmsk;
+                            }
+                            if ((cm & m_hi) == (cmp_hi & m_hi)) {
+                                baseMsk[msk_id_hi * 16 + cm] |= bmsk;
+                            }
+                        }
+                    } else{
+                        if (l.nocase && ourisalpha(c)) {
+                            // caseless letter: accept the high nibble with
+                            // the 0x20 bit of the byte both cleared and set
+                            u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
+                            u32 cmHalfSet   = (0x20 >> hiShift) & 0xf;
+                            baseMsk[msk_id_hi * 16 + (n_hi & cmHalfClear)] |= bmsk;
+                            baseMsk[msk_id_hi * 16 + (n_hi | cmHalfSet  )] |= bmsk;
+                        } else {
+                            baseMsk[msk_id_hi * 16 + n_hi] |= bmsk;
+                        }
+                        baseMsk[msk_id_lo * 16 + n_lo] |= bmsk;
+                    }
+                }
+            }
+        }
+    }
+
+
+#ifdef TEDDY_DEBUG
+    // NOTE(review): this dump walks numMasks * 2 tables of 16 bytes and does
+    // not take maskWidth into account, so with maskWidth > 1 it appears to
+    // show only part of the mask area - confirm
+    for (u32 i = 0; i < eng.numMasks * 2; i++) {
+        for (u32 j = 0; j < 16; j++) {
+            u8 val = baseMsk[i * 16 + j];
+            for (u32 k = 0; k < 8; k++) {
+                printf("%s", ((val >> k) & 0x1) ? "1" : "0");
+            }
+            printf(" ");
+        }
+        printf("\n");
+    }
+#endif
+
+    return fdr;
+}
+
+} // namespace
+
+/**
+ * Build a Teddy table for lits, either with the engine named by hint or,
+ * when hint is HINT_INVALID, with an engine chosen for the target.
+ * Returns nullptr if no engine description is available or the build fails.
+ */
+aligned_unique_ptr<FDR> teddyBuildTableHinted(const vector<hwlmLiteral> &lits,
+                                              bool make_small, u32 hint,
+                                              const target_t &target,
+                                              pair<u8 *, size_t> link) {
+    // no hint: score the available engines; otherwise look the hint up by ID
+    const unique_ptr<TeddyEngineDescription> des =
+        (hint == HINT_INVALID) ? chooseTeddyEngine(target, lits)
+                               : getTeddyDescription(hint);
+    if (des) {
+        TeddyCompiler tc(lits, *des, make_small);
+        return tc.build(link);
+    }
+    return nullptr;
+}
+
+} // namespace ue2
--- a/src/fdr/teddy_compile.h
+++ b/src/fdr/teddy_compile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief FDR literal matcher: Teddy build API.
+ */
+
+#ifndef TEDDY_COMPILE_H
+#define TEDDY_COMPILE_H
+
+#include "ue2common.h"
+#include "util/alloc.h"
+
+#include <vector>
+#include <utility> // std::pair
+
+struct FDR;
+struct target_t;
+
+namespace ue2 {
+
+struct hwlmLiteral;
+
+/**
+ * \brief Build a Teddy table for a set of literals.
+ *
+ * \param lits       literals to match.
+ * \param make_small passed through to the confirm-structure builder.
+ * \param hint       ID of the engine to use, or HINT_INVALID to have one
+ *                   chosen for \a target.
+ * \param target     target platform, used when choosing an engine.
+ * \param link       (pointer, size) of a blob to append to the table; on
+ *                   success the pointer is released with aligned_free().
+ * \return the table, or nullptr if no engine is available or the literals
+ *         cannot be built with it.
+ */
+ue2::aligned_unique_ptr<FDR>
+teddyBuildTableHinted(const std::vector<hwlmLiteral> &lits, bool make_small,
+                      u32 hint, const target_t &target,
+                      std::pair<u8 *, size_t> link);
+
+} // namespace ue2
+
+#endif // TEDDY_COMPILE_H
--- a/src/fdr/teddy_engine_description.cpp
+++ b/src/fdr/teddy_engine_description.cpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fdr.h"
+#include "fdr_internal.h"
+#include "fdr_compile_internal.h"
+#include "fdr_confirm.h"
+#include "ue2common.h"
+#include "hs_internal.h"
+#include "fdr_engine_description.h"
+#include "teddy_internal.h"
+#include "teddy_engine_description.h"
+#include "util/make_unique.h"
+
+#include <cmath>
+
+using namespace std;
+
+namespace ue2 {
+
+/**
+ * Construct a description from a definition record: the id, the target
+ * derived from the CPU feature mask, the bucket count and the two confirm
+ * settings are forwarded to EngineDescription; numMasks and packed are
+ * copied into this class.
+ */
+TeddyEngineDescription::TeddyEngineDescription(const TeddyEngineDef &def)
+    : EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
+                        def.numBuckets, def.confirmPullBackDistance,
+                        def.confirmTopLevelSplit),
+      numMasks(def.numMasks), packed(def.packed) {}
+
+/** The default flood suffix length for a Teddy engine is its mask count. */
+u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
+    return numMasks;
+}
+
+/**
+ * True if this engine needs confirm for the literal set: the engine is
+ * packed, there are more literals than buckets, or some literal is longer
+ * than the mask count or carries a mask.
+ */
+bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const {
+    bool confirm = packed || lits.size() > getNumBuckets();
+    // stop scanning as soon as one literal forces confirm
+    for (auto it = lits.begin(); !confirm && it != lits.end(); ++it) {
+        confirm = it->s.size() > numMasks || !it->msk.empty();
+    }
+    return confirm;
+}
+
+#include "teddy_autogen_compiler.cpp"
+
+/**
+ * Length of the longest run of one repeated byte found at the end of any
+ * literal in vl; 0 if vl is empty.
+ */
+static
+size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {
+    size_t longest = 0;
+    for (const auto &lit : vl) {
+        const string &s = lit.s;
+        assert(!s.empty());
+        const size_t n = s.length();
+        // walk back from the end while the byte equals the final byte
+        size_t run = 1;
+        while (run < n && s[n - run - 1] == s[n - 1]) {
+            run++;
+        }
+        longest = max(longest, run);
+    }
+    return longest;
+}
+
+/**
+ * \brief True if this Teddy engine is qualified to handle this set of literals
+ * on this target.
+ *
+ * The engine is rejected when it is not valid on the target, when it has
+ * fewer buckets than literals and is not packed, when the literal count
+ * exceeds TEDDY_BUCKET_LOAD per bucket, when it has more masks than the
+ * longest literal, or when the set has more than 40 literals and over a
+ * fifth of them are shorter than the mask count.
+ */
+static
+bool isAllowed(const vector<hwlmLiteral> &vl, const TeddyEngineDescription &eng,
+               const size_t max_lit_len, const target_t &target) {
+    if (!eng.isValidOnTarget(target)) {
+        DEBUG_PRINTF("%u disallowed: not valid on target\n", eng.getID());
+        return false;
+    }
+    if (!eng.packed && eng.getNumBuckets() < vl.size()) {
+        DEBUG_PRINTF("%u disallowed: num buckets < num lits and not packed\n",
+                     eng.getID());
+        return false;
+    }
+    if (vl.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
+        DEBUG_PRINTF("%u disallowed: too many lits for num buckets\n",
+                     eng.getID());
+        return false;
+    }
+    if (max_lit_len < eng.numMasks) {
+        DEBUG_PRINTF("%u disallowed: more masks than max lit len (%zu)\n",
+                     eng.getID(), max_lit_len);
+        return false;
+    }
+
+    // the short-literal check only applies to larger sets
+    if (vl.size() <= 40) {
+        return true;
+    }
+
+    u32 n_short = 0;
+    for (auto it = vl.begin(); it != vl.end(); ++it) {
+        if (it->s.length() < eng.numMasks) {
+            n_short++;
+        }
+    }
+    if (n_short * 5 > vl.size()) {
+        DEBUG_PRINTF("too many short literals (%u)\n", n_short);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Score every Teddy engine that is allowed for this literal set on this
+ * target and return a copy of the highest-scoring description (the first
+ * one wins ties), or nullptr if none is allowed.
+ */
+unique_ptr<TeddyEngineDescription>
+chooseTeddyEngine(const target_t &target, const vector<hwlmLiteral> &vl) {
+    vector<TeddyEngineDescription> descs;
+    getTeddyDescriptions(&descs);
+
+    const size_t max_lit_len = maxLen(vl);
+    const size_t max_flood_tail = maxFloodTailLen(vl);
+    DEBUG_PRINTF("%zu lits, max_lit_len=%zu, max_flood_tail=%zu\n", vl.size(),
+                 max_lit_len, max_flood_tail);
+
+    const TeddyEngineDescription *winner = nullptr;
+    u32 winner_score = 0;
+    for (const TeddyEngineDescription &eng : descs) {
+        if (!isAllowed(vl, eng, max_lit_len, target)) {
+            continue;
+        }
+
+        // We prefer unpacked Teddy models.
+        u32 score = eng.packed ? 0 : 100;
+
+        // If we're heavily loaded, we prefer to have more masks; lightly
+        // loaded cases are great.
+        const bool heavy = vl.size() > 4 * eng.getNumBuckets();
+        score += heavy ? eng.numMasks * 4 : 100;
+
+        // We want enough masks to avoid becoming flood-prone.
+        if (eng.numMasks > max_flood_tail) {
+            score += 50;
+        }
+
+        // We prefer having 3 masks. 3 is just right.
+        score += 6 / (abs(3 - (int)eng.numMasks) + 1);
+
+        // We prefer cheaper, smaller Teddy models.
+        score += 16 / eng.getNumBuckets();
+
+        DEBUG_PRINTF("teddy %u: masks=%u, buckets=%u, packed=%u "
+                     "-> score=%u\n",
+                     eng.getID(), eng.numMasks, eng.getNumBuckets(),
+                     eng.packed ? 1U : 0U, score);
+
+        const bool better = !winner || score > winner_score;
+        if (better) {
+            winner = &eng;
+            winner_score = score;
+        }
+    }
+
+    if (winner) {
+        DEBUG_PRINTF("using engine %u\n", winner->getID());
+        return ue2::make_unique<TeddyEngineDescription>(*winner);
+    }
+
+    DEBUG_PRINTF("failed to find engine\n");
+    return nullptr;
+}
+
+/**
+ * Return a copy of the Teddy engine description whose ID equals engineID,
+ * or nullptr if there is none.
+ */
+unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
+    vector<TeddyEngineDescription> all;
+    getTeddyDescriptions(&all);
+
+    for (auto it = all.cbegin(); it != all.cend(); ++it) {
+        if (it->getID() != engineID) {
+            continue;
+        }
+        return ue2::make_unique<TeddyEngineDescription>(*it);
+    }
+
+    return nullptr;
+}
+
+} // namespace ue2
--- a/src/fdr/teddy_engine_description.h
+++ b/src/fdr/teddy_engine_description.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TEDDY_ENGINE_DESCRIPTION_H
+#define TEDDY_ENGINE_DESCRIPTION_H
+
+#include "engine_description.h"
+#include "fdr_compile_internal.h"
+
+#include <memory>
+#include <vector>
+
+namespace ue2 {
+
+#define TEDDY_BUCKET_LOAD 6
+
+// Plain record describing one Teddy engine, consumed by the
+// TeddyEngineDescription constructor.
+// NOTE(review): the field order looks like it must match the brace
+// initializers emitted by the autogen scripts - confirm before reordering.
+struct TeddyEngineDef {
+    u32 id;                      // engine ID
+    u64a cpu_features;           // feature mask, mapped to a target
+    u32 numMasks;                // number of masks the engine uses
+    u32 numBuckets;              // number of buckets
+    bool packed;                 // whether the engine is a packed variant
+    u32 confirmPullBackDistance; // forwarded to EngineDescription
+    u32 confirmTopLevelSplit;    // forwarded to EngineDescription
+};
+
+// Engine description for a Teddy engine: the common EngineDescription data
+// plus the mask count and the packed flag.
+class TeddyEngineDescription : public EngineDescription {
+public:
+    u32 numMasks; // number of masks the engine uses
+    bool packed;  // whether the engine is a packed variant
+
+    explicit TeddyEngineDescription(const TeddyEngineDef &def);
+
+    // Returns numMasks.
+    u32 getDefaultFloodSuffixLength() const override;
+
+    // True if the engine is packed, there are more literals than buckets,
+    // or any literal is longer than numMasks or has a non-empty msk.
+    bool needConfirm(const std::vector<hwlmLiteral> &lits) const;
+};
+
+// Pick the best-scoring Teddy engine allowed for these literals on this
+// target; returns nullptr if no engine qualifies.
+std::unique_ptr<TeddyEngineDescription>
+chooseTeddyEngine(const target_t &target, const std::vector<hwlmLiteral> &vl);
+
+// Look up a Teddy engine by ID; returns nullptr if the ID is unknown.
+std::unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID);
+
+// Presumably fills *out with every known Teddy engine description; the
+// definition lives in generated code - TODO confirm.
+void getTeddyDescriptions(std::vector<TeddyEngineDescription> *out);
+
+} // namespace ue2
+
+#endif
--- a/src/fdr/teddy_internal.h
+++ b/src/fdr/teddy_internal.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TEDDY_INTERNAL_H
+#define TEDDY_INTERNAL_H
+
+#include "ue2common.h"
+
+// Header at the start of a Teddy table; the nibble masks follow it
+// immediately. The first part is compatible with an FDR.
+struct Teddy {
+    u32 engineID;     // ID of the engine the table was built for
+    u32 size;         // total size of the table in bytes
+    u32 maxStringLen; // length of the longest literal
+    u32 floodOffset;  // byte offset from this header to the flood control data
+    u32 link;         // byte offset to the appended link blob, or 0 if none
+    u32 pad1;         // padding
+    u32 pad2;         // padding
+    u32 pad3;         // padding
+};
+
+#endif