From ed772380c05996cd71a77e18d6bec2081d49f0e0 Mon Sep 17 00:00:00 2001
From: Mohammad Abdul Awal <mohammad.abdul.awal@intel.com>
Date: Thu, 31 Mar 2016 11:28:42 +0100
Subject: [PATCH] teddy: remove python codegen, refactor code

Major cleanup of the Teddy runtime code. Removes Python code generation,
splits AVX2 models into their own file, improves readability.
---
 .gitignore                           |    4 -
 CMakeLists.txt                       |   14 +-
 src/fdr/CMakeLists.txt               |   33 -
 src/fdr/arch.py                      |   58 --
 src/fdr/autogen.py                   |  118 ---
 src/fdr/autogen_utils.py             |  120 ---
 src/fdr/engine_description.h         |    3 +-
 src/fdr/fdr.c                        |   30 +-
 src/fdr/fdr_engine_description.cpp   |    2 +-
 src/fdr/teddy.c                      |  744 +++++++++++++----
 src/fdr/teddy.h                      |  108 +++
 src/fdr/teddy_autogen.py             |  773 ------------------
 src/fdr/teddy_avx2.c                 | 1110 ++++++++++++++++++++++++++
 src/fdr/teddy_engine_description.cpp |   27 +-
 src/fdr/teddy_runtime_common.h       |  256 ++++++
 15 files changed, 2114 insertions(+), 1286 deletions(-)
 delete mode 100644 src/fdr/CMakeLists.txt
 delete mode 100755 src/fdr/arch.py
 delete mode 100755 src/fdr/autogen.py
 delete mode 100755 src/fdr/autogen_utils.py
 create mode 100644 src/fdr/teddy.h
 delete mode 100755 src/fdr/teddy_autogen.py
 create mode 100644 src/fdr/teddy_avx2.c
 create mode 100644 src/fdr/teddy_runtime_common.h

diff --git a/.gitignore b/.gitignore
index 6e50ce45..4d984534 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,10 +46,6 @@ sqlite3
 src/config.h
 src/config.h.in
 src/hs_version.h
-src/fdr/fdr_autogen.c
-src/fdr/fdr_autogen_compiler.cpp
-src/fdr/teddy_autogen.c
-src/fdr/teddy_autogen_compiler.cpp
 src/parser/Parser.cpp
 
 # Generated PCRE files
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad7bb3f9..2bc68474 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -357,11 +357,6 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
 
 
-# include the autogen targets
-add_subdirectory(src/fdr)
-
-include_directories(${PROJECT_BINARY_DIR}/src/fdr)
-
 if(NOT WIN32)
 set(RAGEL_C_FLAGS "-Wno-unused")
 endif()
@@ -381,8 +376,6 @@ SET(hs_HEADERS
 )
 install(FILES ${hs_HEADERS} DESTINATION include/hs)
 
-set(fdr_autogen_targets autogen_runtime autogen_teddy_runtime)
-
 set (hs_exec_SRCS
     ${hs_HEADERS}
     src/hs_version.h
@@ -400,7 +393,10 @@ set (hs_exec_SRCS
     src/fdr/flood_runtime.h
     src/fdr/fdr_loadval.h
     src/fdr/teddy.c
+    src/fdr/teddy_avx2.c
+    src/fdr/teddy.h
     src/fdr/teddy_internal.h
+    src/fdr/teddy_runtime_common.h
     src/hwlm/hwlm.c
     src/hwlm/hwlm.h
     src/hwlm/hwlm_internal.h
@@ -929,11 +925,9 @@ set (LIB_VERSION ${HS_VERSION})
 set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION})
 
 add_library(hs_exec OBJECT ${hs_exec_SRCS})
-add_dependencies(hs_exec ${fdr_autogen_targets})
 
 if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
 add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
-add_dependencies(hs_exec_shared ${fdr_autogen_targets})
 set_target_properties(hs_exec_shared PROPERTIES
     POSITION_INDEPENDENT_CODE TRUE)
 endif()
@@ -964,7 +958,6 @@ endif()
 add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>)
 
 add_dependencies(hs ragel_Parser)
-add_dependencies(hs autogen_teddy_compiler)
 
 if (NOT BUILD_SHARED_LIBS)
 install(TARGETS hs DESTINATION lib)
@@ -973,7 +966,6 @@ endif()
 if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
     add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>)
     add_dependencies(hs_shared ragel_Parser)
-    add_dependencies(hs_shared autogen_compiler autogen_teddy_compiler)
     set_target_properties(hs_shared PROPERTIES
         OUTPUT_NAME hs
         VERSION ${LIB_VERSION}
diff --git a/src/fdr/CMakeLists.txt b/src/fdr/CMakeLists.txt
deleted file mode 100644
index 7bbf82ff..00000000
--- a/src/fdr/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-# The set of rules and other nastiness for generating FDR/Teddy source
-
-# we need to add these as explicit dependencies
-set(AUTOGEN_PY_FILES
-    arch.py
-    autogen.py
-    autogen_utils.py
-    teddy_autogen.py
-)
-
-function(fdr_autogen type out)
-    add_custom_command (
-        COMMENT "AUTOGEN ${out}"
-        OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${out}"
-        COMMAND ${PYTHON} "${CMAKE_CURRENT_SOURCE_DIR}/autogen.py" ${type} > "${CMAKE_CURRENT_BINARY_DIR}/${out}"
-        DEPENDS ${AUTOGEN_PY_FILES}
-        )
-    add_custom_target(autogen_${type} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${out}")
-endfunction(fdr_autogen)
-
-#now build the functions
-fdr_autogen(runtime fdr_autogen.c)
-fdr_autogen(teddy_runtime teddy_autogen.c)
-fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
-
-set(fdr_GENERATED_SRC
-    ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
-    ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
-    ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
-    PARENT_SCOPE)
-
-set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
diff --git a/src/fdr/arch.py b/src/fdr/arch.py
deleted file mode 100755
index 83a31254..00000000
--- a/src/fdr/arch.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/usr/bin/python
-
-# Copyright (c) 2015, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import autogen_utils
-
-# wrapper for architectures
-
-class Arch:
-    def __init__(self, name, extensions = []):
-        self.name = name
-        self.extensions = extensions
-        self.target = None
-
-    def get_guard(self):
-        # these defines definitely fall into the "belt-and-suspenders"
-        # category of paranoia
-        if (self.guard_list == []):
-            return "#if 1"
-
-        return "#if " + " && ".join(self.guard_list)
-
-class X86Arch(Arch):
-    def __init__(self, name, extensions = []):
-        Arch.__init__(self, name, extensions)
-        self.guard_list = [ ]
-        self.target = "0"
-
-        if "AVX2" in extensions:
-            self.target += " | HS_CPU_FEATURES_AVX2"
-            self.guard_list += [ "defined(__AVX2__)" ]
-
-
-arch_x86_64            = X86Arch("x86_64", extensions = [ ])
-arch_x86_64_avx2       = X86Arch("x86_64_avx2", extensions = [ "AVX2" ])
diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py
deleted file mode 100755
index a8510487..00000000
--- a/src/fdr/autogen.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/python
-
-# Copyright (c) 2015-2016, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import sys
-from autogen_utils import *
-from teddy_autogen import *
-from arch import *
-
-# teddy setup
-
-def build_teddy_matchers():
-    all_matchers = [ ]
-
-    # AVX2
-    all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ]
-    all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ]
-    for n_msk in range(1, 5):
-        all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ]
-        all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ]
-
-    # SSE/SSE2/SSSE3
-    for n_msk in range(1, 5):
-        all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ]
-        all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ]
-
-    return all_matchers
-
-def produce_teddy_compiles(l):
-    print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {"
-    print "    static const TeddyEngineDef defns[] = {"
-    for m in l:
-        m.produce_compile_call()
-    print "    };"
-    print "    out->clear();"
-    print "    for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
-    print "        out->push_back(TeddyEngineDescription(defns[i]));"
-    print "    }"
-    print "}"
-
-# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they
-# are linked. So we either generate the function or we don't - then at the point of the
-# header in fdr_autogen.c we either generate the header or we #define the zero.
-
-def produce_teddy_runtimes(l):
-    # Since we're using -Wmissing-prototypes, we need headers first.
-    for m in l:
-	m.produce_guard()
-        print m.produce_header(visible = True, header_only = True)
-	m.close_guard()
-
-    for m in l:
-	m.produce_guard()
-        m.produce_code()
-	m.close_guard()
-
-# see produce_teddy_runtimes() comment for the rationale
-
-def produce_teddy_headers(l):
-    for m in l:
-	m.produce_guard()
-        print m.produce_header(visible = True, header_only = True)
-	m.produce_zero_alternative()
-
-# general utilities
-
-def make_fdr_function_pointers(matcher_list):
-    print  """
-typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
-static FDRFUNCTYPE funcs[] = {
-"""
-    all_funcs = "    fdr_engine_exec,\n"
-    all_funcs += ",\n".join([ "    %s" % m.get_name() for m in matcher_list ])
-    print all_funcs
-    print """
-};
-"""
-
-def assign_ids(matcher_list, next_id):
-    for m in matcher_list:
-        m.id = next_id
-        next_id += 1
-    return next_id
-
-# Main entry point
-
-tm = build_teddy_matchers()
-next_id = assign_ids(tm, 1)
-if sys.argv[1] == "runtime":
-    produce_teddy_headers(tm)
-    make_fdr_function_pointers(tm)
-elif sys.argv[1] == "teddy_runtime":
-    produce_teddy_runtimes(tm)
-elif sys.argv[1] == "teddy_compiler":
-    produce_teddy_compiles(tm)
diff --git a/src/fdr/autogen_utils.py b/src/fdr/autogen_utils.py
deleted file mode 100755
index 3544bc7b..00000000
--- a/src/fdr/autogen_utils.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/usr/bin/python
-
-# Copyright (c) 2015-2016, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import sys
-
-def fail_out(msg = ""):
-    print >>sys.stderr, "Internal failure in autogen.py: " + msg
-    sys.exit(1)
-
-class IntegerType:
-    def __init__(self, size):
-        self.size = size
-
-    def get_name(self):
-        return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size]
-
-    def size_in_bytes(self):
-        return self.size / 8
-
-    def zero_expression(self):
-        return "0"
-
-    def constant_to_string(self, n):
-        if self.size == 64:
-            suffix = "ULL"
-        else:
-            suffix = ""
-        return "0x%x%s" % (n & ((1 << self.size) - 1), suffix)
-
-    def lowbits(self, n):
-        return (1 << n) - 1
-
-    def highbits(self, n):
-        return ~(self.lowbits(self.size - n))
-
-    def lowbit_mask(self, n):
-        return self.constant_to_string(self.lowbits(n))
-
-    def lowbit_extract_expr(self, expr_string, n):
-         return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
-
-    def flip_lowbits_expr(self, expr_string, n):
-         return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
-
-    def bit_extract_expr(self, expr_string, low, high):
-        lbm = self.lowbit_mask(high - low)
-        return "((%s >> %d) & %s)" % (expr_string, low, lbm)
-
-    # shifts are +ve if left and -ve if right
-    def shift_expr(self, expr_string, n):
-        if n <= -self.size or n >= self.size:
-            return self.zero_expression()
-        elif (n > 0):
-            return "(%s << %d)" % (expr_string, n)
-        elif (n < 0):
-            return "(%s >> %d)" % (expr_string, -n)
-        else:
-            return "(%s)" % (expr_string)
-
-class SIMDIntegerType(IntegerType):
-    def __init__(self, size):
-        IntegerType.__init__(self, size)
-
-    def zero_expression(self):
-        return "zeroes128()"
-
-    def lowbit_extract_expr(self, expr_string, n):
-        if (n <= 32):
-            tmpType = IntegerType(32)
-            tmpExpr = "movd(%s)" % expr_string
-        elif (32 < n <= 64):
-            tmpType = IntegerType(64)
-            tmpExpr = "movq(%s)" % expr_string
-        return tmpType.lowbit_extract_expr(tmpExpr, n)
-
-    def bit_extract_expr(self, expr_string, low, high, flip):
-        fail_out("Unimplemented bit extract on m128")
-
-    def shift_expr(self, expr_string, n):
-        if n % 8 != 0:
-            fail_out("Trying to shift a m128 by a bit granular value")
-
-        # should check that n is divisible by 8
-        if n <= -self.size or n >= self.size:
-            return self.zero_expression()
-        elif (n > 0):
-            return "byteShiftLeft128(%s, %s)" % (expr_string, n / 8)
-        elif (n < 0):
-            return "byteShiftRight128(%s, %s)" % (expr_string, -n / 8)
-        else:
-            return "(%s)" % (expr_string)
-
-    def lowbit_mask(self, n):
-        if n % 8 != 0:
-            fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
-        return self.shift_expr("ones128()", -(128 - n))
diff --git a/src/fdr/engine_description.h b/src/fdr/engine_description.h
index 3c3026c3..09b16179 100644
--- a/src/fdr/engine_description.h
+++ b/src/fdr/engine_description.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -55,6 +55,7 @@ public:
     u32 getNumBuckets() const { return numBuckets; }
     u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
     u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
+    void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; }
 
     bool isValidOnTarget(const target_t &target_in) const;
     virtual u32 getDefaultFloodSuffixLength() const = 0;
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index bd7dbe83..51a041cc 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -33,6 +33,7 @@
 #include "fdr_loadval.h"
 #include "fdr_streaming_runtime.h"
 #include "flood_runtime.h"
+#include "teddy.h"
 #include "teddy_internal.h"
 #include "util/simd_utils.h"
 #include "util/simd_utils_ssse3.h"
@@ -764,7 +765,34 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
     return HWLM_SUCCESS;
 }
 
-#include "fdr_autogen.c"
+#if defined(__AVX2__)
+#define ONLY_AVX2(func) func
+#else
+#define ONLY_AVX2(func) NULL
+#endif
+
+typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
+static const FDRFUNCTYPE funcs[] = {
+    fdr_engine_exec,
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat),
+    ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat),
+    fdr_exec_teddy_msks1,
+    fdr_exec_teddy_msks1_pck,
+    fdr_exec_teddy_msks2,
+    fdr_exec_teddy_msks2_pck,
+    fdr_exec_teddy_msks3,
+    fdr_exec_teddy_msks3_pck,
+    fdr_exec_teddy_msks4,
+    fdr_exec_teddy_msks4_pck,
+};
 
 #define FAKE_HISTORY_SIZE 16
 static const u8 fake_history[FAKE_HISTORY_SIZE];
diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp
index 103bc214..5e923b08 100644
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@@ -57,7 +57,7 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
 void getFdrDescriptions(vector<FDREngineDescription> *out) {
     static const FDREngineDef def = {0, 128, 8, 0, 1, 256};
     out->clear();
-    out->push_back(FDREngineDescription(def));
+    out->emplace_back(def);
 }
 
 static
diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c
index 11df9d69..08b761c0 100644
--- a/src/fdr/teddy.c
+++ b/src/fdr/teddy.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -26,11 +26,19 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "config.h"
+/** \file
+ * \brief Teddy literal matcher: SSSE3 engine runtime.
+ */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
 #include "util/simd_utils.h"
 #include "util/simd_utils_ssse3.h"
 
-static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
+const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
     {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
     {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -67,178 +75,584 @@ static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
 };
 
-// Note: p_mask is an output param that initialises a poison mask.
-UNUSED static really_inline
-m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
-                     const u8 *buf_history, size_t len_history,
-                     const u32 nMasks) {
-    union {
-        u8 val8[16];
-        m128 val128;
-    } u;
-    u.val128 = zeroes128();
+#ifdef ARCH_64_BIT /* confirm a non-zero 128-bit result as two 64-bit halves */
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
+do {                                                                        \
+    if (unlikely(isnonzero128(var))) {                                      \
+        u64a lo = movq(var);                                                \
+        u64a hi = movq(byteShiftRight128(var, 8));                          \
+        if (unlikely(lo)) {                                                 \
+            conf_fn(&lo, bucket, offset, confBase, reason, a, ptr,          \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(hi)) {                                                 \
+            conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr,      \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while (0) /* no ';' here: each CONFIRM_TEDDY(...) call supplies its own */
+#else /* no ARCH_64_BIT: confirm the result as four 32-bit parts instead */
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
+do {                                                                        \
+    if (unlikely(isnonzero128(var))) {                                      \
+        u32 part1 = movd(var);                                              \
+        u32 part2 = movd(byteShiftRight128(var, 4));                        \
+        u32 part3 = movd(byteShiftRight128(var, 8));                        \
+        u32 part4 = movd(byteShiftRight128(var, 12));                       \
+        if (unlikely(part1)) {                                              \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part2)) {                                              \
+            conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part3)) {                                              \
+            conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part4)) {                                              \
+            conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr,  \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while (0) /* no ';' here: each CONFIRM_TEDDY(...) call supplies its own */
+#endif /* ARCH_64_BIT */
 
-    if (ptr >= lo) {
-        u32 avail = (u32)(hi - ptr);
-        if (avail >= 16) {
-            *p_mask = load128((const void*)(p_mask_arr[16] + 16));
-            return loadu128(ptr);
-        }
-        *p_mask = load128((const void*)(p_mask_arr[avail] + 16));
-        for (u32 i = 0; i < avail; i++) {
-            u.val8[i] = ptr[i];
-        }
-    } else {
-        u32 need = MIN((u32)(lo - ptr), MIN(len_history, nMasks - 1));
-        u32 start = (u32)(lo - ptr);
-        u32 i;
-        for (i = start - need; ptr + i < lo; i++) {
-            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
-        }
-        u32 end = MIN(16, (u32)(hi - ptr));
-        *p_mask = loadu128((const void*)(p_mask_arr[end - start] + 16 - start));
-        for (; i < end; i++) {
-            u.val8[i] = ptr[i];
-        }
+static really_inline
+m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
+    m128 mask = set16x8(0xf);
+    m128 lo = and128(val, mask);
+    m128 hi = and128(rshift2x64(val, 4), mask);
+    return and128(and128(pshufb(maskBase[0*2], lo),
+                         pshufb(maskBase[0*2+1], hi)), p_mask);
+}
+
+static really_inline
+m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask,
+                        m128 val) {
+    m128 mask = set16x8(0xf);
+    m128 lo = and128(val, mask);
+    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 r = prep_conf_teddy_m1(maskBase, p_mask, val);
+    // r is already ANDed with p_mask by prep_conf_teddy_m1; don't re-apply it.
+    m128 res_1 = and128(pshufb(maskBase[1*2], lo),
+                        pshufb(maskBase[1*2+1], hi));
+    m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
+    *old_1 = res_1; // saved so the next call can feed it back into palignr
+    return and128(r, res_shifted_1);
+}
+
+static really_inline
+m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
+                        m128 p_mask, m128 val) {
+    m128 mask = set16x8(0xf);
+    m128 lo = and128(val, mask);
+    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val);
+
+    m128 res_2 = and128(pshufb(maskBase[2*2], lo),
+                        pshufb(maskBase[2*2+1], hi));
+    m128 res_shifted_2 = palignr(res_2, *old_2, 16-2);
+    *old_2 = res_2;
+    return and128(r, res_shifted_2);
+}
+
+static really_inline
+m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
+                        m128 *old_3, m128 p_mask, m128 val) {
+    m128 mask = set16x8(0xf);
+    m128 lo = and128(val, mask);
+    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val);
+
+    m128 res_3 = and128(pshufb(maskBase[3*2], lo),
+                        pshufb(maskBase[3*2+1], hi));
+    m128 res_shifted_3 = palignr(res_3, *old_3, 16-3);
+    *old_3 = res_3;
+    return and128(r, res_shifted_3);
+}
+
+hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 1);
+
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 1);
+        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
+        ptr += 16;
     }
 
-    return u.val128;
-}
-
-
-#if defined(__AVX2__)
-
-UNUSED static really_inline
-m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
-                     const u8 *buf_history, size_t len_history,
-                     const u32 nMasks) {
-    m128 p_mask128;
-    m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, len_history, nMasks));
-    *p_mask = set2x128(p_mask128);
-    return ret;
-}
-
-static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
-};
-
-
-UNUSED static really_inline
-m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
-                     const u8 *buf_history, size_t len_history) {
-    union {
-        u8 val8[32];
-        m256 val256;
-    } u;
-
-    if (ptr >= lo) {
-        u32 avail = (u32)(hi - ptr);
-        if (avail >= 32) {
-            *p_mask = load256((const void*)(p_mask_arr256[32] + 32));
-            return loadu256(ptr);
-        }
-        *p_mask = load256((const void*)(p_mask_arr256[avail] + 32));
-        for (u32 i = 0; i < avail; i++) {
-            u.val8[i] = ptr[i];
-        }
-    } else {
-        // need contains "how many chars to pull from history"
-        // calculate based on what we need, what we have in the buffer
-        // and only what we need to make primary confirm work
-        u32 start = (u32)(lo - ptr);
-        u32 i;
-        for (i = start; ptr + i < lo; i++) {
-            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
-        }
-        u32 end = MIN(32, (u32)(hi - ptr));
-        *p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start));
-        for (; i < end; i++) {
-            u.val8[i] = ptr[i];
-        }
+    if (ptr + 16 < buf_end) {
+        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
+        ptr += 16;
     }
 
-    return u.val256;
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
+        m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) {
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 1);
+        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
+    }
+    *a->groups = controlVal;
+    return HWLM_SUCCESS;
 }
 
+hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len; // one past the last buffer byte
+    const u8 *ptr = a->buf + a->start_offset; // current scan position
+    hwlmcb_rv_t controlVal = *a->groups; // local copy; written back below
+    hwlmcb_rv_t *control = &controlVal;             // NOTE(review): these 4
+    u32 floodBackoff = FLOOD_BACKOFF_START;         // locals are not named
+    const u8 *tryFloodDetect = a->firstFloodDetect; // below; presumably used
+    u32 last_match = (u32)-1;                       // by the macros -- confirm
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32; // main-loop stride: 2 x 16 bytes
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
 
-#endif // __AVX2__
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 1); // likewise not named below
 
-#define P0(cnd) unlikely(cnd)
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably next 16B boundary
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) { // head block, loaded with a p_mask
+        ptr = mainStart - 16; // so this block ends at mainStart
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 1);
+        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
 
-#include "fdr.h"
-#include "fdr_internal.h"
-#include "flood_runtime.h"
+    if (ptr + 16 < buf_end) { // strict: block ending at buf_end -> tail loop
+        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
 
-#include "fdr_confirm.h"
-#include "fdr_confirm_runtime.h"
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4)); // 128 bytes ahead
+        CHECK_FLOOD; // macro defined outside this view
+        m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
 
-#include "fdr_loadval.h"
-#include "util/bitutils.h"
-#include "teddy_internal.h"
+    for (; ptr < buf_end; ptr += 16) { // tail: 16B steps, p_mask loads
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 1);
+        m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; // hand the local copy back
+    return HWLM_SUCCESS;
+}
 
-#include "teddy_autogen.c"
+hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len; // one past the last buffer byte
+    const u8 *ptr = a->buf + a->start_offset; // current scan position
+    hwlmcb_rv_t controlVal = *a->groups; // local copy; written back below
+    hwlmcb_rv_t *control = &controlVal;             // NOTE(review): these 4
+    u32 floodBackoff = FLOOD_BACKOFF_START;         // locals are not named
+    const u8 *tryFloodDetect = a->firstFloodDetect; // below; presumably used
+    u32 last_match = (u32)-1;                       // by the macros -- confirm
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32; // main-loop stride: 2 x 16 bytes
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 2); // likewise not named below
+
+    m128 res_old_1 = ones128(); // passed by address to every block
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably next 16B boundary
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) { // head block, loaded with a p_mask
+        ptr = mainStart - 16; // so this block ends at mainStart
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 2);
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+
+    if (ptr + 16 < buf_end) { // strict: block ending at buf_end -> tail loop
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+                                      load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4)); // 128 bytes ahead
+        CHECK_FLOOD; // macro defined outside this view
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+                                      load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+        m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+                                      load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) { // tail: 16B steps, p_mask loads
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 2);
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+    }
+    *a->groups = controlVal; // hand the local copy back
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len; // one past the last buffer byte
+    const u8 *ptr = a->buf + a->start_offset; // current scan position
+    hwlmcb_rv_t controlVal = *a->groups; // local copy; written back below
+    hwlmcb_rv_t *control = &controlVal;             // NOTE(review): these 4
+    u32 floodBackoff = FLOOD_BACKOFF_START;         // locals are not named
+    const u8 *tryFloodDetect = a->firstFloodDetect; // below; presumably used
+    u32 last_match = (u32)-1;                       // by the macros -- confirm
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32; // main-loop stride: 2 x 16 bytes
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 2); // likewise not named below
+
+    m128 res_old_1 = ones128(); // passed by address to every block
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably next 16B boundary
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) { // head block, loaded with a p_mask
+        ptr = mainStart - 16; // so this block ends at mainStart
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 2);
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+
+    if (ptr + 16 < buf_end) { // strict: block ending at buf_end -> tail loop
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+                                      load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4)); // 128 bytes ahead
+        CHECK_FLOOD; // macro defined outside this view
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+                                      load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+                                      load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) { // tail: 16B steps, p_mask loads
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                   a->buf_history, a->len_history, 2);
+        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; // hand the local copy back
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len; // one past the last buffer byte
+    const u8 *ptr = a->buf + a->start_offset; // current scan position
+    hwlmcb_rv_t controlVal = *a->groups; // local copy; written back below
+    hwlmcb_rv_t *control = &controlVal;             // NOTE(review): these 4
+    u32 floodBackoff = FLOOD_BACKOFF_START;         // locals are not named
+    const u8 *tryFloodDetect = a->firstFloodDetect; // below; presumably used
+    u32 last_match = (u32)-1;                       // by the macros -- confirm
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32; // main-loop stride: 2 x 16 bytes
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 3); // likewise not named below
+
+    m128 res_old_1 = ones128(); // res_old_*: passed by address per block
+    m128 res_old_2 = ones128();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably next 16B boundary
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) { // head block, loaded with a p_mask
+        ptr = mainStart - 16; // so this block ends at mainStart
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 3);
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+
+    if (ptr + 16 < buf_end) { // strict: block ending at buf_end -> tail loop
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4)); // 128 bytes ahead
+        CHECK_FLOOD; // macro defined outside this view
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+        m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      ones128(), load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) { // tail: 16B steps, p_mask loads
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 3);
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+    }
+    *a->groups = controlVal; // hand the local copy back
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len; // one past the last buffer byte
+    const u8 *ptr = a->buf + a->start_offset; // current scan position
+    hwlmcb_rv_t controlVal = *a->groups; // local copy; written back below
+    hwlmcb_rv_t *control = &controlVal;             // NOTE(review): these 4
+    u32 floodBackoff = FLOOD_BACKOFF_START;         // locals are not named
+    const u8 *tryFloodDetect = a->firstFloodDetect; // below; presumably used
+    u32 last_match = (u32)-1;                       // by the macros -- confirm
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32; // main-loop stride: 2 x 16 bytes
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 3); // likewise not named below
+
+    m128 res_old_1 = ones128(); // res_old_*: passed by address per block
+    m128 res_old_2 = ones128();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably next 16B boundary
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) { // head block, loaded with a p_mask
+        ptr = mainStart - 16; // so this block ends at mainStart
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 3);
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+
+    if (ptr + 16 < buf_end) { // strict: block ending at buf_end -> tail loop
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4)); // 128 bytes ahead
+        CHECK_FLOOD; // macro defined outside this view
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      ones128(), load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) { // tail: 16B steps, p_mask loads
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 3);
+        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                      p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; // hand the local copy back
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len; // one past the last buffer byte
+    const u8 *ptr = a->buf + a->start_offset; // current scan position
+    hwlmcb_rv_t controlVal = *a->groups; // local copy; written back below
+    hwlmcb_rv_t *control = &controlVal;             // NOTE(review): these 4
+    u32 floodBackoff = FLOOD_BACKOFF_START;         // locals are not named
+    const u8 *tryFloodDetect = a->firstFloodDetect; // below; presumably used
+    u32 last_match = (u32)-1;                       // by the macros -- confirm
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32; // main-loop stride: 2 x 16 bytes
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 4); // likewise not named below
+
+    m128 res_old_1 = ones128(); // res_old_*: passed by address per block
+    m128 res_old_2 = ones128();
+    m128 res_old_3 = ones128();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably next 16B boundary
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) { // head block, loaded with a p_mask
+        ptr = mainStart - 16; // so this block ends at mainStart
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 4);
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+
+    if (ptr + 16 < buf_end) { // strict: block ending at buf_end -> tail loop
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4)); // 128 bytes ahead
+        CHECK_FLOOD; // macro defined outside this view
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+        m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, ones128(), load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) { // tail: 16B steps, p_mask loads
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 4);
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+    }
+    *a->groups = controlVal; // hand the local copy back
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len; // one past the last buffer byte
+    const u8 *ptr = a->buf + a->start_offset; // current scan position
+    hwlmcb_rv_t controlVal = *a->groups; // local copy; written back below
+    hwlmcb_rv_t *control = &controlVal;             // NOTE(review): these 4
+    u32 floodBackoff = FLOOD_BACKOFF_START;         // locals are not named
+    const u8 *tryFloodDetect = a->firstFloodDetect; // below; presumably used
+    u32 last_match = (u32)-1;                       // by the macros -- confirm
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32; // main-loop stride: 2 x 16 bytes
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 4); // likewise not named below
+
+    m128 res_old_1 = ones128(); // res_old_*: passed by address per block
+    m128 res_old_2 = ones128();
+    m128 res_old_3 = ones128();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably next 16B boundary
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) { // head block, loaded with a p_mask
+        ptr = mainStart - 16; // so this block ends at mainStart
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 4);
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+
+    if (ptr + 16 < buf_end) { // strict: block ending at buf_end -> tail loop
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4)); // 128 bytes ahead
+        CHECK_FLOOD; // macro defined outside this view
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, ones128(), load128(ptr));
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, ones128(), load128(ptr + 16));
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) { // tail: 16B steps, p_mask loads
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+                                     a->buf_history, a->len_history, 4);
+        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                      &res_old_3, p_mask, val_0);
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; // hand the local copy back
+    return HWLM_SUCCESS;
+}
diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h
new file mode 100644
index 00000000..a0377f60
--- /dev/null
+++ b/src/fdr/teddy.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: function declarations.
+ */
+
+#ifndef TEDDY_H_
+#define TEDDY_H_
+
+#include "hwlm/hwlm.h" // expected to provide hwlm_error_t
+
+struct FDR; // forward declaration from fdr_internal.h
+struct FDR_Runtime_Args; // forward declaration; only used via pointer here
+/* All entry points below share one signature: (fdr, a) -> hwlm_error_t. */
+hwlm_error_t fdr_exec_s1_w128(const struct FDR *fdr,
+                              const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_s2_w128(const struct FDR *fdr,
+                              const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_s4_w128(const struct FDR *fdr,
+                              const struct FDR_Runtime_Args *a);
+/* Teddy models: msks<N> = N masks; a _pck suffix marks the packed variant. */
+hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a);
+
+#if defined(__AVX2__)
+/* AVX2-only models; _fat presumably = 16 buckets instead of 8 (confirm). */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a);
+/* The _fast models are declared for a single mask only (plain and _pck). */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
+                                            const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
+                                                const struct FDR_Runtime_Args *a);
+
+#endif /* __AVX2__ */
+
+#endif /* TEDDY_H_ */
diff --git a/src/fdr/teddy_autogen.py b/src/fdr/teddy_autogen.py
deleted file mode 100755
index 1cada00c..00000000
--- a/src/fdr/teddy_autogen.py
+++ /dev/null
@@ -1,773 +0,0 @@
-#!/usr/bin/python
-
-# Copyright (c) 2015-2016, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import sys
-from autogen_utils import *
-from string import Template
-
-class MT:
-    def produce_header(self, visible, header_only = False):
-        s = ""
-        if not visible:
-            s += "static never_inline"
-        s += """
-hwlm_error_t %s(UNUSED const struct FDR *fdr,
-                UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
-        if header_only:
-            s += ";"
-        else:
-            s += "{"
-        s += "\n"
-        return s
-
-    def produce_guard(self):
-        print self.arch.get_guard()
-
-    def produce_zero_alternative(self):
-        print """
-#else
-#define %s 0
-#endif
-""" % self.get_name()
-
-    def close_guard(self):
-        print "#endif"
-
-    def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
-        if cautious:
-            caution_string = "VECTORING"
-        else:
-            caution_string = "NOT_CAUTIOUS"
-        conf_split_mask = IntegerType(32).constant_to_string(
-                            self.conf_top_level_split - 1)
-        if enable_confirmless:
-            quick_check_string = """
-        if (!fdrc->mult) {
-            u32 id = fdrc->nBitsOrSoleID;
-            if ((last_match == id) && (fdrc->flags & NoRepeat))
-                continue;
-           last_match = id;
-           controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
-           continue;
-        } """
-        else:
-            quick_check_string = ""
-        if do_bailout:
-            bailout_string = """
-        if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
-        else:
-            bailout_string = ""
-
-        return Template("""
-if (P0(!!$CONFVAR)) {
-    do  {
-        u32 bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
-        u32 byte  = bit / $NUM_BUCKETS + $OFFSET;
-        u32 bitRem  = bit % $NUM_BUCKETS;
-        $BAILOUT_STRING
-        u32 confSplit = *(ptr+byte) & $SPLIT_MASK;
-        u32 idx = confSplit * $NUM_BUCKETS + bitRem;
-        u32 cf = confBase[idx];
-        if (!cf)
-            continue;
-        fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control))
-            continue;
-        $QUICK_CHECK_STRING
-        CautionReason reason = $CAUTION_STRING;
-        CONF_TYPE v;
-        const u8 * confirm_loc = ptr + byte - $CONF_PULL_BACK - 7;
-        if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
-            v = lv_u64a(confirm_loc, buf, buf + len);
-        } else { // r == VECTORING, confirm_loc < buf
-            u64a histBytes = a->histBytes;
-            v = lv_u64a_ce(confirm_loc, buf, buf + len);
-            // stitch together v (which doesn't move) and history (which does)
-            u32 overhang = buf - confirm_loc;
-            histBytes >>= 64 - (overhang * 8);
-            v |= histBytes;
-        }
-        confWithBit(fdrc, a, ptr - buf + byte, $CONF_PULL_BACK, control, &last_match, v);
-    } while(P0(!!$CONFVAR));
-    if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
-        *a->groups = controlVal;
-        return HWLM_TERMINATED;
-    }
-}""").substitute(CONFVAR = conf_var_name,
-                 CONFVAR_SIZE = conf_var_size,
-                 NUM_BUCKETS = self.num_buckets,
-                 OFFSET = offset,
-                 SPLIT_MASK = conf_split_mask,
-                 QUICK_CHECK_STRING = quick_check_string,
-                 BAILOUT_STRING = bailout_string,
-                 CAUTION_STRING = caution_string,
-                 CONF_PULL_BACK = self.conf_pull_back)
-
-    def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
-        if self.packed:
-            print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False)
-        else:
-            if cautious:
-                caution_string = "VECTORING"
-            else:
-                caution_string = "NOT_CAUTIOUS"
-
-            print "            if (P0(!!%s)) {" % var_name
-            print "                do  {"
-            if bits == 64:
-                print "                    u32 bit = findAndClearLSB_64(&%s);" % (var_name)
-            else:
-                print "                    u32 bit = findAndClearLSB_32(&%s);" % (var_name)
-            print "                    u32 byte  = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
-            print "                    u32 idx  = bit %% %d;" % self.num_buckets
-            print "                    u32 cf = confBase[idx];"
-            print "                    fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
-            print "                    if (!(fdrc->groups & *control))"
-            print "                        continue;"
-            print """
-                CautionReason reason = %s;
-                CONF_TYPE v;
-                const u8 * confirm_loc = ptr + byte - 7;
-                if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
-                    v = lv_u64a(confirm_loc, buf, buf + len);
-                } else { // r == VECTORING, confirm_loc < buf
-                    u64a histBytes = a->histBytes;
-                    v = lv_u64a_ce(confirm_loc, buf, buf + len);
-                    // stitch together v (which doesn't move) and history (which does)
-                    u32 overhang = buf - confirm_loc;
-                    histBytes >>= 64 - (overhang * 8);
-                    v |= histBytes;
-                }""" % (caution_string)
-            if self.num_masks == 1:
-                print "                    confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);"
-            else:
-                print "                    confWithBitMany(fdrc, a, ptr - buf + byte, %s, control, &last_match, v);" % (caution_string)
-            print "                } while(P0(!!%s));" % var_name
-            print "                if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
-            print "                    *a->groups = controlVal;"
-            print "                    return HWLM_TERMINATED;"
-            print "                }"
-            print "            }"
-
-    def produce_needed_temporaries(self, max_iterations):
-        print "        m128 p_mask;"
-        for iter in range(0, max_iterations):
-            print "        m128 val_%d;" % iter
-            print "        m128 val_%d_lo;" % iter
-            print "        m128 val_%d_hi;" % iter
-            for x in range(self.num_masks):
-                print "        m128 res_%d_%d;" % (iter, x)
-                if x != 0:
-                    print "        m128 res_shifted_%d_%d;" % (iter, x)
-            print "        m128 r_%d;" % iter
-            print "#ifdef ARCH_64_BIT"
-            print "            u64a r_%d_lopart;" % iter
-            print "            u64a r_%d_hipart;" % iter
-            print "#else"
-            print "            u32 r_%d_part1;" % iter
-            print "            u32 r_%d_part2;" % iter
-            print "            u32 r_%d_part3;" % iter
-            print "            u32 r_%d_part4;" % iter
-            print "#endif"
-
-    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
-                                         cautious, save_old):
-        if cautious:
-            print "        val_%d = vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
-        else:
-            print "        val_%d = load128(ptr + %d);" % (iter, iter*16)
-        print "        val_%d_lo = and128(val_%d, lomask);" % (iter, iter)
-        print "        val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter)
-        print "        val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter)
-        print
-        for x in range(self.num_masks):
-            print Template("""
-        res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2]  , val_${ITER}_lo),
-                                  pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
-            if x != 0:
-                if iter == 0:
-                    print "        res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x,   iter, x,         x,   x)
-                else:
-                    print "        res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x,    iter, x, iter-1, x,   x)
-            if x != 0 and iter == effective_num_iterations - 1 and save_old:
-                print "        res_old_%d = res_%d_%d;" % (x, iter, x)
-        print
-        if cautious:
-            print "        r_%d = and128(res_%d_0, p_mask);" % (iter, iter)
-        else:
-            print "        r_%d = res_%d_0;" % (iter, iter)
-        for x in range(1, self.num_masks):
-            print "        r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
-        print
-
-    def produce_one_iteration_confirm(self, iter, confirmCautious):
-        setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter),
-                    (8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ]
-
-        setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter),
-                    (4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter),
-                    (8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter),
-                    (12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ]
-
-        print "        if (P0(isnonzero128(r_%d))) {" % (iter)
-        print "#ifdef ARCH_64_BIT"
-        for (off, val, init) in setup64:
-            print "            %s = %s;" % (val, init)
-        for (off, val, init) in setup64:
-            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
-        print "#else"
-        for (off, val, init) in setup32:
-            print "            %s = %s;" % (val, init)
-        for (off, val, init) in setup32:
-            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
-        print "#endif"
-        print "        }"
-
-    def produce_one_iteration(self, iter, effective_num_iterations, cautious = False,
-                              confirmCautious = True, save_old = True):
-        self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old)
-        self.produce_one_iteration_confirm(iter, confirmCautious)
-
-    def produce_code(self):
-        print self.produce_header(visible = True, header_only = False)
-        print """
-    const u8 * buf = a->buf;
-    const size_t len = a->len;
-    const u8 * ptr = buf + a->start_offset;
-    hwlmcb_rv_t controlVal = *a->groups;
-    hwlmcb_rv_t * control = &controlVal;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 * tryFloodDetect = a->firstFloodDetect;
-    const struct FDRConfirm *fdrc;
-    u32 last_match = (u32)-1;
-"""
-        print
-
-        self.produce_needed_temporaries(self.num_iterations)
-        print
-
-        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
-        print "    const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
-        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks
-        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
-        print "    const size_t iterBytes = %d;" % (self.num_iterations * 16)
-
-        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
-                                ' buf, len, a->start_offset);'
-        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
-                                ' mainStart);'
-
-        for x in range(self.num_masks):
-            if (x != 0):
-                print "    m128 res_old_%d = ones128();" % x
-        print "    m128 lomask = set16x8(0xf);"
-
-        print "    if (ptr < mainStart) {"
-        print "         ptr = mainStart - 16;"
-        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
-        print "         ptr += 16;"
-        print "    }"
-
-        print "    if (ptr + 16 < buf + len) {"
-        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
-        print "         ptr += 16;"
-        print "    }"
-
-        print """
-    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        if (P0(ptr > tryFloodDetect)) {
-            tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
-            if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
-                *a->groups = controlVal;
-                return HWLM_TERMINATED;
-            }
-        }
-"""
-        for iter in range(self.num_iterations):
-            self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)
-
-        print "    }"
-
-        print "    for (; ptr < buf + len; ptr += 16) {"
-        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
-        print "    }"
-
-        print """
-    *a->groups = controlVal;
-    return HWLM_SUCCESS;
-}
-"""
-
-    def produce_compile_call(self):
-        packed_str = { False : "false", True : "true"}[self.packed]
-        print "        { %d, %s, %d, %d, %s, %d, %d }," % (
-            self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
-            self.conf_pull_back, self.conf_top_level_split)
-
-    def get_name(self):
-        if self.packed:
-            pck_string = "_pck"
-        else:
-            pck_string = ""
-
-        if self.num_buckets == 16:
-            type_string = "_fat"
-        else:
-            type_string = ""
-
-        return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string)
-
-    def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8):
-        self.arch = arch
-        self.packed = packed
-        self.num_masks = num_masks
-        self.num_buckets = num_buckets
-        self.num_iterations = 2
-
-        if packed:
-            self.conf_top_level_split = 32
-        else:
-            self.conf_top_level_split = 1
-        self.conf_pull_back = 0
-
-class MTFat(MT):
-    def produce_needed_temporaries(self, max_iterations):
-        print "        m256 p_mask;"
-        for iter in range(0, max_iterations):
-            print "        m256 val_%d;" % iter
-            print "        m256 val_%d_lo;" % iter
-            print "        m256 val_%d_hi;" % iter
-            for x in range(self.num_masks):
-                print "        m256 res_%d_%d;" % (iter, x)
-                if x != 0:
-                    print "        m256 res_shifted_%d_%d;" % (iter, x)
-            print "        m256 r_%d;" % iter
-            print "#ifdef ARCH_64_BIT"
-            print "            u64a r_%d_part1;" % iter
-            print "            u64a r_%d_part2;" % iter
-            print "            u64a r_%d_part3;" % iter
-            print "            u64a r_%d_part4;" % iter
-            print "#else"
-            print "            u32 r_%d_part1;" % iter
-            print "            u32 r_%d_part2;" % iter
-            print "            u32 r_%d_part3;" % iter
-            print "            u32 r_%d_part4;" % iter
-            print "            u32 r_%d_part5;" % iter
-            print "            u32 r_%d_part6;" % iter
-            print "            u32 r_%d_part7;" % iter
-            print "            u32 r_%d_part8;" % iter
-            print "#endif"
-
-    def produce_code(self):
-        print self.produce_header(visible = True, header_only = False)
-        print """
-    const u8 * buf = a->buf;
-    const size_t len = a->len;
-    const u8 * ptr = buf + a->start_offset;
-    hwlmcb_rv_t controlVal = *a->groups;
-    hwlmcb_rv_t * control = &controlVal;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 * tryFloodDetect = a->firstFloodDetect;
-    const struct FDRConfirm *fdrc;
-    u32 last_match = (u32)-1;
-"""
-        print
-
-        self.produce_needed_temporaries(self.num_iterations)
-        print
-
-        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
-        print "    const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));"
-        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks
-        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
-        print "    const size_t iterBytes = %d;" % (self.num_iterations * 16)
-
-        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
-                                ' buf, len, a->start_offset);'
-        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
-                                ' mainStart);'
-
-        for x in range(self.num_masks):
-            if (x != 0):
-                print "    m256 res_old_%d = ones256();" % x
-        print "    m256 lomask = set32x8(0xf);"
-
-        print "    if (ptr < mainStart) {"
-        print "         ptr = mainStart - 16;"
-        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
-        print "         ptr += 16;"
-        print "    }"
-
-        print "    if (ptr + 16 < buf + len) {"
-        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
-        print "         ptr += 16;"
-        print "    }"
-
-        print """
-    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        if (P0(ptr > tryFloodDetect)) {
-            tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
-            if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
-                *a->groups = controlVal;
-                return HWLM_TERMINATED;
-            }
-        }
-"""
-
-        for iter in range(self.num_iterations):
-            self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)
-
-        print "    }"
-
-        print "    for (; ptr < buf + len; ptr += 16) {"
-        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
-        print "    }"
-
-        print """
-    *a->groups = controlVal;
-    return HWLM_SUCCESS;
-}
-"""
-
-    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
-                                         cautious, save_old):
-        if cautious:
-            print "        val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
-        else:
-            print "        val_%d = load2x128(ptr + %d);" % (iter, iter*16)
-        print "        val_%d_lo = and256(val_%d, lomask);" % (iter, iter)
-        print "        val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter)
-        print "        val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter)
-        print
-        for x in range(self.num_masks):
-            print Template("""
-        res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2]  , val_${ITER}_lo),
-                                  vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
-            if x != 0:
-                if iter == 0:
-                    print "        res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x,   iter, x,         x,   x)
-                else:
-                    print "        res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x,    iter, x, iter-1, x,   x)
-            if x != 0 and iter == effective_num_iterations - 1 and save_old:
-                print "        res_old_%d = res_%d_%d;" % (x, iter, x)
-        print
-        if cautious:
-            print "        r_%d = and256(res_%d_0, p_mask);" % (iter, iter)
-        else:
-            print "        r_%d = res_%d_0;" % (iter, iter)
-        for x in range(1, self.num_masks):
-            print "        r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
-        print
-
-    def produce_one_iteration_confirm(self, iter, confirmCautious):
-        setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"),
-                    (4, "r_%d_part2" % iter, "extract64from256(r, 1);\n            r = interleave256hi(r_%d, r_swap)" % (iter)),
-                    (8, "r_%d_part3" % iter, "extractlow64from256(r)"),
-                    (12, "r_%d_part4" % iter, "extract64from256(r, 1)") ]
-
-        setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"),
-                    (2, "r_%d_part2" % iter, "extract32from256(r, 1)"),
-                    (4, "r_%d_part3" % iter, "extract32from256(r, 2)"),
-                    (6, "r_%d_part4" % iter, "extract32from256(r, 3);\n            r = interleave256hi(r_%d, r_swap)" % (iter)),
-                    (8, "r_%d_part5" % iter, "extractlow32from256(r)"),
-                    (10, "r_%d_part6" % iter, "extract32from256(r, 1)"),
-                    (12, "r_%d_part7" % iter, "extract32from256(r, 2)"),
-                    (14, "r_%d_part8" % iter, "extract32from256(r, 3)") ]
-
-        print "        if (P0(isnonzero256(r_%d))) {" % (iter)
-        print "            m256 r_swap = swap128in256(r_%d);" % (iter)
-        print "            m256 r = interleave256lo(r_%d, r_swap);" % (iter)
-        print "#ifdef ARCH_64_BIT"
-        for (off, val, init) in setup64:
-            print "            %s = %s;" % (val, init)
-
-        for (off, val, init) in setup64:
-            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
-        print "#else"
-        for (off, val, init) in setup32:
-            print "            %s = %s;" % (val, init)
-
-        for (off, val, init) in setup32:
-            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
-        print "#endif"
-        print "        }"
-
-class MTFast:
-    def produce_header(self, visible, header_only = False):
-        s = ""
-        if not visible:
-            s += "static never_inline"
-        s += """
-hwlm_error_t %s(UNUSED const struct FDR *fdr,
-                UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
-        if header_only:
-            s += ";"
-        else:
-            s += "{"
-        s += "\n"
-        return s
-
-    def produce_guard(self):
-        print self.arch.get_guard()
-
-    def produce_zero_alternative(self):
-        print """
-#else
-#define %s 0
-#endif
-""" % self.get_name()
-
-    def close_guard(self):
-        print "#endif"
-
-    def produce_confirm(self, cautious):
-        if cautious:
-            cautious_str = "VECTORING"
-        else:
-            cautious_str = "NOT_CAUTIOUS"
-
-        print "            for (u32 i = 0; i < arrCnt; i++) {"
-        print "                u32 byte = bitArr[i] / 8;"
-        if self.packed:
-            conf_split_mask = IntegerType(32).constant_to_string(
-                                self.conf_top_level_split - 1)
-            print "                u32 bitRem  = bitArr[i] % 8;"
-            print "                u32 confSplit = *(ptr+byte) & 0x1f;"
-            print "                u32 idx = confSplit * %d + bitRem;" % self.num_buckets
-            print "                u32 cf = confBase[idx];"
-            print "                if (!cf)"
-            print "                    continue;"
-            print "                fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
-            print "                if (!(fdrc->groups & *control))"
-            print "                    continue;"
-            print """
-                CautionReason reason = %s;
-                CONF_TYPE v;
-                const u8 * confirm_loc = ptr + byte - 7;
-                if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
-                    v = lv_u64a(confirm_loc, buf, buf + len);
-                } else { // r == VECTORING, confirm_loc < buf
-                    u64a histBytes = a->histBytes;
-                    v = lv_u64a_ce(confirm_loc, buf, buf + len);
-                    // stitch together v (which doesn't move) and history (which does)
-                    u32 overhang = buf - confirm_loc;
-                    histBytes >>= 64 - (overhang * 8);
-                    v |= histBytes;
-                }""" % (cautious_str)
-            print "                confWithBit(fdrc, a, ptr - buf + byte, 0, control, &last_match, v);"
-        else:
-            print "                u32 cf = confBase[bitArr[i] % 8];"
-            print "                fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
-            print """
-                CautionReason reason = %s;
-                CONF_TYPE v;
-                const u8 * confirm_loc = ptr + byte - 7;
-                if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
-                    v = lv_u64a(confirm_loc, buf, buf + len);
-                } else { // r == VECTORING, confirm_loc < buf
-                    u64a histBytes = a->histBytes;
-                    v = lv_u64a_ce(confirm_loc, buf, buf + len);
-                    // stitch together v (which doesn't move) and history (which does)
-                    u32 overhang = buf - confirm_loc;
-                    histBytes >>= 64 - (overhang * 8);
-                    v |= histBytes;
-                }""" % (cautious_str)
-            print "                confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);"
-        print "                if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
-        print "                    *a->groups = controlVal;"
-        print "                    return HWLM_TERMINATED;"
-        print "                }"
-        print "            }"
-
-    def produce_needed_temporaries(self, max_iterations):
-        print "        u32 arrCnt;"
-        print "        u16 bitArr[512];"
-        print "        m256 p_mask;"
-        print "        m256 val_0;"
-        print "        m256 val_0_lo;"
-        print "        m256 val_0_hi;"
-        print "        m256 res_0;"
-        print "        m256 res_1;"
-        print "        m128 lo_part;"
-        print "        m128 hi_part;"
-        print "#ifdef ARCH_64_BIT"
-        print "        u64a r_0_part;"
-        print "#else"
-        print "        u32 r_0_part;"
-        print "#endif"
-
-    def produce_bit_scan(self, offset, bits):
-        print "                while (P0(!!r_0_part)) {"
-        if bits == 64:
-            print "                    bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset)
-        else:
-            print "                    bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset)
-        print "                }"
-
-    def produce_bit_check_128(self, var_name, offset):
-        print "            if (P0(isnonzero128(%s))) {" % (var_name)
-        print "#ifdef ARCH_64_BIT"
-        print "                r_0_part = movq(%s);" % (var_name)
-        self.produce_bit_scan(offset, 64)
-        print "                r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name)
-        self.produce_bit_scan(offset + 1, 64)
-        print "#else"
-        print "                r_0_part = movd(%s);" % (var_name)
-        self.produce_bit_scan(offset * 2, 32)
-        for step in range(1, 4):
-            print "                r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4)
-            self.produce_bit_scan(offset * 2 + step, 32)
-        print "#endif"
-        print "            }"
-
-    def produce_bit_check_256(self, iter, single_iter, cautious):
-        print "        if (P0(isnonzero256(res_%d))) {" % (iter)
-        if single_iter:
-            print "            arrCnt = 0;"
-        print "            lo_part = cast256to128(res_%d);" % (iter)
-        print "            hi_part = cast256to128(swap128in256(res_%d));" % (iter)
-        self.produce_bit_check_128("lo_part", iter * 4)
-        self.produce_bit_check_128("hi_part", iter * 4 + 2)
-        if single_iter:
-            self.produce_confirm(cautious)
-        print "        }"
-
-    def produce_one_iteration_state_calc(self, iter, cautious):
-        if cautious:
-            print "        val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32)
-        else:
-            print "        val_0 = load256(ptr + %d);" % (iter * 32)
-        print "        val_0_lo = and256(val_0, lomask);"
-        print "        val_0_hi = rshift4x64(val_0, 4);"
-        print "        val_0_hi = and256(val_0_hi, lomask);"
-        print "        res_%d = and256(vpshufb(maskLo  , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter)
-        if cautious:
-            print "        res_%d = and256(res_%d, p_mask);" % (iter, iter)
-
-    def produce_code(self):
-        print self.produce_header(visible = True, header_only = False)
-        print """
-    const u8 * buf = a->buf;
-    const size_t len = a->len;
-    const u8 * ptr = buf + a->start_offset;
-    hwlmcb_rv_t controlVal = *a->groups;
-    hwlmcb_rv_t * control = &controlVal;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 * tryFloodDetect = a->firstFloodDetect;
-    const struct FDRConfirm *fdrc;
-    u32 last_match = (u32)-1;
-"""
-        print
-
-        self.produce_needed_temporaries(self.num_iterations)
-
-        print "    const struct Teddy * teddy = (const struct Teddy *)fdr;"
-        print "    const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
-        print "    const m256 maskLo = set2x128(maskBase[0]);"
-        print "    const m256 maskHi = set2x128(maskBase[1]);"
-        print "    const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);"
-        print "    const u8 * mainStart = ROUNDUP_PTR(ptr, 32);"
-        print "    const size_t iterBytes = %d;" % (self.num_iterations * 32)
-
-        print '    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
-                                ' buf, len, a->start_offset);'
-        print '    DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
-                                ' mainStart);'
-        print "    const m256 lomask = set32x8(0xf);"
-
-        print "    if (ptr < mainStart) {"
-        print "        ptr = mainStart - 32;"
-        self.produce_one_iteration_state_calc(iter = 0, cautious = True)
-        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
-        print "        ptr += 32;"
-        print "    }"
-
-        print "    if (ptr + 32 < buf + len) {"
-        self.produce_one_iteration_state_calc(iter = 0, cautious = False)
-        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
-        print "        ptr += 32;"
-        print "    }"
-        print """
-    for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        if (P0(ptr > tryFloodDetect)) {
-            tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
-            if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
-                *a->groups = controlVal;
-                return HWLM_TERMINATED;
-            }
-        }
-"""
-
-        for iter in range (0, self.num_iterations):
-            self.produce_one_iteration_state_calc(iter = iter, cautious = False)
-        print "        arrCnt = 0;"
-        for iter in range (0, self.num_iterations):
-            self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False)
-        self.produce_confirm(cautious = False)
-        print "    }"
-
-        print "    for (; ptr < buf + len; ptr += 32) {"
-        self.produce_one_iteration_state_calc(iter = 0, cautious = True)
-        self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
-        print "    }"
-
-        print """
-    *a->groups = controlVal;
-    return HWLM_SUCCESS;
-}
-"""
-
-    def get_name(self):
-        if self.packed:
-            pck_string = "_pck"
-        else:
-            pck_string = ""
-        return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, pck_string)
-
-    def produce_compile_call(self):
-        packed_str = { False : "false", True : "true"}[self.packed]
-        print "        { %d, %s, %d, %d, %s, %d, %d }," % (
-            self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
-            self.conf_pull_back, self.conf_top_level_split)
-
-    def __init__(self, arch, packed = False):
-        self.arch = arch
-        self.packed = packed
-        self.num_masks = 1
-        self.num_buckets = 8
-        self.num_iterations = 2
-
-        self.conf_top_level_split = 1
-        self.conf_pull_back = 0
-        if packed:
-            self.conf_top_level_split = 32
-        else:
-            self.conf_top_level_split = 1
-        self.conf_pull_back = 0
diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c
new file mode 100644
index 00000000..33dd8a30
--- /dev/null
+++ b/src/fdr/teddy_avx2.c
@@ -0,0 +1,1110 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: AVX2 engine runtime.
+ */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
+#include "util/simd_utils.h"
+#include "util/simd_utils_ssse3.h"
+
+#if defined(__AVX2__)
+
+static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
+    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+};
+
+#ifdef ARCH_64_BIT
+#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
+do { /* needs confBase, a, ptr, control, last_match in scope */             \
+    if (unlikely(isnonzero256(var))) {                                      \
+        m256 swap = swap128in256(var);                                      \
+        m256 r = interleave256lo(var, swap);                                \
+        u64a part1 = extractlow64from256(r);                                \
+        u64a part2 = extract64from256(r, 1);                                \
+        r = interleave256hi(var, swap);                                     \
+        u64a part3 = extractlow64from256(r);                                \
+        u64a part4 = extract64from256(r, 1);                                \
+        if (unlikely(part1)) { /* offset steps by 4 per u64a part */        \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part2)) {                                              \
+            conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part3)) {                                              \
+            conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part4)) {                                              \
+            conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr,  \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while (0); /* NB: this ';' is part of the expansion */
+#else
+#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
+do { /* needs confBase, a, ptr, control, last_match in scope */             \
+    if (unlikely(isnonzero256(var))) {                                      \
+        m256 swap = swap128in256(var);                                      \
+        m256 r = interleave256lo(var, swap);                                \
+        u32 part1 = extractlow32from256(r);                                 \
+        u32 part2 = extract32from256(r, 1);                                 \
+        u32 part3 = extract32from256(r, 2);                                 \
+        u32 part4 = extract32from256(r, 3);                                 \
+        r = interleave256hi(var, swap);                                     \
+        u32 part5 = extractlow32from256(r);                                 \
+        u32 part6 = extract32from256(r, 1);                                 \
+        u32 part7 = extract32from256(r, 2);                                 \
+        u32 part8 = extract32from256(r, 3);                                 \
+        if (unlikely(part1)) { /* offset steps by 2 per u32 part */         \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part2)) {                                              \
+            conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part3)) {                                              \
+            conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part4)) {                                              \
+            conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part5)) {                                              \
+            conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr,   \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part6)) {                                              \
+            conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr,  \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part7)) {                                              \
+            conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr,  \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part8)) {                                              \
+            conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr,  \
+                    control, &last_match);                                  \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while (0); /* NB: this ';' is part of the expansion */
+#endif
+#define CONFIRM_FAST_TEDDY(var, offset, reason, conf_fn)                    \
+do { /* needs bitArr, confBase, a, ptr, control, last_match in scope */     \
+    if (unlikely(isnonzero256(var))) {                                      \
+        u32 arrCnt = 0; /* number of bitArr entries to confirm */           \
+        m128 lo = cast256to128(var);                                        \
+        m128 hi = cast256to128(swap128in256(var));                          \
+        bit_array_fast_teddy(lo, bitArr, &arrCnt, offset);                  \
+        bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2);              \
+        for (u32 i = 0; i < arrCnt; i++) {                                  \
+            conf_fn(bitArr[i], confBase, reason, a, ptr, control,           \
+                    &last_match);                                           \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while (0); /* NB: this ';' is part of the expansion */
+
+static really_inline
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                       const u8 *buf_history, size_t len_history,
+                       const u32 nMasks) {
+    m128 mask128; /* out-parameter of the 128-bit vectored load below */
+    const m128 val128 = vectoredLoad128(&mask128, ptr, lo, hi, buf_history,
+                                        len_history, nMasks);
+    *p_mask = set2x128(mask128); /* widen mask and data the same way */
+    return set2x128(val128);
+}
+
+/**
+ * \brief Copy a block of [0,31] bytes from src to dst efficiently.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad256.
+ */
+static really_inline
+void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
+    switch (len) {
+    case 0:
+        break;
+    case 1:
+        *dst = *src;
+        break;
+    case 2:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        break;
+    case 3:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        dst[2] = src[2];
+        break;
+    case 4:
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 5:
+    case 6:
+    case 7:
+        /* len in [5,7]: two overlapping 4-byte chunks cover the block. */
+        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 8:
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+        /* len in [9,15]: two overlapping 8-byte chunks cover the block. */
+        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 16:
+        storeu128(dst, loadu128(src));
+        break;
+    default:
+        /* len in [17,31]: two overlapping 16-byte chunks cover the block. */
+        assert(len < 32);
+        storeu128(dst + len - 16, loadu128(src + len - 16));
+        storeu128(dst, loadu128(src));
+        break;
+    }
+}
+
+static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history) {
+    union {
+        u8 val8[32];
+        m256 val256;
+    } u; // staging block; only bytes inside [lo, hi) are ever written
+    // Loads the 32 bytes at ptr, reading memory only inside [lo, hi).
+    uintptr_t copy_start;
+    uintptr_t copy_len;
+    // *p_mask comes from p_mask_arr256, indexed by the in-range byte count.
+    if (ptr >= lo) {
+        uintptr_t avail = (uintptr_t)(hi - ptr);
+        if (avail >= 32) { // whole block is in range: plain unaligned load
+            *p_mask = load256(p_mask_arr256[32] + 32);
+            return loadu256(ptr);
+        }
+        *p_mask = load256(p_mask_arr256[avail] + 32);
+        copy_start = 0;
+        copy_len = avail;
+    } else {
+        // ptr lies before lo: the first (lo - ptr) bytes are out of range.
+        // NOTE(review): i starts at lo - ptr, so ptr + i == lo on entry and
+        // the history loop below never runs; buf_history is never read here.
+        uintptr_t start = (uintptr_t)(lo - ptr);
+        uintptr_t i;
+        for (i = start; ptr + i < lo; i++) {
+            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        }
+        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
+        *p_mask = loadu256(p_mask_arr256[end - start] + 32 - start);
+        copy_start = i;
+        copy_len = end - i;
+    }
+
+    // Copy the in-range bytes; the remainder of u stays uninitialised.
+    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+    return u.val256;
+}
+
+static really_inline
+void do_confWithBit1_fast_teddy(u16 bits, const u32 *confBase,
+                                CautionReason reason,
+                                const struct FDR_Runtime_Args *a,
+                                const u8 *ptr, hwlmcb_rv_t *control,
+                                u32 *last_match) {
+    /* bits / 8 is the byte position relative to ptr; bits % 8 indexes
+     * confBase, whose entry is the FDRConfirm's byte offset from confBase. */
+    const u32 pos = bits >> 3;
+    const u8 *conf = (const u8 *)confBase + confBase[bits & 7];
+    confWithBit1((const struct FDRConfirm *)conf, a, ptr - a->buf + pos,
+                 control, last_match, getConfVal(a, ptr, pos, reason));
+}
+
+static really_inline
+void do_confWithBit_fast_teddy(u16 bits, const u32 *confBase,
+                               CautionReason reason,
+                               const struct FDR_Runtime_Args *a, const u8 *ptr,
+                               hwlmcb_rv_t *control, u32 *last_match) {
+    /*
+     * bits / 8 is the byte position relative to ptr and bits % 8 the bit
+     * within it. The confBase index also folds in the low five bits of the
+     * input byte at that position; a zero entry means nothing to confirm.
+     */
+    const u32 pos = bits >> 3;
+    const u32 slot = ((u32)ptr[pos] & 0x1f) * 8 + (bits & 7);
+    const u32 conf_off = confBase[slot];
+    const struct FDRConfirm *fdrc =
+        (const struct FDRConfirm *)((const u8 *)confBase + conf_off);
+    /* fdrc is only dereferenced once the entry is known to be non-zero. */
+    if (conf_off && (fdrc->groups & *control)) {
+        confWithBit(fdrc, a, ptr - a->buf + pos, 0, control, last_match,
+                    getConfVal(a, ptr, pos, reason));
+    }
+}
+
+static really_inline
+void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) {
+    /* For each set bit of var, append TEDDY_FIND_AND_CLEAR_LSB's result plus
+     * the bit offset of its word plus 64 * offset to bitArr, bumping
+     * *arrCnt. The caller sizes bitArr. */
+    if (!unlikely(isnonzero128(var))) {
+        return;
+    }
+#ifdef ARCH_64_BIT
+    const u32 base = 64 * offset;
+    u64a word_0 = movq(var);
+    u64a word_1 = movq(byteShiftRight128(var, 8));
+    while (unlikely(word_0)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word_0) + base;
+        ++*arrCnt;
+    }
+    while (unlikely(word_1)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word_1) + base + 64;
+        ++*arrCnt;
+    }
+#else
+    const u32 base = 32 * (offset * 2);
+    u32 word_0 = movd(var);
+    u32 word_1 = movd(byteShiftRight128(var, 4));
+    u32 word_2 = movd(byteShiftRight128(var, 8));
+    u32 word_3 = movd(byteShiftRight128(var, 12));
+    while (unlikely(word_0)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word_0) + base;
+        ++*arrCnt;
+    }
+    while (unlikely(word_1)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word_1) + base + 32;
+        ++*arrCnt;
+    }
+    while (unlikely(word_2)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word_2) + base + 64;
+        ++*arrCnt;
+    }
+    while (unlikely(word_3)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word_3) + base + 96;
+        ++*arrCnt;
+    }
+#endif
+}
+
+static really_inline
+m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) {
+    /* Shuffle maskBase[0] by val & 0xf and maskBase[1] by (val >> 4) & 0xf. */
+    const m256 nib = set32x8(0xf);
+    m256 hit_lo = vpshufb(maskBase[0], and256(val, nib));
+    m256 hit_hi = vpshufb(maskBase[1], and256(rshift4x64(val, 4), nib));
+    return and256(and256(hit_lo, hit_hi), p_mask);
+}
+
+static really_inline
+m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask,
+                            m256 val) {
+    /* r already carries p_mask (applied in prep_conf_fat_teddy_m1). The
+     * mask-1 lookup is combined with r through vpalignr against *old_1, the
+     * lookup kept from the previous call; *old_1 is then refreshed. */
+    const m256 nib = set32x8(0xf);
+    m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val);
+    m256 res_1 = and256(vpshufb(maskBase[2], and256(val, nib)),
+                        vpshufb(maskBase[3], and256(rshift4x64(val, 4), nib)));
+    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1);
+    *old_1 = res_1;
+    return and256(r, res_shifted_1);
+}
+
+static really_inline
+m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
+                            m256 p_mask, m256 val) {
+    /* Masks 0 and 1 are handled by prep_conf_fat_teddy_m2; this adds the
+     * mask-2 lookup, combined through vpalignr with the lookup kept in
+     * *old_2 from the previous call, and then refreshes *old_2. */
+    const m256 nib = set32x8(0xf);
+    m256 acc = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val);
+    m256 cur = and256(vpshufb(maskBase[4], and256(val, nib)),
+                      vpshufb(maskBase[5], and256(rshift4x64(val, 4), nib)));
+    m256 shifted = vpalignr(cur, *old_2, 16 - 2);
+    *old_2 = cur;
+    return and256(acc, shifted);
+}
+
+static really_inline
+m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
+                            m256 *old_3, m256 p_mask, m256 val) {
+    /* Masks 0 to 2 are handled by prep_conf_fat_teddy_m3; this adds the
+     * mask-3 lookup, combined through vpalignr with the lookup kept in
+     * *old_3 from the previous call, and then refreshes *old_3. */
+    const m256 nib = set32x8(0xf);
+    m256 acc = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val);
+    m256 cur = and256(vpshufb(maskBase[6], and256(val, nib)),
+                      vpshufb(maskBase[7], and256(rshift4x64(val, 4), nib)));
+    m256 shifted = vpalignr(cur, *old_3, 16 - 3);
+    *old_3 = cur;
+    return and256(acc, shifted);
+}
+
+static really_inline
+m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi,
+                             m256 p_mask) {
+    /* AND of the maskLo/maskHi shuffles of val and val >> 4, then p_mask. */
+    m256 from_lo = vpshufb(maskLo, and256(val, mask));
+    m256 from_hi = vpshufb(maskHi, and256(rshift4x64(val, 4), mask));
+    return and256(and256(from_lo, from_hi), p_mask);
+}
+
+static really_inline
+const m256 * getMaskBase_avx2(const struct Teddy *teddy) {
+    return (const m256 *)(teddy + 1); /* first byte past the Teddy header */
+}
+
+static really_inline
+const u32 * getConfBase_avx2(const struct Teddy *teddy, u8 numMask) {
+    /* Skip the Teddy header, then 2 * numMask entries of 32 bytes each. */
+    return (const u32 *)((const u8 *)(teddy + 1) + 64 * numMask);
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* One mask; candidates are confirmed via do_confWithBit1_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 1);
+    /* Work is done in 16-byte blocks: masked head, bulk loop, masked tail. */
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 1);
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
+        ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(),
+                                          load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 1);
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* One mask; candidates are confirmed via do_confWithBit_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 1);
+    /* Work is done in 16-byte blocks: masked head, bulk loop, masked tail. */
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 1);
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(),
+                                          load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 1);
+        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* Two masks; candidates are confirmed via do_confWithBitMany_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 2);
+    /* res_old_* carry per-mask lookups from one 16-byte block to the next. */
+    m256 res_old_1 = ones256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 2);
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+                                          load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 2);
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* Two masks; candidates are confirmed via do_confWithBit_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 2);
+    /* res_old_* carry per-mask lookups from one 16-byte block to the next. */
+    m256 res_old_1 = ones256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 2);
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+         ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+                                          load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 2);
+        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* Three masks; candidates are confirmed via do_confWithBitMany_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 3);
+    /* res_old_* carry per-mask lookups from one 16-byte block to the next. */
+    m256 res_old_1 = ones256();
+    m256 res_old_2 = ones256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 3);
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          ones256(), load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 3);
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* Three masks; candidates are confirmed via do_confWithBit_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 3);
+    /* res_old_* carry per-mask lookups from one 16-byte block to the next. */
+    m256 res_old_1 = ones256();
+    m256 res_old_2 = ones256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 3);
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          ones256(), load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          ones256(), load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 3);
+        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+                                          p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
+                                           const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* Four masks; candidates are confirmed via do_confWithBitMany_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 4);
+    /* res_old_* carry per-mask lookups from one 16-byte block to the next. */
+    m256 res_old_1 = ones256();
+    m256 res_old_2 = ones256();
+    m256 res_old_3 = ones256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 4);
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+        ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, ones256(),
+                                          load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 4);
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
+                                               const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* Four masks; candidates are confirmed via do_confWithBit_teddy. */
+    const m256 *maskBase = getMaskBase_avx2(teddy);
+    const u32 *confBase = getConfBase_avx2(teddy, 4);
+    /* res_old_* carry per-mask lookups from one 16-byte block to the next. */
+    m256 res_old_1 = ones256();
+    m256 res_old_2 = ones256();
+    m256 res_old_3 = ones256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 4);
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+    /* A single 16-byte block with an all-ones p_mask before the main loop. */
+    if (ptr + 16 < buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+        ptr += 16;
+    }
+    /* Main loop: two 16-byte blocks per iteration, p_mask all ones. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, ones256(),
+                                          load2x128(ptr));
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+        m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, ones256(),
+                                          load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+    /* Tail: remaining blocks are loaded via vectoredLoad2x128 with p_mask. */
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 4);
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
+                                            const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 64;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* One mask, 32-byte blocks; confirmed via do_confWithBit1_fast_teddy. */
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 1);
+    /* The two m128 mask tables are widened to m256 with set2x128. */
+    const m256 maskLo = set2x128(maskBase[0]);
+    const m256 maskHi = set2x128(maskBase[1]);
+    const m256 mask = set32x8(0xf);
+    u16 bitArr[512];
+    /* Head: an unaligned start is loaded through vectoredLoad256. */
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 32;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+        ptr += 32;
+    }
+    /* A single aligned 32-byte block with an all-ones p_mask. */
+    if (ptr + 32 < buf_end) {
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+        ptr += 32;
+    }
+    /* Main loop: two aligned 32-byte blocks per iteration. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
+        /* Confirm offset 4 is in 64-bit units: the block at ptr + 32. */
+        m256 val_1 = load256(ptr + 32);
+        m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
+    }
+    /* Tail: remaining bytes are loaded through vectoredLoad256. */
+    for (; ptr < buf_end; ptr += 32) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+    }
+    *a->groups = controlVal; /* write the local control value back */
+    return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
+                                                const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset; // scan covers [ptr, buf_end)
+    hwlmcb_rv_t controlVal = *a->groups; // working copy, written back on exit
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 64; // main-loop stride: two 32-byte blocks
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+    /* Scan as in ..._msks1_fast, but confirm via do_confWithBit_fast_teddy. */
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 1);
+    /* Widen the first two 128-bit mask entries to 256-bit vectors. */
+    const m256 maskLo = set2x128(maskBase[0]);
+    const m256 maskHi = set2x128(maskBase[1]);
+    const m256 mask = set32x8(0xf);
+    u16 bitArr[512];
+    /* Head: an unaligned start is loaded through p_mask from mainStart - 32. */
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 32;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+        ptr += 32;
+    }
+    /* One 32-byte block on its own, taken only if more than 32 bytes remain. */
+    if (ptr + 32 < buf_end) {
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+        ptr += 32;
+    }
+    /* Main loop: two 32-byte blocks per pass, after a flood check. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
+
+        m256 val_1 = load256(ptr + 32);
+        m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
+    }
+    /* Tail: the remainder, 32 bytes at a time through p_mask loads. */
+    for (; ptr < buf_end; ptr += 32) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+    }
+    *a->groups = controlVal; // hand the updated control value back
+    return HWLM_SUCCESS;
+}
+
+#endif // __AVX2__
diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp
index ead448a8..d95f4937 100644
--- a/src/fdr/teddy_engine_description.cpp
+++ b/src/fdr/teddy_engine_description.cpp
@@ -64,7 +64,32 @@ bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const
     return false;
 }
 
-#include "teddy_autogen_compiler.cpp"
+void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
+    static const TeddyEngineDef defns[] = { // columns: see TeddyEngineDef
+        { 1, 0 | HS_CPU_FEATURES_AVX2, 1, 8, false, 0, 1 }, // rows 1-10: AVX2
+        { 2, 0 | HS_CPU_FEATURES_AVX2, 1, 8, true, 0, 32 },
+        { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false, 0, 1 },
+        { 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true, 0, 32 },
+        { 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false, 0, 1 },
+        { 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true, 0, 32 },
+        { 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false, 0, 1 },
+        { 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true, 0, 32 },
+        { 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false, 0, 1 },
+        { 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true, 0, 32 },
+        { 11, 0, 1, 8, false, 0, 1 }, // rows 11-18: second column is plain 0
+        { 12, 0, 1, 8, true, 0, 32 },
+        { 13, 0, 2, 8, false, 0, 1 },
+        { 14, 0, 2, 8, true, 0, 32 },
+        { 15, 0, 3, 8, false, 0, 1 },
+        { 16, 0, 3, 8, true, 0, 32 },
+        { 17, 0, 4, 8, false, 0, 1 },
+        { 18, 0, 4, 8, true, 0, 32 },
+    };
+    out->clear(); // previous contents of *out are discarded, not kept
+    for (const auto &def : defns) {
+        out->emplace_back(def); // one TeddyEngineDescription per table row
+    }
+}
 
 static
 size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {
diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h
new file mode 100644
index 00000000..c50b4d16
--- /dev/null
+++ b/src/fdr/teddy_runtime_common.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: common runtime procedures.
+ */
+
+#ifndef TEDDY_RUNTIME_COMMON_H_
+#define TEDDY_RUNTIME_COMMON_H_
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/simd_utils.h"
+
+extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; // row n: n valid bytes
+
+#ifdef ARCH_64_BIT /* confirm words are u64a here, u32 otherwise */
+#define TEDDY_CONF_TYPE u64a
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
+#else /* !ARCH_64_BIT */
+#define TEDDY_CONF_TYPE u32
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
+#endif /* ARCH_64_BIT */
+
+#define CHECK_HWLM_TERMINATE_MATCHING                                       \
+do { /* needs `a` and `controlVal` in scope; may return from caller */      \
+    if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) {                  \
+        *a->groups = controlVal; /* write back before bailing out */        \
+        return HWLM_TERMINATED;                                             \
+    }                                                                       \
+} while (0); /* NOTE(review): stray ';' here; call sites add their own */
+
+#define CHECK_FLOOD                                                         \
+do { /* reads/writes the caller's scan locals (ptr, controlVal, ...) */     \
+    if (unlikely(ptr > tryFloodDetect)) { /* past the next check point */   \
+        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
+                                     &floodBackoff, &controlVal,            \
+                                     iterBytes);                            \
+        CHECK_HWLM_TERMINATE_MATCHING;                                      \
+    }                                                                       \
+} while (0); /* NOTE(review): stray ';' here; call sites add their own */
+
+/**
+ * \brief Copy a block of [0,15] bytes from \a src to \a dst efficiently.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad128. \a len must be below 16 (asserted).
+ */
+static really_inline
+void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
+    switch (len) {
+    case 0: // nothing to copy
+        break;
+    case 1:
+        *dst = *src;
+        break;
+    case 2:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        break;
+    case 3: // one 2-byte chunk plus the third byte
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        dst[2] = src[2];
+        break;
+    case 4:
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 5:
+    case 6:
+    case 7:
+        /* Perform copy with two overlapping 4-byte chunks. */
+        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 8:
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    default: // valid callers reach this with len in 9..15
+        /* Perform copy with two overlapping 8-byte chunks. */
+        assert(len < 16);
+        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    }
+}
+
+// Load 16 bytes at ptr, reading only [lo, hi); *p_mask gets the poison mask.
+static really_inline
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union {
+        u8 val8[16];
+        m128 val128;
+    } u;
+    u.val128 = zeroes128(); // unwritten bytes keep this initial value
+    // [copy_start, copy_start + copy_len) of u.val8 is filled from ptr later.
+    uintptr_t copy_start;
+    uintptr_t copy_len;
+
+    if (ptr >= lo) { // block starts at or after lo: history is not consulted
+        uintptr_t avail = (uintptr_t)(hi - ptr);
+        if (avail >= 16) { // all 16 bytes lie below hi: load them directly
+            *p_mask = load128(p_mask_arr[16] + 16);
+            return loadu128(ptr);
+        }
+        *p_mask = load128(p_mask_arr[avail] + 16);
+        copy_start = 0;
+        copy_len = avail;
+    } else { // block starts before lo: take up to nMasks - 1 history bytes
+        uintptr_t need = MIN((uintptr_t)(lo - ptr),
+                             MIN(len_history, nMasks - 1));
+        uintptr_t start = (uintptr_t)(lo - ptr); // index of lo in u.val8
+        uintptr_t i;
+        for (i = start - need; ptr + i < lo; i++) { // last `need` history bytes
+            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        }
+        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); // clip the block at hi
+        *p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
+        copy_start = i; // i == start once the loop has finished
+        copy_len = end - i;
+    }
+
+    // Runt block from the buffer.
+    copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+    return u.val128;
+}
+
+static really_inline // builds the u64a passed to the confWithBit* routines
+u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
+                CautionReason reason) {
+    u64a confVal = 0;
+    const u8 *buf = a->buf;
+    size_t len = a->len;
+    const u8 *confirm_loc = ptr + byte - 7; // so confirm_loc + 7 == ptr + byte
+    if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
+        confVal = lv_u64a(confirm_loc, buf, buf + len);
+    } else { // reason != NOT_CAUTIOUS and confirm_loc < buf
+        u64a histBytes = a->histBytes;
+        confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
+        // stitch together confVal and history
+        u32 overhang = buf - confirm_loc; // distance from confirm_loc to buf
+        histBytes >>= 64 - (overhang * 8); // top `overhang` bytes -> low end
+        confVal |= histBytes;
+    }
+    return confVal;
+}
+
+static really_inline // runs the confirm step for each bit left in *conf
+void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                          const u32 *confBase, CautionReason reason,
+                          const struct FDR_Runtime_Args *a, const u8 *ptr,
+                          hwlmcb_rv_t *control, u32 *last_match) {
+    do  { // body runs before the test: presumably *conf != 0 on entry
+        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); // find + clear one bit
+        u32 byte = bit / bucket + offset; // position relative to ptr
+        u32 bitRem  = bit % bucket; // bucket number at that position
+        u32 confSplit = *(ptr+byte) & 0x1f; // low 5 bits of the input byte
+        u32 idx = confSplit * bucket + bitRem;
+        u32 cf = confBase[idx]; // byte offset from confBase; 0 is skipped
+        if (!cf) {
+            continue;
+        }
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) { // shares no group with *control
+            continue;
+        }
+        u64a confVal = getConfVal(a, ptr, byte, reason);
+        confWithBit(fdrc, a, ptr - a->buf + byte, 0, control,
+                    last_match, confVal);
+    } while (unlikely(*conf));
+}
+
+static really_inline // as do_confWithBit_teddy, but calls confWithBit1
+void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                           const u32 *confBase, CautionReason reason,
+                           const struct FDR_Runtime_Args *a, const u8 *ptr,
+                           hwlmcb_rv_t *control, u32 *last_match) {
+    do { // body runs before the test: presumably *conf != 0 on entry
+        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); // find + clear one bit
+        u32 byte = bit / bucket + offset; // position relative to ptr
+        u32 idx  = bit % bucket; // bucket number indexes confBase directly
+        u32 cf = confBase[idx]; // NOTE(review): no cf == 0 guard - confirm
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) { // shares no group with *control
+            continue;
+        }
+        u64a confVal = getConfVal(a, ptr, byte, reason);
+        confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
+                     confVal);
+    } while (unlikely(*conf));
+}
+
+static really_inline // as do_confWithBit1_teddy, but calls confWithBitMany
+void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                              const u32 *confBase, CautionReason reason,
+                              const struct FDR_Runtime_Args *a, const u8 *ptr,
+                              hwlmcb_rv_t *control, u32 *last_match) {
+    do { // body runs before the test: presumably *conf != 0 on entry
+        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); // find + clear one bit
+        u32 byte = bit / bucket + offset; // position relative to ptr
+        u32 idx = bit % bucket; // bucket number indexes confBase directly
+        u32 cf = confBase[idx]; // NOTE(review): no cf == 0 guard - confirm
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) { // shares no group with *control
+            continue;
+        }
+        u64a confVal = getConfVal(a, ptr, byte, reason);
+        confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
+                        last_match, confVal);
+    } while (unlikely(*conf));
+}
+
+static really_inline
+const m128 * getMaskBase(const struct Teddy *teddy) { // masks follow header
+    return (const m128 *)((const u8 *)teddy + sizeof(struct Teddy));
+}
+
+static really_inline
+const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) {
+    return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) +
+                         (numMask*32)); // header, then 32 bytes per mask
+}
+
+#endif /* TEDDY_RUNTIME_COMMON_H_ */