diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py
index 36e4c16c..e5b4f39e 100755
--- a/src/fdr/autogen.py
+++ b/src/fdr/autogen.py
@@ -54,16 +54,11 @@ def produce_fdr_compiles(l):
 
 def build_fdr_matchers():
     all_matchers = [ ]
-    domains = [8, 10, 11, 12, 13]
-    big_domains = [ 14, 15 ]
+    strides = [ 1, 2, 4 ]
 
     common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
-    for d in domains:
-        all_matchers += [ M3(stride = 1, domain = d, **common) ]
-        all_matchers += [ M3(stride = 2, domain = d, **common) ]
-        all_matchers += [ M3(stride = 4, domain = d, **common) ]
-    for d in big_domains:
-        all_matchers += [ M3(stride = 1, domain = d, **common) ]
+    for s in strides:
+        all_matchers += [ M3(stride = s, **common) ]
 
     return all_matchers
 
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index 082800f1..f83a4265 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -40,27 +40,6 @@
 #include "fdr_confirm_runtime.h"
 #include "fdr_streaming_runtime.h"
 #include "fdr_loadval.h"
-
-static really_inline UNUSED
-u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
-    u32 r = 0;
-    if (a->start_offset == 0) {
-        if (numBits <= 8) {
-            r = a->buf_history[a->len_history - 1];
-        } else {
-            r = a->buf_history[a->len_history - 1];
-            r |= (a->buf[0] << 8);
-        }
-    } else {
-        if (numBits <= 8) {
-            r = a->buf[a->start_offset - 1];
-        } else {
-            r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
-        }
-    }
-    return r & ((1 << numBits) - 1);
-}
-
 #include "fdr_autogen.c"
 
 #define FAKE_HISTORY_SIZE 16
diff --git a/src/fdr/fdr_autogen.py b/src/fdr/fdr_autogen.py
index 685cca3b..748d811f 100755
--- a/src/fdr/fdr_autogen.py
+++ b/src/fdr/fdr_autogen.py
@@ -74,12 +74,12 @@ class ValueExtractStep(Step):
         dsb = m.datasize_bytes
         modval = offset % dsb
 
-        if m.domain > 8 and modval == dsb - 1:
+        if modval == dsb - 1:
             # Case 1: reading more than one byte over the end of the bulk load
 
             self.latency = 4
             if sub_load_cautious:
-                code_string = "cautious_forward" 
+                code_string = "cautious_forward"
             else:
                 code_string = "normal"
             load_string = m.single_load_type.load_expr_data(self.offset, code_string)
@@ -101,7 +101,7 @@ class ValueExtractStep(Step):
                     temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
 
 
-        init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
+        init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
         v_var = self.nv(m.value_extract_type, "v%d" % offset)
         self.val = v_var.gen_initializer_stmt(init_string)
 
@@ -173,14 +173,10 @@ class ConfirmStep(Step):
                                           enable_confirmless = m.stride == 1, do_bailout = False)
 
 class M3(MatcherBase):
-    def get_hash_safety_parameters(self):
-        h_size = self.single_load_type.size_in_bytes()
-        return (0, h_size - 1)
-
     def produce_compile_call(self):
-        print "    { %d, %d, %d, %d, %d, %s, %d, %d }," % (
+        print "    { %d, %d, %d, %d, %s, %d, %d }," % (
               self.id, self.state_width, self.num_buckets,
-              self.stride, self.domain,
+              self.stride,
               self.arch.target, self.conf_pull_back, self.conf_top_level_split)
 
     def produce_main_loop(self, switch_variant = False):
@@ -192,8 +188,8 @@ class M3(MatcherBase):
         ctxt = CodeGenContext(self)
 
         if switch_variant:
-            print " ptr -= (iterBytes - dist);"
-            print " { " # need an extra scope around switch variant to stop its globals escaping
+            print "    ptr -= (iterBytes - dist);"
+            print "    { " # need an extra scope around switch variant to stop its globals escaping
         else:
             print "    if (doMainLoop) {"
             print "    for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
@@ -349,25 +345,30 @@ class M3(MatcherBase):
         shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
 
         s = Template("""
-            $TYPENAME s;
-            if (a->len_history) {
-                u32 tmp = getPreStartVal(a, $DOMAIN);
-                s = *((const $TYPENAME *)ft + tmp);
-                $SHIFT_EXPR;
-            } else {
-                s = *(const $TYPENAME *)&fdr->start;
-            }
+    $TYPENAME s;
+    if (a->len_history) {
+        u32 tmp = 0;
+        if (a->start_offset == 0) {
+            tmp = a->buf_history[a->len_history - 1];
+            tmp |= (a->buf[0] << 8);
+        } else {
+            tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
+        }
+        tmp &= fdr->domainMask;
+        s = *((const $TYPENAME *)ft + tmp);
+        $SHIFT_EXPR;
+    } else {
+        s = *(const $TYPENAME *)&fdr->start;
+    }
 """).substitute(TYPENAME = s_type.get_name(),
                 ZERO_EXPR = s_type.zero_expression(),
-                DOMAIN = self.domain,
                 SHIFT_EXPR = shift_expr)
         return s
 
     def produce_code(self):
 
-        (behind, ahead) = self.get_hash_safety_parameters()
-        loop_read_behind = behind
-        loop_read_ahead = self.loop_bytes + ahead
+        loop_read_behind = 0
+        loop_read_ahead = self.loop_bytes + 1
 
         # we set up mask and shift stuff for extracting our masks from registers
         #
@@ -380,7 +381,7 @@ class M3(MatcherBase):
         ssb = self.state_type.size / 8 # state size in bytes
 
         # Intel path
-        if ssb == 16 and self.domain == 16:
+        if ssb == 16:
             # obscure corner - we don't have the room in the register to
             # do this for all values so we don't. domain==16 is pretty
             # bad anyhow, of course
@@ -390,7 +391,6 @@ class M3(MatcherBase):
 
         shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
         self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
-        self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
 
         print self.produce_header(visible = False)
 
@@ -398,21 +398,19 @@ class M3(MatcherBase):
         print " Arch: " + self.arch.name,
         print " State type: " + self.state_type.get_name(),
         print " Num buckets: %d" % self.num_buckets,
-        print " Domain: %d" % self.domain,
         print " Stride: %d" % self.stride
 
         print self.produce_common_declarations()
-        print
 
-        print "\tconst size_t tabSize = %d;" % self.table_size
-        print """
-    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
-    const u32 * confBase = (const u32 *)(ft + tabSize);
-"""
+        print "    assert(fdr->domain > 8 && fdr->domain < 16);"
+        print
+        print "    u64a domain_mask = fdr->domainMask;"
+        print "    const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
+        print "    const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
         print self.produce_init_state()
-        print "\tconst size_t iterBytes = %d;" % self.loop_bytes
-        print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
-        print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
+        print "    const size_t iterBytes = %d;" % self.loop_bytes
+        print "    const size_t START_MOD = %d;" % self.datasize_bytes
+        print "    const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
 
         print """
     while (ptr < buf + len) {
@@ -451,9 +449,9 @@ class M3(MatcherBase):
         print self.produce_footer()
 
     def get_name(self):
-        return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
+        return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
 
-    def __init__(self, state_width, domain, stride,
+    def __init__(self, state_width, stride,
                  arch,
                  table_state_width = None,
                  num_buckets = 8,
@@ -474,17 +472,9 @@ class M3(MatcherBase):
         self.table_state_width = state_width
         self.table_state_type = getRequiredType(self.table_state_width)
 
-        # domain is the number of bits that we draw from our input to
-        # index our 'reach' table
-        if not 8 <= domain <= 16:
-            fail_out("Unsupported domain: %d" % domain)
-        self.domain = domain
-        # this is the load type required for this domain if we want to
+        # this is the load type required for domain [9:15] if we want to
         # load it one at a time
-        self.single_load_type = getRequiredType(self.domain)
-
-        # table size
-        self.table_size = 2**domain * table_state_width // 8
+        self.single_load_type = IntegerType(16)
 
         # stride is the frequency with which we make data-driven
         # accesses to our reach table
diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp
index 8be44370..ccf17626 100644
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@@ -184,6 +184,13 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
     ptr += floodControlTmp.second;
     aligned_free(floodControlTmp.first);
 
+    /*  we are allowing domains 9 to 15 only */
+    assert(eng.bits > 8 && eng.bits < 16);
+    fdr->domain = eng.bits;
+    fdr->schemeWidthByte = eng.schemeWidth / 8;
+    fdr->domainMask = (1 << eng.bits) - 1;
+    fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
+
     if (link.first) {
         fdr->link = verify_u32(ptr - fdr_base);
         memcpy(ptr, link.first, link.second);
@@ -534,6 +541,11 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
         return nullptr;
     }
 
+    // temporary hack for unit testing: force domain 9 when an engine is hinted
+    if (hint != HINT_INVALID) {
+        des->bits = 9;
+    }
+
     FDRCompiler fc(lits, *des, make_small);
     return fc.build(link);
 }
diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp
index ae246270..158170c2 100644
--- a/src/fdr/fdr_dump.cpp
+++ b/src/fdr/fdr_dump.cpp
@@ -81,6 +81,9 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
         unique_ptr<FDREngineDescription> des =
             getFdrDescription(fdr->engineID);
         if (des) {
+            /* The domain is chosen per table at compile time and stored in
+             * the bytecode; the engine description no longer provides it. */
+            fprintf(f, "    domain     %u\n", fdr->domain);
             fprintf(f, "    stride     %u\n", des->stride);
             fprintf(f, "    buckets    %u\n", des->getNumBuckets());
             fprintf(f, "    width      %u\n", des->schemeWidth);
diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp
index 2a6fda79..5d470c7e 100644
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
     : EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
                         def.numBuckets, def.confirmPullBackDistance,
                         def.confirmTopLevelSplit),
-      schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
+      schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
 
 u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
     // rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@@ -105,76 +105,83 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
     DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
                  desiredStride);
 
-    const FDREngineDescription *best = nullptr;
+    FDREngineDescription *best = nullptr;
     u32 best_score = 0;
 
-    for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
-        const FDREngineDescription &eng = allDescs[engineID];
-        if (!eng.isValidOnTarget(target)) {
-            continue;
-        }
-        if (msl < eng.stride) {
-            continue;
-        }
-
-        u32 score = 100;
-
-        score -= absdiff(desiredStride, eng.stride);
-
-        if (eng.stride <= desiredStride) {
-            score += eng.stride;
-        }
-
-        u32 effLits = vl.size(); /* * desiredStride;*/
-        u32 ideal;
-        if (effLits < eng.getNumBuckets()) {
-            if (eng.stride == 1) {
-                ideal = 8;
-            } else {
-                ideal = 10;
+    for (u32 domain = 9; domain <= 15; domain++) {
+        for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
+            FDREngineDescription &eng = allDescs[engineID];
+            // domains 14 and 15 were only ever generated with stride 1
+            if (domain > 13 && eng.stride != 1) {
+                continue;
+            }
+            if (!eng.isValidOnTarget(target)) {
+                continue;
+            }
+            if (msl < eng.stride) {
+                continue;
             }
-        } else if (effLits < 20) {
-            ideal = 10;
-        } else if (effLits < 100) {
-            ideal = 11;
-        } else if (effLits < 1000) {
-            ideal = 12;
-        } else if (effLits < 10000) {
-            ideal = 13;
-        } else {
-            ideal = 15;
-        }
 
-        if (ideal != 8 && eng.schemeWidth == 32) {
-            ideal += 1;
-        }
+            u32 score = 100;
 
-        if (make_small) {
-            ideal -= 2;
-        }
+            score -= absdiff(desiredStride, eng.stride);
 
-        if (eng.stride > 1) {
-            ideal++;
-        }
+            if (eng.stride <= desiredStride) {
+                score += eng.stride;
+            }
 
-        DEBUG_PRINTF("effLits %u\n", effLits);
+            u32 effLits = vl.size(); /* * desiredStride;*/
+            u32 ideal;
+            if (effLits < eng.getNumBuckets()) {
+                if (eng.stride == 1) {
+                    ideal = 8;
+                } else {
+                    ideal = 10;
+                }
+            } else if (effLits < 20) {
+                ideal = 10;
+            } else if (effLits < 100) {
+                ideal = 11;
+            } else if (effLits < 1000) {
+                ideal = 12;
+            } else if (effLits < 10000) {
+                ideal = 13;
+            } else {
+                ideal = 15;
+            }
 
-        if (target.is_atom_class() && !make_small && effLits < 4000) {
-            /* Unless it is a very heavy case, we want to build smaller tables
-             * on lightweight machines due to their small caches. */
-            ideal -= 2;
-        }
+            if (ideal != 8 && eng.schemeWidth == 32) {
+                ideal += 1;
+            }
 
-        score -= absdiff(ideal, eng.bits);
+            if (make_small) {
+                ideal -= 2;
+            }
 
-        DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
-                     "-> score=%u\n",
-                     eng.getID(), eng.schemeWidth, eng.bits,
-                     eng.getNumBuckets(), eng.stride, score);
+            if (eng.stride > 1) {
+                ideal++;
+            }
 
-        if (!best || score > best_score) {
-            best = &eng;
-            best_score = score;
+            DEBUG_PRINTF("effLits %u\n", effLits);
+
+            if (target.is_atom_class() && !make_small && effLits < 4000) {
+                /* Unless it is a very heavy case, we want to build smaller tables
+                 * on lightweight machines due to their small caches. */
+                ideal -= 2;
+            }
+
+            score -= absdiff(ideal, domain);
+
+            DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
+                         "-> score=%u\n",
+                         eng.getID(), eng.schemeWidth, domain /* candidate */,
+                         eng.getNumBuckets(), eng.stride, score);
+
+            if (!best || score > best_score) {
+                eng.bits = domain;
+                best = &eng;
+                best_score = score;
+            }
         }
     }
 
diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h
index d936095b..45f64ac0 100644
--- a/src/fdr/fdr_engine_description.h
+++ b/src/fdr/fdr_engine_description.h
@@ -43,7 +43,6 @@ struct FDREngineDef {
     u32 schemeWidth;
     u32 numBuckets;
     u32 stride;
-    u32 bits;
     u64a cpu_features;
     u32 confirmPullBackDistance;
     u32 confirmTopLevelSplit;
diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h
index 6c722777..607e039c 100644
--- a/src/fdr/fdr_internal.h
+++ b/src/fdr/fdr_internal.h
@@ -76,9 +76,11 @@ struct FDR {
      * structures (spillover strings and hash table) if we're a secondary
      * structure. */
     u32 link;
+    u8 domain; /* dynamic domain info */
+    u8 schemeWidthByte;  /* scheme width in bytes */
+    u16 domainMask; /* pre-computed domain mask */
+    u32 tabSize; /* pre-computed hashtable size in bytes */
     u32 pad1;
-    u32 pad2;
-    u32 pad3;
 
     union {
         u32 s_u32;