diff --git a/src/hwlm/noodle_engine.cpp b/src/hwlm/noodle_engine.cpp
index 16280b59..58e0604d 100644
--- a/src/hwlm/noodle_engine.cpp
+++ b/src/hwlm/noodle_engine.cpp
@@ -100,9 +100,9 @@ match:
 static really_really_inline
 hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-                          Z_TYPE *z, size_t len, const struct cb_info *cbi) {
-    while (unlikely(*z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z);
+                          Z_TYPE z, size_t len, const struct cb_info *cbi) {
+    while (unlikely(z)) {
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
         size_t matchPos = d - buf + pos;
         DEBUG_PRINTF("match pos %zu\n", matchPos);
         hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos);
@@ -113,9 +113,9 @@ hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
 static really_really_inline
 hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-                          Z_TYPE *z, size_t len, const struct cb_info *cbi) {
-    while (unlikely(*z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z);
+                          Z_TYPE z, size_t len, const struct cb_info *cbi) {
+    while (unlikely(z)) {
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
         size_t matchPos = d - buf + pos - 1; \
         DEBUG_PRINTF("match pos %zu\n", matchPos);
         hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos);
@@ -127,126 +127,99 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
 template <uint16_t S>
 static really_inline
 hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
-                            size_t len, size_t start,
-                            SuperVector<S> caseMask, SuperVector<S> mask1,
+                            size_t len, size_t offset,
+                            SuperVector<S> caseMask, SuperVector<S> mask1,
                             const struct cb_info *cbi) {
-
-    size_t offset = start + n->msk_len - 1;
+    size_t start = offset + n->msk_len - 1;
     size_t end = len;
-    assert(offset < end);
-    hwlm_error_t rv;
+    const u8 *d = buf + start;
+    const u8 *e = buf + end;
+    DEBUG_PRINTF("start %p end %p \n", d, e);
+    assert(d < e);
+    if (d + S <= e) {
+        // peel off first part to cacheline boundary
+        const u8 *d1 = ROUNDUP_PTR(d, S);
+        DEBUG_PRINTF("until aligned %p \n", d1);
+        if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) {
+            return HWLM_TERMINATED;
+        }
+        d = d1;
 
-    if (end - offset <= S) {
-        return scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, offset, end);
-        //return scanSingleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], cbi, offset, end);
+        size_t loops = (end - (d - buf)) / S;
+        DEBUG_PRINTF("loops %ld \n", loops);
+
+        for (size_t i = 0; i < loops; i++, d+= S) {
+            DEBUG_PRINTF("d %p \n", d);
+            const u8 *base = ROUNDUP_PTR(d, 64);
+            // On large packet buffers, this prefetch appears to get us about 2%.
+            __builtin_prefetch(base + 256);
+
+            SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
+            typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
+
+            hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
+            RETURN_IF_TERMINATED(rv);
+        }
     }
-    uintptr_t data = (uintptr_t)buf;
-    uintptr_t s2Start = ROUNDUP_N(data + offset, S) - data;
+    DEBUG_PRINTF("d %p e %p \n", d, e);
+    // finish off tail
 
-    if (offset != s2Start) {
-        // first scan out to the fast scan starting point
-        DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
-        rv = scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, offset, s2Start);
-        //rv = scanSingleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], cbi, offset, s2Start);
-        RETURN_IF_TERMINATED(rv);
-    }
-    uintptr_t last = data + end;
-    uintptr_t s2End = ROUNDDOWN_N(last, S) - data;
-    size_t loops = s2End / S;
-
-    if (likely(loops)) {
-    //if (likely(s2Start != s2End)) {
-        // scan as far as we can, bounded by the last point this key can
-        // possibly match
-        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End);
-        rv = scanSingleFast2(n, buf, len, caseMask, mask1, cbi, s2Start, loops);
-        //rv = scanSingleFast(n, buf, len, caseMask.u.v512[0], mask1.u.v512[0], cbi, s2Start, s2End);
-        RETURN_IF_TERMINATED(rv);
-    }
-
-    if (s2End == len) {
-        return HWLM_SUCCESS;
-    }
-    // if we are done bail out
-    //if (s2End != len) {
-        DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len);
-        rv = scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, s2End, len);
-        //rv = scanSingleUnaligned(n, buf, len, s2End, caseMask.u.v512[0], mask1.u.v512[0], cbi, s2End, len);
-        return rv;
-    //}
-
-    //return HWLM_SUCCESS;
+    return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, d - buf, end);
 }
 
 template <uint16_t S>
 static really_inline
 hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
-                            size_t len, size_t start,
-                            SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
+                            size_t len, size_t offset,
+                            SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                             const struct cb_info *cbi) {
     // we stop scanning for the key-fragment when the rest of the key can't
     // possibly fit in the remaining buffer
     size_t end = len - n->key_offset + 2;
-    // the first place the key can match
-    size_t offset = start + n->msk_len - n->key_offset;
+    size_t start = offset + n->msk_len - n->key_offset;
 
-    hwlm_error_t rv;
+    typename SuperVector<S>::movemask_type lastz1{0};
 
-    if (end - offset <= S) {
-        rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, offset, offset, end);
-        //rv = scanDoubleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, offset, end);
-        return rv;
+    const u8 *d = buf + start;
+    const u8 *e = buf + end;
+    DEBUG_PRINTF("start %p end %p \n", d, e);
+    assert(d < e);
+    if (d + S <= e) {
+        // peel off first part to cacheline boundary
+        const u8 *d1 = ROUNDUP_PTR(d, S);
+        DEBUG_PRINTF("until aligned %p \n", d1);
+        if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) {
+            return HWLM_TERMINATED;
+        }
+        d = d1;
+
+        size_t loops = (end - (d - buf)) / S;
+        DEBUG_PRINTF("loops %ld \n", loops);
+
+        for (size_t i = 0; i < loops; i++, d+= S) {
+            DEBUG_PRINTF("d %p \n", d);
+            const u8 *base = ROUNDUP_PTR(d, 64);
+            // On large packet buffers, this prefetch appears to get us about 2%.
+            __builtin_prefetch(base + 256);
+
+            SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
+            typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
+            typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
+            typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
+            lastz1 = z1 >> Z_SHIFT;
+
+            hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
+            RETURN_IF_TERMINATED(rv);
+        }
     }
-    uintptr_t data = (uintptr_t)buf;
-    uintptr_t s2Start = ROUNDUP_N(data + offset, S) - data;
-    uintptr_t s1End = s2Start + 1;
-    uintptr_t off = offset;
+    DEBUG_PRINTF("d %p e %p \n", d, e);
+    // finish off tail
 
-    if (s2Start != off) {
-        // first scan out to the fast scan starting point plus one char past to
-        // catch the key on the overlap
-        DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start);
-        rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, offset, off, s1End);
-        //rv = scanDoubleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, off, s1End);
-        RETURN_IF_TERMINATED(rv);
-    }
-    off = s1End;
-    uintptr_t last = data + end;
-    uintptr_t s2End = ROUNDDOWN_N(last, S) - data;
-    uintptr_t s3Start = end - S;
-
-    if (s2Start >= end) {
-        DEBUG_PRINTF("s2 == mL %zu\n", end);
-        return HWLM_SUCCESS;
-    }
-
-    //size_t loops = (s2End -s2Start)/ S;
-
-    if (likely(s2Start != s2End)) {
-    //if (likely(loops)) {
-        // scan as far as we can, bounded by the last point this key can
-        // possibly match
-        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start);
-        rv = scanDoubleFast2(n, buf, len, caseMask, mask1, mask2, cbi, s2Start, s2End);
-        //rv = scanDoubleFast(n, buf, len, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, s2Start, s2End);
-        RETURN_IF_TERMINATED(rv);
-        off = s2End;
-    }
-
-    // if there isn't enough data left to match the key, bail out
-    if (s2End == end) {
-        return HWLM_SUCCESS;
-    }
-
-    DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end);
-    rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, s3Start, off, end);
-    //rv = scanDoubleUnaligned(n, buf, len, s3Start, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, off, end);
-
-    return rv;
+    return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, d - buf, end);
 }
 
 // Single-character specialisation, used when keyLen = 1
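
[Reviewer note, not part of the patch] The hunks above collapse the old three-stage scan (unaligned head, aligned "fast" loop, tail) into a single peel/loop/tail shape, and `single_zscan()`/`double_zscan()` now take the match mask `z` by value rather than through a pointer, so the bit-clearing loop works on a register-local copy. A minimal scalar sketch of that by-value contract follows; the plain `uint64_t` mask, `eqmask()`, and the ctz-based `findAndClearLSB()` are stand-ins I am assuming for `SuperVector<S>::movemask_type` and the library primitives, not the real API:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for mask1.eqmask(v): bit i is set iff block[i] == c.
static uint64_t eqmask(const uint8_t *block, size_t n, uint8_t c) {
    uint64_t z = 0;
    for (size_t i = 0; i < n; i++) {
        if (block[i] == c) z |= 1ULL << i;
    }
    return z;
}

// Stand-in for JOIN(findAndClearLSB_, Z_BITS)(&z): return the index of the
// lowest set bit and clear it.
static unsigned findAndClearLSB(uint64_t *z) {
    unsigned pos = (unsigned)__builtin_ctzll(*z);
    *z &= *z - 1;
    return pos;
}

// Mirrors the new single_zscan() shape: z arrives by value, so clearing bits
// while iterating matches never touches the caller's copy.
static void single_zscan(const uint8_t *d, const uint8_t *buf, uint64_t z) {
    while (z) {
        unsigned pos = findAndClearLSB(&z);
        printf("match pos %zu\n", (size_t)(d - buf) + pos);
    }
}

int main(void) {
    const uint8_t buf[] = "foo bar foo";
    // Report every 'f' in the (single) 11-byte block: positions 0 and 8.
    single_zscan(buf, buf, eqmask(buf, sizeof(buf) - 1, 'f'));
    return 0;
}
```
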
diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp
index 98289d59..9c4f9b4b 100644
--- a/src/hwlm/noodle_engine_simd.hpp
+++ b/src/hwlm/noodle_engine_simd.hpp
@@ -37,19 +37,19 @@
 using Z_TYPE = u64a;
 #define Z_BITS 64
 #define Z_SHIFT 63
-#define DOUBLE_LOAD_MASK(l, off) ((~0ULL) >> (Z_BITS -l))
+#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -l))
 #define SINGLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #elif defined(HAVE_SIMD_256_BITS)
 using Z_TYPE = u32;
 #define Z_BITS 32
 #define Z_SHIFT 31
-#define DOUBLE_LOAD_MASK(l, off) ((((1ULL) << l) - 1ULL) << off)
+#define DOUBLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #define SINGLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #elif defined(HAVE_SIMD_128_BITS)
 using Z_TYPE = u32;
 #define Z_BITS 32
 #define Z_SHIFT 0
-#define DOUBLE_LOAD_MASK(l, off) ((((1ULL) << l) - 1ULL) << off)
+#define DOUBLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #define SINGLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #endif
@@ -77,13 +77,14 @@ static really_inline SuperVector<S> getCaseMask(void) {
 // function can't handle (due to small/unaligned chunk at end)
 template <uint16_t S>
 static really_inline
-hwlm_error_t scanSingleUnaligned2(const struct noodTable *n, const u8 *buf,
+hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
                                  SuperVector<S> caseMask, SuperVector<S> mask1,
                                  const struct cb_info *cbi, size_t len, size_t start,
                                  size_t end) {
     const u8 *d = buf + start;
     DEBUG_PRINTF("start %zu end %zu\n", start, end);
     const size_t l = end - start;
+    DEBUG_PRINTF("l = %ld\n", l);
     //assert(l <= 64);
     if (!l) {
         return HWLM_SUCCESS;
     }
@@ -93,100 +94,28 @@ hwlm_error_t scanSingleUnaligned2(const struct noodTable *n, const u8 *buf,
 
     SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
     typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v);
 
-    return single_zscan(n, d, buf, &z, len, cbi);
+    return single_zscan(n, d, buf, z, len, cbi);
 }
 
 template <uint16_t S>
 static really_inline
-hwlm_error_t scanSingleFast2(const struct noodTable *n, const u8 *buf,
-                             size_t len, SuperVector<S> caseMask, SuperVector<S> mask1,
-                             const struct cb_info *cbi, size_t start,
-                             size_t loops) {
+hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
+                                 SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2, typename SuperVector<S>::movemask_type *lastz1,
+                                 const struct cb_info *cbi, size_t len, size_t start, size_t end) {
     const u8 *d = buf + start;
-
-    for (size_t i = 0; i < loops; i++, d+= S) {
-        const u8 *base = ROUNDUP_PTR(d, 64);
-        // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(base + 4*S);
-
-        SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
-        typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
-
-        hwlm_error_t result = single_zscan(n, d, buf, &z, len, cbi);
-        if (unlikely(result != HWLM_SUCCESS))
-            return result;
-    }
-    return HWLM_SUCCESS;
-}
-
-template <uint16_t S>
-static really_inline
-hwlm_error_t scanDoubleUnaligned2(const struct noodTable *n, const u8 *buf,
-                                  SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
-                                  const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) {
-    const u8 *d = buf + offset;
     DEBUG_PRINTF("start %zu end %zu", start, end);
     const size_t l = end - start;
     assert(l <= S);
     if (!l) {
         return HWLM_SUCCESS;
     }
-    u32 buf_off = start - offset;
-
     SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
-    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l, buf_off);
+    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l);
     typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
     typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-    typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2;
-#if defined(HAVE_AVX512) && defined(BUILD_AVX512)
-    DEBUG_PRINTF("buf_off = %d\n", buf_off);
-    DEBUG_PRINTF("l = %ld, mask = 0x%016llx\n", l, mask);
-    DEBUG_PRINTF("\nz1 = 0x%016llx\n", z1);
-    DEBUG_PRINTF("z2 = 0x%016llx\n", z2);
-    DEBUG_PRINTF("z = 0x%016llx\n", z);
-    __mmask64 k = (~0ULL) >> (64 - l);
-    DEBUG_PRINTF("k = 0x%016llx\n", k);
+    typename SuperVector<S>::movemask_type z = mask & (*lastz1 | z1 << 1) & z2;
+    *lastz1 = z1 >> (l -1);
 
-    m512 v1 = loadu_maskz_m512(k, d);
-    v1 = and512(v1, caseMask.u.v512[0]);
-
-    u64a z0_ = masked_eq512mask(k, mask1.u.v512[0], v1);
-    u64a z1_ = masked_eq512mask(k, mask2.u.v512[0], v1);
-    u64a z_ = (z0_ << 1) & z1_;
-    DEBUG_PRINTF("z0_ = 0x%016llx\n", z0_);
-    DEBUG_PRINTF("z1_ = 0x%016llx\n", z1_);
-    DEBUG_PRINTF("z_ = 0x%016llx\n", z_);
-    assert(z == z_);
-#endif
-
-    return double_zscan(n, d, buf, &z, len, cbi);
-}
-
-template <uint16_t S>
-static really_inline
-hwlm_error_t scanDoubleFast2(const struct noodTable *n, const u8 *buf,
-                             size_t len, SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
-                             const struct cb_info *cbi, size_t start, size_t end/*loops*/) {
-    const u8 *d = buf + start, *e = buf + end;
-    //DEBUG_PRINTF("start %zu loops %zu \n", start, loops);
-    typename SuperVector<S>::movemask_type lastz1{0};
-
-    //for (size_t i=0; i < loops; i++, d+= S) {
-    for (; d < e; d+= S) {
-        const u8 *base = ROUNDUP_PTR(d, 64);
-        // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(base + 4*S);
-
-        SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
-        typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
-        typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-        typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
-        lastz1 = z1 >> Z_SHIFT;
-
-        hwlm_error_t result = double_zscan(n, d, buf, &z, len, cbi);
-        if (unlikely(result != HWLM_SUCCESS))
-            return result;
-    }
-    return HWLM_SUCCESS;
+    return double_zscan(n, d, buf, z, len, cbi);
 }
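
[Reviewer note, not part of the patch] The key functional change on the double-byte path is that `lastz1` now carries the top bit of each block's `z1` into the next block, so a first byte at the end of one vector still pairs with a second byte at the start of the next; this is what lets the patch drop the old overlap re-scan stages and the `off` parameter of `DOUBLE_LOAD_MASK`. Below is a scalar sketch of that carry, using an 8-byte block where the real code uses `SuperVector<S>` (the block width, `eqmask()`, and the ctz-based bit loop are my stand-ins, not the library API):

```cpp
#include <cstdint>
#include <cstdio>

static const size_t BLOCK = 8; // pretend vector width in bytes

// Stand-in for maskN.eqmask(v): bit i is set iff block[i] == c.
static uint64_t eqmask(const uint8_t *block, size_t n, uint8_t c) {
    uint64_t z = 0;
    for (size_t i = 0; i < n; i++) {
        if (block[i] == c) z |= 1ULL << i;
    }
    return z;
}

int main(void) {
    // 'a' ends block 0 at offset 7; its 'b' starts block 1 at offset 8.
    const uint8_t buf[] = "xxabxxxabx";
    const size_t len = sizeof(buf) - 1;
    uint64_t lastz1 = 0; // carries z1's top bit across blocks, as in the patch

    for (size_t off = 0; off < len; off += BLOCK) {
        size_t n = (len - off < BLOCK) ? (len - off) : BLOCK;
        uint64_t z1 = eqmask(buf + off, n, 'a');
        uint64_t z2 = eqmask(buf + off, n, 'b');
        // Pair each 'a' with a 'b' one position later; lastz1 supplies the
        // pairing for an 'a' that ended the previous block.
        uint64_t z = ((z1 << 1) | lastz1) & z2;
        lastz1 = z1 >> (BLOCK - 1); // analogue of lastz1 = z1 >> Z_SHIFT
        while (z) {
            unsigned pos = (unsigned)__builtin_ctzll(z);
            z &= z - 1;
            // -1 as in double_zscan: report the start of the 2-byte match
            printf("match pos %zu\n", off + pos - 1);
        }
    }
    return 0;
}
```

Expected output is `match pos 2` followed by `match pos 7`: the second match straddles the block boundary and is found only because of the carried bit. Threading `lastz1` into `scanDoubleUnaligned()` by pointer, as the patch does, serves the same purpose across the peel/loop/tail pieces.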