simplify scanSingleMain() and scanDoubleMain()

2026-01-17 16:00:26 +03:00 · 2021-05-13 17:53:12 +03:00
parent f77837130d
commit c6406bebde
2 changed files with 90 additions and 188 deletions
--- a/src/hwlm/noodle_engine.cpp
+++ b/src/hwlm/noodle_engine.cpp
@@ -100,9 +100,9 @@ match:
 static really_really_inline
 hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-		Z_TYPE *z, size_t len, const struct cb_info *cbi) {
+		Z_TYPE z, size_t len, const struct cb_info *cbi) {
-    while (unlikely(*z)) {
+    while (unlikely(z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z);
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
        size_t matchPos = d - buf + pos;
        DEBUG_PRINTF("match pos %zu\n", matchPos);
        hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos);
@@ -113,9 +113,9 @@ hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
 static really_really_inline
 hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-		Z_TYPE *z, size_t len, const struct cb_info *cbi) {
+		Z_TYPE z, size_t len, const struct cb_info *cbi) {
-    while (unlikely(*z)) {
+    while (unlikely(z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z);
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
        size_t matchPos = d - buf + pos - 1;                               \
        DEBUG_PRINTF("match pos %zu\n", matchPos);
        hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos);
@@ -127,126 +127,99 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
 template <uint16_t S>
 static really_inline
 hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
-                            size_t len, size_t start,
+                            size_t len, size_t offset,
                            SuperVector<S> caseMask, SuperVector<S> mask1,
                            const struct cb_info *cbi) {
-
+    size_t start = offset + n->msk_len - 1;
    size_t offset = start + n->msk_len - 1;
    size_t end = len;
    assert(offset < end);
-    hwlm_error_t rv;
+    const u8 *d = buf + start;
-
+    const u8 *e = buf + end;
-    if (end - offset <= S) {
+    DEBUG_PRINTF("start %p end %p \n", d, e);
-        return scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, offset, end);
+    assert(d < e);
-        //return scanSingleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], cbi, offset, end);
+    if (d + S <= e) {
        // peel off the unaligned head, up to the next S-byte (vector width) boundary
        const u8 *d1 = ROUNDUP_PTR(d, S);
        DEBUG_PRINTF("until aligned %p \n", d1);
        if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) {
            return HWLM_TERMINATED;
        }
        d = d1;
-    uintptr_t data = (uintptr_t)buf;
+        size_t loops = (end - (d - buf)) / S;
-    uintptr_t s2Start = ROUNDUP_N(data + offset, S) - data;
+        DEBUG_PRINTF("loops %ld \n", loops);
-    if (offset != s2Start) {
+        for (size_t i = 0; i < loops; i++, d+= S) {
-        // first scan out to the fast scan starting point
+            DEBUG_PRINTF("d %p \n", d);
-        DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
+            const u8 *base = ROUNDUP_PTR(d, 64);
-        rv = scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, offset, s2Start);
+            // On large packet buffers, this prefetch appears to get us about 2%.
-        //rv = scanSingleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], cbi, offset, s2Start);
+            __builtin_prefetch(base + 256);
            SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
            typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
            hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
            RETURN_IF_TERMINATED(rv);
        }
    uintptr_t last = data + end;
    uintptr_t s2End = ROUNDDOWN_N(last, S) - data;
    size_t loops = s2End / S;
    if (likely(loops)) {
    //if (likely(s2Start != s2End)) {
        // scan as far as we can, bounded by the last point this key can
        // possibly match
        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End);
        rv = scanSingleFast2(n, buf, len, caseMask, mask1, cbi, s2Start, loops);
        //rv = scanSingleFast(n, buf, len, caseMask.u.v512[0], mask1.u.v512[0], cbi, s2Start, s2End);
        RETURN_IF_TERMINATED(rv);
    }
-    if (s2End == len) {
+    DEBUG_PRINTF("d %p e %p \n", d, e);
-        return HWLM_SUCCESS;
+    // finish off tail
    }
    // if we are done bail out
    //if (s2End != len) {
        DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len);
        rv = scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, s2End, len);
        //rv = scanSingleUnaligned(n, buf, len, s2End, caseMask.u.v512[0], mask1.u.v512[0], cbi, s2End, len);
        return rv;
     //}
-     //return HWLM_SUCCESS;
+    return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, d - buf, end);
 }
 template <uint16_t S>
 static really_inline
 hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
-                            size_t len, size_t start, 
+                            size_t len, size_t offset, 
                            SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                            const struct cb_info *cbi) {
    // we stop scanning for the key-fragment when the rest of the key can't
    // possibly fit in the remaining buffer
    size_t end = len - n->key_offset + 2;
-    // the first place the key can match
+    size_t start = offset + n->msk_len - n->key_offset;
    size_t offset = start + n->msk_len - n->key_offset;
-    hwlm_error_t rv;
+    typename SuperVector<S>::movemask_type lastz1{0};
-    if (end - offset <= S) {
+    const u8 *d = buf + start;
-        rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, offset, offset, end);
+    const u8 *e = buf + end;
-        //rv = scanDoubleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, offset, end);
+    DEBUG_PRINTF("start %p end %p \n", d, e);
-        return rv;
+    assert(d < e);
    if (d + S <= e) {
        // peel off the unaligned head, up to the next S-byte (vector width) boundary
        const u8 *d1 = ROUNDUP_PTR(d, S);
        DEBUG_PRINTF("until aligned %p \n", d1);
        if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) {
            return HWLM_TERMINATED;
        }
        d = d1;
-    uintptr_t data = (uintptr_t)buf;
+        size_t loops = (end - (d - buf)) / S;
-    uintptr_t s2Start = ROUNDUP_N(data + offset, S) - data;
+        DEBUG_PRINTF("loops %ld \n", loops);
    uintptr_t s1End = s2Start + 1;
    uintptr_t off = offset;
-    if (s2Start != off) {
+        for (size_t i = 0; i < loops; i++, d+= S) {
-        // first scan out to the fast scan starting point plus one char past to
+            DEBUG_PRINTF("d %p \n", d);
-        // catch the key on the overlap
+            const u8 *base = ROUNDUP_PTR(d, 64);
-        DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start);
+            // On large packet buffers, this prefetch appears to get us about 2%.
-        rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, offset, off, s1End);
+            __builtin_prefetch(base + 256);
-        //rv = scanDoubleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, off, s1End);
+
            SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
            typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
            typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
            typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
            lastz1 = z1 >> Z_SHIFT;
            hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
            RETURN_IF_TERMINATED(rv);
        }
    off = s1End;
    uintptr_t last = data + end;
    uintptr_t s2End = ROUNDDOWN_N(last, S) - data;
    uintptr_t s3Start = end - S;
    if (s2Start >= end) {
        DEBUG_PRINTF("s2 == mL %zu\n", end);
        return HWLM_SUCCESS;
    }
-    //size_t loops = (s2End -s2Start)/ S;
+    DEBUG_PRINTF("d %p e %p \n", d, e);
    // finish off tail
-    if (likely(s2Start != s2End)) {
+    return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, d - buf, end);
    //if (likely(loops)) {
        // scan as far as we can, bounded by the last point this key can
        // possibly match
        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start);
        rv = scanDoubleFast2(n, buf, len, caseMask, mask1, mask2, cbi, s2Start, s2End);
        //rv = scanDoubleFast(n, buf, len, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, s2Start, s2End);
        RETURN_IF_TERMINATED(rv);
        off = s2End;
    }
    // if there isn't enough data left to match the key, bail out
    if (s2End == end) {
        return HWLM_SUCCESS;
    }
    DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end);
    rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, s3Start, off, end);
    //rv = scanDoubleUnaligned(n, buf, len, s3Start, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, off, end);
    return rv;
 }
 // Single-character specialisation, used when keyLen = 1
--- a/src/hwlm/noodle_engine_simd.hpp
+++ b/src/hwlm/noodle_engine_simd.hpp
@@ -37,19 +37,19 @@
 using Z_TYPE = u64a;
 #define Z_BITS 64
 #define Z_SHIFT 63
-#define DOUBLE_LOAD_MASK(l, off)   ((~0ULL) >> (Z_BITS -l)) 
+#define DOUBLE_LOAD_MASK(l)        ((~0ULL) >> (Z_BITS -l)) 
 #define SINGLE_LOAD_MASK(l)        (((1ULL) << l) - 1ULL)
 #elif defined(HAVE_SIMD_256_BITS)
 using Z_TYPE = u32;
 #define Z_BITS 32
 #define Z_SHIFT 31
-#define DOUBLE_LOAD_MASK(l, off)   ((((1ULL) << l) - 1ULL) << off)
+#define DOUBLE_LOAD_MASK(l)        (((1ULL) << l) - 1ULL)
 #define SINGLE_LOAD_MASK(l)        (((1ULL) << l) - 1ULL)
 #elif defined(HAVE_SIMD_128_BITS)
 using Z_TYPE = u32;
 #define Z_BITS 32
 #define Z_SHIFT 0
-#define DOUBLE_LOAD_MASK(l, off)   ((((1ULL) << l) - 1ULL) << off)
+#define DOUBLE_LOAD_MASK(l)        (((1ULL) << l) - 1ULL)
 #define SINGLE_LOAD_MASK(l)        (((1ULL) << l) - 1ULL)
 #endif
@@ -77,13 +77,14 @@ static really_inline SuperVector<S> getCaseMask(void) {
 // function can't handle (due to small/unaligned chunk at end)
 template<uint16_t S>
 static really_inline
-hwlm_error_t scanSingleUnaligned2(const struct noodTable *n, const u8 *buf,
+hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
                                 SuperVector<S> caseMask, SuperVector<S> mask1,
                                 const struct cb_info *cbi, size_t len, size_t start,
                                 size_t end) {
    const u8 *d = buf + start;
    DEBUG_PRINTF("start %zu end %zu\n", start, end);
    const size_t l = end - start;
    DEBUG_PRINTF("l = %ld\n", l);
    //assert(l <= 64);
    if (!l) {
        return HWLM_SUCCESS;
@@ -93,100 +94,28 @@ hwlm_error_t scanSingleUnaligned2(const struct noodTable *n, const u8 *buf,
    SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
    typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v);
-    return single_zscan(n, d, buf, &z, len, cbi);
+    return single_zscan(n, d, buf, z, len, cbi);
 }
 template<uint16_t S>
 static really_inline
-hwlm_error_t scanSingleFast2(const struct noodTable *n, const u8 *buf,
+hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
-                            size_t len, SuperVector<S> caseMask, SuperVector<S> mask1,
+                                 SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2, typename SuperVector<S>::movemask_type *lastz1,
-                            const struct cb_info *cbi, size_t start,
+                                 const struct cb_info *cbi, size_t len, size_t start, size_t end) {
                            size_t loops) {
    const u8 *d = buf + start;
    for (size_t i = 0; i < loops; i++, d+= S) {
        const u8 *base = ROUNDUP_PTR(d, 64);
        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(base + 4*S);
        SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
        typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
        hwlm_error_t result = single_zscan(n, d, buf, &z, len, cbi);
        if (unlikely(result != HWLM_SUCCESS))
 	    return result;
    }
    return HWLM_SUCCESS;
 }
 template<uint16_t S>
 static really_inline
 hwlm_error_t scanDoubleUnaligned2(const struct noodTable *n, const u8 *buf,
                                 SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                                 const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) {
    const u8 *d = buf + offset;
    DEBUG_PRINTF("start %zu end %zu", start, end);
    const size_t l = end - start;
    assert(l <= S);
    if (!l) {
        return HWLM_SUCCESS;
    }
   u32 buf_off = start - offset;
    SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
-    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l, buf_off);
+    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l);
    typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
    typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-    typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2;
+    typename SuperVector<S>::movemask_type z = mask & (*lastz1 | z1 << 1) & z2;
-#if defined(HAVE_AVX512) && defined(BUILD_AVX512)
+    *lastz1 = z1 >> (l -1);
    DEBUG_PRINTF("buf_off = %d\n", buf_off);
    DEBUG_PRINTF("l = %ld, mask = 0x%016llx\n", l, mask);
    DEBUG_PRINTF("\nz1 = 0x%016llx\n", z1);
    DEBUG_PRINTF("z2 = 0x%016llx\n", z2);
    DEBUG_PRINTF("z  = 0x%016llx\n", z);
    __mmask64 k = (~0ULL) >> (64 - l);
    DEBUG_PRINTF("k    = 0x%016llx\n", k);
-    m512 v1 = loadu_maskz_m512(k, d);
+    return double_zscan(n, d, buf, z, len, cbi);
    v1 = and512(v1, caseMask.u.v512[0]);
    u64a z0_ = masked_eq512mask(k, mask1.u.v512[0], v1);
    u64a z1_ = masked_eq512mask(k, mask2.u.v512[0], v1);
    u64a z_ = (z0_ << 1) & z1_;
    DEBUG_PRINTF("z0_ = 0x%016llx\n", z0_);
    DEBUG_PRINTF("z1_ = 0x%016llx\n", z1_);
    DEBUG_PRINTF("z_  = 0x%016llx\n", z_);
    assert(z == z_);
 #endif
    return double_zscan(n, d, buf, &z, len, cbi);
 }
 template<uint16_t S>
 static really_inline
 hwlm_error_t scanDoubleFast2(const struct noodTable *n, const u8 *buf,
                            size_t len, SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                            const struct cb_info *cbi, size_t start, size_t end/*loops*/) {
    const u8 *d = buf + start, *e = buf + end;
    //DEBUG_PRINTF("start %zu loops %zu \n", start, loops);
    typename SuperVector<S>::movemask_type lastz1{0};
    //for (size_t i=0; i < loops; i++, d+= S) {
    for (; d < e; d+= S) {
        const u8 *base = ROUNDUP_PTR(d, 64);
        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(base + 4*S);
        SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
        typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
        typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
        typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
        lastz1 = z1 >> Z_SHIFT;
        hwlm_error_t result = double_zscan(n, d, buf, &z, len, cbi);
        if (unlikely(result != HWLM_SUCCESS))
 	       return result;
    }
    return HWLM_SUCCESS;
 }