diff --git a/src/hwlm/noodle_engine.cpp b/src/hwlm/noodle_engine.cpp
index 16280b59..58e0604d 100644
--- a/src/hwlm/noodle_engine.cpp
+++ b/src/hwlm/noodle_engine.cpp
@@ -100,9 +100,9 @@ match:
 static really_really_inline
 hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-                          Z_TYPE *z, size_t len, const struct cb_info *cbi) {
-    while (unlikely(*z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z);
+                          Z_TYPE z, size_t len, const struct cb_info *cbi) {
+    while (unlikely(z)) {
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
         size_t matchPos = d - buf + pos;
         DEBUG_PRINTF("match pos %zu\n", matchPos);
         hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos);
@@ -113,9 +113,9 @@ hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
 static really_really_inline
 hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-                          Z_TYPE *z, size_t len, const struct cb_info *cbi) {
-    while (unlikely(*z)) {
-        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z);
+                          Z_TYPE z, size_t len, const struct cb_info *cbi) {
+    while (unlikely(z)) {
+        Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
         size_t matchPos = d - buf + pos - 1; \
         DEBUG_PRINTF("match pos %zu\n", matchPos);
         hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos);
@@ -127,126 +127,99 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
 template <uint16_t S>
 static really_inline
 hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
-                            size_t len, size_t start,
-                            SuperVector<S> caseMask, SuperVector<S> mask1,
+                            size_t len, size_t offset,
+                            SuperVector<S> caseMask, SuperVector<S> mask1,
                             const struct cb_info *cbi) {
-
-    size_t offset = start + n->msk_len - 1;
+    size_t start = offset + n->msk_len - 1;
     size_t end = len;
-    assert(offset < end);
-    hwlm_error_t rv;
+    const u8 *d = buf + start;
+    const u8 *e = buf + end;
+    DEBUG_PRINTF("start %p end %p \n", d, e);
+    assert(d < e);
+    if (d + S <= e) {
+        // peel off first part to cacheline boundary
+        const u8 *d1 = ROUNDUP_PTR(d, S);
+        DEBUG_PRINTF("until aligned %p \n", d1);
+        if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) {
+            return HWLM_TERMINATED;
+        }
+        d = d1;
 
-    if (end - offset <= S) {
-        return scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, offset, end);
-        //return scanSingleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], cbi, offset, end);
+        size_t loops = (end - (d - buf)) / S;
+        DEBUG_PRINTF("loops %ld \n", loops);
+
+        for (size_t i = 0; i < loops; i++, d+= S) {
+            DEBUG_PRINTF("d %p \n", d);
+            const u8 *base = ROUNDUP_PTR(d, 64);
+            // On large packet buffers, this prefetch appears to get us about 2%.
+            __builtin_prefetch(base + 256);
+
+            SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
+            typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
+
+            hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
+            RETURN_IF_TERMINATED(rv);
+        }
     }
-    uintptr_t data = (uintptr_t)buf;
-    uintptr_t s2Start = ROUNDUP_N(data + offset, S) - data;
+    DEBUG_PRINTF("d %p e %p \n", d, e);
+    // finish off tail
 
-    if (offset != s2Start) {
-        // first scan out to the fast scan starting point
-        DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
-        rv = scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, offset, s2Start);
-        //rv = scanSingleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], cbi, offset, s2Start);
-        RETURN_IF_TERMINATED(rv);
-    }
-    uintptr_t last = data + end;
-    uintptr_t s2End = ROUNDDOWN_N(last, S) - data;
-    size_t loops = s2End / S;
-
-    if (likely(loops)) {
-    //if (likely(s2Start != s2End)) {
-        // scan as far as we can, bounded by the last point this key can
-        // possibly match
-        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End);
-        rv = scanSingleFast2(n, buf, len, caseMask, mask1, cbi, s2Start, loops);
-        //rv = scanSingleFast(n, buf, len, caseMask.u.v512[0], mask1.u.v512[0], cbi, s2Start, s2End);
-        RETURN_IF_TERMINATED(rv);
-    }
-
-    if (s2End == len) {
-        return HWLM_SUCCESS;
-    }
-    // if we are done bail out
-    //if (s2End != len) {
-        DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len);
-        rv = scanSingleUnaligned2(n, buf, caseMask, mask1, cbi, len, s2End, len);
-        //rv = scanSingleUnaligned(n, buf, len, s2End, caseMask.u.v512[0], mask1.u.v512[0], cbi, s2End, len);
-        return rv;
-    //}
-
-    //return HWLM_SUCCESS;
+    return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, d - buf, end);
 }
 
 template <uint16_t S>
 static really_inline
 hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
-                            size_t len, size_t start,
-                            SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
+                            size_t len, size_t offset,
+                            SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                             const struct cb_info *cbi) {
     // we stop scanning for the key-fragment when the rest of the key can't
     // possibly fit in the remaining buffer
     size_t end = len - n->key_offset + 2;
-    // the first place the key can match
-    size_t offset = start + n->msk_len - n->key_offset;
+    size_t start = offset + n->msk_len - n->key_offset;
 
-    hwlm_error_t rv;
+    typename SuperVector<S>::movemask_type lastz1{0};
 
-    if (end - offset <= S) {
-        rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, offset, offset, end);
-        //rv = scanDoubleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, offset, end);
-        return rv;
+    const u8 *d = buf + start;
+    const u8 *e = buf + end;
+    DEBUG_PRINTF("start %p end %p \n", d, e);
+    assert(d < e);
+    if (d + S <= e) {
+        // peel off first part to cacheline boundary
+        const u8 *d1 = ROUNDUP_PTR(d, S);
+        DEBUG_PRINTF("until aligned %p \n", d1);
+        if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) {
+            return HWLM_TERMINATED;
+        }
+        d = d1;
+
+        size_t loops = (end - (d - buf)) / S;
+        DEBUG_PRINTF("loops %ld \n", loops);
+
+        for (size_t i = 0; i < loops; i++, d+= S) {
+            DEBUG_PRINTF("d %p \n", d);
+            const u8 *base = ROUNDUP_PTR(d, 64);
+            // On large packet buffers, this prefetch appears to get us about 2%.
+            __builtin_prefetch(base + 256);
+
+            SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
+            typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
+            typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
+            typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
+            lastz1 = z1 >> Z_SHIFT;
+
+            hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
+            RETURN_IF_TERMINATED(rv);
+        }
     }
-    uintptr_t data = (uintptr_t)buf;
-    uintptr_t s2Start = ROUNDUP_N(data + offset, S) - data;
-    uintptr_t s1End = s2Start + 1;
-    uintptr_t off = offset;
+    DEBUG_PRINTF("d %p e %p \n", d, e);
+    // finish off tail
 
-    if (s2Start != off) {
-        // first scan out to the fast scan starting point plus one char past to
-        // catch the key on the overlap
-        DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start);
-        rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, offset, off, s1End);
-        //rv = scanDoubleUnaligned(n, buf, len, offset, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, off, s1End);
-        RETURN_IF_TERMINATED(rv);
-    }
-    off = s1End;
-    uintptr_t last = data + end;
-    uintptr_t s2End = ROUNDDOWN_N(last, S) - data;
-    uintptr_t s3Start = end - S;
-
-    if (s2Start >= end) {
-        DEBUG_PRINTF("s2 == mL %zu\n", end);
-        return HWLM_SUCCESS;
-    }
-
-    //size_t loops = (s2End -s2Start)/ S;
-
-    if (likely(s2Start != s2End)) {
-    //if (likely(loops)) {
-        // scan as far as we can, bounded by the last point this key can
-        // possibly match
-        DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start);
-        rv = scanDoubleFast2(n, buf, len, caseMask, mask1, mask2, cbi, s2Start, s2End);
-        //rv = scanDoubleFast(n, buf, len, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, s2Start, s2End);
-        RETURN_IF_TERMINATED(rv);
-        off = s2End;
-    }
-
-    // if there isn't enough data left to match the key, bail out
-    if (s2End == end) {
-        return HWLM_SUCCESS;
-    }
-
-    DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end);
-    rv = scanDoubleUnaligned2(n, buf, caseMask, mask1, mask2, cbi, len, s3Start, off, end);
-    //rv = scanDoubleUnaligned(n, buf, len, s3Start, caseMask.u.v512[0], mask1.u.v512[0], mask2.u.v512[0], cbi, off, end);
-
-    return rv;
+    return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, d - buf, end);
 }
 
 // Single-character specialisation, used when keyLen = 1
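
[Reviewer note, not part of the patch] The hunks above collapse the old three-stage scan (unaligned head, aligned "fast" loop, tail) into a single peel/loop/tail shape, and `single_zscan()`/`double_zscan()` now take the match mask `z` by value rather than through a pointer, so the bit-clearing loop works on a register-local copy. A minimal scalar sketch of that by-value contract follows; the plain `uint64_t` mask, `eqmask()`, and the ctz-based `findAndClearLSB()` are stand-ins I am assuming for `SuperVector<S>::movemask_type` and the library primitives, not the real API:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for mask1.eqmask(v): bit i is set iff block[i] == c.
static uint64_t eqmask(const uint8_t *block, size_t n, uint8_t c) {
    uint64_t z = 0;
    for (size_t i = 0; i < n; i++) {
        if (block[i] == c) z |= 1ULL << i;
    }
    return z;
}

// Stand-in for JOIN(findAndClearLSB_, Z_BITS)(&z): return the index of the
// lowest set bit and clear it.
static unsigned findAndClearLSB(uint64_t *z) {
    unsigned pos = (unsigned)__builtin_ctzll(*z);
    *z &= *z - 1;
    return pos;
}

// Mirrors the new single_zscan() shape: z arrives by value, so clearing bits
// while iterating matches never touches the caller's copy.
static void single_zscan(const uint8_t *d, const uint8_t *buf, uint64_t z) {
    while (z) {
        unsigned pos = findAndClearLSB(&z);
        printf("match pos %zu\n", (size_t)(d - buf) + pos);
    }
}

int main(void) {
    const uint8_t buf[] = "foo bar foo";
    // Report every 'f' in the (single) 11-byte block: positions 0 and 8.
    single_zscan(buf, buf, eqmask(buf, sizeof(buf) - 1, 'f'));
    return 0;
}
```
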
diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp
index 98289d59..9c4f9b4b 100644
--- a/src/hwlm/noodle_engine_simd.hpp
+++ b/src/hwlm/noodle_engine_simd.hpp
@@ -37,19 +37,19 @@
 using Z_TYPE = u64a;
 #define Z_BITS 64
 #define Z_SHIFT 63
-#define DOUBLE_LOAD_MASK(l, off) ((~0ULL) >> (Z_BITS -l))
+#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -l))
 #define SINGLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #elif defined(HAVE_SIMD_256_BITS)
 using Z_TYPE = u32;
 #define Z_BITS 32
 #define Z_SHIFT 31
-#define DOUBLE_LOAD_MASK(l, off) ((((1ULL) << l) - 1ULL) << off)
+#define DOUBLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #define SINGLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #elif defined(HAVE_SIMD_128_BITS)
 using Z_TYPE = u32;
 #define Z_BITS 32
 #define Z_SHIFT 0
-#define DOUBLE_LOAD_MASK(l, off) ((((1ULL) << l) - 1ULL) << off)
+#define DOUBLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #define SINGLE_LOAD_MASK(l) (((1ULL) << l) - 1ULL)
 #endif
@@ -77,13 +77,14 @@ static really_inline SuperVector<S> getCaseMask(void) {
 // function can't handle (due to small/unaligned chunk at end)
 template <uint16_t S>
 static really_inline
-hwlm_error_t scanSingleUnaligned2(const struct noodTable *n, const u8 *buf,
+hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
                                  SuperVector<S> caseMask, SuperVector<S> mask1,
                                  const struct cb_info *cbi, size_t len, size_t start,
                                  size_t end) {
     const u8 *d = buf + start;
     DEBUG_PRINTF("start %zu end %zu\n", start, end);
     const size_t l = end - start;
+    DEBUG_PRINTF("l = %ld\n", l);
     //assert(l <= 64);
     if (!l) {
         return HWLM_SUCCESS;
     }
@@ -93,100 +94,28 @@ hwlm_error_t scanSingleUnaligned2(const struct noodTable *n, const u8 *buf,
 
     SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
     typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v);
 
-    return single_zscan(n, d, buf, &z, len, cbi);
+    return single_zscan(n, d, buf, z, len, cbi);
 }
 
 template <uint16_t S>
 static really_inline
-hwlm_error_t scanSingleFast2(const struct noodTable *n, const u8 *buf,
-                             size_t len, SuperVector<S> caseMask, SuperVector<S> mask1,
-                             const struct cb_info *cbi, size_t start,
-                             size_t loops) {
+hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
+                                 SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2, typename SuperVector<S>::movemask_type *lastz1,
+                                 const struct cb_info *cbi, size_t len, size_t start, size_t end) {
     const u8 *d = buf + start;
-
-    for (size_t i = 0; i < loops; i++, d+= S) {
-        const u8 *base = ROUNDUP_PTR(d, 64);
-        // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(base + 4*S);
-
-        SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
-        typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
-
-        hwlm_error_t result = single_zscan(n, d, buf, &z, len, cbi);
-        if (unlikely(result != HWLM_SUCCESS))
-            return result;
-    }
-    return HWLM_SUCCESS;
-}
-
-template <uint16_t S>
-static really_inline
-hwlm_error_t scanDoubleUnaligned2(const struct noodTable *n, const u8 *buf,
-                                  SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
-                                  const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) {
-    const u8 *d = buf + offset;
     DEBUG_PRINTF("start %zu end %zu", start, end);
     const size_t l = end - start;
     assert(l <= S);
     if (!l) {
         return HWLM_SUCCESS;
     }
-    u32 buf_off = start - offset;
-
     SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
-    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l, buf_off);
+    typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l);
     typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
     typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-    typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2;
-#if defined(HAVE_AVX512) && defined(BUILD_AVX512)
-    DEBUG_PRINTF("buf_off = %d\n", buf_off);
-    DEBUG_PRINTF("l = %ld, mask = 0x%016llx\n", l, mask);
-    DEBUG_PRINTF("\nz1 = 0x%016llx\n", z1);
-    DEBUG_PRINTF("z2 = 0x%016llx\n", z2);
-    DEBUG_PRINTF("z = 0x%016llx\n", z);
-    __mmask64 k = (~0ULL) >> (64 - l);
-    DEBUG_PRINTF("k = 0x%016llx\n", k);
+    typename SuperVector<S>::movemask_type z = mask & (*lastz1 | z1 << 1) & z2;
+    *lastz1 = z1 >> (l -1);
 
-    m512 v1 = loadu_maskz_m512(k, d);
-    v1 = and512(v1, caseMask.u.v512[0]);
-
-    u64a z0_ = masked_eq512mask(k, mask1.u.v512[0], v1);
-    u64a z1_ = masked_eq512mask(k, mask2.u.v512[0], v1);
-    u64a z_ = (z0_ << 1) & z1_;
-    DEBUG_PRINTF("z0_ = 0x%016llx\n", z0_);
-    DEBUG_PRINTF("z1_ = 0x%016llx\n", z1_);
-    DEBUG_PRINTF("z_ = 0x%016llx\n", z_);
-    assert(z == z_);
-#endif
-
-    return double_zscan(n, d, buf, &z, len, cbi);
-}
-
-template <uint16_t S>
-static really_inline
-hwlm_error_t scanDoubleFast2(const struct noodTable *n, const u8 *buf,
-                             size_t len, SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
-                             const struct cb_info *cbi, size_t start, size_t end/*loops*/) {
-    const u8 *d = buf + start, *e = buf + end;
-    //DEBUG_PRINTF("start %zu loops %zu \n", start, loops);
-    typename SuperVector<S>::movemask_type lastz1{0};
-
-    //for (size_t i=0; i < loops; i++, d+= S) {
-    for (; d < e; d+= S) {
-        const u8 *base = ROUNDUP_PTR(d, 64);
-        // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(base + 4*S);
-
-        SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
-        typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
-        typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
-        typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
-        lastz1 = z1 >> Z_SHIFT;
-
-        hwlm_error_t result = double_zscan(n, d, buf, &z, len, cbi);
-        if (unlikely(result != HWLM_SUCCESS))
-            return result;
-    }
-    return HWLM_SUCCESS;
+    return double_zscan(n, d, buf, z, len, cbi);
 }
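
[Reviewer note, not part of the patch] The key functional change on the double-byte path is that `lastz1` now carries the top bit of each block's `z1` into the next block, so a first byte at the end of one vector still pairs with a second byte at the start of the next; this is what lets the patch drop the old overlap re-scan stages and the `off` parameter of `DOUBLE_LOAD_MASK`. Below is a scalar sketch of that carry, using an 8-byte block where the real code uses `SuperVector<S>` (the block width, `eqmask()`, and the ctz-based bit loop are my stand-ins, not the library API):

```cpp
#include <cstdint>
#include <cstdio>

static const size_t BLOCK = 8; // pretend vector width in bytes

// Stand-in for maskN.eqmask(v): bit i is set iff block[i] == c.
static uint64_t eqmask(const uint8_t *block, size_t n, uint8_t c) {
    uint64_t z = 0;
    for (size_t i = 0; i < n; i++) {
        if (block[i] == c) z |= 1ULL << i;
    }
    return z;
}

int main(void) {
    // 'a' ends block 0 at offset 7; its 'b' starts block 1 at offset 8.
    const uint8_t buf[] = "xxabxxxabx";
    const size_t len = sizeof(buf) - 1;
    uint64_t lastz1 = 0; // carries z1's top bit across blocks, as in the patch

    for (size_t off = 0; off < len; off += BLOCK) {
        size_t n = (len - off < BLOCK) ? (len - off) : BLOCK;
        uint64_t z1 = eqmask(buf + off, n, 'a');
        uint64_t z2 = eqmask(buf + off, n, 'b');
        // Pair each 'a' with a 'b' one position later; lastz1 supplies the
        // pairing for an 'a' that ended the previous block.
        uint64_t z = ((z1 << 1) | lastz1) & z2;
        lastz1 = z1 >> (BLOCK - 1); // analogue of lastz1 = z1 >> Z_SHIFT
        while (z) {
            unsigned pos = (unsigned)__builtin_ctzll(z);
            z &= z - 1;
            // -1 as in double_zscan: report the start of the 2-byte match
            printf("match pos %zu\n", off + pos - 1);
        }
    }
    return 0;
}
```

Expected output is `match pos 2` followed by `match pos 7`: the second match straddles the block boundary and is found only because of the carried bit. Threading `lastz1` into `scanDoubleUnaligned()` by pointer, as the patch does, serves the same purpose across the peel/loop/tail pieces.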