diff --git a/src/hwlm/noodle_engine.cpp b/src/hwlm/noodle_engine.cpp index 58e0604d..dfda1ce9 100644 --- a/src/hwlm/noodle_engine.cpp +++ b/src/hwlm/noodle_engine.cpp @@ -59,8 +59,6 @@ struct cb_info { }; -#include "noodle_engine_simd.hpp" - #define RETURN_IF_TERMINATED(x) \ { \ if ((x) == HWLM_TERMINATED) { \ @@ -68,6 +66,10 @@ struct cb_info { } \ } +#if !defined(HAVE_SVE) +#include "noodle_engine_simd.hpp" +#endif + // Make sure the rest of the string is there. The single character scanner // is used only for single chars with case insensitivity used correctly, // so it can go straight to the callback if we get this far. @@ -124,130 +126,6 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, return HWLM_SUCCESS; } -template -static really_inline -hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, - SuperVector caseMask, SuperVector mask1, - const struct cb_info *cbi) { - size_t start = offset + n->msk_len - 1; - size_t end = len; - - const u8 *d = buf + start; - const u8 *e = buf + end; - DEBUG_PRINTF("start %p end %p \n", d, e); - assert(d < e); - if (d + S <= e) { - // peel off first part to cacheline boundary - const u8 *d1 = ROUNDUP_PTR(d, S); - DEBUG_PRINTF("until aligned %p \n", d1); - if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) { - return HWLM_TERMINATED; - } - d = d1; - - size_t loops = (end - (d - buf)) / S; - DEBUG_PRINTF("loops %ld \n", loops); - - for (size_t i = 0; i < loops; i++, d+= S) { - DEBUG_PRINTF("d %p \n", d); - const u8 *base = ROUNDUP_PTR(d, 64); - // On large packet buffers, this prefetch appears to get us about 2%. - __builtin_prefetch(base + 256); - - SuperVector v = SuperVector::load(d) & caseMask; - typename SuperVector::movemask_type z = mask1.eqmask(v); - - hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); - RETURN_IF_TERMINATED(rv); - } - } - - DEBUG_PRINTF("d %p e %p \n", d, e); - // finish off tail - - return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, d - buf, end); -} - -template -static really_inline -hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, - SuperVector caseMask, SuperVector mask1, SuperVector mask2, - const struct cb_info *cbi) { - // we stop scanning for the key-fragment when the rest of the key can't - // possibly fit in the remaining buffer - size_t end = len - n->key_offset + 2; - - size_t start = offset + n->msk_len - n->key_offset; - - typename SuperVector::movemask_type lastz1{0}; - - const u8 *d = buf + start; - const u8 *e = buf + end; - DEBUG_PRINTF("start %p end %p \n", d, e); - assert(d < e); - if (d + S <= e) { - // peel off first part to cacheline boundary - const u8 *d1 = ROUNDUP_PTR(d, S); - DEBUG_PRINTF("until aligned %p \n", d1); - if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) { - return HWLM_TERMINATED; - } - d = d1; - - size_t loops = (end - (d - buf)) / S; - DEBUG_PRINTF("loops %ld \n", loops); - - for (size_t i = 0; i < loops; i++, d+= S) { - DEBUG_PRINTF("d %p \n", d); - const u8 *base = ROUNDUP_PTR(d, 64); - // On large packet buffers, this prefetch appears to get us about 2%. - __builtin_prefetch(base + 256); - - SuperVector v = SuperVector::load(d) & caseMask; - typename SuperVector::movemask_type z1 = mask1.eqmask(v); - typename SuperVector::movemask_type z2 = mask2.eqmask(v); - typename SuperVector::movemask_type z = (z1 << 1 | lastz1) & z2; - lastz1 = z1 >> Z_SHIFT; - - hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); - RETURN_IF_TERMINATED(rv); - } - } - - DEBUG_PRINTF("d %p e %p \n", d, e); - // finish off tail - - return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, d - buf, end); -} - -// Single-character specialisation, used when keyLen = 1 -static really_inline -hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, - size_t start, bool noCase, const struct cb_info *cbi) { - if (!ourisalpha(n->key0)) { - noCase = 0; // force noCase off if we don't have an alphabetic char - } - - const SuperVector caseMask{noCase ? getCaseMask() : SuperVector::Ones()}; - const SuperVector mask1{getMask(n->key0, noCase)}; - - return scanSingleMain(n, buf, len, start, caseMask, mask1, cbi); -} - - -static really_inline -hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, - size_t start, bool noCase, const struct cb_info *cbi) { - - const SuperVector caseMask{noCase ? getCaseMask() : SuperVector::Ones()}; - const SuperVector mask1{getMask(n->key0, noCase)}; - const SuperVector mask2{getMask(n->key1, noCase)}; - - return scanDoubleMain(n, buf, len, start, caseMask, mask1, mask2, cbi); -} - // main entry point for the scan code static really_inline hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len, diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp index 9c4f9b4b..cabcde91 100644 --- a/src/hwlm/noodle_engine_simd.hpp +++ b/src/hwlm/noodle_engine_simd.hpp @@ -119,3 +119,127 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, return double_zscan(n, d, buf, z, len, cbi); } + +template +static really_inline +hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, + SuperVector caseMask, SuperVector mask1, + const struct cb_info *cbi) { + size_t start = offset + n->msk_len - 1; + size_t end = len; + + const u8 *d = buf + start; + const u8 *e = buf + end; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + if (d + S <= e) { + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, S); + DEBUG_PRINTF("until aligned %p \n", d1); + if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + size_t loops = (end - (d - buf)) / S; + DEBUG_PRINTF("loops %ld \n", loops); + + for (size_t i = 0; i < loops; i++, d+= S) { + DEBUG_PRINTF("d %p \n", d); + const u8 *base = ROUNDUP_PTR(d, 64); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(base + 256); + + SuperVector v = SuperVector::load(d) & caseMask; + typename SuperVector::movemask_type z = mask1.eqmask(v); + + hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); + RETURN_IF_TERMINATED(rv); + } + } + + DEBUG_PRINTF("d %p e %p \n", d, e); + // finish off tail + + return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, d - buf, end); +} + +template +static really_inline +hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, + SuperVector caseMask, SuperVector mask1, SuperVector mask2, + const struct cb_info *cbi) { + // we stop scanning for the key-fragment when the rest of the key can't + // possibly fit in the remaining buffer + size_t end = len - n->key_offset + 2; + + size_t start = offset + n->msk_len - n->key_offset; + + typename SuperVector::movemask_type lastz1{0}; + + const u8 *d = buf + start; + const u8 *e = buf + end; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + if (d + S <= e) { + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, S); + DEBUG_PRINTF("until aligned %p \n", d1); + if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + size_t loops = (end - (d - buf)) / S; + DEBUG_PRINTF("loops %ld \n", loops); + + for (size_t i = 0; i < loops; i++, d+= S) { + DEBUG_PRINTF("d %p \n", d); + const u8 *base = ROUNDUP_PTR(d, 64); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(base + 256); + + SuperVector v = SuperVector::load(d) & caseMask; + typename SuperVector::movemask_type z1 = mask1.eqmask(v); + typename SuperVector::movemask_type z2 = mask2.eqmask(v); + typename SuperVector::movemask_type z = (z1 << 1 | lastz1) & z2; + lastz1 = z1 >> Z_SHIFT; + + hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); + RETURN_IF_TERMINATED(rv); + } + } + + DEBUG_PRINTF("d %p e %p \n", d, e); + // finish off tail + + return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, d - buf, end); +} + +// Single-character specialisation, used when keyLen = 1 +static really_inline +hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + if (!ourisalpha(n->key0)) { + noCase = 0; // force noCase off if we don't have an alphabetic char + } + + const SuperVector caseMask{noCase ? getCaseMask() : SuperVector::Ones()}; + const SuperVector mask1{getMask(n->key0, noCase)}; + + return scanSingleMain(n, buf, len, start, caseMask, mask1, cbi); +} + + +static really_inline +hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + + const SuperVector caseMask{noCase ? getCaseMask() : SuperVector::Ones()}; + const SuperVector mask1{getMask(n->key0, noCase)}; + const SuperVector mask2{getMask(n->key1, noCase)}; + + return scanDoubleMain(n, buf, len, start, caseMask, mask1, mask2, cbi); +} \ No newline at end of file