diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index da61dfe8..28a8f4a5 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -143,14 +143,17 @@ match: #if defined(HAVE_AVX512) #define CHUNKSIZE 64 #define MASK_TYPE m512 +#define ONES ones512() #include "noodle_engine_avx512.c" #elif defined(HAVE_AVX2) #define CHUNKSIZE 32 #define MASK_TYPE m256 +#define ONES ones256() #include "noodle_engine_avx2.c" #else #define CHUNKSIZE 16 #define MASK_TYPE m128 +#define ONES ones128() #include "noodle_engine_sse.c" #endif @@ -160,7 +163,7 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, const struct cb_info *cbi) { const MASK_TYPE mask1 = getMask(n->key0, noCase); - const MASK_TYPE caseMask = getCaseMask(); + const MASK_TYPE caseMask = noCase ? getCaseMask() : ONES; size_t offset = start + n->msk_len - 1; size_t end = len; @@ -169,14 +172,14 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, #if !defined(HAVE_AVX512) hwlm_error_t rv; - if (end - offset < CHUNKSIZE) { +/* if (end - offset <= CHUNKSIZE) { rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset, end); return rv; - } + }*/ - if (end - offset == CHUNKSIZE) { - rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + if (end - offset <= CHUNKSIZE) { + rv = scanSingleUnaligned(n, buf, len, offset, caseMask, mask1, cbi, offset, end); return rv; } @@ -190,7 +193,7 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, if (offset != s2Start) { // first scan out to the fast scan starting point DEBUG_PRINTF("stage 1: -> %zu\n", s2Start); - rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + rv = scanSingleUnaligned(n, buf, len, offset, caseMask, mask1, cbi, offset, s2Start); RETURN_IF_TERMINATED(rv); } @@ -199,7 +202,7 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, // scan as far as we can, bounded by the last point this key can // possibly match DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End); - rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start, + rv = scanSingleFast(n, buf, len, caseMask, mask1, cbi, s2Start, s2End); RETURN_IF_TERMINATED(rv); } @@ -210,7 +213,7 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, } DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len); - rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi, + rv = scanSingleUnaligned(n, buf, len, s3Start, caseMask, mask1, cbi, s2End, len); return rv; @@ -231,20 +234,20 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, // the first place the key can match size_t offset = start + n->msk_len - n->key_offset; - const MASK_TYPE caseMask = getCaseMask(); + const MASK_TYPE caseMask = noCase ? getCaseMask() : ONES; const MASK_TYPE mask1 = getMask(n->key0, noCase); const MASK_TYPE mask2 = getMask(n->key1, noCase); #if !defined(HAVE_AVX512) hwlm_error_t rv; - if (end - offset < CHUNKSIZE) { +/* if (end - offset <= CHUNKSIZE) { rv = scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, offset, end); return rv; - } - if (end - offset == CHUNKSIZE) { - rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + }*/ + if (end - offset <= CHUNKSIZE) { + rv = scanDoubleUnaligned(n, buf, len, offset, caseMask, mask1, mask2, cbi, offset, end); return rv; } @@ -261,7 +264,7 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, // first scan out to the fast scan starting point plus one char past to // catch the key on the overlap DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start); - rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + rv = scanDoubleUnaligned(n, buf, len, offset, caseMask, mask1, mask2, cbi, off, s1End); RETURN_IF_TERMINATED(rv); } @@ -276,7 +279,7 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, // scan as far as we can, bounded by the last point this key can // possibly match DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start); - rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + rv = scanDoubleFast(n, buf, len, caseMask, mask1, mask2, cbi, s2Start, s2End); RETURN_IF_TERMINATED(rv); off = s2End; @@ -288,12 +291,12 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, } DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end); - rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, + rv = scanDoubleUnaligned(n, buf, len, s3Start, caseMask, mask1, mask2, cbi, off, end); return rv; #else // AVX512 - return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + return scanDouble512(n, buf, len, caseMask, mask1, mask2, cbi, offset, end); #endif // AVX512 } @@ -303,14 +306,14 @@ static really_inline hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf, size_t len, size_t start, const struct cb_info *cbi) { - return scanSingleMain(n, buf, len, start, 1, cbi); + return scanSingleMain(n, buf, len, start, true, cbi); } static really_inline hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf, size_t len, size_t start, const struct cb_info *cbi) { - return scanSingleMain(n, buf, len, start, 0, cbi); + return scanSingleMain(n, buf, len, start, false, cbi); } // Single-character specialisation, used when keyLen = 1 @@ -334,14 +337,14 @@ static really_inline hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf, size_t len, size_t start, const struct cb_info *cbi) { - return scanDoubleMain(n, buf, len, start, 1, cbi); + return scanDoubleMain(n, buf, len, start, true, cbi); } static really_inline hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf, size_t len, size_t start, const struct cb_info *cbi) { - return scanDoubleMain(n, buf, len, start, 0, cbi); + return scanDoubleMain(n, buf, len, start, false, cbi); } diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c index 49fe168f..bb3ce9dc 100644 --- a/src/hwlm/noodle_engine_avx2.c +++ b/src/hwlm/noodle_engine_avx2.c @@ -39,19 +39,14 @@ static really_inline m256 getCaseMask(void) { static really_inline hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, - m256 caseMask, m256 mask1, + size_t len, size_t offset, + m256 caseMask, m256 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + offset; DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); const size_t l = end - start; - - m256 v = loadu256(d); - - if (noCase) { - v = and256(v, caseMask); - } + m256 v = and256(loadu256(d), caseMask); u32 z = movemask256(eq256(mask1, v)); @@ -68,19 +63,14 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, static really_inline hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, + size_t len, size_t offset, m256 caseMask, m256 mask1, m256 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + offset; DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); size_t l = end - start; - - m256 v = loadu256(d); - - if (noCase) { - v = and256(v, caseMask); - } + m256 v = and256(loadu256(d), caseMask); u32 z0 = movemask256(eq256(mask1, v)); u32 z1 = movemask256(eq256(mask2, v)); @@ -96,13 +86,13 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } - +/* // The short scan routine. It is used both to scan data up to an // alignment boundary if needed and to finish off data that the aligned scan // function can't handle (due to small/unaligned chunk at end) static really_inline hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, + size_t len, m256 caseMask, m256 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -112,7 +102,6 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, if (!l) { return HWLM_SUCCESS; } - m256 v; if (l < 4) { @@ -126,10 +115,7 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, v = masked_move256_len(d, l); } - if (noCase) { - v = and256(v, caseMask); - } - + m256 v = and256(v, caseMask); // mask out where we can't match u32 mask = (0xFFFFFFFF >> (32 - l)); @@ -142,7 +128,7 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, static really_inline hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, + size_t len, m256 caseMask, m256 mask1, m256 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -151,6 +137,8 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } assert(l <= 32); + u32 mask = (0xFFFFFFFF >> (32 - l)); + m256 v; DEBUG_PRINTF("d %zu\n", d - buf); @@ -164,33 +152,31 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, } else { v = masked_move256_len(d, l); } - if (noCase) { - v = and256(v, caseMask); - } + + m256 v = and256(v, caseMask); u32 z0 = movemask256(eq256(mask1, v)); u32 z1 = movemask256(eq256(mask2, v)); u32 z = (z0 << 1) & z1; // mask out where we can't match - u32 mask = (0xFFFFFFFF >> (32 - l)); z &= mask; DOUBLE_ZSCAN(); return HWLM_SUCCESS; -} +}*/ static really_inline hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, + size_t len, m256 caseMask, m256 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; assert(d < e); for (; d < e; d += 32) { - m256 v = noCase ? and256(load256(d), caseMask) : load256(d); + m256 v = and256(load256(d), caseMask); u32 z = movemask256(eq256(mask1, v)); @@ -204,7 +190,7 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, static really_inline hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, + size_t len, m256 caseMask, m256 mask1, m256 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; @@ -213,7 +199,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, u32 lastz0 = 0; for (; d < e; d += 32) { - m256 v = noCase ? and256(load256(d), caseMask) : load256(d); + m256 v = and256(load256(d), caseMask); // we have to pull the masks out of the AVX registers because we can't // byte shift between the lanes diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 0f14852d..5227c251 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -36,10 +36,10 @@ static really_inline m128 getMask(u8 c, bool noCase) { static really_inline m128 getCaseMask(void) { return set1_16x8(0xdf); } - +/* static really_inline hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, + size_t len, m128 caseMask, m128 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -49,22 +49,20 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, if (!l) { return HWLM_SUCCESS; } - m128 mask128 = noCase ? caseMask : ones128(); - m128 v = and128(loadu128(d), mask128); + m128 v = and128(loadu128(d), caseMask); // mask out where we can't match u32 mask = (0xFFFF >> (16 - l)); - u32 z = mask & movemask128(eq128(mask1, v)); SINGLE_ZSCAN(); return HWLM_SUCCESS; -} +}*/ static really_inline hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, + size_t len, size_t offset, m128 caseMask, m128 mask1, const struct cb_info *cbi, size_t start, size_t end) { @@ -72,26 +70,22 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); const size_t l = end - start; - m128 mask128 = noCase ? caseMask : ones128(); - m128 v = and128(loadu128(d), mask128); + m128 v = and128(loadu128(d), caseMask); u32 buf_off = start - offset; u32 mask = ((1 << l) - 1) << buf_off; - + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); u32 z = mask & movemask128(eq128(mask1, v)); - DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - - z &= mask; SINGLE_ZSCAN(); return HWLM_SUCCESS; } - +/* static really_inline hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, + size_t len, m128 caseMask, m128 mask1, m128 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start; @@ -102,42 +96,36 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, assert(l <= 32); DEBUG_PRINTF("d %zu\n", d - buf); - m128 mask128 = noCase ? caseMask : ones128(); - m128 v = and128(loadu128(d), mask128); - - u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), - eq128(mask2, v))); - + m128 v = and128(loadu128(d), caseMask); + // mask out where we can't match u32 mask = (0xFFFF >> (16 - l)); - z &= mask; + u32 z = mask & movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); DOUBLE_ZSCAN(); return HWLM_SUCCESS; -} +}*/ static really_inline hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, + size_t len, size_t offset, m128 caseMask, m128 mask1, m128 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + offset; DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); size_t l = end - start; + u32 buf_off = start - offset; - m128 mask128 = noCase ? caseMask : ones128(); - m128 v = and128(loadu128(d), mask128); - - u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), - eq128(mask2, v))); + m128 v = and128(loadu128(d), caseMask); // mask out where we can't match - u32 buf_off = start - offset; u32 mask = ((1 << l) - 1) << buf_off; DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - z &= mask; + u32 z = mask & movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); DOUBLE_ZSCAN(); @@ -146,16 +134,14 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, static really_inline hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, + size_t len, m128 caseMask, m128 mask1, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; assert(d < e); - m128 mask128 = noCase ? caseMask : ones128(); for (; d < e; d += 16) { - m128 v = and128(load128(d), mask128); - + m128 v = and128(load128(d), caseMask); u32 z = movemask128(eq128(mask1, v)); // On large packet buffers, this prefetch appears to get us about 2%. @@ -168,16 +154,15 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, static really_inline hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, + size_t len, m128 caseMask, m128 mask1, m128 mask2, const struct cb_info *cbi, size_t start, size_t end) { const u8 *d = buf + start, *e = buf + end; assert(d < e); m128 lastz1 = zeroes128(); - m128 mask128 = noCase ? caseMask : ones128(); for (; d < e; d += 16) { - m128 v = and128(load128(d), mask128); + m128 v = and128(load128(d), caseMask); m128 z1 = eq128(mask1, v); m128 z2 = eq128(mask2, v); u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2));