From ec5531a6b185c58f724401ef220ff6aef45170eb Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 16 Mar 2021 17:47:00 +0200 Subject: [PATCH] minor optimizations --- src/hwlm/noodle_engine.c | 29 +++++++++++++---------------- src/hwlm/noodle_engine_sse.c | 17 +++++++++-------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index 894a9f49..bc81982a 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -119,9 +119,9 @@ match: static really_really_inline hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, - Z_TYPE z, size_t len, const struct cb_info *cbi) { - while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE *z, size_t len, const struct cb_info *cbi) { + while (unlikely(*z)) { + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z); size_t matchPos = d - buf + pos; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); @@ -132,9 +132,9 @@ hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, static really_really_inline hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, - Z_TYPE z, size_t len, const struct cb_info *cbi) { - while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE *z, size_t len, const struct cb_info *cbi) { + while (unlikely(*z)) { + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(z); size_t matchPos = d - buf + pos - 1; \ DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); @@ -174,16 +174,12 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, hwlm_error_t rv; if (end - offset <= CHUNKSIZE) { - rv = scanSingleUnaligned(n, buf, len, offset, caseMask, mask1, + return scanSingleUnaligned(n, buf, len, offset, caseMask, mask1, cbi, offset, end); - return rv; } uintptr_t data = (uintptr_t)buf; uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; - uintptr_t last = data + end; - uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; - uintptr_t s3Start = end - CHUNKSIZE; if (offset != s2Start) { // first scan out to the fast scan starting point @@ -192,6 +188,8 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, cbi, offset, s2Start); RETURN_IF_TERMINATED(rv); } + uintptr_t last = data + end; + uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; if (likely(s2Start != s2End)) { // scan as far as we can, bounded by the last point this key can @@ -208,7 +206,7 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, } DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len); - rv = scanSingleUnaligned(n, buf, len, s3Start, caseMask, mask1, cbi, + rv = scanSingleUnaligned(n, buf, len, s2End, caseMask, mask1, cbi, s2End, len); return rv; @@ -226,7 +224,6 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, // the first place the key can match size_t offset = start + n->msk_len - n->key_offset; - hwlm_error_t rv; if (end - offset <= CHUNKSIZE) { @@ -238,9 +235,6 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, uintptr_t data = (uintptr_t)buf; uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; uintptr_t s1End = s2Start + 1; - uintptr_t last = data + end; - uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; - uintptr_t s3Start = end - CHUNKSIZE; uintptr_t off = offset; if (s2Start != off) { @@ -252,6 +246,9 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, RETURN_IF_TERMINATED(rv); } off = s1End; + uintptr_t last = data + end; + uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; + uintptr_t s3Start = end - CHUNKSIZE; if (s2Start >= end) { DEBUG_PRINTF("s2 == mL %zu\n", end); diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 501aea85..e1da2083 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -53,7 +53,7 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, u32 z = mask & movemask128(eq128(mask1, v)); DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - return single_zscan(n, d, buf, z, len, cbi); + return single_zscan(n, d, buf, &z, len, cbi); } static really_inline @@ -71,11 +71,10 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, // mask out where we can't match u32 mask = ((1 << l) - 1) << buf_off; - u32 z = mask & movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), - eq128(mask2, v))); + u32 z = mask & movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), eq128(mask2, v))); DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - return double_zscan(n, d, buf, z, len, cbi); + return double_zscan(n, d, buf, &z, len, cbi); } static really_inline @@ -86,15 +85,16 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, const u8 *d = buf + start, *e = buf + end; assert(d < e); + const u8 *base = ROUNDDOWN_PTR(d, 64); for (; d < e; d += 16) { m128 v = and128(load128(d), caseMask); u32 z = movemask128(eq128(mask1, v)); // On large packet buffers, this prefetch appears to get us about 2%. - __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64)); + __builtin_prefetch(base + 128); DEBUG_PRINTF("z 0x%08x\n", z); - hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi); + hwlm_error_t result = single_zscan(n, d, buf, &z, len, cbi); if (unlikely(result != HWLM_SUCCESS)) return result; } @@ -110,6 +110,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, assert(d < e); m128 lastz1 = zeroes128(); + const u8 *base = ROUNDDOWN_PTR(d, 64); for (; d < e; d += 16) { m128 v = and128(load128(d), caseMask); m128 z1 = eq128(mask1, v); @@ -118,10 +119,10 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, lastz1 = z1; // On large packet buffers, this prefetch appears to get us about 2%. - __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64)); + __builtin_prefetch(base + 128); DEBUG_PRINTF("z 0x%08x\n", z); - hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi); + hwlm_error_t result = double_zscan(n, d, buf, &z, len, cbi); if (unlikely(result != HWLM_SUCCESS)) return result;