From 1089fa501865844c7af945b537a7445a52b7f488 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Mon, 8 Aug 2016 10:55:52 +1000 Subject: [PATCH] avx512: noodle --- src/hwlm/noodle_engine.c | 49 +++++++- src/hwlm/noodle_engine_avx512.c | 193 ++++++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+), 6 deletions(-) create mode 100644 src/hwlm/noodle_engine_avx512.c diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index a30a59a5..24f78c8e 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -36,12 +36,14 @@ #include "util/arch.h" #include "util/bitutils.h" #include "util/compare.h" +#include "util/join.h" #include "util/masked_move.h" #include "util/simd_utils.h" #include #include #include +#include /** \brief Noodle runtime context. */ struct cb_info { @@ -51,6 +53,24 @@ struct cb_info { size_t offsetAdj; //!< used in streaming mode }; +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#define Z_BITS 64 +#define Z_TYPE u64a +#elif defined(HAVE_AVX2) +#define CHUNKSIZE 32 +#define MASK_TYPE m256 +#define Z_BITS 32 +#define Z_TYPE u32 +#else +#define CHUNKSIZE 16 +#define MASK_TYPE m128 +#define Z_BITS 32 +#define Z_TYPE u32 +#endif + + #define RETURN_IF_TERMINATED(x) \ { \ if ((x) == HWLM_TERMINATED) { \ @@ -61,8 +81,9 @@ struct cb_info { #define SINGLE_ZSCAN() \ do { \ while (unlikely(z)) { \ - u32 pos = findAndClearLSB_32(&z); \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ size_t matchPos = d - buf + pos; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ hwlmcb_rv_t rv = final(buf, len, key, 1, 0, 0, noCase, cbi, \ matchPos); \ RETURN_IF_TERMINATED(rv); \ @@ -72,8 +93,9 @@ struct cb_info { #define DOUBLE_ZSCAN() \ do { \ while (unlikely(z)) { \ - u32 pos = findAndClearLSB_32(&z); \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ size_t matchPos = d - buf + pos - 1; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ hwlmcb_rv_t rv = final(buf, len, key, keyLen, keyOffset, 1, \ noCase, cbi, matchPos); \ RETURN_IF_TERMINATED(rv); \ @@ -110,7 +132,11 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen, return HWLM_SUCCESS; } -#if defined(HAVE_AVX2) +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#include "noodle_engine_avx512.c" +#elif defined(HAVE_AVX2) #define CHUNKSIZE 32 #define MASK_TYPE m256 #include "noodle_engine_avx2.c" @@ -123,12 +149,14 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen, static really_inline hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key, bool noCase, const struct cb_info *cbi) { - hwlm_error_t rv; - size_t end = len; const MASK_TYPE mask1 = getMask(key[0], noCase); const MASK_TYPE caseMask = getCaseMask(); +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + size_t end = len; + if (len < CHUNKSIZE) { rv = scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, len); return rv; @@ -173,13 +201,15 @@ hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key, cbi, s2End, end); return rv; +#else // HAVE_AVX512 + return scanSingle512(buf, len, key, noCase, caseMask, mask1, cbi); +#endif } static really_inline hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, size_t keyLen, size_t keyOffset, bool noCase, const struct cb_info *cbi) { - hwlm_error_t rv; // we stop scanning for the key-fragment when the rest of the key can't // possibly fit in the remaining buffer size_t end = len - keyLen + keyOffset + 2; @@ -188,6 +218,9 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, const MASK_TYPE mask1 = getMask(key[keyOffset + 0], noCase); const MASK_TYPE mask2 = getMask(key[keyOffset + 1], noCase); +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + if (end - keyOffset < CHUNKSIZE) { rv = scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, mask1, mask2, cbi, keyOffset, end); @@ -244,6 +277,10 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, caseMask, mask1, mask2, cbi, off, end); return rv; +#else // AVX512 + return scanDouble512(buf, len, key, keyLen, keyOffset, noCase, caseMask, + mask1, mask2, cbi, keyOffset, end); +#endif // AVX512 } diff --git a/src/hwlm/noodle_engine_avx512.c b/src/hwlm/noodle_engine_avx512.c new file mode 100644 index 00000000..d4e6527f --- /dev/null +++ b/src/hwlm/noodle_engine_avx512.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* noodle scan parts for AVX512 */ + +static really_inline +m512 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set64x8(k); +} + +static really_inline +m512 getCaseMask(void) { + return set64x8(CASE_CLEAR); +} + +// The short scan routine. It is used both to scan data up to an +// alignment boundary if needed and to finish off data that the aligned scan +// function can't handle (due to small/unaligned chunk at end) +static really_inline +hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, + bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + DEBUG_PRINTF("scan_len %zu\n", scan_len); + assert(scan_len <= 64); + if (!scan_len) { + return HWLM_SUCCESS; + } + + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 v = loadu_maskz_m512(k, d); + + if (noCase) { + v = and512(v, caseMask); + } + + // reuse the load mask to indicate valid bytes + u64a z = masked_eq512mask(k, mask1, v); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingle512(const u8 *buf, size_t len, const u8 *key, + bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi) { + const u8 *d = buf; + const u8 *e = buf + len; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, + d1 - d) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p \n", d, e); + m512 v = noCase ? and512(load512(d), caseMask) : load512(d); + + u64a z = eq512mask(mask1, v); + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p \n", d, e); + // finish off tail + + return scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, d - buf, + e - buf); +} + +static really_inline +hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, + size_t keyLen, size_t keyOffset, bool noCase, + m512 caseMask, m512 mask1, m512 mask2, + const struct cb_info *cbi, u64a *lastz0, + size_t start, size_t end) { + DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0); + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + if (!scan_len) { + return HWLM_SUCCESS; + } + assert(scan_len <= 64); + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len); + + m512 v = loadu_maskz_m512(k, d); + if (noCase) { + v = and512(v, caseMask); + } + + u64a z0 = masked_eq512mask(k, mask1, v); + u64a z1 = masked_eq512mask(k, mask2, v); + u64a z = (*lastz0 | (z0 << 1)) & z1; + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + *lastz0 = z0 >> (scan_len - 1); + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDouble512(const u8 *buf, size_t len, const u8 *key, + size_t keyLen, size_t keyOffset, bool noCase, + m512 caseMask, m512 mask1, m512 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + const u8 *e = buf + end; + u64a lastz0 = 0; + DEBUG_PRINTF("start %zu end %zu \n", start, end); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, + mask1, mask2, cbi, &lastz0, start, + d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0); + m512 v = noCase ? and512(load512(d), caseMask) : load512(d); + + /* we have to pull the masks out of the AVX registers because we can't + byte shift between the lanes */ + u64a z0 = eq512mask(mask1, v); + u64a z1 = eq512mask(mask2, v); + u64a z = (lastz0 | (z0 << 1)) & z1; + lastz0 = z0 >> 63; + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 256); + + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf); + // finish off tail + + return scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, + mask1, mask2, cbi, &lastz0, d - buf, end); +}