From 87d8b357a98aad59e1a7d82795d6d895cbcf18ad Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Wed, 12 Nov 2025 14:49:41 +0200
Subject: [PATCH] Feature/refactor fdr (#251)

* remove the use of macros for critical loops, easier to debug

removed switch, merged get_conf_stride functions into 1

split FDR implementations into arch specific files (same for now)
---
 src/fdr/arm/fdr_impl.h             | 196 +++++++++++++++
 src/fdr/fdr.c                      | 370 +++----------------------
 src/fdr/fdr.h                      |   1 +
 src/fdr/fdr_impl.h                 | 119 ++++++++++
 src/fdr/ppc64le/fdr_impl.h         | 196 +++++++++++++++
 src/fdr/x86/fdr_impl.h             | 196 +++++++++++++++
 src/util/arch/arm/simd_utils.h     |  12 +
 src/util/arch/common/simd_utils.h  |   8 +
 src/util/arch/ppc64el/simd_utils.h |  10 +
 src/util/arch/x86/simd_utils.h     |  29 +++
 10 files changed, 798 insertions(+), 339 deletions(-)
 create mode 100644 src/fdr/arm/fdr_impl.h
 create mode 100644 src/fdr/fdr_impl.h
 create mode 100644 src/fdr/ppc64le/fdr_impl.h
 create mode 100644 src/fdr/x86/fdr_impl.h

diff --git a/src/fdr/arm/fdr_impl.h b/src/fdr/arm/fdr_impl.h
new file mode 100644
index 00000000..44f95d6a
--- /dev/null
+++ b/src/fdr/arm/fdr_impl.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#ifndef FDR_IMPL_ARM_H +#define FDR_IMPL_ARM_H + +static really_inline +void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + + // get_conf_stride_4 + u64a it_hi = *(const u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach8 = domain_mask & it_lo; + u64a reach12 = domain_mask & (it_lo >> 32); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st12 = load_m128_from_u64a(ft + reach12); + + st4 = lshiftbyte_m128(st4, 4); + st12 = lshiftbyte_m128(st12, 4); + + *s = or128(*s, st0); + *s = or128(*s, st4); + + if (stride == 4) { + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st12); + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; + return; + } + + // get_conf_stride_2 + u64a reach2 = domain_mask & (it_hi >> 16); + u64a reach6 = domain_mask & (it_hi >> 48); + u64a reach10 = domain_mask & (it_lo >> 16); + u64a reach14 = domain_mask & (it_lo >> 48); + + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st14 = load_m128_from_u64a(ft + reach14); + + st2 = lshiftbyte_m128(st2, 2); + st6 = lshiftbyte_m128(st6, 6); + st10 = lshiftbyte_m128(st10, 2); + st14 = lshiftbyte_m128(st14, 6); + + *s = or128(*s, st2); + *s = or128(*s, st6); + + if (stride == 2) { + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st10); + *s = or128(*s, st12); + *s = or128(*s, st14); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; + return; + } + + // get_conf_stride_1 + u64a reach1 = domain_mask & (it_hi >> 8); + u64a reach3 = domain_mask & (it_hi >> 24); + u64a reach5 = domain_mask & (it_hi >> 40); + u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); + u64a reach9 = domain_mask & (it_lo >> 8); + u64a reach11 = domain_mask & (it_lo >> 24); + u64a reach13 = domain_mask & (it_lo >> 40); + u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); + + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st3 = load_m128_from_u64a(ft + reach3); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st7 = load_m128_from_u64a(ft + reach7); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st11 = load_m128_from_u64a(ft + reach11); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st15 = load_m128_from_u64a(ft + reach15); + + st1 = lshiftbyte_m128(st1, 1); + st3 = lshiftbyte_m128(st3, 3); + st5 = lshiftbyte_m128(st5, 5); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st11 = lshiftbyte_m128(st11, 3); + st13 = lshiftbyte_m128(st13, 5); + st15 = lshiftbyte_m128(st15, 7); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); + + m128 st = or128(*s, st0); + *conf0 = movq(st) ^ ~0ULL; + st = rshiftbyte_m128(st, 8); + st = or128(st, st8); + + 
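/* st now covers bytes 8..15 of this block: zero bits mark (bucket, byte)
+     * pairs that may still match, so the 64-bit lane extracted below is
+     * XORed with ~0ULL to turn candidates into set bits that
+     * do_confirm_fdr() walks with findAndClearLSB_64(). */
+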
*conf8 = movq(st) ^ ~0ULL; + *s = rshiftbyte_m128(st, 8); +} + +static really_inline +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, + const u32 *confBase, const struct FDR_Runtime_Args *a, + const u8 *ptr, u32 *last_match_id, const struct zone *z) { + const u8 bucket = 8; + + if (likely(!*conf)) { + return; + } + + /* ptr is currently referring to a location in the zone's buffer, we also + * need a pointer in the original, main buffer for the final string compare. + */ + const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr) + + const u8 *confLoc = ptr; + + do { + u32 bit = findAndClearLSB_64(conf); + u32 byte = bit / bucket + offset; + u32 bitRem = bit % bucket; + u32 idx = bitRem; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); + confWithBit(fdrc, a, ptr_main - a->buf + byte, control, + last_match_id, confVal, conf, bit); + } while (unlikely(!!*conf)); +} + +#endif // FDR_IMPL_ARM_H \ No newline at end of file diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 7aa22ef5..fad561fe 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2025, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +32,7 @@ #include "fdr_confirm_runtime.h" #include "fdr_internal.h" #include "fdr_loadval.h" +#include "fdr_impl.h" #include "flood_runtime.h" #include "scratch.h" #include "teddy.h" @@ -40,86 +42,6 @@ #include "util/simd_utils.h" #include "util/uniform_ops.h" -/** \brief number of bytes processed in each iteration */ -#define ITER_BYTES 16 - -/** \brief total zone buffer size */ -#define ZONE_TOTAL_SIZE 64 - -/** \brief maximum number of allowed zones */ -#define ZONE_MAX 3 - -/** \brief zone information. - * - * Zone represents a region of data to scan in FDR. - * - * The incoming buffer is to split in multiple zones to ensure two properties: - * 1: that we can read 8? bytes behind to generate a hash safely - * 2: that we can read the 3 byte after the current byte (domain > 8) - */ -struct zone { - /** \brief copied buffer, used only when it is a boundary zone. */ - u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE]; - - /** \brief shift amount for fdr state to avoid unwanted match. */ - u8 shift; - - /** \brief if boundary zone, start points into the zone buffer after the - * pre-padding. Otherwise, points to the main buffer, appropriately. */ - const u8 *start; - - /** \brief if boundary zone, end points to the end of zone. Otherwise, - * pointer to the main buffer, appropriately. */ - const u8 *end; - - /** \brief the amount to adjust to go from a pointer in the zones region - * (between start and end) to a pointer in the original data buffer. */ - ptrdiff_t zone_pointer_adjust; - - /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones, - * otherwise end of the zone buf. floodPtr always points inside the same - * buffer as the start pointe. 
*/ - const u8 *floodPtr; -}; - -static -const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { - { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, - { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } -}; - /* generates an initial state mask based on the last byte-ish of history rather * than being all accepting. 
If there is no history to consider, the state is * generated based on the minimum length of each bucket in order to prevent @@ -141,197 +63,6 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } -static really_inline -void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, - const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { - /* +1: the zones ensure that we can read the byte at z->end */ - assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a domain_mask = ~domain_mask_flipped; - - u64a it_hi = *(const u64a *)itPtr; - u64a it_lo = *(const u64a *)(itPtr + 8); - u64a reach0 = domain_mask & it_hi; - u64a reach1 = domain_mask & (it_hi >> 8); - u64a reach2 = domain_mask & (it_hi >> 16); - u64a reach3 = domain_mask & (it_hi >> 24); - u64a reach4 = domain_mask & (it_hi >> 32); - u64a reach5 = domain_mask & (it_hi >> 40); - u64a reach6 = domain_mask & (it_hi >> 48); - u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); - u64a reach8 = domain_mask & it_lo; - u64a reach9 = domain_mask & (it_lo >> 8); - u64a reach10 = domain_mask & (it_lo >> 16); - u64a reach11 = domain_mask & (it_lo >> 24); - u64a reach12 = domain_mask & (it_lo >> 32); - u64a reach13 = domain_mask & (it_lo >> 40); - u64a reach14 = domain_mask & (it_lo >> 48); - u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); - - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1); - m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2); - m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3); - m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4); - m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5); - m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6); - m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7); - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1); - m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2); - m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3); - m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4); - m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5); - m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6); - m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7); - - st0 = or128(st0, st1); - st2 = or128(st2, st3); - st4 = or128(st4, st5); - st6 = or128(st6, st7); - st0 = or128(st0, st2); - st4 = or128(st4, st6); - st0 = or128(st0, st4); - - st8 = or128(st8, st9); - st10 = or128(st10, st11); - st12 = or128(st12, st13); - st14 = or128(st14, st15); - st8 = or128(st8, st10); - st12 = or128(st12, st14); - st8 = or128(st8, st12); - - m128 st = or128(*s, st0); - *conf0 = movq(st) ^ ~0ULL; - st = rshiftbyte_m128(st, 8); - st = or128(st, st8); - - *conf8 = movq(st) ^ ~0ULL; - *s = rshiftbyte_m128(st, 8); -} - -static really_inline -void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, - const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { - assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach2 = andn(domain_mask_flipped, itPtr + 2); - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach6 = andn(domain_mask_flipped, itPtr + 6); - - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st2 
= load_m128_from_u64a(ft + reach2); - m128 st4 = load_m128_from_u64a(ft + reach4); - m128 st6 = load_m128_from_u64a(ft + reach6); - - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach10 = andn(domain_mask_flipped, itPtr + 10); - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); - u64a reach14 = andn(domain_mask_flipped, itPtr + 14); - - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st10 = load_m128_from_u64a(ft + reach10); - m128 st12 = load_m128_from_u64a(ft + reach12); - m128 st14 = load_m128_from_u64a(ft + reach14); - - st2 = lshiftbyte_m128(st2, 2); - st4 = lshiftbyte_m128(st4, 4); - st6 = lshiftbyte_m128(st6, 6); - - *s = or128(*s, st0); - *s = or128(*s, st2); - *s = or128(*s, st4); - *s = or128(*s, st6); - - *conf0 = movq(*s); - *s = rshiftbyte_m128(*s, 8); - *conf0 ^= ~0ULL; - - st10 = lshiftbyte_m128(st10, 2); - st12 = lshiftbyte_m128(st12, 4); - st14 = lshiftbyte_m128(st14, 6); - - *s = or128(*s, st8); - *s = or128(*s, st10); - *s = or128(*s, st12); - *s = or128(*s, st14); - - *conf8 = movq(*s); - *s = rshiftbyte_m128(*s, 8); - *conf8 ^= ~0ULL; -} - -static really_inline -void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, - const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { - assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); - - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st4 = load_m128_from_u64a(ft + reach4); - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st12 = load_m128_from_u64a(ft + reach12); - - st4 = lshiftbyte_m128(st4, 4); - st12 = lshiftbyte_m128(st12, 4); - - *s = or128(*s, st0); - *s = or128(*s, st4); - *conf0 = movq(*s); - *s = rshiftbyte_m128(*s, 8); - *conf0 ^= ~0ULL; - - *s = or128(*s, st8); - *s = or128(*s, st12); - *conf8 = movq(*s); - *s = rshiftbyte_m128(*s, 8); - *conf8 ^= ~0ULL; -} - -static really_inline -void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, - const u32 *confBase, const struct FDR_Runtime_Args *a, - const u8 *ptr, u32 *last_match_id, const struct zone *z) { - const u8 bucket = 8; - - if (likely(!*conf)) { - return; - } - - /* ptr is currently referring to a location in the zone's buffer, we also - * need a pointer in the original, main buffer for the final string compare. 
- */ - const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr) - - const u8 *confLoc = ptr; - - do { - u32 bit = findAndClearLSB_64(conf); - u32 byte = bit / bucket + offset; - u32 bitRem = bit % bucket; - u32 idx = bitRem; - u32 cf = confBase[idx]; - if (!cf) { - continue; - } - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - continue; - } - u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); - confWithBit(fdrc, a, ptr_main - a->buf + byte, control, - last_match_id, confVal, conf, bit); - } while (unlikely(!!*conf)); -} - static really_inline void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) { #ifdef DEBUG @@ -660,41 +391,6 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, #define INVALID_MATCH_ID (~0U) -#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ - do { \ - const u8 *tryFloodDetect = zz->floodPtr; \ - const u8 *start_ptr = zz->start; \ - const u8 *end_ptr = zz->end; \ - for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \ - itPtr += 4*ITER_BYTES) { \ - __builtin_prefetch(itPtr); \ - } \ - \ - for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ - itPtr += ITER_BYTES) { \ - if (unlikely(itPtr > tryFloodDetect)) { \ - tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ - &floodBackoff, &control, \ - ITER_BYTES); \ - if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - return HWLM_TERMINATED; \ - } \ - } \ - __builtin_prefetch(itPtr + ITER_BYTES); \ - u64a conf0; \ - u64a conf8; \ - get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ - ft, &conf0, &conf8, &s); \ - do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ - &last_match_id, zz); \ - do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ - &last_match_id, zz); \ - if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - return HWLM_TERMINATED; \ - } \ - } /* end for loop */ \ - } while (0) \ - static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, const struct FDR_Runtime_Args *a, @@ -703,8 +399,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; - u32 domain_mask_flipped = ~fdr->domainMask; - u8 stride = fdr->stride; + const u64a *ft = (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); assert(ISALIGNED_CL(ft)); @@ -722,42 +417,39 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, for (size_t curZone = 0; curZone < numZone; curZone++) { struct zone *z = &zones[curZone]; - dumpZoneInfo(z, curZone); + m128 zone_mask = load128(zone_or_mask[z->shift]); - /* When a zone contains less data than is processed in an iteration - * of FDR_MAIN_LOOP(), we need to scan over some extra data. - * - * We have chosen to scan this extra data at the start of the - * iteration. The extra data is either data we have already scanned or - * garbage (if it is earlier than offset 0), - * - * As a result we need to shift the incoming state back so that it will - * properly line up with the data being scanned. - * - * We also need to forbid reporting any matches in the data being - * rescanned as they have already been reported (or are over garbage but - * later stages should also provide that safety guarantee). 
- */ + const u8 *cacheline = ROUNDDOWN_PTR(z->start, 64); + __builtin_prefetch(cacheline); - u8 shift = z->shift; + const u8 *tryFloodDetect = z->floodPtr; - state = variable_byte_shift_m128(state, shift); + state = variable_byte_shift_m128(state, z->shift); + state = or128(state, zone_mask); - state = or128(state, load128(zone_or_mask[shift])); + for (const u8 *itPtr = z->start; itPtr + ITER_BYTES <= z->end; itPtr += ITER_BYTES) { + if (unlikely(itPtr > tryFloodDetect)) { + tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect, + &floodBackoff, &control, + ITER_BYTES); + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { + return HWLM_TERMINATED; + } + } + u64a conf0; + u64a conf8; - switch (stride) { - case 1: - FDR_MAIN_LOOP(z, state, get_conf_stride_1); - break; - case 2: - FDR_MAIN_LOOP(z, state, get_conf_stride_2); - break; - case 4: - FDR_MAIN_LOOP(z, state, get_conf_stride_4); - break; - default: - break; - } + cacheline += 64; + __builtin_prefetch(cacheline); + + get_conf_stride(itPtr, z->start, z->end, fdr->domainMask, fdr->stride, ft, &conf0, &conf8, &state); + + do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, &last_match_id, z); + do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, &last_match_id, z); + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { + return HWLM_TERMINATED; + } + } /* end for loop */ } return HWLM_SUCCESS; diff --git a/src/fdr/fdr.h b/src/fdr/fdr.h index 4dcef851..f6cf8f90 100644 --- a/src/fdr/fdr.h +++ b/src/fdr/fdr.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2025, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/fdr/fdr_impl.h b/src/fdr/fdr_impl.h new file mode 100644 index 00000000..f7b755b6 --- /dev/null +++ b/src/fdr/fdr_impl.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020-2025, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \brief number of bytes processed in each iteration */ +#define ITER_BYTES 16 + +/** \brief total zone buffer size */ +#define ZONE_TOTAL_SIZE 64 + +/** \brief maximum number of allowed zones */ +#define ZONE_MAX 3 + +/** \brief zone information. + * + * Zone represents a region of data to scan in FDR. + * + * The incoming buffer is to split in multiple zones to ensure two properties: + * 1: that we can read 8? bytes behind to generate a hash safely + * 2: that we can read the 3 byte after the current byte (domain > 8) + */ +struct zone { + /** \brief copied buffer, used only when it is a boundary zone. */ + u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE]; + + /** \brief shift amount for fdr state to avoid unwanted match. */ + u8 shift; + + /** \brief if boundary zone, start points into the zone buffer after the + * pre-padding. Otherwise, points to the main buffer, appropriately. */ + const u8 *start; + + /** \brief if boundary zone, end points to the end of zone. Otherwise, + * pointer to the main buffer, appropriately. */ + const u8 *end; + + /** \brief the amount to adjust to go from a pointer in the zones region + * (between start and end) to a pointer in the original data buffer. */ + ptrdiff_t zone_pointer_adjust; + + /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones, + * otherwise end of the zone buf. floodPtr always points inside the same + * buffer as the start pointe. */ + const u8 *floodPtr; +}; + +static +const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } +}; + +#if defined(VS_SIMDE_BACKEND) +#include "x86/fdr_impl.h" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "x86/fdr_impl.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "arm/fdr_impl.h" +#elif 
defined(ARCH_PPC64EL) +#include "ppc64le/fdr_impl.h" +#endif +#endif \ No newline at end of file diff --git a/src/fdr/ppc64le/fdr_impl.h b/src/fdr/ppc64le/fdr_impl.h new file mode 100644 index 00000000..a6aedb53 --- /dev/null +++ b/src/fdr/ppc64le/fdr_impl.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2025, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FDR_IMPL_PPC64LE_H +#define FDR_IMPL_PPC64LE_H + +static really_inline +void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + + // get_conf_stride_4 + u64a it_hi = *(const u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach8 = domain_mask & it_lo; + u64a reach12 = domain_mask & (it_lo >> 32); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st12 = load_m128_from_u64a(ft + reach12); + + st4 = lshiftbyte_m128(st4, 4); + st12 = lshiftbyte_m128(st12, 4); + + *s = or128(*s, st0); + *s = or128(*s, st4); + + if (stride == 4) { + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st12); + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; + return; + } + + // get_conf_stride_2 + u64a reach2 = domain_mask & (it_hi >> 16); + u64a reach6 = domain_mask & (it_hi >> 48); + u64a reach10 = domain_mask & (it_lo >> 16); + u64a reach14 = domain_mask & (it_lo >> 48); + + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st14 = load_m128_from_u64a(ft + reach14); + + st2 = lshiftbyte_m128(st2, 2); + st6 = lshiftbyte_m128(st6, 6); + st10 = lshiftbyte_m128(st10, 2); + st14 = lshiftbyte_m128(st14, 6); + + *s = or128(*s, st2); + *s = or128(*s, st6); + + if (stride == 2) { + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st10); + *s = or128(*s, st12); + *s = or128(*s, st14); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; + return; + } + + // get_conf_stride_1 + u64a reach1 = domain_mask & (it_hi >> 8); + u64a reach3 = domain_mask & (it_hi >> 24); + u64a reach5 = domain_mask & (it_hi >> 40); + u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); + u64a reach9 = domain_mask & (it_lo >> 8); + u64a reach11 = domain_mask & (it_lo >> 24); + u64a reach13 = domain_mask & (it_lo >> 40); + u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); + + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st3 = load_m128_from_u64a(ft + reach3); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st7 = load_m128_from_u64a(ft + reach7); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st11 = load_m128_from_u64a(ft + reach11); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st15 = load_m128_from_u64a(ft + reach15); + + st1 = lshiftbyte_m128(st1, 1); + st3 = lshiftbyte_m128(st3, 3); + st5 = lshiftbyte_m128(st5, 5); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st11 = lshiftbyte_m128(st11, 3); + st13 = lshiftbyte_m128(st13, 5); + st15 = lshiftbyte_m128(st15, 7); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); + + m128 st = or128(*s, st0); + *conf0 = movq(st) ^ ~0ULL; + st = rshiftbyte_m128(st, 8); + st = or128(st, 
st8); + + *conf8 = movq(st) ^ ~0ULL; + *s = rshiftbyte_m128(st, 8); +} + +static really_inline +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, + const u32 *confBase, const struct FDR_Runtime_Args *a, + const u8 *ptr, u32 *last_match_id, const struct zone *z) { + const u8 bucket = 8; + + if (likely(!*conf)) { + return; + } + + /* ptr is currently referring to a location in the zone's buffer, we also + * need a pointer in the original, main buffer for the final string compare. + */ + const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr) + + const u8 *confLoc = ptr; + + do { + u32 bit = findAndClearLSB_64(conf); + u32 byte = bit / bucket + offset; + u32 bitRem = bit % bucket; + u32 idx = bitRem; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); + confWithBit(fdrc, a, ptr_main - a->buf + byte, control, + last_match_id, confVal, conf, bit); + } while (unlikely(!!*conf)); +} + +#endif // FDR_IMPL_PPC64LE_H \ No newline at end of file diff --git a/src/fdr/x86/fdr_impl.h b/src/fdr/x86/fdr_impl.h new file mode 100644 index 00000000..bfca6315 --- /dev/null +++ b/src/fdr/x86/fdr_impl.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2025, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FDR_IMPL_X86_H +#define FDR_IMPL_X86_H + +static really_inline +void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + + // get_conf_stride_4 + u64a it_hi = *(const u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach8 = domain_mask & it_lo; + u64a reach12 = domain_mask & (it_lo >> 32); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st12 = load_m128_from_u64a(ft + reach12); + + st4 = lshiftbyte_m128(st4, 4); + st12 = lshiftbyte_m128(st12, 4); + + *s = or128(*s, st0); + *s = or128(*s, st4); + + if (stride == 4) { + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st12); + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; + return; + } + + // get_conf_stride_2 + u64a reach2 = domain_mask & (it_hi >> 16); + u64a reach6 = domain_mask & (it_hi >> 48); + u64a reach10 = domain_mask & (it_lo >> 16); + u64a reach14 = domain_mask & (it_lo >> 48); + + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st14 = load_m128_from_u64a(ft + reach14); + + st2 = lshiftbyte_m128(st2, 2); + st6 = lshiftbyte_m128(st6, 6); + st10 = lshiftbyte_m128(st10, 2); + st14 = lshiftbyte_m128(st14, 6); + + *s = or128(*s, st2); + *s = or128(*s, st6); + + if (stride == 2) { + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st10); + *s = or128(*s, st12); + *s = or128(*s, st14); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; + return; + } + + // get_conf_stride_1 + u64a reach1 = domain_mask & (it_hi >> 8); + u64a reach3 = domain_mask & (it_hi >> 24); + u64a reach5 = domain_mask & (it_hi >> 40); + u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); + u64a reach9 = domain_mask & (it_lo >> 8); + u64a reach11 = domain_mask & (it_lo >> 24); + u64a reach13 = domain_mask & (it_lo >> 40); + u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); + + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st3 = load_m128_from_u64a(ft + reach3); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st7 = load_m128_from_u64a(ft + reach7); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st11 = load_m128_from_u64a(ft + reach11); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st15 = load_m128_from_u64a(ft + reach15); + + st1 = lshiftbyte_m128(st1, 1); + st3 = lshiftbyte_m128(st3, 3); + st5 = lshiftbyte_m128(st5, 5); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st11 = lshiftbyte_m128(st11, 3); + st13 = lshiftbyte_m128(st13, 5); + st15 = lshiftbyte_m128(st15, 7); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); + + m128 st = or128(*s, st0); + *conf0 = movq(st) ^ ~0ULL; + st = rshiftbyte_m128(st, 8); + st = or128(st, st8); + + 
*conf8 = movq(st) ^ ~0ULL; + *s = rshiftbyte_m128(st, 8); +} + +static really_inline +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, + const u32 *confBase, const struct FDR_Runtime_Args *a, + const u8 *ptr, u32 *last_match_id, const struct zone *z) { + const u8 bucket = 8; + + if (likely(!*conf)) { + return; + } + + /* ptr is currently referring to a location in the zone's buffer, we also + * need a pointer in the original, main buffer for the final string compare. + */ + const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr) + + const u8 *confLoc = ptr; + + do { + u32 bit = findAndClearLSB_64(conf); + u32 byte = bit / bucket + offset; + u32 bitRem = bit % bucket; + u32 idx = bitRem; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); + confWithBit(fdrc, a, ptr_main - a->buf + byte, control, + last_match_id, confVal, conf, bit); + } while (unlikely(!!*conf)); +} + +#endif // FDR_IMPL_X86_H \ No newline at end of file diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index c301f09f..45c00a2c 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -181,6 +181,8 @@ static really_inline m128 set1_2x64(u64a c) { return (m128) vdupq_n_u64(c); } +#define insert32_m128(in, val, imm) ((m128) vsetq_lane_u32(val, (uint32x4_t)in, imm)) + static really_inline u32 movd(const m128 in) { return vgetq_lane_u32((uint32x4_t) in, 0); } @@ -449,4 +451,14 @@ m128 set2x64(u64a hi, u64a lo) { return (m128) vld1q_u64((uint64_t *) data); } +static really_inline +m128 widenlo128(m128 x) { + return (m128) vmovl_u32(vget_low_u32((uint32x4_t)x)); +} + +static really_inline +m128 widenhi128(m128 x) { + return (m128) vmovl_u32(vget_high_u32((uint32x4_t)x)); +} + #endif // ARCH_ARM_SIMD_UTILS_H diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 6f091bc7..109b1158 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -388,6 +388,14 @@ m256 pshufb_m256(m256 a, m256 b) { return rv; } +static really_inline +m256 widen128(m128 x) { + m256 rv; + rv.lo = widenlo128(x); + rv.hi = widenhi128(x); + return rv; +} + #endif // HAVE_SIMD_256_BITS /**** diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 9b6f7539..6d2c5d9e 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -429,6 +429,16 @@ m128 set2x64(u64a hi, u64a lo) { return (m128) v; } +static really_inline +m128 widenlo128(m128 x) { + return (m128) vec_mergel((m128)x, zeroes128()); +} + +static really_inline +m128 widenhi128(m128 x) { + return (m128) vec_mergeh((m128)x, zeroes128()); +} + #if defined(__clang__) && (__clang_major__ == 15) #pragma clang diagnostic pop #endif // defined(__clang__) && (__clang_major__ == 15) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 3fbe3f16..4df74008 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -123,6 +123,17 @@ m128 sub_2x64(m128 a, m128 b) { return (m128) _mm_sub_epi64(a, b); } +static really_really_inline +m128 lshift32_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi32(a, b); + } +#endif + m128 x = 
_mm_cvtsi32_si128(b);
+    return _mm_sll_epi32(a, x);
+}
+
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
@@ -158,6 +169,8 @@ static really_inline m128 set1_2x64(u64a c) {
     return _mm_set1_epi64x(c);
 }
 
+#define insert32_m128(in, val, imm) ((m128) _mm_insert_epi32((m128)(in), (val), (imm)))
+
 static really_inline u32 movd(const m128 in) {
     return _mm_cvtsi128_si32(in);
 }
@@ -474,6 +487,16 @@ m128 set2x64(u64a hi, u64a lo) {
     return _mm_set_epi64x(hi, lo);
 }
 
+static really_inline
+m128 widenlo128(m128 x) {
+    return _mm_unpacklo_epi32(x, zeroes128());
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+    return _mm_unpackhi_epi32(x, zeroes128());
+}
+
 /****
  **** 256-bit Primitives
  ****/
@@ -750,6 +773,12 @@ m256 combine2x128(m128 hi, m128 lo) {
     return insert128to256(cast128to256(lo), hi, 1);
 #endif
 }
+
+static really_inline
+m256 widen128(m128 x) {
+    return (m256) _mm256_cvtepu32_epi64(x);
+}
+
 #endif //AVX2
 
 /****
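
The widen helpers added to the backends above share one contract: zero-extend the four u32 lanes of an m128 into u64 lanes, with widenlo128 taking lanes 0 and 1, widenhi128 lanes 2 and 3, and widen128 fusing both into an m256 (AVX2: _mm256_cvtepu32_epi64). A minimal scalar sketch of that contract, usable as a test oracle; widen_model and the plain arrays standing in for the SIMD types are illustrative, not part of the patch:

#include <stdint.h>

/* Scalar model of widenlo128/widenhi128/widen128: zero-extend the four
 * 32-bit lanes of a 128-bit vector into 64-bit lanes, assuming every
 * backend implements the same lane order. */
static void widen_model(const uint32_t in[4], uint64_t lo[2], uint64_t hi[2]) {
    lo[0] = (uint64_t)in[0];  /* widenlo128: lanes 0,1 */
    lo[1] = (uint64_t)in[1];
    hi[0] = (uint64_t)in[2];  /* widenhi128: lanes 2,3 */
    hi[1] = (uint64_t)in[3];
}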