Mirror of https://github.com/VectorCamp/vectorscan.git (synced 2025-11-15 17:02:14 +03:00)

Feature/refactor fdr (#251)

* Remove the use of macros for the critical loops, making them easier to debug.
* Remove the stride switch and merge the get_conf_stride functions into one.
* Split the FDR implementation into arch-specific files (identical for now).

Committed via GitHub. Parent commit: eaa8f91c95. This commit: 87d8b357a9.
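For orientation before the file-by-file diff, here is the shape of the change, condensed from the hunks below (excerpts only, not compilable on their own): the per-stride loop macro and its dispatch switch are replaced by a plain loop around a single merged function that takes the stride as a runtime parameter and returns early for the coarser strides, since the stride-4 work is a prefix of the stride-2 work, which is a prefix of the stride-1 work.

```c
/* Before: a switch over the stride selecting one of three near-identical
 * helpers, each expanded through the FDR_MAIN_LOOP() macro. */
switch (stride) {
case 1: FDR_MAIN_LOOP(z, state, get_conf_stride_1); break;
case 2: FDR_MAIN_LOOP(z, state, get_conf_stride_2); break;
case 4: FDR_MAIN_LOOP(z, state, get_conf_stride_4); break;
default: break;
}

/* After: one ordinary loop per zone, calling the merged helper with the
 * stride as a plain argument. */
for (const u8 *itPtr = z->start; itPtr + ITER_BYTES <= z->end;
     itPtr += ITER_BYTES) {
    get_conf_stride(itPtr, z->start, z->end, fdr->domainMask, fdr->stride,
                    ft, &conf0, &conf8, &state);
    /* ... flood detection and confirm calls elided ... */
}
```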
196  src/fdr/arm/fdr_impl.h  (new file)

/*
 * Copyright (c) 2015-2017, Intel Corporation
 * Copyright (c) 2020-2025, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef FDR_IMPL_ARM_H
#define FDR_IMPL_ARM_H

static really_inline
void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
                     UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
                     const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);

    // get_conf_stride_4
    u64a it_hi = *(const u64a *)itPtr;
    u64a it_lo = *(const u64a *)(itPtr + 8);
    u64a reach0 = domain_mask & it_hi;
    u64a reach4 = domain_mask & (it_hi >> 32);
    u64a reach8 = domain_mask & it_lo;
    u64a reach12 = domain_mask & (it_lo >> 32);

    m128 st0 = load_m128_from_u64a(ft + reach0);
    m128 st4 = load_m128_from_u64a(ft + reach4);
    m128 st8 = load_m128_from_u64a(ft + reach8);
    m128 st12 = load_m128_from_u64a(ft + reach12);

    st4 = lshiftbyte_m128(st4, 4);
    st12 = lshiftbyte_m128(st12, 4);

    *s = or128(*s, st0);
    *s = or128(*s, st4);

    if (stride == 4) {
        *conf0 = movq(*s);
        *s = rshiftbyte_m128(*s, 8);
        *conf0 ^= ~0ULL;

        *s = or128(*s, st8);
        *s = or128(*s, st12);
        *conf8 = movq(*s);
        *s = rshiftbyte_m128(*s, 8);
        *conf8 ^= ~0ULL;
        return;
    }

    // get_conf_stride_2
    u64a reach2 = domain_mask & (it_hi >> 16);
    u64a reach6 = domain_mask & (it_hi >> 48);
    u64a reach10 = domain_mask & (it_lo >> 16);
    u64a reach14 = domain_mask & (it_lo >> 48);

    m128 st2 = load_m128_from_u64a(ft + reach2);
    m128 st6 = load_m128_from_u64a(ft + reach6);
    m128 st10 = load_m128_from_u64a(ft + reach10);
    m128 st14 = load_m128_from_u64a(ft + reach14);

    st2 = lshiftbyte_m128(st2, 2);
    st6 = lshiftbyte_m128(st6, 6);
    st10 = lshiftbyte_m128(st10, 2);
    st14 = lshiftbyte_m128(st14, 6);

    *s = or128(*s, st2);
    *s = or128(*s, st6);

    if (stride == 2) {
        *conf0 = movq(*s);
        *s = rshiftbyte_m128(*s, 8);
        *conf0 ^= ~0ULL;

        *s = or128(*s, st8);
        *s = or128(*s, st10);
        *s = or128(*s, st12);
        *s = or128(*s, st14);

        *conf8 = movq(*s);
        *s = rshiftbyte_m128(*s, 8);
        *conf8 ^= ~0ULL;
        return;
    }

    // get_conf_stride_1
    u64a reach1 = domain_mask & (it_hi >> 8);
    u64a reach3 = domain_mask & (it_hi >> 24);
    u64a reach5 = domain_mask & (it_hi >> 40);
    u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
    u64a reach9 = domain_mask & (it_lo >> 8);
    u64a reach11 = domain_mask & (it_lo >> 24);
    u64a reach13 = domain_mask & (it_lo >> 40);
    u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);

    m128 st1 = load_m128_from_u64a(ft + reach1);
    m128 st3 = load_m128_from_u64a(ft + reach3);
    m128 st5 = load_m128_from_u64a(ft + reach5);
    m128 st7 = load_m128_from_u64a(ft + reach7);
    m128 st9 = load_m128_from_u64a(ft + reach9);
    m128 st11 = load_m128_from_u64a(ft + reach11);
    m128 st13 = load_m128_from_u64a(ft + reach13);
    m128 st15 = load_m128_from_u64a(ft + reach15);

    st1 = lshiftbyte_m128(st1, 1);
    st3 = lshiftbyte_m128(st3, 3);
    st5 = lshiftbyte_m128(st5, 5);
    st7 = lshiftbyte_m128(st7, 7);
    st9 = lshiftbyte_m128(st9, 1);
    st11 = lshiftbyte_m128(st11, 3);
    st13 = lshiftbyte_m128(st13, 5);
    st15 = lshiftbyte_m128(st15, 7);

    st0 = or128(st0, st1);
    st2 = or128(st2, st3);
    st4 = or128(st4, st5);
    st6 = or128(st6, st7);
    st0 = or128(st0, st2);
    st4 = or128(st4, st6);
    st0 = or128(st0, st4);

    st8 = or128(st8, st9);
    st10 = or128(st10, st11);
    st12 = or128(st12, st13);
    st14 = or128(st14, st15);
    st8 = or128(st8, st10);
    st12 = or128(st12, st14);
    st8 = or128(st8, st12);

    m128 st = or128(*s, st0);
    *conf0 = movq(st) ^ ~0ULL;
    st = rshiftbyte_m128(st, 8);
    st = or128(st, st8);

    *conf8 = movq(st) ^ ~0ULL;
    *s = rshiftbyte_m128(st, 8);
}

static really_inline
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
                    const u32 *confBase, const struct FDR_Runtime_Args *a,
                    const u8 *ptr, u32 *last_match_id, const struct zone *z) {
    const u8 bucket = 8;

    if (likely(!*conf)) {
        return;
    }

    /* ptr is currently referring to a location in the zone's buffer, we also
     * need a pointer in the original, main buffer for the final string compare.
     */
    const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)

    const u8 *confLoc = ptr;

    do {
        u32 bit = findAndClearLSB_64(conf);
        u32 byte = bit / bucket + offset;
        u32 bitRem = bit % bucket;
        u32 idx = bitRem;
        u32 cf = confBase[idx];
        if (!cf) {
            continue;
        }
        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                        ((const u8 *)confBase + cf);
        if (!(fdrc->groups & *control)) {
            continue;
        }
        u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
        confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
                    last_match_id, confVal, conf, bit);
    } while (unlikely(!!*conf));
}

#endif // FDR_IMPL_ARM_H
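A note on the conf words consumed by do_confirm_fdr above: the `^ ~0ULL` in get_conf_stride complements the low 64 bits of the running state, so a set bit in conf0/conf8 corresponds to a zero state bit, i.e. a byte/bucket pair that may hold a match (byte = bit / 8 + offset, bucket = bit % 8, as the scan computes). A minimal self-contained sketch of the bit-peeling idiom, assuming findAndClearLSB_64 behaves as its name suggests:

```c
#include <stdint.h>

/* Hypothetical stand-in for findAndClearLSB_64: return the index of the
 * lowest set bit of *v and clear that bit. This mirrors how do_confirm_fdr
 * walks every candidate (byte, bucket) pair encoded in a conf word; the
 * caller guarantees *v != 0, matching the do/while guard above. */
static inline uint32_t find_and_clear_lsb64(uint64_t *v) {
    uint32_t bit = (uint32_t)__builtin_ctzll(*v); /* index of lowest set bit */
    *v &= *v - 1;                                 /* clear it */
    return bit;
}
```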
370  src/fdr/fdr.c

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,6 +32,7 @@
 #include "fdr_confirm_runtime.h"
 #include "fdr_internal.h"
 #include "fdr_loadval.h"
+#include "fdr_impl.h"
 #include "flood_runtime.h"
 #include "scratch.h"
 #include "teddy.h"
@@ -40,86 +42,6 @@
 #include "util/simd_utils.h"
 #include "util/uniform_ops.h"
 
-[... the ITER_BYTES, ZONE_TOTAL_SIZE and ZONE_MAX defines, the struct zone
- definition, and the zone_or_mask table are removed here; all of them move
- verbatim into the new src/fdr/fdr_impl.h, listed below ...]
-
 /* generates an initial state mask based on the last byte-ish of history rather
  * than being all accepting. If there is no history to consider, the state is
  * generated based on the minimum length of each bucket in order to prevent
@@ -141,197 +63,6 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
     return s;
 }
 
-[... the per-stride helpers get_conf_stride_1, get_conf_stride_2 and
- get_conf_stride_4 (which took the flipped domain mask, the stride-2/4
- variants computing each reach via andn(domain_mask_flipped, itPtr + k)),
- together with do_confirm_fdr, are removed here; their bodies are merged
- into the arch-specific get_conf_stride and do_confirm_fdr shown above ...]
-
 static really_inline
 void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) {
 #ifdef DEBUG
@@ -660,41 +391,6 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
 
 #define INVALID_MATCH_ID (~0U)
 
-#define FDR_MAIN_LOOP(zz, s, get_conf_fn)                                   \
-    do {                                                                    \
-        const u8 *tryFloodDetect = zz->floodPtr;                            \
-        const u8 *start_ptr = zz->start;                                    \
-        const u8 *end_ptr = zz->end;                                        \
-        for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
-             itPtr += 4*ITER_BYTES) {                                       \
-            __builtin_prefetch(itPtr);                                      \
-        }                                                                   \
-                                                                            \
-        for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr;    \
-             itPtr += ITER_BYTES) {                                         \
-            if (unlikely(itPtr > tryFloodDetect)) {                         \
-                tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
-                                             &floodBackoff, &control,       \
-                                             ITER_BYTES);                   \
-                if (unlikely(control == HWLM_TERMINATE_MATCHING)) {         \
-                    return HWLM_TERMINATED;                                 \
-                }                                                           \
-            }                                                               \
-            __builtin_prefetch(itPtr + ITER_BYTES);                         \
-            u64a conf0;                                                     \
-            u64a conf8;                                                     \
-            get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped,     \
-                        ft, &conf0, &conf8, &s);                            \
-            do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr,         \
-                           &last_match_id, zz);                             \
-            do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr,         \
-                           &last_match_id, zz);                             \
-            if (unlikely(control == HWLM_TERMINATE_MATCHING)) {             \
-                return HWLM_TERMINATED;                                     \
-            }                                                               \
-        } /* end for loop */                                                \
-    } while (0)                                                             \
-
 static never_inline
 hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
                              const struct FDR_Runtime_Args *a,
@@ -703,8 +399,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
 
     u32 floodBackoff = FLOOD_BACKOFF_START;
     u32 last_match_id = INVALID_MATCH_ID;
-    u32 domain_mask_flipped = ~fdr->domainMask;
-    u8 stride = fdr->stride;
     const u64a *ft =
         (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
     assert(ISALIGNED_CL(ft));
@@ -722,42 +417,39 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
 
     for (size_t curZone = 0; curZone < numZone; curZone++) {
         struct zone *z = &zones[curZone];
-        dumpZoneInfo(z, curZone);
-
-        /* When a zone contains less data than is processed in an iteration
-         * of FDR_MAIN_LOOP(), we need to scan over some extra data.
-         *
-         * We have chosen to scan this extra data at the start of the
-         * iteration. The extra data is either data we have already scanned or
-         * garbage (if it is earlier than offset 0),
-         *
-         * As a result we need to shift the incoming state back so that it will
-         * properly line up with the data being scanned.
-         *
-         * We also need to forbid reporting any matches in the data being
-         * rescanned as they have already been reported (or are over garbage but
-         * later stages should also provide that safety guarantee).
-         */
-
-        u8 shift = z->shift;
-
-        state = variable_byte_shift_m128(state, shift);
-
-        state = or128(state, load128(zone_or_mask[shift]));
-
-        switch (stride) {
-        case 1:
-            FDR_MAIN_LOOP(z, state, get_conf_stride_1);
-            break;
-        case 2:
-            FDR_MAIN_LOOP(z, state, get_conf_stride_2);
-            break;
-        case 4:
-            FDR_MAIN_LOOP(z, state, get_conf_stride_4);
-            break;
-        default:
-            break;
-        }
+        m128 zone_mask = load128(zone_or_mask[z->shift]);
+
+        const u8 *cacheline = ROUNDDOWN_PTR(z->start, 64);
+        __builtin_prefetch(cacheline);
+
+        const u8 *tryFloodDetect = z->floodPtr;
+
+        state = variable_byte_shift_m128(state, z->shift);
+        state = or128(state, zone_mask);
+
+        for (const u8 *itPtr = z->start; itPtr + ITER_BYTES <= z->end; itPtr += ITER_BYTES) {
+            if (unlikely(itPtr > tryFloodDetect)) {
+                tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,
+                                             &floodBackoff, &control,
+                                             ITER_BYTES);
+                if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
+                    return HWLM_TERMINATED;
+                }
+            }
+            u64a conf0;
+            u64a conf8;
+
+            cacheline += 64;
+            __builtin_prefetch(cacheline);
+
+            get_conf_stride(itPtr, z->start, z->end, fdr->domainMask, fdr->stride, ft, &conf0, &conf8, &state);
+
+            do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, &last_match_id, z);
+            do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, &last_match_id, z);
+            if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
+                return HWLM_TERMINATED;
+            }
+        } /* end for loop */
     }
 
     return HWLM_SUCCESS;
License header of a further changed file (name not shown in this view):

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
119  src/fdr/fdr_impl.h  (new file)

/*
 * Copyright (c) 2020-2025, VectorCamp PC
 *
 * [BSD 3-clause license text, identical to the header of
 *  src/fdr/arm/fdr_impl.h above, minus the Intel copyright line.]
 */

/** \brief number of bytes processed in each iteration */
#define ITER_BYTES 16

/** \brief total zone buffer size */
#define ZONE_TOTAL_SIZE 64

/** \brief maximum number of allowed zones */
#define ZONE_MAX 3

/** \brief zone information.
 *
 * Zone represents a region of data to scan in FDR.
 *
 * The incoming buffer is split into multiple zones to ensure two properties:
 * 1: that we can read 8? bytes behind to generate a hash safely
 * 2: that we can read the 3 bytes after the current byte (domain > 8)
 */
struct zone {
    /** \brief copied buffer, used only when it is a boundary zone. */
    u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];

    /** \brief shift amount for fdr state to avoid unwanted match. */
    u8 shift;

    /** \brief if boundary zone, start points into the zone buffer after the
     * pre-padding. Otherwise, points to the main buffer, appropriately. */
    const u8 *start;

    /** \brief if boundary zone, end points to the end of zone. Otherwise,
     * pointer to the main buffer, appropriately. */
    const u8 *end;

    /** \brief the amount to adjust to go from a pointer in the zones region
     * (between start and end) to a pointer in the original data buffer. */
    ptrdiff_t zone_pointer_adjust;

    /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
     * otherwise end of the zone buf. floodPtr always points inside the same
     * buffer as the start pointer. */
    const u8 *floodPtr;
};

static
const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};

#if defined(VS_SIMDE_BACKEND)
#include "x86/fdr_impl.h"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "x86/fdr_impl.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "arm/fdr_impl.h"
#elif defined(ARCH_PPC64EL)
#include "ppc64le/fdr_impl.h"
#endif
#endif
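On the table above: row i of zone_or_mask has its first i bytes set to 0xff for i = 0..15, and the final row (index 16) is all zeroes. The new main loop in fdr.c, shown earlier, ORs the selected row into the shifted state so the first z->shift rescanned bytes cannot report matches a second time, which is exactly what the comment removed from fdr.c described. Quoted from the new fdr_engine_exec loop:

```c
/* Poison the rescanned prefix of the zone: set state bits block matches,
 * since the conf words are the complement of the state. */
m128 zone_mask = load128(zone_or_mask[z->shift]);
state = variable_byte_shift_m128(state, z->shift);
state = or128(state, zone_mask);
```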
196  src/fdr/ppc64le/fdr_impl.h  (new file)

[Byte-for-byte the same as src/fdr/arm/fdr_impl.h above, except that the
include guard is FDR_IMPL_PPC64LE_H; per the commit message, the arch-specific
copies are identical for now.]
196  src/fdr/x86/fdr_impl.h  (new file)

[Byte-for-byte the same as src/fdr/arm/fdr_impl.h above, except that the
include guard is FDR_IMPL_X86_H; per the commit message, the arch-specific
copies are identical for now.]
ARM SIMD utilities header (identified by its include guard; file name not shown in this view):

@@ -181,6 +181,8 @@ static really_inline m128 set1_2x64(u64a c) {
     return (m128) vdupq_n_u64(c);
 }
 
+#define insert32_m128(in, val, imm) ((m128) vsetq_lane_u32(val, (uint32x4_t)in, imm))
+
 static really_inline u32 movd(const m128 in) {
     return vgetq_lane_u32((uint32x4_t) in, 0);
 }
@@ -449,4 +451,14 @@ m128 set2x64(u64a hi, u64a lo) {
     return (m128) vld1q_u64((uint64_t *) data);
 }
 
+static really_inline
+m128 widenlo128(m128 x) {
+    return (m128) vmovl_u32(vget_low_u32((uint32x4_t)x));
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+    return (m128) vmovl_u32(vget_high_u32((uint32x4_t)x));
+}
+
 #endif // ARCH_ARM_SIMD_UTILS_H
Common SIMD utilities (256-bit wrappers; file name not shown in this view):

@@ -388,6 +388,14 @@ m256 pshufb_m256(m256 a, m256 b) {
     return rv;
 }
 
+static really_inline
+m256 widen128(m128 x) {
+    m256 rv;
+    rv.lo = widenlo128(x);
+    rv.hi = widenhi128(x);
+    return rv;
+}
+
 #endif // HAVE_SIMD_256_BITS
 
 /****
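Taken together, the new widenlo128/widenhi128 primitives zero-extend the four u32 lanes of an m128 into two m128s of u64 lanes, and widen128 pairs the two halves into an m256 (the AVX2 build further below does the same in one _mm256_cvtepu32_epi64). A worked example of the intended semantics, with illustrative lane values:

```c
/* Illustrative only; lane values chosen for readability, low lane first.
 * x (four u32 lanes)             = { 1, 2, 3, 4 }
 * widenlo128(x) (two u64 lanes)  = { 1, 2 }  // low lanes, zero-extended
 * widenhi128(x) (two u64 lanes)  = { 3, 4 }  // high lanes, zero-extended
 * widen128(x) (four u64 lanes)   = { 1, 2, 3, 4 }
 */
```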
PPC64LE SIMD utilities (file name not shown in this view):

@@ -429,6 +429,16 @@ m128 set2x64(u64a hi, u64a lo) {
     return (m128) v;
 }
 
+static really_inline
+m128 widenlo128(m128 x) {
+    return (m128) vec_mergel((m128)x, zeroes128());
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+    return (m128) vec_mergeh((m128)x, zeroes128());
+}
+
 #if defined(__clang__) && (__clang_major__ == 15)
 #pragma clang diagnostic pop
 #endif // defined(__clang__) && (__clang_major__ == 15)
x86 SIMD utilities (file name not shown in this view):

@@ -123,6 +123,17 @@ m128 sub_2x64(m128 a, m128 b) {
     return (m128) _mm_sub_epi64(a, b);
 }
 
+static really_really_inline
+m128 lshift32_m128(m128 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return _mm_slli_epi32(a, b);
+    }
+#endif
+    m128 x = _mm_cvtsi32_si128(b);
+    return _mm_sll_epi32(a, x);
+}
+
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
@@ -158,6 +169,8 @@ static really_inline m128 set1_2x64(u64a c) {
     return _mm_set1_epi64x(c);
 }
 
+#define insert32_m128(in, val, imm) (m128) (_mm_insert_epi32((m128) in, (m128) val, (m128) imm))
+
 static really_inline u32 movd(const m128 in) {
     return _mm_cvtsi128_si32(in);
 }
@@ -474,6 +487,16 @@ m128 set2x64(u64a hi, u64a lo) {
     return _mm_set_epi64x(hi, lo);
 }
 
+static really_inline
+m128 widenlo128(m128 x) {
+    return _mm_unpacklo_epi32(x, zeroes128());
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+    return _mm_unpackhi_epi32(x, zeroes128());
+}
+
 /****
 **** 256-bit Primitives
 ****/
@@ -750,6 +773,12 @@ m256 combine2x128(m128 hi, m128 lo) {
     return insert128to256(cast128to256(lo), hi, 1);
 #endif
 }
+
+static really_inline
+m256 widen128(m128 x) {
+    return (m256) _mm256_cvtepu32_epi64(x);
+}
+
 #endif //AVX2
 
 /****