Reinforced Teddy with 1-byte approach, based on "shift-or" and AVX2.

2025-11-16 01:12:15 +03:00 · 2017-01-22 12:23:25 -08:00
parent b09e3acd04
commit dbd3f66e87
10 changed files with 1070 additions and 1233 deletions
--- a/src/fdr/teddy_runtime_common.h
+++ b/src/fdr/teddy_runtime_common.h
@@ -38,8 +38,12 @@
 #include "ue2common.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
+#include "util/uniform_ops.h"

 extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+#if defined(__AVX2__)
+extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
+#endif

 #ifdef ARCH_64_BIT
 #define TEDDY_CONF_TYPE u64a
@@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
 }

 // Note: p_mask is an output param that initialises a poison mask.
+//       *p_mask = loadu128(p_mask_arr[n] + 16 - m) means:
+//       m bytes of 0xff at the beginning, followed by n bytes of 0x00,
+//       then 0xff for all remaining bytes.
+// ptr >= lo:
+//     no history.
+//     for end/short zone, ptr==lo and start_offset==0
+//     for start zone, see below
+//          lo         ptr                      hi           hi
+//          |----------|-------|----------------|............|
+//          start      0       start+offset     end(<=16)
+// p_mask              ffff..ff0000...........00ffff..........
+// ptr < lo:
+//     only start zone.
+//             history
+//          ptr        lo                       hi           hi
+//          |----------|-------|----------------|............|
+//          0          start   start+offset     end(<=16)
+// p_mask   ffff.....ffffff..ff0000...........00ffff..........
 static really_inline
-m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
@@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
    uintptr_t copy_start;
    uintptr_t copy_len;

-    if (ptr >= lo) {
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 16) {
-            *p_mask = load128(p_mask_arr[16] + 16);
+            assert(start_offset - start <= 16);
+            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+                               + 16 - start_offset + start);
            return loadu128(ptr);
        }
-        *p_mask = load128(p_mask_arr[avail] + 16);
+        assert(start_offset - start <= avail);
+        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+                           + 16 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
-    } else {
+    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
-        for (i = start - need; ptr + i < lo; i++) {
-            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        for (i = start - need; i < start; i++) {
+            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
-        *p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
-        copy_start = i;
-        copy_len = end - i;
+        assert(start + start_offset <= end);
+        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
+                           + 16 - start - start_offset);
+        copy_start = start;
+        copy_len = end - start;
    }

    // Runt block from the buffer.
@@ -152,6 +182,135 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
    return u.val128;
 }

+#if defined(__AVX2__)
+/**
+ * \brief Copy a block of [0,31] bytes from src to dst efficiently.
+ *
+ * Workaround to stop some compilers from synthesizing a memcpy call for the
+ * small copy done in vectoredLoad256. Lengths 5-7, 9-15 and 17-31 are copied
+ * as two overlapping chunks; len must be less than 32 (asserted).
+ */
+static really_inline
+void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
+    switch (len) {
+    case 0:
+        break;
+    case 1:
+        *dst = *src;
+        break;
+    case 2:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        break;
+    case 3:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        dst[2] = src[2];
+        break;
+    case 4:
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 5:
+    case 6:
+    case 7:
+        /* Perform copy with two overlapping 4-byte chunks. */
+        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 8:
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+        /* Perform copy with two overlapping 8-byte chunks. */
+        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 16:
+        storeu128(dst, loadu128(src));
+        break;
+    default:
+        /* Perform copy with two overlapping 16-byte chunks. */
+        assert(len < 32);
+        storeu128(dst + len - 16, loadu128(src + len - 16));
+        storeu128(dst, loadu128(src));
+        break;
+    }
+}
+
+// Note: p_mask is an output param that initialises a poison mask.
+//       *p_mask = loadu256(p_mask_arr256[n] + 32 - m) means:
+//       m bytes of 0xff at the beginning, followed by n bytes of 0x00,
+//       then 0xff for all remaining bytes.
+// ptr >= lo:
+//     no history.
+//     for end/short zone, ptr==lo and start_offset==0
+//     for start zone, see below
+//          lo         ptr                      hi           hi
+//          |----------|-------|----------------|............|
+//          start      0       start+offset     end(<=32)
+// p_mask              ffff..ff0000...........00ffff..........
+// ptr < lo:
+//     only start zone.
+//             history
+//          ptr        lo                       hi           hi
+//          |----------|-------|----------------|............|
+//          0          start   start+offset     end(<=32)
+// p_mask   ffff.....ffffff..ff0000...........00ffff..........
+static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union {
+        u8 val8[32];
+        m256 val256;
+    } u;
+    u.val256 = zeroes256();
+
+    uintptr_t copy_start;
+    uintptr_t copy_len;
+
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo);
+        uintptr_t avail = (uintptr_t)(hi - ptr);
+        if (avail >= 32) {
+            assert(start_offset - start <= 32);
+            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
+                               + 32 - start_offset + start);
+            return loadu256(ptr);
+        }
+        assert(start_offset - start <= avail);
+        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
+                           + 32 - start_offset + start);
+        copy_start = 0;
+        copy_len = avail;
+    } else { // start zone
+        uintptr_t need = MIN((uintptr_t)(lo - ptr),
+                             MIN(len_history, nMasks - 1));
+        uintptr_t start = (uintptr_t)(lo - ptr);
+        uintptr_t i;
+        for (i = start - need; i < start; i++) {
+            u.val8[i] = buf_history[len_history - (start - i)];
+        }
+        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
+        assert(start + start_offset <= end);
+        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
+                           + 32 - start - start_offset);
+        copy_start = start;
+        copy_len = end - start;
+    }
+
+    // Runt block from the buffer.
+    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+    return u.val256;
+}
+#endif // __AVX2__
+
 static really_inline
 u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
                CautionReason reason) {
@@ -196,53 +355,17 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
    } while (unlikely(*conf));
 }

-static really_inline
-void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
-                           const u32 *confBase, CautionReason reason,
-                           const struct FDR_Runtime_Args *a, const u8 *ptr,
-                           hwlmcb_rv_t *control, u32 *last_match) {
-    do {
-        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
-        u32 byte = bit / bucket + offset;
-        u32 idx  = bit % bucket;
-        u32 cf = confBase[idx];
-        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
-                                        ((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control)) {
-            continue;
-        }
-        u64a confVal = getConfVal(a, ptr, byte, reason);
-        confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
-                     confVal);
-    } while (unlikely(*conf));
-}
-
-static really_inline
-void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
-                              const u32 *confBase, CautionReason reason,
-                              const struct FDR_Runtime_Args *a, const u8 *ptr,
-                              hwlmcb_rv_t *control, u32 *last_match) {
-    do {
-        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
-        u32 byte = bit / bucket + offset;
-        u32 idx = bit % bucket;
-        u32 cf = confBase[idx];
-        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
-                                        ((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control)) {
-            continue;
-        }
-        u64a confVal = getConfVal(a, ptr, byte, reason);
-        confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
-                        last_match, confVal);
-    } while (unlikely(*conf));
-}
-
 static really_inline
 const m128 *getMaskBase(const struct Teddy *teddy) {
    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
 }

+static really_inline
+const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
+    return (const u64a *)((const u8 *)getMaskBase(teddy)
+                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
+}
+
 static really_inline
 const u32 *getConfBase(const struct Teddy *teddy) {
    return (const u32 *)((const u8 *)teddy + teddy->confOffset);