mirror of https://github.com/VectorCamp/vectorscan.git
synced 2025-11-16 01:12:15 +03:00

Reinforced Teddy with 1-byte approach, based on "shift-or" and AVX2.

committed by Matthew Barr
parent b09e3acd04
commit dbd3f66e87
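
For context on the names in the commit message: Teddy is the SIMD-accelerated
multi-literal matcher inside FDR, and "shift-or" is the classic bit-parallel
matching loop of Baeza-Yates and Gonnet that Teddy vectorises. The sketch below
is a minimal scalar shift-or for a single pattern of up to 64 bytes,
illustrative only (the function names are hypothetical); it is not the
bucketed, nibble-shuffle form Teddy actually runs.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Classic shift-or: one state bit per pattern position. After reading
     * text[i], state bit k is 0 iff text[i-k..i] equals pat[0..k]. */
    static int shift_or_find(const char *pat, const char *text) {
        size_t m = strlen(pat);              /* assumes 1 <= m <= 64 */
        uint64_t masks[256];
        memset(masks, 0xff, sizeof(masks));  /* set bit = mismatch */
        for (size_t i = 0; i < m; i++) {
            masks[(unsigned char)pat[i]] &= ~(1ULL << i);
        }
        uint64_t state = ~0ULL;
        for (size_t i = 0; text[i]; i++) {
            state = (state << 1) | masks[(unsigned char)text[i]];
            if (!(state & (1ULL << (m - 1)))) {
                return (int)(i - m + 1);     /* leftmost byte of the match */
            }
        }
        return -1;
    }

    int main(void) {
        printf("%d\n", shift_or_find("teddy", "reinforced teddy matcher"));
        return 0;                            /* prints 11 */
    }

The "1-byte approach", as far as this diff shows, adds a reinforced mask table
indexed by a full byte of input (the u64a table exposed by
getReinforcedMaskBase in the last hunk) on top of the nibble-based masks, so
fewer false bucket candidates reach the confirm stage.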
@@ -38,8 +38,12 @@
 #include "ue2common.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
+#include "util/uniform_ops.h"
 
 extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+#if defined(__AVX2__)
+extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
+#endif
 
 #ifdef ARCH_64_BIT
 #define TEDDY_CONF_TYPE u64a
@@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
 }
 
 // Note: p_mask is an output param that initialises a poison mask.
+// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
+// m byte 0xff in the beginning, followed by n byte 0x00,
+// then followed by the rest bytes 0xff.
+// ptr >= lo:
+//     no history.
+//     for end/short zone, ptr==lo and start_offset==0
+//     for start zone, see below
+//     lo         ptr                      hi           hi
+//     |----------|-------|----------------|............|
+//     start      0       start+offset     end(<=16)
+//     p_mask ffff..ff0000...........00ffff..........
+// ptr < lo:
+//     only start zone.
+//         history
+//     ptr        lo                       hi           hi
+//     |----------|-------|----------------|............|
+//     0          start   start+offset     end(<=16)
+//     p_mask ffff.....ffffff..ff0000...........00ffff..........
 static really_inline
-m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
                      const u8 *buf_history, size_t len_history,
                      const u32 nMasks) {
     union {
@@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
     uintptr_t copy_start;
     uintptr_t copy_len;
 
-    if (ptr >= lo) {
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo);
         uintptr_t avail = (uintptr_t)(hi - ptr);
         if (avail >= 16) {
-            *p_mask = load128(p_mask_arr[16] + 16);
+            assert(start_offset - start <= 16);
+            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+                               + 16 - start_offset + start);
             return loadu128(ptr);
         }
-        *p_mask = load128(p_mask_arr[avail] + 16);
+        assert(start_offset - start <= avail);
+        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+                           + 16 - start_offset + start);
         copy_start = 0;
        copy_len = avail;
-    } else {
+    } else { // start zone
         uintptr_t need = MIN((uintptr_t)(lo - ptr),
                              MIN(len_history, nMasks - 1));
         uintptr_t start = (uintptr_t)(lo - ptr);
         uintptr_t i;
-        for (i = start - need; ptr + i < lo; i++) {
-            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        for (i = start - need; i < start; i++) {
+            u.val8[i] = buf_history[len_history - (start - i)];
         }
         uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
-        *p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
-        copy_start = i;
-        copy_len = end - i;
+        assert(start + start_offset <= end);
+        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
+                           + 16 - start - start_offset);
+        copy_start = start;
+        copy_len = end - start;
     }
 
     // Runt block from the buffer.
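
To make the poison-mask arithmetic in the two hunks above concrete, here is a
standalone scalar model. It assumes the row layout implied by the comment (row
n of the table is 16 bytes of 0xff, then n bytes of 0x00, then 0xff again); it
is not the shipped p_mask_arr definition, and the numbers in main() are made
up for illustration.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Row n: 16 x 0xff, n x 0x00, rest 0xff. Reading 16 bytes at offset
     * 16 - m then yields m x 0xff, n x 0x00, then 0xff, matching the
     * "*p_mask = load128(p_mask_arr[n] + 16 - m)" comment. */
    static uint8_t model[17][32];

    int main(void) {
        for (int n = 0; n <= 16; n++) {
            memset(model[n], 0xff, 32);
            memset(model[n] + 16, 0x00, (size_t)n);
        }
        /* Start-zone case from vectoredLoad128: say start = 3,
         * start_offset = 1, end = 10. Then n = end - start - start_offset
         * = 6 and the load offset is 16 - start - start_offset = 12. */
        const uint8_t *mask = model[6] + 12;
        for (int i = 0; i < 16; i++) {
            printf("%02x ", mask[i]);
        }
        printf("\n"); /* 4 x ff, 6 x 00, 6 x ff: lanes [4,10) carry data */
        return 0;
    }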
@@ -152,6 +182,135 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
     return u.val128;
 }
 
+#if defined(__AVX2__)
+/*
+ * \brief Copy a block of [0,31] bytes efficiently.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad256.
+ */
+static really_inline
+void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
+    switch (len) {
+    case 0:
+        break;
+    case 1:
+        *dst = *src;
+        break;
+    case 2:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        break;
+    case 3:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        dst[2] = src[2];
+        break;
+    case 4:
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 5:
+    case 6:
+    case 7:
+        /* Perform copy with two overlapping 4-byte chunks. */
+        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 8:
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+        /* Perform copy with two overlapping 8-byte chunks. */
+        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 16:
+        storeu128(dst, loadu128(src));
+        break;
+    default:
+        /* Perform copy with two overlapping 16-byte chunks. */
+        assert(len < 32);
+        storeu128(dst + len - 16, loadu128(src + len - 16));
+        storeu128(dst, loadu128(src));
+        break;
+    }
+}
+
+// Note: p_mask is an output param that initialises a poison mask.
+// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
+// m byte 0xff in the beginning, followed by n byte 0x00,
+// then followed by the rest bytes 0xff.
+// ptr >= lo:
+//     no history.
+//     for end/short zone, ptr==lo and start_offset==0
+//     for start zone, see below
+//     lo         ptr                      hi           hi
+//     |----------|-------|----------------|............|
+//     start      0       start+offset     end(<=32)
+//     p_mask ffff..ff0000...........00ffff..........
+// ptr < lo:
+//     only start zone.
+//         history
+//     ptr        lo                       hi           hi
+//     |----------|-------|----------------|............|
+//     0          start   start+offset     end(<=32)
+//     p_mask ffff.....ffffff..ff0000...........00ffff..........
+static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union {
+        u8 val8[32];
+        m256 val256;
+    } u;
+    u.val256 = zeroes256();
+
+    uintptr_t copy_start;
+    uintptr_t copy_len;
+
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo);
+        uintptr_t avail = (uintptr_t)(hi - ptr);
+        if (avail >= 32) {
+            assert(start_offset - start <= 32);
+            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
+                               + 32 - start_offset + start);
+            return loadu256(ptr);
+        }
+        assert(start_offset - start <= avail);
+        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
+                           + 32 - start_offset + start);
+        copy_start = 0;
+        copy_len = avail;
+    } else { //start zone
+        uintptr_t need = MIN((uintptr_t)(lo - ptr),
+                             MIN(len_history, nMasks - 1));
+        uintptr_t start = (uintptr_t)(lo - ptr);
+        uintptr_t i;
+        for (i = start - need; i < start; i++) {
+            u.val8[i] = buf_history[len_history - (start - i)];
+        }
+        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
+        assert(start + start_offset <= end);
+        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
+                           + 32 - start - start_offset);
+        copy_start = start;
+        copy_len = end - start;
+    }
+
+    // Runt block from the buffer.
+    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+    return u.val256;
+}
+#endif // __AVX2__
+
 static really_inline
 u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
                 CautionReason reason) {
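
One detail of copyRuntBlock256 above is worth spelling out: any length between
two power-of-two sizes is covered by two overlapping fixed-size moves, so no
byte loop (and no memcpy call) is needed. A portable sketch of the 9..15-byte
case, using memcpy-based stand-ins for ue2's unaligned_load_u64a and
unaligned_store_u64a helpers:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Stand-ins for the unaligned load/store helpers used in the diff. */
    static uint64_t load_u64(const void *p) {
        uint64_t v;
        memcpy(&v, p, 8);
        return v;
    }
    static void store_u64(void *p, uint64_t v) { memcpy(p, &v, 8); }

    /* Copy 9..15 bytes with two overlapping 8-byte chunks: the tail store
     * starts at dst + len - 8, so the middle bytes are simply written twice
     * and nothing past dst[len - 1] is touched. */
    static void copy_9_to_15(uint8_t *dst, const uint8_t *src, size_t len) {
        store_u64(dst + len - 8, load_u64(src + len - 8));
        store_u64(dst, load_u64(src));
    }

    int main(void) {
        const uint8_t src[15] = "ABCDEFGHIJKLMN";
        uint8_t dst[15] = {0};
        copy_9_to_15(dst, src, 11);
        printf("%.11s\n", (const char *)dst); /* ABCDEFGHIJK */
        return 0;
    }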
@@ -196,53 +355,17 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
     } while (unlikely(*conf));
 }
 
-static really_inline
-void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
-                           const u32 *confBase, CautionReason reason,
-                           const struct FDR_Runtime_Args *a, const u8 *ptr,
-                           hwlmcb_rv_t *control, u32 *last_match) {
-    do {
-        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
-        u32 byte = bit / bucket + offset;
-        u32 idx = bit % bucket;
-        u32 cf = confBase[idx];
-        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
-                                        ((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control)) {
-            continue;
-        }
-        u64a confVal = getConfVal(a, ptr, byte, reason);
-        confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
-                     confVal);
-    } while (unlikely(*conf));
-}
-
-static really_inline
-void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
-                              const u32 *confBase, CautionReason reason,
-                              const struct FDR_Runtime_Args *a, const u8 *ptr,
-                              hwlmcb_rv_t *control, u32 *last_match) {
-    do {
-        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
-        u32 byte = bit / bucket + offset;
-        u32 idx = bit % bucket;
-        u32 cf = confBase[idx];
-        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
-                                        ((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control)) {
-            continue;
-        }
-        u64a confVal = getConfVal(a, ptr, byte, reason);
-        confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
-                        last_match, confVal);
-    } while (unlikely(*conf));
-}
-
 static really_inline
 const m128 *getMaskBase(const struct Teddy *teddy) {
     return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
 }
 
+static really_inline
+const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
+    return (const u64a *)((const u8 *)getMaskBase(teddy)
+                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
+}
+
 static really_inline
 const u32 *getConfBase(const struct Teddy *teddy) {
     return (const u32 *)((const u8 *)teddy + teddy->confOffset);
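
The getters in this last hunk describe one contiguous Teddy layout: the Teddy
header, then the per-mask m128 pairs, then the new reinforced mask table, each
region rounded up to a cache-line boundary. A sketch of that offset arithmetic
with assumed numbers (the real sizeof(struct Teddy) and ROUNDUP_CL are defined
elsewhere in the tree; the values here are for illustration only):

    #include <stdio.h>

    #define CL 64 /* assumed cache-line size */
    #define ROUNDUP_CL(x) ((((x) + CL - 1) / CL) * CL)

    int main(void) {
        size_t sizeof_teddy = 40; /* hypothetical sizeof(struct Teddy) */
        size_t n_masks = 3;       /* each mask is a lo/hi m128 pair */

        /* getMaskBase: header rounded up to a cache line. */
        size_t mask_base = ROUNDUP_CL(sizeof_teddy);
        /* getReinforcedMaskBase: past 2 * nMasks m128s (16 bytes each),
         * again rounded up to a cache line. */
        size_t reinf_base = mask_base + ROUNDUP_CL(2 * n_masks * 16);

        printf("masks at +%zu, reinforced masks at +%zu\n",
               mask_base, reinf_base); /* +64, +192 */
        return 0;
    }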