mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Borrow cache prefetching tricks from the Marvell port; they seem to improve performance by 5-28%.
This commit is contained in:
parent
51dcfa8571
commit
b62247a36e
@ -147,6 +147,7 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
|||||||
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
||||||
/* +1: the zones ensure that we can read the byte at z->end */
|
/* +1: the zones ensure that we can read the byte at z->end */
|
||||||
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
||||||
|
|
||||||
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
||||||
u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
|
u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
|
||||||
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
|
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
|
||||||
@ -184,9 +185,8 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
|||||||
st0 = or128(st0, st4);
|
st0 = or128(st0, st4);
|
||||||
*s = or128(*s, st0);
|
*s = or128(*s, st0);
|
||||||
|
|
||||||
*conf0 = movq(*s);
|
*conf0 = movq(*s) ^ ~0ULL;
|
||||||
*s = rshiftbyte_m128(*s, 8);
|
*s = rshiftbyte_m128(*s, 8);
|
||||||
*conf0 ^= ~0ULL;
|
|
||||||
|
|
||||||
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
|
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
|
||||||
u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
|
u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
|
||||||
@ -225,9 +225,8 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
|||||||
st8 = or128(st8, st12);
|
st8 = or128(st8, st12);
|
||||||
*s = or128(*s, st8);
|
*s = or128(*s, st8);
|
||||||
|
|
||||||
*conf8 = movq(*s);
|
*conf8 = movq(*s) ^ ~0ULL;
|
||||||
*s = rshiftbyte_m128(*s, 8);
|
*s = rshiftbyte_m128(*s, 8);
|
||||||
*conf8 ^= ~0ULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
@ -235,6 +234,7 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
|||||||
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
|
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
|
||||||
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
||||||
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
||||||
|
|
||||||
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
||||||
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
|
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
|
||||||
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
|
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
|
||||||
@ -287,6 +287,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
|||||||
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
|
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
|
||||||
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
||||||
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
||||||
|
|
||||||
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
||||||
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
|
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
|
||||||
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
|
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
|
||||||
@ -683,6 +684,10 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
|
|||||||
const u8 *tryFloodDetect = zz->floodPtr; \
|
const u8 *tryFloodDetect = zz->floodPtr; \
|
||||||
const u8 *start_ptr = zz->start; \
|
const u8 *start_ptr = zz->start; \
|
||||||
const u8 *end_ptr = zz->end; \
|
const u8 *end_ptr = zz->end; \
|
||||||
|
for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr; \
|
||||||
|
itPtr += 4*ITER_BYTES) { \
|
||||||
|
__builtin_prefetch(itPtr); \
|
||||||
|
} \
|
||||||
\
|
\
|
||||||
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
|
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
|
||||||
itPtr += ITER_BYTES) { \
|
itPtr += ITER_BYTES) { \
|
||||||
|
@ -634,6 +634,11 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
|||||||
assert(ISALIGNED_N(q->state, 2));
|
assert(ISALIGNED_N(q->state, 2));
|
||||||
u32 s = *(u16 *)q->state;
|
u32 s = *(u16 *)q->state;
|
||||||
|
|
||||||
|
__builtin_prefetch(&m->remap[0]);
|
||||||
|
__builtin_prefetch(&m->remap[64]);
|
||||||
|
__builtin_prefetch(&m->remap[128]);
|
||||||
|
__builtin_prefetch(&m->remap[192]);
|
||||||
|
|
||||||
if (q->report_current) {
|
if (q->report_current) {
|
||||||
assert(s);
|
assert(s);
|
||||||
assert(get_aux(m, s)->accept);
|
assert(get_aux(m, s)->accept);
|
||||||
@ -790,6 +795,11 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
|||||||
|
|
||||||
u32 s = *(u8 *)q->state;
|
u32 s = *(u8 *)q->state;
|
||||||
|
|
||||||
|
__builtin_prefetch(&m->remap[0]);
|
||||||
|
__builtin_prefetch(&m->remap[64]);
|
||||||
|
__builtin_prefetch(&m->remap[128]);
|
||||||
|
__builtin_prefetch(&m->remap[192]);
|
||||||
|
|
||||||
if (q->report_current) {
|
if (q->report_current) {
|
||||||
assert(s);
|
assert(s);
|
||||||
assert(s >= m->accept_limit_8);
|
assert(s >= m->accept_limit_8);
|
||||||
|
@ -889,6 +889,11 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
|||||||
return MO_ALIVE;
|
return MO_ALIVE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__builtin_prefetch(&m->remap[0]);
|
||||||
|
__builtin_prefetch(&m->remap[64]);
|
||||||
|
__builtin_prefetch(&m->remap[128]);
|
||||||
|
__builtin_prefetch(&m->remap[192]);
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
assert(q->cur < q->end);
|
assert(q->cur < q->end);
|
||||||
s64a ep = q->items[q->cur].location;
|
s64a ep = q->items[q->cur].location;
|
||||||
@ -1017,6 +1022,11 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
|||||||
return MO_ALIVE;
|
return MO_ALIVE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__builtin_prefetch(&m->remap[0]);
|
||||||
|
__builtin_prefetch(&m->remap[64]);
|
||||||
|
__builtin_prefetch(&m->remap[128]);
|
||||||
|
__builtin_prefetch(&m->remap[192]);
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
|
DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
|
||||||
q->items[q->cur].type == MQE_END ? "END" : "???",
|
q->items[q->cur].type == MQE_END ? "END" : "???",
|
||||||
|
@ -109,7 +109,8 @@ DUMP_MSK(128)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GET_LO_4(chars) and128(chars, low4bits)
|
#define GET_LO_4(chars) and128(chars, low4bits)
|
||||||
#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
|
#define GET_HI_4(chars) and128(rshift64_m128(chars, 4), low4bits)
|
||||||
|
//#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
|
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
|
||||||
@ -177,6 +178,10 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
|
|||||||
// Reroll FTW.
|
// Reroll FTW.
|
||||||
|
|
||||||
const u8 *last_block = buf_end - 16;
|
const u8 *last_block = buf_end - 16;
|
||||||
|
|
||||||
|
for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) {
|
||||||
|
__builtin_prefetch(itPtr);
|
||||||
|
}
|
||||||
while (buf < last_block) {
|
while (buf < last_block) {
|
||||||
m128 lchars = load128(buf);
|
m128 lchars = load128(buf);
|
||||||
rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);
|
rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user