diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 1a3b7003..372a78b1 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -147,6 +147,7 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach1 = andn(domain_mask_flipped, itPtr + 1); u64a reach2 = andn(domain_mask_flipped, itPtr + 2); @@ -184,17 +185,16 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st0 = or128(st0, st4); *s = or128(*s, st0); - *conf0 = movq(*s); + *conf0 = movq(*s) ^ ~0ULL; *s = rshiftbyte_m128(*s, 8); - *conf0 ^= ~0ULL; u64a reach8 = andn(domain_mask_flipped, itPtr + 8); u64a reach9 = andn(domain_mask_flipped, itPtr + 9); u64a reach10 = andn(domain_mask_flipped, itPtr + 10); u64a reach11 = andn(domain_mask_flipped, itPtr + 11); - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); m128 st10 = load_m128_from_u64a(ft + reach10); m128 st11 = load_m128_from_u64a(ft + reach11); @@ -225,9 +225,8 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st8 = or128(st8, st12); *s = or128(*s, st8); - *conf8 = movq(*s); + *conf8 = movq(*s) ^ ~0ULL; *s = rshiftbyte_m128(*s, 8); - *conf8 ^= ~0ULL; } static really_inline @@ -235,6 +234,7 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach2 = andn(domain_mask_flipped, itPtr + 2); u64a reach4 = andn(domain_mask_flipped, itPtr + 4); @@ -287,6 +287,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach4 = andn(domain_mask_flipped, itPtr + 4); u64a reach8 = andn(domain_mask_flipped, itPtr + 8); @@ -683,6 +684,10 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, const u8 *tryFloodDetect = zz->floodPtr; \ const u8 *start_ptr = zz->start; \ const u8 *end_ptr = zz->end; \ + for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr; \ + itPtr += 4*ITER_BYTES) { \ + __builtin_prefetch(itPtr); \ + } \ \ for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ itPtr += ITER_BYTES) { \ diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c index 71f71e32..5ac0615a 100644 --- a/src/nfa/mcclellan.c +++ b/src/nfa/mcclellan.c @@ -634,6 +634,11 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, assert(ISALIGNED_N(q->state, 2)); u32 s = *(u16 *)q->state; + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + if (q->report_current) { assert(s); assert(get_aux(m, s)->accept); @@ -790,6 +795,11 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, u32 s = *(u8 *)q->state; + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + if (q->report_current) { assert(s); assert(s >= m->accept_limit_8); diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index dd00617e..fe67102b 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -889,6 +889,11 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_ALIVE; } + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + while (1) { assert(q->cur < q->end); s64a ep = q->items[q->cur].location; @@ -1017,6 +1022,11 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_ALIVE; } + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + while (1) { DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : q->items[q->cur].type == MQE_END ? "END" : "???", diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index e76dcca8..f1f2befc 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -109,7 +109,8 @@ DUMP_MSK(128) #endif #define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) +#define GET_HI_4(chars) and128(rshift64_m128(chars, 4), low4bits) +//#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, @@ -177,6 +178,10 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Reroll FTW. const u8 *last_block = buf_end - 16; + + for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) { + __builtin_prefetch(itPtr); + } while (buf < last_block) { m128 lchars = load128(buf); rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);