diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index 715ab684..561e8f98 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -665,7 +665,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
     const u8 *tryFloodDetect = zz->floodPtr; \
     const u8 *start_ptr = zz->start; \
     const u8 *end_ptr = zz->end; \
-    for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr; \
+    for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
         itPtr += 4*ITER_BYTES) { \
         __builtin_prefetch(itPtr); \
     } \
diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c
index 05c40cd2..0aebdc67 100644
--- a/src/hwlm/noodle_engine_avx2.c
+++ b/src/hwlm/noodle_engine_avx2.c
@@ -95,7 +95,7 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
         u32 z = movemask256(eq256(mask1, v));
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
 
         hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
         if (unlikely(result != HWLM_SUCCESS))
@@ -126,7 +126,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
         lastz0 = z0 >> 31;
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
 
         hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
         if (unlikely(result != HWLM_SUCCESS))
diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c
index 78033a47..501aea85 100644
--- a/src/hwlm/noodle_engine_sse.c
+++ b/src/hwlm/noodle_engine_sse.c
@@ -91,7 +91,7 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
         u32 z = movemask128(eq128(mask1, v));
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
 
         DEBUG_PRINTF("z 0x%08x\n", z);
         hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
@@ -118,7 +118,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
         lastz1 = z1;
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
 
         DEBUG_PRINTF("z 0x%08x\n", z);
         hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c
index 5ac0615a..a7fcb06a 100644
--- a/src/nfa/mcclellan.c
+++ b/src/nfa/mcclellan.c
@@ -634,10 +634,11 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
     assert(ISALIGNED_N(q->state, 2));
     u32 s = *(u16 *)q->state;
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     if (q->report_current) {
         assert(s);
@@ -795,10 +796,11 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
 
     u32 s = *(u8 *)q->state;
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     if (q->report_current) {
         assert(s);
diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c
index c52bf31c..5c97d73a 100644
--- a/src/nfa/mcsheng.c
+++ b/src/nfa/mcsheng.c
@@ -889,10 +889,11 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         return MO_ALIVE;
     }
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     while (1) {
         assert(q->cur < q->end);
@@ -1022,10 +1023,11 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         return MO_ALIVE;
     }
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     while (1) {
         DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c
index 4f7cae2e..2c30ce5c 100644
--- a/src/nfa/shufti.c
+++ b/src/nfa/shufti.c
@@ -179,7 +179,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
 
     const u8 *last_block = buf_end - 16;
 
-    for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) {
+    for (const u8 *itPtr = ROUNDDOWN_PTR(buf, 64); itPtr + 4*16 <= last_block; itPtr += 4*16) {
         __builtin_prefetch(itPtr);
     }
     while (buf < last_block) {
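
Every hunk applies the same change: the address handed to __builtin_prefetch is rounded down to a 64-byte cache-line boundary with ROUNDDOWN_PTR, so prefetches are expressed in whole-line terms. A minimal standalone sketch of the pattern follows; the ROUNDDOWN_N / ROUNDDOWN_PTR definitions below are illustrative stand-ins written for this note, not copied from the tree's headers.

/* Sketch of the cache-line-aligned prefetch pattern used in the hunks
 * above. ROUNDDOWN_N / ROUNDDOWN_PTR are assumed definitions for
 * illustration only. */
#include <stdint.h>
#include <stddef.h>

#define CACHE_LINE 64

/* Round an integer down to a power-of-two multiple. */
#define ROUNDDOWN_N(a, n)   ((a) & ~((uintptr_t)(n) - 1))
/* Round a pointer down to an n-byte boundary. */
#define ROUNDDOWN_PTR(p, n) ((const uint8_t *)ROUNDDOWN_N((uintptr_t)(p), (n)))

/* Prefetch the cache lines covering a small lookup table (e.g. a DFA
 * byte-remap array): start from the line containing the first byte and
 * step in whole lines. */
static inline void prefetch_table(const uint8_t *table, size_t len) {
    const uint8_t *base = ROUNDDOWN_PTR(table, CACHE_LINE);
    for (size_t i = 0; i < len; i += CACHE_LINE) {
        __builtin_prefetch(base + i);
    }
}

For example, prefetch_table(m->remap, 256) would issue the same four line-sized prefetches as the McClellan/McSheng hunks, assuming a 256-byte remap table. Note that on hardware with 64-byte lines, rounding the address down does not change which line __builtin_prefetch touches; the change mainly makes the line granularity explicit and lets multi-line sequences such as the remap prefetches step from a known aligned base.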