mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
prefetch works best when addresses are 64-byte aligned
This commit is contained in:
parent
521f233cfd
commit
d3ff893871
@@ -665,7 +665,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
|
||||
const u8 *tryFloodDetect = zz->floodPtr; \
|
||||
const u8 *start_ptr = zz->start; \
|
||||
const u8 *end_ptr = zz->end; \
|
||||
- for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr; \
|
||||
+ for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
|
||||
itPtr += 4*ITER_BYTES) { \
|
||||
__builtin_prefetch(itPtr); \
|
||||
} \
|
||||
|
@@ -95,7 +95,7 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
|
||||
u32 z = movemask256(eq256(mask1, v));
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
- __builtin_prefetch(d + 128);
|
||||
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
|
||||
|
||||
hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
|
||||
if (unlikely(result != HWLM_SUCCESS))
|
||||
@@ -126,7 +126,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
|
||||
lastz0 = z0 >> 31;
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
- __builtin_prefetch(d + 128);
|
||||
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
|
||||
|
||||
hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
|
||||
if (unlikely(result != HWLM_SUCCESS))
|
||||
|
@@ -91,7 +91,7 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
|
||||
u32 z = movemask128(eq128(mask1, v));
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
- __builtin_prefetch(d + 128);
|
||||
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
|
||||
DEBUG_PRINTF("z 0x%08x\n", z);
|
||||
|
||||
hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
|
||||
@@ -118,7 +118,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
|
||||
lastz1 = z1;
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
- __builtin_prefetch(d + 128);
|
||||
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
|
||||
DEBUG_PRINTF("z 0x%08x\n", z);
|
||||
|
||||
hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
|
||||
|
@@ -634,10 +634,11 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
||||
assert(ISALIGNED_N(q->state, 2));
|
||||
u32 s = *(u16 *)q->state;
|
||||
|
||||
- __builtin_prefetch(&m->remap[0]);
|
||||
- __builtin_prefetch(&m->remap[64]);
|
||||
- __builtin_prefetch(&m->remap[128]);
|
||||
- __builtin_prefetch(&m->remap[192]);
|
||||
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
|
||||
+ __builtin_prefetch(base);
|
||||
+ __builtin_prefetch(base + 64);
|
||||
+ __builtin_prefetch(base + 128);
|
||||
+ __builtin_prefetch(base + 192);
|
||||
|
||||
if (q->report_current) {
|
||||
assert(s);
|
||||
@@ -795,10 +796,11 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
||||
|
||||
u32 s = *(u8 *)q->state;
|
||||
|
||||
- __builtin_prefetch(&m->remap[0]);
|
||||
- __builtin_prefetch(&m->remap[64]);
|
||||
- __builtin_prefetch(&m->remap[128]);
|
||||
- __builtin_prefetch(&m->remap[192]);
|
||||
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
|
||||
+ __builtin_prefetch(base);
|
||||
+ __builtin_prefetch(base + 64);
|
||||
+ __builtin_prefetch(base + 128);
|
||||
+ __builtin_prefetch(base + 192);
|
||||
|
||||
if (q->report_current) {
|
||||
assert(s);
|
||||
|
@@ -889,10 +889,11 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
||||
return MO_ALIVE;
|
||||
}
|
||||
|
||||
- __builtin_prefetch(&m->remap[0]);
|
||||
- __builtin_prefetch(&m->remap[64]);
|
||||
- __builtin_prefetch(&m->remap[128]);
|
||||
- __builtin_prefetch(&m->remap[192]);
|
||||
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
|
||||
+ __builtin_prefetch(base);
|
||||
+ __builtin_prefetch(base + 64);
|
||||
+ __builtin_prefetch(base + 128);
|
||||
+ __builtin_prefetch(base + 192);
|
||||
|
||||
while (1) {
|
||||
assert(q->cur < q->end);
|
||||
@@ -1022,10 +1023,11 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
|
||||
return MO_ALIVE;
|
||||
}
|
||||
|
||||
- __builtin_prefetch(&m->remap[0]);
|
||||
- __builtin_prefetch(&m->remap[64]);
|
||||
- __builtin_prefetch(&m->remap[128]);
|
||||
- __builtin_prefetch(&m->remap[192]);
|
||||
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
|
||||
+ __builtin_prefetch(base);
|
||||
+ __builtin_prefetch(base + 64);
|
||||
+ __builtin_prefetch(base + 128);
|
||||
+ __builtin_prefetch(base + 192);
|
||||
|
||||
while (1) {
|
||||
DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
|
||||
|
@@ -179,7 +179,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
|
||||
|
||||
const u8 *last_block = buf_end - 16;
|
||||
|
||||
- for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) {
|
||||
+ for (const u8 *itPtr = ROUNDDOWN_PTR(buf, 64); itPtr + 4*16 <= last_block; itPtr += 4*16) {
|
||||
__builtin_prefetch(itPtr);
|
||||
}
|
||||
while (buf < last_block) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user