Borrow cache prefetching tricks from the Marvell port; they seem to improve performance by 5-28%

2025-11-19 02:30:35 +03:00 · 2021-01-15 17:42:11 +02:00
parent 700a0a093c
commit e830470028
4 changed files with 37 additions and 7 deletions
--- a/src/nfa/mcsheng.c
+++ b/src/nfa/mcsheng.c
@@ -889,6 +889,11 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
        return MO_ALIVE;
    }

+    __builtin_prefetch(&m->remap[0]);
+    __builtin_prefetch(&m->remap[64]);
+    __builtin_prefetch(&m->remap[128]);
+    __builtin_prefetch(&m->remap[192]);
+
    while (1) {
        assert(q->cur < q->end);
        s64a ep = q->items[q->cur].location;
@@ -1017,6 +1022,11 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
        return MO_ALIVE;
    }

+    __builtin_prefetch(&m->remap[0]);
+    __builtin_prefetch(&m->remap[64]);
+    __builtin_prefetch(&m->remap[128]);
+    __builtin_prefetch(&m->remap[192]);
+
    while (1) {
        DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
                     q->items[q->cur].type == MQE_END ? "END" : "???",