Speed up truffle with 256b TBL instructions

256b wide SVE vectors allow some simplification of truffle.
Up to 40% speedup on graviton3, going from 12500 MB/s to 17000 MB/s
on the microbenchmark.
SVE2 also offers this capability for 128b vectors, with a speedup of around
25% compared to normal SVE.

Add unit tests and benchmarks for this wide variant.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
Yoan Picchi
2024-04-23 12:04:40 +00:00
parent 154cb6333b
commit 7054378c93
23 changed files with 1125 additions and 78 deletions

View File

@@ -45,6 +45,14 @@
#ifdef HAVE_SVE
static really_inline
svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars);
static really_inline
svuint8_t blockSingleMaskWide32(svuint8_t shuf_mask_32, svuint8_t chars);
#ifdef HAVE_SVE2
static really_inline
svuint8_t blockSingleMaskWide(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars);
#endif //HAVE_SVE2
#else
template <uint16_t S>
static really_inline
@@ -64,19 +72,36 @@ const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, Supe
#endif
#ifdef HAVE_SVE
const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *truffleExecSVE(m256 shuf_mask_32,
const u8 *buf, const u8 *buf_end);
const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *rtruffleExecSVE(m256 shuf_mask_32,
const u8 *buf, const u8 *buf_end);
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars, const u8 *buf, bool forward) {
const size_t vector_size_int_8 = svcntb();
const svuint8_t result_mask = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset,
svuint8_t chars, const u8 *buf, const size_t vector_size_int_8, bool forward)
{
svuint8_t result_mask;
if(is_wide) {
if(is_vector_128b) {
#ifdef HAVE_SVE2
result_mask = blockSingleMaskWide(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
#else
DEBUG_PRINTF("Wide Truffle is not supported with 128b vectors unless SVE2 is enabled");
assert(false);
#endif
} else {
result_mask = blockSingleMaskWide32(shuf_mask_lo_highclear, chars);
}
} else {
result_mask = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
}
uint64_t index;
if (forward) {
index = first_non_zero(vector_size_int_8, result_mask);
@@ -84,25 +109,33 @@ const u8 *scanBlock(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_hig
index = last_non_zero(vector_size_int_8, result_mask);
}
if(index < vector_size_int_8) {
if (index < vector_size_int_8) {
return buf+index;
} else {
return NULL;
}
}
really_inline
const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) {
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *truffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end) {
const int vect_size_int8 = svcntb();
// Activate only 16 lanes to read the m128 buffers
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear);
svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset);
svuint8_t wide_shuf_mask_lo_highclear;
svuint8_t wide_shuf_mask_lo_highset;
if (is_wide && !is_vector_128b) {
const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
wide_shuf_mask_lo_highclear = svld1(lane_pred_32, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(svpfalse(), (uint8_t*) &shuf_mask_32.hi); /* empty vector */
} else {
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.hi);
}
const u8 *work_buffer = buf;
const u8 *ret_val;
@@ -118,16 +151,16 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
if (!ISALIGNED_N(work_buffer, vect_size_int8)) {
svuint8_t chars = svld1(svptrue_b8(), work_buffer);
const u8 *alligned_buffer = ROUNDUP_PTR(work_buffer, vect_size_int8);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, true);
if (ret_val && ret_val < alligned_buffer) return ret_val;
work_buffer = alligned_buffer;
}
while(work_buffer + vect_size_int8 <= buf_end) {
while (work_buffer + vect_size_int8 <= buf_end) {
__builtin_prefetch(work_buffer + 16*64);
DEBUG_PRINTF("work_buffer %p \n", work_buffer);
svuint8_t chars = svld1(svptrue_b8(), work_buffer);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, true);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, true);
if (ret_val) return ret_val;
work_buffer += vect_size_int8;
}
@@ -147,7 +180,7 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
chars = svld1(svptrue_b8(), buf_end - vect_size_int8);
end_buf = buf_end - vect_size_int8;
}
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf, true);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf, vect_size_int8, true);
DEBUG_PRINTF("ret_val %p \n", ret_val);
if (ret_val && ret_val < buf_end) return ret_val;
}
@@ -155,18 +188,26 @@ const u8 *truffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
return buf_end;
}
really_inline
const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){
template <bool is_wide, bool is_vector_128b>
static really_inline
const u8 *rtruffleExecSVE(m256 shuf_mask_32, const u8 *buf, const u8 *buf_end){
const int vect_size_int8 = svcntb();
// Activate only 16 lanes to read the m128 buffers
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
svuint8_t wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highclear);
svuint8_t wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_lo_highset);
svuint8_t wide_shuf_mask_lo_highclear;
svuint8_t wide_shuf_mask_lo_highset;
if (is_wide && !is_vector_128b) {
const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
wide_shuf_mask_lo_highclear = svld1(lane_pred_32, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(svpfalse(), (uint8_t*) &shuf_mask_32.hi); /* empty vector */
} else {
const svbool_t lane_pred_16 = svwhilelt_b8(0, 16);
wide_shuf_mask_lo_highclear = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.lo);
wide_shuf_mask_lo_highset = svld1(lane_pred_16, (uint8_t*) &shuf_mask_32.hi);
}
const u8 *work_buffer = buf_end;
const u8 *ret_val;
@@ -182,7 +223,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset
if (!ISALIGNED_N(work_buffer, vect_size_int8)) {
svuint8_t chars = svld1(svptrue_b8(), work_buffer - vect_size_int8);
const u8 *alligned_buffer = ROUNDDOWN_PTR(work_buffer, vect_size_int8);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer - vect_size_int8, false);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer - vect_size_int8, vect_size_int8, false);
DEBUG_PRINTF("ret_val %p \n", ret_val);
if (ret_val >= alligned_buffer) return ret_val;
work_buffer = alligned_buffer;
@@ -195,7 +236,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset
work_buffer -= vect_size_int8;
svuint8_t chars = svld1(svptrue_b8(), work_buffer);
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, false);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, work_buffer, vect_size_int8, false);
if (ret_val) return ret_val;
}
}
@@ -211,7 +252,7 @@ const u8 *rtruffleExecSVE(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset
} else {
chars = svld1(svptrue_b8(), buf);
}
ret_val = scanBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf, false);
ret_val = scanBlock<is_wide, is_vector_128b>(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf, vect_size_int8, false);
DEBUG_PRINTF("ret_val %p \n", ret_val);
if (ret_val && ret_val < buf_end) return ret_val;
}
@@ -253,7 +294,7 @@ const u8 *truffleExecReal(const m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_
d = dup;
}
while(d + S <= buf_end) {
while (d + S <= buf_end) {
__builtin_prefetch(d + 16*64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> chars = SuperVector<S>::load(d);