Speed up truffle with 256b TBL instructions

256b wide SVE vectors allow some simplification of truffle. Up to 40% speedup on graviton3. Going from 12500 MB/s to 17000 MB/s on the microbenchmark. SVE2 also offers this capability for 128b vectors, with a speedup around 25% compared to normal SVE. Add unit tests and benchmark for this wide variant. Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
2026-01-17 16:00:26 +03:00 · 2024-04-23 12:04:40 +00:00
parent b312112e87
commit 938c026256
23 changed files with 1125 additions and 78 deletions
--- a/src/rose/rose_build_lit_accel.cpp
+++ b/src/rose/rose_build_lit_accel.cpp
@@ -461,11 +461,20 @@ void findForwardAccelScheme(const vector<AccelString> &lits,
        aux->shufti.offset = verify_u8(min_offset);
        return;
    }
-
-    truffleBuildMasks(cr, reinterpret_cast<u8 *>(&aux->truffle.mask1), reinterpret_cast<u8 *>(&aux->truffle.mask2));
+#if defined(CAN_USE_WIDE_TRUFFLE)
+    if(CAN_USE_WIDE_TRUFFLE) {
+        aux->truffle.accel_type = ACCEL_TRUFFLE_WIDE;
+        truffleBuildMasksWide(cr, reinterpret_cast<u8 *>(&aux->truffle.mask));
+    } else
+#endif
+    {
+        aux->truffle.accel_type = ACCEL_TRUFFLE;
+        truffleBuildMasks(cr,
+                          reinterpret_cast<u8 *>(&aux->truffle.mask_lo),
+                          reinterpret_cast<u8 *>(&aux->truffle.mask_hi));
+    }
    DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n",
                 describeClass(cr).c_str(), cr.count(), min_offset);
-    aux->truffle.accel_type = ACCEL_TRUFFLE;
    aux->truffle.offset = verify_u8(min_offset);
 }