Speed up truffle with 256b TBL instructions

256b wide SVE vectors allow some simplification of truffle. Up to 40% speedup on graviton3. Going from 12500 MB/s to 17000 MB/s onhe microbenchmark. SVE2 also offer this capability for 128b vector with a speedup around 25% compared to normal SVE Add unit tests and benchmark for this wide variant Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
2026-01-02 14:44:41 +03:00 · 2024-04-23 12:04:40 +00:00
parent 154cb6333b
commit 7054378c93
23 changed files with 1125 additions and 78 deletions
--- a/unit/internal/sheng.cpp
+++ b/unit/internal/sheng.cpp
@@ -290,19 +290,26 @@ struct NFA *get_expected_nfa_header(u8 type, unsigned int length, unsigned int n
 }

 struct NFA *get_expected_nfa16_header() {
-    return get_expected_nfa_header(SHENG_NFA, 4736, 8);
+    return get_expected_nfa_header(SHENG_NFA, 4736, 8); /* size recorded in 04/2024 */
 }

 #if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
 struct NFA *get_expected_nfa32_header() {
-    return get_expected_nfa_header(SHENG_NFA_32, 17216, 18);
+    return get_expected_nfa_header(SHENG_NFA_32, 17216, 18); /* size recorded in 04/2024 */
 }
 #endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */

 void test_nfa_equal(const NFA& l, const NFA& r)
 {
+    /**
+     * The length is meant to be a sanity test: it's not 0 (we compiled something) and that it roughly fit the
+     * expected size for a given sheng implementation (we don't feed compiled sheng32 into sheng16).
+     * Changes in other nfa algorithms may affect the sheng length, so we accept small variations.
+     */
+    int relative_difference = std::abs((float)(l.length) - r.length) / ((l.length + r.length) / 2);
+    EXPECT_LE(relative_difference, 0.1); /* same +-10% */
+
    EXPECT_EQ(l.flags, r.flags);
-    EXPECT_EQ(l.length, r.length);
    EXPECT_EQ(l.type, r.type);
    EXPECT_EQ(l.rAccelType, r.rAccelType);
    EXPECT_EQ(l.rAccelOffset, r.rAccelOffset);