Make the match component of SVE truffle constant time

There is no significant speedup for 128-bit vectors, but we expect some
speedup for wider vectors compared to the previous linear-time
implementation in the match.hpp component.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Yoan Picchi 2024-01-12 15:33:28 +00:00
parent 46488b8097
commit 01d8a2d768

@@ -102,26 +102,16 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
 #ifdef HAVE_SVE
+// Return the index of the first lane with an active predicate.
+// Assumes pred is not svpfalse()
 static really_inline
-uint64_t last_non_zero_real(svuint8_t mask) {
-    const svuint64_t leading_zeros = svclz_x(svptrue_b64(), svreinterpret_u64(mask));
-    uint64_t last_active_lane;
-    svbool_t remaining_mask = svptrue_b64();
-    uint64_t i = 0;
-    while(svptest_any(svptrue_b64(), remaining_mask)) {
-        svbool_t single_lane_mask = svpnext_b64(remaining_mask, svpfalse());
-        remaining_mask = sveor_z(svptrue_b64(), remaining_mask, single_lane_mask);
-        uint64_t active_element = svlastb(single_lane_mask, leading_zeros);
-        if(active_element<64) {
-            uint64_t lane_index = (i+1)*8 - (active_element/8) - 1;
-            last_active_lane = lane_index;
-        }
-        i++;
-    }
-    return last_active_lane;
+uint64_t index_first_predicate(const svbool_t pred) {
+    const svuint8_t indices = svindex_u8(0, 1);
+    const svbool_t single_lane_mask = svpnext_b8(pred, svpfalse());
+    const uint64_t lane_index = svlastb(single_lane_mask, indices);
+    return lane_index;
 }
 /*
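
For reference, here is a minimal standalone sketch of the idiom the new index_first_predicate() helper relies on: svpnext_b8(pred, svpfalse()) keeps only the first active lane of pred, and svlastb() over the lane indices built by svindex_u8(0, 1) reads that lane's position back, with no per-lane loop. The demo_first_active_lane name, the test buffer, and the main() harness are illustrative only and not part of the patch; it assumes an SVE-enabled compiler and CPU.

// Illustrative sketch, not part of the patch. Build with an SVE toolchain,
// e.g. gcc -O2 -march=armv8-a+sve demo.c, and run on SVE hardware.
#include <arm_sve.h>
#include <stdint.h>
#include <stdio.h>

// Same idiom as index_first_predicate(): a fixed number of instructions
// regardless of the vector length, instead of a per-64-bit-chunk loop.
static uint64_t demo_first_active_lane(svbool_t pred) {
    const svuint8_t indices = svindex_u8(0, 1);                // 0, 1, 2, ...
    const svbool_t first_only = svpnext_b8(pred, svpfalse());  // keep only the first active lane
    return svlastb(first_only, indices);                       // read out that lane's index
}

int main(void) {
    uint8_t buf[256] = {0};   // 256 bytes covers the largest possible SVE vector
    buf[5] = 0xff;            // first (and only) non-zero byte is at index 5
    const svbool_t all = svptrue_b8();
    const svuint8_t v = svld1_u8(all, buf);
    const svbool_t non_zero = svcmpne(all, v, 0);
    if (svptest_any(all, non_zero)) {
        printf("first active lane: %lu\n",
               (unsigned long)demo_first_active_lane(non_zero));  // prints 5
    }
    return 0;
}
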
@@ -129,10 +119,10 @@ uint64_t last_non_zero_real(svuint8_t mask) {
 */
 static really_inline
 uint64_t last_non_zero(const size_t vector_size_int_8, svuint8_t mask) {
-    const svbool_t result_pred = svcmpne(svptrue_b8(), mask, 0);
-    if (svptest_any(svptrue_b8(), result_pred)) {
-        return last_non_zero_real(mask);
+    const svbool_t non_zero = svcmpne(svptrue_b8(), mask, 0);
+    if (svptest_any(svptrue_b8(), non_zero)) {
+        return vector_size_int_8 - 1 - index_first_predicate(svrev_b8(non_zero));
     } else {
         return vector_size_int_8;
     }
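
The index arithmetic in the new last_non_zero() is easy to sanity-check with plain integers: reversing the byte lanes maps lane i to lane n - 1 - i, so the first active lane of the reversed predicate is n - 1 minus the last active lane of the original, and subtracting it from n - 1 again recovers the answer. A tiny self-contained check follows; the 16-lane width and lane 13 are made-up example values.

// Worked example of the formula used by last_non_zero(); not SVE code,
// just the index arithmetic with made-up values.
#include <assert.h>
#include <stddef.h>

int main(void) {
    const size_t n = 16;            // byte lanes in a 128-bit vector
    const size_t last_active = 13;  // hypothetical last non-zero lane
    // After svrev_b8, lane i of the original ends up at lane n - 1 - i,
    // so this is what index_first_predicate() would return:
    const size_t first_in_reversed = n - 1 - last_active;
    // ...and last_non_zero() mirrors it back:
    assert(n - 1 - first_in_reversed == last_active);
    return 0;
}
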
@@ -143,18 +133,10 @@ uint64_t last_non_zero(const size_t vector_size_int_8, svuint8_t mask) {
 */
 static really_inline
 uint64_t first_non_zero(const size_t vector_size_int_8, svuint8_t mask) {
-    const svbool_t result_pred = svcmpne(svptrue_b8(), mask, 0);
-    if (svptest_any(svptrue_b8(), result_pred)) {
-        // We don't have a CTZ instruction but we can work around by reversing the lane order
-        const svuint64_t rev_large_res = svreinterpret_u64(svrev(mask));
-        // Now each pack of 8 leading 0 means one empty lane. So if we have 18 leading 0,
-        // that means the third lane have a matching character.
-        uint64_t first_active_lane = last_non_zero_real(svreinterpret_u8(rev_large_res));
-        // We reversed the lanes, so we reverse back the index
-        first_active_lane = (vector_size_int_8-1) - first_active_lane;
-        return first_active_lane;
+    const svbool_t non_zero = svcmpne(svptrue_b8(), mask, 0);
+    if (svptest_any(svptrue_b8(), non_zero)) {
+        return index_first_predicate(non_zero);
     } else {
         return vector_size_int_8;
     }
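
Finally, a hedged sketch of how a caller might consume these helpers: first_match_in_block(), its NULL-for-no-match convention, and the surrounding names are assumptions for illustration and not the actual match.hpp call sites. The two helpers are copied from the patch (with plain static inline in place of really_inline) so the snippet stands on its own.

// Hypothetical caller, for illustration only. first_non_zero() returns the
// vector size when no lane matched, and this caller turns that into NULL.
#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

static inline uint64_t index_first_predicate(const svbool_t pred) {
    const svuint8_t indices = svindex_u8(0, 1);
    const svbool_t single_lane_mask = svpnext_b8(pred, svpfalse());
    return svlastb(single_lane_mask, indices);
}

static inline uint64_t first_non_zero(const size_t vector_size_int_8, svuint8_t mask) {
    const svbool_t non_zero = svcmpne(svptrue_b8(), mask, 0);
    if (svptest_any(svptrue_b8(), non_zero)) {
        return index_first_predicate(non_zero);
    }
    return vector_size_int_8;
}

// Map a per-byte match mask (non-zero byte == match) for the block starting
// at buf to a pointer to the first matching byte, or NULL if none matched.
const uint8_t *first_match_in_block(const uint8_t *buf, svuint8_t match_mask) {
    const size_t lanes = svcntb();  // byte lanes at the current vector length
    const uint64_t idx = first_non_zero(lanes, match_mask);
    return (idx == lanes) ? NULL : buf + idx;
}
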