Speed up truffle with 256b TBL instructions

256b wide SVE vectors allow some simplification of truffle.
Up to 40% speedup on Graviton3, going from 12500 MB/s to 17000 MB/s
on the microbenchmark.
SVE2 also offers this capability for 128b vectors, with a speedup of around
25% compared to normal SVE.

Add unit tests and benchmark for this wide variant

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
Yoan Picchi
2024-04-23 12:04:40 +00:00
parent 154cb6333b
commit 7054378c93
23 changed files with 1125 additions and 78 deletions

View File

@@ -53,5 +53,11 @@
#define HAVE_SVE2_BITPERM
#endif
/*
 * CAN_USE_WIDE_TRUFFLE: gate for the wide (TBL-based) truffle variant.
 *  - HAVE_SVE2 defined: the macro is the constant 1.
 *  - only HAVE_SVE defined: the macro is the expression (svcntb() >= 32),
 *    which is evaluated where the macro is used, not at preprocessing time.
 *    Because it expands to a function call it cannot be tested with #if in
 *    this configuration; use it in ordinary C conditions instead.
 *    (svcntb() is presumably the SVE vector length in bytes, so 32 would
 *    mean 256-bit vectors -- TODO confirm against the ACLE reference.)
 *  - neither defined: the macro is left undefined, so users must check
 *    for it with #ifdef/defined() before referencing it.
 */
#if defined(HAVE_SVE2)
#define CAN_USE_WIDE_TRUFFLE 1
#elif defined(HAVE_SVE)
#define CAN_USE_WIDE_TRUFFLE (svcntb() >= 32)
#endif
#endif // UTIL_ARCH_ARM_H_

View File

@@ -34,5 +34,9 @@
typedef int32x4_t m128;
#endif
/*
 * m256: a struct holding two m128 members, lo and hi. It is declared only
 * when m256 is not already a macro and m128 is a macro.
 * NOTE(review): defined() only sees preprocessor macros. The m128 visible
 * just above is introduced with a typedef, which defined(m128) does not
 * detect, so this typedef is compiled only if m128 is also #define'd
 * somewhere else -- confirm that this is the intended condition.
 */
#if !defined(m256) && defined(m128)
typedef struct {m128 lo; m128 hi;} m256;
#endif
#endif /* SIMD_TYPES_ARM_H */

View File

@@ -31,3 +31,6 @@
typedef int32x4_t m128;
#endif
/*
 * m256: a struct holding two m128 members, lo and hi. It is declared only
 * when m256 is not already a macro and m128 is a macro.
 * NOTE(review): defined() only sees preprocessor macros. The m128 visible
 * just above is introduced with a typedef, which defined(m128) does not
 * detect, so this typedef is compiled only if m128 is also #define'd
 * somewhere else -- confirm that this is the intended condition.
 */
#if !defined(m256) && defined(m128)
typedef struct {m128 lo; m128 hi;} m256;
#endif