mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2026-01-02 22:54:43 +03:00
Speed up truffle with 256b TBL instructions
256b wide SVE vectors allow some simplification of truffle. Up to 40% speedup on Graviton3, going from 12500 MB/s to 17000 MB/s on the microbenchmark. SVE2 also offers this capability for 128b vectors, with a speedup of around 25% compared to normal SVE. Add unit tests and a benchmark for this wide variant. Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
@@ -142,9 +142,18 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
|
||||
return c;
|
||||
}
|
||||
|
||||
rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end);
|
||||
rv = truffleExec(accel->truffle.mask_lo, accel->truffle.mask_hi, c, c_end);
|
||||
break;
|
||||
#ifdef CAN_USE_WIDE_TRUFFLE
|
||||
case ACCEL_TRUFFLE_WIDE:
|
||||
DEBUG_PRINTF("accel Truffle Wide %p %p\n", c, c_end);
|
||||
if (c + 15 >= c_end) {
|
||||
return c;
|
||||
}
|
||||
|
||||
rv = truffleExecWide(accel->truffle.mask, c, c_end);
|
||||
break;
|
||||
#endif
|
||||
case ACCEL_DSHUFTI:
|
||||
DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end);
|
||||
if (c + 15 + 1 >= c_end) {
|
||||
|
||||
Reference in New Issue
Block a user