Speed up truffle with 256b TBL instructions

256b wide SVE vectors allow some simplification of truffle.
Up to 40% speedup on Graviton3, going from 12500 MB/s to 17000 MB/s
on the microbenchmark.
SVE2 also offers this capability for 128b vectors, with a speedup of around
25% compared to normal SVE.

Add unit tests and benchmark for this wide variant

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
This commit is contained in:
Yoan Picchi
2024-04-23 12:04:40 +00:00
parent b312112e87
commit 938c026256
23 changed files with 1125 additions and 78 deletions

View File

@@ -461,11 +461,20 @@ void findForwardAccelScheme(const vector<AccelString> &lits,
aux->shufti.offset = verify_u8(min_offset);
return;
}
truffleBuildMasks(cr, reinterpret_cast<u8 *>(&aux->truffle.mask1), reinterpret_cast<u8 *>(&aux->truffle.mask2));
#if defined(CAN_USE_WIDE_TRUFFLE)
if(CAN_USE_WIDE_TRUFFLE) {
aux->truffle.accel_type = ACCEL_TRUFFLE_WIDE;
truffleBuildMasksWide(cr, reinterpret_cast<u8 *>(&aux->truffle.mask));
} else
#endif
{
aux->truffle.accel_type = ACCEL_TRUFFLE;
truffleBuildMasks(cr,
reinterpret_cast<u8 *>(&aux->truffle.mask_lo),
reinterpret_cast<u8 *>(&aux->truffle.mask_hi));
}
DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n",
describeClass(cr).c_str(), cr.count(), min_offset);
aux->truffle.accel_type = ACCEL_TRUFFLE;
aux->truffle.offset = verify_u8(min_offset);
}