Optimize vectorscan for aarch64 by using shrn instruction

This optimization is based on the thread
https://twitter.com/Danlark1/status/1539344279268691970 and uses the
SHRN (shift right and narrow) instruction with an immediate shift of 4: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate--

To achieve that, I had to slightly redesign movemask into comparemask
and add an extra step to mask iteration. Our benchmarks
showed a 10-15% improvement on average for long matches.
This commit is contained in:
Danila Kutenin
2022-06-26 22:50:05 +00:00
parent d4044039db
commit eb7b0bb50c
11 changed files with 264 additions and 150 deletions

View File

@@ -53,7 +53,15 @@ really_really_inline
// packedExtract<16>: shuffle `s` with `permute`, AND the shuffled vector with
// `compare`, and return the inverted equality mask of (shuffled & compare)
// versus shuffled, compacted into the low 16 bits of a u32.
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// the `u16 rv` line appears to be the pre-change declaration that the
// `u64a rv` line replaces -- confirm against the repository.
u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) {
SuperVector<16> shuffled = s.pshufb<true>(permute);
SuperVector<16> compared = shuffled & compare;
u16 rv = ~compared.eqmask(shuffled);
// NOTE(review): the 0xffff mask is applied before the mask_width() != 1
// compaction below, which reads bits up to 15 * mask_width(). With a mask
// wider than one bit per element, everything above bit 15 is already cleared
// here -- confirm whether masking at this point is intended.
u64a rv = (~compared.eqmask(shuffled)) & 0xffff;
if (SuperVector<16>::mask_width() != 1) {
u32 ans = 0;
// Move bit (i * mask_width()) down to bit i, so that every
// mask_width()-th bit of rv ends up contiguous in ans.
for (u32 i = 0; i < 16; ++i) {
ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >>
(i * SuperVector<16>::mask_width() - i);
}
return ans;
}
// mask_width() == 1: rv is already densely packed.
return (u32)rv;
}
@@ -62,7 +70,8 @@ really_really_inline
// packedExtract<32>: shuffle `s` with `permute`, AND the shuffled vector with
// `compare`, take the inverted equality mask of (shuffled & compare) versus
// shuffled, and OR its upper 16 bits onto its lower 16 bits.
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// the `u32 rv` line appears to be the pre-change declaration that the
// `u64a rv` line replaces -- confirm against the repository.
u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) {
SuperVector<32> shuffled = s.pshufb<true>(permute);
SuperVector<32> compared = shuffled & compare;
u32 rv = ~compared.eqmask(shuffled);
// TODO(danlark1): Future ARM support might have a bug.
// NOTE(review): presumably the concern is that the fold below does not
// consult mask_width(), unlike the 16-wide variant -- confirm.
u64a rv = (~compared.eqmask(shuffled)) & 0xffffffff;
// Combine the two 16-bit halves of the 32-bit mask with OR.
return (u32)((rv >> 16) | (rv & 0xffffU));
}
@@ -71,6 +80,7 @@ really_really_inline
u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) {
SuperVector<64> shuffled = s.pshufb<true>(permute);
SuperVector<64> compared = shuffled & compare;
// TODO(danlark1): Future ARM support might have a bug.
u64a rv = ~compared.eqmask(shuffled);
rv = rv >> 32 | rv;
return (u32)(((rv >> 16) | rv) & 0xffffU);