Use SVE2 Bitperm's bdep instruction in bitutils and state_compress

Specifically, bdep is now used for pdep64, expand32, and expand64 in
bitutils, as well as for all of the loadcompressed functions used in
state_compress.

Change-Id: I92851bd12481dbee6a7e344df0890c4901b56d01
This commit is contained in:
George Wort
2021-07-02 10:43:48 +01:00
committed by Konstantinos Margaritis
parent 7e5138b78f
commit ace6cd15f2
2 changed files with 54 additions and 10 deletions

View File

@@ -109,7 +109,7 @@ m128 compress128_impl(m128 x, m128 m) {
m128 mm = sub_2x64(zeroes128(), m);
m128 xm = and128(x, m);
xm = and128(xm, mm);
m128 mask = not128(eq64_m128(xm, zeroes128()));
res = or128(res, and128(bb, mask));
m = and128(m, sub_2x64(m, one));
@@ -120,12 +120,20 @@ m128 compress128_impl(m128 x, m128 m) {
/*
 * 32-bit "expand" of x under mask m.
 *
 * Without SVE2 Bitperm support this simply forwards to expand32_impl_c()
 * (defined elsewhere). With HAVE_SVE2_BITPERM the work is done by the
 * bdep (bit deposit) intrinsic on a vector filled with x.
 */
static really_inline
u32 expand32_impl(u32 x, u32 m) {
#if !defined(HAVE_SVE2_BITPERM)
    return expand32_impl_c(x, m);
#else
    /* Every lane holds the same value, so any lane carries the result. */
    const svuint32_t deposited = svbdep(svdup_u32(x), m);
    /* An all-false predicate makes svlasta pick the first lane. */
    return svlasta(svpfalse(), deposited);
#endif
}
/*
 * 64-bit "expand" of x under mask m.
 *
 * Without SVE2 Bitperm support this simply forwards to expand64_impl_c()
 * (defined elsewhere). With HAVE_SVE2_BITPERM the work is done by the
 * bdep (bit deposit) intrinsic on a vector filled with x.
 */
static really_inline
u64a expand64_impl(u64a x, u64a m) {
#if !defined(HAVE_SVE2_BITPERM)
    return expand64_impl_c(x, m);
#else
    /* Every lane holds the same value, so any lane carries the result. */
    const svuint64_t deposited = svbdep(svdup_u64(x), m);
    /* An all-false predicate makes svlasta pick the first lane. */
    return svlasta(svpfalse(), deposited);
#endif
}
static really_inline
@@ -194,11 +202,6 @@ u64a pext64_impl(u64a x, u64a mask) {
return pext64_impl_c(x, mask);
}
/*
 * 64-bit parallel bit deposit of x under mask.
 *
 * Thin wrapper: the computation is delegated unchanged to
 * pdep64_impl_c(), which is defined elsewhere.
 */
static really_inline
u64a pdep64(u64a x, u64a mask) {
    const u64a deposited = pdep64_impl_c(x, mask);
    return deposited;
}
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/