Use SVE2 Bitperm's bdep instruction in bitutils and state_compress

Specifically for pdep64, expand32, and expand64 in bitutils,
as well as all of the loadcompressed functions used in
state_compress.

Change-Id: I92851bd12481dbee6a7e344df0890c4901b56d01
Author: George Wort, 2021-07-02 10:43:48 +01:00
Committed by: Konstantinos Margaritis
parent 4bc28272da
commit 051ceed0f9
2 changed files with 54 additions and 10 deletions
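
For context, BDEP is SVE2's per-lane "bit deposit" operation: successive low
bits of the source are scattered into the positions of the set bits of the
mask, which is exactly the expansion that expand32/expand64 and the
loadcompressed functions need. A minimal scalar model of one 64-bit lane
(illustrative only; bdep64_model and the test value are not part of this
commit):

#include <stdint.h>
#include <stdio.h>

/* Deposit successive low bits of x into the set-bit positions of m. */
static uint64_t bdep64_model(uint64_t x, uint64_t m) {
    uint64_t res = 0;
    for (uint64_t bb = 1; m; bb += bb) {
        if (x & bb) {
            res |= m & -m;  /* deposit into the lowest remaining mask bit */
        }
        m &= m - 1;         /* consume that mask bit */
    }
    return res;
}

int main(void) {
    /* 0b1011 deposited into mask 0b11110000 gives 0b10110000. */
    printf("%#llx\n", (unsigned long long)bdep64_model(0xb, 0xf0)); /* 0xb0 */
    return 0;
}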

bitutils:

@@ -120,12 +120,20 @@ m128 compress128_impl(m128 x, m128 m) {
 static really_inline
 u32 expand32_impl(u32 x, u32 m) {
+#if defined(HAVE_SVE2_BITPERM)
+    return svlasta(svpfalse(), svbdep(svdup_u32(x), m));
+#else
     return expand32_impl_c(x, m);
+#endif
 }
 
 static really_inline
 u64a expand64_impl(u64a x, u64a m) {
+#if defined(HAVE_SVE2_BITPERM)
+    return svlasta(svpfalse(), svbdep(svdup_u64(x), m));
+#else
     return expand64_impl_c(x, m);
+#endif
 }
 
 static really_inline
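
The fast path above leans on two SVE idioms worth noting: svdup_u32(x)
broadcasts the scalar into every lane so svbdep has something to operate on,
and svlasta with an all-false predicate (svpfalse()) returns element 0 of the
vector, i.e. the scalar result. A standalone sketch of the same idiom
(assumes an SVE2_BITPERM-capable toolchain and target; the function name is
illustrative):

#include <arm_sve.h>
#include <stdint.h>

uint32_t expand32_sve2(uint32_t x, uint32_t m) {
    /* Broadcast x, bit-deposit each lane through scalar mask m,
     * then extract lane 0 via svlasta with an all-false predicate. */
    return svlasta(svpfalse(), svbdep(svdup_u32(x), m));
}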
@@ -194,11 +202,6 @@ u64a pext64_impl(u64a x, u64a mask) {
     return pext64_impl_c(x, mask);
 }
 
-static really_inline
-u64a pdep64(u64a x, u64a mask) {
-    return pdep64_impl_c(x, mask);
-}
-
 /* compilers don't reliably synthesize the 32-bit ANDN instruction here,
  * so we force its generation.
  */

state_compress:

@@ -72,11 +72,27 @@ void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) {
 void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) {
     assert(popcount64(*m) <= bytes * 8);
 
+#ifdef HAVE_SVE2_BITPERM
+    svbool_t pg = svwhilelt_b8(0U, bytes);
+    svuint64_t expanded = svbdep(svreinterpret_u64(svld1_u8(pg, ptr)), *m);
+    svst1(svptrue_pat_b64(SV_VL1), (uint64_t *)x, expanded);
+#else
     u64a v = partial_load_u64a(ptr, bytes);
     *x = expand64(v, *m);
+#endif
 }
 
+#if defined(HAVE_SVE2_BITPERM)
+static really_inline
+void bdep64x2(u64a *d, const u64a *x, const m128 *m) {
+    svbool_t pg = svptrue_pat_b64(SV_VL2);
+    svst1(pg, (uint64_t *)d, svbdep(svld1_u64(pg, (const uint64_t *)x),
+                                    svld1_u64(pg, (const uint64_t *)m)));
+}
+#endif // HAVE_SVE2_BITPERM
+
 /*
  * 128-bit store/load.
  */
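
In the new loadcompressed64 fast path, svwhilelt_b8(0U, bytes) builds a
predicate covering the first `bytes` byte lanes, svld1_u8 zero-fills the
inactive lanes, and the svptrue_pat_b64(SV_VL1) store writes back only the
single expanded 64-bit lane. A portable model of the same computation,
reusing the bdep64_model sketch from above (names are illustrative; assumes
a little-endian layout like partial_load_u64a()):

#include <stdint.h>
#include <string.h>

static uint64_t loadcompressed64_model(const void *ptr, uint64_t m,
                                       uint32_t bytes) {
    uint64_t v = 0;
    memcpy(&v, ptr, bytes);     /* partial little-endian load, zero-padded */
    return bdep64_model(v, m);  /* deposit loaded bits through the mask */
}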
@@ -168,10 +184,14 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
     u64a ALIGN_ATTR(16) v[2];
     unpack_bits_64(v, (const u8 *)ptr, bits, 2);
 
-    m128 xvec = load128(v);
-    // Expand vector
-    return expand128(xvec, mvec);
+#ifdef HAVE_SVE2_BITPERM
+    u64a ALIGN_ATTR(16) xvec[2];
+    bdep64x2(xvec, v, &mvec);
+    return load128(xvec);
+#else
+    return expand128(load128(v), mvec);
+#endif
 }
 #endif
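
bdep64x2 simply runs BDEP on two 64-bit lanes at once: svptrue_pat_b64(SV_VL2)
predicates exactly two doubleword lanes, so a single svbdep covers both halves
of an m128 mask. A portable model, again in terms of the earlier bdep64_model
sketch (the name bdep64x2_model is illustrative):

static void bdep64x2_model(uint64_t *d, const uint64_t *x,
                           const uint64_t *m) {
    d[0] = bdep64_model(x[0], m[0]);  /* low 64-bit lane */
    d[1] = bdep64_model(x[1], m[1]);  /* high 64-bit lane */
}

The wider loadcompressed256/384/512 variants below apply exactly this pairwise
expansion to each m128 piece of the mask vector.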
@@ -291,8 +311,14 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
     unpack_bits_64(v, (const u8 *)ptr, bits, 4);
 
+#ifdef HAVE_SVE2_BITPERM
+    u64a ALIGN_ATTR(16) x[4];
+    bdep64x2(x, v, &mvec.lo);
+    bdep64x2(&x[2], &v[2], &mvec.hi);
+#else
     u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                   expand64(v[2], m[2]), expand64(v[3], m[3]) };
+#endif
 
 #if !defined(HAVE_AVX2)
     m256 xvec = { .lo = set2x64(x[1], x[0]),
@@ -427,9 +453,16 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
     unpack_bits_64(v, (const u8 *)ptr, bits, 6);
 
+#ifdef HAVE_SVE2_BITPERM
+    u64a ALIGN_ATTR(16) x[6];
+    bdep64x2(x, v, &mvec.lo);
+    bdep64x2(&x[2], &v[2], &mvec.mid);
+    bdep64x2(&x[4], &v[4], &mvec.hi);
+#else
     u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                   expand64(v[2], m[2]), expand64(v[3], m[3]),
                   expand64(v[4], m[4]), expand64(v[5], m[5]) };
+#endif
 
     m384 xvec = { .lo = set2x64(x[1], x[0]),
                   .mid = set2x64(x[3], x[2]),
@@ -586,10 +619,18 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
     unpack_bits_64(v, (const u8 *)ptr, bits, 8);
 
+#ifdef HAVE_SVE2_BITPERM
+    u64a ALIGN_ATTR(16) x[8];
+    bdep64x2(x, v, &mvec.lo.lo);
+    bdep64x2(&x[2], &v[2], &mvec.lo.hi);
+    bdep64x2(&x[4], &v[4], &mvec.hi.lo);
+    bdep64x2(&x[6], &v[6], &mvec.hi.hi);
+#else
     u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                   expand64(v[2], m[2]), expand64(v[3], m[3]),
                   expand64(v[4], m[4]), expand64(v[5], m[5]),
                   expand64(v[6], m[6]), expand64(v[7], m[7]) };
+#endif
 
 #if defined(HAVE_AVX512)
     m512 xvec = set8x64(x[7], x[6], x[5], x[4],