From 051ceed0f95324ddff09898102e9da9fba8cae29 Mon Sep 17 00:00:00 2001 From: George Wort Date: Fri, 2 Jul 2021 10:43:48 +0100 Subject: [PATCH] Use SVE2 Bitperm's bdep instruction in bitutils and state_compress Specifically for pdep64, expand32, and expand64 in bitutils, as well as all of the loadcompressed functions used in state_compress. Change-Id: I92851bd12481dbee6a7e344df0890c4901b56d01 --- src/util/arch/arm/bitutils.h | 15 ++++++----- src/util/state_compress.c | 49 +++++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index 498db568..0960db33 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -109,7 +109,7 @@ m128 compress128_impl(m128 x, m128 m) { m128 mm = sub_2x64(zeroes128(), m); m128 xm = and128(x, m); xm = and128(xm, mm); - + m128 mask = not128(eq64_m128(xm, zeroes128())); res = or128(res, and128(bb, mask)); m = and128(m, sub_2x64(m, one)); @@ -120,12 +120,20 @@ m128 compress128_impl(m128 x, m128 m) { static really_inline u32 expand32_impl(u32 x, u32 m) { +#if defined(HAVE_SVE2_BITPERM) + return svlasta(svpfalse(), svbdep(svdup_u32(x), m)); +#else return expand32_impl_c(x, m); +#endif } static really_inline u64a expand64_impl(u64a x, u64a m) { +#if defined(HAVE_SVE2_BITPERM) + return svlasta(svpfalse(), svbdep(svdup_u64(x), m)); +#else return expand64_impl_c(x, m); +#endif } static really_inline @@ -194,11 +202,6 @@ u64a pext64_impl(u64a x, u64a mask) { return pext64_impl_c(x, mask); } -static really_inline -u64a pdep64(u64a x, u64a mask) { - return pdep64_impl_c(x, mask); -} - /* compilers don't reliably synthesize the 32-bit ANDN instruction here, * so we force its generation. */ diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 2040ffa1..fc837392 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -72,11 +72,27 @@ void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) { void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) { assert(popcount64(*m) <= bytes * 8); - +#ifdef HAVE_SVE2_BITPERM + svbool_t pg = svwhilelt_b8(0U, bytes); + svuint64_t expanded = svbdep(svreinterpret_u64(svld1_u8(pg, ptr)), *m); + svst1(svptrue_pat_b64(SV_VL1), (uint64_t *)x, expanded); +#else u64a v = partial_load_u64a(ptr, bytes); *x = expand64(v, *m); +#endif } +#if defined(HAVE_SVE2_BITPERM) + +static really_inline +void bdep64x2(u64a *d, const u64a *x, const m128 *m) { + svbool_t pg = svptrue_pat_b64(SV_VL2); + svst1(pg, (uint64_t *)d, svbdep(svld1_u64(pg, (const uint64_t *)x), + svld1_u64(pg, (const uint64_t *)m))); +} + +#endif // HAVE_SVE2_BITPERM + /* * 128-bit store/load. */ @@ -168,10 +184,14 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { u64a ALIGN_ATTR(16) v[2]; unpack_bits_64(v, (const u8 *)ptr, bits, 2); - m128 xvec = load128(v); - // Expand vector - return expand128(xvec, mvec); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) xvec[2]; + bdep64x2(xvec, v, &mvec); + return load128(xvec); +#else + return expand128(load128(v), mvec); +#endif } #endif @@ -291,8 +311,14 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { unpack_bits_64(v, (const u8 *)ptr, bits, 4); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) x[4]; + bdep64x2(x, v, &mvec.lo); + bdep64x2(&x[2], &v[2], &mvec.hi); +#else u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]), expand64(v[2], m[2]), expand64(v[3], m[3]) }; +#endif #if !defined(HAVE_AVX2) m256 xvec = { .lo = set2x64(x[1], x[0]), @@ -427,9 +453,16 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { unpack_bits_64(v, (const u8 *)ptr, bits, 6); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) x[6]; + bdep64x2(x, v, &mvec.lo); + bdep64x2(&x[2], &v[2], &mvec.mid); + bdep64x2(&x[4], &v[4], &mvec.hi); +#else u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]), expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]) }; +#endif m384 xvec = { .lo = set2x64(x[1], x[0]), .mid = set2x64(x[3], x[2]), @@ -586,10 +619,18 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { unpack_bits_64(v, (const u8 *)ptr, bits, 8); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) x[8]; + bdep64x2(x, v, &mvec.lo.lo); + bdep64x2(&x[2], &v[2], &mvec.lo.hi); + bdep64x2(&x[4], &v[4], &mvec.hi.lo); + bdep64x2(&x[6], &v[6], &mvec.hi.hi); +#else u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]), expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]), expand64(v[6], m[6]), expand64(v[7], m[7]) }; +#endif #if defined(HAVE_AVX512) m512 xvec = set8x64(x[7], x[6], x[5], x[4],