From 1c581e45e98b9c8758076865b5e7e1f12e21acdc Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Fri, 15 Jan 2021 17:33:41 +0200
Subject: [PATCH] add expand128() implementation for NEON

---
 src/util/arch/arm/bitutils.h    | 22 ++++++++++++++++++++--
 src/util/arch/common/bitutils.h | 12 ++++++++++++
 src/util/arch/x86/bitutils.h    |  5 +++++
 src/util/bitutils.h             |  4 ++++
 src/util/state_compress.c       | 12 +++++++-----
 5 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h
index 1d1e0167..ddca35c9 100644
--- a/src/util/arch/arm/bitutils.h
+++ b/src/util/arch/arm/bitutils.h
@@ -106,7 +106,6 @@ u64a compress64_impl(u64a x, u64a m) {
 
 static really_inline
 m128 compress128_impl(m128 x, m128 m) {
-
     m128 one = set1_2x64(1);
     m128 bitset = one;
     m128 vres = zeroes128();
@@ -118,7 +117,7 @@ m128 compress128_impl(m128 x, m128 m) {
         m128 mask = not128(eq64_m128(tv, zeroes128()));
         mask = vandq_s64(bitset, mask);
         vres = or128(vres, mask);
-        m = and128(m, sub_2x64(m, set1_2x64(1)));
+        m = and128(m, sub_2x64(m, one));
         bitset = lshift64_m128(bitset, 1);
     }
     return vres;
@@ -134,6 +133,25 @@ u64a expand64_impl(u64a x, u64a m) {
     return expand64_impl_c(x, m);
 }
 
+static really_inline
+m128 expand128_impl(m128 x, m128 m) {
+    m128 one = set1_2x64(1);
+    m128 bitset = one;
+    m128 vres = zeroes128();
+    while (isnonzero128(m)) {
+        // Select the next low-order source bit of x in each 64-bit lane.
+        m128 tv = and128(x, bitset);
+        // m & -m isolates the lowest set mask bit: the deposit position.
+        m128 mm = sub_2x64(zeroes128(), m);
+        m128 mask = not128(eq64_m128(tv, zeroes128()));
+        mask = and128(mask, and128(m, mm));
+        vres = or128(vres, mask);
+        m = and128(m, sub_2x64(m, one));
+        bitset = lshift64_m128(bitset, 1);
+    }
+    return vres;
+}
+
 /* returns the first set bit after begin (if not ~0U). If no bit is set after
  * begin returns ~0U
  */
diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h
index 88e71bba..723e4a18 100644
--- a/src/util/arch/common/bitutils.h
+++ b/src/util/arch/common/bitutils.h
@@ -301,6 +301,18 @@ u64a expand64_impl_c(u64a x, u64a m) {
     return x & m0; // clear out extraneous bits*/
 }
 
+static really_inline
+m128 expand128_impl_c(m128 xvec, m128 mvec) {
+    u64a ALIGN_ATTR(16) x[2];
+    u64a ALIGN_ATTR(16) m[2];
+    store128(x, xvec);
+    store128(m, mvec);
+    // Expand each 64-bit lane with the scalar fallback, keeping the results.
+    x[0] = expand64_impl_c(x[0], m[0]);
+    x[1] = expand64_impl_c(x[1], m[1]);
+
+    return load128(x);
+}
 
 /* returns the first set bit after begin (if not ~0U). If no bit is set after
  * begin returns ~0U
diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h
index 33fff7c2..1a9c3f7c 100644
--- a/src/util/arch/x86/bitutils.h
+++ b/src/util/arch/x86/bitutils.h
@@ -239,6 +239,11 @@ u64a expand64_impl(u64a x, u64a m) {
 #endif
 }
 
+static really_inline
+m128 expand128_impl(m128 x, m128 m) {
+    return expand128_impl_c(x, m);
+}
+
 /* returns the first set bit after begin (if not ~0U). If no bit is set after
  * begin returns ~0U
  */
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index 21d35388..68494507 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -135,6 +135,10 @@ u64a expand64(u64a x, u64a m) {
     return expand64_impl(x, m);
 }
 
+static really_inline
+m128 expand128(m128 x, m128 m) {
+    return expand128_impl(x, m);
+}
 
 /* returns the first set bit after begin (if not ~0U). If no bit is set after
  * begin returns ~0U
diff --git a/src/util/state_compress.c b/src/util/state_compress.c
index 5c26f043..66cd4daf 100644
--- a/src/util/state_compress.c
+++ b/src/util/state_compress.c
@@ -162,14 +162,16 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
     u64a ALIGN_ATTR(16) m[2];
     store128(m, mvec);
 
-    u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
+    // Count the number of bits of compressed state we're reading in per
+    // chunk.
+    u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) };
+
     u64a ALIGN_ATTR(16) v[2];
-
     unpack_bits_64(v, (const u8 *)ptr, bits, 2);
+    m128 xvec = load128(v);
 
-    u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };
-
-    return set2x64(x[1], x[0]);
+    // Expand vector
+    return expand128(xvec, mvec);
 }
 #endif
 