diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h
index 514ddc5c..0b579dc9 100644
--- a/src/util/arch/arm/bitutils.h
+++ b/src/util/arch/arm/bitutils.h
@@ -104,6 +104,32 @@ u64a compress64_impl(u64a x, u64a m) {
     return compress64_impl_c(x, m);
 }
 
+static really_inline
+m128 compress128_impl(m128 x, m128 m) {
+    x = and128(x, m); // clear irrelevant bits
+
+    // Return zero quickly on trivial cases
+    if (diff128(x, zeroes128()) == 0) {
+        return zeroes128();
+    }
+
+    // Bit-by-bit compress, run on both 64-bit lanes at once: walk the set
+    // bits of m from lowest to highest and, wherever the corresponding bit
+    // of x is set, deposit the next result bit.
+    m128 one = set1_2x64(1);
+    m128 bitset = one;
+    m128 vres = zeroes128();
+    while (isnonzero128(m)) {
+        m128 mm = sub_2x64(zeroes128(), m);     // -m isolates the lowest set bit
+        m128 tv = and128(and128(x, m), mm);     // x & m & -m, per lane
+        m128 mask = not128(eq64_m128(tv, zeroes128()));
+        vres = or128(vres, and128(bitset, mask));
+        m = and128(m, sub_2x64(m, one));        // m &= m - 1, per lane
+        bitset = lshift64_m128(bitset, 1);
+    }
+    return vres;
+}
+
 static really_inline
 u32 expand32_impl(u32 x, u32 m) {
     return expand32_impl_c(x, m);
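The NEON loop above is the lane-wise analogue of the scalar bit-by-bit compress introduced in the common code below. A minimal standalone sketch of that scalar loop for reference, with uint64_t standing in for the library's u64a type and compress64_ref as a hypothetical local name; the loop body is lifted verbatim from the patch:

#include <assert.h>
#include <stdint.h>

// bits of x selected by m are packed into the low bits of the result
static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0;
    for (uint64_t bb = 1; m != 0; bb += bb) {
        if (x & m & -m) { res |= bb; } // lowest set bit of m also set in x?
        m &= (m - 1);                  // clear the lowest set bit of m
    }
    return res;
}

int main(void) {
    // the high nibble of every masked byte survives, packed contiguously
    assert(compress64_ref(0xF0F0F0F0F0F0F0F0ULL, 0xFF00FF00FF00FF00ULL)
           == 0xF0F0F0F0ULL);
    return 0;
}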
diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h
index e86b8d44..88e71bba 100644
--- a/src/util/arch/common/bitutils.h
+++ b/src/util/arch/common/bitutils.h
@@ -35,6 +35,7 @@
 
 #include "util/popcount.h"
 #include "util/unaligned.h"
+#include "util/simd_utils.h"
 
 static really_inline
 u32 clz32_impl_c(u32 x) {
@@ -177,7 +178,13 @@ u32 compress32_impl_c(u32 x, u32 m) {
 
 static really_inline
 u64a compress64_impl_c(u64a x, u64a m) {
-    // Return zero quickly on trivial cases
+    u64a res = 0;
+    for (u64a bb = 1; m != 0; bb += bb) {
+        if (x & m & -m) { res |= bb; }
+        m &= (m - 1);
+    }
+    return res;
+/* // Return zero quickly on trivial cases
     if ((x & m) == 0) {
         return 0;
     }
@@ -202,7 +209,20 @@ u64a compress64_impl_c(u64a x, u64a m) {
         mk = mk & ~mp;
     }
 
-    return x;
+    return x;*/
+}
+
+static really_inline
+m128 compress128_impl_c(m128 xvec, m128 mvec) {
+    u64a ALIGN_ATTR(16) x[2];
+    u64a ALIGN_ATTR(16) m[2];
+    store128(x, xvec);
+    store128(m, mvec);
+
+    x[0] = compress64_impl_c(x[0], m[0]);
+    x[1] = compress64_impl_c(x[1], m[1]);
+
+    return load128(x);
+}
 
 static really_inline
@@ -242,7 +262,13 @@ u32 expand32_impl_c(u32 x, u32 m) {
 
 static really_inline
 u64a expand64_impl_c(u64a x, u64a m) {
-    // Return zero quickly on trivial cases
+    u64a res = 0;
+    for (u64a bb = 1; m != 0; bb += bb) {
+        if (x & bb) { res |= m & (-m); }
+        m &= (m - 1);
+    }
+    return res;
+/* // Return zero quickly on trivial cases
     if (!x || !m) {
         return 0;
     }
@@ -272,7 +298,7 @@ u64a expand64_impl_c(u64a x, u64a m) {
         x = (x & ~mv) | (t & mv);
     }
 
-    return x & m0; // clear out extraneous bits
+    return x & m0; // clear out extraneous bits*/
 }
diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h
index ec4c95ad..a0769a5e 100644
--- a/src/util/arch/x86/bitutils.h
+++ b/src/util/arch/x86/bitutils.h
@@ -214,6 +214,11 @@ u64a compress64_impl(u64a x, u64a m) {
 #endif
 }
 
+static really_inline
+m128 compress128_impl(m128 x, m128 m) {
+    return compress128_impl_c(x, m);
+}
+
 static really_inline
 u32 expand32_impl(u32 x, u32 m) {
 #if defined(HAVE_BMI2)
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index 556ba818..21d35388 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -120,6 +120,11 @@ u64a compress64(u64a x, u64a m) {
     return compress64_impl(x, m);
 }
 
+static really_inline
+m128 compress128(m128 x, m128 m) {
+    return compress128_impl(x, m);
+}
+
 static really_inline
 u32 expand32(u32 x, u32 m) {
     return expand32_impl(x, m);
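For completeness, a usage sketch of the new wrapper. This is a hypothetical self-test, not part of the patch, assuming the util headers are on the include path and that load128 is available alongside the store128/ALIGN_ATTR helpers used above; compress128 should agree with compress64 applied to each 64-bit lane independently:

#include <assert.h>

#include "util/bitutils.h"
#include "util/simd_utils.h"

static void compress128_selftest(void) {
    u64a ALIGN_ATTR(16) xv[2] = { 0xF0F0F0F0F0F0F0F0ULL, 0x123456789ABCDEF0ULL };
    u64a ALIGN_ATTR(16) mv[2] = { 0xFF00FF00FF00FF00ULL, 0x00000000FFFFFFFFULL };
    u64a ALIGN_ATTR(16) out[2];

    // pack the m-selected bits of each 64-bit lane into that lane's low bits
    store128(out, compress128(load128(xv), load128(mv)));

    // the vector wrapper must match the scalar routine lane by lane
    assert(out[0] == compress64(xv[0], mv[0]));
    assert(out[1] == compress64(xv[1], mv[1]));
}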