diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 5d47768d..0f14852d 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -49,12 +49,8 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, if (!l) { return HWLM_SUCCESS; } - m128 v = zeroes128(); - // we don't have a clever way of doing this move yet - memcpy(&v, d, l); - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? caseMask : ones128(); + m128 v = and128(loadu128(d), mask128); // mask out where we can't match u32 mask = (0xFFFF >> (16 - l)); @@ -76,11 +72,8 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); const size_t l = end - start; - m128 v = loadu128(d); - - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? caseMask : ones128(); + m128 v = and128(loadu128(d), mask128); u32 buf_off = start - offset; u32 mask = ((1 << l) - 1) << buf_off; @@ -109,11 +102,8 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, assert(l <= 32); DEBUG_PRINTF("d %zu\n", d - buf); - m128 v = zeroes128(); - memcpy(&v, d, l); - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? caseMask : ones128(); + m128 v = and128(loadu128(d), mask128); u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), eq128(mask2, v))); @@ -137,11 +127,8 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); size_t l = end - start; - m128 v = loadu128(d); - - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? caseMask : ones128(); + m128 v = and128(loadu128(d), mask128); u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), eq128(mask2, v))); @@ -164,9 +151,10 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, size_t end) { const u8 *d = buf + start, *e = buf + end; assert(d < e); + m128 mask128 = noCase ? caseMask : ones128(); for (; d < e; d += 16) { - m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + m128 v = and128(load128(d), mask128); u32 z = movemask128(eq128(mask1, v)); @@ -186,9 +174,10 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, const u8 *d = buf + start, *e = buf + end; assert(d < e); m128 lastz1 = zeroes128(); + m128 mask128 = noCase ? caseMask : ones128(); for (; d < e; d += 16) { - m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + m128 v = and128(load128(d), mask128); m128 z1 = eq128(mask1, v); m128 z2 = eq128(mask2, v); u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2)); diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index ddca35c9..498db568 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -82,11 +82,7 @@ u32 findAndClearLSB_64_impl(u64a *v) { static really_inline u32 findAndClearMSB_32_impl(u32 *v) { - u32 val = *v; - u32 offset = 31 - clz32_impl(val); - *v = val & ~(1 << offset); - assert(offset < 32); - return offset; + return findAndClearMSB_32_impl_c(v); } static really_inline @@ -107,20 +103,19 @@ u64a compress64_impl(u64a x, u64a m) { static really_inline m128 compress128_impl(m128 x, m128 m) { m128 one = set1_2x64(1); - m128 bitset = one; - m128 vres = zeroes128(); + m128 bb = one; + m128 res = zeroes128(); while (isnonzero128(m)) { - m128 mm = sub_2x64(zeroes128(), m); - m128 tv = and128(x, m); - tv = and128(tv, mm); - - m128 mask = not128(eq64_m128(tv, zeroes128())); - mask = vandq_s64(bitset, mask); - vres = or128(vres, mask); - m = and128(m, sub_2x64(m, one)); - bitset = lshift64_m128(bitset, 1); + m128 mm = sub_2x64(zeroes128(), m); + m128 xm = and128(x, m); + xm = and128(xm, mm); + + m128 mask = not128(eq64_m128(xm, zeroes128())); + res = or128(res, and128(bb, mask)); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); } - return vres; + return res; } static really_inline @@ -136,20 +131,18 @@ u64a expand64_impl(u64a x, u64a m) { static really_inline m128 expand128_impl(m128 x, m128 m) { m128 one = set1_2x64(1); - m128 bitset = one; - m128 vres = zeroes128(); + m128 bb = one; + m128 res = zeroes128(); while (isnonzero128(m)) { - m128 tv = and128(x, m); - - m128 mm = sub_2x64(zeroes128(), m); - m128 mask = not128(eq64_m128(tv, zeroes128())); - mask = vandq_s64(bitset, mask); - mask = and128(mask, mm); - vres = or128(vres, mask); - m = and128(m, sub_2x64(m, one)); - bitset = lshift64_m128(bitset, 1); + m128 xm = and128(x, bb); + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(xm, zeroes128())); + mask = and128(mask, and128(m, mm)); + res = or128(res, mask); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); } - return vres; + return res; } /* returns the first set bit after begin (if not ~0U). If no bit is set after diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index f3215fb2..8cf00025 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -62,7 +62,7 @@ static really_inline int diff128(m128 a, m128 b) { } static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); + return diff128(a, zeroes128()); } /** @@ -121,7 +121,6 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); } - static really_inline u32 movemask128(m128 a) { static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; @@ -311,22 +310,28 @@ m128 palignr(m128 r, m128 l, int offset) { static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } return palignr(zeroes128(), a, b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } return palignr(a, zeroes128(), 16 - b); } static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); - static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; - const uint8x16_t outside_mask = set1_16x8(0xf0); - - m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); - return vqtbl1q_s8(in, shift_mask); + if (amount < 0) { + return palignr_imm(zeroes128(), in, -amount); + } else { + return palignr_imm(in, zeroes128(), 16 - amount); + } } #ifdef __cplusplus diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index e0073fad..d8499ea2 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -72,7 +72,7 @@ static inline void print_m128_4x32(const char *label, m128 vector) { printf("\n"); } -static inline void print_m128_2x64(char *label, m128 vector) { +static inline void print_m128_2x64(const char *label, m128 vector) { uint64_t ALIGN_ATTR(16) data[2]; store128(data, vector); DEBUG_PRINTF("%s: ", label); diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp index 3f788544..8af8f9a4 100644 --- a/unit/internal/bitutils.cpp +++ b/unit/internal/bitutils.cpp @@ -294,6 +294,39 @@ TEST(BitUtils, compress64) { } } +TEST(BitUtils, compress128) { + const m128 all_zeroes = zeroes128(); + const m128 all_ones = ones128(); + const m128 odd_bits = set1_2x64(0x5555555555555555ull); + const m128 even_bits = set1_2x64(0xaaaaaaaaaaaaaaaaull); + + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, all_zeroes))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, set1_4x32(1)))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, all_ones))); + EXPECT_EQ(0, diff128(all_ones, compress128(all_ones, all_ones))); + EXPECT_EQ(0, diff128(set1_2x64(0xffffffffull), compress128(odd_bits, odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(0xffffffffull), compress128(even_bits, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(odd_bits, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(even_bits, odd_bits))); + + // Some single-bit tests. + for (u32 i = 0; i < 64; i++) { + const m128 one_bit = set1_2x64(1ull << i); + + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, one_bit))); + EXPECT_EQ(0, diff128(set1_2x64(1ull), compress128(one_bit, one_bit))); + EXPECT_EQ(0, diff128(one_bit, compress128(one_bit, all_ones))); + + if (i % 2) { + EXPECT_EQ(0, diff128(set1_2x64(1ull << (i / 2)), compress128(one_bit, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(one_bit, odd_bits))); + } else { + EXPECT_EQ(0, diff128(set1_2x64(1ull << (i / 2)), compress128(one_bit, odd_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(one_bit, even_bits))); + } + } +} + TEST(BitUtils, expand32) { const u32 all_ones = 0xffffffffu; const u32 odd_bits = 0x55555555u; @@ -352,6 +385,35 @@ TEST(BitUtils, expand64) { } } +TEST(BitUtils, expand128) { + const m128 all_zeroes = zeroes128(); + const m128 all_ones = ones128(); + const m128 odd_bits = set1_2x64(0x5555555555555555ull); + const m128 even_bits = set1_2x64(0xaaaaaaaaaaaaaaaaull); + + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, all_zeroes))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, set1_2x64(1ull)))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, all_ones))); + EXPECT_EQ(0, diff128(all_ones, expand128(all_ones, all_ones))); + EXPECT_EQ(0, diff128(odd_bits, expand128(set1_2x64(0xffffffffull), odd_bits))); + EXPECT_EQ(0, diff128(even_bits, expand128(set1_2x64(0xffffffffull), even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(set1_2x64(0xffffffff00000000ull), even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(set1_2x64(0xffffffff00000000ull), odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(1u), expand128(set1_2x64(1u), odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(2u), expand128(set1_2x64(1u), even_bits))); + + // Some single-bit tests. + for (u32 i = 0; i < 64; i++) { + const m128 one_bit = set1_2x64(1ull << i); + + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, one_bit))); + EXPECT_EQ(0, diff128(one_bit, expand128(set1_2x64(1ull), one_bit))); + EXPECT_EQ(0, diff128(one_bit, expand128(one_bit, all_ones))); + + EXPECT_EQ(0, diff128(one_bit, expand128(set1_2x64(1ull << (i / 2)), i % 2 ? even_bits : odd_bits))); + } +} + TEST(BitUtils, bf_op_1) { u64a a = 0; for (u32 i = 0; i < 64; i++) {