From 6a11c83630536ebaed0c1ed53ef531cffafa04fb Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 15 Jan 2021 17:33:41 +0200 Subject: [PATCH 01/13] add expand128() implementation for NEON --- src/util/arch/arm/bitutils.h | 22 ++++++++++++++++++++-- src/util/arch/common/bitutils.h | 12 ++++++++++++ src/util/arch/x86/bitutils.h | 5 +++++ src/util/bitutils.h | 4 ++++ src/util/state_compress.c | 12 +++++++----- 5 files changed, 48 insertions(+), 7 deletions(-) diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index 1d1e0167..ddca35c9 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -106,7 +106,6 @@ u64a compress64_impl(u64a x, u64a m) { static really_inline m128 compress128_impl(m128 x, m128 m) { - m128 one = set1_2x64(1); m128 bitset = one; m128 vres = zeroes128(); @@ -118,7 +117,7 @@ m128 compress128_impl(m128 x, m128 m) { m128 mask = not128(eq64_m128(tv, zeroes128())); mask = vandq_s64(bitset, mask); vres = or128(vres, mask); - m = and128(m, sub_2x64(m, set1_2x64(1))); + m = and128(m, sub_2x64(m, one)); bitset = lshift64_m128(bitset, 1); } return vres; @@ -134,6 +133,25 @@ u64a expand64_impl(u64a x, u64a m) { return expand64_impl_c(x, m); } +static really_inline +m128 expand128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 tv = and128(x, m); + + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = vandq_s64(bitset, mask); + mask = and128(mask, mm); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + /* returns the first set bit after begin (if not ~0U). If no bit is set after * begin returns ~0U */ diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index 88e71bba..723e4a18 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -301,6 +301,18 @@ u64a expand64_impl_c(u64a x, u64a m) { return x & m0; // clear out extraneous bits*/ } +static really_inline +m128 expand128_impl_c(m128 xvec, m128 mvec) { + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(x, xvec); + store128(m, mvec); + + expand64_impl_c(x[0], m[0]); + expand64_impl_c(x[1], m[1]); + + return xvec; +} /* returns the first set bit after begin (if not ~0U). If no bit is set after * begin returns ~0U diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index 33fff7c2..1a9c3f7c 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -239,6 +239,11 @@ u64a expand64_impl(u64a x, u64a m) { #endif } +static really_inline +m128 expand128_impl(m128 x, m128 m) { + return expand128_impl_c(x, m); +} + /* returns the first set bit after begin (if not ~0U). If no bit is set after * begin returns ~0U */ diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 21d35388..68494507 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -135,6 +135,10 @@ u64a expand64(u64a x, u64a m) { return expand64_impl(x, m); } +static really_inline +m128 expand128(m128 x, m128 m) { + return expand128_impl(x, m); +} /* returns the first set bit after begin (if not ~0U). 
If no bit is set after * begin returns ~0U diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 5c26f043..66cd4daf 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -162,14 +162,16 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { u64a ALIGN_ATTR(16) m[2]; store128(m, mvec); - u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u64a ALIGN_ATTR(16) v[2]; - unpack_bits_64(v, (const u8 *)ptr, bits, 2); + m128 xvec = load128(v); - u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - - return set2x64(x[1], x[0]); + // Expand vector + return expand128(xvec, mvec); } #endif From ef9bf02d006c9510fa6edfe6ff76141a8e5ac021 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 15 Jan 2021 17:35:01 +0200 Subject: [PATCH 02/13] add some useful intrinsics --- src/util/arch/arm/simd_utils.h | 16 ++++++++++++++-- src/util/arch/common/simd_utils.h | 22 ++++++++++++++++++---- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index f7b92e70..dcf3fe58 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -202,6 +202,18 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) { #endif } +static really_inline m128 low64from128(const m128 in) { + return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 high64from128(const m128 in) { + return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 add128(m128 a, m128 b) { + return (m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b); +} + static really_inline m128 and128(m128 a, m128 b) { return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); } @@ -381,13 +393,13 @@ m128 sub_u8_m128(m128 a, m128 b) { static really_inline m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { - uint32_t __attribute__((aligned(16))) data[4] = { x0, x1, x2, x3 }; + uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; return (m128) vld1q_u32((uint32_t *) data); } static really_inline m128 set2x64(u64a hi, u64a lo) { - uint64_t __attribute__((aligned(16))) data[2] = { lo, hi }; + uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; return (m128) vld1q_u64((uint64_t *) data); } diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 0c67ee94..b20becdc 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -46,7 +46,7 @@ #ifdef DEBUG static inline void print_m128_16x8(char *label, m128 vector) { - uint8_t __attribute__((aligned(16))) data[16]; + uint8_t ALIGN_ATTR(16) data[16]; store128(data, vector); DEBUG_PRINTF("%s: ", label); for(int i=0; i < 16; i++) @@ -55,7 +55,7 @@ static inline void print_m128_16x8(char *label, m128 vector) { } static inline void print_m128_8x16(char *label, m128 vector) { - uint16_t __attribute__((aligned(16))) data[8]; + uint16_t ALIGN_ATTR(16) data[8]; store128(data, vector); DEBUG_PRINTF("%s: ", label); for(int i=0; i < 8; i++) @@ -64,7 +64,7 @@ static inline void print_m128_8x16(char *label, m128 vector) { } static inline void print_m128_4x32(char *label, m128 vector) { - uint32_t __attribute__((aligned(16))) data[4]; + uint32_t ALIGN_ATTR(16) data[4]; store128(data, vector); DEBUG_PRINTF("%s: ", label); for(int i=0; i < 4; i++) @@ -73,7 +73,7 @@ static inline void print_m128_4x32(char *label, m128 vector) 
{ } static inline void print_m128_2x64(char *label, m128 vector) { - uint64_t __attribute__((aligned(16))) data[2]; + uint64_t ALIGN_ATTR(16) data[2]; store128(data, vector); DEBUG_PRINTF("%s: ", label); for(int i=0; i < 2; i++) @@ -146,6 +146,13 @@ static really_inline m256 ones256(void) { return rv; } +static really_inline m256 add256(m256 a, m256 b) { + m256 rv; + rv.lo = add128(a.lo, b.lo); + rv.hi = add128(a.hi, b.hi); + return rv; +} + static really_inline m256 and256(m256 a, m256 b) { m256 rv; rv.lo = and128(a.lo, b.lo); @@ -585,6 +592,13 @@ m512 set1_4x128(m128 a) { return rv; } +static really_inline +m512 add512(m512 a, m512 b) { + m512 rv; + rv.lo = add256(a.lo, b.lo); + rv.hi = add256(a.hi, b.hi); + return rv; +} static really_inline m512 and512(m512 a, m512 b) { From fc4338eca0335749a2899dd2b131e4aeeb8a348a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 15 Jan 2021 17:35:21 +0200 Subject: [PATCH 03/13] fix compilation on non-x86 --- unit/internal/masked_move.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unit/internal/masked_move.cpp b/unit/internal/masked_move.cpp index 7bd78c50..1b7a2cf1 100644 --- a/unit/internal/masked_move.cpp +++ b/unit/internal/masked_move.cpp @@ -32,7 +32,9 @@ #include "gtest/gtest.h" #include "util/arch.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/masked_move.h" +#endif namespace { From 94739756b417223aaf0bb1103c7178a5c530f1c3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 15 Jan 2021 17:42:11 +0200 Subject: [PATCH 04/13] borrow cache prefetching tricks from the Marvell port, seem to improve performance by 5-28% --- src/fdr/fdr.c | 17 +++++++++++------ src/nfa/mcclellan.c | 10 ++++++++++ src/nfa/mcsheng.c | 10 ++++++++++ src/nfa/shufti.c | 7 ++++++- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 1a3b7003..372a78b1 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -147,6 +147,7 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach1 = andn(domain_mask_flipped, itPtr + 1); u64a reach2 = andn(domain_mask_flipped, itPtr + 2); @@ -184,17 +185,16 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st0 = or128(st0, st4); *s = or128(*s, st0); - *conf0 = movq(*s); + *conf0 = movq(*s) ^ ~0ULL; *s = rshiftbyte_m128(*s, 8); - *conf0 ^= ~0ULL; u64a reach8 = andn(domain_mask_flipped, itPtr + 8); u64a reach9 = andn(domain_mask_flipped, itPtr + 9); u64a reach10 = andn(domain_mask_flipped, itPtr + 10); u64a reach11 = andn(domain_mask_flipped, itPtr + 11); - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); m128 st10 = load_m128_from_u64a(ft + reach10); m128 st11 = load_m128_from_u64a(ft + reach11); @@ -225,9 +225,8 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st8 = or128(st8, st12); *s = or128(*s, st8); - *conf8 = movq(*s); + *conf8 = movq(*s) ^ ~0ULL; *s = rshiftbyte_m128(*s, 8); - *conf8 ^= ~0ULL; } static really_inline @@ -235,6 +234,7 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr 
>= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach2 = andn(domain_mask_flipped, itPtr + 2); u64a reach4 = andn(domain_mask_flipped, itPtr + 4); @@ -287,6 +287,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach4 = andn(domain_mask_flipped, itPtr + 4); u64a reach8 = andn(domain_mask_flipped, itPtr + 8); @@ -683,6 +684,10 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, const u8 *tryFloodDetect = zz->floodPtr; \ const u8 *start_ptr = zz->start; \ const u8 *end_ptr = zz->end; \ + for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr; \ + itPtr += 4*ITER_BYTES) { \ + __builtin_prefetch(itPtr); \ + } \ \ for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ itPtr += ITER_BYTES) { \ diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c index 71f71e32..5ac0615a 100644 --- a/src/nfa/mcclellan.c +++ b/src/nfa/mcclellan.c @@ -634,6 +634,11 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, assert(ISALIGNED_N(q->state, 2)); u32 s = *(u16 *)q->state; + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + if (q->report_current) { assert(s); assert(get_aux(m, s)->accept); @@ -790,6 +795,11 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, u32 s = *(u8 *)q->state; + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + if (q->report_current) { assert(s); assert(s >= m->accept_limit_8); diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index dd00617e..fe67102b 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -889,6 +889,11 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_ALIVE; } + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + while (1) { assert(q->cur < q->end); s64a ep = q->items[q->cur].location; @@ -1017,6 +1022,11 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_ALIVE; } + __builtin_prefetch(&m->remap[0]); + __builtin_prefetch(&m->remap[64]); + __builtin_prefetch(&m->remap[128]); + __builtin_prefetch(&m->remap[192]); + while (1) { DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : q->items[q->cur].type == MQE_END ? "END" : "???", diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index e76dcca8..f1f2befc 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -109,7 +109,8 @@ DUMP_MSK(128) #endif #define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) +#define GET_HI_4(chars) and128(rshift64_m128(chars, 4), low4bits) +//#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) static really_inline u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, @@ -177,6 +178,10 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, // Reroll FTW. 
const u8 *last_block = buf_end - 16; + + for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) { + __builtin_prefetch(itPtr); + } while (buf < last_block) { m128 lchars = load128(buf); rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); From 9bf5cac782d7fa73d2915baf60657f72b79c9611 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 18 Jan 2021 13:00:45 +0200 Subject: [PATCH 05/13] replace andn() by explicit bitops and group loads/stores, gives ~1% gain --- src/fdr/fdr.c | 104 ++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 46 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 372a78b1..356cc3e6 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -148,25 +148,58 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach1 = andn(domain_mask_flipped, itPtr + 1); - u64a reach2 = andn(domain_mask_flipped, itPtr + 2); - u64a reach3 = andn(domain_mask_flipped, itPtr + 3); + u64a ALIGN_ATTR(16) ptr[16]; + ptr[0] = unaligned_load_u32(itPtr + 0); + ptr[1] = unaligned_load_u32(itPtr + 1); + ptr[2] = unaligned_load_u32(itPtr + 2); + ptr[3] = unaligned_load_u32(itPtr + 3); + ptr[4] = unaligned_load_u32(itPtr + 4); + ptr[5] = unaligned_load_u32(itPtr + 5); + ptr[6] = unaligned_load_u32(itPtr + 6); + ptr[7] = unaligned_load_u32(itPtr + 7); + ptr[8] = unaligned_load_u32(itPtr + 8); + ptr[9] = unaligned_load_u32(itPtr + 9); + ptr[10] = unaligned_load_u32(itPtr + 10); + ptr[11] = unaligned_load_u32(itPtr + 11); + ptr[12] = unaligned_load_u32(itPtr + 12); + ptr[13] = unaligned_load_u32(itPtr + 13); + ptr[14] = unaligned_load_u32(itPtr + 14); + ptr[15] = unaligned_load_u32(itPtr + 15); + + u64a mask_not = ~domain_mask_flipped; + u64a reach0 = mask_not & ptr[0]; + u64a reach1 = mask_not & ptr[1]; + u64a reach2 = mask_not & ptr[2]; + u64a reach3 = mask_not & ptr[3]; + u64a reach4 = mask_not & ptr[4]; + u64a reach5 = mask_not & ptr[5]; + u64a reach6 = mask_not & ptr[6]; + u64a reach7 = mask_not & ptr[7]; + u64a reach8 = mask_not & ptr[8]; + u64a reach9 = mask_not & ptr[9]; + u64a reach10 = mask_not & ptr[10]; + u64a reach11 = mask_not & ptr[11]; + u64a reach12 = mask_not & ptr[12]; + u64a reach13 = mask_not & ptr[13]; + u64a reach14 = mask_not & ptr[14]; + u64a reach15 = mask_not & ptr[15]; m128 st0 = load_m128_from_u64a(ft + reach0); m128 st1 = load_m128_from_u64a(ft + reach1); m128 st2 = load_m128_from_u64a(ft + reach2); m128 st3 = load_m128_from_u64a(ft + reach3); - - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach5 = andn(domain_mask_flipped, itPtr + 5); - u64a reach6 = andn(domain_mask_flipped, itPtr + 6); - u64a reach7 = andn(domain_mask_flipped, itPtr + 7); - m128 st4 = load_m128_from_u64a(ft + reach4); m128 st5 = load_m128_from_u64a(ft + reach5); m128 st6 = load_m128_from_u64a(ft + reach6); m128 st7 = load_m128_from_u64a(ft + reach7); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st11 = load_m128_from_u64a(ft + reach11); + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st14 = load_m128_from_u64a(ft + reach14); + m128 st15 = load_m128_from_u64a(ft + reach15); st1 = lshiftbyte_m128(st1, 1); st2 = lshiftbyte_m128(st2, 2); @@ -175,39 +208,6 @@ void 
get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st5 = lshiftbyte_m128(st5, 5); st6 = lshiftbyte_m128(st6, 6); st7 = lshiftbyte_m128(st7, 7); - - st0 = or128(st0, st1); - st2 = or128(st2, st3); - st4 = or128(st4, st5); - st6 = or128(st6, st7); - st0 = or128(st0, st2); - st4 = or128(st4, st6); - st0 = or128(st0, st4); - *s = or128(*s, st0); - - *conf0 = movq(*s) ^ ~0ULL; - *s = rshiftbyte_m128(*s, 8); - - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach9 = andn(domain_mask_flipped, itPtr + 9); - u64a reach10 = andn(domain_mask_flipped, itPtr + 10); - u64a reach11 = andn(domain_mask_flipped, itPtr + 11); - - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = load_m128_from_u64a(ft + reach9); - m128 st10 = load_m128_from_u64a(ft + reach10); - m128 st11 = load_m128_from_u64a(ft + reach11); - - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); - u64a reach13 = andn(domain_mask_flipped, itPtr + 13); - u64a reach14 = andn(domain_mask_flipped, itPtr + 14); - u64a reach15 = andn(domain_mask_flipped, itPtr + 15); - - m128 st12 = load_m128_from_u64a(ft + reach12); - m128 st13 = load_m128_from_u64a(ft + reach13); - m128 st14 = load_m128_from_u64a(ft + reach14); - m128 st15 = load_m128_from_u64a(ft + reach15); - st9 = lshiftbyte_m128(st9, 1); st10 = lshiftbyte_m128(st10, 2); st11 = lshiftbyte_m128(st11, 3); @@ -216,6 +216,14 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st14 = lshiftbyte_m128(st14, 6); st15 = lshiftbyte_m128(st15, 7); + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + st8 = or128(st8, st9); st10 = or128(st10, st11); st12 = or128(st12, st13); @@ -223,10 +231,14 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st8 = or128(st8, st10); st12 = or128(st12, st14); st8 = or128(st8, st12); - *s = or128(*s, st8); - *conf8 = movq(*s) ^ ~0ULL; - *s = rshiftbyte_m128(*s, 8); + m128 st = or128(*s, st0); + *conf0 = movq(st) ^ ~0ULL; + st = rshiftbyte_m128(st, 8); + st = or128(st, st8); + + *conf8 = movq(st) ^ ~0ULL; + *s = rshiftbyte_m128(st, 8); } static really_inline From dfba9227e930b05f614ac8807d9657aa7f90a786 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 22 Jan 2021 10:11:20 +0200 Subject: [PATCH 06/13] fix non-const char * write-strings compile error --- src/util/arch/common/simd_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index b20becdc..e0073fad 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -45,7 +45,7 @@ #endif // HAVE_SIMD_128_BITS #ifdef DEBUG -static inline void print_m128_16x8(char *label, m128 vector) { +static inline void print_m128_16x8(const char *label, m128 vector) { uint8_t ALIGN_ATTR(16) data[16]; store128(data, vector); DEBUG_PRINTF("%s: ", label); @@ -54,7 +54,7 @@ static inline void print_m128_16x8(char *label, m128 vector) { printf("\n"); } -static inline void print_m128_8x16(char *label, m128 vector) { +static inline void print_m128_8x16(const char *label, m128 vector) { uint16_t ALIGN_ATTR(16) data[8]; store128(data, vector); DEBUG_PRINTF("%s: ", label); @@ -63,7 +63,7 @@ static inline void print_m128_8x16(char *label, m128 vector) { printf("\n"); } -static inline void print_m128_4x32(char *label, m128 vector) { +static inline void print_m128_4x32(const char *label, m128 vector) { uint32_t 
ALIGN_ATTR(16) data[4]; store128(data, vector); DEBUG_PRINTF("%s: ", label); From f9ef98ce19cfc9f71580a0de7149ef2674756a9b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 22 Jan 2021 10:13:19 +0200 Subject: [PATCH 07/13] remove loads from movemask128, variable_byte_shift, add palignr_imm(), minor fixes --- src/util/arch/arm/simd_utils.h | 53 ++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index dcf3fe58..f3215fb2 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -121,16 +121,18 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); } + static really_inline u32 movemask128(m128 a) { static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; // Compute the mask from the input - uint64x2_t mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); + mask = vorrq_u8(mask, mask1); // Get the resulting bytes uint16_t output; - vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0); - vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8); + vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); return output; } @@ -233,14 +235,12 @@ static really_inline m128 andnot128(m128 a, m128 b) { // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); return (m128) vld1q_s32((const int32_t *)ptr); } // aligned store static really_inline void store128(void *ptr, m128 a) { assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); vst1q_s32((int32_t *)ptr, a); } @@ -270,22 +270,13 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return vqtbl1q_s8(in, shift_mask); -} #define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; -static really_inline -m128 palignr(m128 r, m128 l, int offset) { -#if defined(HS_OPTIMIZE) - return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); -#else +static really_really_inline +m128 palignr_imm(m128 r, m128 l, int offset) { switch (offset) { - CASE_ALIGN_VECTORS(l, r, 0); + case 0: return l; break; CASE_ALIGN_VECTORS(l, r, 1); CASE_ALIGN_VECTORS(l, r, 2); CASE_ALIGN_VECTORS(l, r, 3); @@ -301,30 +292,42 @@ m128 palignr(m128 r, m128 l, int offset) { CASE_ALIGN_VECTORS(l, r, 13); CASE_ALIGN_VECTORS(l, r, 14); CASE_ALIGN_VECTORS(l, r, 15); + case 16: return r; break; default: return zeroes128(); break; } +} + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HS_OPTIMIZE) + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + return palignr_imm(r, l, offset); #endif } #undef CASE_ALIGN_VECTORS static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - if (b) - return palignr(zeroes128(), a, b); - else - return a; + return palignr(zeroes128(), a, b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - if (b) - return palignr(a, zeroes128(), 16 - b); - else - return a; + return palignr(a, zeroes128(), 16 - b); } 
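/* Illustration only, not part of the original patch: vextq_s8(l, r, n)
 * returns 16 bytes starting at byte n of the 32-byte concatenation [l | r]
 * (l in the low half), so the helpers above map directly onto the SSE byte
 * shifts, e.g.
 *
 *     rshiftbyte_m128(a, 3) == palignr(zeroes128(), a, 3)
 *                           == vextq_s8(a, zeroes128(), 3)    // _mm_srli_si128(a, 3)
 *     lshiftbyte_m128(a, 3) == palignr(a, zeroes128(), 13)
 *                           == vextq_s8(zeroes128(), a, 13)   // _mm_slli_si128(a, 3)
 *
 * The switch in palignr_imm() exists because the vext intrinsics take an
 * immediate offset; it lets a runtime offset be used when not building with
 * HS_OPTIMIZE. The variable_byte_shift_m128() added below instead builds a
 * byte-index vector and feeds it to vqtbl1q_s8(), which returns zero for any
 * index outside 0..15 -- hence the 0xf0 filler bytes taken from outside_mask. */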
+static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; + const uint8x16_t outside_mask = set1_16x8(0xf0); + + m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); + return vqtbl1q_s8(in, shift_mask); +} #ifdef __cplusplus extern "C" { From c238d627c9c58564196a70395632a714d9b489bd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 22 Jan 2021 10:13:55 +0200 Subject: [PATCH 08/13] optimize get_conf_stride_1() --- src/fdr/fdr.c | 99 ++++++++++++++++++--------------------------------- 1 file changed, 34 insertions(+), 65 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 356cc3e6..715ab684 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -147,74 +147,43 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a domain_mask = ~domain_mask_flipped; - u64a ALIGN_ATTR(16) ptr[16]; - ptr[0] = unaligned_load_u32(itPtr + 0); - ptr[1] = unaligned_load_u32(itPtr + 1); - ptr[2] = unaligned_load_u32(itPtr + 2); - ptr[3] = unaligned_load_u32(itPtr + 3); - ptr[4] = unaligned_load_u32(itPtr + 4); - ptr[5] = unaligned_load_u32(itPtr + 5); - ptr[6] = unaligned_load_u32(itPtr + 6); - ptr[7] = unaligned_load_u32(itPtr + 7); - ptr[8] = unaligned_load_u32(itPtr + 8); - ptr[9] = unaligned_load_u32(itPtr + 9); - ptr[10] = unaligned_load_u32(itPtr + 10); - ptr[11] = unaligned_load_u32(itPtr + 11); - ptr[12] = unaligned_load_u32(itPtr + 12); - ptr[13] = unaligned_load_u32(itPtr + 13); - ptr[14] = unaligned_load_u32(itPtr + 14); - ptr[15] = unaligned_load_u32(itPtr + 15); + u64a it_hi = *(const u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach1 = domain_mask & (it_hi >> 8); + u64a reach2 = domain_mask & (it_hi >> 16); + u64a reach3 = domain_mask & (it_hi >> 24); + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach5 = domain_mask & (it_hi >> 40); + u64a reach6 = domain_mask & (it_hi >> 48); + u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); + u64a reach8 = domain_mask & it_lo; + u64a reach9 = domain_mask & (it_lo >> 8); + u64a reach10 = domain_mask & (it_lo >> 16); + u64a reach11 = domain_mask & (it_lo >> 24); + u64a reach12 = domain_mask & (it_lo >> 32); + u64a reach13 = domain_mask & (it_lo >> 40); + u64a reach14 = domain_mask & (it_lo >> 48); + u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); - u64a mask_not = ~domain_mask_flipped; - u64a reach0 = mask_not & ptr[0]; - u64a reach1 = mask_not & ptr[1]; - u64a reach2 = mask_not & ptr[2]; - u64a reach3 = mask_not & ptr[3]; - u64a reach4 = mask_not & ptr[4]; - u64a reach5 = mask_not & ptr[5]; - u64a reach6 = mask_not & ptr[6]; - u64a reach7 = mask_not & ptr[7]; - u64a reach8 = mask_not & ptr[8]; - u64a reach9 = mask_not & ptr[9]; - u64a reach10 = mask_not & ptr[10]; - u64a reach11 = mask_not & ptr[11]; - u64a reach12 = mask_not & ptr[12]; - u64a reach13 = mask_not & ptr[13]; - u64a reach14 = mask_not & ptr[14]; - u64a reach15 = mask_not & ptr[15]; - - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st1 = load_m128_from_u64a(ft + reach1); - m128 st2 = load_m128_from_u64a(ft + reach2); - m128 st3 = load_m128_from_u64a(ft + reach3); - m128 st4 = 
load_m128_from_u64a(ft + reach4); - m128 st5 = load_m128_from_u64a(ft + reach5); - m128 st6 = load_m128_from_u64a(ft + reach6); - m128 st7 = load_m128_from_u64a(ft + reach7); + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1); + m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2); + m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3); + m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4); + m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5); + m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6); + m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7); m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = load_m128_from_u64a(ft + reach9); - m128 st10 = load_m128_from_u64a(ft + reach10); - m128 st11 = load_m128_from_u64a(ft + reach11); - m128 st12 = load_m128_from_u64a(ft + reach12); - m128 st13 = load_m128_from_u64a(ft + reach13); - m128 st14 = load_m128_from_u64a(ft + reach14); - m128 st15 = load_m128_from_u64a(ft + reach15); - - st1 = lshiftbyte_m128(st1, 1); - st2 = lshiftbyte_m128(st2, 2); - st3 = lshiftbyte_m128(st3, 3); - st4 = lshiftbyte_m128(st4, 4); - st5 = lshiftbyte_m128(st5, 5); - st6 = lshiftbyte_m128(st6, 6); - st7 = lshiftbyte_m128(st7, 7); - st9 = lshiftbyte_m128(st9, 1); - st10 = lshiftbyte_m128(st10, 2); - st11 = lshiftbyte_m128(st11, 3); - st12 = lshiftbyte_m128(st12, 4); - st13 = lshiftbyte_m128(st13, 5); - st14 = lshiftbyte_m128(st14, 6); - st15 = lshiftbyte_m128(st15, 7); + m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1); + m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2); + m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3); + m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4); + m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5); + m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6); + m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7); st0 = or128(st0, st1); st2 = or128(st2, st3); From d9874898c73d1fda98779b297cce77e408ed729c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 8 Feb 2021 19:19:52 +0200 Subject: [PATCH 09/13] make const --- src/util/arch/common/simd_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index e0073fad..d8499ea2 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -72,7 +72,7 @@ static inline void print_m128_4x32(const char *label, m128 vector) { printf("\n"); } -static inline void print_m128_2x64(char *label, m128 vector) { +static inline void print_m128_2x64(const char *label, m128 vector) { uint64_t ALIGN_ATTR(16) data[2]; store128(data, vector); DEBUG_PRINTF("%s: ", label); From f541f754005aefbd9d3470f2c078eacaecfc6598 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 8 Feb 2021 19:20:37 +0200 Subject: [PATCH 10/13] bugfix compress128/expand128, add unit tests --- src/util/arch/arm/bitutils.h | 51 +++++++++++++---------------- unit/internal/bitutils.cpp | 62 ++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 29 deletions(-) diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index ddca35c9..498db568 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -82,11 +82,7 @@ u32 findAndClearLSB_64_impl(u64a *v) { static really_inline u32 findAndClearMSB_32_impl(u32 *v) { - u32 
val = *v; - u32 offset = 31 - clz32_impl(val); - *v = val & ~(1 << offset); - assert(offset < 32); - return offset; + return findAndClearMSB_32_impl_c(v); } static really_inline @@ -107,20 +103,19 @@ u64a compress64_impl(u64a x, u64a m) { static really_inline m128 compress128_impl(m128 x, m128 m) { m128 one = set1_2x64(1); - m128 bitset = one; - m128 vres = zeroes128(); + m128 bb = one; + m128 res = zeroes128(); while (isnonzero128(m)) { - m128 mm = sub_2x64(zeroes128(), m); - m128 tv = and128(x, m); - tv = and128(tv, mm); - - m128 mask = not128(eq64_m128(tv, zeroes128())); - mask = vandq_s64(bitset, mask); - vres = or128(vres, mask); - m = and128(m, sub_2x64(m, one)); - bitset = lshift64_m128(bitset, 1); + m128 mm = sub_2x64(zeroes128(), m); + m128 xm = and128(x, m); + xm = and128(xm, mm); + + m128 mask = not128(eq64_m128(xm, zeroes128())); + res = or128(res, and128(bb, mask)); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); } - return vres; + return res; } static really_inline @@ -136,20 +131,18 @@ u64a expand64_impl(u64a x, u64a m) { static really_inline m128 expand128_impl(m128 x, m128 m) { m128 one = set1_2x64(1); - m128 bitset = one; - m128 vres = zeroes128(); + m128 bb = one; + m128 res = zeroes128(); while (isnonzero128(m)) { - m128 tv = and128(x, m); - - m128 mm = sub_2x64(zeroes128(), m); - m128 mask = not128(eq64_m128(tv, zeroes128())); - mask = vandq_s64(bitset, mask); - mask = and128(mask, mm); - vres = or128(vres, mask); - m = and128(m, sub_2x64(m, one)); - bitset = lshift64_m128(bitset, 1); + m128 xm = and128(x, bb); + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(xm, zeroes128())); + mask = and128(mask, and128(m, mm)); + res = or128(res, mask); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); } - return vres; + return res; } /* returns the first set bit after begin (if not ~0U). If no bit is set after diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp index 3f788544..8af8f9a4 100644 --- a/unit/internal/bitutils.cpp +++ b/unit/internal/bitutils.cpp @@ -294,6 +294,39 @@ TEST(BitUtils, compress64) { } } +TEST(BitUtils, compress128) { + const m128 all_zeroes = zeroes128(); + const m128 all_ones = ones128(); + const m128 odd_bits = set1_2x64(0x5555555555555555ull); + const m128 even_bits = set1_2x64(0xaaaaaaaaaaaaaaaaull); + + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, all_zeroes))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, set1_4x32(1)))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, all_ones))); + EXPECT_EQ(0, diff128(all_ones, compress128(all_ones, all_ones))); + EXPECT_EQ(0, diff128(set1_2x64(0xffffffffull), compress128(odd_bits, odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(0xffffffffull), compress128(even_bits, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(odd_bits, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(even_bits, odd_bits))); + + // Some single-bit tests. 
+ for (u32 i = 0; i < 64; i++) { + const m128 one_bit = set1_2x64(1ull << i); + + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, one_bit))); + EXPECT_EQ(0, diff128(set1_2x64(1ull), compress128(one_bit, one_bit))); + EXPECT_EQ(0, diff128(one_bit, compress128(one_bit, all_ones))); + + if (i % 2) { + EXPECT_EQ(0, diff128(set1_2x64(1ull << (i / 2)), compress128(one_bit, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(one_bit, odd_bits))); + } else { + EXPECT_EQ(0, diff128(set1_2x64(1ull << (i / 2)), compress128(one_bit, odd_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(one_bit, even_bits))); + } + } +} + TEST(BitUtils, expand32) { const u32 all_ones = 0xffffffffu; const u32 odd_bits = 0x55555555u; @@ -352,6 +385,35 @@ TEST(BitUtils, expand64) { } } +TEST(BitUtils, expand128) { + const m128 all_zeroes = zeroes128(); + const m128 all_ones = ones128(); + const m128 odd_bits = set1_2x64(0x5555555555555555ull); + const m128 even_bits = set1_2x64(0xaaaaaaaaaaaaaaaaull); + + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, all_zeroes))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, set1_2x64(1ull)))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, all_ones))); + EXPECT_EQ(0, diff128(all_ones, expand128(all_ones, all_ones))); + EXPECT_EQ(0, diff128(odd_bits, expand128(set1_2x64(0xffffffffull), odd_bits))); + EXPECT_EQ(0, diff128(even_bits, expand128(set1_2x64(0xffffffffull), even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(set1_2x64(0xffffffff00000000ull), even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(set1_2x64(0xffffffff00000000ull), odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(1u), expand128(set1_2x64(1u), odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(2u), expand128(set1_2x64(1u), even_bits))); + + // Some single-bit tests. + for (u32 i = 0; i < 64; i++) { + const m128 one_bit = set1_2x64(1ull << i); + + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, one_bit))); + EXPECT_EQ(0, diff128(one_bit, expand128(set1_2x64(1ull), one_bit))); + EXPECT_EQ(0, diff128(one_bit, expand128(one_bit, all_ones))); + + EXPECT_EQ(0, diff128(one_bit, expand128(set1_2x64(1ull << (i / 2)), i % 2 ? 
even_bits : odd_bits))); + } +} + TEST(BitUtils, bf_op_1) { u64a a = 0; for (u32 i = 0; i < 64; i++) { From be66cdb51dbd50100d562c0c008dfb8a7c793109 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 8 Feb 2021 19:38:20 +0200 Subject: [PATCH 11/13] fixes in shifting primitives --- src/util/arch/arm/simd_utils.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index f3215fb2..8cf00025 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -62,7 +62,7 @@ static really_inline int diff128(m128 a, m128 b) { } static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); + return diff128(a, zeroes128()); } /** @@ -121,7 +121,6 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); } - static really_inline u32 movemask128(m128 a) { static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; @@ -311,22 +310,28 @@ m128 palignr(m128 r, m128 l, int offset) { static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } return palignr(zeroes128(), a, b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } return palignr(a, zeroes128(), 16 - b); } static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); - static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; - const uint8x16_t outside_mask = set1_16x8(0xf0); - - m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); - return vqtbl1q_s8(in, shift_mask); + if (amount < 0) { + return palignr_imm(zeroes128(), in, -amount); + } else { + return palignr_imm(in, zeroes128(), 16 - amount); + } } #ifdef __cplusplus From d3e03ed88a8ff76fbfcee32f335983d042e6d55a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 10 Feb 2021 13:29:45 +0200 Subject: [PATCH 12/13] optimize case mask AND out of the loop --- src/hwlm/noodle_engine_sse.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 5d47768d..fcd753fc 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -49,12 +49,8 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, if (!l) { return HWLM_SUCCESS; } - m128 v = zeroes128(); - // we don't have a clever way of doing this move yet - memcpy(&v, d, l); - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? caseMask : ones128(); + m128 v = and128(load128(d), mask128); // mask out where we can't match u32 mask = (0xFFFF >> (16 - l)); @@ -76,11 +72,8 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); const size_t l = end - start; - m128 v = loadu128(d); - - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? 
caseMask : ones128(); + m128 v = and128(loadu128(d), mask128); u32 buf_off = start - offset; u32 mask = ((1 << l) - 1) << buf_off; @@ -109,11 +102,8 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, assert(l <= 32); DEBUG_PRINTF("d %zu\n", d - buf); - m128 v = zeroes128(); - memcpy(&v, d, l); - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? caseMask : ones128(); + m128 v = and128(load128(d), mask128); u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), eq128(mask2, v))); @@ -137,11 +127,8 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); size_t l = end - start; - m128 v = loadu128(d); - - if (noCase) { - v = and128(v, caseMask); - } + m128 mask128 = noCase ? caseMask : ones128(); + m128 v = and128(loadu128(d), mask128); u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), eq128(mask2, v))); @@ -164,9 +151,10 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, size_t end) { const u8 *d = buf + start, *e = buf + end; assert(d < e); + m128 mask128 = noCase ? caseMask : ones128(); for (; d < e; d += 16) { - m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + m128 v = and128(load128(d), mask128); u32 z = movemask128(eq128(mask1, v)); @@ -186,9 +174,10 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, const u8 *d = buf + start, *e = buf + end; assert(d < e); m128 lastz1 = zeroes128(); + m128 mask128 = noCase ? caseMask : ones128(); for (; d < e; d += 16) { - m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + m128 v = and128(load128(d), mask128); m128 z1 = eq128(mask1, v); m128 z2 = eq128(mask2, v); u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2)); From 9fd94e0062159e49939aa6be7fffdc82039d176f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 11 Feb 2021 14:21:57 +0200 Subject: [PATCH 13/13] use unaligned loads for short scans --- src/hwlm/noodle_engine_sse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index fcd753fc..0f14852d 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -50,7 +50,7 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } m128 mask128 = noCase ? caseMask : ones128(); - m128 v = and128(load128(d), mask128); + m128 v = and128(loadu128(d), mask128); // mask out where we can't match u32 mask = (0xFFFF >> (16 - l)); @@ -103,7 +103,7 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, DEBUG_PRINTF("d %zu\n", d - buf); m128 mask128 = noCase ? caseMask : ones128(); - m128 v = and128(load128(d), mask128); + m128 v = and128(loadu128(d), mask128); u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), eq128(mask2, v)));
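For reference, the per-lane semantics implemented by the new compress128()/expand128()
(and exercised by the unit tests added in unit/internal/bitutils.cpp) are the usual bit
compress/expand (PEXT/PDEP-style) operations applied to each 64-bit half independently.
Below is a scalar sketch of one 64-bit lane, intended to match the loop used in the NEON
versions; illustration only, and the *_ref names are not part of the patch series:

    #include <stdint.h>

    /* Gather the bits of x selected by m into the low bits of the result
     * (PEXT); mirrors compress128_impl() per 64-bit lane. */
    static uint64_t compress64_ref(uint64_t x, uint64_t m) {
        uint64_t res = 0;
        for (uint64_t bb = 1; m; m &= m - 1, bb <<= 1) {
            if (x & m & -m) {   /* lowest remaining mask bit set in x? */
                res |= bb;      /* deposit it at the next low position */
            }
        }
        return res;
    }

    /* Scatter the low bits of x to the positions selected by m (PDEP);
     * mirrors expand128_impl() per 64-bit lane. */
    static uint64_t expand64_ref(uint64_t x, uint64_t m) {
        uint64_t res = 0;
        for (uint64_t bb = 1; m; m &= m - 1, bb <<= 1) {
            if (x & bb) {       /* next low bit of x set? */
                res |= m & -m;  /* place it at the lowest remaining mask bit */
            }
        }
        return res;
    }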