From 49eb18ee4f21b5bd389e0e9d5644be1ec1dc85c6 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 22:50:05 +0000 Subject: [PATCH] Optimize vectorscan for aarch64 by using shrn instruction This optimization is based on the thread https://twitter.com/Danlark1/status/1539344279268691970 and uses the "shift right and narrow by 4" instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate-- To achieve that, I needed to slightly redesign movemask into comparemask and add an extra mask-iteration step. Our benchmarks showed a 10-15% improvement on average for long matches. Illustrative sketches of the new mask layout are appended after the diff. --- src/hwlm/noodle_engine_simd.hpp | 55 ++++++++++------ src/nfa/limex_shuffle.hpp | 14 +++- src/util/arch/arm/match.hpp | 40 ++++++------ src/util/arch/arm/simd_utils.h | 27 ++++---- src/util/arch/ppc64el/match.hpp | 8 +-- src/util/arch/x86/match.hpp | 72 +++++++++++--------- src/util/supervector/arch/arm/impl.cpp | 32 ++++----- src/util/supervector/arch/ppc64el/impl.cpp | 18 +++-- src/util/supervector/arch/x86/impl.cpp | 76 +++++++++++++++------- src/util/supervector/supervector.hpp | 36 +++++++--- unit/internal/supervector.cpp | 36 ++++++---- 11 files changed, 264 insertions(+), 150 deletions(-) diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp index c49bfc7e..8006bd79 100644 --- a/src/hwlm/noodle_engine_simd.hpp +++ b/src/hwlm/noodle_engine_simd.hpp @@ -36,7 +36,7 @@ static really_really_inline hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos); @@ -49,7 +49,7 @@ static really_really_inline hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos - 1; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos); @@ -77,9 +77,11 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, SuperVector<S> v = SuperVector<S>::Zeroes(); memcpy(&v.u, d, l); - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l); + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()); v = v & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, buf, z, len, cbi); } @@ -103,9 +105,12 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l) << buf_off; + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, buf, z, 
len, cbi); } @@ -126,10 +131,13 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, memcpy(&v.u, d, l); v = v & caseMask; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l); - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << (SuperVector<S>::mask_width())) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -148,10 +156,14 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, } SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l) << buf_off; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << SuperVector<S>::mask_width()) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -191,7 +203,8 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); @@ -220,7 +233,7 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, size_t start = offset + n->msk_len - n->key_offset; - typename SuperVector<S>::movemask_type lastz1{0}; + typename SuperVector<S>::comparemask_type lastz1{0}; const u8 *d = buf + start; const u8 *e = buf + end; @@ -248,10 +261,12 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2; - lastz1 = z1 >> Z_SHIFT; + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + (z1 << SuperVector<S>::mask_width() | lastz1) & z2; + lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width()); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); diff --git a/src/nfa/limex_shuffle.hpp b/src/nfa/limex_shuffle.hpp index 4266d7da..367d400b 100644 --- a/src/nfa/limex_shuffle.hpp +++ b/src/nfa/limex_shuffle.hpp @@ -53,7 +53,15 @@ really_really_inline u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) { SuperVector<16> shuffled = s.pshufb(permute); SuperVector<16> 
compared = shuffled & compare; - u16 rv = ~compared.eqmask(shuffled); + u64a rv = (~compared.eqmask(shuffled)) & 0xffff; + if (SuperVector<16>::mask_width() != 1) { + u32 ans = 0; + for (u32 i = 0; i < 16; ++i) { + ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >> + (i * SuperVector<16>::mask_width() - i); + } + return ans; + } return (u32)rv; } @@ -62,7 +70,8 @@ really_really_inline u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) { SuperVector<32> shuffled = s.pshufb(permute); SuperVector<32> compared = shuffled & compare; - u32 rv = ~compared.eqmask(shuffled); + // TODO(danlark1): Future ARM support might have a bug. + u64a rv = (~compared.eqmask(shuffled)) & 0xffffffff; return (u32)((rv >> 16) | (rv & 0xffffU)); } @@ -71,6 +80,7 @@ really_really_inline u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) { SuperVector<64> shuffled = s.pshufb(permute); SuperVector<64> compared = shuffled & compare; + // TODO(danlark1): Future ARM support might have a bug. u64a rv = ~compared.eqmask(shuffled); rv = rv >> 32 | rv; return (u32)(((rv >> 16) | rv) & 0xffffU); diff --git a/src/util/arch/arm/match.hpp b/src/util/arch/arm/match.hpp index 892c3877..1280fed5 100644 --- a/src/util/arch/arm/match.hpp +++ b/src/util/arch/arm/match.hpp @@ -33,13 +33,13 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); - DEBUG_PRINTF("buf + pos %p\n", buf + pos); + DEBUG_PRINTF("buf + pos %p\n", buf + (pos)); return buf + pos; } else { return NULL; // no match @@ -52,13 +52,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } @@ -70,10 +69,10 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", 
pos); assert(pos < 16); DEBUG_PRINTF("buf + pos %p\n", buf + pos); @@ -89,13 +88,12 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index e6836b25..68c29c67 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -86,8 +86,9 @@ static really_inline m128 not128(m128 a) { /** \brief Return 1 if a and b are different otherwise 0 */ static really_inline int diff128(m128 a, m128 b) { - int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b)); - return (-16 != res); + uint64_t res = vget_lane_u64( + (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0); + return (~0ull != res); } static really_inline int isnonzero128(m128 a) { @@ -379,15 +380,19 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - uint8x16_t input = vreinterpretq_u8_s32(a); - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); + static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + + // Compute the mask from the input + uint8x16_t mask = (uint8x16_t)vpaddlq_u32( + vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0); + return output; } static really_inline m128 set1_16x8(u8 c) { diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index a3f52e41..4f7cc7f1 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -30,7 +30,7 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -47,7 +47,7 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -63,7 +63,7 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, 
u16 const UN template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { @@ -81,7 +81,7 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { diff --git a/src/util/arch/x86/match.hpp b/src/util/arch/x86/match.hpp index cbf4ab6b..d237567f 100644 --- a/src/util/arch/x86/match.hpp +++ b/src/util/arch/x86/match.hpp @@ -30,12 +30,13 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -47,8 +48,9 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); assert(pos < 32); @@ -61,7 +63,8 @@ const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -80,9 +83,10 @@ const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const le template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); DEBUG_PRINTF("match @ pos %u\n", pos); @@ -96,8 +100,9 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + 
SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); assert(pos < 32); @@ -110,7 +115,8 @@ const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -129,12 +135,13 @@ const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -146,10 +153,11 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); + u32 pos = ctz32(~z & 0xffffffffu); assert(pos < 32); DEBUG_PRINTF("match @ pos %u\n", pos); return buf + pos; @@ -160,7 +168,8 @@ const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -179,12 +188,13 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { - u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + u32 pos = clz32(~z & 0xffffu); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos >= 16 && pos < 32); return buf + (31 - pos); @@ -196,9 +206,10 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_ template<> really_really_inline const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) { - SuperVector<32>::movemask_type z = 
v.movemask(); - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z & 0xffffffff); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + if (unlikely(static_cast<u32>(z) != 0xffffffff)) { + u32 pos = clz32(~z & 0xffffffffu); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); assert(pos < 32); return buf + (31 - pos); @@ -210,8 +221,9 @@ const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_ template <> really_really_inline const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) { + assert(SuperVector<64>::mask_width() == 1); v.print8("v"); - SuperVector<64>::movemask_type z = v.movemask(); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index 89497d3d..b3e4233e 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -249,25 +249,25 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const -{ - SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL); - - // Compute the mask from the input - uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0])))); - uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7); - mask = vorrq_u8(mask, (uint8x16_t) mask1); - - // Get the resulting bytes - uint16_t output; - vst1q_lane_u16(&output, (uint16x8_t)mask, 0); - return static_cast<typename SuperVector<16>::movemask_type>(output); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return static_cast<typename SuperVector<16>::comparemask_type>( + vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0)); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 4; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask & 0x1111111111111111ull; } template <> diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 109b8d5e..5becb8f8 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -206,8 +206,8 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); @@ -230,11 +230,19 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type 
+SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; +} template <> template diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 157f1dc4..c9daf0cf 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -203,15 +203,24 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ - return _mm_movemask_epi8(u.v128[0]); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return (u32)_mm_movemask_epi8(u.v128[0]); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; } // template <> @@ -754,17 +763,25 @@ really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) cons } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const -{ - return _mm256_movemask_epi8(u.v256[0]); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::comparemask(void) const { + return (u32)_mm256_movemask_epi8(u.v256[0]); } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::eqmask(SuperVector<32> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<32>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::iteration_mask( + typename SuperVector<32>::comparemask_type mask) { + return mask; +} // template <> // template @@ -1347,42 +1364,48 @@ really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b template <> really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpgt_epi8_mask(u.v512[0], 
b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } @@ -1393,19 +1416,28 @@ really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) cons } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::comparemask(void) const { __m512i msb = _mm512_set1_epi8(0xFF); __m512i mask = _mm512_and_si512(msb, u.v512[0]); return _mm512_cmpeq_epi8_mask(mask, msb); } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::eqmask(SuperVector<64> const b) const { return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); } +template <> really_inline u32 SuperVector<64>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::iteration_mask( + typename SuperVector<64>::comparemask_type mask) { + return mask; +} + // template <> // template // really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index f0ddf63c..51310db2 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -46,19 +46,29 @@ using Z_TYPE = u64a; #define Z_BITS 64 #define Z_SHIFT 63 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l))) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_256_BITS) using Z_TYPE = u32; #define Z_BITS 32 #define Z_SHIFT 31 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_128_BITS) +#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) +using Z_TYPE = u64a; +#define Z_BITS 64 +#define Z_POSSHIFT 2 +#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l))) +#else using Z_TYPE = u32; #define Z_BITS 32 +#define Z_POSSHIFT 0 +#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) +#endif #define Z_SHIFT 15 -#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #endif @@ -94,7 +104,8 @@ struct BaseVector static constexpr bool is_valid = false; static constexpr u16 size = 8; using type = void; - using movemask_type = void; + using comparemask_type = void; + using cmpmask_type = void; static constexpr bool has_previous = false; using previous_type = void; static constexpr u16 previous_size = 4; @@ -106,7 +117,7 @@ 
struct BaseVector<128> static constexpr bool is_valid = true; static constexpr u16 size = 128; using type = void; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m512; static constexpr u16 previous_size = 64; @@ -118,7 +129,7 @@ struct BaseVector<64> static constexpr bool is_valid = true; static constexpr u16 size = 64; using type = m512; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m256; static constexpr u16 previous_size = 32; @@ -131,7 +142,7 @@ struct BaseVector<32> static constexpr bool is_valid = true; static constexpr u16 size = 32; using type = m256; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m128; static constexpr u16 previous_size = 16; @@ -144,7 +155,7 @@ struct BaseVector<16> static constexpr bool is_valid = true; static constexpr u16 size = 16; using type = m128; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = false; using previous_type = u64a; static constexpr u16 previous_size = 8; @@ -231,8 +242,17 @@ public: SuperVector eq(SuperVector const &b) const; SuperVector operator<<(uint8_t const N) const; SuperVector operator>>(uint8_t const N) const; - typename base_type::movemask_type movemask(void) const; - typename base_type::movemask_type eqmask(SuperVector const b) const; + // Returns mask_width groups of zeros or ones. To get the mask which can be + // iterated, use iteration_mask method, it ensures only one bit is set per + // mask_width group. + // Precondition: all bytes must be 0 or 0xff. + typename base_type::comparemask_type comparemask(void) const; + typename base_type::comparemask_type eqmask(SuperVector const b) const; + static u32 mask_width(); + // Returns a mask with at most 1 bit set to 1. It can be used to iterate + // over bits through ctz/clz and lowest bit clear. 
+ static typename base_type::comparemask_type + iteration_mask(typename base_type::comparemask_type mask); static SuperVector loadu(void const *ptr); static SuperVector load(void const *ptr); diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index deb3b169..0b4cae58 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -176,9 +176,9 @@ TEST(SuperVectorUtilsTest,Movemask128c){ } } auto SP = SuperVector<16>::loadu(vec); - u16 mask = SP.movemask(); - for(int i=0; i<16; i++) { - if (mask & (1 << i)) { + u64a mask = SP.comparemask(); + for (int i = 0; i < 16; i++) { + if (mask & (1ull << (i * SuperVector<16>::mask_width()))) { vec2[i] = 0xff; } } @@ -195,15 +195,21 @@ TEST(SuperVectorUtilsTest,Eqmask128c){ for (int i = 0; i<16; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<16>::loadu(vec); auto SP1 = SuperVector<16>::loadu(vec2); - int mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 16; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<16>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<16>::mask_width())); + for (u32 i = 2; i < 16; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } } /*Define LSHIFT128 macro*/ @@ -507,9 +513,9 @@ TEST(SuperVectorUtilsTest,Movemask256c){ } } auto SP = SuperVector<32>::loadu(vec); - u32 mask = SP.movemask(); + u64a mask = SP.comparemask(); for(int i=0; i<32; i++) { - if (mask & (1 << i)) { + if (mask & (1ull << (i * SuperVector<32>::mask_width()))) { vec2[i] = 0xff; } } @@ -527,15 +533,21 @@ TEST(SuperVectorUtilsTest,Eqmask256c){ for (int i = 0; i<32; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<32>::loadu(vec); auto SP1 = SuperVector<32>::loadu(vec2); - u32 mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFFFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 32; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<32>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<32>::mask_width())); + for (u32 i = 2; i < 32; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } } TEST(SuperVectorUtilsTest,pshufb256c) { @@ -871,6 +883,8 @@ TEST(SuperVectorUtilsTest,Eqmask512c){ auto SP = SuperVector<64>::loadu(vec); auto SP1 = SuperVector<64>::loadu(vec2); u64a mask = SP.eqmask(SP); + // Mask width for 64 bit type cannot be more than 1. + ASSERT_EQ(SuperVector<64>::mask_width(), 1); ASSERT_EQ(mask,0xFFFFFFFFFFFFFFFF); mask = SP.eqmask(SP1); ASSERT_EQ(mask,0);
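Illustrative sketches (not part of the patch). The core trick: AArch64 has no byte-wise movemask instruction, so comparemask returns 4 identical bits per byte lane instead of 1. SHRN shifts every 16-bit lane right by 4 and narrows it to 8 bits, which collapses a 128-bit vector of 0x00/0xff compare results into a single 64-bit nibble mask. A minimal standalone illustration, assuming AArch64 NEON and GCC/Clang builtins (comparemask16 and find_first_match16 are hypothetical names, not the patch's API):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Each byte of data[0..15] equal to c contributes a 0xF nibble to the
     * returned 64-bit mask, i.e. mask_width == 4. */
    static inline uint64_t comparemask16(const uint8_t *data, uint8_t c) {
        uint8x16_t eq = vceqq_u8(vld1q_u8(data), vdupq_n_u8(c)); /* 0xff per match */
        /* Shift each u16 lane right by 4 and narrow to u8: 16 bytes of
         * 0x00/0xff become 16 nibbles packed into 64 bits. */
        uint8x8_t nibbles = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);
        return vget_lane_u64(vreinterpret_u64_u8(nibbles), 0);
    }

    /* Keep one bit per nibble (the iteration_mask step); a set bit at index
     * 4*i then marks a match at byte i, hence the >> 2 (Z_POSSHIFT). */
    static inline int find_first_match16(const uint8_t *data, uint8_t c) {
        uint64_t z = comparemask16(data, c) & 0x1111111111111111ull;
        return z ? (int)(__builtin_ctzll(z) >> 2) : -1;
    }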
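The scalar loop contract after the redesign, restated: eqmask()/comparemask() yield mask_width() identical bits per byte (4 on AArch64 via shrn, 1 on x86 and ppc64el), iteration_mask() thins that to one bit per byte so findAndClearLSB visits each match exactly once, and Z_POSSHIFT (2 on AArch64, 0 elsewhere) turns the recovered bit index back into a byte offset. A hedged sketch of the consuming loop, modelled on single_zscan (report is a stand-in for the real callback, and Z_POSSHIFT is redefined locally for the sketch):

    #include <stddef.h>
    #include <stdint.h>

    #define Z_POSSHIFT 2 /* AArch64 value; 0 on the other targets */

    /* Calls report(byte_offset) once per match encoded in z, where z is an
     * eqmask() result already thinned by iteration_mask(). */
    static inline void for_each_match(uint64_t z, void (*report)(size_t)) {
        while (z) {
            uint64_t bit = (uint64_t)__builtin_ctzll(z); /* findAndClearLSB_64 finds... */
            z &= z - 1;                                  /* ...and clears the lowest bit */
            report((size_t)(bit >> Z_POSSHIFT));         /* bit index -> byte offset */
        }
    }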
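The same narrowing trick replaces the horizontal add in diff128: after a vector compare, the 64-bit nibble mask is all ones exactly when every byte pair was equal, so one scalar comparison against ~0ull detects any difference. A standalone sketch (the patch compares s32 lanes; byte lanes are used here, which tests the same whole-vector equality):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Returns 1 if the two 128-bit vectors differ anywhere, else 0. */
    static inline int diff128_sketch(uint8x16_t a, uint8x16_t b) {
        uint16x8_t eq = vreinterpretq_u16_u8(vceqq_u8(a, b));
        uint64_t res = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(eq, 4)), 0);
        return res != ~0ull; /* a zero nibble means some byte differed */
    }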
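Finally, the new packedExtract<16> path for mask_width() != 1 is a bit compaction: it moves the bit at position i * mask_width down to position i, so callers keep receiving a dense 16-bit mask. An equivalent restatement of that loop (same arithmetic, written shift-down-then-up):

    #include <stdint.h>

    /* Folds a mask carrying mask_width bits per lane (4 on AArch64) down to
     * one bit per lane: bit i*mask_width of rv becomes bit i of the result. */
    static inline uint32_t compact_mask16(uint64_t rv, uint32_t mask_width) {
        uint32_t ans = 0;
        for (uint32_t i = 0; i < 16; ++i) {
            ans |= (uint32_t)((rv >> (i * mask_width)) & 1u) << i;
        }
        return ans;
    }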