From 49eb18ee4f21b5bd389e0e9d5644be1ec1dc85c6 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 22:50:05 +0000 Subject: [PATCH 01/35] Optimize vectorscan for aarch64 by using shrn instruction This optimization is based on the thread https://twitter.com/Danlark1/status/1539344279268691970 and uses the shift-right-and-narrow-by-4 (SHRN) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate-- To achieve that, I needed to slightly redesign movemask into comparemask and add an extra mask-iteration step: SHRN packs each byte's 0x00/0xff comparison result into a 4-bit group of a 64-bit mask, so on aarch64 mask_width() is 4 and iteration_mask() keeps only one bit per group before the mask is scanned. Our benchmarks showed a 10-15% improvement on average for long matches. --- src/hwlm/noodle_engine_simd.hpp | 55 ++++++++++------ src/nfa/limex_shuffle.hpp | 14 +++- src/util/arch/arm/match.hpp | 40 ++++++------ src/util/arch/arm/simd_utils.h | 27 ++++---- src/util/arch/ppc64el/match.hpp | 8 +-- src/util/arch/x86/match.hpp | 72 +++++++++++--------- src/util/supervector/arch/arm/impl.cpp | 32 ++++----- src/util/supervector/arch/ppc64el/impl.cpp | 18 +++-- src/util/supervector/arch/x86/impl.cpp | 76 +++++++++++++++------- src/util/supervector/supervector.hpp | 36 +++++++--- unit/internal/supervector.cpp | 36 ++++++---- 11 files changed, 264 insertions(+), 150 deletions(-) diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp index c49bfc7e..8006bd79 100644 --- a/src/hwlm/noodle_engine_simd.hpp +++ b/src/hwlm/noodle_engine_simd.hpp @@ -36,7 +36,7 @@ static really_really_inline hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos); @@ -49,7 +49,7 @@ static really_really_inline hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, Z_TYPE z, size_t len, const struct cb_info *cbi) { while (unlikely(z)) { - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; size_t matchPos = d - buf + pos - 1; DEBUG_PRINTF("match pos %zu\n", matchPos); hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos); @@ -77,9 +77,11 @@ hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, SuperVector<S> v = SuperVector<S>::Zeroes(); memcpy(&v.u, d, l); - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l); + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()); v = v & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, buf, z, len, cbi); } @@ -103,9 +105,12 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l) << buf_off; + typename SuperVector<S>::comparemask_type mask = + SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); return single_zscan(n, d, 
buf, z, len, cbi); } @@ -126,10 +131,13 @@ hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, memcpy(&v.u, d, l); v = v & caseMask; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l); - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << (SuperVector<S>::mask_width())) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -148,10 +156,14 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, } SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; size_t buf_off = start - offset; - typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l) << buf_off; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2; + typename SuperVector<S>::comparemask_type mask = + DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()) + << (buf_off * SuperVector<S>::mask_width()); + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + mask & (z1 << SuperVector<S>::mask_width()) & z2; + z = SuperVector<S>::iteration_mask(z); return double_zscan(n, d, buf, z, len, cbi); } @@ -191,7 +203,8 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z = mask1.eqmask(v); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); @@ -220,7 +233,7 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, size_t start = offset + n->msk_len - n->key_offset; - typename SuperVector<S>::movemask_type lastz1{0}; + typename SuperVector<S>::comparemask_type lastz1{0}; const u8 *d = buf + start; const u8 *e = buf + end; @@ -248,10 +261,12 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, __builtin_prefetch(base + 256); SuperVector<S> v = SuperVector<S>::load(d) & caseMask; - typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v); - typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v); - typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2; - lastz1 = z1 >> Z_SHIFT; + typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); + typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); + typename SuperVector<S>::comparemask_type z = + (z1 << SuperVector<S>::mask_width() | lastz1) & z2; + lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width()); + z = SuperVector<S>::iteration_mask(z); hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); diff --git a/src/nfa/limex_shuffle.hpp b/src/nfa/limex_shuffle.hpp index 4266d7da..367d400b 100644 --- a/src/nfa/limex_shuffle.hpp +++ b/src/nfa/limex_shuffle.hpp @@ -53,7 +53,15 @@ really_really_inline u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) { SuperVector<16> shuffled = s.pshufb(permute); 
SuperVector<16> compared = shuffled & compare; - u16 rv = ~compared.eqmask(shuffled); + u64a rv = (~compared.eqmask(shuffled)) & 0xffff; + if (SuperVector<16>::mask_width() != 1) { + u32 ans = 0; + for (u32 i = 0; i < 16; ++i) { + ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >> + (i * SuperVector<16>::mask_width() - i); + } + return ans; + } return (u32)rv; } @@ -62,7 +70,8 @@ really_really_inline u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) { SuperVector<32> shuffled = s.pshufb(permute); SuperVector<32> compared = shuffled & compare; - u32 rv = ~compared.eqmask(shuffled); + // TODO(danlark1): Future ARM support might have a bug. + u64a rv = (~compared.eqmask(shuffled)) & 0xffffffff; return (u32)((rv >> 16) | (rv & 0xffffU)); } @@ -71,6 +80,7 @@ really_really_inline u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) { SuperVector<64> shuffled = s.pshufb(permute); SuperVector<64> compared = shuffled & compare; + // TODO(danlark1): Future ARM support might have a bug. u64a rv = ~compared.eqmask(shuffled); rv = rv >> 32 | rv; return (u32)(((rv >> 16) | rv) & 0xffffU); diff --git a/src/util/arch/arm/match.hpp b/src/util/arch/arm/match.hpp index 892c3877..1280fed5 100644 --- a/src/util/arch/arm/match.hpp +++ b/src/util/arch/arm/match.hpp @@ -33,13 +33,13 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); - DEBUG_PRINTF("buf + pos %p\n", buf + pos); + DEBUG_PRINTF("buf + pos %p\n", buf + (pos)); return buf + pos; } else { return NULL; // no match @@ -52,13 +52,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } @@ -70,10 +69,10 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("z %08x\n", z); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - u32 pos = ctz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); 
DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); DEBUG_PRINTF("buf + pos %p\n", buf + pos); @@ -89,13 +88,12 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 uint32x4_t m = mask.u.u32x4[0]; uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); if (vmax != 0) { - typename SuperVector<16>::movemask_type z = mask.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = clz32(z & 0xffff); + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); + return buf + (15 - pos); } else { return NULL; // no match } diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index e6836b25..68c29c67 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -86,8 +86,9 @@ static really_inline m128 not128(m128 a) { /** \brief Return 1 if a and b are different otherwise 0 */ static really_inline int diff128(m128 a, m128 b) { - int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b)); - return (-16 != res); + uint64_t res = vget_lane_u64( + (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0); + return (~0ull != res); } static really_inline int isnonzero128(m128 a) { @@ -379,15 +380,19 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - uint8x16_t input = vreinterpretq_u8_s32(a); - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); + static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128}; + + // Compute the mask from the input + uint8x16_t mask = (uint8x16_t)vpaddlq_u32( + vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0); + return output; } static really_inline m128 set1_16x8(u8 c) { diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index a3f52e41..4f7cc7f1 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -30,7 +30,7 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -47,7 +47,7 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z)) { @@ -63,7 +63,7 @@ const u8 
*last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { @@ -81,7 +81,7 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); + SuperVector<16>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("z %08x\n", z); if (unlikely(z != 0xffff)) { diff --git a/src/util/arch/x86/match.hpp b/src/util/arch/x86/match.hpp index cbf4ab6b..d237567f 100644 --- a/src/util/arch/x86/match.hpp +++ b/src/util/arch/x86/match.hpp @@ -30,12 +30,13 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -47,8 +48,9 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); assert(pos < 32); @@ -61,7 +63,8 @@ const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const U template <> really_really_inline const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -80,9 +83,10 @@ const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const le template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); DEBUG_PRINTF("match @ pos %u\n", pos); @@ -96,8 +100,9 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", 
z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); assert(pos < 32); @@ -110,7 +115,8 @@ const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UN template <> really_really_inline const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -129,12 +135,13 @@ const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -146,10 +153,11 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { - SuperVector<32>::movemask_type z = v.movemask(); - DEBUG_PRINTF("z 0x%08x\n", z); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); + u32 pos = ctz32(~z & 0xffffffffu); assert(pos < 32); DEBUG_PRINTF("match @ pos %u\n", pos); return buf + pos; @@ -160,7 +168,8 @@ const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 co template <> really_really_inline const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) { - SuperVector<64>::movemask_type z = v.movemask(); + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); @@ -179,12 +188,13 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { - u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + u32 pos = clz32(~z & 0xffffu); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos >= 16 && pos < 32); return buf + (31 - pos); @@ -196,9 +206,10 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_ template<> really_really_inline const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED 
len) { - SuperVector<32>::movemask_type z = v.movemask(); - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z & 0xffffffff); + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + if (unlikely(static_cast<u32>(z) != 0xffffffff)) { + u32 pos = clz32(~z & 0xffffffffu); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); assert(pos < 32); return buf + (31 - pos); @@ -210,8 +221,9 @@ const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_ template <> really_really_inline const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) { + assert(SuperVector<64>::mask_width() == 1); v.print8("v"); - SuperVector<64>::movemask_type z = v.movemask(); + SuperVector<64>::comparemask_type z = v.comparemask(); DEBUG_PRINTF("z 0x%016llx\n", z); u64a mask = (~0ULL) >> (64 - len); DEBUG_PRINTF("mask %016llx\n", mask); diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index 89497d3d..b3e4233e 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -249,25 +249,25 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const -{ - SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL); - - // Compute the mask from the input - uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0])))); - uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7); - mask = vorrq_u8(mask, (uint8x16_t) mask1); - - // Get the resulting bytes - uint16_t output; - vst1q_lane_u16(&output, (uint16x8_t)mask, 0); - return static_cast<typename SuperVector<16>::movemask_type>(output); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return static_cast<typename SuperVector<16>::comparemask_type>( + vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0)); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 4; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask & 0x1111111111111111ull; } template <> diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 109b8d5e..5becb8f8 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -206,8 +206,8 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); @@ -230,11 +230,19 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename 
SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; +} template <> template diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 157f1dc4..c9daf0cf 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -203,15 +203,24 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ - return _mm_movemask_epi8(u.v128[0]); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return (u32)_mm_movemask_epi8(u.v128[0]); } template <> -really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; } // template <> @@ -754,17 +763,25 @@ really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) cons } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const -{ - return _mm256_movemask_epi8(u.v256[0]); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::comparemask(void) const { + return (u32)_mm256_movemask_epi8(u.v256[0]); } template <> -really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const -{ - return eq(b).movemask(); +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::eqmask(SuperVector<32> const b) const { + return eq(b).comparemask(); } +template <> really_inline u32 SuperVector<32>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::iteration_mask( + typename SuperVector<32>::comparemask_type mask) { + return mask; +} // template <> // template @@ -1347,42 +1364,48 @@ really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b template <> really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + 
_mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } template <> really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const { - SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); + SuperVector<64>::comparemask_type mask = + _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); return {_mm512_movm_epi8(mask)}; } @@ -1393,19 +1416,28 @@ really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) cons } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::comparemask(void) const { __m512i msb = _mm512_set1_epi8(0xFF); __m512i mask = _mm512_and_si512(msb, u.v512[0]); return _mm512_cmpeq_epi8_mask(mask, msb); } template <> -really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const -{ +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::eqmask(SuperVector<64> const b) const { return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); } +template <> really_inline u32 SuperVector<64>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::iteration_mask( + typename SuperVector<64>::comparemask_type mask) { + return mask; +} + // template <> // template // really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index f0ddf63c..51310db2 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -46,19 +46,29 @@ using Z_TYPE = u64a; #define Z_BITS 64 #define Z_SHIFT 63 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l))) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_256_BITS) using Z_TYPE = u32; #define Z_BITS 32 #define Z_SHIFT 31 +#define Z_POSSHIFT 0 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #elif defined(HAVE_SIMD_128_BITS) +#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) +using Z_TYPE = u64a; +#define Z_BITS 64 +#define Z_POSSHIFT 2 +#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l))) +#else using Z_TYPE = u32; #define Z_BITS 32 +#define Z_POSSHIFT 0 +#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) +#endif #define Z_SHIFT 15 -#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL) #endif @@ -94,7 +104,8 @@ struct BaseVector static constexpr bool is_valid = false; static constexpr u16 size = 8; using type = void; - using movemask_type = void; + using comparemask_type = void; + using cmpmask_type = void; static constexpr bool has_previous = false; using previous_type = void; static constexpr u16 
previous_size = 4; @@ -106,7 +117,7 @@ struct BaseVector<128> static constexpr bool is_valid = true; static constexpr u16 size = 128; using type = void; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m512; static constexpr u16 previous_size = 64; @@ -118,7 +129,7 @@ struct BaseVector<64> static constexpr bool is_valid = true; static constexpr u16 size = 64; using type = m512; - using movemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m256; static constexpr u16 previous_size = 32; @@ -131,7 +142,7 @@ struct BaseVector<32> static constexpr bool is_valid = true; static constexpr u16 size = 32; using type = m256; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m128; static constexpr u16 previous_size = 16; @@ -144,7 +155,7 @@ struct BaseVector<16> static constexpr bool is_valid = true; static constexpr u16 size = 16; using type = m128; - using movemask_type = u32; + using comparemask_type = u64a; static constexpr bool has_previous = false; using previous_type = u64a; static constexpr u16 previous_size = 8; @@ -231,8 +242,17 @@ public: SuperVector eq(SuperVector const &b) const; SuperVector operator<<(uint8_t const N) const; SuperVector operator>>(uint8_t const N) const; - typename base_type::movemask_type movemask(void) const; - typename base_type::movemask_type eqmask(SuperVector const b) const; + // Returns mask_width groups of zeros or ones. To get the mask which can be + // iterated, use iteration_mask method, it ensures only one bit is set per + // mask_width group. + // Precondition: all bytes must be 0 or 0xff. + typename base_type::comparemask_type comparemask(void) const; + typename base_type::comparemask_type eqmask(SuperVector const b) const; + static u32 mask_width(); + // Returns a mask with at most 1 bit set to 1. It can be used to iterate + // over bits through ctz/clz and lowest bit clear. 
+ static typename base_type::comparemask_type + iteration_mask(typename base_type::comparemask_type mask); static SuperVector loadu(void const *ptr); static SuperVector load(void const *ptr); diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index deb3b169..0b4cae58 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -176,9 +176,9 @@ TEST(SuperVectorUtilsTest,Movemask128c){ } } auto SP = SuperVector<16>::loadu(vec); - u16 mask = SP.movemask(); - for(int i=0; i<16; i++) { - if (mask & (1 << i)) { + u64a mask = SP.comparemask(); + for (int i = 0; i < 16; i++) { + if (mask & (1ull << (i * SuperVector<16>::mask_width()))) { vec2[i] = 0xff; } } @@ -195,15 +195,21 @@ TEST(SuperVectorUtilsTest,Eqmask128c){ for (int i = 0; i<16; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<16>::loadu(vec); auto SP1 = SuperVector<16>::loadu(vec2); - int mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 16; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<16>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<16>::mask_width())); + for (u32 i = 2; i < 16; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } } /*Define LSHIFT128 macro*/ @@ -507,9 +513,9 @@ TEST(SuperVectorUtilsTest,Movemask256c){ } } auto SP = SuperVector<32>::loadu(vec); - u32 mask = SP.movemask(); + u64a mask = SP.comparemask(); for(int i=0; i<32; i++) { - if (mask & (1 << i)) { + if (mask & (1ull << (i * SuperVector<32>::mask_width()))) { vec2[i] = 0xff; } } @@ -527,15 +533,21 @@ TEST(SuperVectorUtilsTest,Eqmask256c){ for (int i = 0; i<32; i++) { vec2[i]= rand() % 100 + 67;} auto SP = SuperVector<32>::loadu(vec); auto SP1 = SuperVector<32>::loadu(vec2); - u32 mask = SP.eqmask(SP); - ASSERT_EQ(mask,0xFFFFFFFF); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 32; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); vec2[0] = vec[0]; vec2[1] = vec[1]; auto SP2 = SuperVector<32>::loadu(vec2); mask = SP.eqmask(SP2); - ASSERT_EQ(mask,3); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<32>::mask_width())); + for (u32 i = 2; i < 32; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } } TEST(SuperVectorUtilsTest,pshufb256c) { @@ -871,6 +883,8 @@ TEST(SuperVectorUtilsTest,Eqmask512c){ auto SP = SuperVector<64>::loadu(vec); auto SP1 = SuperVector<64>::loadu(vec2); u64a mask = SP.eqmask(SP); + // Mask width for 64 bit type cannot be more than 1. 
+ ASSERT_EQ(SuperVector<64>::mask_width(), 1); ASSERT_EQ(mask,0xFFFFFFFFFFFFFFFF); mask = SP.eqmask(SP1); ASSERT_EQ(mask,0); From 8a49e20bcd504f7bd8cc95d9e6807543296950d8 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 22:59:58 +0000 Subject: [PATCH 02/35] Fix formatting of a couple files --- src/util/arch/arm/simd_utils.h | 22 +++++++++------------- src/util/supervector/supervector.hpp | 11 +++++------ 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 68c29c67..8d8c4456 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -380,19 +380,15 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128, - 1, 2, 4, 8, 16, 32, 64, 128}; - - // Compute the mask from the input - uint8x16_t mask = (uint8x16_t)vpaddlq_u32( - vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); - uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7); - mask = vorrq_u8(mask, mask1); - - // Get the resulting bytes - uint16_t output; - vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0); - return output; + ruint8x16_t input = vreinterpretq_u8_s32(a); + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } static really_inline m128 set1_16x8(u8 c) { diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 51310db2..5d066c1a 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -104,8 +104,7 @@ struct BaseVector static constexpr bool is_valid = false; static constexpr u16 size = 8; using type = void; - using comparemask_type = void; - using cmpmask_type = void; + using comparemask_type = void; static constexpr bool has_previous = false; using previous_type = void; static constexpr u16 previous_size = 4; @@ -117,7 +116,7 @@ struct BaseVector<128> static constexpr bool is_valid = true; static constexpr u16 size = 128; using type = void; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m512; static constexpr u16 previous_size = 64; @@ -129,7 +128,7 @@ struct BaseVector<64> static constexpr bool is_valid = true; static constexpr u16 size = 64; using type = m512; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m256; static constexpr u16 previous_size = 32; @@ -142,7 +141,7 @@ struct BaseVector<32> static constexpr bool is_valid = true; static constexpr u16 size = 32; using type = m256; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = true; using previous_type = m128; static constexpr u16 previous_size = 16; @@ -155,7 +154,7 @@ struct BaseVector<16> static constexpr bool is_valid = true; static constexpr u16 size = 16; using type = m128; - using comparemask_type = u64a; + using comparemask_type = u64a; static constexpr bool has_previous = false; using previous_type = u64a; static constexpr u16 previous_size = 8; From 
849846700a757efb454ada64ee5851f548f94807 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 23:02:02 +0000 Subject: [PATCH 03/35] Minor fix --- src/util/arch/arm/simd_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 8d8c4456..2a4f9c16 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -380,7 +380,7 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { } static really_inline u32 movemask128(m128 a) { - ruint8x16_t input = vreinterpretq_u8_s32(a); + uint8x16_t input = vreinterpretq_u8_s32(a); uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); From 7e7f604f7d5bbb860e570a2e3e70eab0cbac1550 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 26 Jun 2022 23:05:17 +0000 Subject: [PATCH 04/35] Fix ppc64el debug --- src/util/arch/ppc64el/match.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index 4f7cc7f1..bf71be2d 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -31,11 +31,11 @@ template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = ctz32(z); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -48,8 +48,8 @@ template <> really_really_inline const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z)) { u32 pos = clz32(z); DEBUG_PRINTF("match @ pos %u\n", pos); @@ -64,11 +64,11 @@ template <> really_really_inline const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); return buf + pos; @@ -82,11 +82,11 @@ template <> really_really_inline const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { SuperVector<16>::comparemask_type z = v.comparemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); if (unlikely(z != 0xffff)) { u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("~z %08llx\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos >= 16 && pos < 32); return buf + (31 - pos); From db52ce6f086d7fa7e8cce29f06e31f19345c3ca0 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 20 Jul 2022 09:03:50 +0100 Subject: [PATCH 05/35] Fix avx512 movemask call 
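The Movemask512c unit test still called the old movemask() entry point, which no longer exists after the comparemask() rename. For context, a sketch of the contract the test exercises (assuming some u8 buf[64] whose bytes are all 0x00 or 0xff; names as in src/util/supervector): on AVX-512 mask_width() == 1, so bit i of the 64-bit mask corresponds to byte i of the vector.

    SuperVector<64> v = SuperVector<64>::loadu(buf);
    u64a z = v.comparemask();
    z = SuperVector<64>::iteration_mask(z); // identity when mask_width() == 1
    while (z) {
        u32 pos = ctz64(z) / SuperVector<64>::mask_width();
        z &= z - 1; // clear lowest set bit
        // ... byte pos of v was 0xff ...
    }

On aarch64 the same loop works unchanged because iteration_mask() leaves one bit per 4-bit group and the division by mask_width() rescales the bit index back to a byte index.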
--- unit/internal/supervector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index 0b4cae58..2432e598 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -861,7 +861,7 @@ TEST(SuperVectorUtilsTest,Movemask512c){ } auto SP = SuperVector<64>::loadu(vec); u8 vec2[64] = {0}; - u64a mask = SP.movemask(); + u64a mask = SP.comparemask(); for(int i=0; i<64; i++) { if (mask & (1ULL << i)) { vec2[i] = 0xff; From b5e1384995fc3cf214c8cfeccef9c5ca9e0b7f6a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 20 Jul 2022 13:26:52 +0000 Subject: [PATCH 06/35] Fixed the PCRE download location --- cmake/setenv-arm64-cross.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/setenv-arm64-cross.sh b/cmake/setenv-arm64-cross.sh index 4858da1e..c9001699 100644 --- a/cmake/setenv-arm64-cross.sh +++ b/cmake/setenv-arm64-cross.sh @@ -9,11 +9,11 @@ export CROSS_SYS= # wget -O boost_$BOOST_VERSION.tar.gz https://sourceforge.net/projects/boost/files/boost/$BOOST_DOT_VERSION/boost_$BOOST_VERSION.tar.gz/download # tar xf boost_$BOOST_VERSION.tar.gz # fi -if [ ! -d "pcre-8.41" ]; +if [ ! -d "pcre-8.45" ]; then - wget -O pcre-8.41.tar.bz2 https://ftp.pcre.org/pub/pcre/pcre-8.41.tar.bz2 - tar xf pcre-8.41.tar.bz2 + wget -O pcre-8.45.tar.bz2 https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.bz2/download + tar xf pcre-8.45.tar.bz2 export PCRE_SOURCE=1 fi -export BOOST_PATH= \ No newline at end of file +export BOOST_PATH= From cafd5248b11cbd98035286d64475b2c371aa4c87 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 4 Mar 2021 16:50:14 +0000 Subject: [PATCH 07/35] literal API: add instruction support fixes github issue #303 --- src/rose/program_runtime.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 7d4da45a..2bba5bbf 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -3092,6 +3092,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; const char *pc_base = getByOffset(t, programOffset); const char *pc = pc_base; @@ -3188,6 +3189,17 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(SOM_FROM_REPORT) { som = handleSomExternal(scratch, &ri->som, end); DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, @@ -3195,6 +3207,15 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(DEDUPE) { updateSeqPoint(tctxt, end, from_mpv); const char do_som = t->hasSom; // TODO: constant propagate From a119693a66504e671b73b6e96ef2bd9760647536 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 4 Mar 2021 17:00:34 +0000 Subject: [PATCH 
08/35] mcclellan: improve wide-state checking in Sherman optimization fixes github issue #305 --- src/nfa/mcclellancompile.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index b5c3a8ac..aa04e470 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1081,7 +1081,9 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit, // Use the daddy already set for this state so long as it isn't already // a Sherman state. dstate_id_t daddy = currState.daddy; - if (!info.is_sherman(daddy) && !info.is_widestate(daddy)) { + if (info.is_widestate(daddy)) { + return; + } else if (!info.is_sherman(daddy)) { hinted.insert(currState.daddy); } else { // Fall back to granddaddy, which has already been processed (due From decabdfede6a3d3d846964795b8a45fbe63025ff Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 11 Mar 2021 15:20:55 +0000 Subject: [PATCH 09/35] update year for bugfix #302-#305 --- src/compiler/compiler.cpp | 2 +- src/nfa/mcclellancompile.cpp | 2 +- src/rose/program_runtime.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 5751bd64..ae5927bc 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index aa04e470..055920b2 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 2bba5bbf..f607e8f2 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: From c1659b854437c4fa92cc2693b6c854cc2c4a4277 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Wed, 10 Mar 2021 07:20:01 +0000 Subject: [PATCH 10/35] Logical Combination: bypass combination flag in hs_expression_info. 
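With this change a logical combination expression can no longer be fed to hs_expression_info(); the call fails up front instead of the flag being silently accepted. Roughly (a sketch; the pattern IDs 101 and 102 are assumed to belong to other expressions in the set):

    hs_expr_info_t *info = NULL;
    hs_compile_error_t *error = NULL;
    hs_error_t err = hs_expression_info("101 & 102", HS_FLAG_COMBINATION,
                                        &info, &error);
    // err == HS_COMPILER_ERROR; error->message reports the unsupported
    // logical combination expression.
    hs_free_compile_error(error);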
Fixes github issue #291 --- src/hs.cpp | 8 +++++++- src/hs_compile.h | 12 +++--------- src/hs_internal.h | 6 ++++-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/hs.cpp b/src/hs.cpp index 303e7838..73cc032f 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -517,6 +517,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, return HS_COMPILER_ERROR; } + if (flags & HS_FLAG_COMBINATION) { + *error = generateCompileError("Invalid parameter: unsupported " + "logical combination expression", -1); + return HS_COMPILER_ERROR; + } + *info = nullptr; *error = nullptr; diff --git a/src/hs_compile.h b/src/hs_compile.h index b318c29d..5aa24188 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -748,10 +748,7 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param info * On success, a pointer to the pattern information will be returned in @@ -814,10 +811,7 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param ext * A pointer to a filled @ref hs_expr_ext_t structure that defines diff --git a/src/hs_internal.h b/src/hs_internal.h index adf07b22..4eb5e157 100644 --- a/src/hs_internal.h +++ b/src/hs_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Intel Corporation + * Copyright (c) 2019-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,7 +80,9 @@ extern "C" | HS_FLAG_PREFILTER \ | HS_FLAG_SINGLEMATCH \ | HS_FLAG_ALLOWEMPTY \ - | HS_FLAG_SOM_LEFTMOST) + | HS_FLAG_SOM_LEFTMOST \ + | HS_FLAG_COMBINATION \ + | HS_FLAG_QUIET) #ifdef __cplusplus } /* extern "C" */ From 2731a3384bbd7ffc4933f6d43478ef2762e5b4d8 Mon Sep 17 00:00:00 2001 From: hongyang7 Date: Thu, 16 Dec 2021 19:02:17 +0800 Subject: [PATCH 11/35] Fix segfaults on allocation failure (#4) Throw std::bad_alloc instead of returning nullptr from ue2::AlignedAllocator. Allocators for STL containers are expected never to return with an invalid pointer, and instead must throw on failure. Violating this expectation can lead to invalid pointer dereferences. 
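For illustration, a container built on the aligned allocator now sees an exception rather than dereferencing a null block (a sketch; huge_count stands in for an allocation that cannot be satisfied):

    std::vector<int, ue2::AlignedAllocator<int, 64>> v;
    try {
        v.resize(huge_count);
    } catch (const std::bad_alloc &) {
        // allocation failure is now reportable instead of a segfault
    }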
Co-authored-by: johanngan fixes github issue #317 (PR #320) --- src/util/alloc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/util/alloc.h b/src/util/alloc.h index de20c8d0..49b4a824 100644 --- a/src/util/alloc.h +++ b/src/util/alloc.h @@ -76,7 +76,11 @@ public: T *allocate(std::size_t size) const { size_t alloc_size = size * sizeof(T); - return static_cast<T *>(aligned_malloc_internal(alloc_size, N)); + T *ptr = static_cast<T *>(aligned_malloc_internal(alloc_size, N)); + if (!ptr) { + throw std::bad_alloc(); + } + return ptr; } void deallocate(T *x, std::size_t) const noexcept { From 4d4940dfbe523589e4ea90033bda4c574c73d627 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 28 Apr 2022 10:11:32 +0000 Subject: [PATCH 12/35] bugfix: fix overflow risk of strlen function --- src/compiler/compiler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index ae5927bc..32836834 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -323,7 +323,8 @@ void addExpression(NG &ng, unsigned index, const char *expression, } // Ensure that our pattern isn't too long (in characters). - if (strlen(expression) > cc.grey.limitPatternLength) { + size_t maxlen = cc.grey.limitPatternLength + 1; + if (strnlen(expression, maxlen) >= maxlen) { throw CompileError("Pattern length exceeds limit."); } From a9ca0e4de36ff32fb4a28f1bdc74ef08dc3f1ca4 Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Thu, 12 May 2022 02:15:07 +0000 Subject: [PATCH 13/35] Corpus generator: fix random char value of UTF-8. fixes github issue #184 --- util/ng_corpus_generator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index 145a0ab8..6c3f613d 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -476,14 +476,14 @@ void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) { * that we've been asked for. */ unichar CorpusGeneratorUtf8::getRandomChar() { u32 range = MAX_UNICODE + 1 - - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1); + - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1); range = min(cProps.alphabetSize, range); assert(range); unichar c = 'a' + cProps.rand(0, range - 1); if (c >= UNICODE_SURROGATE_MIN) { - c =+ UNICODE_SURROGATE_MAX + 1; + c += UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1; } return c % (MAX_UNICODE + 1); From 31afacc7be282ac591e71564bfee794303a244fa Mon Sep 17 00:00:00 2001 From: "Chang, Harry" Date: Thu, 12 May 2022 08:20:29 +0000 Subject: [PATCH 14/35] Corpus editor: fix random char value of UTF-8. 
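Same arithmetic bug as the corpus generator: the number of excluded surrogate code points is UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1 (0xDFFF - 0xD800 + 1 = 0x800), not the sum of the two bounds. Using the same names as the code below, the intended mapping is roughly:

    // Draw from the MAX_UNICODE + 1 - 0x800 non-surrogate code points,
    // then skip over the surrogate block U+D800..U+DFFF.
    unichar raw = props.rand(0, (MAX_UNICODE + 1 - 0x800) - 1);
    unichar cp = raw < UNICODE_SURROGATE_MIN ? raw : raw + 0x800;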
--- util/ng_corpus_editor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/ng_corpus_editor.cpp b/util/ng_corpus_editor.cpp index ac4f8b65..c1149216 100644 --- a/util/ng_corpus_editor.cpp +++ b/util/ng_corpus_editor.cpp @@ -268,12 +268,12 @@ void CorpusEditorUtf8::flip_case(vector<unichar> &corpus) { unichar CorpusEditorUtf8::chooseCodePoint(void) { /* We need to ensure that we don't pick a surrogate cp */ const u32 range = - MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1); + MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1); unichar raw = props.rand(0, range - 1); if (raw < UNICODE_SURROGATE_MIN) { return raw; } else { - return raw + UNICODE_SURROGATE_MAX + 1; + return raw + UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1; } } From 4f27a70dd7c4c48d259a77bf22bfd7dfa51b1d7e Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 28 Jul 2022 04:59:34 +0000 Subject: [PATCH 15/35] chimera: fix SKIP flag issue fix github issue #360 --- chimera/ch_runtime.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/chimera/ch_runtime.c b/chimera/ch_runtime.c index fdb5b992..1009036b 100644 --- a/chimera/ch_runtime.c +++ b/chimera/ch_runtime.c @@ -326,6 +326,10 @@ ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id, } else if (cbrv == CH_CALLBACK_SKIP_PATTERN) { DEBUG_PRINTF("user callback told us to skip this pattern\n"); pd->scanStart = hyctx->length; + if (top_id == id) { + break; + } + continue; } if (top_id == id) { From 70b2a28386f6a4be7903d9d61836c5918d219652 Mon Sep 17 00:00:00 2001 From: "Hong, Yang A" Date: Thu, 4 Mar 2021 16:13:46 +0000 Subject: [PATCH 16/35] literal API: add empty string check. fixes github issue #302, #304 --- src/compiler/compiler.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 32836834..35f46b3f 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -417,6 +417,10 @@ void addLitExpression(NG &ng, unsigned index, const char *expression, "HS_FLAG_SOM_LEFTMOST are supported in literal API."); } + if (!strcmp(expression, "")) { + throw CompileError("Pure literal API doesn't support empty string."); + } + // This expression must be a pure literal, we can build ue2_literal // directly based on expression text. 
ParsedLitExpression ple(index, expression, expLength, flags, id); From c597f69c5910db5042cf1942de64416ed41cd5f4 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Mon, 27 Jun 2022 16:07:16 +0800 Subject: [PATCH 17/35] fix build with glibc-2.34 SIGSTKSZ is no longer a constant after glibc 2.34 https://sourceware.org/pipermail/libc-alpha/2021-August/129718.html --- tools/hscollider/sig.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/hscollider/sig.cpp b/tools/hscollider/sig.cpp index bb00185d..d2e221b5 100644 --- a/tools/hscollider/sig.cpp +++ b/tools/hscollider/sig.cpp @@ -38,6 +38,7 @@ #if defined(HAVE_SIGACTION) #include <signal.h> +#define STACK_SIZE 8192 #endif #ifdef HAVE_BACKTRACE @@ -166,7 +167,7 @@ void installSignalHandler(void) { } #ifdef HAVE_SIGALTSTACK -static TLS_VARIABLE char alt_stack_loc[SIGSTKSZ]; +static TLS_VARIABLE char alt_stack_loc[STACK_SIZE]; #endif void setSignalStack(void) { @@ -178,7 +179,7 @@ void setSignalStack(void) { stack_t alt_stack; memset(&alt_stack, 0, sizeof(alt_stack)); alt_stack.ss_flags = 0; - alt_stack.ss_size = SIGSTKSZ; + alt_stack.ss_size = STACK_SIZE; alt_stack.ss_sp = alt_stack_loc; if (!sigaltstack(&alt_stack, nullptr)) { act.sa_flags |= SA_ONSTACK; From 74ab41897cc1d4f03555e5adde679fe21c60ee0a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 30 Aug 2022 20:40:23 +0300 Subject: [PATCH 18/35] Add missing header --- unit/internal/multi_bit_compress.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unit/internal/multi_bit_compress.cpp b/unit/internal/multi_bit_compress.cpp index 2d59ea14..40078f81 100644 --- a/unit/internal/multi_bit_compress.cpp +++ b/unit/internal/multi_bit_compress.cpp @@ -28,6 +28,8 @@ #include "config.h" +#include + #include "gtest/gtest.h" #include "ue2common.h" #include "util/compile_error.h" From 43c053a069848fbbd6f92f860dc035bd17bc3627 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 2 Sep 2022 15:12:56 +0300 Subject: [PATCH 19/35] add popcount32x4, popcount64x4 helper functions --- src/util/bitfield.h | 5 +---- src/util/popcount.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/util/bitfield.h b/src/util/bitfield.h index a580da7b..202232b6 100644 --- a/src/util/bitfield.h +++ b/src/util/bitfield.h @@ -189,10 +189,7 @@ public: size_t sum = 0; size_t i = 0; for (; i + 4 <= num_blocks; i += 4) { - sum += popcount64(bits[i]); - sum += popcount64(bits[i + 1]); - sum += popcount64(bits[i + 2]); - sum += popcount64(bits[i + 3]); + sum += popcount64x4(&bits[i]); } for (; i < num_blocks; i++) { sum += popcount64(bits[i]); diff --git a/src/util/popcount.h b/src/util/popcount.h index c7a69d46..d90a0d50 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -52,6 +52,15 @@ u32 popcount32(u32 x) { // #endif } +static really_inline +u32 popcount32x4(u32 const *x) { + u32 sum = popcount32(x[0]); + sum += popcount32(x[1]); + sum += popcount32(x[2]); + sum += popcount32(x[3]); + return sum; +} + static really_inline u32 popcount64(u64a x) { return __builtin_popcountll(x); @@ -73,5 +82,14 @@ u32 popcount64(u64a x) { // #endif } +static really_inline +u32 popcount64x4(u64a const *x) { + volatile u32 sum = popcount64(x[0]); + sum += popcount64(x[1]); + sum += popcount64(x[2]); + sum += popcount64(x[3]); + return sum; +} + #endif /* UTIL_POPCOUNT_H_ */ From 026f7616714896f314273c9732daefefb92590dd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 18:10:55 +0300 Subject: [PATCH 20/35] [VSX] optimized
mask1bit128(), moved simd_onebit_masks to common --- src/util/arch/common/simd_utils.h | 18 ++++++++++++ src/util/arch/ppc64el/simd_utils.h | 44 ++++-------------------------- 2 files changed, 23 insertions(+), 39 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 17de949a..2f2dcf7c 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -88,6 +88,24 @@ static inline void print_m128_2x64(const char *label, m128 vec) { #define print_m128_2x64(label, vec) ; #endif +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; + /**** **** 256-bit Primitives ****/ diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index d046ed47..ce67dae2 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -54,34 +54,6 @@ typedef __vector signed char int8x16_t; typedef unsigned long long int ulong64_t; typedef signed long long int long64_t; -/* -typedef __vector uint64_t uint64x2_t; -typedef __vector int64_t int64x2_t; -typedef __vector uint32_t uint32x4_t; -typedef __vector int32_t int32x4_t; -typedef __vector uint16_t uint16x8_t; -typedef __vector int16_t int16x8_t; -typedef __vector uint8_t uint8x16_t; -typedef __vector int8_t int8x16_t;*/ - - -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. 
*/ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; static really_inline m128 ones128(void) { return (m128) vec_splat_u8(-1); @@ -115,10 +87,6 @@ static really_inline u32 diffrich128(m128 a, m128 b) { m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b); mask = vec_and(not128(mask), movemask); m128 sum = vec_sums(mask, zeroes128()); - //sum = vec_sld(zeroes128(), sum, 4); - //s32 ALIGN_ATTR(16) x; - //vec_ste(sum, 0, &x); - //return x; // it could be ~(movemask_128(mask)) & 0x; return sum[3]; } @@ -131,10 +99,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask); m128 sum = vec_sums((m128)mask, zeroes128()); - //sum = vec_sld(zeroes128(), sum, 4); - //s32 ALIGN_ATTR(16) x; - //vec_ste(sum, 0, &x); - //return x; return sum[3]; } @@ -425,9 +389,11 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); + static uint64x2_t onebit = { 1, 0 }; + m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3)); + m128 bits = (m128) vec_splats((uint8_t) ((n % 8))); + m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets); + return (m128) vec_sll((uint8x16_t) mask, (uint8x16_t) bits); } // switches on bit N in the given vector. 
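The rewritten mask1bit128() computes 1 << n on the fly instead of loading a slice of the simd_onebit_masks LUT: vec_slo shifts the whole vector left by octets and vec_sll by the remaining 0-7 bits, which is why the shift counts are splatted as (n / 8) << 3 and n % 8. A scalar model of the same octet-then-bit split, a sketch under invented names rather than the project's code:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128; /* little-endian halves */

static u128 shl128(u128 v, unsigned s) { /* 0 <= s < 128 */
    u128 r = { 0, 0 };
    if (s == 0) {
        r = v;
    } else if (s < 64) {
        r.hi = (v.hi << s) | (v.lo >> (64 - s));
        r.lo = v.lo << s;
    } else {
        r.hi = v.lo << (s - 64);
    }
    return r;
}

/* Octet shift first (the vec_slo analogue), then the residual bit shift
 * (the vec_sll analogue); composing the two yields 1 << n. */
static u128 mask1bit_model(unsigned n) { /* n < 128 */
    u128 one = { 1, 0 };
    return shl128(shl128(one, (n / 8) * 8), n % 8);
}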
From 0e7874f122a55da0b2b92a129f5610e352594be6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 18:46:39 +0300 Subject: [PATCH 21/35] [VSX] optimize and correct lshift_m128/rshift_m128 --- src/util/arch/ppc64el/simd_utils.h | 44 ++++++------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index ce67dae2..589c4031 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -114,46 +114,18 @@ m128 sub_2x64(m128 a, m128 b) { static really_really_inline m128 lshift_m128(m128 a, unsigned b) { - switch(b){ - case 1: return vec_sld(a, zeroes128(), 1); break; - case 2: return vec_sld(a, zeroes128(), 2); break; - case 3: return vec_sld(a, zeroes128(), 3); break; - case 4: return vec_sld(a, zeroes128(), 4); break; - case 5: return vec_sld(a, zeroes128(), 5); break; - case 6: return vec_sld(a, zeroes128(), 6); break; - case 7: return vec_sld(a, zeroes128(), 7); break; - case 8: return vec_sld(a, zeroes128(), 8); break; - case 9: return vec_sld(a, zeroes128(), 9); break; - case 10: return vec_sld(a, zeroes128(), 10); break; - case 11: return vec_sld(a, zeroes128(), 11); break; - case 12: return vec_sld(a, zeroes128(), 12); break; - case 13: return vec_sld(a, zeroes128(), 13); break; - case 14: return vec_sld(a, zeroes128(), 14); break; - case 15: return vec_sld(a, zeroes128(), 15); break; - } - return a; + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl); + return result; } static really_really_inline m128 rshift_m128(m128 a, unsigned b) { - switch(b){ - case 1: return vec_sld(zeroes128(), a, 15); break; - case 2: return vec_sld(zeroes128(), a, 14); break; - case 3: return vec_sld(zeroes128(), a, 13); break; - case 4: return vec_sld(zeroes128(), a, 12); break; - case 5: return vec_sld(zeroes128(), a, 11); break; - case 6: return vec_sld(zeroes128(), a, 10); break; - case 7: return vec_sld(zeroes128(), a, 9); break; - case 8: return vec_sld(zeroes128(), a, 8); break; - case 9: return vec_sld(zeroes128(), a, 7); break; - case 10: return vec_sld(zeroes128(), a, 6); break; - case 11: return vec_sld(zeroes128(), a, 5); break; - case 12: return vec_sld(zeroes128(), a, 4); break; - case 13: return vec_sld(zeroes128(), a, 3); break; - case 14: return vec_sld(zeroes128(), a, 2); break; - case 15: return vec_sld(zeroes128(), a, 1); break; - } - return a; + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + m128 result = (m128) vec_sro((uint8x16_t) a, (uint8x16_t) sl); + return result; } static really_really_inline From 17467ff21bb7df033814968c75b2b91a429c62a8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 20:08:44 +0300 Subject: [PATCH 22/35] [VSX] huge optimization of movemask128 --- src/util/arch/ppc64el/simd_utils.h | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 589c4031..44c9122c 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -148,27 +148,13 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); } - static really_inline u32 movemask128(m128 a) { - uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7)); - - uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); - uint16x8_t res_and = 
vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); - uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - - uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14)); - uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); - uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - - uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); - uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); - - uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); - uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - - return s5[0]; + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + uint8x16_t bitmask = vec_gb((uint8x16_t) a); + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 movemask; + vec_ste((uint32x4_t) bitmask, 0, &movemask); + return movemask; } static really_inline m128 set1_16x8(u8 c) { From 94fe406f0c24a7996b12ee5a18378833c9fd813c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Sep 2022 23:39:44 +0300 Subject: [PATCH 23/35] [VSX] correct lshiftbyte_m128/rshiftbyte_m128, variable_byte_shift --- src/util/arch/ppc64el/simd_utils.h | 13 ++++---- unit/internal/simd_utils.cpp | 51 +++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 44c9122c..32014e54 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -285,7 +285,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } - #define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break; static really_really_inline @@ -326,21 +325,21 @@ m128 palignr(m128 r, m128 l, int offset) { static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - return rshift_m128(a,b); + return palignr_imm(zeroes128(), a, b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - return lshift_m128(a,b); + return palignr_imm(a, zeroes128(), 16 - b); } static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); - if (amount < 0){ - return palignr_imm(zeroes128(), in, -amount); - } else{ - return palignr_imm(in, zeroes128(), 16 - amount); + if (amount < 0) { + return rshiftbyte_m128(in, -amount); + } else { + return lshiftbyte_m128(in, amount); } } diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 69f1a64c..c5cfec7b 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -723,10 +723,59 @@ TEST(SimdUtilsTest, set2x128) { } #endif +#define TEST_LSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = lshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=0; i < l; i++) { \ + assert(res[i] == 0); \ + } \ + for (; i < 16; i++) { \ + assert(res[i] == vec[i - l]); \ + } \ + } + +TEST(SimdUtilsTest, lshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_LSHIFTBYTE128(v1, vec, j); + } +} + +#define TEST_RSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = rshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=15; i >= 16 - l; i--) { \ + assert(res[i] == 0); \ + } \ + for (; i >= 0; i--) { \ + assert(res[i] == vec[i + l]); \ + } 
\ + } + +TEST(SimdUtilsTest, rshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_RSHIFTBYTE128(v1, vec, j); + } +} + TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), @@ -773,7 +822,7 @@ TEST(SimdUtilsTest, variableByteShift128) { EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 15), variable_byte_shift_m128(in, 15))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); } From 7295b9c718c1716ad2ec161f7be15fddeafcd737 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 00:01:54 +0300 Subject: [PATCH 24/35] [VSX] add algorithm for alignr w/o use of immediates --- src/util/arch/ppc64el/simd_utils.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 32014e54..ea1766b2 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -313,12 +313,18 @@ m128 palignr_imm(m128 r, m128 l, int offset) { static really_really_inline m128 palignr(m128 r, m128 l, int offset) { -#if defined(HS_OPTIMIZE) - // need a faster way to do this. - return palignr_imm(r, l, offset); -#else - return palignr_imm(r, l, offset); + if (offset == 0) return l; + if (offset == 16) return r; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset); + } #endif + m128 sl = (m128) vec_splats((uint8_t) (offset << 3)); + m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3)); + m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr); + m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl); + return or128(lhs, rhs); } #undef CASE_ALIGN_VECTORS From dc6b8ae92db27e9d9bd19a427f0128cb7ef6fc9b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 02:02:11 +0300 Subject: [PATCH 25/35] optimize comparemask implementation, clean up code, use union types instead of casts --- src/util/supervector/arch/ppc64el/impl.cpp | 160 +++++++++++++-------- src/util/supervector/supervector.hpp | 10 +- 2 files changed, 108 insertions(+), 62 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 5becb8f8..7903bee2 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -39,7 +39,7 @@ #include "util/supervector/supervector.hpp" #include -// 128-bit Powerpc64le implementation +// 128-bit IBM Power VSX implementation template<> really_inline SuperVector<16>::SuperVector(SuperVector const &other) @@ -47,6 +47,69 @@ really_inline SuperVector<16>::SuperVector(SuperVector const &other) u.v128[0] = other.u.v128[0]; } +template<> +template<> +really_inline SuperVector<16>::SuperVector(char __bool __vector v) +{ + u.u8x16[0] = (uint8x16_t) v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8x16_t const v) +{ + u.s8x16[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8x16_t const v) +{ + u.u8x16[0] = v; +}; + +template<> +template<> +really_inline 
SuperVector<16>::SuperVector(int16x8_t const v) +{ + u.s16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16x8_t const v) +{ + u.u16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32x4_t const v) +{ + u.s32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32x4_t const v) +{ + u.u32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64x2_t const v) +{ + u.s64x2[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64x2_t const v) +{ + u.u64x2[0] = v; +}; + template<> really_inline SuperVector<16>::SuperVector(typename base_type::type const v) { @@ -57,69 +120,69 @@ template<> template<> really_inline SuperVector<16>::SuperVector(int8_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s8x16[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u8x16[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int16_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s16x8[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint16_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u16x8[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int32_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s32x4[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u32x4[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.s64x2[0] = (int64x2_t) vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(uint64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u64x2[0] = (uint64x2_t) vec_splats(static_cast(other)); } // Constants template<> really_inline SuperVector<16> SuperVector<16>::Ones(void) { - return {(m128) vec_splat_s8(-1)}; + return { vec_splat_s8(-1)}; } template<> really_inline SuperVector<16> SuperVector<16>::Zeroes(void) { - return {(m128) vec_splat_s8(0)}; + return { vec_splat_s8(0) }; } // Methods @@ -133,39 +196,38 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) template <> really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const { - return {vec_and(u.v128[0], b.u.v128[0])}; + return { vec_and(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const { - return {vec_or(u.v128[0], b.u.v128[0])}; + return { vec_or(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const { - return {(m128) vec_xor(u.v128[0], b.u.v128[0])}; + return { vec_xor(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator!() const { - return {(m128) vec_xor(u.v128[0], u.v128[0])}; + return { vec_xor(u.v128[0], u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { - m128 not_res = vec_xor(u.v128[0], 
(m128)vec_splat_s8(-1)); - return {(m128) vec_and(not_res, (m128)b.u.v128[0]) }; + int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1)); + return { vec_and(not_res, b.u.s8x16[0]) }; } - template <> really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const { - return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; + return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; } template <> @@ -177,28 +239,27 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const template <> really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const { - return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])}; + return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const { - return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])}; + return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const { - return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])}; + return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const { - return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])}; + return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])}; } - template <> really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const { @@ -208,25 +269,12 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons template <> really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::comparemask(void) const { - uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); - - uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); - uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); - uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - - uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14)); - uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); - uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - - uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); - uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); - - uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); - uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - - return s5[0]; + uint8x16_t bitmask = vec_gb( u.u8x16[0]); + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 movemask; + vec_ste((uint32x4_t) bitmask, 0, &movemask); + return movemask; } template <> @@ -248,35 +296,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const { - return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) }; + return { vec_sl(u.s8x16[0], vec_splat_u8(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const { - return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) }; + return { vec_sl(u.s16x8[0], vec_splat_u16(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const { - return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) }; + return { vec_sl(u.s32x4[0], vec_splat_u32(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { - 
return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) }; + return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const { - return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)}; + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; } template <> @@ -290,35 +338,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const { - return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) }; + return { vec_sr(u.s8x16[0], vec_splat_u8(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const { - return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) }; + return { vec_sr(u.s16x8[0], vec_splat_u16(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const { - return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) }; + return { vec_sr(u.s32x4[0], vec_splat_u32(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const { - return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; + return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const { - return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) }; + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; } template <> @@ -535,9 +583,7 @@ template <> really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { SuperVector<16> mask = Ones_vshr(16 -len); - mask.print8("mask"); SuperVector<16> v = loadu(ptr); - v.print8("v"); return mask & v; } @@ -574,9 +620,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. below is the version that is converted from Intel to PPC. 
*/ - uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80)); + uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80)); uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]); - return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask); + return { vec_sel(res, vec_splat_u8(0), mask) }; } template<> diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 5d066c1a..fef5f09f 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -177,13 +177,13 @@ public: #if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL) uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; - int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; + int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; - int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; + int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; uint16x8_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; - int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; + int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; - int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; + int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; #endif uint64_t u64[SIZE / sizeof(uint64_t)]; @@ -204,7 +204,7 @@ public: SuperVector(typename base_type::type const v); template - SuperVector(T other); + SuperVector(T const other); SuperVector(SuperVector const lo, SuperVector const hi); SuperVector(previous_type const lo, previous_type const hi); From be20c2c519b4afde108db21a90296410db933ed9 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 11:52:08 +0300 Subject: [PATCH 26/35] [VSX] optimize shifting methods, replace template Unroller --- src/util/supervector/arch/ppc64el/impl.cpp | 62 ++++++++-------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 7903bee2..94aa6a32 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -396,50 +396,40 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); - return result; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sl(u.u8x16[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); - return result; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sl(u.u16x8[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_32 
(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); - return result; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sl(u.u32x4[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); - return result; + uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sl(u.u64x2[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); - return result; + SuperVector sl{N << 3}; + return { vec_slo(u.u8x16[0], sl.u.u8x16[0]) }; } template <> @@ -452,50 +442,40 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); - return result; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sr(u.u8x16[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); - return result; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sr(u.u16x8[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); - return result; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sr(u.u32x4[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); - return result; + uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sr(u.u64x2[0], shift_indices) }; } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result 
= {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; }); - return result; + SuperVector sr{N << 3}; + return { vec_sro(u.u8x16[0], sr.u.u8x16[0]) }; } template <> From a837cf3bee355ab082e948d157c0eece66d46acc Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 12:16:14 +0300 Subject: [PATCH 27/35] [VSX] optimize shift operators --- src/util/supervector/arch/ppc64el/impl.cpp | 50 ++++++---------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 94aa6a32..90847a0c 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -487,51 +487,25 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { - switch(N) { - case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 15)}; break; - case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 14)}; break; - case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 13)}; break; - case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 12)}; break; - case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 11)}; break; - case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 10)}; break; - case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 9)}; break; - case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 8)}; break; - case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 7)}; break; - case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break; - case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break; - case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break; - case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break; - case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break; - case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break; - case 16: return Zeroes(); break; - default: break; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; } - return *this; +#endif + return vshr_128(N); } template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - switch(N) { - case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break; - case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break; - case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break; - case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break; - case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break; - case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break; - case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break; - case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break; - case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break; - case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break; - case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; 
break; - case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 12)}; break; - case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break; - case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break; - case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 15)}; break; - case 16: return Zeroes(); break; - default: break; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; } - return *this; +#endif + return vshl_128(N); } template<> From 305a041c737b882b17c609ca54faf39bf37788bd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 12:35:28 +0300 Subject: [PATCH 28/35] [VSX] optimize alignr method --- src/util/supervector/arch/ppc64el/impl.cpp | 35 ++++++++-------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 90847a0c..2eba69b2 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -523,14 +523,14 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { - return (m128) vec_xl(0, (const long64_t*)ptr); + return { vec_xl(0, (const long64_t*)ptr) }; } template <> really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) { assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); - return (m128) vec_xl(0, (const long64_t*)ptr); + return { vec_xl(0, (const long64_t*)ptr) }; } template <> @@ -544,27 +544,18 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { - - switch(offset) { - case 0: return other; break; - case 1: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 15)}; break; - case 2: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 14)}; break; - case 3: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 13)}; break; - case 4: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 12)}; break; - case 5: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 11)}; break; - case 6: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 10)}; break; - case 7: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 9)}; break; - case 8: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 8)}; break; - case 9: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 7)}; break; - case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break; - case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break; - case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break; - case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break; - case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break; - case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break; - default: break; + if (offset == 0) return other; + if (offset == 16) return *this; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return { vec_sld(u.s8x16[0], other.u.s8x16[0], offset) }; } - return *this; +#endif + uint8x16_t sl = vec_splats((uint8_t) (offset << 3)); + uint8x16_t sr = vec_splats((uint8_t) ((16 - offset) << 3)); + uint8x16_t rhs = vec_slo(u.u8x16[0], sr); + uint8x16_t lhs = vec_sro(other.u.u8x16[0], sl); + 
return { vec_or(lhs, rhs) }; } template<> From 02ae2a3cad3410129a98d4f530f3f3b316e24c29 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 12:41:32 +0300 Subject: [PATCH 29/35] remove simd_onebit_masks from arm/x86 headers, as they moved to common --- src/util/arch/arm/simd_utils.h | 18 ------------------ src/util/arch/x86/simd_utils.h | 18 ------------------ 2 files changed, 36 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 2a4f9c16..6447996c 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -53,24 +53,6 @@ #include // for memcpy -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. */ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; - static really_inline m128 ones128(void) { return (m128) vdupq_n_s8(0xFF); } diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index c4a3b97c..d432251f 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -42,24 +42,6 @@ #include // for memcpy -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. */ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; - static really_inline m128 ones128(void) { #if defined(__GNUC__) || defined(__INTEL_COMPILER) /* gcc gets this right */ From 0af2ba86165c469361fbfd9f34fd70aa2a53213d Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 10:20:01 +0000 Subject: [PATCH 30/35] [NEON] optimize mask1bit128, get rid of simd_onebit_masks --- src/util/arch/arm/simd_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 6447996c..45bcd23c 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -577,9 +577,9 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); + static m128 onebit = { 1, 0 }; + m128 mask = lshiftbyte_m128( onebit, n / 8 ); + return lshift64_m128( mask, n % 8 ); } // switches on bit N in the given vector. 
From 1ae0d151812fd7627ae921632af49309b14c22ae Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 13:42:25 +0300 Subject: [PATCH 31/35] readd simd_onebit_masks for x86, needs more work --- src/util/arch/common/simd_utils.h | 2 ++ src/util/arch/x86/simd_utils.h | 26 ++++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 2f2dcf7c..90ae80b0 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -88,6 +88,7 @@ static inline void print_m128_2x64(const char *label, m128 vec) { #define print_m128_2x64(label, vec) ; #endif +#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 @@ -105,6 +106,7 @@ ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { ZEROES_31, 0x80, ZEROES_32, ZEROES_32, ZEROES_32, }; +#endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64) /**** **** 256-bit Primitives diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index d432251f..f732e3b8 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -42,6 +42,24 @@ #include // for memcpy +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; + static really_inline m128 ones128(void) { #if defined(__GNUC__) || defined(__INTEL_COMPILER) /* gcc gets this right */ @@ -237,14 +255,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) { memcpy(&a, ptr, n); return a; } -/* -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} -#endif*/ static really_inline m128 mask1bit128(unsigned int n) { From 756ef409b400cabb66ae55d44971593fe85607d7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 15:07:20 +0300 Subject: [PATCH 32/35] provide non-immediate versions of lshiftbyte/rshiftbyte on x86 --- src/util/arch/x86/simd_utils.h | 65 ++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index f732e3b8..d3d07f79 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -165,8 +165,67 @@ m128 load_m128_from_u64a(const u64a *p) { return _mm_set_epi64x(0LL, *p); } -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) +#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; + +static really_inline +m128 rshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(count_immed)) { + return _mm_srli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_RSHIFT_VECTOR(a, 1); + CASE_RSHIFT_VECTOR(a, 2); + CASE_RSHIFT_VECTOR(a, 3); + 
CASE_RSHIFT_VECTOR(a, 4); + CASE_RSHIFT_VECTOR(a, 5); + CASE_RSHIFT_VECTOR(a, 6); + CASE_RSHIFT_VECTOR(a, 7); + CASE_RSHIFT_VECTOR(a, 8); + CASE_RSHIFT_VECTOR(a, 9); + CASE_RSHIFT_VECTOR(a, 10); + CASE_RSHIFT_VECTOR(a, 11); + CASE_RSHIFT_VECTOR(a, 12); + CASE_RSHIFT_VECTOR(a, 13); + CASE_RSHIFT_VECTOR(a, 14); + CASE_RSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_RSHIFT_VECTOR + +#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; + +static really_inline +m128 lshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(count_immed)) { + return _mm_slli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_LSHIFT_VECTOR(a, 1); + CASE_LSHIFT_VECTOR(a, 2); + CASE_LSHIFT_VECTOR(a, 3); + CASE_LSHIFT_VECTOR(a, 4); + CASE_LSHIFT_VECTOR(a, 5); + CASE_LSHIFT_VECTOR(a, 6); + CASE_LSHIFT_VECTOR(a, 7); + CASE_LSHIFT_VECTOR(a, 8); + CASE_LSHIFT_VECTOR(a, 9); + CASE_LSHIFT_VECTOR(a, 10); + CASE_LSHIFT_VECTOR(a, 11); + CASE_LSHIFT_VECTOR(a, 12); + CASE_LSHIFT_VECTOR(a, 13); + CASE_LSHIFT_VECTOR(a, 14); + CASE_LSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_LSHIFT_VECTOR #if defined(HAVE_SSE41) #define extract32from128(a, imm) _mm_extract_epi32(a, imm) @@ -322,6 +381,7 @@ m128 palignr_sw(m128 r, m128 l, int offset) { break; } } +#undef CASE_ALIGN_VECTORS static really_really_inline m128 palignr(m128 r, m128 l, int offset) { @@ -332,7 +392,6 @@ m128 palignr(m128 r, m128 l, int offset) { #endif return palignr_sw(r, l, offset); } -#undef CASE_ALIGN_VECTORS static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { From e3c237a7e055a0cf885712ca9ab9d907eb6bb18e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Sep 2022 16:00:10 +0300 Subject: [PATCH 33/35] use correct intrinsic for lshiftbyte_m128 --- src/util/arch/x86/simd_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index d3d07f79..924a91c6 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -196,7 +196,7 @@ m128 rshiftbyte_m128(const m128 a, int count_immed) { } #undef CASE_RSHIFT_VECTOR -#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; +#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break; static really_inline m128 lshiftbyte_m128(const m128 a, int count_immed) { From f4840adf3d6ff539241e2db3548b96a96585b138 Mon Sep 17 00:00:00 2001 From: liquidaty Date: Thu, 8 Sep 2022 09:59:37 -0700 Subject: [PATCH 34/35] fix to enable successful build with mingw64 --- src/util/alloc.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/util/alloc.cpp b/src/util/alloc.cpp index f3a2a259..40004932 100644 --- a/src/util/alloc.cpp +++ b/src/util/alloc.cpp @@ -47,7 +47,15 @@ namespace ue2 { #endif /* get us a posix_memalign from somewhere */ -#if !defined(HAVE_POSIX_MEMALIGN) +#if defined(__MINGW32__) || defined(__MINGW64__) + #include + #include + #include + #include + + #define posix_memalign(A, B, C) ((*A = (void *)__mingw_aligned_malloc(C, B)) == nullptr) + +#elif !defined(HAVE_POSIX_MEMALIGN) # if defined(HAVE_MEMALIGN) #define posix_memalign(A, B, C) ((*A = (void *)memalign(B, C)) == nullptr) # elif defined(HAVE__ALIGNED_MALLOC) @@ -77,7 +85,11 @@ void aligned_free_internal(void *ptr) 
{ return; } +#if defined(__MINGW32__) || defined(__MINGW64__) + __mingw_aligned_free(ptr); +#else free(ptr); +#endif } /** \brief 64-byte aligned, zeroed malloc. From 67b414f2f9e543e894ea3204e6ce71721a0c251b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 12 Sep 2022 13:09:51 +0000 Subject: [PATCH 35/35] [NEON] simplify/optimize shift/align primitives --- src/util/arch/arm/simd_utils.h | 220 +------------------------ src/util/supervector/arch/arm/impl.cpp | 96 ++++------- 2 files changed, 41 insertions(+), 275 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 45bcd23c..7f8539b0 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -112,43 +112,8 @@ m128 lshift_m128(m128 a, unsigned b) { return (m128) vshlq_n_u32((uint32x4_t)a, b); } #endif -#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break; - switch (b) { - case 0: return a; break; - CASE_LSHIFT_m128(a, 1); - CASE_LSHIFT_m128(a, 2); - CASE_LSHIFT_m128(a, 3); - CASE_LSHIFT_m128(a, 4); - CASE_LSHIFT_m128(a, 5); - CASE_LSHIFT_m128(a, 6); - CASE_LSHIFT_m128(a, 7); - CASE_LSHIFT_m128(a, 8); - CASE_LSHIFT_m128(a, 9); - CASE_LSHIFT_m128(a, 10); - CASE_LSHIFT_m128(a, 11); - CASE_LSHIFT_m128(a, 12); - CASE_LSHIFT_m128(a, 13); - CASE_LSHIFT_m128(a, 14); - CASE_LSHIFT_m128(a, 15); - CASE_LSHIFT_m128(a, 16); - CASE_LSHIFT_m128(a, 17); - CASE_LSHIFT_m128(a, 18); - CASE_LSHIFT_m128(a, 19); - CASE_LSHIFT_m128(a, 20); - CASE_LSHIFT_m128(a, 21); - CASE_LSHIFT_m128(a, 22); - CASE_LSHIFT_m128(a, 23); - CASE_LSHIFT_m128(a, 24); - CASE_LSHIFT_m128(a, 25); - CASE_LSHIFT_m128(a, 26); - CASE_LSHIFT_m128(a, 27); - CASE_LSHIFT_m128(a, 28); - CASE_LSHIFT_m128(a, 29); - CASE_LSHIFT_m128(a, 30); - CASE_LSHIFT_m128(a, 31); - default: return zeroes128(); break; - } -#undef CASE_LSHIFT_m128 + int32x4_t shift_indices = vdupq_n_s32(b); + return (m128) vshlq_s32(a, shift_indices); } static really_really_inline @@ -158,43 +123,8 @@ m128 rshift_m128(m128 a, unsigned b) { return (m128) vshrq_n_u32((uint32x4_t)a, b); } #endif -#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break; - switch (b) { - case 0: return a; break; - CASE_RSHIFT_m128(a, 1); - CASE_RSHIFT_m128(a, 2); - CASE_RSHIFT_m128(a, 3); - CASE_RSHIFT_m128(a, 4); - CASE_RSHIFT_m128(a, 5); - CASE_RSHIFT_m128(a, 6); - CASE_RSHIFT_m128(a, 7); - CASE_RSHIFT_m128(a, 8); - CASE_RSHIFT_m128(a, 9); - CASE_RSHIFT_m128(a, 10); - CASE_RSHIFT_m128(a, 11); - CASE_RSHIFT_m128(a, 12); - CASE_RSHIFT_m128(a, 13); - CASE_RSHIFT_m128(a, 14); - CASE_RSHIFT_m128(a, 15); - CASE_RSHIFT_m128(a, 16); - CASE_RSHIFT_m128(a, 17); - CASE_RSHIFT_m128(a, 18); - CASE_RSHIFT_m128(a, 19); - CASE_RSHIFT_m128(a, 20); - CASE_RSHIFT_m128(a, 21); - CASE_RSHIFT_m128(a, 22); - CASE_RSHIFT_m128(a, 23); - CASE_RSHIFT_m128(a, 24); - CASE_RSHIFT_m128(a, 25); - CASE_RSHIFT_m128(a, 26); - CASE_RSHIFT_m128(a, 27); - CASE_RSHIFT_m128(a, 28); - CASE_RSHIFT_m128(a, 29); - CASE_RSHIFT_m128(a, 30); - CASE_RSHIFT_m128(a, 31); - default: return zeroes128(); break; - } -#undef CASE_RSHIFT_m128 + int32x4_t shift_indices = vdupq_n_s32(-b); + return (m128) vshlq_s32(a, shift_indices); } static really_really_inline @@ -204,75 +134,8 @@ m128 lshift64_m128(m128 a, unsigned b) { return (m128) vshlq_n_u64((uint64x2_t)a, b); } #endif -#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break; - switch (b) { - case 0: return a; 
break; - CASE_LSHIFT64_m128(a, 1); - CASE_LSHIFT64_m128(a, 2); - CASE_LSHIFT64_m128(a, 3); - CASE_LSHIFT64_m128(a, 4); - CASE_LSHIFT64_m128(a, 5); - CASE_LSHIFT64_m128(a, 6); - CASE_LSHIFT64_m128(a, 7); - CASE_LSHIFT64_m128(a, 8); - CASE_LSHIFT64_m128(a, 9); - CASE_LSHIFT64_m128(a, 10); - CASE_LSHIFT64_m128(a, 11); - CASE_LSHIFT64_m128(a, 12); - CASE_LSHIFT64_m128(a, 13); - CASE_LSHIFT64_m128(a, 14); - CASE_LSHIFT64_m128(a, 15); - CASE_LSHIFT64_m128(a, 16); - CASE_LSHIFT64_m128(a, 17); - CASE_LSHIFT64_m128(a, 18); - CASE_LSHIFT64_m128(a, 19); - CASE_LSHIFT64_m128(a, 20); - CASE_LSHIFT64_m128(a, 21); - CASE_LSHIFT64_m128(a, 22); - CASE_LSHIFT64_m128(a, 23); - CASE_LSHIFT64_m128(a, 24); - CASE_LSHIFT64_m128(a, 25); - CASE_LSHIFT64_m128(a, 26); - CASE_LSHIFT64_m128(a, 27); - CASE_LSHIFT64_m128(a, 28); - CASE_LSHIFT64_m128(a, 29); - CASE_LSHIFT64_m128(a, 30); - CASE_LSHIFT64_m128(a, 31); - CASE_LSHIFT64_m128(a, 32); - CASE_LSHIFT64_m128(a, 33); - CASE_LSHIFT64_m128(a, 34); - CASE_LSHIFT64_m128(a, 35); - CASE_LSHIFT64_m128(a, 36); - CASE_LSHIFT64_m128(a, 37); - CASE_LSHIFT64_m128(a, 38); - CASE_LSHIFT64_m128(a, 39); - CASE_LSHIFT64_m128(a, 40); - CASE_LSHIFT64_m128(a, 41); - CASE_LSHIFT64_m128(a, 42); - CASE_LSHIFT64_m128(a, 43); - CASE_LSHIFT64_m128(a, 44); - CASE_LSHIFT64_m128(a, 45); - CASE_LSHIFT64_m128(a, 46); - CASE_LSHIFT64_m128(a, 47); - CASE_LSHIFT64_m128(a, 48); - CASE_LSHIFT64_m128(a, 49); - CASE_LSHIFT64_m128(a, 50); - CASE_LSHIFT64_m128(a, 51); - CASE_LSHIFT64_m128(a, 52); - CASE_LSHIFT64_m128(a, 53); - CASE_LSHIFT64_m128(a, 54); - CASE_LSHIFT64_m128(a, 55); - CASE_LSHIFT64_m128(a, 56); - CASE_LSHIFT64_m128(a, 57); - CASE_LSHIFT64_m128(a, 58); - CASE_LSHIFT64_m128(a, 59); - CASE_LSHIFT64_m128(a, 60); - CASE_LSHIFT64_m128(a, 61); - CASE_LSHIFT64_m128(a, 62); - CASE_LSHIFT64_m128(a, 63); - default: return zeroes128(); break; - } -#undef CASE_LSHIFT64_m128 + int64x2_t shift_indices = vdupq_n_s64(b); + return (m128) vshlq_s64((int64x2_t) a, shift_indices); } static really_really_inline @@ -282,75 +145,8 @@ m128 rshift64_m128(m128 a, unsigned b) { return (m128) vshrq_n_u64((uint64x2_t)a, b); } #endif -#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break; - switch (b) { - case 0: return a; break; - CASE_RSHIFT64_m128(a, 1); - CASE_RSHIFT64_m128(a, 2); - CASE_RSHIFT64_m128(a, 3); - CASE_RSHIFT64_m128(a, 4); - CASE_RSHIFT64_m128(a, 5); - CASE_RSHIFT64_m128(a, 6); - CASE_RSHIFT64_m128(a, 7); - CASE_RSHIFT64_m128(a, 8); - CASE_RSHIFT64_m128(a, 9); - CASE_RSHIFT64_m128(a, 10); - CASE_RSHIFT64_m128(a, 11); - CASE_RSHIFT64_m128(a, 12); - CASE_RSHIFT64_m128(a, 13); - CASE_RSHIFT64_m128(a, 14); - CASE_RSHIFT64_m128(a, 15); - CASE_RSHIFT64_m128(a, 16); - CASE_RSHIFT64_m128(a, 17); - CASE_RSHIFT64_m128(a, 18); - CASE_RSHIFT64_m128(a, 19); - CASE_RSHIFT64_m128(a, 20); - CASE_RSHIFT64_m128(a, 21); - CASE_RSHIFT64_m128(a, 22); - CASE_RSHIFT64_m128(a, 23); - CASE_RSHIFT64_m128(a, 24); - CASE_RSHIFT64_m128(a, 25); - CASE_RSHIFT64_m128(a, 26); - CASE_RSHIFT64_m128(a, 27); - CASE_RSHIFT64_m128(a, 28); - CASE_RSHIFT64_m128(a, 29); - CASE_RSHIFT64_m128(a, 30); - CASE_RSHIFT64_m128(a, 31); - CASE_RSHIFT64_m128(a, 32); - CASE_RSHIFT64_m128(a, 33); - CASE_RSHIFT64_m128(a, 34); - CASE_RSHIFT64_m128(a, 35); - CASE_RSHIFT64_m128(a, 36); - CASE_RSHIFT64_m128(a, 37); - CASE_RSHIFT64_m128(a, 38); - CASE_RSHIFT64_m128(a, 39); - CASE_RSHIFT64_m128(a, 40); - CASE_RSHIFT64_m128(a, 41); - CASE_RSHIFT64_m128(a, 42); - CASE_RSHIFT64_m128(a, 43); - 
-    CASE_RSHIFT64_m128(a, 44);
-    CASE_RSHIFT64_m128(a, 45);
-    CASE_RSHIFT64_m128(a, 46);
-    CASE_RSHIFT64_m128(a, 47);
-    CASE_RSHIFT64_m128(a, 48);
-    CASE_RSHIFT64_m128(a, 49);
-    CASE_RSHIFT64_m128(a, 50);
-    CASE_RSHIFT64_m128(a, 51);
-    CASE_RSHIFT64_m128(a, 52);
-    CASE_RSHIFT64_m128(a, 53);
-    CASE_RSHIFT64_m128(a, 54);
-    CASE_RSHIFT64_m128(a, 55);
-    CASE_RSHIFT64_m128(a, 56);
-    CASE_RSHIFT64_m128(a, 57);
-    CASE_RSHIFT64_m128(a, 58);
-    CASE_RSHIFT64_m128(a, 59);
-    CASE_RSHIFT64_m128(a, 60);
-    CASE_RSHIFT64_m128(a, 61);
-    CASE_RSHIFT64_m128(a, 62);
-    CASE_RSHIFT64_m128(a, 63);
-    default: return zeroes128(); break;
-    }
-#undef CASE_RSHIFT64_m128
+    int64x2_t shift_indices = vdupq_n_s64(-b);
+    return (m128) vshlq_u64((uint64x2_t) a, shift_indices);
 }

 static really_inline m128 eq128(m128 a, m128 b) {
diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp
index b3e4233e..5283ab00 100644
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -374,10 +374,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
-    return result;
+    if (N == 8) return Zeroes();
+    int8x16_t shift_indices = vdupq_n_s8(N);
+    return { vshlq_s8(u.s8x16[0], shift_indices) };
 }

 template <>
@@ -385,9 +384,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
-    return result;
+    int16x8_t shift_indices = vdupq_n_s16(N);
+    return { vshlq_s16(u.s16x8[0], shift_indices) };
 }

 template <>
@@ -395,9 +393,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
-    return result;
+    int32x4_t shift_indices = vdupq_n_s32(N);
+    return { vshlq_s32(u.s32x4[0], shift_indices) };
 }

 template <>
@@ -405,9 +402,8 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 64) return Zeroes();
-    SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
-    return result;
+    int64x2_t shift_indices = vdupq_n_s64(N);
+    return { vshlq_s64(u.s64x2[0], shift_indices) };
 }

 template <>
@@ -415,6 +411,11 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(N)) {
+        return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
+    }
+#endif
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
     return result;
@@ -431,9 +432,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 8) return Zeroes();
-    SuperVector result;
-    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
-    return result;
+    int8x16_t shift_indices = vdupq_n_s8(-N);
+    return { vshlq_u8(u.u8x16[0], shift_indices) };
 }

 template <>
@@ -441,9 +441,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
-    SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
-    return result;
+    int16x8_t shift_indices = vdupq_n_s16(-N);
+    return { vshlq_u16(u.u16x8[0], shift_indices) };
 }

 template <>
@@ -451,9 +450,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 32) return Zeroes();
-    SuperVector result;
-    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
-    return result;
+    int32x4_t shift_indices = vdupq_n_s32(-N);
+    return { vshlq_u32(u.u32x4[0], shift_indices) };
 }

 template <>
@@ -461,9 +459,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 64) return Zeroes();
-    SuperVector result;
-    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
-    return result;
+    int64x2_t shift_indices = vdupq_n_s64(-N);
+    return { vshlq_u64(u.u64x2[0], shift_indices) };
 }

 template <>
@@ -471,6 +468,11 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(N)) {
+        return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
+    }
+#endif
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
     return result;
@@ -485,22 +487,12 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(N)) {
-        return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
-    }
-#endif
     return vshr_128(N);
 }

 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(N)) {
-        return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
-    }
-#endif
     return vshl_128(N);
 }

@@ -534,45 +526,23 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
     SuperVector mask = Ones_vshr(16 -len);
-    //mask.print8("mask");
     SuperVector<16> v = loadu(ptr);
-    //v.print8("v");
     return mask & v;
 }

 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {
+    if (offset == 0) return other;
+    if (offset == 16) return *this;
 #if defined(HAVE__BUILTIN_CONSTANT_P)
     if (__builtin_constant_p(offset)) {
-        if (offset == 16) {
-            return *this;
-        } else {
-            return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)};
-        }
+        return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)};
     }
 #endif
-    switch(offset) {
-    case 0: return other; break;
-    case 1: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 1)}; break;
-    case 2: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 2)}; break;
-    case 3: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 3)}; break;
-    case 4: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 4)}; break;
-    case 5: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 5)}; break;
-    case 6: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 6)}; break;
-    case 7: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 7)}; break;
-    case 8: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 8)}; break;
-    case 9: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 9)}; break;
-    case 10: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 10)}; break;
-    case 11: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 11)}; break;
-    case 12: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 12)}; break;
-    case 13: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 13)}; break;
-    case 14: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 14)}; break;
-    case 15: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 15)}; break;
-    case 16: return *this; break;
-    default: break;
-    }
-    return *this;
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (offset == n) result = {vextq_u8(other.u.u8x16[0], v->u.u8x16[0], n)}; });
+    return result;
 }

 template<>
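A note on the trick this patch applies in every hunk: AArch64 NEON has no vector shift instruction that takes a runtime right-shift count, so each rewritten primitive broadcasts the negated count into a vector and uses VSHL (vshlq_*), which shifts left for positive per-lane counts and right for negative ones. The unsigned forms (vshlq_u*) perform the logical right shift that the replaced vshrq_n_u* immediates provided, whereas the signed forms (vshlq_s*) would shift arithmetically and smear the sign bit. A minimal standalone sketch of the technique follows, assuming an AArch64 target; the helper name var_rshift32 is illustrative and not part of the patch:

    #include <arm_neon.h>

    /* Right-shift four u32 lanes by a runtime count b.
     * USHL (vshlq_u32) shifts left for positive per-lane counts and
     * logically right for negative ones, so broadcast -b to all lanes. */
    static inline uint32x4_t var_rshift32(uint32x4_t a, unsigned b) {
        if (b == 0) return a;                /* nothing to shift */
        if (b >= 32) return vdupq_n_u32(0);  /* all bits shifted out */
        int32x4_t shift_indices = vdupq_n_s32(-(int)b);
        return vshlq_u32(a, shift_indices);
    }

The explicit guards mirror the N == 0 and lane-width special cases kept in vshl_8 and friends; a single vdupq_n plus vshlq pair then replaces a 30-to-60-way switch over the immediate-only vshlq_n_*/vshrq_n_* forms.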