mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-29 19:24:25 +03:00
Optimize vectorscan for aarch64 by using shrn instruction
This optimization is based on the thread https://twitter.com/Danlark1/status/1539344279268691970 and uses shift right and narrow by 4 instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate-- To achieve that, I needed to redesign a little movemask into comparemask and have an additional step towards mask iteration. Our benchmarks showed 10-15% improvement on average for long matches.
This commit is contained in:
@@ -249,25 +249,25 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const
|
||||
{
|
||||
SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL);
|
||||
|
||||
// Compute the mask from the input
|
||||
uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0]))));
|
||||
uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7);
|
||||
mask = vorrq_u8(mask, (uint8x16_t) mask1);
|
||||
|
||||
// Get the resulting bytes
|
||||
uint16_t output;
|
||||
vst1q_lane_u16(&output, (uint16x8_t)mask, 0);
|
||||
return static_cast<typename SuperVector<16>::movemask_type>(output);
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::comparemask(void) const {
|
||||
return static_cast<typename SuperVector<16>::comparemask_type>(
|
||||
vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0));
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
|
||||
{
|
||||
return eq(b).movemask();
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::eqmask(SuperVector<16> const b) const {
|
||||
return eq(b).comparemask();
|
||||
}
|
||||
|
||||
template <> really_inline u32 SuperVector<16>::mask_width() { return 4; }
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::iteration_mask(
|
||||
typename SuperVector<16>::comparemask_type mask) {
|
||||
return mask & 0x1111111111111111ull;
|
||||
}
|
||||
|
||||
template <>
|
||||
|
@@ -206,8 +206,8 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
|
||||
{
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::comparemask(void) const {
|
||||
uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
|
||||
|
||||
uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
|
||||
@@ -230,11 +230,19 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
|
||||
{
|
||||
return eq(b).movemask();
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::eqmask(SuperVector<16> const b) const {
|
||||
return eq(b).comparemask();
|
||||
}
|
||||
|
||||
template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::iteration_mask(
|
||||
typename SuperVector<16>::comparemask_type mask) {
|
||||
return mask;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<uint8_t N>
|
||||
|
@@ -203,15 +203,24 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
|
||||
{
|
||||
return _mm_movemask_epi8(u.v128[0]);
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::comparemask(void) const {
|
||||
return (u32)_mm_movemask_epi8(u.v128[0]);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
|
||||
{
|
||||
return eq(b).movemask();
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::eqmask(SuperVector<16> const b) const {
|
||||
return eq(b).comparemask();
|
||||
}
|
||||
|
||||
template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<16>::comparemask_type
|
||||
SuperVector<16>::iteration_mask(
|
||||
typename SuperVector<16>::comparemask_type mask) {
|
||||
return mask;
|
||||
}
|
||||
|
||||
// template <>
|
||||
@@ -754,17 +763,25 @@ really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) cons
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const
|
||||
{
|
||||
return _mm256_movemask_epi8(u.v256[0]);
|
||||
really_inline typename SuperVector<32>::comparemask_type
|
||||
SuperVector<32>::comparemask(void) const {
|
||||
return (u32)_mm256_movemask_epi8(u.v256[0]);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const
|
||||
{
|
||||
return eq(b).movemask();
|
||||
really_inline typename SuperVector<32>::comparemask_type
|
||||
SuperVector<32>::eqmask(SuperVector<32> const b) const {
|
||||
return eq(b).comparemask();
|
||||
}
|
||||
|
||||
template <> really_inline u32 SuperVector<32>::mask_width() { return 1; }
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<32>::comparemask_type
|
||||
SuperVector<32>::iteration_mask(
|
||||
typename SuperVector<32>::comparemask_type mask) {
|
||||
return mask;
|
||||
}
|
||||
|
||||
// template <>
|
||||
// template<uint8_t N>
|
||||
@@ -1347,42 +1364,48 @@ really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const
|
||||
{
|
||||
SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
SuperVector<64>::comparemask_type mask =
|
||||
_mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
return {_mm512_movm_epi8(mask)};
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const
|
||||
{
|
||||
SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
SuperVector<64>::comparemask_type mask =
|
||||
_mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
return {_mm512_movm_epi8(mask)};
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const
|
||||
{
|
||||
SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
SuperVector<64>::comparemask_type mask =
|
||||
_mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
return {_mm512_movm_epi8(mask)};
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const
|
||||
{
|
||||
SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
SuperVector<64>::comparemask_type mask =
|
||||
_mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
return {_mm512_movm_epi8(mask)};
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const
|
||||
{
|
||||
SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
SuperVector<64>::comparemask_type mask =
|
||||
_mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
return {_mm512_movm_epi8(mask)};
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const
|
||||
{
|
||||
SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
SuperVector<64>::comparemask_type mask =
|
||||
_mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
return {_mm512_movm_epi8(mask)};
|
||||
}
|
||||
|
||||
@@ -1393,19 +1416,28 @@ really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) cons
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const
|
||||
{
|
||||
really_inline typename SuperVector<64>::comparemask_type
|
||||
SuperVector<64>::comparemask(void) const {
|
||||
__m512i msb = _mm512_set1_epi8(0xFF);
|
||||
__m512i mask = _mm512_and_si512(msb, u.v512[0]);
|
||||
return _mm512_cmpeq_epi8_mask(mask, msb);
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const
|
||||
{
|
||||
really_inline typename SuperVector<64>::comparemask_type
|
||||
SuperVector<64>::eqmask(SuperVector<64> const b) const {
|
||||
return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
}
|
||||
|
||||
template <> really_inline u32 SuperVector<64>::mask_width() { return 1; }
|
||||
|
||||
template <>
|
||||
really_inline typename SuperVector<64>::comparemask_type
|
||||
SuperVector<64>::iteration_mask(
|
||||
typename SuperVector<64>::comparemask_type mask) {
|
||||
return mask;
|
||||
}
|
||||
|
||||
// template <>
|
||||
// template<uint8_t N>
|
||||
// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
|
||||
|
@@ -46,19 +46,29 @@
|
||||
using Z_TYPE = u64a;
|
||||
#define Z_BITS 64
|
||||
#define Z_SHIFT 63
|
||||
#define Z_POSSHIFT 0
|
||||
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l)))
|
||||
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#elif defined(HAVE_SIMD_256_BITS)
|
||||
using Z_TYPE = u32;
|
||||
#define Z_BITS 32
|
||||
#define Z_SHIFT 31
|
||||
#define Z_POSSHIFT 0
|
||||
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#elif defined(HAVE_SIMD_128_BITS)
|
||||
#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
|
||||
using Z_TYPE = u64a;
|
||||
#define Z_BITS 64
|
||||
#define Z_POSSHIFT 2
|
||||
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
|
||||
#else
|
||||
using Z_TYPE = u32;
|
||||
#define Z_BITS 32
|
||||
#define Z_POSSHIFT 0
|
||||
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#endif
|
||||
#define Z_SHIFT 15
|
||||
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#endif
|
||||
|
||||
@@ -94,7 +104,8 @@ struct BaseVector
|
||||
static constexpr bool is_valid = false;
|
||||
static constexpr u16 size = 8;
|
||||
using type = void;
|
||||
using movemask_type = void;
|
||||
using comparemask_type = void;
|
||||
using cmpmask_type = void;
|
||||
static constexpr bool has_previous = false;
|
||||
using previous_type = void;
|
||||
static constexpr u16 previous_size = 4;
|
||||
@@ -106,7 +117,7 @@ struct BaseVector<128>
|
||||
static constexpr bool is_valid = true;
|
||||
static constexpr u16 size = 128;
|
||||
using type = void;
|
||||
using movemask_type = u64a;
|
||||
using comparemask_type = u64a;
|
||||
static constexpr bool has_previous = true;
|
||||
using previous_type = m512;
|
||||
static constexpr u16 previous_size = 64;
|
||||
@@ -118,7 +129,7 @@ struct BaseVector<64>
|
||||
static constexpr bool is_valid = true;
|
||||
static constexpr u16 size = 64;
|
||||
using type = m512;
|
||||
using movemask_type = u64a;
|
||||
using comparemask_type = u64a;
|
||||
static constexpr bool has_previous = true;
|
||||
using previous_type = m256;
|
||||
static constexpr u16 previous_size = 32;
|
||||
@@ -131,7 +142,7 @@ struct BaseVector<32>
|
||||
static constexpr bool is_valid = true;
|
||||
static constexpr u16 size = 32;
|
||||
using type = m256;
|
||||
using movemask_type = u32;
|
||||
using comparemask_type = u64a;
|
||||
static constexpr bool has_previous = true;
|
||||
using previous_type = m128;
|
||||
static constexpr u16 previous_size = 16;
|
||||
@@ -144,7 +155,7 @@ struct BaseVector<16>
|
||||
static constexpr bool is_valid = true;
|
||||
static constexpr u16 size = 16;
|
||||
using type = m128;
|
||||
using movemask_type = u32;
|
||||
using comparemask_type = u64a;
|
||||
static constexpr bool has_previous = false;
|
||||
using previous_type = u64a;
|
||||
static constexpr u16 previous_size = 8;
|
||||
@@ -231,8 +242,17 @@ public:
|
||||
SuperVector eq(SuperVector const &b) const;
|
||||
SuperVector operator<<(uint8_t const N) const;
|
||||
SuperVector operator>>(uint8_t const N) const;
|
||||
typename base_type::movemask_type movemask(void) const;
|
||||
typename base_type::movemask_type eqmask(SuperVector const b) const;
|
||||
// Returns mask_width groups of zeros or ones. To get the mask which can be
|
||||
// iterated, use iteration_mask method, it ensures only one bit is set per
|
||||
// mask_width group.
|
||||
// Precondition: all bytes must be 0 or 0xff.
|
||||
typename base_type::comparemask_type comparemask(void) const;
|
||||
typename base_type::comparemask_type eqmask(SuperVector const b) const;
|
||||
static u32 mask_width();
|
||||
// Returns a mask with at most 1 bit set to 1. It can be used to iterate
|
||||
// over bits through ctz/clz and lowest bit clear.
|
||||
static typename base_type::comparemask_type
|
||||
iteration_mask(typename base_type::comparemask_type mask);
|
||||
|
||||
static SuperVector loadu(void const *ptr);
|
||||
static SuperVector load(void const *ptr);
|
||||
|
Reference in New Issue
Block a user