Optimize vectorscan for aarch64 by using shrn instruction

This optimization is based on the thread
https://twitter.com/Danlark1/status/1539344279268691970 and uses the
shift-right-and-narrow-by-4 (SHRN) instruction:
https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate--

To achieve that, I needed to slightly redesign movemask into comparemask
and add an extra step for mask iteration. Our benchmarks
showed a 10-15% improvement on average for long matches.
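The core of the change is the NEON SHRN trick: shifting each 16-bit lane of a
byte-wise comparison result right by 4 and narrowing it to 8 bits collapses
every input byte into one nibble, so a whole 128-bit compare fits in a single
64-bit mask (4 bits per byte instead of x86's 1 bit per byte). A minimal
sketch of that idea, assuming aarch64/NEON; the helper names are illustrative,
not the vectorscan API:

```cpp
#include <arm_neon.h>
#include <stdint.h>

// Every byte of `cmp` must be 0x00 or 0xff, e.g. the result of vceqq_u8.
static inline uint64_t neon_comparemask(uint8x16_t cmp) {
    // Shift each u16 lane right by 4 and narrow to u8: each input byte
    // becomes a nibble of 0x0 or 0xf in a 64-bit mask.
    uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
    return vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
}

static inline unsigned first_match_byte(uint64_t mask) {
    // Each byte owns 4 mask bits, so divide the bit index by the mask width.
    return __builtin_ctzll(mask) / 4; // caller guarantees mask != 0
}
```

Compared with emulating x86 movemask on NEON (a chain of shifts and
accumulates, or a powers-of-two multiply-and-add), the narrowed mask needs a
single SHRN plus one transfer to a general-purpose register.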
Author: Danila Kutenin
Date:   2022-06-26 22:50:05 +00:00
parent bd9113463d
commit 49eb18ee4f
11 changed files with 264 additions and 150 deletions


@@ -33,13 +33,13 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("z %08x\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
u32 pos = ctz32(z & 0xffff);
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
DEBUG_PRINTF("buf + pos %p\n", buf + pos);
DEBUG_PRINTF("buf + pos %p\n", buf + (pos));
return buf + pos;
} else {
return NULL; // no match
@@ -52,13 +52,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
u32 pos = clz32(z & 0xffff);
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
return buf + (15 - pos);
} else {
return NULL; // no match
}
@@ -70,10 +69,10 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("z %08x\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
u32 pos = ctz32(z & 0xffff);
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
DEBUG_PRINTF("buf + pos %p\n", buf + pos);
@@ -89,13 +88,12 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
u32 pos = clz32(z & 0xffff);
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
return buf + (15 - pos);
} else {
return NULL; // no match
}
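
Because the mask now carries four bits per byte, the bit index from
ctz64/clz64 is divided by SuperVector<16>::mask_width() (4 on aarch64) to
recover a byte offset, which is exactly what the position arithmetic above
does. A worked check of that arithmetic, using a hypothetical mask with a
single match at byte 5 (__builtin_ctzll/__builtin_clzll stand in for the
project's ctz64/clz64):

```cpp
#include <assert.h>
#include <stdint.h>

int main(void) {
    uint64_t z = 0xfULL << (5 * 4);                // nibble 5 set: 0x0000000000f00000
    unsigned first = __builtin_ctzll(z) / 4;       // 20 / 4 == 5
    unsigned last  = 15 - __builtin_clzll(z) / 4;  // 15 - 40 / 4 == 5
    assert(first == 5 && last == 5);
    return 0;
}
```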


@@ -86,8 +86,9 @@ static really_inline m128 not128(m128 a) {
/** \brief Return 1 if a and b are different otherwise 0 */
static really_inline int diff128(m128 a, m128 b) {
int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b));
return (-16 != res);
uint64_t res = vget_lane_u64(
(uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0);
return (~0ull != res);
}
static really_inline int isnonzero128(m128 a) {
@@ -379,15 +380,19 @@ static really_inline m128 eq64_m128(m128 a, m128 b) {
}
static really_inline u32 movemask128(m128 a) {
uint8x16_t input = vreinterpretq_u8_s32(a);
uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
uint32x4_t paired16 =
vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
uint64x2_t paired32 =
vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
uint8x16_t paired64 =
vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128};
// Compute the mask from the input
uint8x16_t mask = (uint8x16_t)vpaddlq_u32(
vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7);
mask = vorrq_u8(mask, mask1);
// Get the resulting bytes
uint16_t output;
vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0);
return output;
}
static really_inline m128 set1_16x8(u8 c) {

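The rewritten diff128 above applies the same narrowing idea to an equality
test: equal lanes narrow to 0xf nibbles, so the two vectors differ exactly
when the narrowed 64-bit value is not all ones. A standalone sketch of that
check, assuming aarch64/NEON (the function name is illustrative):

```cpp
#include <arm_neon.h>
#include <stdint.h>

// Returns 1 if a and b differ in any byte, 0 otherwise.
static inline int vectors_differ(uint8x16_t a, uint8x16_t b) {
    uint16x8_t eq = vreinterpretq_u16_u8(vceqq_u8(a, b)); // 0xff per equal byte
    uint64_t narrowed =
        vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(eq, 4)), 0);
    return narrowed != ~0ULL; // all ones only when every byte compared equal
}
```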

@@ -30,7 +30,7 @@
template <>
really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
@@ -47,7 +47,7 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
@@ -63,7 +63,7 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z != 0xffff)) {
@@ -81,7 +81,7 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co
template <>
really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
SuperVector<16>::movemask_type z = v.movemask();
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z != 0xffff)) {


@@ -30,12 +30,13 @@
template <>
really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z)) {
u32 pos = ctz32(z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
@@ -47,8 +48,9 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const U
template <>
really_really_inline
const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
assert(SuperVector<32>::mask_width() == 1);
SuperVector<32>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%08llx\n", z);
if (unlikely(z)) {
u32 pos = ctz32(z);
assert(pos < 32);
@@ -61,7 +63,8 @@ const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const U
template <>
really_really_inline
const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
assert(SuperVector<64>::mask_width() == 1);
SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
@@ -80,9 +83,10 @@ const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const le
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z)) {
u32 pos = clz32(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
@@ -96,8 +100,9 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UN
template <>
really_really_inline
const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
assert(SuperVector<32>::mask_width() == 1);
SuperVector<32>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%08llx\n", z);
if (unlikely(z)) {
u32 pos = clz32(z);
assert(pos < 32);
@@ -110,7 +115,8 @@ const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UN
template <>
really_really_inline
const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
assert(SuperVector<64>::mask_width() == 1);
SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
@@ -129,12 +135,13 @@ const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z != 0xffff)) {
u32 pos = ctz32(~z & 0xffff);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
@@ -146,10 +153,11 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 co
template <>
really_really_inline
const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
assert(SuperVector<32>::mask_width() == 1);
SuperVector<32>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%08llx\n", z);
if (unlikely(z != 0xffffffff)) {
u32 pos = ctz32(~z);
u32 pos = ctz32(~z & 0xffffffffu);
assert(pos < 32);
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + pos;
@@ -160,7 +168,8 @@ const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 co
template <>
really_really_inline
const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
assert(SuperVector<64>::mask_width() == 1);
SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
@@ -179,12 +188,13 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con
template <>
really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z != 0xffff)) {
u32 pos = clz32(~z & 0xffff);
DEBUG_PRINTF("~z %08x\n", ~z);
u32 pos = clz32(~z & 0xffffu);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
@@ -196,9 +206,10 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_
template<>
really_really_inline
const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
if (unlikely(z != 0xffffffff)) {
u32 pos = clz32(~z & 0xffffffff);
assert(SuperVector<32>::mask_width() == 1);
SuperVector<32>::comparemask_type z = v.comparemask();
if (unlikely(static_cast<u32>(z) != 0xffffffff)) {
u32 pos = clz32(~z & 0xffffffffu);
DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
assert(pos < 32);
return buf + (31 - pos);
@@ -210,8 +221,9 @@ const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_
template <>
really_really_inline
const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
assert(SuperVector<64>::mask_width() == 1);
v.print8("v");
SuperVector<64>::movemask_type z = v.movemask();
SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);


@@ -249,25 +249,25 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
}
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const
{
SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL);
// Compute the mask from the input
uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0]))));
uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7);
mask = vorrq_u8(mask, (uint8x16_t) mask1);
// Get the resulting bytes
uint16_t output;
vst1q_lane_u16(&output, (uint16x8_t)mask, 0);
return static_cast<typename SuperVector<16>::movemask_type>(output);
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::comparemask(void) const {
return static_cast<typename SuperVector<16>::comparemask_type>(
vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0));
}
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
{
return eq(b).movemask();
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::eqmask(SuperVector<16> const b) const {
return eq(b).comparemask();
}
template <> really_inline u32 SuperVector<16>::mask_width() { return 4; }
template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::iteration_mask(
typename SuperVector<16>::comparemask_type mask) {
return mask & 0x1111111111111111ull;
}
template <>

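On aarch64, comparemask() reports each byte as four identical bits
(mask_width() == 4), so iteration_mask() keeps only the lowest bit of every
nibble; a ctz / clear-lowest-bit loop then visits each matching byte exactly
once. A small illustration with hypothetical values:

```cpp
#include <stdint.h>

static inline uint64_t iteration_mask64(uint64_t cmpmask) {
    // Keep one bit per 4-bit group, as in the aarch64 specialization above.
    return cmpmask & 0x1111111111111111ULL;
}

// Hypothetical: bytes 2 and 8 matched -> comparemask 0x0000000f00000f00;
// iteration_mask64() yields 0x0000000100000100, one bit per matching byte.
```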

@@ -206,8 +206,8 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
}
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
{
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::comparemask(void) const {
uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
@@ -230,11 +230,19 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(
}
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
{
return eq(b).movemask();
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::eqmask(SuperVector<16> const b) const {
return eq(b).comparemask();
}
template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::iteration_mask(
typename SuperVector<16>::comparemask_type mask) {
return mask;
}
template <>
template<uint8_t N>


@@ -203,15 +203,24 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
}
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
{
return _mm_movemask_epi8(u.v128[0]);
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::comparemask(void) const {
return (u32)_mm_movemask_epi8(u.v128[0]);
}
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
{
return eq(b).movemask();
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::eqmask(SuperVector<16> const b) const {
return eq(b).comparemask();
}
template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::iteration_mask(
typename SuperVector<16>::comparemask_type mask) {
return mask;
}
// template <>
@@ -754,17 +763,25 @@ really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) cons
}
template <>
really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const
{
return _mm256_movemask_epi8(u.v256[0]);
really_inline typename SuperVector<32>::comparemask_type
SuperVector<32>::comparemask(void) const {
return (u32)_mm256_movemask_epi8(u.v256[0]);
}
template <>
really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const
{
return eq(b).movemask();
really_inline typename SuperVector<32>::comparemask_type
SuperVector<32>::eqmask(SuperVector<32> const b) const {
return eq(b).comparemask();
}
template <> really_inline u32 SuperVector<32>::mask_width() { return 1; }
template <>
really_inline typename SuperVector<32>::comparemask_type
SuperVector<32>::iteration_mask(
typename SuperVector<32>::comparemask_type mask) {
return mask;
}
// template <>
// template<uint8_t N>
@@ -1347,42 +1364,48 @@ really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b
template <>
really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const
{
SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
SuperVector<64>::comparemask_type mask =
_mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
return {_mm512_movm_epi8(mask)};
}
template <>
really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const
{
SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
SuperVector<64>::comparemask_type mask =
_mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
return {_mm512_movm_epi8(mask)};
}
template <>
really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const
{
SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
SuperVector<64>::comparemask_type mask =
_mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
return {_mm512_movm_epi8(mask)};
}
template <>
really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const
{
SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
SuperVector<64>::comparemask_type mask =
_mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
return {_mm512_movm_epi8(mask)};
}
template <>
really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const
{
SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
SuperVector<64>::comparemask_type mask =
_mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
return {_mm512_movm_epi8(mask)};
}
template <>
really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const
{
SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
SuperVector<64>::comparemask_type mask =
_mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
return {_mm512_movm_epi8(mask)};
}
@@ -1393,19 +1416,28 @@ really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) cons
}
template <>
really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const
{
really_inline typename SuperVector<64>::comparemask_type
SuperVector<64>::comparemask(void) const {
__m512i msb = _mm512_set1_epi8(0xFF);
__m512i mask = _mm512_and_si512(msb, u.v512[0]);
return _mm512_cmpeq_epi8_mask(mask, msb);
}
template <>
really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const
{
really_inline typename SuperVector<64>::comparemask_type
SuperVector<64>::eqmask(SuperVector<64> const b) const {
return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
}
template <> really_inline u32 SuperVector<64>::mask_width() { return 1; }
template <>
really_inline typename SuperVector<64>::comparemask_type
SuperVector<64>::iteration_mask(
typename SuperVector<64>::comparemask_type mask) {
return mask;
}
// template <>
// template<uint8_t N>
// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const


@@ -46,19 +46,29 @@
using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_SHIFT 63
#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l)))
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#elif defined(HAVE_SIMD_256_BITS)
using Z_TYPE = u32;
#define Z_BITS 32
#define Z_SHIFT 31
#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#elif defined(HAVE_SIMD_128_BITS)
#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_POSSHIFT 2
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
#else
using Z_TYPE = u32;
#define Z_BITS 32
#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#endif
#define Z_SHIFT 15
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#endif
@@ -94,7 +104,8 @@ struct BaseVector
static constexpr bool is_valid = false;
static constexpr u16 size = 8;
using type = void;
using movemask_type = void;
using comparemask_type = void;
using cmpmask_type = void;
static constexpr bool has_previous = false;
using previous_type = void;
static constexpr u16 previous_size = 4;
@@ -106,7 +117,7 @@ struct BaseVector<128>
static constexpr bool is_valid = true;
static constexpr u16 size = 128;
using type = void;
using movemask_type = u64a;
using comparemask_type = u64a;
static constexpr bool has_previous = true;
using previous_type = m512;
static constexpr u16 previous_size = 64;
@@ -118,7 +129,7 @@ struct BaseVector<64>
static constexpr bool is_valid = true;
static constexpr u16 size = 64;
using type = m512;
using movemask_type = u64a;
using comparemask_type = u64a;
static constexpr bool has_previous = true;
using previous_type = m256;
static constexpr u16 previous_size = 32;
@@ -131,7 +142,7 @@ struct BaseVector<32>
static constexpr bool is_valid = true;
static constexpr u16 size = 32;
using type = m256;
using movemask_type = u32;
using comparemask_type = u64a;
static constexpr bool has_previous = true;
using previous_type = m128;
static constexpr u16 previous_size = 16;
@@ -144,7 +155,7 @@ struct BaseVector<16>
static constexpr bool is_valid = true;
static constexpr u16 size = 16;
using type = m128;
using movemask_type = u32;
using comparemask_type = u64a;
static constexpr bool has_previous = false;
using previous_type = u64a;
static constexpr u16 previous_size = 8;
@@ -231,8 +242,17 @@ public:
SuperVector eq(SuperVector const &b) const;
SuperVector operator<<(uint8_t const N) const;
SuperVector operator>>(uint8_t const N) const;
typename base_type::movemask_type movemask(void) const;
typename base_type::movemask_type eqmask(SuperVector const b) const;
// Returns mask_width groups of zeros or ones. To get the mask which can be
// iterated, use iteration_mask method, it ensures only one bit is set per
// mask_width group.
// Precondition: all bytes must be 0 or 0xff.
typename base_type::comparemask_type comparemask(void) const;
typename base_type::comparemask_type eqmask(SuperVector const b) const;
static u32 mask_width();
// Returns a mask with at most 1 bit set to 1. It can be used to iterate
// over bits through ctz/clz and lowest bit clear.
static typename base_type::comparemask_type
iteration_mask(typename base_type::comparemask_type mask);
static SuperVector loadu(void const *ptr);
static SuperVector load(void const *ptr);
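
The comments above describe how comparemask(), mask_width() and
iteration_mask() fit together; the match.hpp hunks show the real call sites.
As a summary, a hedged sketch of the intended iteration pattern (the function
and variable names are illustrative, and the vectorscan headers providing
SuperVector, ctz64 and DEBUG_PRINTF are assumed):

```cpp
// Sketch only, not part of the commit.
static void report_matches(const u8 *buf, SuperVector<16> data,
                           SuperVector<16> chars) {
    // eqmask() satisfies the precondition (all bytes 0 or 0xff) and returns
    // mask_width() identical bits per byte; iteration_mask() thins that to
    // one bit per matching byte.
    SuperVector<16>::comparemask_type z =
        SuperVector<16>::iteration_mask(data.eqmask(chars));
    while (z) {
        u32 pos = ctz64(z) / SuperVector<16>::mask_width();
        DEBUG_PRINTF("match at %p\n", buf + pos);
        z &= z - 1; // clear the lowest set bit, advance to the next match
    }
}
```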