Mirror of https://github.com/VectorCamp/vectorscan.git, synced 2025-11-19 02:30:35 +03:00
Optimize vectorscan for aarch64 by using the shrn instruction
This optimization is based on the thread https://twitter.com/Danlark1/status/1539344279268691970 and uses the shift-right-and-narrow-by-4 instruction, shrn (https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SHRN--SHRN2--Shift-Right-Narrow--immediate--). To achieve that, I needed to redesign movemask slightly into comparemask and add an extra step to mask iteration. Our benchmarks showed a 10-15% improvement on average for long matches.
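For context, here is a minimal standalone sketch of the trick (my own illustration with made-up helper names, not vectorscan's actual API): a 16-byte comparison result, in which every lane is 0x00 or 0xFF, is narrowed by a single shrn-by-4 into a 64-bit value that carries 4 bits per lane, and that value can then be scanned with ordinary ctz/clz.

    #include <arm_neon.h>
    #include <stdint.h>

    /* Collapse a 16-byte comparison result (each lane 0x00 or 0xFF) into a
     * 64-bit mask carrying one nibble per input byte, via shrn by 4. */
    static inline uint64_t nibble_mask(uint8x16_t eq) {
        uint8x8_t nibbles = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);
        return vget_lane_u64(vreinterpret_u64_u8(nibbles), 0);
    }

    /* Each byte lane occupies 4 bits of the mask, so a bit index becomes a
     * byte offset by dividing by 4. Caller must ensure z != 0. */
    static inline uint32_t first_set_lane(uint64_t z) {
        return (uint32_t)(__builtin_ctzll(z) / 4);
    }

Compared with the previous movemask(), which needed a chain of shifts and accumulates to pack one bit per lane, the narrowed mask costs one shrn plus a lane extract; the price is that bit positions must be divided by the per-lane width, which is what SuperVector<16>::mask_width() (presumably 4 on NEON) divides out in the hunks below.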
@@ -33,13 +33,13 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("z %08x\n", z);
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        u32 pos = ctz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("z %08llx\n", z);
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        u32 pos = ctz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
-        DEBUG_PRINTF("buf + pos %p\n", buf + pos);
+        DEBUG_PRINTF("buf + pos %p\n", buf + (pos));
         return buf + pos;
     } else {
         return NULL; // no match
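The "extra step to mask iteration" mentioned in the commit message follows from a match now occupying 4 bits instead of 1. A loop over all matches could look like the sketch below (a hypothetical helper of mine, not code from this commit); each visited lane clears its whole nibble before continuing.

    #include <stdint.h>

    /* Sketch: visit every matching byte in a 4-bit-per-lane comparemask. */
    static inline void for_each_match(uint64_t z, const uint8_t *buf,
                                      void (*cb)(const uint8_t *)) {
        while (z) {
            uint32_t pos = (uint32_t)(__builtin_ctzll(z) / 4);
            cb(buf + pos);
            z &= ~(0xfull << (pos * 4)); /* clear the matched lane's nibble */
        }
    }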
@@ -52,13 +52,12 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        DEBUG_PRINTF("z %08x\n", z);
-        u32 pos = clz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        DEBUG_PRINTF("z %08llx\n", z);
+        u32 pos = clz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
-        assert(pos >= 16 && pos < 32);
-        return buf + (31 - pos);
+        return buf + (15 - pos);
     } else {
         return NULL; // no match
     }
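Why return buf + (15 - pos) in last_non_zero_match above? With 4 bits per lane, the highest matching byte k sets bits [4k, 4k+3], so clz64(z) == 60 - 4k and pos == 15 - k; the old 16-bit movemask sat in the low half of a 32-bit word, which is where the previous pos >= 16 assertion and 31 - pos came from. A small self-check of the new arithmetic (my own illustration, not repository code):

    #include <assert.h>
    #include <stdint.h>

    /* If byte k is the highest match, its nibble is bits [4k, 4k+3], so
     * clz64(z) == 60 - 4k and pos == clz64(z) / 4 == 15 - k. */
    static void check_last_match_pos(void) {
        for (unsigned k = 0; k < 16; k++) {
            uint64_t z = 0xfull << (4 * k); /* only byte k matched */
            unsigned pos = (unsigned)(__builtin_clzll(z) / 4);
            assert(15 - pos == k); /* buf + (15 - pos) == buf + k */
        }
    }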
@@ -70,10 +69,10 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("z %08x\n", z);
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        u32 pos = ctz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("z %08llx\n", z);
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        u32 pos = ctz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         DEBUG_PRINTF("buf + pos %p\n", buf + pos);
@@ -89,13 +88,12 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
     uint32x4_t m = mask.u.u32x4[0];
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
    if (vmax != 0) {
-        typename SuperVector<16>::movemask_type z = mask.movemask();
-        DEBUG_PRINTF("buf %p z %08x \n", buf, z);
-        DEBUG_PRINTF("z %08x\n", z);
-        u32 pos = clz32(z & 0xffff);
+        typename SuperVector<16>::comparemask_type z = mask.comparemask();
+        DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+        DEBUG_PRINTF("z %08llx\n", z);
+        u32 pos = clz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
-        assert(pos >= 16 && pos < 32);
-        return buf + (31 - pos);
+        return buf + (15 - pos);
     } else {
         return NULL; // no match
     }
@@ -86,8 +86,9 @@ static really_inline m128 not128(m128 a) {
 
 /** \brief Return 1 if a and b are different otherwise 0 */
 static really_inline int diff128(m128 a, m128 b) {
-    int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b));
-    return (-16 != res);
+    uint64_t res = vget_lane_u64(
+        (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0);
+    return (~0ull != res);
 }
 
 static really_inline int isnonzero128(m128 a) {
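The new diff128() applies the same narrowing trick to equality testing: the old version summed the sixteen signed bytes of the compare result (all-equal sums to 16 * -1 = -16), while the new one narrows the compare result into a 64-bit value and checks it against ~0ull. A restatement of that equivalence (my own sketch, not repository code):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Equal inputs give an all-ones compare result, which narrows to ~0ull;
     * any differing 32-bit lane leaves zero nibbles in the narrowed mask. */
    static inline int vectors_differ(int32x4_t a, int32x4_t b) {
        uint16x8_t eq = vreinterpretq_u16_u32(vceqq_s32(a, b));
        uint64_t m = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(eq, 4)), 0);
        return m != ~0ull;
    }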
@@ -379,15 +380,19 @@ static really_inline m128 eq64_m128(m128 a, m128 b) {
 }
 
 static really_inline u32 movemask128(m128 a) {
-    uint8x16_t input = vreinterpretq_u8_s32(a);
-    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-    uint32x4_t paired16 =
-        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-    uint64x2_t paired32 =
-        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-    uint8x16_t paired64 =
-        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+    static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128,
+                                      1, 2, 4, 8, 16, 32, 64, 128};
+
+    // Compute the mask from the input
+    uint8x16_t mask = (uint8x16_t)vpaddlq_u32(
+        vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
+    uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7);
+    mask = vorrq_u8(mask, mask1);
+
+    // Get the resulting bytes
+    uint16_t output;
+    vst1q_lane_u16((uint16_t *)&output, (uint16x8_t)mask, 0);
+    return output;
 }
 
 static really_inline m128 set1_16x8(u8 c) {
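The new movemask128() is the classic NEON bit-gather: AND each byte of a compare result with a per-lane power of two, fold each 8-byte half with three widening pairwise adds, then pack the two resulting bytes into a 16-bit mask. The gather step can be isolated as follows (my own sketch; it swaps the vextq/vorrq/vst1q_lane packing for plain lane extracts):

    #include <arm_neon.h>
    #include <stdint.h>

    /* For compare-result bytes (0x00 or 0xFF), AND-ing with powers of two and
     * folding with vpaddl leaves one movemask byte per 8-byte half. */
    static inline uint16_t movemask_sketch(uint8x16_t eq) {
        static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128,
                                          1, 2, 4, 8, 16, 32, 64, 128};
        uint64x2_t folded =
            vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(eq, powers))));
        uint8_t lo = (uint8_t)vgetq_lane_u64(folded, 0); /* bytes 0..7  */
        uint8_t hi = (uint8_t)vgetq_lane_u64(folded, 1); /* bytes 8..15 */
        return (uint16_t)(lo | (hi << 8));
    }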