Add a len parameter to the vermicelli block and match helpers, and mask the 64-byte movemask to the valid length; fixes corner cases on AVX512

This commit is contained in:
Konstantinos Margaritis 2021-11-05 14:30:22 +02:00
parent 210295a702
commit 24fa54081b
3 changed files with 99 additions and 70 deletions

View File

@ -41,42 +41,46 @@
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const u8 *vermicelliBlock(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) { const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data); SuperVector<S> mask = chars.eq(casemask & data);
return first_non_zero_match<S>(buf, mask); return first_non_zero_match<S>(buf, mask, len);
} }
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const u8 *vermicelliBlockNeg(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) { const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data); SuperVector<S> mask = chars.eq(casemask & data);
return first_zero_match_inverted<S>(buf, mask); return first_zero_match_inverted<S>(buf, mask, len);
} }
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const u8 *rvermicelliBlock(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) { const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data); SuperVector<S> mask = chars.eq(casemask & data);
return last_non_zero_match<S>(buf, mask); return last_non_zero_match<S>(buf, mask, len);
} }
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const u8 *rvermicelliBlockNeg(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) { const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, u16 const len) {
data.print8("data");
chars.print8("chars");
casemask.print8("casemask");
SuperVector<S> mask = chars.eq(casemask & data); SuperVector<S> mask = chars.eq(casemask & data);
return last_zero_match_inverted<S>(buf, mask); mask.print8("mask");
return last_zero_match_inverted<S>(buf, mask, len);
} }
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const u8 *vermicelliDoubleBlock(SuperVector<S> data, SuperVector<S> chars1, SuperVector<S> chars2, SuperVector<S> casemask, const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, const u8 *buf) { u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data; SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v); SuperVector<S> mask1 = chars1.eq(v);
@ -88,13 +92,13 @@ const u8 *vermicelliDoubleBlock(SuperVector<S> data, SuperVector<S> chars1, Supe
DEBUG_PRINTF("partial = %d\n", partial_match); DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1; if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask); return first_non_zero_match<S>(buf, mask, len);
} }
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const u8 *rvermicelliDoubleBlock(SuperVector<S> data, SuperVector<S> chars1, SuperVector<S> chars2, SuperVector<S> casemask, const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, const u8 *buf) { u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data; SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v); SuperVector<S> mask1 = chars1.eq(v);
@ -108,14 +112,14 @@ const u8 *rvermicelliDoubleBlock(SuperVector<S> data, SuperVector<S> chars1, Sup
mask = mask | (SuperVector<S>::Ones() >> (S-1)); mask = mask | (SuperVector<S>::Ones() >> (S-1));
} }
return last_non_zero_match<S>(buf, mask); return last_non_zero_match<S>(buf, mask, len);
} }
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> data, SuperVector<S> chars1, SuperVector<S> chars2, const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
SuperVector<S> mask1, SuperVector<S> mask2, SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, const u8 *buf) { u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) {
SuperVector<S> v1 = chars1.eq(data & mask1); SuperVector<S> v1 = chars1.eq(data & mask1);
SuperVector<S> v2 = chars2.eq(data & mask2); SuperVector<S> v2 = chars2.eq(data & mask2);
@ -126,11 +130,11 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> data, SuperVector<S> chars1
DEBUG_PRINTF("partial = %d\n", partial_match); DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1; if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask); return first_non_zero_match<S>(buf, mask, len);
} }
template <uint16_t S> template <uint16_t S>
static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, const u8 *buf_end) { static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u8 const *buf_end) {
assert(buf && buf_end); assert(buf && buf_end);
assert(buf < buf_end); assert(buf < buf_end);
DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf); DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf);
@ -149,17 +153,18 @@ static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> c
// Reach vector aligned boundaries // Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) { if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d); SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliBlock(data, chars, casemask, d); rv = vermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv; if (rv) return rv;
d = ROUNDUP_PTR(d, S); d = d1;
} }
while(d + S <= buf_end) { while(d + S <= buf_end) {
__builtin_prefetch(d + 64); __builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d); DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d); SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliBlock(data, chars, casemask, d); rv = vermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv; if (rv) return rv;
d += S; d += S;
} }
@ -170,7 +175,7 @@ static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> c
if (d != buf_end) { if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d); SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliBlock(data, chars, casemask, d); rv = vermicelliBlock(data, chars, casemask, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
} }
@ -198,17 +203,18 @@ static const u8 *nvermicelliExecReal(SuperVector<S> const chars, SuperVector<S>
// Reach vector aligned boundaries // Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) { if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d); SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliBlockNeg(data, chars, casemask, d); rv = vermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv; if (rv) return rv;
d = ROUNDUP_PTR(d, S); d = d1;
} }
while(d + S <= buf_end) { while(d + S <= buf_end) {
__builtin_prefetch(d + 64); __builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d); DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d); SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliBlockNeg(data, chars, casemask, d); rv = vermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv; if (rv) return rv;
d += S; d += S;
} }
@ -219,7 +225,7 @@ static const u8 *nvermicelliExecReal(SuperVector<S> const chars, SuperVector<S>
if (d != buf_end) { if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d); SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliBlockNeg(data, chars, casemask, d); rv = vermicelliBlockNeg(data, chars, casemask, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
} }
@ -249,11 +255,12 @@ const u8 *rvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const c
// Reach vector aligned boundaries // Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) { if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S); SuperVector<S> data = SuperVector<S>::loadu(d - S);
rv = rvermicelliBlock(data, chars, casemask, d - S); rv = rvermicelliBlock(data, chars, casemask, d - S, S);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv) return rv; if (rv) return rv;
d = ROUNDDOWN_PTR(d, S); d = d1;
} }
while (d - S >= buf) { while (d - S >= buf) {
@ -263,7 +270,7 @@ const u8 *rvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const c
d -= S; d -= S;
SuperVector<S> data = SuperVector<S>::load(d); SuperVector<S> data = SuperVector<S>::load(d);
rv = rvermicelliBlock(data, chars, casemask, d); rv = rvermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv; if (rv) return rv;
} }
} }
@ -273,7 +280,7 @@ const u8 *rvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const c
if (d != buf) { if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf); SuperVector<S> data = SuperVector<S>::loadu(buf);
rv = rvermicelliBlock(data, chars, casemask, buf); rv = rvermicelliBlock(data, chars, casemask, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
} }
@ -303,11 +310,12 @@ const u8 *rnvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const
// Reach vector aligned boundaries // Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) { if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S); SuperVector<S> data = SuperVector<S>::loadu(d - S);
rv = rvermicelliBlockNeg(data, chars, casemask, d - S); rv = rvermicelliBlockNeg(data, chars, casemask, d - S, S);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv) return rv; if (rv) return rv;
d = ROUNDDOWN_PTR(d, S); d = d1;
} }
while (d - S >= buf) { while (d - S >= buf) {
@ -317,7 +325,7 @@ const u8 *rnvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const
d -= S; d -= S;
SuperVector<S> data = SuperVector<S>::load(d); SuperVector<S> data = SuperVector<S>::load(d);
rv = rvermicelliBlockNeg(data, chars, casemask, d); rv = rvermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv; if (rv) return rv;
} }
} }
@ -327,7 +335,7 @@ const u8 *rnvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const
if (d != buf) { if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf); SuperVector<S> data = SuperVector<S>::loadu(buf);
rv = rvermicelliBlockNeg(data, chars, casemask, buf); rv = rvermicelliBlockNeg(data, chars, casemask, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
} }
@ -360,17 +368,18 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector<
// Reach vector aligned boundaries // Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) { if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d); SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d); rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv; if (rv) return rv;
d = ROUNDUP_PTR(d, S); d = d1;
} }
while(d + S <= buf_end) { while(d + S <= buf_end) {
__builtin_prefetch(d + 64); __builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d); DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d); SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d); rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv; if (rv) return rv;
d += S; d += S;
} }
@ -381,7 +390,7 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector<
if (d != buf_end) { if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d); SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d); rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
} }
@ -424,11 +433,12 @@ const u8 *rvermicelliDoubleExecReal(char c1, char c2, SuperVector<S> const casem
// Reach vector aligned boundaries // Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) { if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S); SuperVector<S> data = SuperVector<S>::loadu(d - S);
rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d - S); rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d - S, S);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
d = ROUNDDOWN_PTR(d, S); d = d1;
} }
while (d - S >= buf) { while (d - S >= buf) {
@ -438,7 +448,7 @@ const u8 *rvermicelliDoubleExecReal(char c1, char c2, SuperVector<S> const casem
d -= S; d -= S;
SuperVector<S> data = SuperVector<S>::load(d); SuperVector<S> data = SuperVector<S>::load(d);
rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d); rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv; if (rv) return rv;
} }
} }
@ -448,7 +458,7 @@ const u8 *rvermicelliDoubleExecReal(char c1, char c2, SuperVector<S> const casem
if (d != buf) { if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf); SuperVector<S> data = SuperVector<S>::loadu(buf);
rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf); rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
} }
@ -482,17 +492,18 @@ static const u8 *vermicelliDoubleMaskedExecReal(u8 const c1, u8 const c2, u8 con
// Reach vector aligned boundaries // Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) { if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d); SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d); rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S);
if (rv) return rv; if (rv) return rv;
d = ROUNDUP_PTR(d, S); d = d1;
} }
while(d + S <= buf_end) { while(d + S <= buf_end) {
__builtin_prefetch(d + 64); __builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d); DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d); SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d); rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S);
if (rv) return rv; if (rv) return rv;
d += S; d += S;
} }
@ -503,7 +514,7 @@ static const u8 *vermicelliDoubleMaskedExecReal(u8 const c1, u8 const c2, u8 con
if (d != buf_end) { if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d); SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d); rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv); DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv; if (rv && rv < buf_end) return rv;
} }
@ -591,4 +602,4 @@ extern "C" const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char
assert(buf < buf_end); assert(buf < buf_end);
return vermicelliDoubleMaskedExecReal<VECTORSIZE>(c1, c2, m1, m2, buf, buf_end); return vermicelliDoubleMaskedExecReal<VECTORSIZE>(c1, c2, m1, m2, buf, buf_end);
} }

View File

@ -29,7 +29,7 @@
template <> template <>
really_really_inline really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v) { const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask(); SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z); DEBUG_PRINTF("z %08x\n", z);
@ -46,7 +46,7 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v) {
template <> template <>
really_really_inline really_really_inline
const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v) { const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask(); SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z); DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z)) { if (unlikely(z)) {
@ -60,9 +60,13 @@ const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v) {
} }
template <> template <>
really_really_inline really_really_inline
const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v) { const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask(); SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z); DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
z &= mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) { if (unlikely(z)) {
u32 pos = ctz64(z); u32 pos = ctz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos); DEBUG_PRINTF("match @ pos %u\n", pos);
@ -75,7 +79,7 @@ const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v) {
template <> template <>
really_really_inline really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v) { const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask(); SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z); DEBUG_PRINTF("z %08x\n", z);
@ -91,7 +95,7 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v) {
template <> template <>
really_really_inline really_really_inline
const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v) { const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask(); SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z); DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z)) { if (unlikely(z)) {
@ -105,14 +109,18 @@ const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v) {
} }
template <> template <>
really_really_inline really_really_inline
const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v) { const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask(); SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z); DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
z &= mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) { if (unlikely(z)) {
u32 pos = clz64(z); u32 pos = clz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos); DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64); assert(pos < 64);
return buf + (31 - pos); return buf + (63 - pos);
} else { } else {
return NULL; // no match return NULL; // no match
} }
@ -120,7 +128,7 @@ const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v) {
template <> template <>
really_really_inline really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v) { const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask(); SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z); DEBUG_PRINTF("z %08x\n", z);
@ -137,7 +145,7 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v) {
template <> template <>
really_really_inline really_really_inline
const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v) { const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask(); SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z); DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z != 0xffffffff)) { if (unlikely(z != 0xffffffff)) {
@ -151,11 +159,15 @@ const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v) {
} }
template <> template <>
really_really_inline really_really_inline
const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v) { const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask(); SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z); DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z != ~0ULL)) { u64a mask = (~0ULL) >> (64 - len);
u32 pos = ctz64(~z); DEBUG_PRINTF("mask %016llx\n", mask);
z = ~z & mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = ctz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos); DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64); assert(pos < 64);
return buf + pos; return buf + pos;
@ -166,7 +178,7 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v) {
template <> template <>
really_really_inline really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v) { const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
SuperVector<16>::movemask_type z = v.movemask(); SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z); DEBUG_PRINTF("z %08x\n", z);
@ -183,10 +195,10 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v) {
template<> template<>
really_really_inline really_really_inline
const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v) { const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask(); SuperVector<32>::movemask_type z = v.movemask();
if (unlikely(z != 0xffffffff)) { if (unlikely(z != 0xffffffff)) {
u32 pos = clz32(~z); u32 pos = clz32(~z & 0xffffffff);
DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
assert(pos < 32); assert(pos < 32);
return buf + (31 - pos); return buf + (31 - pos);
@ -197,11 +209,17 @@ const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v) {
template <> template <>
really_really_inline really_really_inline
const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v) { const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
v.print8("v");
SuperVector<64>::movemask_type z = v.movemask(); SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z); DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z != ~0ULL)) { u64a mask = (~0ULL) >> (64 - len);
u32 pos = clz64(~z); DEBUG_PRINTF("mask %016llx\n", mask);
z = ~z & mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = clz64(z);
DEBUG_PRINTF("~z 0x%016llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos); DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64); assert(pos < 64);
return buf + (63 - pos); return buf + (63 - pos);

View File

@ -38,16 +38,16 @@
#include "util/supervector/supervector.hpp" #include "util/supervector/supervector.hpp"
template <u16 S> template <u16 S>
const u8 *first_non_zero_match(const u8 *buf, SuperVector<S> v); const u8 *first_non_zero_match(const u8 *buf, SuperVector<S> v, u16 const len = S);
template <u16 S> template <u16 S>
const u8 *last_non_zero_match(const u8 *buf, SuperVector<S> v); const u8 *last_non_zero_match(const u8 *buf, SuperVector<S> v, u16 const len = S);
template <u16 S> template <u16 S>
const u8 *first_zero_match_inverted(const u8 *buf, SuperVector<S> v); const u8 *first_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 const len = S);
template <u16 S> template <u16 S>
const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v); const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S);
#if defined(ARCH_IA32) || defined(ARCH_X86_64) #if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/match.hpp" #include "util/arch/x86/match.hpp"