diff --git a/src/hwlm/noodle_engine_sve.hpp b/src/hwlm/noodle_engine_sve.hpp index 193b30ab..aece9c82 100644 --- a/src/hwlm/noodle_engine_sve.hpp +++ b/src/hwlm/noodle_engine_sve.hpp @@ -126,24 +126,6 @@ hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, return scanSingleLoop(n, buf, len, cbi, chars, d1, e); } -static really_inline -svuint16_t getCharMaskDouble(const struct noodTable *n, bool noCase) { - if (noCase) { - const uint64_t lowerFirst = n->key0 & 0xdf; - const uint64_t upperFirst = n->key0 | 0x20; - const uint64_t lowerSecond = n->key1 & 0xdf; - const uint64_t upperSecond = n->key1 | 0x20; - const uint64_t chars = lowerFirst | (lowerSecond << 8) - | (lowerFirst << 16) | (upperSecond) << 24 - | (upperFirst << 32) | (lowerSecond) << 40 - | (upperFirst << 48) | (upperSecond) << 56; - return svreinterpret_u16(svdup_u64(chars)); - } else { - uint16_t chars_u16 = n->key0 | (n->key1 << 8); - return svdup_u16(chars_u16); - } -} - static really_inline hwlm_error_t doubleCheckMatched(const struct noodTable *n, const u8 *buf, size_t len, const struct cb_info *cbi, @@ -238,7 +220,7 @@ hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, } ++d; - svuint16_t chars = getCharMaskDouble(n, noCase); + svuint16_t chars = getCharMaskDouble(n->key0, n->key1, noCase); if (scan_len <= svcntb()) { return scanDoubleOnce(n, buf, len, cbi, chars, d, e); diff --git a/src/nfa/vermicelli.h b/src/nfa/vermicelli.h index b2ec0725..9defd899 100644 --- a/src/nfa/vermicelli.h +++ b/src/nfa/vermicelli.h @@ -48,82 +48,6 @@ #include "vermicelli_sse.h" #endif -static really_inline -const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ - VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ - -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? dvermMiniNocase(chars1, chars2, buf, buf_end) - : dvermMini(chars1, chars2, buf, buf_end); - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - u8 mask = nocase ? CASE_CLEAR : 0xff; - if ((buf_end[-1] & mask) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; - } -#endif - - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase - ? dvermPreconditionNocase(chars1, chars2, buf) - : dvermPrecondition(chars1, chars2, buf); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2, - buf, buf_end) - : dvermSearchAligned(chars1, chars2, c1, c2, buf, - buf_end); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? dvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY) - : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); - - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - u8 mask = nocase ? 
CASE_CLEAR : 0xff; - if ((buf_end[-1] & mask) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; -} - static really_inline const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, const u8 *buf, const u8 *buf_end) { @@ -194,60 +118,4 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, return buf_end; } -/* returns highest offset of c2 (NOTE: not c1) */ -static really_inline -const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ - VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ - -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rdvermMiniNocase(chars1, chars2, buf, buf_end) - : rdvermMini(chars1, chars2, buf, buf_end); - - if (ptr) { - return ptr; - } - - // check for partial match at end ??? - return buf - 1; - } -#endif - - assert((buf_end - buf) >= VERM_BOUNDARY); - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // input not aligned, so we need to run one iteration with an unaligned - // load, then skip buf forward to the next aligned address. There's - // some small overlap here, but we don't mind scanning it twice if we - // can do it quickly, do we? - const u8 *ptr = nocase ? rdvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY) - : rdvermPrecondition(chars1, chars2, - buf_end - VERM_BOUNDARY); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in - if (nocase) { - return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); - } else { - return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); - } -} - #endif /* VERMICELLI_H */ diff --git a/src/nfa/vermicelli_common.h b/src/nfa/vermicelli_common.h index 39109fe1..aca58dcb 100644 --- a/src/nfa/vermicelli_common.h +++ b/src/nfa/vermicelli_common.h @@ -37,51 +37,20 @@ #define VERM_TYPE m128 #define VERM_SET_FN set1_16x8 +// returns NULL if not found static really_inline -const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { - assert(z); - return buf_end - 16 + 31 - clz32(z); -} +const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); -static really_inline -const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - if (buf[15] == c1 && buf[16] == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf += 16) { 
- m128 data = load128(buf); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - return NULL; } @@ -106,128 +75,5 @@ const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, } } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, - m128 mask1, m128 mask2, const u8 *buf) { - m128 data = loadu128(buf); // unaligned - m128 v1 = eq128(chars1, and128(data, mask1)); - m128 v2 = eq128(chars2, and128(data, mask2)); - u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - if (buf_end[-17] == c1 && buf_end[-16] == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -static really_inline -const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - if ((buf_end[-17] & CASE_CLEAR) == c1 - && (buf_end[-16] & CASE_CLEAR) == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 
*rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - return NULL; } \ No newline at end of file diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index a754224b..268e9e08 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -143,6 +143,12 @@ const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { return NULL; } +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { + assert(z); + return buf_end - 16 + 31 - clz32(z); +} + static really_inline const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, char negate) { @@ -216,6 +222,167 @@ const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { return NULL; } +static really_inline +const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + if (buf[15] == c1 && buf[16] == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + m128 casemask = set1_16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set1_16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + + +static really_inline +const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 
*buf_end) { + assert((size_t)buf_end % 16 == 0); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + if (buf_end[-17] == c1 && buf_end[-16] == c2) { + z |= 1; + } + if (unlikely(z)) { + const u8 *matchPos = lastMatchOffset(buf_end, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 16 == 0); + m128 casemask = set1_16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + if ((buf_end[-17] & CASE_CLEAR) == c1 + && (buf_end[-16] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + const u8 *matchPos = lastMatchOffset(buf_end, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + const u8 *matchPos = lastMatchOffset(buf + 16, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set1_16x8(CASE_CLEAR); + m128 data = loadu128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + const u8 *matchPos = lastMatchOffset(buf + 16, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + + return NULL; +} + #else // HAVE_AVX512 #define VERM_BOUNDARY 64 @@ -982,4 +1149,136 @@ const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, ptr = nocase ? rvermUnalignNocase(chars, buf, 1) : rvermUnalign(chars, buf, 1); return ptr ? ptr : buf - 1; +} + +static really_inline +const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ + VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? dvermMiniNocase(chars1, chars2, buf, buf_end) + : dvermMini(chars1, chars2, buf, buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? 
CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase + ? dvermPreconditionNocase(chars1, chars2, buf) + : dvermPrecondition(chars1, chars2, buf); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2, + buf, buf_end) + : dvermSearchAligned(chars1, chars2, c1, c2, buf, + buf_end); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? dvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +/* returns highest offset of c2 (NOTE: not c1) */ +static really_inline +const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ + VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rdvermMiniNocase(chars1, chars2, buf, buf_end) + : rdvermMini(chars1, chars2, buf, buf_end); + + if (ptr) { + return ptr; + } + + // check for partial match at end ??? + return buf - 1; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // input not aligned, so we need to run one iteration with an unaligned + // load, then skip buf forward to the next aligned address. There's + // some small overlap here, but we don't mind scanning it twice if we + // can do it quickly, do we? + const u8 *ptr = nocase ? 
rdvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : rdvermPrecondition(chars1, chars2, + buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in + if (nocase) { + return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); + } else { + return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); + } } \ No newline at end of file diff --git a/src/nfa/vermicelli_sve.h b/src/nfa/vermicelli_sve.h index 21c47592..6a76f671 100644 --- a/src/nfa/vermicelli_sve.h +++ b/src/nfa/vermicelli_sve.h @@ -33,10 +33,29 @@ */ static really_inline -int vermSearchGetOffset(svbool_t matched) { +uint64_t vermSearchGetOffset(svbool_t matched) { return svcntp_b8(svptrue_b8(), svbrkb_z(svptrue_b8(), matched)); } +static really_inline +int dvermSearchGetOffset(svbool_t matched, svbool_t matched_rot) { + int offset = vermSearchGetOffset(matched); + int offset_rot = vermSearchGetOffset(matched_rot) - 1; + return (offset_rot < offset) ? offset_rot : offset; +} + +static really_inline +uint64_t rdvermSearchGetSingleOffset(svbool_t matched) { + return svcntp_b8(svptrue_b8(), svbrkb_z(svptrue_b8(), svrev_b8(matched))); +} + +static really_inline +uint64_t rdvermSearchGetOffset(svbool_t matched, svbool_t matched_rot) { + uint64_t offset = rdvermSearchGetSingleOffset(matched); + uint64_t offset_rot = rdvermSearchGetSingleOffset(matched_rot) - 1; + return (offset_rot < offset) ? offset_rot : offset; +} + static really_inline const u8 *vermSearchCheckMatched(const u8 *buf, svbool_t matched) { if (unlikely(svptest_any(svptrue_b8(), matched))) { @@ -58,6 +77,29 @@ const u8 *rvermSearchCheckMatched(const u8 *buf, svbool_t matched) { return NULL; } +static really_inline +const u8 *dvermSearchCheckMatched(const u8 *buf, svbool_t matched, + svbool_t matched_rot, svbool_t any) { + if (unlikely(svptest_any(svptrue_b8(), any))) { + const u8 *matchPos = buf + dvermSearchGetOffset(matched, matched_rot); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchCheckMatched(const u8 *buf, svbool_t matched, + svbool_t matched_rot, svbool_t any) { + if (unlikely(svptest_any(svptrue_b8(), any))) { + const u8 *matchPos = buf + (svcntb() - + rdvermSearchGetOffset(matched, matched_rot)); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + static really_inline svbool_t singleMatched(svuint8_t chars, const u8 *buf, svbool_t pg, bool negate, const int64_t vnum) { @@ -69,6 +111,17 @@ svbool_t singleMatched(svuint8_t chars, const u8 *buf, svbool_t pg, } } +static really_inline +svbool_t doubleMatched(svuint16_t chars, const u8 *buf, const u8 *buf_rot, + svbool_t pg, svbool_t pg_rot, svbool_t * const matched, + svbool_t * const matched_rot) { + svuint16_t vec = svreinterpret_u16(svld1_u8(pg, buf)); + svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, buf_rot)); + *matched = svmatch(pg, vec, chars); + *matched_rot = svmatch(pg_rot, vec_rot, chars); + return svorr_z(svptrue_b8(), *matched, *matched_rot); +} + static really_inline const u8 *vermSearchOnce(svuint8_t chars, const u8 *buf, const u8 *buf_end, bool negate) { @@ -122,6 +175,62 @@ const u8 *rvermSearchLoopBody(svuint8_t chars, const u8 *buf, bool negate) { return rvermSearchCheckMatched(buf, matched); } +static really_inline +const u8 *dvermSearchOnce(svuint16_t chars, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("start %p end 
%p\n", buf, buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t pg_rot = svwhilele_b8_s64(0, buf_end - buf); + svbool_t matched, matched_rot; + // buf - 1 won't underflow as the first position in the buffer has been + // dealt with meaning that buf - 1 is within the buffer. + svbool_t any = doubleMatched(chars, buf, buf - 1, pg, pg_rot, + &matched, &matched_rot); + return dvermSearchCheckMatched(buf, matched, matched_rot, any); +} + +static really_inline +const u8 *dvermSearchLoopBody(svuint16_t chars, const u8 *buf) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched, matched_rot; + // buf - 1 won't underflow as the first position in the buffer has been + // dealt with meaning that buf - 1 is within the buffer. + svbool_t any = doubleMatched(chars, buf, buf - 1, svptrue_b8(), + svptrue_b8(), &matched, &matched_rot); + return dvermSearchCheckMatched(buf, matched, matched_rot, any); +} + +static really_inline +const u8 *rdvermSearchOnce(svuint16_t chars, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf < buf_end); + + DEBUG_PRINTF("l = %td\n", buf_end - buf); + // buf_end can be read as the last position in the buffer has been + // dealt with meaning that buf_end is within the buffer. + // buf_end needs to be read by both the buf load and the buf + 1 load, + // this is because buf_end must be the upper 8 bits of the 16 bit element + // to be matched. + svbool_t pg = svwhilele_b8_s64(0, buf_end - buf); + svbool_t pg_rot = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched, matched_rot; + svbool_t any = doubleMatched(chars, buf, buf + 1, pg, pg_rot, + &matched, &matched_rot); + return rdvermSearchCheckMatched(buf, matched, matched_rot, any); +} + +static really_inline +const u8 *rdvermSearchLoopBody(svuint16_t chars, const u8 *buf) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched, matched_rot; + // buf + svcntb() can be read as the last position in the buffer has + // been dealt with meaning that buf + svcntb() is within the buffer. + svbool_t any = doubleMatched(chars, buf, buf + 1, svptrue_b8(), + svptrue_b8(), &matched, &matched_rot); + return rdvermSearchCheckMatched(buf, matched, matched_rot, any); +} + static really_inline const u8 *vermSearch(char c, bool nocase, const u8 *buf, const u8 *buf_end, bool negate) { @@ -185,6 +294,60 @@ const u8 *rvermSearch(char c, bool nocase, const u8 *buf, const u8 *buf_end, return buf == buf_end ? NULL : rvermSearchLoopBody(chars, buf, negate); } +static really_inline +const u8 *dvermSearch(char c1, char c2, bool nocase, const u8 *buf, + const u8 *buf_end) { + svuint16_t chars = getCharMaskDouble(c1, c2, nocase); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return dvermSearchOnce(chars, buf, buf_end); + } + // peel off first part to align to the vector size + const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2)); + assert(aligned_buf < buf_end); + if (buf != aligned_buf) { + const u8 *ptr = dvermSearchLoopBody(chars, buf); + if (ptr) return ptr; + } + buf = aligned_buf; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++, buf += svcntb()) { + const u8 *ptr = dvermSearchLoopBody(chars, buf); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? 
NULL : dvermSearchLoopBody(chars,
+                                                       buf_end - svcntb());
+}
+
+static really_inline
+const u8 *rdvermSearch(char c1, char c2, bool nocase, const u8 *buf,
+                       const u8 *buf_end) {
+    svuint16_t chars = getCharMaskDouble(c1, c2, nocase);
+    size_t len = buf_end - buf;
+    if (len <= svcntb()) {
+        return rdvermSearchOnce(chars, buf, buf_end);
+    }
+    // peel off first part to align to the vector size
+    const u8 *aligned_buf_end = ROUNDDOWN_PTR(buf_end, svcntb_pat(SV_POW2));
+    assert(buf < aligned_buf_end);
+    if (buf_end != aligned_buf_end) {
+        const u8 *rv = rdvermSearchLoopBody(chars, buf_end - svcntb());
+        if (rv) return rv;
+    }
+    buf_end = aligned_buf_end;
+    size_t loops = (buf_end - buf) / svcntb();
+    DEBUG_PRINTF("loops %zu \n", loops);
+    for (size_t i = 0; i < loops; i++) {
+        buf_end -= svcntb();
+        const u8 *rv = rdvermSearchLoopBody(chars, buf_end);
+        if (rv) return rv;
+    }
+    DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end);
+    return buf == buf_end ? NULL : rdvermSearchLoopBody(chars, buf);
+}
+
 static really_inline
 const u8 *vermicelliExec(char c, bool nocase, const u8 *buf,
                          const u8 *buf_end) {
@@ -225,4 +388,43 @@ const u8 *rnvermicelliExec(char c, bool nocase, const u8 *buf,
                  nocase ? "nocase " : "", c, buf_end - buf);
     const u8 *ptr = rvermSearch(c, nocase, buf, buf_end, true);
     return ptr ? ptr : buf - 1;
+}
+
+static really_inline
+const u8 *vermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf,
+                               const u8 *buf_end) {
+    DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %td bytes\n",
+                 nocase ? "nocase " : "", c1, c2, buf_end - buf);
+    assert(buf < buf_end);
+    if (buf_end - buf > 1) {
+        ++buf;
+        const u8 *ptr = dvermSearch(c1, c2, nocase, buf, buf_end);
+        if (ptr) {
+            return ptr;
+        }
+    }
+    /* check for partial match at end */
+    u8 mask = nocase ? CASE_CLEAR : 0xff;
+    if ((buf_end[-1] & mask) == (u8)c1) {
+        DEBUG_PRINTF("partial!!!\n");
+        return buf_end - 1;
+    }
+    return buf_end;
+}
+
+/* returns highest offset of c2 (NOTE: not c1) */
+static really_inline
+const u8 *rvermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf,
+                                const u8 *buf_end) {
+    DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %td bytes\n",
+                 nocase ? "nocase " : "", c1, c2, buf_end - buf);
+    assert(buf < buf_end);
+    if (buf_end - buf > 1) {
+        --buf_end;
+        const u8 *ptr = rdvermSearch(c1, c2, nocase, buf, buf_end);
+        if (ptr) {
+            return ptr;
+        }
+    }
+    return buf - 1;
 }
\ No newline at end of file
diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 95a85b9b..9e73e931 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -53,6 +53,24 @@ svuint8_t getCharMaskSingle(const u8 c, bool noCase) {
     }
 }
 
+static really_inline
+svuint16_t getCharMaskDouble(const u8 c0, const u8 c1, bool noCase) {
+    if (noCase) {
+        const uint64_t lowerFirst = c0 & 0xdf;
+        const uint64_t upperFirst = c0 | 0x20;
+        const uint64_t lowerSecond = c1 & 0xdf;
+        const uint64_t upperSecond = c1 | 0x20;
+        const uint64_t chars = lowerFirst | (lowerSecond << 8)
+                             | (lowerFirst << 16) | (upperSecond) << 24
+                             | (upperFirst << 32) | (lowerSecond) << 40
+                             | (upperFirst << 48) | (upperSecond) << 56;
+        return svreinterpret_u16(svdup_u64(chars));
+    } else {
+        uint16_t chars_u16 = c0 | (c1 << 8);
+        return svdup_u16(chars_u16);
+    }
+}
+
 #endif
 
 #include <string.h> // for memcpy
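
Note on the case-insensitive mask layout: getCharMaskDouble() (moved into simd_utils.h above and shared by the SVE noodle and vermicelli paths) packs all four case combinations of the two search bytes into every 64-bit chunk of the vector, so a single svmatch() over 16-bit lanes covers them without masking the input data. A minimal standalone sketch of that packing, in plain C with a hypothetical helper name (not part of the diff):

#include <stdint.h>
#include <stdio.h>

/* Reproduces the bit layout built by the noCase branch of getCharMaskDouble().
 * c & 0xdf clears bit 5 (upper-cases an ASCII letter); c | 0x20 sets it. */
static uint64_t pack_nocase_pair(uint8_t c0, uint8_t c1) {
    const uint64_t lowerFirst = c0 & 0xdf;
    const uint64_t upperFirst = c0 | 0x20;
    const uint64_t lowerSecond = c1 & 0xdf;
    const uint64_t upperSecond = c1 | 0x20;
    return lowerFirst | (lowerSecond << 8)
         | (lowerFirst << 16) | (upperSecond << 24)
         | (upperFirst << 32) | (lowerSecond << 40)
         | (upperFirst << 48) | (upperSecond << 56);
}

int main(void) {
    uint64_t chars = pack_nocase_pair('a', 'b');
    /* little-endian 16-bit lanes: prints "AB", "Ab", "aB", "ab" */
    for (int lane = 0; lane < 4; lane++) {
        uint16_t v = (uint16_t)(chars >> (16 * lane));
        printf("lane %d: %c%c\n", lane, (char)(v & 0xff), (char)(v >> 8));
    }
    return 0;
}

svdup_u64() then repeats this chunk across the whole vector, so every 128-bit segment that svmatch() compares against contains all four case variants of the pair.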
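
Note on the pair of loads in doubleMatched(): matching the buffer as 16-bit lanes only exposes pairs that start on even byte offsets, so the SVE loop bodies also match a load shifted by one byte (buf - 1 for the forward scan, buf + 1 for the reverse scan) and dvermSearchGetOffset()/rdvermSearchGetOffset() keep the earlier of the two hits. The same idea in scalar form, with hypothetical names (not part of the diff):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Find the first occurrence of the byte pair (c0, c1) by comparing
 * little-endian 16-bit words at stride 2 in two phases; phase 0 alone
 * would miss pairs that start on odd offsets. */
static long find_pair_two_phase(const uint8_t *buf, size_t len,
                                uint8_t c0, uint8_t c1) {
    uint16_t pair = (uint16_t)(c0 | (c1 << 8)); /* c0 is the earlier byte */
    long best = -1;
    for (size_t phase = 0; phase < 2; phase++) {
        for (size_t i = phase; i + 1 < len; i += 2) {
            uint16_t v = (uint16_t)(buf[i] | (buf[i + 1] << 8));
            if (v == pair) {
                if (best < 0 || (long)i < (long)best) {
                    best = (long)i;
                }
                break;
            }
        }
    }
    return best;
}

int main(void) {
    const char *s = "xab"; /* pair starts at odd offset 1 */
    printf("%ld\n", find_pair_two_phase((const uint8_t *)s, strlen(s), 'a', 'b'));
    return 0;
}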
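
For reference, the contract that both the SSE and SVE forward paths above implement can be stated as a scalar loop; this is a hypothetical reference function, not part of the diff. The reverse variant (rvermicelliDoubleExec) differs in that it reports the highest offset of c2 rather than c1, as its comment notes, and returns buf - 1 when nothing is found.

#include <stdint.h>
#include <stddef.h>

#define CASE_CLEAR 0xdf /* AND mask that upper-cases ASCII letters */

/* Return a pointer to the first byte of the first (c1, c2) pair lying wholly
 * inside [buf, buf_end). A lone trailing c1 reports a partial match at
 * buf_end - 1; otherwise return buf_end. For nocase scans, c1 and c2 are
 * already upper-cased by the caller. */
static const uint8_t *vermicelli_double_ref(char c1, char c2, int nocase,
                                            const uint8_t *buf,
                                            const uint8_t *buf_end) {
    uint8_t mask = nocase ? CASE_CLEAR : 0xff;
    for (const uint8_t *p = buf; p + 1 < buf_end; p++) {
        if ((p[0] & mask) == (uint8_t)c1 && (p[1] & mask) == (uint8_t)c2) {
            return p;
        }
    }
    if ((buf_end[-1] & mask) == (uint8_t)c1) {
        return buf_end - 1; /* partial match at the end of the block */
    }
    return buf_end;
}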