diff --git a/src/hwlm/noodle_engine_sve.hpp b/src/hwlm/noodle_engine_sve.hpp index d541b6eb..193b30ab 100644 --- a/src/hwlm/noodle_engine_sve.hpp +++ b/src/hwlm/noodle_engine_sve.hpp @@ -26,16 +26,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -static really_inline -svuint8_t getCharMaskSingle(const struct noodTable *n, bool noCase) { - if (noCase) { - uint16_t chars_u16 = (n->key0 & 0xdf) | ((n->key0 | 0x20) << 8); - return svreinterpret_u8(svdup_u16(chars_u16)); - } else { - return svdup_u8(n->key0); - } -} - static really_inline hwlm_error_t checkMatched(const struct noodTable *n, const u8 *buf, size_t len, const struct cb_info *cbi, const u8 *d, @@ -120,7 +110,7 @@ hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, assert(d < e); assert(d >= buf); - svuint8_t chars = getCharMaskSingle(n, noCase); + svuint8_t chars = getCharMaskSingle(n->key0, noCase); size_t scan_len = e - d; if (scan_len <= svcntb()) { diff --git a/src/nfa/shufti.cpp b/src/nfa/shufti.cpp index 0a95bacb..d78a7054 100644 --- a/src/nfa/shufti.cpp +++ b/src/nfa/shufti.cpp @@ -69,7 +69,6 @@ const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, return buf_end; } -#if !defined(HAVE_SVE) #include "shufti_simd.hpp" const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, @@ -87,4 +86,3 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *buf, const u8 *buf_end) { return shuftiDoubleExecReal(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, buf_end); } -#endif diff --git a/src/nfa/vermicelli.h b/src/nfa/vermicelli.h index ed797d83..b2ec0725 100644 --- a/src/nfa/vermicelli.h +++ b/src/nfa/vermicelli.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,138 +38,16 @@ #include "util/simd_utils.h" #include "util/unaligned.h" +#if !defined(HAVE_AVX512) +#include "vermicelli_common.h" +#endif + +#ifdef HAVE_SVE2 +#include "vermicelli_sve.h" +#else #include "vermicelli_sse.h" - -static really_inline -const u8 *vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? vermMiniNocase(chars, buf, buf_end, 0) - : vermMini(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - return buf_end; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf; - } #endif - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0) - : vermUnalign(chars, buf, 0); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0) - : vermSearchAligned(chars, buf, buf_end - 1, 0); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0) - : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0); - return ptr ? ptr : buf_end; -} - -/* like vermicelliExec except returns the address of the first character which - * is not c */ -static really_inline -const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? vermMiniNocase(chars, buf, buf_end, 1) - : vermMini(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - return buf_end; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur != c) { - break; - } - } - return buf; - } -#endif - - size_t min = (size_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1) - : vermUnalign(chars, buf, 1); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1) - : vermSearchAligned(chars, buf, buf_end - 1, 1); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1) - : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1); - return ptr ? ptr : buf_end; -} - static really_inline const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end) { @@ -315,150 +194,6 @@ const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, return buf_end; } -// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if -// character not found. -static really_inline -const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rvermMiniNocase(chars, buf, buf_end, 0) - : rvermMini(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - return buf - 1; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (buf_end--; buf_end >= buf; buf_end--) { - char cur = (char)*buf_end; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf_end; - } -#endif - - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf backward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? rvermUnalignNocase(chars, - buf_end - VERM_BOUNDARY, - 0) - : rvermUnalign(chars, buf_end - VERM_BOUNDARY, - 0); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in. - const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 0) - : rvermSearchAligned(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end, return buf - 1 if not found. - ptr = nocase ? rvermUnalignNocase(chars, buf, 0) - : rvermUnalign(chars, buf, 0); - return ptr ? ptr : buf - 1; -} - -/* like rvermicelliExec except returns the address of the last character which - * is not c */ -static really_inline -const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rvermMiniNocase(chars, buf, buf_end, 1) - : rvermMini(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - return buf - 1; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (buf_end--; buf_end >= buf; buf_end--) { - char cur = (char)*buf_end; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur != c) { - break; - } - } - return buf_end; - } -#endif - - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf backward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? rvermUnalignNocase(chars, - buf_end - VERM_BOUNDARY, - 1) - : rvermUnalign(chars, buf_end - VERM_BOUNDARY, - 1); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in. - const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1) - : rvermSearchAligned(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end, return buf - 1 if not found. - ptr = nocase ? rvermUnalignNocase(chars, buf, 1) - : rvermUnalign(chars, buf, 1); - return ptr ? ptr : buf - 1; -} - /* returns highest offset of c2 (NOTE: not c1) */ static really_inline const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, diff --git a/src/nfa/vermicelli_common.h b/src/nfa/vermicelli_common.h new file mode 100644 index 00000000..39109fe1 --- /dev/null +++ b/src/nfa/vermicelli_common.h @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: Implementation shared between architectures. + * + * (users should include vermicelli.h instead of this) + */ + +#define VERM_BOUNDARY 16 +#define VERM_TYPE m128 +#define VERM_SET_FN set1_16x8 + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { + assert(z); + return buf_end - 16 + 31 - clz32(z); +} + +static really_inline +const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + if (buf[15] == c1 && buf[16] == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + m128 casemask = set1_16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set1_16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 16 == 0); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + if (buf_end[-17] == c1 && buf_end[-16] == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 16 == 0); + m128 casemask = set1_16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + if ((buf_end[-17] & CASE_CLEAR) == c1 + && (buf_end[-16] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set1_16x8(CASE_CLEAR); + m128 data = loadu128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + + return NULL; +} \ No newline at end of file diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 12001f4f..a754224b 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +30,7 @@ /** \file * \brief Vermicelli: Intel SSE implementation. * - * (users should include vermicelli.h) + * (users should include vermicelli.h instead of this) */ #if !defined(HAVE_AVX512) @@ -52,8 +53,9 @@ const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, z = ~z; } if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } } for (; buf + 15 < buf_end; buf += 16) { @@ -63,8 +65,9 @@ const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, z = ~z & 0xffff; } if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } } return NULL; @@ -86,8 +89,9 @@ const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, z = ~z; } if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } } @@ -98,8 +102,9 @@ const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, z = ~z & 0xffff; } if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } } return NULL; @@ -114,7 +119,9 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { z = ~z & 0xffff; } if (unlikely(z)) { - return buf + ctz32(z); + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } return NULL; } @@ -129,133 +136,13 @@ const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { z = ~z & 0xffff; } if (unlikely(z)) { - return buf + ctz32(z); + const u8 *matchPos = buf + ctz32(z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } return NULL; } -static really_inline -const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - if (buf[15] == c1 && buf[16] == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, - m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1, - u8 m2, const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 16 == 0); - - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - m128 v1 = eq128(chars1, and128(data, mask1)); - m128 v2 = eq128(chars2, and128(data, mask2)); - u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); - - if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, - m128 mask1, m128 mask2, const u8 *buf) { - m128 data = loadu128(buf); // unaligned - m128 v1 = eq128(chars1, and128(data, mask1)); - m128 v2 = eq128(chars2, and128(data, mask2)); - u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { - assert(z); - return buf_end - 16 + 31 - clz32(z); -} - static really_inline const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, char negate) { @@ -267,7 +154,9 @@ const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, z = ~z & 0xffff; } if (unlikely(z)) { - return lastMatchOffset(buf_end, z); + const u8 *matchPos = lastMatchOffset(buf_end, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } } return NULL; @@ -286,7 +175,9 @@ const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, z = ~z & 0xffff; } if (unlikely(z)) { - return lastMatchOffset(buf_end, z); + const u8 *matchPos = lastMatchOffset(buf_end, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } } return NULL; @@ -301,7 +192,9 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { z = ~z & 0xffff; } if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); + const u8 *matchPos = lastMatchOffset(buf + 16, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } return NULL; } @@ -316,84 +209,13 @@ const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { z = ~z & 0xffff; } if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); + const u8 *matchPos = lastMatchOffset(buf + 16, z); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; } return NULL; } -static really_inline -const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - if (buf_end[-17] == c1 && buf_end[-16] == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -static really_inline -const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - if ((buf_end[-17] & CASE_CLEAR) == c1 - && (buf_end[-16] & CASE_CLEAR) == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - - return NULL; -} - #else // HAVE_AVX512 #define VERM_BOUNDARY 64 @@ -887,3 +709,277 @@ const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { } #endif // HAVE_AVX512 + +static really_inline +const u8 *vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 0) + : vermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf_end; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf; + } +#endif + + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0) + : vermUnalign(chars, buf, 0); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0) + : vermSearchAligned(chars, buf, buf_end - 1, 0); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0) + : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0); + return ptr ? ptr : buf_end; +} + +/* like vermicelliExec except returns the address of the first character which + * is not c */ +static really_inline +const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 1) + : vermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf_end; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf; + } +#endif + + size_t min = (size_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1) + : vermUnalign(chars, buf, 1); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1) + : vermSearchAligned(chars, buf, buf_end - 1, 1); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1) + : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1); + return ptr ? ptr : buf_end; +} + +// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if +// character not found. +static really_inline +const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rvermMiniNocase(chars, buf, buf_end, 0) + : rvermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf_end; + } +#endif + + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf backward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 0) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 0); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in. + const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 0) + : rvermSearchAligned(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end, return buf - 1 if not found. + ptr = nocase ? rvermUnalignNocase(chars, buf, 0) + : rvermUnalign(chars, buf, 0); + return ptr ? ptr : buf - 1; +} + +/* like rvermicelliExec except returns the address of the last character which + * is not c */ +static really_inline +const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rvermMiniNocase(chars, buf, buf_end, 1) + : rvermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf_end; + } +#endif + + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf backward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 1) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 1); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in. + const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1) + : rvermSearchAligned(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end, return buf - 1 if not found. + ptr = nocase ? rvermUnalignNocase(chars, buf, 1) + : rvermUnalign(chars, buf, 1); + return ptr ? ptr : buf - 1; +} \ No newline at end of file diff --git a/src/nfa/vermicelli_sve.h b/src/nfa/vermicelli_sve.h new file mode 100644 index 00000000..21c47592 --- /dev/null +++ b/src/nfa/vermicelli_sve.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: AArch64 SVE implementation. + * + * (users should include vermicelli.h instead of this) + */ + +static really_inline +int vermSearchGetOffset(svbool_t matched) { + return svcntp_b8(svptrue_b8(), svbrkb_z(svptrue_b8(), matched)); +} + +static really_inline +const u8 *vermSearchCheckMatched(const u8 *buf, svbool_t matched) { + if (unlikely(svptest_any(svptrue_b8(), matched))) { + const u8 *matchPos = buf + vermSearchGetOffset(matched); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +static really_inline +const u8 *rvermSearchCheckMatched(const u8 *buf, svbool_t matched) { + if (unlikely(svptest_any(svptrue_b8(), matched))) { + const u8 *matchPos = buf + (svcntb() - + svcntp_b8(svptrue_b8(), svbrka_z(svptrue_b8(), svrev_b8(matched)))); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +static really_inline +svbool_t singleMatched(svuint8_t chars, const u8 *buf, svbool_t pg, + bool negate, const int64_t vnum) { + svuint8_t vec = svld1_vnum_u8(pg, buf, vnum); + if (negate) { + return svnmatch(pg, vec, chars); + } else { + return svmatch(pg, vec, chars); + } +} + +static really_inline +const u8 *vermSearchOnce(svuint8_t chars, const u8 *buf, const u8 *buf_end, + bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf <= buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched = singleMatched(chars, buf, pg, negate, 0); + return vermSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *vermSearchLoopBody(svuint8_t chars, const u8 *buf, bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched = singleMatched(chars, buf, svptrue_b8(), negate, 0); + return vermSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *vermSearchLoopBodyUnrolled(svuint8_t chars, const u8 *buf, + bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + (2 * svcntb())); + svbool_t matched0 = singleMatched(chars, buf, svptrue_b8(), negate, 0); + svbool_t matched1 = singleMatched(chars, buf, svptrue_b8(), negate, 1); + svbool_t any = svorr_z(svptrue_b8(), matched0, matched1); + if (unlikely(svptest_any(svptrue_b8(), any))) { + if (svptest_any(svptrue_b8(), matched0)) { + return buf + vermSearchGetOffset(matched0); + } else { + return buf + svcntb() + vermSearchGetOffset(matched1); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchOnce(svuint8_t chars, const u8 *buf, const u8 *buf_end, + bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf <= buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched = singleMatched(chars, buf, pg, negate, 0); + return rvermSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *rvermSearchLoopBody(svuint8_t chars, const u8 *buf, bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched = singleMatched(chars, buf, svptrue_b8(), negate, 0); + return rvermSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *vermSearch(char c, bool nocase, const u8 *buf, const u8 *buf_end, + bool negate) { + assert(buf < buf_end); + svuint8_t chars = getCharMaskSingle(c, nocase); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return vermSearchOnce(chars, buf, buf_end, negate); + } + // peel off first part to align to the vector size + const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2)); + assert(aligned_buf < buf_end); + if (buf != aligned_buf) { + const u8 *ptr = vermSearchLoopBody(chars, buf, negate); + if (ptr) return ptr; + } + buf = aligned_buf; + uint64_t unrolled_cntb = 2 * svcntb(); + size_t unrolled_loops = (buf_end - buf) / unrolled_cntb; + DEBUG_PRINTF("unrolled_loops %zu \n", unrolled_loops); + for (size_t i = 0; i < unrolled_loops; i++, buf += unrolled_cntb) { + const u8 *ptr = vermSearchLoopBodyUnrolled(chars, buf, negate); + if (ptr) return ptr; + } + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++, buf += svcntb()) { + const u8 *ptr = vermSearchLoopBody(chars, buf, negate); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? NULL : vermSearchLoopBody(chars, buf_end - svcntb(), + negate); +} + +static really_inline +const u8 *rvermSearch(char c, bool nocase, const u8 *buf, const u8 *buf_end, + bool negate) { + assert(buf < buf_end); + svuint8_t chars = getCharMaskSingle(c, nocase); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return rvermSearchOnce(chars, buf, buf_end, negate); + } + // peel off first part to align to the vector size + const u8 *aligned_buf_end = ROUNDDOWN_PTR(buf_end, svcntb_pat(SV_POW2)); + assert(buf < aligned_buf_end); + if (buf_end != aligned_buf_end) { + const u8 *ptr = rvermSearchLoopBody(chars, buf_end - svcntb(), negate); + if (ptr) return ptr; + } + buf_end = aligned_buf_end; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++) { + buf_end -= svcntb(); + const u8 *ptr = rvermSearchLoopBody(chars, buf_end, negate); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? NULL : rvermSearchLoopBody(chars, buf, negate); +} + +static really_inline +const u8 *vermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("verm scan %s\\x%02hhx over %td bytes\n", + nocase ? "nocase " : "", c, buf_end - buf); + const u8 *ptr = vermSearch(c, nocase, buf, buf_end, false); + return ptr ? ptr : buf_end; +} + +/* like vermicelliExec except returns the address of the first character which + * is not c */ +static really_inline +const u8 *nvermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("nverm scan %s\\x%02hhx over %td bytes\n", + nocase ? "nocase " : "", c, buf_end - buf); + const u8 *ptr = vermSearch(c, nocase, buf, buf_end, true); + return ptr ? ptr : buf_end; +} + +// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if +// character not found. +static really_inline +const u8 *rvermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %td bytes\n", + nocase ? "nocase " : "", c, buf_end - buf); + const u8 *ptr = rvermSearch(c, nocase, buf, buf_end, false); + return ptr ? ptr : buf - 1; +} + +/* like rvermicelliExec except returns the address of the last character which + * is not c */ +static really_inline +const u8 *rnvermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %td bytes\n", + nocase ? "nocase " : "", c, buf_end - buf); + const u8 *ptr = rvermSearch(c, nocase, buf, buf_end, true); + return ptr ? ptr : buf - 1; +} \ No newline at end of file diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 8cf00025..95a85b9b 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -34,12 +34,27 @@ #define ARCH_ARM_SIMD_UTILS_H #include +#include #include "ue2common.h" #include "util/simd_types.h" #include "util/unaligned.h" #include "util/intrinsics.h" +#ifdef HAVE_SVE2 + +static really_inline +svuint8_t getCharMaskSingle(const u8 c, bool noCase) { + if (noCase) { + uint16_t chars_u16 = (c & 0xdf) | ((c | 0x20) << 8); + return svreinterpret_u8(svdup_u16(chars_u16)); + } else { + return svdup_u8(c); + } +} + +#endif + #include // for memcpy static really_inline m128 ones128(void) { diff --git a/unit/internal/rvermicelli.cpp b/unit/internal/rvermicelli.cpp index 22c238e9..497ffe07 100644 --- a/unit/internal/rvermicelli.cpp +++ b/unit/internal/rvermicelli.cpp @@ -113,6 +113,92 @@ TEST(RVermicelli, Exec4) { } } +TEST(RNVermicelli, ExecNoMatch1) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + for (size_t j = 0; j < 16; j++) { + SCOPED_TRACE(j); + const u8 *rv = rnvermicelliExec('b', 0, buf + i, + buf + strlen(t1) - j); + + ASSERT_EQ(buf + i - 1, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1) - j); + + ASSERT_EQ(buf + i - 1, rv); + } + } +} + +TEST(RNVermicelli, Exec1) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + const u8 *rv = rnvermicelliExec('b', 0, buf, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli, Exec2) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + const u8 *rv = rnvermicelliExec('b', 0, buf, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli, Exec3) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaAbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + const u8 *rv = rnvermicelliExec('b', 0, buf + i, buf + strlen(t1)); + + ASSERT_EQ(buf + 48, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1)); + + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli, Exec4) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 31; i++) { + SCOPED_TRACE(i); + t1[16 + i] = 'a'; + const u8 *rv = rnvermicelliExec('b', 0, buf, buf + strlen(t1)); + + ASSERT_EQ(buf + 16 + i, rv); + + rv = rnvermicelliExec('B', 1, buf, buf + strlen(t1)); + + ASSERT_EQ(buf + 16 + i, rv); + } +} + + TEST(RDoubleVermicelli, Exec1) { char t1[] = "bbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbb";