diff --git a/src/nfa/vermicelli.hpp b/src/nfa/vermicelli.hpp
index 83eb2335..105194b1 100644
--- a/src/nfa/vermicelli.hpp
+++ b/src/nfa/vermicelli.hpp
@@ -35,6 +35,12 @@
 #ifndef VERMICELLI_HPP
 #define VERMICELLI_HPP
 
+#include "util/bitutils.h"
+
+#ifdef HAVE_SVE2
+#include "vermicelli_sve.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,4 +89,12 @@ const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, co
 }
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, const u8 *buf, const u8 *buf_end);
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* VERMICELLI_HPP */
\ No newline at end of file
diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp
index cd818dfb..c2215651 100644
--- a/src/nfa/vermicelli_simd.cpp
+++ b/src/nfa/vermicelli_simd.cpp
@@ -111,6 +111,41 @@ const u8 *rvermicelliDoubleBlock(SuperVector<S> data, SuperVector<S> chars1, Sup
     return last_non_zero_match<S>(buf, mask);
 }
 
+/**
+ * \brief Test one vector-sized block for a masked two-byte sequence.
+ *
+ * v1 marks lanes where (data & mask1) equals chars1, v2 marks lanes where
+ * (data & mask2) equals chars2; the candidates are v1 & (v2 >> 1).
+ *
+ * \param buf           address \p data was loaded from
+ * \param check_partial also test the pair (buf[-1], buf[0]) that straddles
+ *                      the start of the block; pass false when buf[-1] is
+ *                      not part of the scanned buffer
+ * \return buf - 1 on a straddling match, otherwise the result of
+ *         first_non_zero_match() for the candidate vector
+ */
+template <uint16_t S>
+static really_inline
+const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> data, SuperVector<S> chars1, SuperVector<S> chars2,
+                                      SuperVector<S> mask1, SuperVector<S> mask2,
+                                      u8 const c1, u8 const c2, u8 const m1, u8 const m2, const u8 *buf,
+                                      bool const check_partial) {
+
+    SuperVector<S> v1 = chars1.eq(data & mask1);
+    SuperVector<S> v2 = chars2.eq(data & mask2);
+    SuperVector<S> mask = v1 & (v2 >> 1);
+
+    if (check_partial) {
+        DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
+        // m1/c1 apply to the first byte of the pair, m2/c2 to the second.
+        bool partial_match = (((buf[-1] & m1) == c1) && ((buf[0] & m2) == c2));
+        DEBUG_PRINTF("partial = %d\n", partial_match);
+        if (partial_match) return buf - 1;
+    }
+
+    return first_non_zero_match<S>(buf, mask);
+}
+
 template <uint16_t S>
 static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, const u8 *buf_end) {
     assert(buf && buf_end);
@@ -343,7 +378,7 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector<
         DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
         if (!ISALIGNED_N(d, S)) {
             SuperVector<S> data = SuperVector<S>::loadu(d);
-            rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);//, &lastmask1);
+            rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);
             if (rv) return rv;
             d = ROUNDUP_PTR(d, S);
         }
@@ -352,7 +387,7 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector<
             __builtin_prefetch(d + 64);
             DEBUG_PRINTF("d %p \n", d);
             SuperVector<S> data = SuperVector<S>::load(d);
-            rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);//, &lastmask1);
+            rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);
             if (rv) return rv;
             d += S;
         }
@@ -363,7 +398,7 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector<
 
     if (d != buf_end) {
         SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
-        rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);//, buf_end - d);
+        rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);
         DEBUG_PRINTF("rv %p \n", rv);
         if (rv && rv < buf_end) return rv;
     }
@@ -371,7 +406,6 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector<
     DEBUG_PRINTF("real tail d %p e %p \n", d, buf_end);
     /* check for partial match at end */
     u8 mask = casemask.u.u8[0];
-    // u8 c1 = chars1.u.u8[0];
     if ((buf_end[-1] & mask) == (u8)c1) {
         DEBUG_PRINTF("partial!!!\n");
         return buf_end - 1;
@@ -439,6 +473,77 @@ const u8 *rvermicelliDoubleExecReal(char c1, char c2, SuperVector<S> const casem
     return buf - 1;
 }
 
+/**
+ * \brief Find the first masked two-byte match in [buf, buf_end).
+ *
+ * A match is a position p with (p[0] & m1) == c1 and (p[1] & m2) == c2.
+ *
+ * \return pointer to the first byte of the first match; buf_end - 1 if no
+ *         full match was found but the last byte passes the (m1, c1) test;
+ *         buf_end otherwise
+ */
+template <uint16_t S>
+static const u8 *vermicelliDoubleMaskedExecReal(u8 const c1, u8 const c2, u8 const m1, u8 const m2,
+                                                const u8 *buf, const u8 *buf_end) {
+    assert(buf && buf_end);
+    assert(buf < buf_end);
+    DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf);
+    DEBUG_PRINTF("b %s\n", buf);
+
+    const u8 *d = buf;
+    const u8 *rv;
+    const SuperVector<S> chars1 = SuperVector<S>::dup_u8(c1);
+    const SuperVector<S> chars2 = SuperVector<S>::dup_u8(c2);
+    const SuperVector<S> mask1 = SuperVector<S>::dup_u8(m1);
+    const SuperVector<S> mask2 = SuperVector<S>::dup_u8(m2);
+
+    __builtin_prefetch(d + 64);
+    __builtin_prefetch(d + 2*64);
+    __builtin_prefetch(d + 3*64);
+    __builtin_prefetch(d + 4*64);
+    DEBUG_PRINTF("start %p end %p \n", d, buf_end);
+    assert(d < buf_end);
+    if (d + S <= buf_end) {
+        // Reach vector aligned boundaries
+        DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
+        if (!ISALIGNED_N(d, S)) {
+            SuperVector<S> data = SuperVector<S>::loadu(d);
+            // d == buf here, so there is no preceding byte to pair with.
+            rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, false);
+            if (rv) return rv;
+            d = ROUNDUP_PTR(d, S);
+        }
+
+        while(d + S <= buf_end) {
+            __builtin_prefetch(d + 64);
+            DEBUG_PRINTF("d %p \n", d);
+            SuperVector<S> data = SuperVector<S>::load(d);
+            rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, d != buf);
+            if (rv) return rv;
+            d += S;
+        }
+    }
+
+    DEBUG_PRINTF("tail d %p e %p \n", d, buf_end);
+    // finish off tail
+
+    if (d != buf_end) {
+        SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
+        rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, d != buf);
+        DEBUG_PRINTF("rv %p \n", rv);
+        if (rv && rv < buf_end) return rv;
+    }
+
+    DEBUG_PRINTF("real tail d %p e %p \n", d, buf_end);
+    /* check for partial match at end */
+    if ((buf_end[-1] & m1) == (u8)c1) {
+        DEBUG_PRINTF("partial!!!\n");
+        return buf_end - 1;
+    }
+
+    return buf_end;
+}
+
 extern "C" const u8 *vermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) {
     DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n",
                  nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
@@ -503,4 +608,19 @@ extern "C" const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const
     const SuperVector<VECTORSIZE> casemask{nocase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
 
     return rvermicelliDoubleExecReal<VECTORSIZE>(c1, c2, casemask, buf, buf_end);
+}
+
+/**
+ * \brief Scan [buf, buf_end) for two consecutive bytes b0, b1 with
+ *        (b0 & m1) == c1 and (b1 & m2) == c2.
+ *
+ * Thin C-linkage wrapper around vermicelliDoubleMaskedExecReal<VECTORSIZE>().
+ */
+extern "C" const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
+                                                const u8 *buf, const u8 *buf_end) {
+    DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) "
+                 "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf));
+    assert(buf < buf_end);
+
+    return vermicelliDoubleMaskedExecReal<VECTORSIZE>(c1, c2, m1, m2, buf, buf_end);
 }
\ No newline at end of file