From 44dc75a3ea5ea787515606e257d337821d47eb5c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 1 Nov 2021 16:51:18 +0200 Subject: [PATCH] complete refactoring and unification of Vermicelli functions --- src/nfa/vermicelli.hpp | 8 ++ src/nfa/vermicelli_simd.cpp | 240 ++++++++++++++++++------------------ 2 files changed, 127 insertions(+), 121 deletions(-) diff --git a/src/nfa/vermicelli.hpp b/src/nfa/vermicelli.hpp index 0b4686e1..83eb2335 100644 --- a/src/nfa/vermicelli.hpp +++ b/src/nfa/vermicelli.hpp @@ -75,4 +75,12 @@ const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, con } #endif +#ifdef __cplusplus +extern "C" { +#endif +const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + #endif /* VERMICELLI_HPP */ \ No newline at end of file diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp index 6348e6f3..cd818dfb 100644 --- a/src/nfa/vermicelli_simd.cpp +++ b/src/nfa/vermicelli_simd.cpp @@ -41,48 +41,16 @@ template static really_inline -const u8 *vermicelliSingleBlock(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { +const u8 *vermicelliBlock(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { SuperVector mask = chars.eq(casemask & data); return first_non_zero_match(buf, mask); } -template -static really_inline -const u8 *rvermicelliSingleBlock(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { - - SuperVector mask = chars.eq(casemask & data); - return last_non_zero_match(buf, mask); -} template static really_inline -const u8 *vermicelliDoubleBlock(SuperVector data, SuperVector chars1, SuperVector chars2, SuperVector casemask, - const u8 *buf/*, SuperVector *lastmask1, size_t len = S*/) { - - // lastmask1->print8("lastmask1"); - data.print8("data"); - chars1.print8("chars1"); - chars2.print8("chars2"); - casemask.print8("casemask"); - SuperVector v = casemask & data; - v.print8("v"); - SuperVector mask1 = chars1.eq(v); - mask1.print8("mask1"); - SuperVector mask2 = chars2.eq(v); - mask2.print8("mask2"); - SuperVector mask = (mask1 & (mask2 >> 1)); - mask.print8("mask"); - DEBUG_PRINTF("len = %ld\n", len); - // *lastmask1 = mask1 >> (len -1); - // lastmask1->print8("lastmask1"); - - return first_non_zero_match(buf, mask); -} - -template -static really_inline -const u8 *vermicelliSingleBlockNeg(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { +const u8 *vermicelliBlockNeg(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { SuperVector mask = chars.eq(casemask & data); return first_zero_match_inverted(buf, mask); @@ -90,36 +58,58 @@ const u8 *vermicelliSingleBlockNeg(SuperVector data, SuperVector chars, Su template static really_inline -const u8 *rvermicelliSingleBlockNeg(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { +const u8 *rvermicelliBlock(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { + + SuperVector mask = chars.eq(casemask & data); + return last_non_zero_match(buf, mask); +} + + +template +static really_inline +const u8 *rvermicelliBlockNeg(SuperVector data, SuperVector chars, SuperVector casemask, const u8 *buf) { SuperVector mask = chars.eq(casemask & data); return last_zero_match_inverted(buf, mask); } -/* + template static really_inline -const u8 *vermicelliDoubleBlockNeg(SuperVector data, SuperVector chars1, SuperVector chars2, SuperVector casemask, - const u8 *buf, size_t len = S) { +const u8 *vermicelliDoubleBlock(SuperVector data, SuperVector chars1, SuperVector chars2, SuperVector casemask, + u8 const c1, u8 const c2, u8 const casechar, const u8 *buf) { - // lastmask1.print8("lastmask1"); - data.print8("data"); - chars1.print8("chars1"); - chars2.print8("chars2"); - casemask.print8("casemask"); SuperVector v = casemask & data; - v.print8("v"); SuperVector mask1 = chars1.eq(v); - mask1.print8("mask1"); SuperVector mask2 = chars2.eq(v); - mask2.print8("mask2"); - SuperVector mask = (mask1 & (mask2 >> 1));// | lastmask1; - mask.print8("mask"); - DEBUG_PRINTF("len = %ld\n", len); - // lastmask1 = mask << (len -1); - // lastmask1.print8("lastmask1"); + SuperVector mask = mask1 & (mask2 >> 1); - return last_zero_match_inverted(buf, mask); -}*/ + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) return buf - 1; + + return first_non_zero_match(buf, mask); +} + +template +static really_inline +const u8 *rvermicelliDoubleBlock(SuperVector data, SuperVector chars1, SuperVector chars2, SuperVector casemask, + u8 const c1, u8 const c2, u8 const casechar, const u8 *buf) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = (mask1 << 1)& mask2; + + DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | (SuperVector::Ones() >> (S-1)); + } + + return last_non_zero_match(buf, mask); +} template static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector const casemask, const u8 *buf, const u8 *buf_end) { @@ -142,7 +132,7 @@ static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector c DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); if (!ISALIGNED_N(d, S)) { SuperVector data = SuperVector::loadu(d); - rv = vermicelliSingleBlock(data, chars, casemask, d); + rv = vermicelliBlock(data, chars, casemask, d); if (rv) return rv; d = ROUNDUP_PTR(d, S); } @@ -151,7 +141,7 @@ static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector c __builtin_prefetch(d + 64); DEBUG_PRINTF("d %p \n", d); SuperVector data = SuperVector::load(d); - rv = vermicelliSingleBlock(data, chars, casemask, d); + rv = vermicelliBlock(data, chars, casemask, d); if (rv) return rv; d += S; } @@ -162,7 +152,7 @@ static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector c if (d != buf_end) { SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); - rv = vermicelliSingleBlock(data, chars, casemask, d); + rv = vermicelliBlock(data, chars, casemask, d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -180,8 +170,6 @@ static const u8 *nvermicelliExecReal(SuperVector const chars, SuperVector const u8 *d = buf; const u8 *rv; - - __builtin_prefetch(d + 64); __builtin_prefetch(d + 2*64); __builtin_prefetch(d + 3*64); @@ -193,7 +181,7 @@ static const u8 *nvermicelliExecReal(SuperVector const chars, SuperVector DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); if (!ISALIGNED_N(d, S)) { SuperVector data = SuperVector::loadu(d); - rv = vermicelliSingleBlockNeg(data, chars, casemask, d); + rv = vermicelliBlockNeg(data, chars, casemask, d); if (rv) return rv; d = ROUNDUP_PTR(d, S); } @@ -202,7 +190,7 @@ static const u8 *nvermicelliExecReal(SuperVector const chars, SuperVector __builtin_prefetch(d + 64); DEBUG_PRINTF("d %p \n", d); SuperVector data = SuperVector::load(d); - rv = vermicelliSingleBlockNeg(data, chars, casemask, d); + rv = vermicelliBlockNeg(data, chars, casemask, d); if (rv) return rv; d += S; } @@ -213,7 +201,7 @@ static const u8 *nvermicelliExecReal(SuperVector const chars, SuperVector if (d != buf_end) { SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); - rv = vermicelliSingleBlockNeg(data, chars, casemask, d); + rv = vermicelliBlockNeg(data, chars, casemask, d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -244,7 +232,7 @@ const u8 *rvermicelliExecReal(SuperVector const chars, SuperVector const c DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); if (!ISALIGNED_N(d, S)) { SuperVector data = SuperVector::loadu(d - S); - rv = rvermicelliSingleBlock(data, chars, casemask, d - S); + rv = rvermicelliBlock(data, chars, casemask, d - S); DEBUG_PRINTF("rv %p \n", rv); if (rv) return rv; d = ROUNDDOWN_PTR(d, S); @@ -257,7 +245,7 @@ const u8 *rvermicelliExecReal(SuperVector const chars, SuperVector const c d -= S; SuperVector data = SuperVector::load(d); - rv = rvermicelliSingleBlock(data, chars, casemask, d); + rv = rvermicelliBlock(data, chars, casemask, d); if (rv) return rv; } } @@ -267,7 +255,7 @@ const u8 *rvermicelliExecReal(SuperVector const chars, SuperVector const c if (d != buf) { SuperVector data = SuperVector::loadu(buf); - rv = rvermicelliSingleBlock(data, chars, casemask, buf); + rv = rvermicelliBlock(data, chars, casemask, buf); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -298,7 +286,7 @@ const u8 *rnvermicelliExecReal(SuperVector const chars, SuperVector const DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); if (!ISALIGNED_N(d, S)) { SuperVector data = SuperVector::loadu(d - S); - rv = rvermicelliSingleBlockNeg(data, chars, casemask, d - S); + rv = rvermicelliBlockNeg(data, chars, casemask, d - S); DEBUG_PRINTF("rv %p \n", rv); if (rv) return rv; d = ROUNDDOWN_PTR(d, S); @@ -311,7 +299,7 @@ const u8 *rnvermicelliExecReal(SuperVector const chars, SuperVector const d -= S; SuperVector data = SuperVector::load(d); - rv = rvermicelliSingleBlockNeg(data, chars, casemask, d); + rv = rvermicelliBlockNeg(data, chars, casemask, d); if (rv) return rv; } } @@ -321,7 +309,7 @@ const u8 *rnvermicelliExecReal(SuperVector const chars, SuperVector const if (d != buf) { SuperVector data = SuperVector::loadu(buf); - rv = rvermicelliSingleBlockNeg(data, chars, casemask, buf); + rv = rvermicelliBlockNeg(data, chars, casemask, buf); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -355,7 +343,7 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector< DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); if (!ISALIGNED_N(d, S)) { SuperVector data = SuperVector::loadu(d); - rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, d);//, &lastmask1); + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);//, &lastmask1); if (rv) return rv; d = ROUNDUP_PTR(d, S); } @@ -364,11 +352,8 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector< __builtin_prefetch(d + 64); DEBUG_PRINTF("d %p \n", d); SuperVector data = SuperVector::load(d); - rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, d);//, &lastmask1); - if (rv) { - bool partial_match = (((rv[0] & casechar) == c2) && ((rv[-1] & casechar) == c1)); - return rv - partial_match; - } + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);//, &lastmask1); + if (rv) return rv; d += S; } } @@ -378,7 +363,7 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector< if (d != buf_end) { SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); - rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, d);//, buf_end - d); + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);//, buf_end - d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -396,60 +381,63 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector< } // /* returns highest offset of c2 (NOTE: not c1) */ -// static really_inline -// const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, -// const u8 *buf_end) { -// DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", -// nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); -// assert(buf < buf_end); +template +const u8 *rvermicelliDoubleExecReal(char c1, char c2, SuperVector const casemask, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("rverm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + char s[255]; + snprintf(s, buf_end - buf + 1, "%s", buf); + DEBUG_PRINTF("b %s\n", s); -// VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ -// VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ + const u8 *d = buf_end; + const u8 *rv; + const SuperVector chars1 = SuperVector::dup_u8(c1); + const SuperVector chars2 = SuperVector::dup_u8(c2); + const u8 casechar = casemask.u.u8[0]; -// #ifdef HAVE_AVX512 -// if (buf_end - buf <= VERM_BOUNDARY) { -// const u8 *ptr = nocase -// ? rdvermMiniNocase(chars1, chars2, buf, buf_end) -// : rdvermMini(chars1, chars2, buf, buf_end); + __builtin_prefetch(d - 64); + __builtin_prefetch(d - 2*64); + __builtin_prefetch(d - 3*64); + __builtin_prefetch(d - 4*64); + DEBUG_PRINTF("start %p end %p \n", buf, d); + assert(d > buf); + if (d - S >= buf) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + SuperVector data = SuperVector::loadu(d - S); + rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d - S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + d = ROUNDDOWN_PTR(d, S); + } -// if (ptr) { -// return ptr; -// } + while (d - S >= buf) { + DEBUG_PRINTF("aligned %p \n", d); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d - 64); -// // check for partial match at end ??? -// return buf - 1; -// } -// #endif + d -= S; + SuperVector data = SuperVector::load(d); + rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d); + if (rv) return rv; + } + } -// assert((buf_end - buf) >= VERM_BOUNDARY); -// size_t min = (size_t)buf_end % VERM_BOUNDARY; -// if (min) { -// // input not aligned, so we need to run one iteration with an unaligned -// // load, then skip buf forward to the next aligned address. There's -// // some small overlap here, but we don't mind scanning it twice if we -// // can do it quickly, do we? -// const u8 *ptr = nocase ? rdvermPreconditionNocase(chars1, chars2, -// buf_end - VERM_BOUNDARY) -// : rdvermPrecondition(chars1, chars2, -// buf_end - VERM_BOUNDARY); + DEBUG_PRINTF("tail d %p e %p \n", buf, d); + // finish off head -// if (ptr) { -// return ptr; -// } + if (d != buf) { + SuperVector data = SuperVector::loadu(buf); + rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } -// buf_end -= min; -// if (buf >= buf_end) { -// return buf_end; -// } -// } - -// // Aligned loops from here on in -// if (nocase) { -// return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); -// } else { -// return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); -// } -// } + return buf - 1; +} extern "C" const u8 *vermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) { DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", @@ -505,4 +493,14 @@ extern "C" const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; return vermicelliDoubleExecReal(c1, c2, casemask, buf, buf_end); +} + +extern "C" const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; + + return rvermicelliDoubleExecReal(c1, c2, casemask, buf, buf_end); } \ No newline at end of file