diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp index dfe7eea1..c49bfc7e 100644 --- a/src/hwlm/noodle_engine_simd.hpp +++ b/src/hwlm/noodle_engine_simd.hpp @@ -58,12 +58,10 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, return HWLM_SUCCESS; } -// The short scan routine. It is used both to scan data up to an -// alignment boundary if needed and to finish off data that the aligned scan -// function can't handle (due to small/unaligned chunk at end) + template static really_inline -hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, SuperVector caseMask, SuperVector mask1, const struct cb_info *cbi, size_t len, size_t start, size_t end) { @@ -76,7 +74,36 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, return HWLM_SUCCESS; } + SuperVector v = SuperVector::Zeroes(); + memcpy(&v.u, d, l); + typename SuperVector::movemask_type mask = SINGLE_LOAD_MASK(l); + v = v & caseMask; + typename SuperVector::movemask_type z = mask & mask1.eqmask(v); + + return single_zscan(n, d, buf, z, len, cbi); +} + +// The short scan routine. It is used both to scan data up to an +// alignment boundary if needed and to finish off data that the aligned scan +// function can't handle (due to small/unaligned chunk at end) +template +static really_inline +hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, + SuperVector caseMask, SuperVector mask1, + const struct cb_info *cbi, size_t len, size_t offset, + size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + const size_t l = end - start; + DEBUG_PRINTF("l = %ld\n", l); + assert(l <= 64); + if (!l) { + return HWLM_SUCCESS; + } + size_t buf_off = start - offset; + typename SuperVector::movemask_type mask = SINGLE_LOAD_MASK(l) << buf_off; SuperVector v = SuperVector::loadu(d) & caseMask; typename SuperVector::movemask_type z = mask & mask1.eqmask(v); @@ -85,8 +112,8 @@ hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, template static really_inline -hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, - SuperVector caseMask, SuperVector mask1, SuperVector mask2, typename SuperVector::movemask_type *lastz1, +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + SuperVector caseMask, SuperVector mask1, SuperVector mask2, const struct cb_info *cbi, size_t len, size_t start, size_t end) { const u8 *d = buf + start; DEBUG_PRINTF("start %zu end %zu\n", start, end); @@ -95,13 +122,36 @@ hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, if (!l) { return HWLM_SUCCESS; } - SuperVector v = SuperVector::loadu(d) & caseMask; + SuperVector v = SuperVector::Zeroes(); + memcpy(&v.u, d, l); + v = v & caseMask; typename SuperVector::movemask_type mask = DOUBLE_LOAD_MASK(l); typename SuperVector::movemask_type z1 = mask1.eqmask(v); typename SuperVector::movemask_type z2 = mask2.eqmask(v); - typename SuperVector::movemask_type z = mask & (*lastz1 | z1 << 1) & z2; - *lastz1 = z1 >> (l -1); + typename SuperVector::movemask_type z = mask & (z1 << 1) & z2; + + return double_zscan(n, d, buf, z, len, cbi); +} + +template +static really_inline +hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, + SuperVector caseMask, SuperVector mask1, SuperVector mask2, + const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + const size_t l = end - start; + assert(l <= S); + if (!l) { + return HWLM_SUCCESS; + } + SuperVector v = SuperVector::loadu(d) & caseMask; + size_t buf_off = start - offset; + typename SuperVector::movemask_type mask = DOUBLE_LOAD_MASK(l) << buf_off; + typename SuperVector::movemask_type z1 = mask1.eqmask(v); + typename SuperVector::movemask_type z2 = mask2.eqmask(v); + typename SuperVector::movemask_type z = mask & (z1 << 1) & z2; return double_zscan(n, d, buf, z, len, cbi); } @@ -119,11 +169,14 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, const u8 *e = buf + end; DEBUG_PRINTF("start %p end %p \n", d, e); assert(d < e); + if (e - d < S) { + return scanSingleShort(n, buf, caseMask, mask1, cbi, len, start, end); + } if (d + S <= e) { // peel off first part to cacheline boundary const u8 *d1 = ROUNDUP_PTR(d, S); DEBUG_PRINTF("until aligned %p \n", d1); - if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) { + if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) { return HWLM_TERMINATED; } d = d1; @@ -147,8 +200,12 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, DEBUG_PRINTF("d %p e %p \n", d, e); // finish off tail + size_t s2End = ROUNDDOWN_PTR(e, S) - buf; + if (s2End == end) { + return HWLM_SUCCESS; + } - return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, d - buf, end); + return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, end - S, s2End, len); } template @@ -169,14 +226,17 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, const u8 *e = buf + end; DEBUG_PRINTF("start %p end %p \n", d, e); assert(d < e); + if (e - d < S) { + return scanDoubleShort(n, buf, caseMask, mask1, mask2, cbi, len, d - buf, end); + } if (d + S <= e) { // peel off first part to cacheline boundary - const u8 *d1 = ROUNDUP_PTR(d, S); + const u8 *d1 = ROUNDUP_PTR(d, S) + 1; DEBUG_PRINTF("until aligned %p \n", d1); - if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, start, d1 - buf) == HWLM_TERMINATED) { + if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) { return HWLM_TERMINATED; } - d = d1; + d = d1 - 1; size_t loops = (end - (d - buf)) / S; DEBUG_PRINTF("loops %ld \n", loops); @@ -196,12 +256,16 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); RETURN_IF_TERMINATED(rv); } + if (loops == 0) { + d = d1; + } } - - DEBUG_PRINTF("d %p e %p \n", d, e); // finish off tail - - return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, &lastz1, cbi, len, d - buf, end); + size_t s2End = ROUNDDOWN_PTR(e, S) - buf; + if (s2End == end) { + return HWLM_SUCCESS; + } + return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, end - S, d - buf, end); } // Single-character specialisation, used when keyLen = 1 diff --git a/src/nfa/arm/vermicelli.hpp b/src/nfa/arm/vermicelli.hpp index d790fa1f..496468e0 100644 --- a/src/nfa/arm/vermicelli.hpp +++ b/src/nfa/arm/vermicelli.hpp @@ -67,7 +67,7 @@ const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const ch return last_zero_match_inverted(buf, mask, len); } -template +template static really_inline const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { @@ -78,14 +78,16 @@ const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const SuperVector mask = mask1 & (mask2 >> 1); DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); - if (partial_match) return buf - 1; + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } return first_non_zero_match(buf, mask, len); } -template +template static really_inline const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { @@ -96,7 +98,7 @@ const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const SuperVector mask = (mask1 << 1)& mask2; DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); if (partial_match) { mask = mask | (SuperVector::Ones() >> (S-1)); @@ -105,7 +107,7 @@ const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const return last_non_zero_match(buf, mask, len); } -template +template static really_inline const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const mask1, SuperVector const mask2, @@ -116,9 +118,11 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector SuperVector mask = v1 & (v2 >> 1); DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & m1) == c2) && ((buf[-1] & m2) == c1)); + bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); - if (partial_match) return buf - 1; + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } return first_non_zero_match(buf, mask, len); } diff --git a/src/nfa/ppc64el/vermicelli.hpp b/src/nfa/ppc64el/vermicelli.hpp index eeaad6a1..1f3de25f 100644 --- a/src/nfa/ppc64el/vermicelli.hpp +++ b/src/nfa/ppc64el/vermicelli.hpp @@ -67,7 +67,7 @@ const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const ch return last_zero_match_inverted(buf, mask, len); } -template +template static really_inline const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { @@ -78,14 +78,16 @@ const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const SuperVector mask = mask1 & (mask2 >> 1); DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); - if (partial_match) return buf - 1; + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } return first_non_zero_match(buf, mask, len); } -template +template static really_inline const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { @@ -96,7 +98,7 @@ const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const SuperVector mask = (mask1 << 1)& mask2; DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); if (partial_match) { mask = mask | (SuperVector::Ones() >> (S-1)); @@ -105,7 +107,7 @@ const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const return last_non_zero_match(buf, mask, len); } -template +template static really_inline const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const mask1, SuperVector const mask2, @@ -116,9 +118,11 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector SuperVector mask = v1 & (v2 >> 1); DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & m1) == c2) && ((buf[-1] & m2) == c1)); + bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); - if (partial_match) return buf - 1; + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } return first_non_zero_match(buf, mask, len); } diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp index 887f2468..0f8e2a7b 100644 --- a/src/nfa/shufti_simd.hpp +++ b/src/nfa/shufti_simd.hpp @@ -128,8 +128,8 @@ const u8 *shuftiExecReal(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *bu // finish off tail if (d != buf_end) { - SuperVector chars = SuperVector::loadu_maskz(d, buf_end - d); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, d); + SuperVector chars = SuperVector::loadu(buf_end - S); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - S); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -240,22 +240,36 @@ const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 // finish off tail if (d != buf_end) { - SuperVector chars = SuperVector::loadu(d); - rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d); + SuperVector chars = SuperVector::Zeroes(); + const u8 *end_buf; + if (buf_end - buf < S) { + memcpy(&chars.u, buf, buf_end - buf); + end_buf = buf; + } else { + chars = SuperVector::loadu(buf_end - S); + end_buf = buf_end - S; + } + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, end_buf); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } - + return buf_end; } const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { - return shuftiExecReal(mask_lo, mask_hi, buf, buf_end); + if (buf_end - buf < VECTORSIZE) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, buf_end); + } + return shuftiExecReal(mask_lo, mask_hi, buf, buf_end); } const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { + if (buf_end - buf < VECTORSIZE) { + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, buf_end); + } return rshuftiExecReal(mask_lo, mask_hi, buf, buf_end); } diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index 8d9911fd..e07e92f6 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -107,8 +107,16 @@ const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highse // finish off tail if (d != buf_end) { - SuperVector chars = SuperVector::loadu_maskz(d, buf_end - d); - rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); + SuperVector chars = SuperVector::Zeroes(); + const u8* end_buf; + if (buf_end - buf < S) { + memcpy(&chars.u, buf, buf_end - buf); + end_buf = buf; + } else { + chars = SuperVector::loadu(buf_end - S); + end_buf = buf_end - S; + } + rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -171,7 +179,12 @@ const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highse // finish off head if (d != buf) { - SuperVector chars = SuperVector::loadu(buf); + SuperVector chars = SuperVector::Zeroes(); + if (buf_end - buf < S) { + memcpy(&chars.u, buf, buf_end - buf); + } else { + chars = SuperVector::loadu(buf); + } rv = revBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp index d790d137..17d99d55 100644 --- a/src/nfa/vermicelli_simd.cpp +++ b/src/nfa/vermicelli_simd.cpp @@ -55,17 +55,17 @@ template static really_inline const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, const u8 *buf, u16 const len); -template +template static really_inline const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len); -template +template static really_inline const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len); -template +template static really_inline const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const mask1, SuperVector const mask2, @@ -120,8 +120,8 @@ static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector c // finish off tail if (d != buf_end) { - SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); - rv = vermicelliBlock(data, chars, casemask, d, buf_end - d); + SuperVector data = SuperVector::loadu(buf_end - S); + rv = vermicelliBlock(data, chars, casemask, buf_end - S, buf_end - d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -170,8 +170,8 @@ static const u8 *nvermicelliExecReal(SuperVector const chars, SuperVector // finish off tail if (d != buf_end) { - SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); - rv = vermicelliBlockNeg(data, chars, casemask, d, buf_end - d); + SuperVector data = SuperVector::loadu(buf_end - S); + rv = vermicelliBlockNeg(data, chars, casemask, buf_end - S, buf_end - d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -316,17 +316,17 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector< if (!ISALIGNED_N(d, S)) { u8 const *d1 = ROUNDUP_PTR(d, S); SuperVector data = SuperVector::loadu(d); - rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S); - if (rv) return rv; + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d + S, S); + if (rv) return rv - S; d = d1; } - while(d + S <= buf_end) { + while(d + S < buf_end) { __builtin_prefetch(d + 64); DEBUG_PRINTF("d %p \n", d); SuperVector data = SuperVector::load(d); - rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S); - if (rv) return rv; + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d + S, S); + if (rv) return rv - S; d += S; } } @@ -335,8 +335,16 @@ static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector< // finish off tail if (d != buf_end) { - SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); - rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, buf_end - d); + SuperVector data = SuperVector::Zeroes(); + const u8* end_buf; + if (buf_end - buf < S) { + memcpy(&data.u, buf, buf_end - buf); + end_buf = buf; + } else { + data = SuperVector::loadu(buf_end - S); + end_buf = buf_end - S; + } + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, end_buf, buf_end - d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -403,8 +411,13 @@ const u8 *rvermicelliDoubleExecReal(char c1, char c2, SuperVector const casem // finish off head if (d != buf) { - SuperVector data = SuperVector::loadu(buf); - rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf, d - buf); + SuperVector data = SuperVector::Zeroes(); + if (d - buf < S) { + memcpy(&data.u, buf, d - buf); + } else { + data = SuperVector::loadu(buf); + } + rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf, d - buf); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -440,17 +453,17 @@ static const u8 *vermicelliDoubleMaskedExecReal(u8 const c1, u8 const c2, u8 con if (!ISALIGNED_N(d, S)) { u8 const *d1 = ROUNDUP_PTR(d, S); SuperVector data = SuperVector::loadu(d); - rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S); - if (rv) return rv; + rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d + S, S); + if (rv) return rv - S; d = d1; } - while(d + S <= buf_end) { + while(d + S < buf_end) { __builtin_prefetch(d + 64); DEBUG_PRINTF("d %p \n", d); SuperVector data = SuperVector::load(d); - rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S); - if (rv) return rv; + rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d + S, S); + if (rv) return rv - S; d += S; } } @@ -459,8 +472,16 @@ static const u8 *vermicelliDoubleMaskedExecReal(u8 const c1, u8 const c2, u8 con // finish off tail if (d != buf_end) { - SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); - rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, buf_end - d); + SuperVector data = SuperVector::Zeroes(); + const u8* end_buf; + if (buf_end - buf < S) { + memcpy(&data.u, buf, buf_end - buf); + end_buf = buf; + } else { + data = SuperVector::loadu(buf_end - S); + end_buf = buf_end - S; + } + rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, end_buf, buf_end - d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } @@ -480,6 +501,20 @@ extern "C" const u8 *vermicelliExec(char c, char nocase, const u8 *buf, const u8 nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf; + } + const SuperVector chars = SuperVector::dup_u8(c); const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; @@ -493,6 +528,20 @@ extern "C" const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, const u nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (; buf < buf_end; buf++) { + char cur = *buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf; + } + const SuperVector chars = SuperVector::dup_u8(c); const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; @@ -504,6 +553,20 @@ extern "C" const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, const u nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf_end; + } + const SuperVector chars = SuperVector::dup_u8(c); const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; @@ -515,6 +578,20 @@ extern "C" const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, const nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); assert(buf < buf_end); + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf_end; + } + const SuperVector chars = SuperVector::dup_u8(c); const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; diff --git a/src/nfa/x86/vermicelli.hpp b/src/nfa/x86/vermicelli.hpp index 8b461dfe..2f219f31 100644 --- a/src/nfa/x86/vermicelli.hpp +++ b/src/nfa/x86/vermicelli.hpp @@ -67,7 +67,7 @@ const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const ch return last_zero_match_inverted(buf, mask, len); } -template +template static really_inline const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { @@ -78,14 +78,16 @@ const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const SuperVector mask = mask1 & (mask2 >> 1); DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); - if (partial_match) return buf - 1; + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } return first_non_zero_match(buf, mask, len); } -template +template static really_inline const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { @@ -96,7 +98,7 @@ const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const SuperVector mask = (mask1 << 1)& mask2; DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); if (partial_match) { mask = mask | (SuperVector::Ones() >> (S-1)); @@ -105,7 +107,7 @@ const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const return last_non_zero_match(buf, mask, len); } -template +template static really_inline const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const mask1, SuperVector const mask2, @@ -116,9 +118,11 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector SuperVector mask = v1 & (v2 >> 1); DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); - bool partial_match = (((buf[0] & m1) == c2) && ((buf[-1] & m2) == c1)); + bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1)); DEBUG_PRINTF("partial = %d\n", partial_match); - if (partial_match) return buf - 1; + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } return first_non_zero_match(buf, mask, len); } diff --git a/unit/hyperscan/allocators.cpp b/unit/hyperscan/allocators.cpp index 40c45072..a30a3702 100644 --- a/unit/hyperscan/allocators.cpp +++ b/unit/hyperscan/allocators.cpp @@ -99,7 +99,7 @@ TEST(CustomAllocator, TwoAlignedCompileError) { ASSERT_NE(nullptr, compile_err); EXPECT_STREQ("Allocator returned misaligned memory.", compile_err->message); hs_free_compile_error(compile_err); - hs_set_database_allocator(nullptr, nullptr); + hs_set_misc_allocator(nullptr, nullptr); } TEST(CustomAllocator, TwoAlignedDatabaseInfo) { diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index f1a03d5a..deb85e9f 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -36,6 +36,9 @@ #include"util/supervector/supervector.hpp" #include "nfa/limex_shuffle.hpp" +#ifdef setbit +#undef setbit +#endif namespace { diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index bc2421dc..69f1a64c 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -33,6 +33,10 @@ #include "util/bytecode_ptr.h" #include "util/simd_utils.h" +#ifdef setbit +#undef setbit +#endif + using namespace std; using namespace ue2;