diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index bfe976ce..8d61722b 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -43,37 +43,18 @@ template static really_inline -SuperVector block(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars); - chars.print8("chars"); - shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); - shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); - - SuperVector highconst = SuperVector::dup_u8(0x80); - highconst.print8("highconst"); - SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); - shuf_mask_hi.print8("shuf_mask_hi"); - - SuperVector shuf1 = shuf_mask_lo_highclear.template pshufb(chars); - shuf1.print8("shuf1"); - SuperVector t1 = chars ^ highconst; - t1.print8("t1"); - SuperVector shuf2 = shuf_mask_lo_highset.template pshufb(t1); - shuf2.print8("shuf2"); - SuperVector t2 = highconst.opandnot(chars.template vshr_64_imm<4>()); - t2.print8("t2"); - SuperVector shuf3 = shuf_mask_hi.template pshufb(t2); - shuf3.print8("shuf3"); - SuperVector res = (shuf1 | shuf2) & shuf3; - res.print8("(shuf1 | shuf2) & shuf3"); - - return !res.eq(SuperVector::Zeroes());//{(m128)vcgtq_u8((uint8x16_t)tmp.u.v128[0], vdupq_n_u8(0))}; -} +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "x86/truffle.hpp" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "arm/truffle.hpp" +#endif template static really_inline const u8 *fwdBlock(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars, const u8 *buf) { - SuperVector res = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + SuperVector res = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); return firstMatch(buf, res); } @@ -98,23 +79,26 @@ const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highse __builtin_prefetch(d + 2*64); __builtin_prefetch(d + 3*64); __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); if (d + S <= buf_end) { // Reach vector aligned boundaries DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); if (!ISALIGNED_N(d, S)) { SuperVector chars = SuperVector::loadu(d); + const u8 *dup = ROUNDUP_PTR(d, S); rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); - if (rv) return rv; - d = ROUNDUP_PTR(d, S); + if (rv && rv < dup) return rv; + d = dup; } - while(d + S <= buf_end) { + while(d + S <= buf_end) { __builtin_prefetch(d + 64); DEBUG_PRINTF("d %p \n", d); SuperVector chars = SuperVector::load(d); rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); if (rv) return rv; - d += S; + d += S; } } @@ -122,44 +106,23 @@ const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highse // finish off tail if (d != buf_end) { - SuperVector chars = SuperVector::loadu(d); + SuperVector chars = SuperVector::loadu_maskz(d, buf_end - d); rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); DEBUG_PRINTF("rv %p \n", rv); - if (rv) return rv; + if (rv && rv < buf_end) return rv; } return buf_end; } - -template -static really_inline const u8 *truffleRevMini(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end){ - uintptr_t len = buf_end - buf; - DEBUG_PRINTF("buf %p len %ld\n", buf, len); - assert(len < S); - - SuperVector chars = SuperVector::loadu_maskz(buf, len); - - SuperVector v = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - const u8 *rv = lastMatch(buf, v); - DEBUG_PRINTF("rv %p buf+len %p \n", rv, buf+len); - - if (rv && rv < buf+len) { - return rv; - } - return buf - 1; -} - template static really_inline const u8 *revBlock(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector v, const u8 *buf) { - SuperVector res = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + SuperVector res = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); return lastMatch(buf, res); } - template const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){ assert(buf && buf_end); @@ -173,42 +136,45 @@ const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highse const u8 *d = buf_end; const u8 *rv; + __builtin_prefetch(d - 64); + __builtin_prefetch(d - 2*64); + __builtin_prefetch(d - 3*64); + __builtin_prefetch(d - 4*64); DEBUG_PRINTF("start %p end %p \n", buf, d); assert(d > buf); if (d - S >= buf) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); if (!ISALIGNED_N(d, S)) { - // peel off first part to cacheline boundary - const u8 *d1 = ROUNDDOWN_PTR(d, S); - DEBUG_PRINTF("until aligned %p \n", d1); - if (d1 != d) { - rv = truffleRevMini(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, d1, d); - if (rv != d1 - 1) return rv; - d = d1; - } + SuperVector chars = SuperVector::loadu(d - S); + const u8 *dbot = ROUNDDOWN_PTR(d, S); + rv = revBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d - S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv >= dbot) return rv; + d = dbot; } while (d - S >= buf) { - d -= S; - DEBUG_PRINTF("d %p \n", d); + DEBUG_PRINTF("aligned %p \n", d); // On large packet buffers, this prefetch appears to get us about 2%. __builtin_prefetch(d - 64); - + + d -= S; SuperVector chars = SuperVector::load(d); rv = revBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); if (rv) return rv; } } - DEBUG_PRINTF("tail: d %p e %p \n", buf, d); - // finish off tail + DEBUG_PRINTF("tail d %p e %p \n", buf, d); + // finish off head if (d != buf) { - rv = truffleRevMini(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, buf, d); + SuperVector chars = SuperVector::loadu(buf); + rv = revBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf); DEBUG_PRINTF("rv %p \n", rv); - if (rv >= buf && rv < buf_end) return rv; + if (rv && rv < buf_end) return rv; } - + return buf - 1; } - - diff --git a/src/nfa/x86/truffle.hpp b/src/nfa/x86/truffle.hpp new file mode 100644 index 00000000..7dc711f4 --- /dev/null +++ b/src/nfa/x86/truffle.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. + * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { + + chars.print8("chars"); + shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); + shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); + + SuperVector highconst = SuperVector::dup_u8(0x80); + highconst.print8("highconst"); + SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); + shuf_mask_hi.print8("shuf_mask_hi"); + + SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(chars); + shuf1.print8("shuf1"); + SuperVector t1 = chars ^ highconst; + t1.print8("t1"); + SuperVector shuf2 = shuf_mask_lo_highset.pshufb(t1); + shuf2.print8("shuf2"); + SuperVector t2 = highconst.opandnot(chars.template vshr_64_imm<4>()); + t2.print8("t2"); + SuperVector shuf3 = shuf_mask_hi.pshufb(t2); + shuf3.print8("shuf3"); + SuperVector res = (shuf1 | shuf2) & shuf3; + res.print8("(shuf1 | shuf2) & shuf3"); + + return res.eq(SuperVector::Zeroes()); +}