From d04b899c29ad6d9a62434871848f82063951fe39 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 12 Jul 2021 21:12:05 +0300 Subject: [PATCH] fix truffle SIMD for S>16 as well --- src/nfa/truffle_simd.hpp | 83 ++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index 0d57650b..eeba8b0c 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -57,6 +57,19 @@ typename SuperVector::movemask_type block(SuperVector shuf_mask_lo_highcle SuperVector shuf3 = shuf_mask_hi.pshufb(t2); SuperVector tmp = (shuf1 | shuf2) & shuf3; + shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); + shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); + v.print8("v"); + highconst.print8("highconst"); + shuf_mask_hi.print8("shuf_mask_hi"); + shuf1.print8("shuf1"); + t1.print8("t1"); + shuf2.print8("shuf2"); + t2.print8("t2"); + shuf3.print8("shuf3"); + tmp.print8("tmp"); + DEBUG_PRINTF("z %08x \n", tmp.eqmask(SuperVector::Zeroes())); + return tmp.eqmask(SuperVector::Zeroes()); } @@ -64,20 +77,20 @@ template static really_inline const u8 *truffleMini(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){ uintptr_t len = buf_end - buf; - assert(len < 16); + assert(len < S); - SuperVector chars = SuperVector::Zeroes(); - memcpy(&chars.u.u8[0], buf, len); + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + SuperVector chars = SuperVector::loadu_maskz(buf, len); + chars.print8("chars"); - u32 mask = (0xffff >> (16 - len)) ^ 0xffff; typename SuperVector::movemask_type z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - const u8 *rv = firstMatch(buf, z | mask); + const u8 *rv = firstMatch(buf, z); + DEBUG_PRINTF("rv %p buf+len %p \n", rv, buf+len); - if (rv) { + if (rv && rv < buf+len) { return rv; - } else { - return buf_end; } + return buf_end; } template @@ -91,7 +104,7 @@ const u8 *fwdBlock(SuperVector shuf_mask_lo_highclear, SuperVector shuf_ma template -const u8 *truffleExecReal(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { +const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { assert(buf && buf_end); assert(buf < buf_end); DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf); @@ -107,15 +120,17 @@ const u8 *truffleExecReal(SuperVector shuf_mask_lo_highclear, SuperVector assert(d < buf_end); if (d + S <= buf_end) { - // peel off first part to cacheline boundary - const u8 *d1 = ROUNDUP_PTR(d, S); - DEBUG_PRINTF("until aligned %p \n", d1); - if (d1 != d) { - rv = truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, d, d1); - if (rv != d1) { - return rv; + if (!ISALIGNED_N(d, S)) { + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, S); + DEBUG_PRINTF("until aligned %p \n", d1); + if (d1 != d) { + rv = truffleMini(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, d, d1); + if (rv != d1) { + return rv; + } + d = d1; } - d = d1; } size_t loops = (buf_end - d) / S; @@ -138,7 +153,7 @@ const u8 *truffleExecReal(SuperVector shuf_mask_lo_highclear, SuperVector rv = buf_end; if (d != buf_end) { - rv = truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, d, buf_end); + rv = truffleMini(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, d, buf_end); DEBUG_PRINTF("rv %p \n", rv); } @@ -150,16 +165,16 @@ template static really_inline const u8 *truffleRevMini(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){ uintptr_t len = buf_end - buf; - assert(len < 16); + DEBUG_PRINTF("buf %p len %ld\n", buf, len); + assert(len < S); - SuperVector chars = SuperVector::loadu(buf); + SuperVector chars = SuperVector::loadu_maskz(buf, len); - u32 mask = (0xffff >> (16 - len)) ^ 0xffff; - typename SuperVector::movemask_type z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - const u8 *rv = lastMatch(buf,z | mask); + const u8 *rv = lastMatch(buf, z); + DEBUG_PRINTF("rv %p buf+len %p \n", rv, buf+len); - if (rv) { + if (rv && rv < buf+len) { return rv; } return buf - 1; @@ -176,7 +191,7 @@ const u8 *revBlock(SuperVector shuf_mask_lo_highclear, SuperVector shuf_ma template -const u8 *rtruffleExecReal(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){ +const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){ assert(buf && buf_end); assert(buf < buf_end); DEBUG_PRINTF("trufle %p len %zu\n", buf, buf_end - buf); @@ -191,13 +206,15 @@ const u8 *rtruffleExecReal(SuperVector shuf_mask_lo_highclear, SuperVector DEBUG_PRINTF("start %p end %p \n", buf, d); assert(d > buf); if (d - S >= buf) { - // peel off first part to cacheline boundary - const u8 *d1 = ROUNDDOWN_PTR(d, S); - DEBUG_PRINTF("until aligned %p \n", d1); - if (d1 != d) { - rv = truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, d1, d); - if (rv != d1 - 1) return rv; - d = d1; + if (!ISALIGNED_N(d, S)) { + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDDOWN_PTR(d, S); + DEBUG_PRINTF("until aligned %p \n", d1); + if (d1 != d) { + rv = truffleRevMini(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, d1, d); + if (rv != d1 - 1) return rv; + d = d1; + } } while (d - S >= buf) { @@ -217,7 +234,7 @@ const u8 *rtruffleExecReal(SuperVector shuf_mask_lo_highclear, SuperVector // finish off tail if (d != buf) { - rv = truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, d); + rv = truffleRevMini(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, buf, d); DEBUG_PRINTF("rv %p \n", rv); if (rv) return rv; }