From 68f6849687c237425dff08c8b00efbeedf06e8c5 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Wed, 9 Dec 2015 12:36:12 +0000 Subject: [PATCH] Adding AVX2 version of truffle --- src/nfa/truffle.c | 204 ++++++++++++++++++++++++++++----------- src/nfa/truffle.h | 8 ++ src/nfa/truffle_common.h | 149 ++++++++++++++++++++++++++++ 3 files changed, 304 insertions(+), 57 deletions(-) create mode 100644 src/nfa/truffle_common.h diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index 86dcda63..8863c71a 100644 --- a/src/nfa/truffle.c +++ b/src/nfa/truffle.c @@ -37,18 +37,9 @@ #include "util/simd_utils.h" #include "util/simd_utils_ssse3.h" -#define shift128r(a, b) _mm_srli_epi64((a), (b)) +#include "truffle_common.h" -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = ctz32(~z & 0xffff); - assert(pos < 16); - return buf + pos; - } - - return NULL; // no match -} +#if !defined(__AVX2__) static really_inline const u8 *lastMatch(const u8 *buf, u32 z) { @@ -61,25 +52,6 @@ const u8 *lastMatch(const u8 *buf, u32 z) { return NULL; // no match } -static really_inline -u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { - - m128 highconst = _mm_set1_epi8(0x80); - m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); - - // and now do the real work - m128 shuf1 = pshufb(shuf_mask_lo_highclear, v); - m128 t1 = xor128(v, highconst); - m128 shuf2 = pshufb(shuf_mask_lo_highset, t1); - m128 t2 = andnot128(highconst, shift128r(v, 4)); - m128 shuf3 = pshufb(shuf_mask_hi, t2); - m128 tmp = and128(or128(shuf1, shuf2), shuf3); - m128 tmp2 = eq128(tmp, zeroes128()); - u32 z = movemask128(tmp2); - - return z; -} - static really_inline const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v, const u8 *buf) { @@ -94,30 +66,9 @@ const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, return lastMatch(buf, z); } -static -const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 16); - - m128 chars = zeroes128(); - memcpy(&chars, buf, len); - - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - // can't be these bytes in z - u32 mask = (0xFFFF >> (16 - len)) ^ 0xFFFF; - const u8 *rv = firstMatch(buf, z| mask); - - if (rv) { - return rv; - } else { - return buf_end; - } -} - const u8 *truffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { DEBUG_PRINTF("len %zu\n", buf_end - buf); assert(buf && buf_end); @@ -166,8 +117,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear, static const u8 *truffleRevMini(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, const u8 *buf, - const u8 *buf_end) { + m128 shuf_mask_lo_highset, const u8 *buf, + const u8 *buf_end) { uintptr_t len = buf_end - buf; assert(len < 16); @@ -184,11 +135,9 @@ const u8 *truffleRevMini(m128 shuf_mask_lo_highclear, return buf - 1; } - const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { - assert(buf && buf_end); assert(buf < buf_end); const u8 *rv; @@ -233,4 +182,145 @@ const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, return buf - 1; } +#else +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = clz32(~z); + assert(pos < 32); + return buf + (31 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + m256 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + m256 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + const m256 wide_clear = set2x128(shuf_mask_lo_highclear); + const m256 wide_set = set2x128(shuf_mask_lo_highset); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf < 32) { + return truffleMini(wide_clear, wide_set, buf, buf_end); + } + + size_t min = (size_t)buf % 32; + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf); + rv = fwdBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + buf += (32 - min); + + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = fwdBlock(wide_clear, wide_set, lchars, buf); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 32); + chars = loadu256(buf_end - 32); + rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32); + if (rv) { + return rv; + } + return buf_end; +} + +static +const u8 *truffleRevMini(m256 shuf_mask_lo_highclear, + m256 shuf_mask_lo_highset, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 32); + + m256 chars = zeroes256(); + memcpy(&chars, buf, len); + + u32 mask = (0xFFFFFFFF >> (32 - len)) ^ 0xFFFFFFFF; + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + const u8 *rv = lastMatch(buf, z | mask); + + if (rv) { + return rv; + } + return buf - 1; +} + + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + const m256 wide_clear = set2x128(shuf_mask_lo_highclear); + const m256 wide_set = set2x128(shuf_mask_lo_highset); + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 32) { + return truffleRevMini(wide_clear, wide_set, buf, buf_end); + } + + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf_end - 32); + rv = revBlock(wide_clear, wide_set, chars, + buf_end - 32); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); + + const u8 *last_block = buf + 32; + while (buf_end > last_block) { + buf_end -= 32; + m256 lchars = load256(buf_end); + rv = revBlock(wide_clear, wide_set, lchars, buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + chars = loadu256(buf); + rv = revBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + return buf - 1; +} + +#endif diff --git a/src/nfa/truffle.h b/src/nfa/truffle.h index cf5f6346..f67227ad 100644 --- a/src/nfa/truffle.h +++ b/src/nfa/truffle.h @@ -26,9 +26,17 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/** \file + * \brief Truffle: fully general character class acceleration. + * + * Utilises the SSSE3 pshufb or AVX2 vpshufb shuffle instructions + */ + #ifndef TRUFFLE_H #define TRUFFLE_H + #include "util/simd_types.h" + #ifdef __cplusplus extern "C" { diff --git a/src/nfa/truffle_common.h b/src/nfa/truffle_common.h new file mode 100644 index 00000000..122f65c4 --- /dev/null +++ b/src/nfa/truffle_common.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TRUFFLE_COMMON_H_ +#define TRUFFLE_COMMON_H_ + +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/simd_utils_ssse3.h" + +/* + * Common stuff for all versions of truffle (single, multi and multidouble) + */ +#if !defined(__AVX2__) + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + assert(pos < 16); + return buf + pos; + } + + return NULL; // no match +} + +#define shift128r(a, b) _mm_srli_epi64((a), (b)) +static really_inline +u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { + + m128 highconst = _mm_set1_epi8(0x80); + m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); + + // and now do the real work + m128 shuf1 = pshufb(shuf_mask_lo_highclear, v); + m128 t1 = xor128(v, highconst); + m128 shuf2 = pshufb(shuf_mask_lo_highset, t1); + m128 t2 = andnot128(highconst, shift128r(v, 4)); + m128 shuf3 = pshufb(shuf_mask_hi, t2); + m128 tmp = and128(or128(shuf1, shuf2), shuf3); + m128 tmp2 = eq128(tmp, zeroes128()); + u32 z = movemask128(tmp2); + + return z; +} + +static +const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 16); + + m128 chars = zeroes128(); + memcpy(&chars, buf, len); + + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + // can't be these bytes in z + u32 mask = (0xFFFF >> (16 - len)) ^ 0xFFFF; + const u8 *rv = firstMatch(buf, z| mask); + + if (rv) { + return rv; + } else { + return buf_end; + } +} + +#else + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = ctz32(~z); + assert(pos < 32); + return buf + pos; + } + + return NULL; // no match +} + +#define shift256r(a, b) _mm256_srli_epi64((a), (b)) +static really_inline +u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { + + m256 highconst = _mm256_set1_epi8(0x80); + m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); + + // and now do the real work + m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v); + m256 t1 = xor256(v, highconst); + m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1); + m256 t2 = andnot256(highconst, shift256r(v, 4)); + m256 shuf3 = vpshufb(shuf_mask_hi, t2); + m256 tmp = and256(or256(shuf1, shuf2), shuf3); + m256 tmp2 = eq256(tmp, zeroes256()); + u32 z = movemask256(tmp2); + + return z; +} + +static +const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 32); + + m256 chars = zeroes256(); + memcpy(&chars, buf, len); + + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + // can't be these bytes in z + u32 mask = (0xFFFFFFFF >> (32 - len)) ^ 0xFFFFFFFF; + const u8 *rv = firstMatch(buf, z | mask); + + if (rv) { + return rv; + } else { + return buf_end; + } +} + +#endif + +#endif /* TRUFFLE_COMMON_H_ */