From 23b075cbd48c504ef2b0fe614de264765865768a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 10 Jun 2021 13:34:38 +0300 Subject: [PATCH] refactor shufti algorithm to use SuperVector class, WIP --- CMakeLists.txt | 14 +- src/nfa/shufti.c | 1102 --------------------------------------- src/nfa/shufti.cpp | 127 +++++ src/nfa/shufti.h | 2 +- src/nfa/shufti_simd.hpp | 367 +++++++++++++ 5 files changed, 508 insertions(+), 1104 deletions(-) delete mode 100644 src/nfa/shufti.c create mode 100644 src/nfa/shufti.cpp create mode 100644 src/nfa/shufti_simd.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index fa9648f7..8b46e610 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -691,7 +691,7 @@ set (hs_exec_SRCS src/nfa/sheng_impl.h src/nfa/sheng_impl4.h src/nfa/sheng_internal.h - src/nfa/shufti.c + src/nfa/shufti.cpp src/nfa/shufti.h src/nfa/tamarama.c src/nfa/tamarama.h @@ -753,6 +753,18 @@ set (hs_exec_SRCS src/database.h ) +if (NOT OPTIMISE) +if (ARCH_IA32 OR ARCH_X86_64) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/util/simd/arch/x86/impl.cpp) +else (ARCH_ARM32 OR ARCH_AARCH64) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/util/simd/arch/arm/impl.cpp) +endif () +endif() + set (hs_exec_avx2_SRCS src/fdr/teddy_avx2.c src/util/arch/x86/masked_move.c diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c deleted file mode 100644 index 2c30ce5c..00000000 --- a/src/nfa/shufti.c +++ /dev/null @@ -1,1102 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Shufti: character class acceleration. 
- * - * Utilises the SSSE3 pshufb shuffle instruction - */ - -#include "shufti.h" -#include "ue2common.h" -#include "util/arch.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" -#include "util/unaligned.h" - -#ifdef DEBUG -#include - -#define DUMP_MSK(_t) \ -static UNUSED \ -void dumpMsk##_t(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - for (int j = 0; j < 8; j++) { \ - if ((c >> (7-j)) & 0x1) \ - printf("1"); \ - else \ - printf("0"); \ - } \ - printf(" "); \ - } \ -} \ -static UNUSED \ -void dumpMsk##_t##AsChars(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - if (isprint(c)) \ - printf("%c",c); \ - else \ - printf("."); \ - } \ -} - -#endif - -/** \brief Naive byte-by-byte implementation. */ -static really_inline -const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { - assert(buf < buf_end); - - for (; buf < buf_end; ++buf) { - u8 c = *buf; - if (lo[c & 0xf] & hi[c >> 4]) { - break; - } - } - return buf; -} - -/** \brief Naive byte-by-byte implementation. */ -static really_inline -const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { - assert(buf < buf_end); - - for (buf_end--; buf_end >= buf; buf_end--) { - u8 c = *buf_end; - if (lo[c & 0xf] & hi[c >> 4]) { - break; - } - } - return buf_end; -} - -#if !defined(HAVE_AVX2) -/* Normal SSSE3 shufti */ - -#ifdef DEBUG -DUMP_MSK(128) -#endif - -#define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) and128(rshift64_m128(chars, 4), low4bits) -//#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) - -static really_inline -u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, - const m128 compare) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -#endif - return movemask128(eq128(t, compare)); -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = ctz32(~z & 0xffff); - assert(pos < 16); - return buf + pos; - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -} - -const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m128 zeroes = zeroes128(); - const m128 low4bits = set1_16x8(0xf); - const u8 *rv; - - size_t min = (size_t)buf % 16; - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf); - rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += (16 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. 
- // Reroll FTW. - - const u8 *last_block = buf_end - 16; - - for (const u8 *itPtr = ROUNDDOWN_PTR(buf, 64); itPtr + 4*16 <= last_block; itPtr += 4*16) { - __builtin_prefetch(itPtr); - } - while (buf < last_block) { - m128 lchars = load128(buf); - rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += 16; - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 16); - chars = loadu128(buf_end - 16); - rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf_end; -} - -static really_inline -const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { -#ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); -#endif - - u32 z = movemask128(eq128(t, compare)); - if (unlikely(z != 0xffff)) { - u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); - } else { - return NULL; // no match - } -} - - -static really_inline -const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -#endif - - return lastMatch(buf, t, zeroes); -} - -const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m128 zeroes = zeroes128(); - const m128 low4bits = set1_16x8(0xf); - const u8 *rv; - - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf_end - 16); - rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); - if (rv) { - return rv; - } - buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf)); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - - const u8 *last_block = buf + 16; - while (buf_end > last_block) { - buf_end -= 16; - m128 lchars = load128(buf_end); - rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes); - if (rv) { - return rv; - } - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf. 
- chars = loadu128(buf); - rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf - 1; -} - -static really_inline -const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, - m128 chars, const u8 *buf, const m128 low4bits, - const m128 ones) { - m128 chars_lo = GET_LO_4(chars); - m128 chars_hi = GET_HI_4(chars); - m128 c_lo = pshufb_m128(mask1_lo, chars_lo); - m128 c_hi = pshufb_m128(mask1_hi, chars_hi); - m128 t = or128(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -#endif - - m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); - m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); - m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); - -#ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); -#endif - - u32 z = movemask128(eq128(t2, ones)); - DEBUG_PRINTF(" z: 0x%08x\n", z); - return firstMatch(buf, z); -} - -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { - const m128 ones = ones128(); - const m128 low4bits = set1_16x8(0xf); - const u8 *rv; - - size_t min = (size_t)buf % 16; - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += (16 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - - const u8 *last_block = buf_end - 16; - while (buf < last_block) { - m128 lchars = load128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - lchars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += 16; - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. 
- chars = loadu128(buf_end - 16); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf_end - 16, low4bits, ones); - if (rv) { - return rv; - } - - return buf_end; -} - -#elif !defined(HAVE_AVX512) -// AVX2 - 256 wide shuftis - -#ifdef DEBUG -DUMP_MSK(256) -#endif - -#define GET_LO_4(chars) and256(chars, low4bits) -#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) - -static really_inline -u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, - const m256 compare) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - return movemask256(eq256(t, compare)); -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - DEBUG_PRINTF("z 0x%08x\n", z); - if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); - assert(pos < 32); - DEBUG_PRINTF("match @ pos %u\n", pos); - return buf + pos; - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); - m256 c_shuf = pshufb_m256(mask, c); - m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); - // the upper 32-bits can't match - u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); - - return firstMatch(buf, z); -} - -static really_inline -const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - m128 chars = loadu128(buf); - const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits); - if (rv) { - return rv; - } - - chars = loadu128(buf_end - 16); - rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits); - if (rv) { - return rv; - } - return buf_end; -} - -static really_inline -const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -} - -/* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m256 low4bits = set1_32x8(0xf); - - if (buf_end - buf <= 32) { - return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); - } - - const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set1_2x128(mask_lo); - const m256 wide_mask_hi = set1_2x128(mask_hi); - const u8 *rv; - - size_t min = (size_t)buf % 32; - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. 
- m256 chars = loadu256(buf); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += (32 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += 32; - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 32); - chars = loadu256(buf_end - 32); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf_end; -} - -static really_inline -const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); - return buf + (31 - pos); - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - u32 z = movemask256(eq256(t, zeroes)); - return lastMatch(buf, z); -} - -static really_inline -const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); - m256 c_shuf = pshufb_m256(mask, c); - m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); - // the upper 32-bits can't match - u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); - - return lastMatch(buf, z); -} - -static really_inline -const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - - m128 chars = loadu128(buf_end - 16); - const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits); - if (rv) { - return rv; - } - - chars = loadu128(buf); - rv = revBlockShort(mask, chars, buf, low4bits); - if (rv) { - return rv; - } - return buf - 1; -} - - -/* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m256 low4bits = set1_32x8(0xf); - - if (buf_end - buf <= 32) { - return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); - } - - const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set1_2x128(mask_lo); - const m256 wide_mask_hi = set1_2x128(mask_hi); - const u8 *rv; - - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. 
- m256 chars = loadu256(buf_end - 32); - rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); - if (rv) { - return rv; - } - buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - const u8 *last_block = buf + 32; - while (buf_end > last_block) { - buf_end -= 32; - m256 lchars = load256(buf_end); - rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); - if (rv) { - return rv; - } - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf. - chars = loadu256(buf); - rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf - 1; -} - -static really_inline -const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, - m256 chars, const u8 *buf, const m256 low4bits, - const m256 ones) { - DEBUG_PRINTF("buf %p\n", buf); - m256 chars_lo = GET_LO_4(chars); - m256 chars_hi = GET_HI_4(chars); - m256 c_lo = pshufb_m256(mask1_lo, chars_lo); - m256 c_hi = pshufb_m256(mask1_hi, chars_hi); - m256 t = or256(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); - m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); - m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); - -#ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); -#endif - u32 z = movemask256(eq256(t2, ones)); - - return firstMatch(buf, z); -} - -static really_inline -const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, - const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); - m256 c_shuf1 = pshufb_m256(mask1, c); - m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1); - m256 t0 = or256(c_shuf1, c_shuf2); - m128 t = or128(movdq_hi(t0), cast256to128(t0)); - // the upper 32-bits can't match - u32 z = 0xffff0000U | movemask128(eq128(t, ones128())); - - return firstMatch(buf, z); -} - -static really_inline -const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - m128 mask2_hi, const u8 *buf, const u8 *buf_end) { - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - const m256 low4bits = set1_32x8(0xf); - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask1 = combine2x128(mask1_hi, mask1_lo); - const m256 mask2 = combine2x128(mask2_hi, mask2_lo); - m128 chars = loadu128(buf); - const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits); - if (rv) { - return rv; - } - - chars = loadu128(buf_end - 16); - rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits); - if (rv) { - return rv; - } - return buf_end; -} - -/* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); 
- - if (buf_end - buf < 32) { - return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, - buf_end); - } - - const m256 ones = ones256(); - const m256 low4bits = set1_32x8(0xf); - const m256 wide_mask1_lo = set1_2x128(mask1_lo); - const m256 wide_mask1_hi = set1_2x128(mask1_hi); - const m256 wide_mask2_lo = set1_2x128(mask2_lo); - const m256 wide_mask2_hi = set1_2x128(mask2_hi); - const u8 *rv; - - size_t min = (size_t)buf % 32; - - // Preconditioning: most of the time our buffer won't be aligned. - m256 chars = loadu256(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - chars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += (32 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - lchars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += 32; - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. - chars = loadu256(buf_end - 32); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - chars, buf_end - 32, low4bits, ones); - if (rv) { - return rv; - } - - return buf_end; -} - -#else // defined(HAVE_AVX512) - -#ifdef DEBUG -DUMP_MSK(512) -#endif - -static really_inline -u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, - const m512 compare) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -#endif - - return eq512mask(t, compare); -} -static really_inline -const u8 *firstMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = ctz64(~z); - DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos < 64); - return buf + pos; - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch64(buf, z); -} - -static really_inline -const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { - DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); - - // load mask - u64a k = (~0ULL) >> (64 - len); - DEBUG_PRINTF("load mask 0x%016llx\n", k); - - m512 chars = loadu_maskz_m512(k, buf); - - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - // reuse the load mask to indicate valid bytes - return firstMatch64(buf, z | ~k); -} - -/* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); - DEBUG_PRINTF("b %s\n", buf); - - const m512 low4bits = set1_64x8(0xf); - const m512 zeroes = zeroes512(); - 
const m512 wide_mask_lo = set1_4x128(mask_lo); - const m512 wide_mask_hi = set1_4x128(mask_hi); - const u8 *rv; - - // small cases. - if (buf_end - buf <= 64) { - rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, - zeroes); - return rv ? rv : buf_end; - } - - assert(buf_end - buf >= 64); - - // Preconditioning: most of the time our buffer won't be aligned. - if ((uintptr_t)buf % 64) { - rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, - ROUNDUP_PTR(buf, 64), low4bits, zeroes); - if (rv) { - return rv; - } - buf = ROUNDUP_PTR(buf, 64); - } - - const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64); - while (buf < last_block) { - m512 lchars = load512(buf); - rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, - zeroes); - if (rv) { - return rv; - } - buf += 64; - } - - if (buf == buf_end) { - goto done; - } - - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 64); - m512 chars = loadu512(buf_end - 64); - rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits, - zeroes); - if (rv) { - return rv; - } -done: - return buf_end; -} - -static really_inline -const u8 *lastMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = clz64(~z); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); - return buf + (63 - pos); - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); - - // load mask - u64a k = (~0ULL) >> (64 - len); - DEBUG_PRINTF("load mask 0x%016llx\n", k); - - m512 chars = loadu_maskz_m512(k, buf); - - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - // reuse the load mask to indicate valid bytes - return lastMatch64(buf, z | ~k); -} - -static really_inline -const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -#endif - - u64a z = eq512mask(t, zeroes); - return lastMatch64(buf, z); -} - -/* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end); - assert(buf && buf_end); - assert(buf < buf_end); - - const m512 low4bits = set1_64x8(0xf); - const m512 zeroes = zeroes512(); - const m512 wide_mask_lo = set1_4x128(mask_lo); - const m512 wide_mask_hi = set1_4x128(mask_hi); - const u8 *rv; - - if (buf_end - buf < 64) { - rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, - zeroes); - return rv ? 
rv : buf - 1; - } - - if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) { - // peel off unaligned portion - assert(buf_end - buf >= 64); - DEBUG_PRINTF("start\n"); - rv = rshortShufti512(wide_mask_lo, wide_mask_hi, - ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits, - zeroes); - if (rv) { - return rv; - } - buf_end = ROUNDDOWN_PTR(buf_end, 64); - } - - const u8 *last_block = ROUNDUP_PTR(buf, 64); - while (buf_end > last_block) { - buf_end -= 64; - m512 lchars = load512(buf_end); - rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, - zeroes); - if (rv) { - return rv; - } - } - if (buf_end == buf) { - goto done; - } - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf. - m512 chars = loadu512(buf); - rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } -done: - return buf - 1; -} - -static really_inline -const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, - m512 chars, const u8 *buf, const m512 low4bits, - const m512 ones, __mmask64 k) { - DEBUG_PRINTF("buf %p %.64s\n", buf, buf); - m512 chars_lo = and512(chars, low4bits); - m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); - m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); - m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); - m512 t = or512(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -#endif - - m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); - m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); - m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); - -#ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); -#endif - u64a z = eq512mask(t2, ones); - - return firstMatch64(buf, z | ~k); -} - -static really_inline -const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, - m512 mask2_hi, const u8 *buf, const u8 *buf_end, - const m512 low4bits, const m512 ones) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); - - u64a k = (~0ULL) >> (64 - len); - DEBUG_PRINTF("load mask 0x%016llx\n", k); - - m512 chars = loadu_mask_m512(ones, k, buf); - - const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, - low4bits, ones, k); - - return rv; -} - -/* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - - const m512 ones = ones512(); - const m512 low4bits = set1_64x8(0xf); - const m512 wide_mask1_lo = set1_4x128(mask1_lo); - const m512 wide_mask1_hi = set1_4x128(mask1_hi); - const m512 wide_mask2_lo = set1_4x128(mask2_lo); - const m512 wide_mask2_hi = set1_4x128(mask2_hi); - const u8 *rv; - - if (buf_end - buf <= 64) { - rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, buf, buf_end, low4bits, ones); - DEBUG_PRINTF("rv %p\n", rv); - return rv ? 
rv : buf_end; - } - - // Preconditioning: most of the time our buffer won't be aligned. - if ((uintptr_t)buf % 64) { - rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64), - low4bits, ones); - if (rv) { - return rv; - } - - buf = ROUNDUP_PTR(buf, 64); - } - - const u8 *last_block = buf_end - 64; - while (buf < last_block) { - m512 lchars = load512(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, lchars, buf, low4bits, ones, ~0); - if (rv) { - return rv; - } - buf += 64; - } - - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf_end. - m512 chars = loadu512(buf_end - 64); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - chars, buf_end - 64, low4bits, ones, ~0); - if (rv) { - return rv; - } - - return buf_end; -} -#endif diff --git a/src/nfa/shufti.cpp b/src/nfa/shufti.cpp new file mode 100644 index 00000000..4622af92 --- /dev/null +++ b/src/nfa/shufti.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#include "shufti.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" + +#ifdef DEBUG +#include + +#define DUMP_MSK(_t) \ +static UNUSED \ +void dumpMsk##_t(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7-j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ +} \ +static UNUSED \ +void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c",c); \ + else \ + printf("."); \ + } \ +} + +#endif + +#ifdef DEBUG +DUMP_MSK(128) +#endif + + + +/** \brief Naive byte-by-byte implementation. 
*/ +static really_inline +const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + DEBUG_PRINTF("buf %p end %p \n", buf, buf_end); + for (; buf < buf_end; ++buf) { + u8 c = *buf; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf; +} + +/** \brief Naive byte-by-byte implementation. */ +static really_inline +const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + for (buf_end--; buf_end >= buf; buf_end--) { + u8 c = *buf_end; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf_end; +} + +#if !defined(HAVE_SVE) +#include "shufti_simd.hpp" + +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + return shuftiExecReal(mask_lo, mask_hi, buf, buf_end); +} + +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + return rshuftiExecReal(mask_lo, mask_hi, buf, buf_end); +} + +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + return shuftiDoubleExecReal(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, buf_end); +} +#endif \ No newline at end of file diff --git a/src/nfa/shufti.h b/src/nfa/shufti.h index 1ebf776c..a6f9bc79 100644 --- a/src/nfa/shufti.h +++ b/src/nfa/shufti.h @@ -36,7 +36,7 @@ #define SHUFTI_H #include "ue2common.h" -#include "util/simd_utils.h" +#include "util/simd_types.h" #ifdef __cplusplus extern "C" diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp new file mode 100644 index 00000000..de1d7970 --- /dev/null +++ b/src/nfa/shufti_simd.hpp @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. 
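+ *        (SuperVector templated implementation, included from shufti.cpp)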
+ * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#include "shufti.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/unaligned.h" + +#include "util/simd/types.hpp" + +#define GET1_LO_4(chars, low4bits) and128(chars, low4bits) +#define GET1_HI_4(chars, low4bits) and128(rshift64_m128(chars, 4), low4bits) + +template +static really_inline +typename SuperVector::movemask_type block(SuperVector mask_lo, SuperVector mask_hi, + SuperVector chars, const SuperVector low4bits) { + SuperVector c_lo = chars & low4bits; + print_m128_16x8("c_lo", c_lo.u.v128[0]); + c_lo = mask_lo.pshufb(c_lo); + print_m128_16x8("c_lo", c_lo.u.v128[0]); + SuperVector c_hi = mask_hi.pshufb(chars.rshift64(4) & low4bits); + SuperVector t = c_lo & c_hi; + + print_m128_16x8("low4bits", low4bits.u.v128[0]); + print_m128_16x8("mask_lo", mask_lo.u.v128[0]); + print_m128_16x8("mask_hi", mask_hi.u.v128[0]); + print_m128_16x8("chars", chars.u.v128[0]); + print_m128_16x8("c_lo", c_lo.u.v128[0]); + print_m128_16x8("c_hi", c_hi.u.v128[0]); + print_m128_16x8("t", t.u.v128[0]); + + return t.eqmask(SuperVector::Zeroes()); +} + +template +const u8 *firstMatch(const u8 *buf, typename SuperVector::movemask_type z); + +template +const u8 *lastMatch(const u8 *buf, typename SuperVector::movemask_type z); + + +template <> +really_inline +const u8 *firstMatch<16>(const u8 *buf, typename SuperVector<16>::movemask_type z) { + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_inline +const u8 *firstMatch<64>(const u8 *buf, typename SuperVector<64>::movemask_type z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = ctz64(~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 64); + return buf + pos; + } else { + return NULL; // no match + } +} + +template +static really_inline +const u8 *fwdBlock(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars, + const SuperVector low4bits, const u8 *buf) { + typename SuperVector::movemask_type z = block(mask_lo, mask_hi, chars, low4bits); + DEBUG_PRINTF("z %08x\n", z); + + return firstMatch(buf, z); +} + +template +static really_inline +const u8 *shortShufti(SuperVector mask_lo, SuperVector mask_hi, const u8 *buf, + const u8 *buf_end, const SuperVector low4bits) { + DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= S); + + SuperVector chars = SuperVector::loadu_maskz(buf, static_cast(len)); + print_m128_16x8("chars", chars.u.v128[0]); + uint8_t alignment = (uintptr_t)(buf) & 15; + typename SuperVector::movemask_type maskb = 1 << alignment; + typename SuperVector::movemask_type maske = SINGLE_LOAD_MASK(len - alignment); + typename SuperVector::movemask_type z = block(mask_lo, mask_hi, chars, low4bits); + // reuse the load mask to indicate valid bytes + DEBUG_PRINTF("z %08x\n", z); + z &= maskb | maske; + DEBUG_PRINTF("z %08x\n", z); + + return firstMatch(buf, z); +} + + +template <> +really_inline +const u8 *lastMatch<16>(const u8 *buf, typename SuperVector<16>::movemask_type z) { + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z != 0xffff)) { + u32 pos = clz32(~z & 0xffff); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); 
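+        // z only covers 16 lanes, so clz32 of the inverted mask always lands in [16, 32)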
+ assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + +template <> +really_inline +const u8 *lastMatch<64>(const u8 *buf, typename SuperVector<64>::movemask_type z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = clz64(~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 64); + return buf + pos; + } else { + return NULL; // no match + } +} + +template +static really_inline +const u8 *revBlock(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars, + const SuperVector low4bits, const u8 *buf) { + typename SuperVector::movemask_type z = block(mask_lo, mask_hi, chars, low4bits); + DEBUG_PRINTF("z %08x\n", z); + + return lastMatch(buf, z); +} + + +template +const u8 *shuftiExecReal(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector low4bits = SuperVector::set1u_16x8(0xf); + const SuperVector wide_mask_lo(mask_lo); + const SuperVector wide_mask_hi(mask_hi); + + const u8 *d = buf; + const u8 *rv; + + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S <= buf_end) { + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, S); + DEBUG_PRINTF("until aligned %p \n", d1); + if (d1 != d) { + rv = shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, d, d1); + // rv = shortShufti(wide_mask_lo, wide_mask_hi, d, d1, low4bits); + if (rv != d1) { + return rv; + } + d = d1; + } + + size_t loops = (buf_end - d) / S; + DEBUG_PRINTF("loops %ld \n", loops); + + for (size_t i = 0; i < loops; i++, d+= S) { + DEBUG_PRINTF("d %p \n", d); + const u8 *base = ROUNDUP_PTR(d, S); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(base + 256); + + SuperVector chars = SuperVector::load(d); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, low4bits, d); + if (rv) return rv; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + + rv = buf_end; + if (d != buf_end) { + rv = shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, d, buf_end); + // rv = shortShufti(wide_mask_lo, wide_mask_hi, buf_end - S, buf_end, low4bits); + DEBUG_PRINTF("rv %p \n", rv); + } + + return rv; +} + +template +const u8 *rshuftiExecReal(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector low4bits = SuperVector::set1u_16x8(0xf); + const SuperVector wide_mask_lo(mask_lo); + const SuperVector wide_mask_hi(mask_hi); + + const u8 *d = buf_end; + const u8 *rv; + + DEBUG_PRINTF("start %p end %p \n", buf, d); + assert(d > buf); + if (d - S >= buf) { + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDDOWN_PTR(d, S); + DEBUG_PRINTF("until aligned %p \n", d1); + if (d1 != d) { + rv = shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, d1, d); + DEBUG_PRINTF("rv %p \n", rv); + // rv = shortShufti(wide_mask_lo, wide_mask_hi, d, d1, low4bits); + if (rv != d1 - 1) return rv; + d = d1; + } + + while (d - S >= buf) { + d -= S; + DEBUG_PRINTF("d %p \n", d); + const u8 *base = ROUNDDOWN_PTR(buf, S); + // On large packet buffers, this prefetch appears to get us about 2%. 
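+            // base is rounded down from buf, so this prefetch targets the low end of the buffer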
+ __builtin_prefetch(base + 256); + + SuperVector chars = SuperVector::load(d); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, low4bits, d); + if (rv) return rv; + } + } + + DEBUG_PRINTF("d %p e %p \n", buf, d); + // finish off tail + + if (d != buf) { + rv = shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, d); + // rv = shortShufti(wide_mask_lo, wide_mask_hi, buf_end - S, buf_end, low4bits); + DEBUG_PRINTF("rv %p \n", rv); + if (rv != d - 1) return rv; + } + + return buf - 1; +} + +template +static really_inline +const u8 *fwdBlockDouble(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, + SuperVector chars, const SuperVector low4bits, const u8 *buf) { + SuperVector chars_lo = chars & low4bits; + SuperVector chars_hi = chars.rshift64(4) & low4bits; + SuperVector c1_lo = mask1_lo.pshufb(chars_lo); + SuperVector c1_hi = mask1_hi.pshufb(chars_hi); + SuperVector t1 = c1_lo | c1_hi; + + SuperVector c2_lo = mask2_lo.pshufb(chars_lo); + SuperVector c2_hi = mask2_hi.pshufb(chars_hi); + SuperVector t2 = c2_lo | c2_hi; + SuperVector t = t1 | (t2 >> 1); + + typename SuperVector::movemask_type z = t.eqmask(SuperVector::Ones()); + DEBUG_PRINTF(" z: 0x%08x\n", z); + return firstMatch(buf, z); +} + +template +const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector low4bits = SuperVector::set1u_16x8(0xf); + const SuperVector wide_mask1_lo(mask1_lo); + const SuperVector wide_mask1_hi(mask1_hi); + const SuperVector wide_mask2_lo(mask2_lo); + const SuperVector wide_mask2_hi(mask2_hi); + + const u8 *d = buf; + const u8 *rv; + + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S <= buf_end) { + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, S); + DEBUG_PRINTF("until aligned %p \n", d1); + if (d1 != d) { + SuperVector chars = SuperVector::loadu(d); + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, low4bits, d); + if (rv) return rv; + d = d1; + } + + size_t loops = (buf_end - d) / S; + DEBUG_PRINTF("loops %ld \n", loops); + + for (size_t i = 0; i < loops; i++, d+= S) { + DEBUG_PRINTF("d %p \n", d); + const u8 *base = ROUNDUP_PTR(d, S); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(base + 256); + + SuperVector chars = SuperVector::load(d); + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, low4bits, d); + if (rv) return rv; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector chars = SuperVector::loadu(buf_end - S); + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, low4bits, buf_end - S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv) return rv; + } + + return buf_end; +}
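
A minimal usage sketch of the single-byte shufti entry point (illustrative only, not part of this patch; helper names below are hypothetical, and in the engine the nibble tables are normally built at compile time, e.g. by shuftiBuildMasks() in shufti_compile). A byte c is accepted when lo[c & 0xf] & hi[c >> 4] is non-zero, exactly the test the scalar shuftiFwdSlow() fallback performs, so giving each character of the class its own bucket bit avoids cross-product false positives:

#include <stddef.h>

#include "nfa/shufti.h"
#include "util/simd_utils.h"    /* loadu128() */

/* Build single-byte shufti masks for the set {'a', ';'}: one bucket bit per
 * character, indexed by low nibble in mask_lo and by high nibble in mask_hi. */
static void build_example_masks(m128 *mask_lo, m128 *mask_hi) {
    u8 lo[16] = {0};
    u8 hi[16] = {0};
    lo[0x61 & 0xf] |= 0x01;     /* 'a' (0x61) -> bucket bit 0 */
    hi[0x61 >> 4]  |= 0x01;
    lo[0x3b & 0xf] |= 0x02;     /* ';' (0x3b) -> bucket bit 1 */
    hi[0x3b >> 4]  |= 0x02;
    *mask_lo = loadu128(lo);
    *mask_hi = loadu128(hi);
}

/* Returns a pointer to the first 'a' or ';' in [buf, buf + len), or buf + len
 * if there is none. len must be non-zero (shuftiExec asserts buf < buf_end). */
static const u8 *find_first_accel(const u8 *buf, size_t len) {
    m128 mask_lo, mask_hi;
    build_example_masks(&mask_lo, &mask_hi);
    return shuftiExec(mask_lo, mask_hi, buf, buf + len);
}

With separate bucket bits, the cross-product bytes 0x6b and 0x31 (each sharing one nibble with an accepted character) are rejected, since their lo and hi table entries have no bit in common.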