From 47b17ade27ffef5e169bdd2da4fe41803606f814 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Wed, 9 Dec 2015 12:20:34 +0000 Subject: [PATCH] Multibyte shufti runtime --- CMakeLists.txt | 5 + src/nfa/accel.c | 55 ++++++++ src/nfa/accel.h | 22 +++ src/nfa/accel_dump.cpp | 62 ++++++--- src/nfa/limex_accel.c | 27 ++++ src/nfa/multishufti.c | 114 ++++++++++++++++ src/nfa/multishufti.h | 70 ++++++++++ src/nfa/multishufti_avx2.h | 122 +++++++++++++++++ src/nfa/multishufti_sse.h | 266 +++++++++++++++++++++++++++++++++++++ src/nfa/shufti.c | 108 ++------------- src/nfa/shufti_common.h | 146 ++++++++++++++++++++ 11 files changed, 886 insertions(+), 111 deletions(-) create mode 100644 src/nfa/multishufti.c create mode 100644 src/nfa/multishufti.h create mode 100644 src/nfa/multishufti_avx2.h create mode 100644 src/nfa/multishufti_sse.h create mode 100644 src/nfa/shufti_common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0848f550..353bc561 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -444,6 +444,10 @@ set (hs_exec_SRCS src/nfa/multiaccel_longgrab.h src/nfa/multiaccel_shift.h src/nfa/multiaccel_shiftgrab.h + src/nfa/multishufti.c + src/nfa/multishufti_avx2.h + src/nfa/multishufti_sse.h + src/nfa/multishufti.h src/nfa/multivermicelli.c src/nfa/multivermicelli.h src/nfa/multivermicelli_sse.h @@ -455,6 +459,7 @@ set (hs_exec_SRCS src/nfa/repeat.c src/nfa/repeat.h src/nfa/repeat_internal.h + src/nfa/shufti_common.h src/nfa/shufti.c src/nfa/shufti.h src/nfa/truffle.c diff --git a/src/nfa/accel.c b/src/nfa/accel.c index 43ecd84f..ee081154 100644 --- a/src/nfa/accel.c +++ b/src/nfa/accel.c @@ -30,6 +30,7 @@ #include "shufti.h" #include "truffle.h" #include "vermicelli.h" +#include "multishufti.h" #include "multivermicelli.h" #include "ue2common.h" @@ -219,6 +220,60 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 1, c, c_end, accel->mdverm.len1, accel->mdverm.len2); break; + 
case ACCEL_MLSHUFTI: + DEBUG_PRINTF("accel mlshufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = long_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, + accel->mshufti.len); + break; + case ACCEL_MLGSHUFTI: + DEBUG_PRINTF("accel mlgshufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = longgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, + accel->mshufti.len); + break; + case ACCEL_MSSHUFTI: + DEBUG_PRINTF("accel msshufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shift_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, + accel->mshufti.len); + break; + case ACCEL_MSGSHUFTI: + DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shiftgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, + accel->mshufti.len); + break; + case ACCEL_MDSSHUFTI: + DEBUG_PRINTF("accel mdsshufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = doubleshift_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end, + accel->mdshufti.len1, accel->mdshufti.len2); + break; + case ACCEL_MDSGSHUFTI: + DEBUG_PRINTF("accel mdsgshufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = doubleshiftgrab_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end, + accel->mdshufti.len1, accel->mdshufti.len2); + break; default: assert(!"not here"); diff --git a/src/nfa/accel.h b/src/nfa/accel.h index cc64d587..87acf6cf 100644 --- a/src/nfa/accel.h +++ b/src/nfa/accel.h @@ -74,6 +74,13 @@ enum AccelType { ACCEL_MDSVERM_NOCASE, ACCEL_MDSGVERM, ACCEL_MDSGVERM_NOCASE, + /* multibyte shuftis */ + ACCEL_MLSHUFTI, + ACCEL_MLGSHUFTI, + ACCEL_MSSHUFTI, + ACCEL_MSGSHUFTI, + ACCEL_MDSSHUFTI, + ACCEL_MDSGSHUFTI, }; /** \brief Structure for accel framework. 
*/ @@ -121,6 +128,21 @@ union AccelAux { m128 lo2; m128 hi2; } dshufti; + struct { + u8 accel_type; + u8 offset; + m128 lo; + m128 hi; + u8 len; + } mshufti; + struct { + u8 accel_type; + u8 offset; + m128 lo; + m128 hi; + u8 len1; + u8 len2; + } mdshufti; struct { u8 accel_type; u8 offset; diff --git a/src/nfa/accel_dump.cpp b/src/nfa/accel_dump.cpp index 19116a8f..5a28c6a0 100644 --- a/src/nfa/accel_dump.cpp +++ b/src/nfa/accel_dump.cpp @@ -110,11 +110,38 @@ const char *accelName(u8 accel_type) { return "multibyte doubleshift-grab vermicelli"; case ACCEL_MDSGVERM_NOCASE: return "multibyte doubleshift-grab vermicelli nocase"; + case ACCEL_MLSHUFTI: + return "multibyte long shufti"; + case ACCEL_MLGSHUFTI: + return "multibyte long-grab shufti"; + case ACCEL_MSSHUFTI: + return "multibyte shift shufti"; + case ACCEL_MSGSHUFTI: + return "multibyte shift-grab shufti"; + case ACCEL_MDSSHUFTI: + return "multibyte doubleshift shufti"; + case ACCEL_MDSGSHUFTI: + return "multibyte doubleshift-grab shufti"; default: return "unknown!"; } } +static +void dumpShuftiCharReach(FILE *f, const m128 &lo, const m128 &hi) { + CharReach cr = shufti2cr(lo, hi); + fprintf(f, "count %zu class %s\n", cr.count(), + describeClass(cr).c_str()); +} + +static +void dumpShuftiMasks(FILE *f, const m128 &lo, const m128 &hi) { + fprintf(f, "lo %s\n", + dumpMask((const u8 *)&lo, 128).c_str()); + fprintf(f, "hi %s\n", + dumpMask((const u8 *)&hi, 128).c_str()); +} + void dumpAccelInfo(FILE *f, const AccelAux &accel) { fprintf(f, " %s", accelName(accel.accel_type)); if (accel.generic.offset) { @@ -136,25 +163,16 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) { break; case ACCEL_SHUFTI: { fprintf(f, "\n"); - fprintf(f, "lo %s\n", - dumpMask((const u8 *)&accel.shufti.lo, 128).c_str()); - fprintf(f, "hi %s\n", - dumpMask((const u8 *)&accel.shufti.hi, 128).c_str()); - CharReach cr = shufti2cr(accel.shufti.lo, accel.shufti.hi); - fprintf(f, "count %zu class %s\n", cr.count(), - 
describeClass(cr).c_str()); + dumpShuftiMasks(f, accel.shufti.lo, accel.shufti.hi); + dumpShuftiCharReach(f, accel.shufti.lo, accel.shufti.hi); break; } case ACCEL_DSHUFTI: fprintf(f, "\n"); - fprintf(f, "lo1 %s\n", - dumpMask((const u8 *)&accel.dshufti.lo1, 128).c_str()); - fprintf(f, "hi1 %s\n", - dumpMask((const u8 *)&accel.dshufti.hi1, 128).c_str()); - fprintf(f, "lo2 %s\n", - dumpMask((const u8 *)&accel.dshufti.lo2, 128).c_str()); - fprintf(f, "hi2 %s\n", - dumpMask((const u8 *)&accel.dshufti.hi2, 128).c_str()); + fprintf(f, "mask 1\n"); + dumpShuftiMasks(f, accel.dshufti.lo1, accel.dshufti.hi1); + fprintf(f, "mask 2\n"); + dumpShuftiMasks(f, accel.dshufti.lo2, accel.dshufti.hi2); break; case ACCEL_TRUFFLE: { fprintf(f, "\n"); @@ -184,6 +202,20 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) { fprintf(f, " [\\x%02hhx] len1:%u len2:%u\n", accel.mdverm.c, accel.mdverm.len1, accel.mdverm.len2); break; + case ACCEL_MLSHUFTI: + case ACCEL_MLGSHUFTI: + case ACCEL_MSSHUFTI: + case ACCEL_MSGSHUFTI: + fprintf(f, " len:%u\n", accel.mshufti.len); + dumpShuftiMasks(f, accel.mshufti.lo, accel.mshufti.hi); + dumpShuftiCharReach(f, accel.mshufti.lo, accel.mshufti.hi); + break; + case ACCEL_MDSSHUFTI: + case ACCEL_MDSGSHUFTI: + fprintf(f, " len1:%u len2:%u\n", accel.mdshufti.len1, accel.mdshufti.len2); + dumpShuftiMasks(f, accel.mdshufti.lo, accel.mdshufti.hi); + dumpShuftiCharReach(f, accel.mdshufti.lo, accel.mdshufti.hi); + break; default: fprintf(f, "\n"); break; diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index b04792b2..c12f917a 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -38,6 +38,7 @@ #include "nfa_internal.h" #include "shufti.h" #include "truffle.h" +#include "multishufti.h" #include "multivermicelli.h" #include "ue2common.h" #include "vermicelli.h" @@ -150,6 +151,32 @@ const u8 *accelScan(const union AccelAux *aux, const u8 *ptr, const u8 *end) { ptr = shuftiDoubleExec(aux->dshufti.lo1, aux->dshufti.hi1, aux->dshufti.lo2, 
aux->dshufti.hi2, ptr, end); break; + case ACCEL_MLSHUFTI: + offset = aux->mshufti.offset; + ptr = long_shuftiExec(aux->mshufti.lo, aux->mshufti.hi, ptr, end, aux->mshufti.len); + break; + case ACCEL_MLGSHUFTI: + offset = aux->mshufti.offset; + ptr = longgrab_shuftiExec(aux->mshufti.lo, aux->mshufti.hi, ptr, end, aux->mshufti.len); + break; + case ACCEL_MSSHUFTI: + offset = aux->mshufti.offset; + ptr = shift_shuftiExec(aux->mshufti.lo, aux->mshufti.hi, ptr, end, aux->mshufti.len); + break; + case ACCEL_MSGSHUFTI: + offset = aux->mshufti.offset; + ptr = shiftgrab_shuftiExec(aux->mshufti.lo, aux->mshufti.hi, ptr, end, aux->mshufti.len); + break; + case ACCEL_MDSSHUFTI: + offset = aux->mdshufti.offset; + ptr = doubleshift_shuftiExec(aux->mdshufti.lo, aux->mdshufti.hi, ptr, end, + aux->mdshufti.len1, aux->mdshufti.len2); + break; + case ACCEL_MDSGSHUFTI: + offset = aux->mdshufti.offset; + ptr = doubleshiftgrab_shuftiExec(aux->mdshufti.lo, aux->mdshufti.hi, ptr, end, + aux->mdshufti.len1, aux->mdshufti.len2); + break; case ACCEL_TRUFFLE: DEBUG_PRINTF("truffle shuffle\n"); offset = aux->truffle.offset; diff --git a/src/nfa/multishufti.c b/src/nfa/multishufti.c new file mode 100644 index 00000000..cb85b718 --- /dev/null +++ b/src/nfa/multishufti.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Multishufti: multibyte version of Shufti. 
+ * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#include "config.h" +#include "ue2common.h" + +#include "multishufti.h" + +#include "multiaccel_common.h" + +#if !defined(__AVX2__) + +#define MATCH_ALGO long_ +#include "multiaccel_long.h" +#include "multishufti_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO longgrab_ +#include "multiaccel_longgrab.h" +#include "multishufti_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shift_ +#include "multiaccel_shift.h" +#include "multishufti_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shiftgrab_ +#include "multiaccel_shiftgrab.h" +#include "multishufti_sse.h" +#undef MATCH_ALGO + +#define MULTIACCEL_DOUBLE + +#define MATCH_ALGO doubleshift_ +#include "multiaccel_doubleshift.h" +#include "multishufti_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO doubleshiftgrab_ +#include "multiaccel_doubleshiftgrab.h" +#include "multishufti_sse.h" +#undef MATCH_ALGO + +#undef MULTIACCEL_DOUBLE + +#else + +#define MATCH_ALGO long_ +#include "multiaccel_long.h" +#include "multishufti_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO longgrab_ +#include "multiaccel_longgrab.h" +#include "multishufti_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shift_ +#include "multiaccel_shift.h" +#include "multishufti_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shiftgrab_ +#include "multiaccel_shiftgrab.h" +#include "multishufti_avx2.h" +#undef MATCH_ALGO + +#define MULTIACCEL_DOUBLE + +#define MATCH_ALGO doubleshift_ +#include "multiaccel_doubleshift.h" +#include "multishufti_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO doubleshiftgrab_ +#include "multiaccel_doubleshiftgrab.h" +#include "multishufti_avx2.h" +#undef MATCH_ALGO + +#undef MULTIACCEL_DOUBLE + +#endif diff --git a/src/nfa/multishufti.h b/src/nfa/multishufti.h new file mode 100644 index 00000000..bcccf607 --- /dev/null +++ b/src/nfa/multishufti.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, 
with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Multishufti: multibyte version of Shufti + * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#ifndef MULTISHUFTI_H +#define MULTISHUFTI_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +const u8 *long_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *longgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *shift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *shiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *doubleshift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const u8 run_len, + const u8 run2_len); + +const u8 *doubleshiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const u8 run_len, + const u8 run2_len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/nfa/multishufti_avx2.h b/src/nfa/multishufti_avx2.h new file mode 100644 index 00000000..e9980872 --- /dev/null +++ b/src/nfa/multishufti_avx2.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "shufti_common.h" + +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/simd_utils_ssse3.h" + +static really_inline +const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars, + const u8 *buf, const m256 low4bits, + const m256 zeroes, const u8 run_len +#ifdef MULTIACCEL_DOUBLE + , const u8 run_len2 +#endif + ) { + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, ~z +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); +} + +const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi, + const u8 *buf, + const u8 *buf_end, u8 run_len +#ifdef MULTIACCEL_DOUBLE + , u8 run_len2 +#endif + ) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. 
+ if (buf_end - buf < 32) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m256 zeroes = zeroes256(); + const m256 low4bits = set32x8(0xf); + const m256 wide_mask_lo = set2x128(mask_lo); + const m256 wide_mask_hi = set2x128(mask_hi); + const u8 *rv; + + size_t min = (size_t)buf % 32; + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf); + rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf, + low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (rv) { + return rv; + } + buf += (32 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, lchars, buf, + low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 32); + chars = loadu256(buf_end - 32); + rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, + low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (rv) { + return rv; + } + + return buf_end; +} diff --git a/src/nfa/multishufti_sse.h b/src/nfa/multishufti_sse.h new file mode 100644 index 00000000..7ea5946d --- /dev/null +++ b/src/nfa/multishufti_sse.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "shufti_common.h" + +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/simd_utils_ssse3.h" + +/* Normal SSSE3 shufti */ + +static really_inline +const u8 *JOIN(MATCH_ALGO, fwdBlock)(m128 mask_lo, m128 mask_hi, m128 chars, + const u8 *buf, const m128 low4bits, + const m128 zeroes, const u8 run_len +#ifdef MULTIACCEL_DOUBLE + , const u8 run_len2 +#endif + ) { + // negate first 16 bits + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes) ^ 0xFFFF; + return (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])(buf, z +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); +} + +/* + * 16-byte pipeline, for smaller scans + */ +static +const u8 *JOIN(MATCH_ALGO, shuftiPipeline16)(m128 mask_lo, m128 mask_hi, + const u8 *buf, const u8 *buf_end, + const m128 low4bits, + const m128 zeroes, const u8 run_len +#ifdef MULTIACCEL_DOUBLE + , const u8 run_len2 +#endif + ) { + const u8* ptr, *last_buf; + u32 last_res; + + // pipeline prologue: scan first 16 bytes + m128 data = load128(buf); + u32 z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF; + last_buf = buf; + last_res = z; + buf += 16; + + // now, start the pipeline! 
+ assert((size_t)buf % 16 == 0); + for (; buf + 15 < buf_end; buf += 16) { + // scan more data + data = load128(buf); + z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF; + + // do a comparison on previous result + ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) + (last_buf, last_res +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (unlikely(ptr)) { + return ptr; + } + last_buf = buf; + last_res = z; + } + assert(buf <= buf_end && buf >= buf_end - 16); + + // epilogue: compare final results + ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) + (last_buf, last_res +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (unlikely(ptr)) { + return ptr; + } + + return NULL; +} + +/* + * 32-byte pipeline, for bigger scans + */ +static +const u8 *JOIN(MATCH_ALGO, shuftiPipeline32)(m128 mask_lo, m128 mask_hi, + const u8 *buf, const u8 *buf_end, + const m128 low4bits, + const m128 zeroes, const u8 run_len +#ifdef MULTIACCEL_DOUBLE + , const u8 run_len2 +#endif + ) { + const u8* ptr, *last_buf; + u32 res; + + // pipeline prologue: scan first 32 bytes + m128 data1 = load128(buf); + u32 z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF; + m128 data2 = load128(buf + 16); + u32 z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF; + + // store the results + u32 last_res = z1 | (z2 << 16); + last_buf = buf; + buf += 32; + + + // now, start the pipeline! 
+ assert((size_t)buf % 16 == 0); + for (; buf + 31 < buf_end; buf += 32) { + // scan more data + data1 = load128(buf); + z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF; + data2 = load128(buf + 16); + z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF; + res = z1 | (z2 << 16); + + // do a comparison on previous result + ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) + (last_buf, last_res +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (unlikely(ptr)) { + return ptr; + } + last_res = res; + last_buf = buf; + } + + // epilogue: compare final results + ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) + (last_buf, last_res +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (unlikely(ptr)) { + return ptr; + } + + // if we still have some data left, scan it too + for (; buf + 15 < buf_end; buf += 16) { + m128 chars = load128(buf); + ptr = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf, + low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (unlikely(ptr)) { + return ptr; + } + } + assert(buf <= buf_end && buf >= buf_end - 16); + + return NULL; +} + +const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi, + const u8 *buf, + const u8 *buf_end, u8 run_len +#ifdef MULTIACCEL_DOUBLE + , u8 run_len2 +#endif + ) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m128 zeroes = zeroes128(); + const m128 low4bits = _mm_set1_epi8(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. 
+ m128 chars = loadu128(buf); + rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf, + low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (rv) { + return rv; + } + buf += (16 - min); + + // if we have enough data, run bigger pipeline; otherwise run smaller one + if (buf_end - buf >= 128) { + rv = JOIN(MATCH_ALGO, shuftiPipeline32)(mask_lo, mask_hi, + buf, buf_end, low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (unlikely(rv)) { + return rv; + } + } else if (buf_end - buf >= 16){ + rv = JOIN(MATCH_ALGO, shuftiPipeline16)(mask_lo, mask_hi, + buf, buf_end, low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (unlikely(rv)) { + return rv; + } + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. + chars = loadu128(buf_end - 16); + rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, + buf_end - 16, low4bits, zeroes, run_len +#ifdef MULTIACCEL_DOUBLE + , run_len2 +#endif + ); + if (rv) { + return rv; + } + + return buf_end; +} diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 7d50709c..b1fec488 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -38,20 +38,9 @@ #include "util/simd_utils.h" #include "util/unaligned.h" -/** \brief Naive byte-by-byte implementation. */ -static really_inline -const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { - assert(buf < buf_end); +#include "shufti_common.h" - for (; buf < buf_end; ++buf) { - u8 c = *buf; - if (lo[c & 0xf] & hi[c >> 4]) { - break; - } - } - return buf; -} +#include "util/simd_utils_ssse3.h" /** \brief Naive byte-by-byte implementation. 
*/ static really_inline @@ -68,54 +57,11 @@ const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, return buf_end; } -#ifdef DEBUG -#include <ctype.h> - -#define DUMP_MSK(_t) \ -static UNUSED \ -void dumpMsk##_t(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - for (int j = 0; j < 8; j++) { \ - if ((c >> (7-j)) & 0x1) \ - printf("1"); \ - else \ - printf("0"); \ - } \ - printf(" "); \ - } \ -} \ -static UNUSED \ -void dumpMsk##_t##AsChars(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - if (isprint(c)) \ - printf("%c",c); \ - else \ - printf("."); \ - } \ -} - -DUMP_MSK(128) -#endif - -#include "util/simd_utils_ssse3.h" - #if !defined(__AVX2__) /* Normal SSSE3 shufti */ -#define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4) - static really_inline -const u8 *firstMatch(const u8 *buf, m128 t, m128 compare) { -#ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); -#endif - - u32 z = movemask128(eq128(t, compare)); +const u8 *firstMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffff)) { u32 pos = ctz32(~z & 0xffff); assert(pos < 16); @@ -128,19 +74,9 @@ const u8 *firstMatch(const u8 *buf, m128 t, m128 compare) { static really_inline const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, const m128 low4bits, const m128 zeroes) { - m128 c_lo = pshufb(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); 
-#endif - - return firstMatch(buf, t, zeroes); + return firstMatch(buf, z); } const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, @@ -307,7 +243,8 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); #endif - return firstMatch(buf, t2, ones); + u32 z = movemask128(eq128(t2, ones)); + return firstMatch(buf, z); } const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, @@ -356,20 +293,8 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, #else // AVX2 - 256 wide shuftis -#ifdef DEBUG -DUMP_MSK(256) -#endif - -#define GET_LO_4(chars) and256(chars, low4bits) -#define GET_HI_4(chars) rshift4x64(andnot256(low4bits, chars), 4) - static really_inline -const u8 *firstMatch(const u8 *buf, m256 t, m256 compare) { -#ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); dumpMsk256(t); printf("\n"); -#endif - - u32 z = movemask256(eq256(t, compare)); +const u8 *firstMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffffffff)) { u32 pos = ctz32(~z); assert(pos < 32); @@ -382,19 +307,9 @@ const u8 *firstMatch(const u8 *buf, m256 t, m256 compare) { static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, const m256 low4bits, const m256 zeroes) { - m256 c_lo = vpshufb(mask_lo, GET_LO_4(chars)); - m256 c_hi = vpshufb(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - return firstMatch(buf, t, zeroes); + return firstMatch(buf, z); } /* takes 128 bit masks, but operates on 256 bits of data */ @@ -564,8 +479,9 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, 
m256 mask2_lo, m256 mask2_hi, DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); #endif + u32 z = movemask256(eq256(t2, ones)); - return firstMatch(buf, t2, ones); + return firstMatch(buf, z); } /* takes 128 bit masks, but operates on 256 bits of data */ diff --git a/src/nfa/shufti_common.h b/src/nfa/shufti_common.h new file mode 100644 index 00000000..9c11f2b9 --- /dev/null +++ b/src/nfa/shufti_common.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SHUFTI_COMMON_H_ +#define SHUFTI_COMMON_H_ + +#include "ue2common.h" + +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" +#include "util/simd_utils_ssse3.h" + +/* + * Common stuff for all versions of shufti (single, multi and multidouble) + */ + +/** \brief Naive byte-by-byte implementation. */ +static really_inline +const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + for (; buf < buf_end; ++buf) { + u8 c = *buf; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf; +} + +#ifdef DEBUG +#include <ctype.h> + +#define DUMP_MSK(_t) \ +static UNUSED \ +void dumpMsk##_t(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7-j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ +} \ +static UNUSED \ +void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c",c); \ + else \ + printf("."); \ + } \ +} + +#endif + +#if !defined(__AVX2__) + +#ifdef DEBUG +DUMP_MSK(128) +#endif + +#define GET_LO_4(chars) and128(chars, low4bits) +#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4) + +static really_inline +u32 block(m128 mask_lo, m128 mask_hi, m128 chars, 
const m128 low4bits, + const m128 compare) { + m128 c_lo = pshufb(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + return movemask128(eq128(t, compare)); +} + +#else + +#ifdef DEBUG +DUMP_MSK(256) +#endif + +#define GET_LO_4(chars) and256(chars, low4bits) +#define GET_HI_4(chars) rshift4x64(andnot256(low4bits, chars), 4) + +static really_inline +u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, + const m256 compare) { + m256 c_lo = vpshufb(mask_lo, GET_LO_4(chars)); + m256 c_hi = vpshufb(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + return movemask256(eq256(t, compare)); +} + +#endif + + +#endif /* SHUFTI_COMMON_H_ */