From dd2ec6bdaca08cb10f3d134f1862401811157543 Mon Sep 17 00:00:00 2001 From: Anatoly Burakov Date: Wed, 9 Dec 2015 11:46:19 +0000 Subject: [PATCH] Multibyte vermicelli runtime --- CMakeLists.txt | 4 + src/nfa/accel.c | 103 ++++++++ src/nfa/accel.h | 28 +- src/nfa/accel_dump.cpp | 41 +++ src/nfa/limex_accel.c | 61 +++++ src/nfa/multivermicelli.c | 108 ++++++++ src/nfa/multivermicelli.h | 62 +++++ src/nfa/multivermicelli_avx2.h | 283 +++++++++++++++++++++ src/nfa/multivermicelli_sse.h | 452 +++++++++++++++++++++++++++++++++ 9 files changed, 1141 insertions(+), 1 deletion(-) create mode 100644 src/nfa/multivermicelli.c create mode 100644 src/nfa/multivermicelli.h create mode 100644 src/nfa/multivermicelli_avx2.h create mode 100644 src/nfa/multivermicelli_sse.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 714168d8..0848f550 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -444,6 +444,10 @@ set (hs_exec_SRCS src/nfa/multiaccel_longgrab.h src/nfa/multiaccel_shift.h src/nfa/multiaccel_shiftgrab.h + src/nfa/multivermicelli.c + src/nfa/multivermicelli.h + src/nfa/multivermicelli_sse.h + src/nfa/multivermicelli_avx2.h src/nfa/nfa_api.h src/nfa/nfa_api_dispatch.c src/nfa/nfa_internal.h diff --git a/src/nfa/accel.c b/src/nfa/accel.c index af5e9610..43ecd84f 100644 --- a/src/nfa/accel.c +++ b/src/nfa/accel.c @@ -30,6 +30,7 @@ #include "shufti.h" #include "truffle.h" #include "vermicelli.h" +#include "multivermicelli.h" #include "ue2common.h" const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { @@ -117,6 +118,108 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { rv = c_end; break; + /* multibyte matchers */ + case ACCEL_MLVERM: + DEBUG_PRINTF("accel mlverm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = long_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); + break; + case ACCEL_MLVERM_NOCASE: + DEBUG_PRINTF("accel mlverm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + 
return c; + } + + rv = long_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); + break; + case ACCEL_MLGVERM: + DEBUG_PRINTF("accel mlgverm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = longgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); + break; + case ACCEL_MLGVERM_NOCASE: + DEBUG_PRINTF("accel mlgverm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = longgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); + break; + case ACCEL_MSVERM: + DEBUG_PRINTF("accel msverm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shift_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); + break; + case ACCEL_MSVERM_NOCASE: + DEBUG_PRINTF("accel msverm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shift_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); + break; + case ACCEL_MSGVERM: + DEBUG_PRINTF("accel msgverm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shiftgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); + break; + case ACCEL_MSGVERM_NOCASE: + DEBUG_PRINTF("accel msgverm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shiftgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); + break; + case ACCEL_MDSVERM: + DEBUG_PRINTF("accel mdsverm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = doubleshift_vermicelliExec(accel->mdverm.c, 0, c, c_end, + accel->mdverm.len1, accel->mdverm.len2); + break; + case ACCEL_MDSVERM_NOCASE: + DEBUG_PRINTF("accel mdsverm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = doubleshift_vermicelliExec(accel->mdverm.c, 1, c, c_end, + accel->mdverm.len1, accel->mdverm.len2); + break; + case ACCEL_MDSGVERM: + DEBUG_PRINTF("accel mdsgverm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 0, c, 
c_end, + accel->mdverm.len1, accel->mdverm.len2); + break; + case ACCEL_MDSGVERM_NOCASE: + DEBUG_PRINTF("accel mdsgverm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 1, c, c_end, + accel->mdverm.len1, accel->mdverm.len2); + break; + default: assert(!"not here"); return c; diff --git a/src/nfa/accel.h b/src/nfa/accel.h index 2c1f223a..cc64d587 100644 --- a/src/nfa/accel.h +++ b/src/nfa/accel.h @@ -60,7 +60,20 @@ enum AccelType { ACCEL_SHUFTI, ACCEL_DSHUFTI, ACCEL_TRUFFLE, - ACCEL_RED_TAPE + ACCEL_RED_TAPE, + /* multibyte vermicellis */ + ACCEL_MLVERM, + ACCEL_MLVERM_NOCASE, + ACCEL_MLGVERM, + ACCEL_MLGVERM_NOCASE, + ACCEL_MSVERM, + ACCEL_MSVERM_NOCASE, + ACCEL_MSGVERM, + ACCEL_MSGVERM_NOCASE, + ACCEL_MDSVERM, + ACCEL_MDSVERM_NOCASE, + ACCEL_MDSGVERM, + ACCEL_MDSGVERM_NOCASE, }; /** \brief Structure for accel framework. */ @@ -81,6 +94,19 @@ union AccelAux { u8 c1; // uppercase if nocase u8 c2; // uppercase if nocase } dverm; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + u8 len; + } mverm; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + u8 len1; + u8 len2; + } mdverm; struct { u8 accel_type; u8 offset; diff --git a/src/nfa/accel_dump.cpp b/src/nfa/accel_dump.cpp index 40c9c653..19116a8f 100644 --- a/src/nfa/accel_dump.cpp +++ b/src/nfa/accel_dump.cpp @@ -86,6 +86,30 @@ const char *accelName(u8 accel_type) { return "truffle"; case ACCEL_RED_TAPE: return "red tape"; + case ACCEL_MLVERM: + return "multibyte long vermicelli"; + case ACCEL_MLVERM_NOCASE: + return "multibyte long vermicelli nocase"; + case ACCEL_MLGVERM: + return "multibyte long-grab vermicelli"; + case ACCEL_MLGVERM_NOCASE: + return "multibyte long-grab vermicelli nocase"; + case ACCEL_MSVERM: + return "multibyte shift vermicelli"; + case ACCEL_MSVERM_NOCASE: + return "multibyte shift vermicelli nocase"; + case ACCEL_MSGVERM: + return "multibyte shift-grab vermicelli"; + case 
ACCEL_MSGVERM_NOCASE: + return "multibyte shift-grab vermicelli nocase"; + case ACCEL_MDSVERM: + return "multibyte doubleshift vermicelli"; + case ACCEL_MDSVERM_NOCASE: + return "multibyte doubleshift vermicelli nocase"; + case ACCEL_MDSGVERM: + return "multibyte doubleshift-grab vermicelli"; + case ACCEL_MDSGVERM_NOCASE: + return "multibyte doubleshift-grab vermicelli nocase"; default: return "unknown!"; } @@ -143,6 +167,23 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) { describeClass(cr).c_str()); break; } + case ACCEL_MLVERM: + case ACCEL_MLVERM_NOCASE: + case ACCEL_MLGVERM: + case ACCEL_MLGVERM_NOCASE: + case ACCEL_MSVERM: + case ACCEL_MSVERM_NOCASE: + case ACCEL_MSGVERM: + case ACCEL_MSGVERM_NOCASE: + fprintf(f, " [\\x%02hhx] len:%u\n", accel.mverm.c, accel.mverm.len); + break; + case ACCEL_MDSVERM: + case ACCEL_MDSVERM_NOCASE: + case ACCEL_MDSGVERM: + case ACCEL_MDSGVERM_NOCASE: + fprintf(f, " [\\x%02hhx] len1:%u len2:%u\n", accel.mdverm.c, accel.mdverm.len1, + accel.mdverm.len2); + break; default: fprintf(f, "\n"); break; diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index 1aa1b30f..b04792b2 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -38,6 +38,7 @@ #include "nfa_internal.h" #include "shufti.h" #include "truffle.h" +#include "multivermicelli.h" #include "ue2common.h" #include "vermicelli.h" #include "util/bitutils.h" @@ -78,6 +79,66 @@ const u8 *accelScan(const union AccelAux *aux, const u8 *ptr, const u8 *end) { ptr = vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end); break; + case ACCEL_MLVERM: + DEBUG_PRINTF("long vermicelli for 0x%02hhx\n", aux->mverm.c); + offset = aux->mverm.offset; + ptr = long_vermicelliExec(aux->mverm.c, 0, ptr, end, aux->mverm.len); + break; + case ACCEL_MLVERM_NOCASE: + DEBUG_PRINTF("long vermicelli-nocase for 0x%02hhx\n", aux->mverm.c); + offset = aux->mverm.offset; + ptr = long_vermicelliExec(aux->mverm.c, 1, ptr, end, aux->mverm.len); + break; + case ACCEL_MLGVERM: + 
DEBUG_PRINTF("long grab vermicelli for 0x%02hhx\n", aux->mverm.c);
+        offset = aux->mverm.offset;
+        ptr = longgrab_vermicelliExec(aux->mverm.c, 0, ptr, end, aux->mverm.len);
+        break;
+    case ACCEL_MLGVERM_NOCASE:
+        DEBUG_PRINTF("long grab vermicelli-nocase for 0x%02hhx\n", aux->mverm.c);
+        offset = aux->mverm.offset;
+        ptr = longgrab_vermicelliExec(aux->mverm.c, 1, ptr, end, aux->mverm.len);
+        break;
+    case ACCEL_MSVERM:
+        DEBUG_PRINTF("shift vermicelli for 0x%02hhx\n", aux->mverm.c);
+        offset = aux->mverm.offset;
+        ptr = shift_vermicelliExec(aux->mverm.c, 0, ptr, end, aux->mverm.len);
+        break;
+    case ACCEL_MSVERM_NOCASE:
+        DEBUG_PRINTF("shift vermicelli-nocase for 0x%02hhx\n", aux->mverm.c);
+        offset = aux->mverm.offset;
+        ptr = shift_vermicelliExec(aux->mverm.c, 1, ptr, end, aux->mverm.len);
+        break;
+    case ACCEL_MSGVERM:
+        DEBUG_PRINTF("shift grab vermicelli for 0x%02hhx\n", aux->mverm.c);
+        offset = aux->mverm.offset;
+        ptr = shiftgrab_vermicelliExec(aux->mverm.c, 0, ptr, end, aux->mverm.len);
+        break;
+    case ACCEL_MSGVERM_NOCASE:
+        DEBUG_PRINTF("shift grab vermicelli-nocase for 0x%02hhx\n", aux->mverm.c);
+        offset = aux->mverm.offset;
+        ptr = shiftgrab_vermicelliExec(aux->mverm.c, 1, ptr, end, aux->mverm.len);
+        break;
+    case ACCEL_MDSVERM: /* double-shift variants read the mdverm union member */
+        DEBUG_PRINTF("double shift vermicelli for 0x%02hhx\n", aux->mdverm.c);
+        offset = aux->mdverm.offset;
+        ptr = doubleshift_vermicelliExec(aux->mdverm.c, 0, ptr, end, aux->mdverm.len1, aux->mdverm.len2);
+        break;
+    case ACCEL_MDSVERM_NOCASE:
+        DEBUG_PRINTF("double shift vermicelli-nocase for 0x%02hhx\n", aux->mdverm.c);
+        offset = aux->mdverm.offset;
+        ptr = doubleshift_vermicelliExec(aux->mdverm.c, 1, ptr, end, aux->mdverm.len1, aux->mdverm.len2);
+        break;
+    case ACCEL_MDSGVERM:
+        DEBUG_PRINTF("double shift grab vermicelli for 0x%02hhx\n", aux->mdverm.c);
+        offset = aux->mdverm.offset;
+        ptr = doubleshiftgrab_vermicelliExec(aux->mdverm.c, 0, ptr, end, aux->mdverm.len1, aux->mdverm.len2);
+        break;
+    case ACCEL_MDSGVERM_NOCASE:
+        DEBUG_PRINTF("double shift grab vermicelli-nocase for 0x%02hhx\n", aux->mdverm.c);
+        offset = aux->mdverm.offset;
+        ptr = doubleshiftgrab_vermicelliExec(aux->mdverm.c, 1, ptr, end, aux->mdverm.len1, aux->mdverm.len2);
+        break;
     case ACCEL_SHUFTI:
         DEBUG_PRINTF("single shufti\n");
         offset = aux->shufti.offset;
diff --git a/src/nfa/multivermicelli.c b/src/nfa/multivermicelli.c
new file mode 100644
index 00000000..ab6d2cf2
--- /dev/null
+++ b/src/nfa/multivermicelli.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "ue2common.h" + +#include "multivermicelli.h" + +#include "multiaccel_common.h" + +#if !defined(__AVX2__) + +#define MATCH_ALGO long_ +#include "multiaccel_long.h" +#include "multivermicelli_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO longgrab_ +#include "multiaccel_longgrab.h" +#include "multivermicelli_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shift_ +#include "multiaccel_shift.h" +#include "multivermicelli_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shiftgrab_ +#include "multiaccel_shiftgrab.h" +#include "multivermicelli_sse.h" +#undef MATCH_ALGO + +#define MULTIACCEL_DOUBLE + +#define MATCH_ALGO doubleshift_ +#include "multiaccel_doubleshift.h" +#include "multivermicelli_sse.h" +#undef MATCH_ALGO + +#define MATCH_ALGO doubleshiftgrab_ +#include "multiaccel_doubleshiftgrab.h" +#include "multivermicelli_sse.h" +#undef MATCH_ALGO + +#undef MULTIACCEL_DOUBLE + +#else + +#define MATCH_ALGO long_ +#include "multiaccel_long.h" +#include "multivermicelli_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO longgrab_ +#include "multiaccel_longgrab.h" +#include "multivermicelli_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shift_ +#include "multiaccel_shift.h" +#include "multivermicelli_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO shiftgrab_ +#include "multiaccel_shiftgrab.h" +#include "multivermicelli_avx2.h" +#undef MATCH_ALGO + +#define MULTIACCEL_DOUBLE + +#define MATCH_ALGO 
doubleshift_ +#include "multiaccel_doubleshift.h" +#include "multivermicelli_avx2.h" +#undef MATCH_ALGO + +#define MATCH_ALGO doubleshiftgrab_ +#include "multiaccel_doubleshiftgrab.h" +#include "multivermicelli_avx2.h" +#undef MATCH_ALGO + +#undef MULTIACCEL_DOUBLE + +#endif diff --git a/src/nfa/multivermicelli.h b/src/nfa/multivermicelli.h new file mode 100644 index 00000000..55f9b1f2 --- /dev/null +++ b/src/nfa/multivermicelli.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MULTIVERMICELLI_H_ +#define MULTIVERMICELLI_H_ + +#ifdef __cplusplus +extern "C" +{ +#endif + +const u8 *long_vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *longgrab_vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *shift_vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *shiftgrab_vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end, const u8 run_len); + +const u8 *doubleshift_vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end, const u8 run_len, + const u8 run2_len); + +const u8 *doubleshiftgrab_vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end, const u8 run_len, + const u8 run2_len); + +#ifdef __cplusplus +} +#endif + + +#endif /* MULTIVERMICELLI_H_ */ diff --git a/src/nfa/multivermicelli_avx2.h b/src/nfa/multivermicelli_avx2.h new file mode 100644 index 00000000..9081aa3f --- /dev/null +++ b/src/nfa/multivermicelli_avx2.h @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "util/bitutils.h"
+#include "util/simd_utils.h"
+#include "util/unaligned.h"
+
+#include "multiaccel_common.h"
+
+static really_inline
+const u8 *JOIN(MATCH_ALGO, vermUnalignNocase)(m256 chars,
+        const u8 *buf,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    const u8 *hit;
+    m256 case_clear = set32x8(CASE_CLEAR);
+    m256 block = and256(case_clear, loadu256(buf));
+    u32 mask = movemask256(eq256(chars, block));
+    hit = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+            (buf, mask
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(hit)) {
+        return hit;
+    }
+    return NULL;
+}
+
+static really_inline
+const u8 *JOIN(MATCH_ALGO, vermUnalign)(m256 chars,
+        const u8 *buf,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    const u8 *hit;
+
+    m256 block = loadu256(buf);
+    u32 mask = movemask256(eq256(chars, block));
+    hit = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+            (buf, mask
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(hit)) {
+        return hit;
+    }
+    return NULL;
+}
+
+/*
+ * Case-sensitive scan in 32-byte steps; a block's mask is tested one step late
+ */
+static really_inline
+const u8 *JOIN(MATCH_ALGO, vermPipeline)(m256 chars,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    const u8 *hit, *prev_buf;
+    u32 prev_mask;
+
+    // prologue: compute the mask for the first 32-byte block
+    m256 block = load256(buf);
+    u32 mask = movemask256(eq256(chars, block));
+    prev_buf = buf;
+    prev_mask = mask;
+    buf += 32;
+
+    // steady state: load the next block, then test the previous block's mask
+    assert((size_t)buf % 32 == 0);
+    for (; buf + 31 < buf_end; buf += 32) {
+        // compute the mask for the current block
+        block = load256(buf);
+        mask = movemask256(eq256(chars, block));
+
+        // run the matcher over the block scanned one iteration ago
+        hit = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+                (prev_buf, prev_mask
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(hit)) {
+            return hit;
+        }
+        prev_mask = mask;
+        prev_buf = buf;
+    }
+    assert(buf <= buf_end && buf >= buf_end - 32);
+
+    // epilogue: the last mask computed has not been tested yet
+    hit = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+            (prev_buf, prev_mask
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(hit)) {
+        return hit;
+    }
+
+    return NULL;
+}
+
+/*
+ * Caseless scan in 32-byte steps; data is ANDed with CASE_CLEAR before compare
+ */
+static really_inline
+const u8 *JOIN(MATCH_ALGO, vermPipelineNocase)(m256 chars,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    m256 case_clear = set32x8(CASE_CLEAR);
+    const u8 *hit, *prev_buf;
+    u32 prev_mask;
+
+    // prologue: compute the mask for the first 32-byte block
+    m256 block = load256(buf);
+    u32 mask = movemask256(eq256(chars, and256(case_clear, block)));
+    prev_buf = buf;
+    prev_mask = mask;
+    buf += 32;
+
+
+    // steady state: load the next block, then test the previous block's mask
+    assert((size_t)buf % 32 == 0);
+    for (; buf + 31 < buf_end; buf += 32) {
+        // compute the mask for the current block
+        block = load256(buf);
+        mask = movemask256(eq256(chars, and256(case_clear, block)));
+
+        // run the matcher over the block scanned one iteration ago
+        hit = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+                (prev_buf, prev_mask
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(hit)) {
+            return hit;
+        }
+        prev_mask = mask;
+        prev_buf = buf;
+    }
+    assert(buf <= buf_end && buf >= buf_end - 32);
+
+    // epilogue: the last mask computed has not been tested yet
+    hit = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+            (prev_buf, prev_mask
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(hit)) {
+        return hit;
+    }
+
+    return NULL;
+}
+
+const u8 *JOIN(MATCH_ALGO, vermicelliExec)(char c, char nocase,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n",
+                 nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
+    assert(buf < buf_end);
+
+    const u8 *hit;
+
+    // Under 32 bytes: byte-wise search for c; run_len is not consulted here.
+    if (buf_end - buf < 32) {
+        for (; buf < buf_end; buf++) {
+            char byte = (char)*buf;
+            if (nocase) {
+                byte &= CASE_CLEAR;
+            }
+            if (byte == c) {
+                return buf;
+            }
+        }
+        return buf_end;
+    }
+
+    m256 chars = set32x8(c); /* nocase already uppercase */
+
+    uintptr_t misalign = (uintptr_t)buf % 32;
+
+    if (misalign != 0) {
+        hit = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
+                buf, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
+                buf, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(hit)) {
+            return hit;
+        }
+        buf += 32 - misalign;
+    }
+
+    if (buf_end - buf >= 32) {
+        hit = nocase ? JOIN(MATCH_ALGO, vermPipelineNocase)(chars,
+                buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                ) : JOIN(MATCH_ALGO, vermPipeline)(chars,
+                buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(hit)) {
+            return hit;
+        }
+    }
+
+    // tail: one unaligned load covering the last 32 bytes of the buffer
+    hit = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
+            buf_end - 32, run_len
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
+            buf_end - 32, run_len
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+
+    // nothing found in any stage: report the end of the buffer
+    return hit != NULL ? hit : buf_end;
+}
diff --git a/src/nfa/multivermicelli_sse.h b/src/nfa/multivermicelli_sse.h
new file mode 100644
index 00000000..cdacd2c4
--- /dev/null
+++ b/src/nfa/multivermicelli_sse.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "util/bitutils.h"
+#include "util/simd_utils.h"
+#include "util/unaligned.h"
+
+#define VERM_BOUNDARY 16
+#define VERM_TYPE m128
+#define VERM_SET_FN set16x8
+
+#include "multiaccel_common.h"
+
+static really_inline
+const u8 *JOIN(MATCH_ALGO, vermUnalignNocase)(m128 chars,
+        const u8 *buf,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    m128 casemask = set16x8(CASE_CLEAR);
+    const u8 *ptr;
+    m128 data = loadu128(buf);
+    u32 z = movemask128(eq128(chars, and128(casemask, data)));
+    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
+            (buf, z
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(ptr)) {
+        return ptr;
+    }
+    return NULL;
+}
+
+static really_inline
+const u8 *JOIN(MATCH_ALGO, vermUnalign)(m128 chars,
+        const u8 *buf,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    const u8 *ptr;
+
+    m128 data = loadu128(buf);
+    u32 z = movemask128(eq128(chars, data));
+    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
+            (buf, z
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(ptr)) {
+        return ptr;
+    }
+    return NULL;
+}
+
+/*
+ * 16-byte pipeline, for smaller scans
+ */
+static
+const u8 *JOIN(MATCH_ALGO, vermPipeline16)(m128 chars,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    const u8 *ptr, *last_buf;
+    u32 last_res;
+
+    // pipeline prologue: scan first 16 bytes
+    m128 data = load128(buf);
+    u32 z = movemask128(eq128(chars, data));
+    last_buf = buf;
+    last_res = z;
+    buf += 16;
+
+    // now, start the pipeline!
+    assert((size_t)buf % 16 == 0);
+    for (; buf + 15 < buf_end; buf += 16) {
+        // scan more data
+        data = load128(buf);
+        z = movemask128(eq128(chars, data));
+
+        // do a comparison on previous result
+        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
+                (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(ptr)) {
+            return ptr;
+        }
+        last_buf = buf;
+        last_res = z;
+    }
+    assert(buf <= buf_end && buf >= buf_end - 16);
+
+    // epilogue: compare final results
+    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
+            (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(ptr)) {
+        return ptr;
+    }
+
+    return NULL;
+}
+
+/*
+ * 16-byte caseless pipeline, for smaller scans
+ */
+static
+const u8 *JOIN(MATCH_ALGO, vermPipeline16Nocase)(m128 chars,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    m128 casemask = set16x8(CASE_CLEAR);
+    const u8 *ptr, *last_buf;
+    u32 last_res;
+
+    // pipeline prologue: scan first 16 bytes
+    m128 data = load128(buf);
+    u32 z = movemask128(eq128(chars, and128(casemask, data)));
+    last_buf = buf;
+    last_res = z;
+    buf += 16;
+
+    // now, start the pipeline!
+    assert((size_t)buf % 16 == 0);
+    for (; buf + 15 < buf_end; buf += 16) {
+        // scan more data
+        data = load128(buf);
+        z = movemask128(eq128(chars, and128(casemask, data)));
+
+        // do a comparison on previous result
+        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
+                (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(ptr)) {
+            return ptr;
+        }
+        last_buf = buf;
+        last_res = z;
+    }
+    assert(buf <= buf_end && buf >= buf_end - 16);
+
+    // epilogue: compare final results
+    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
+            (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(ptr)) {
+        return ptr;
+    }
+
+    return NULL;
+}
+
+/*
+ * 32-byte pipeline, for bigger scans
+ */
+static
+const u8 *JOIN(MATCH_ALGO, vermPipeline32)(m128 chars,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    const u8 *ptr, *last_buf;
+    u32 res;
+
+    // pipeline prologue: scan first 32 bytes
+    m128 data1 = load128(buf);
+    u32 z1 = movemask128(eq128(chars, data1));
+    m128 data2 = load128(buf + 16);
+    u32 z2 = movemask128(eq128(chars, data2));
+
+    // store the results: second vector's mask goes above the first's
+    u32 last_res = z1 | (z2 << VERM_BOUNDARY);
+    last_buf = buf;
+    buf += 32;
+
+
+    // now, start the pipeline!
+    assert((size_t)buf % 16 == 0);
+    for (; buf + 31 < buf_end; buf += 32) {
+        // scan more data
+        data1 = load128(buf);
+        z1 = movemask128(eq128(chars, data1));
+        data2 = load128(buf + 16);
+        z2 = movemask128(eq128(chars, data2));
+        res = z1 | (z2 << VERM_BOUNDARY); // same packing as the prologue
+
+        // do a comparison on previous result
+        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+                (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(ptr)) {
+            return ptr;
+        }
+        last_res = res;
+        last_buf = buf;
+    }
+
+    // epilogue: compare final results
+    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+            (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(ptr)) {
+        return ptr;
+    }
+
+    // if we still have some data left, scan it too
+    if (buf + 15 < buf_end) {
+        return JOIN(MATCH_ALGO, vermPipeline16)(chars, buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+    }
+    assert(buf <= buf_end && buf >= buf_end - 16);
+
+    return NULL;
+}
+
+/*
+ * 32-byte caseless pipeline, for bigger scans
+ */
+static
+const u8 *JOIN(MATCH_ALGO, vermPipeline32Nocase)(m128 chars,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    m128 casemask = set16x8(CASE_CLEAR);
+    const u8 *ptr, *last_buf;
+    u32 last_res;
+
+    // pipeline prologue: scan first 32 bytes
+    m128 data1 = load128(buf);
+    u32 z1 = movemask128(eq128(chars, and128(casemask, data1)));
+    m128 data2 = load128(buf + 16);
+    u32 z2 = movemask128(eq128(chars, and128(casemask, data2)));
+    u32 z = z1 | (z2 << VERM_BOUNDARY);
+
+    last_res = z;
+    last_buf = buf;
+    buf += 32;
+
+    // now, start the pipeline!
+    assert((size_t)buf % 16 == 0);
+    for (; buf + 31 < buf_end; buf += 32) {
+        // scan more data
+        data1 = load128(buf);
+        z1 = movemask128(eq128(chars, and128(casemask, data1)));
+        data2 = load128(buf + 16);
+        z2 = movemask128(eq128(chars, and128(casemask, data2)));
+        z = z1 | (z2 << VERM_BOUNDARY); // same packing as the prologue
+
+        // do a comparison on previous result
+        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+                (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(ptr)) {
+            return ptr;
+        }
+        last_res = z;
+        last_buf = buf;
+    }
+
+    // epilogue: compare final results
+    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
+            (last_buf, last_res
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+    if (unlikely(ptr)) {
+        return ptr;
+    }
+
+    // if we still have some data left, scan it too
+    if (buf + 15 < buf_end) {
+        return JOIN(MATCH_ALGO, vermPipeline16Nocase)(chars, buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+    }
+    assert(buf <= buf_end && buf >= buf_end - 16);
+
+    return NULL;
+}
+
+const u8 *JOIN(MATCH_ALGO, vermicelliExec)(char c, char nocase,
+        const u8 *buf,
+        const u8 *buf_end,
+        const u8 run_len
+#ifdef MULTIACCEL_DOUBLE
+        , const u8 run_len2
+#endif
+        ) {
+    DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n",
+                 nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
+    assert(buf < buf_end);
+
+    const u8 *ptr;
+
+    // Handle small scans: byte-wise search for c; run_len is not consulted.
+    if (buf_end - buf < VERM_BOUNDARY) {
+        for (; buf < buf_end; buf++) {
+            char cur = (char)*buf;
+            if (nocase) {
+                cur &= CASE_CLEAR;
+            }
+            if (cur == c) {
+                break;
+            }
+        }
+        return buf;
+    }
+
+    VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */
+
+    uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY;
+
+    if (min) {
+        ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
+                buf, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
+                buf, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(ptr)) {
+            return ptr;
+        }
+        buf += VERM_BOUNDARY - min;
+    }
+
+    // if we have enough data, run bigger pipeline; otherwise run smaller one
+    if (buf_end - buf >= 128) {
+        ptr = nocase ? JOIN(MATCH_ALGO, vermPipeline32Nocase)(chars,
+                buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                ) : JOIN(MATCH_ALGO, vermPipeline32)(chars,
+                buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(ptr)) {
+            return ptr;
+        }
+    } else if (buf_end - buf >= VERM_BOUNDARY) {
+        ptr = nocase ? JOIN(MATCH_ALGO, vermPipeline16Nocase)(chars,
+                buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                ) : JOIN(MATCH_ALGO, vermPipeline16)(chars,
+                buf, buf_end, run_len
+#ifdef MULTIACCEL_DOUBLE
+                , run_len2
+#endif
+                );
+        if (unlikely(ptr)) {
+            return ptr;
+        }
+    }
+
+    // final unaligned scan over the last VERM_BOUNDARY bytes of the buffer
+    ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
+            buf_end - VERM_BOUNDARY, run_len
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
+            buf_end - VERM_BOUNDARY, run_len
+#ifdef MULTIACCEL_DOUBLE
+            , run_len2
+#endif
+            );
+
+    // nothing found in any stage: report the end of the buffer
+    return ptr ? ptr : buf_end;
+}