From ed772380c05996cd71a77e18d6bec2081d49f0e0 Mon Sep 17 00:00:00 2001 From: Mohammad Abdul Awal Date: Thu, 31 Mar 2016 11:28:42 +0100 Subject: [PATCH] teddy: remove python codegen, refactor code Major cleanup of the Teddy runtime code. Removes python code generation, splits AVX2 models into their own file, improves readability. --- .gitignore | 4 - CMakeLists.txt | 14 +- src/fdr/CMakeLists.txt | 33 - src/fdr/arch.py | 58 -- src/fdr/autogen.py | 118 --- src/fdr/autogen_utils.py | 120 --- src/fdr/engine_description.h | 3 +- src/fdr/fdr.c | 30 +- src/fdr/fdr_engine_description.cpp | 2 +- src/fdr/teddy.c | 744 +++++++++++++---- src/fdr/teddy.h | 108 +++ src/fdr/teddy_autogen.py | 773 ------------------ src/fdr/teddy_avx2.c | 1110 ++++++++++++++++++++++++++ src/fdr/teddy_engine_description.cpp | 27 +- src/fdr/teddy_runtime_common.h | 256 ++++++ 15 files changed, 2114 insertions(+), 1286 deletions(-) delete mode 100644 src/fdr/CMakeLists.txt delete mode 100755 src/fdr/arch.py delete mode 100755 src/fdr/autogen.py delete mode 100755 src/fdr/autogen_utils.py create mode 100644 src/fdr/teddy.h delete mode 100755 src/fdr/teddy_autogen.py create mode 100644 src/fdr/teddy_avx2.c create mode 100644 src/fdr/teddy_runtime_common.h diff --git a/.gitignore b/.gitignore index 6e50ce45..4d984534 100644 --- a/.gitignore +++ b/.gitignore @@ -46,10 +46,6 @@ sqlite3 src/config.h src/config.h.in src/hs_version.h -src/fdr/fdr_autogen.c -src/fdr/fdr_autogen_compiler.cpp -src/fdr/teddy_autogen.c -src/fdr/teddy_autogen_compiler.cpp src/parser/Parser.cpp # Generated PCRE files diff --git a/CMakeLists.txt b/CMakeLists.txt index ad7bb3f9..2bc68474 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,11 +357,6 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") -# include the autogen targets -add_subdirectory(src/fdr) - -include_directories(${PROJECT_BINARY_DIR}/src/fdr) - if(NOT WIN32) set(RAGEL_C_FLAGS "-Wno-unused") 
endif() @@ -381,8 +376,6 @@ SET(hs_HEADERS ) install(FILES ${hs_HEADERS} DESTINATION include/hs) -set(fdr_autogen_targets autogen_runtime autogen_teddy_runtime) - set (hs_exec_SRCS ${hs_HEADERS} src/hs_version.h @@ -400,7 +393,10 @@ set (hs_exec_SRCS src/fdr/flood_runtime.h src/fdr/fdr_loadval.h src/fdr/teddy.c + src/fdr/teddy_avx2.c + src/fdr/teddy.h src/fdr/teddy_internal.h + src/fdr/teddy_runtime_common.h src/hwlm/hwlm.c src/hwlm/hwlm.h src/hwlm/hwlm_internal.h @@ -929,11 +925,9 @@ set (LIB_VERSION ${HS_VERSION}) set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}) add_library(hs_exec OBJECT ${hs_exec_SRCS}) -add_dependencies(hs_exec ${fdr_autogen_targets}) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) -add_dependencies(hs_exec_shared ${fdr_autogen_targets}) set_target_properties(hs_exec_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) endif() @@ -964,7 +958,6 @@ endif() add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>) add_dependencies(hs ragel_Parser) -add_dependencies(hs autogen_teddy_compiler) if (NOT BUILD_SHARED_LIBS) install(TARGETS hs DESTINATION lib) @@ -973,7 +966,6 @@ endif() if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>) add_dependencies(hs_shared ragel_Parser) - add_dependencies(hs_shared autogen_compiler autogen_teddy_compiler) set_target_properties(hs_shared PROPERTIES OUTPUT_NAME hs VERSION ${LIB_VERSION} diff --git a/src/fdr/CMakeLists.txt b/src/fdr/CMakeLists.txt deleted file mode 100644 index 7bbf82ff..00000000 --- a/src/fdr/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# The set of rules and other nastiness for generating FDR/Teddy source - -# we need to add these as explicit dependencies -set(AUTOGEN_PY_FILES - arch.py - autogen.py - autogen_utils.py - teddy_autogen.py -) - -function(fdr_autogen type out) - add_custom_command ( - COMMENT "AUTOGEN ${out}" - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${out}" - COMMAND ${PYTHON}
"${CMAKE_CURRENT_SOURCE_DIR}/autogen.py" ${type} > "${CMAKE_CURRENT_BINARY_DIR}/${out}" - DEPENDS ${AUTOGEN_PY_FILES} - ) - add_custom_target(autogen_${type} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${out}") -endfunction(fdr_autogen) - -#now build the functions -fdr_autogen(runtime fdr_autogen.c) -fdr_autogen(teddy_runtime teddy_autogen.c) -fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp) - -set(fdr_GENERATED_SRC - ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c - ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c - ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp - PARENT_SCOPE) - -set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE) -include_directories(${CMAKE_CURRENT_BINARY_DIR}) diff --git a/src/fdr/arch.py b/src/fdr/arch.py deleted file mode 100755 index 83a31254..00000000 --- a/src/fdr/arch.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2015, Intel Corporation -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import autogen_utils - -# wrapper for architectures - -class Arch: - def __init__(self, name, extensions = []): - self.name = name - self.extensions = extensions - self.target = None - - def get_guard(self): - # these defines definitely fall into the "belt-and-suspenders" - # category of paranoia - if (self.guard_list == []): - return "#if 1" - - return "#if " + " && ".join(self.guard_list) - -class X86Arch(Arch): - def __init__(self, name, extensions = []): - Arch.__init__(self, name, extensions) - self.guard_list = [ ] - self.target = "0" - - if "AVX2" in extensions: - self.target += " | HS_CPU_FEATURES_AVX2" - self.guard_list += [ "defined(__AVX2__)" ] - - -arch_x86_64 = X86Arch("x86_64", extensions = [ ]) -arch_x86_64_avx2 = X86Arch("x86_64_avx2", extensions = [ "AVX2" ]) diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py deleted file mode 100755 index a8510487..00000000 --- a/src/fdr/autogen.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2015-2016, Intel Corporation -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import sys -from autogen_utils import * -from teddy_autogen import * -from arch import * - -# teddy setup - -def build_teddy_matchers(): - all_matchers = [ ] - - # AVX2 - all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ] - all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ] - for n_msk in range(1, 5): - all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ] - all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ] - - # SSE/SSE2/SSSE3 - for n_msk in range(1, 5): - all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ] - all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ] - - return all_matchers - -def produce_teddy_compiles(l): - print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {" - print " static const TeddyEngineDef defns[] = {" - for m in l: - m.produce_compile_call() - print " };" - print " out->clear();" - print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {" - print " out->push_back(TeddyEngineDescription(defns[i]));" - print " }" - print "}" - -# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they -# are linked. So we either generate the function or we don't - then at the point of the -# header in fdr_autogen.c we either generate the header or we #define the zero. - -def produce_teddy_runtimes(l): - # Since we're using -Wmissing-prototypes, we need headers first.
- for m in l: - m.produce_guard() - print m.produce_header(visible = True, header_only = True) - m.close_guard() - - for m in l: - m.produce_guard() - m.produce_code() - m.close_guard() - -# see produce_teddy_runtimes() comment for the rationale - -def produce_teddy_headers(l): - for m in l: - m.produce_guard() - print m.produce_header(visible = True, header_only = True) - m.produce_zero_alternative() - -# general utilities - -def make_fdr_function_pointers(matcher_list): - print """ -typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a); -static FDRFUNCTYPE funcs[] = { -""" - all_funcs = " fdr_engine_exec,\n" - all_funcs += ",\n".join([ " %s" % m.get_name() for m in matcher_list ]) - print all_funcs - print """ -}; -""" - -def assign_ids(matcher_list, next_id): - for m in matcher_list: - m.id = next_id - next_id += 1 - return next_id - -# Main entry point - -tm = build_teddy_matchers() -next_id = assign_ids(tm, 1) -if sys.argv[1] == "runtime": - produce_teddy_headers(tm) - make_fdr_function_pointers(tm) -elif sys.argv[1] == "teddy_runtime": - produce_teddy_runtimes(tm) -elif sys.argv[1] == "teddy_compiler": - produce_teddy_compiles(tm) diff --git a/src/fdr/autogen_utils.py b/src/fdr/autogen_utils.py deleted file mode 100755 index 3544bc7b..00000000 --- a/src/fdr/autogen_utils.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2015-2016, Intel Corporation -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys - -def fail_out(msg = ""): - print >>sys.stderr, "Internal failure in autogen.py: " + msg - sys.exit(1) - -class IntegerType: - def __init__(self, size): - self.size = size - - def get_name(self): - return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size] - - def size_in_bytes(self): - return self.size / 8 - - def zero_expression(self): - return "0" - - def constant_to_string(self, n): - if self.size == 64: - suffix = "ULL" - else: - suffix = "" - return "0x%x%s" % (n & ((1 << self.size) - 1), suffix) - - def lowbits(self, n): - return (1 << n) - 1 - - def highbits(self, n): - return ~(self.lowbits(self.size - n)) - - def lowbit_mask(self, n): - return self.constant_to_string(self.lowbits(n)) - - def lowbit_extract_expr(self, expr_string, n): - return "(%s & %s)" % ( expr_string, self.lowbit_mask(n)) - - def flip_lowbits_expr(self, expr_string, n): - return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n)) - - def bit_extract_expr(self, expr_string, 
low, high): - lbm = self.lowbit_mask(high - low) - return "((%s >> %d) & %s)" % (expr_string, low, lbm) - - # shifts are +ve if left and -ve if right - def shift_expr(self, expr_string, n): - if n <= -self.size or n >= self.size: - return self.zero_expression() - elif (n > 0): - return "(%s << %d)" % (expr_string, n) - elif (n < 0): - return "(%s >> %d)" % (expr_string, -n) - else: - return "(%s)" % (expr_string) - -class SIMDIntegerType(IntegerType): - def __init__(self, size): - IntegerType.__init__(self, size) - - def zero_expression(self): - return "zeroes128()" - - def lowbit_extract_expr(self, expr_string, n): - if (n <= 32): - tmpType = IntegerType(32) - tmpExpr = "movd(%s)" % expr_string - elif (32 < n <= 64): - tmpType = IntegerType(64) - tmpExpr = "movq(%s)" % expr_string - return tmpType.lowbit_extract_expr(tmpExpr, n) - - def bit_extract_expr(self, expr_string, low, high, flip): - fail_out("Unimplemented bit extract on m128") - - def shift_expr(self, expr_string, n): - if n % 8 != 0: - fail_out("Trying to shift a m128 by a bit granular value") - - # should check that n is divisible by 8 - if n <= -self.size or n >= self.size: - return self.zero_expression() - elif (n > 0): - return "byteShiftLeft128(%s, %s)" % (expr_string, n / 8) - elif (n < 0): - return "byteShiftRight128(%s, %s)" % (expr_string, -n / 8) - else: - return "(%s)" % (expr_string) - - def lowbit_mask(self, n): - if n % 8 != 0: - fail_out("Trying to make a lowbit mask in a m128 by a bit granular value") - return self.shift_expr("ones128()", -(128 - n)) diff --git a/src/fdr/engine_description.h b/src/fdr/engine_description.h index 3c3026c3..09b16179 100644 --- a/src/fdr/engine_description.h +++ b/src/fdr/engine_description.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -55,6 
+55,7 @@ public: u32 getNumBuckets() const { return numBuckets; } u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; } u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; } + void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; } bool isValidOnTarget(const target_t &target_in) const; virtual u32 getDefaultFloodSuffixLength() const = 0; diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index bd7dbe83..51a041cc 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -33,6 +33,7 @@ #include "fdr_loadval.h" #include "fdr_streaming_runtime.h" #include "flood_runtime.h" +#include "teddy.h" #include "teddy_internal.h" #include "util/simd_utils.h" #include "util/simd_utils_ssse3.h" @@ -764,7 +765,34 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, return HWLM_SUCCESS; } -#include "fdr_autogen.c" +#if defined(__AVX2__) +#define ONLY_AVX2(func) func +#else +#define ONLY_AVX2(func) NULL +#endif + +typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a); +static const FDRFUNCTYPE funcs[] = { + fdr_engine_exec, + ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast), + ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast), + ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat), + ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat), + ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat), + ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat), + ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat), + ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat), + ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat), + ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat), + fdr_exec_teddy_msks1, + fdr_exec_teddy_msks1_pck, + fdr_exec_teddy_msks2, + fdr_exec_teddy_msks2_pck, + fdr_exec_teddy_msks3, + fdr_exec_teddy_msks3_pck, + fdr_exec_teddy_msks4, + fdr_exec_teddy_msks4_pck, +}; #define FAKE_HISTORY_SIZE 16 static const u8 fake_history[FAKE_HISTORY_SIZE]; diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp index 103bc214..5e923b08 100644 --- 
a/src/fdr/fdr_engine_description.cpp +++ b/src/fdr/fdr_engine_description.cpp @@ -57,7 +57,7 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const { void getFdrDescriptions(vector<FDREngineDescription> *out) { static const FDREngineDef def = {0, 128, 8, 0, 1, 256}; out->clear(); - out->push_back(FDREngineDescription(def)); + out->emplace_back(def); } static diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 11df9d69..08b761c0 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,11 +26,19 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "config.h" +/** \file + * \brief Teddy literal matcher: SSSE3 engine runtime. + */ + +#include "fdr_internal.h" +#include "flood_runtime.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "teddy_runtime_common.h" #include "util/simd_utils.h" #include "util/simd_utils_ssse3.h" -static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { +const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -67,178 +75,584 @@ static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} }; -// Note: p_mask is an output param that initialises a poison mask.
-UNUSED static really_inline -m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history, - const u32 nMasks) { - union { - u8 val8[16]; - m128 val128; - } u; - u.val128 = zeroes128(); +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(isnonzero128(var))) { \ + u64a lo = movq(var); \ + u64a hi = movq(byteShiftRight128(var, 8)); \ + if (unlikely(lo)) { \ + conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(hi)) { \ + conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + } \ +} while (0); +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(isnonzero128(var))) { \ + u32 part1 = movd(var); \ + u32 part2 = movd(byteShiftRight128(var, 4)); \ + u32 part3 = movd(byteShiftRight128(var, 8)); \ + u32 part4 = movd(byteShiftRight128(var, 12)); \ + if (unlikely(part1)) { \ + conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part2)) { \ + conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part3)) { \ + conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part4)) { \ + conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + } \ +} while (0); +#endif - if (ptr >= lo) { - u32 avail = (u32)(hi - ptr); - if (avail >= 16) { - *p_mask = load128((const void*)(p_mask_arr[16] + 16)); - return loadu128(ptr); - } - *p_mask = load128((const void*)(p_mask_arr[avail] + 16)); - for (u32 i = 0; i < avail; i++) { - 
u.val8[i] = ptr[i]; - } - } else { - u32 need = MIN((u32)(lo - ptr), MIN(len_history, nMasks - 1)); - u32 start = (u32)(lo - ptr); - u32 i; - for (i = start - need; ptr + i < lo; i++) { - u.val8[i] = buf_history[len_history - (lo - (ptr + i))]; - } - u32 end = MIN(16, (u32)(hi - ptr)); - *p_mask = loadu128((const void*)(p_mask_arr[end - start] + 16 - start)); - for (; i < end; i++) { - u.val8[i] = ptr[i]; - } +static really_inline +m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift2x64(val, 4), mask); + return and128(and128(pshufb(maskBase[0*2], lo), + pshufb(maskBase[0*2+1], hi)), p_mask); +} + +static really_inline +m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask, + m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift2x64(val, 4), mask); + m128 r = prep_conf_teddy_m1(maskBase, p_mask, val); + + m128 res_1 = and128(pshufb(maskBase[1*2], lo), + pshufb(maskBase[1*2+1], hi)); + m128 res_shifted_1 = palignr(res_1, *old_1, 16-1); + *old_1 = res_1; + return and128(and128(r, p_mask), res_shifted_1); +} + +static really_inline +m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, + m128 p_mask, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift2x64(val, 4), mask); + m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val); + + m128 res_2 = and128(pshufb(maskBase[2*2], lo), + pshufb(maskBase[2*2+1], hi)); + m128 res_shifted_2 = palignr(res_2, *old_2, 16-2); + *old_2 = res_2; + return and128(r, res_shifted_2); +} + +static really_inline +m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, + m128 *old_3, m128 p_mask, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift2x64(val, 4), mask); + m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val); + + m128 res_3 = 
and128(pshufb(maskBase[3*2], lo), + pshufb(maskBase[3*2+1], hi)); + m128 res_shifted_3 = palignr(res_3, *old_3, 16-3); + *old_3 = res_3; + return and128(r, res_shifted_3); +} + +hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 1); + + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); + ptr += 16; } - return u.val128; -} - - -#if defined(__AVX2__) - -UNUSED static really_inline -m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history, - const u32 nMasks) { - m128 p_mask128; - m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, len_history, nMasks)); - *p_mask = set2x128(p_mask128); - return ret; -} - -static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, 
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} -}; - - -UNUSED static really_inline -m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history) { - union { - u8 val8[32]; - m256 val256; - } u; - - if (ptr >= lo) { - u32 avail = (u32)(hi - ptr); - if (avail >= 32) { - *p_mask = load256((const void*)(p_mask_arr256[32] + 32)); - return loadu256(ptr); - } - *p_mask = load256((const void*)(p_mask_arr256[avail] + 32)); - for (u32 i = 0; i < avail; i++) { - u.val8[i] = ptr[i]; - } - } else { - // need contains "how many chars to pull from history" - // calculate based on what we need, what we have in the buffer - // and only what we need to make primary confirm work - u32 start = (u32)(lo - ptr); - u32 i; - for (i = start; ptr + i < lo; i++) { - u.val8[i] = buf_history[len_history - (lo - (ptr + i))]; - } - u32 end = MIN(32, (u32)(hi - ptr)); - *p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start)); - for (; i < end; i++) { - u.val8[i] = ptr[i]; - } + if (ptr + 16 < buf_end) { + m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); + ptr += 16; } - return u.val256; + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, 
do_confWithBit1_teddy); + m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; } +hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); -#endif // __AVX2__ + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 1); -#define P0(cnd) unlikely(cnd) + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } -#include "fdr.h" -#include "fdr_internal.h" -#include "flood_runtime.h" + if (ptr + 16 < buf_end) { + m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } -#include "fdr_confirm.h" -#include "fdr_confirm_runtime.h" + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + 
__builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); + } -#include "fdr_loadval.h" -#include "util/bitutils.h" -#include "teddy_internal.h" + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} -#include "teddy_autogen.c" +hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 2); + + m128 res_old_1 = ones128(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), + load128(ptr)); + 
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), + load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); + m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), + load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 2); + + m128 res_old_1 = ones128(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { 
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), + load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), + load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(), + load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 3); + + m128 res_old_1 = ones128(); + m128 res_old_2 = ones128(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); 
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); + m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones128(), load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 3); + + m128 res_old_1 = ones128(); + m128 res_old_2 = ones128(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + 
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones128(), load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 4); + + m128 res_old_1 = ones128(); + m128 res_old_2 = ones128(); + m128 res_old_3 = 
ones128(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 4); + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); + m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones128(), load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 4); + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; 
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m128 *maskBase = getMaskBase(teddy); + const u32 *confBase = getConfBase(teddy, 4); + + m128 res_old_1 = ones128(); + m128 res_old_2 = ones128(); + m128 res_old_3 = ones128(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 4); + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones128(), load128(ptr)); + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones128(), load128(ptr + 16)); + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m128 p_mask; + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 4); + m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, p_mask, val_0); + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h new file mode 100644 index 00000000..a0377f60 --- /dev/null +++ b/src/fdr/teddy.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution 
and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: function declarations. 
+ */ + +#ifndef TEDDY_H_ +#define TEDDY_H_ + +#include "hwlm/hwlm.h" + +struct FDR; // forward declaration from fdr_internal.h +struct FDR_Runtime_Args; + +hwlm_error_t fdr_exec_s1_w128(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_s2_w128(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_s4_w128(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +#if defined(__AVX2__) + +hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args 
*a); + +hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, + const struct FDR_Runtime_Args *a); + +#endif /* __AVX2__ */ + +#endif /* TEDDY_H_ */ diff --git a/src/fdr/teddy_autogen.py b/src/fdr/teddy_autogen.py deleted file mode 100755 index 1cada00c..00000000 --- a/src/fdr/teddy_autogen.py +++ /dev/null @@ -1,773 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2015-2016, Intel Corporation -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys -from autogen_utils import * -from string import Template - -class MT: - def produce_header(self, visible, header_only = False): - s = "" - if not visible: - s += "static never_inline" - s += """ -hwlm_error_t %s(UNUSED const struct FDR *fdr, - UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name() - if header_only: - s += ";" - else: - s += "{" - s += "\n" - return s - - def produce_guard(self): - print self.arch.get_guard() - - def produce_zero_alternative(self): - print """ -#else -#define %s 0 -#endif -""" % self.get_name() - - def close_guard(self): - print "#endif" - - def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False): - if cautious: - caution_string = "VECTORING" - else: - caution_string = "NOT_CAUTIOUS" - conf_split_mask = IntegerType(32).constant_to_string( - self.conf_top_level_split - 1) - if enable_confirmless: - quick_check_string = """ - if (!fdrc->mult) { - u32 id = fdrc->nBitsOrSoleID; - if ((last_match == id) && (fdrc->flags & NoRepeat)) - continue; - last_match = id; - controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt); - continue; - } """ - else: - quick_check_string = "" - if do_bailout: - bailout_string = """ - if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;""" - else: - bailout_string = "" - - return Template(""" -if (P0(!!$CONFVAR)) { - do { - u32 bit = 
findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR); - u32 byte = bit / $NUM_BUCKETS + $OFFSET; - u32 bitRem = bit % $NUM_BUCKETS; - $BAILOUT_STRING - u32 confSplit = *(ptr+byte) & $SPLIT_MASK; - u32 idx = confSplit * $NUM_BUCKETS + bitRem; - u32 cf = confBase[idx]; - if (!cf) - continue; - fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) - continue; - $QUICK_CHECK_STRING - CautionReason reason = $CAUTION_STRING; - CONF_TYPE v; - const u8 * confirm_loc = ptr + byte - $CONF_PULL_BACK - 7; - if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { - v = lv_u64a(confirm_loc, buf, buf + len); - } else { // r == VECTORING, confirm_loc < buf - u64a histBytes = a->histBytes; - v = lv_u64a_ce(confirm_loc, buf, buf + len); - // stitch together v (which doesn't move) and history (which does) - u32 overhang = buf - confirm_loc; - histBytes >>= 64 - (overhang * 8); - v |= histBytes; - } - confWithBit(fdrc, a, ptr - buf + byte, $CONF_PULL_BACK, control, &last_match, v); - } while(P0(!!$CONFVAR)); - if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { - *a->groups = controlVal; - return HWLM_TERMINATED; - } -}""").substitute(CONFVAR = conf_var_name, - CONFVAR_SIZE = conf_var_size, - NUM_BUCKETS = self.num_buckets, - OFFSET = offset, - SPLIT_MASK = conf_split_mask, - QUICK_CHECK_STRING = quick_check_string, - BAILOUT_STRING = bailout_string, - CAUTION_STRING = caution_string, - CONF_PULL_BACK = self.conf_pull_back) - - def produce_confirm(self, iter, var_name, offset, bits, cautious = True): - if self.packed: - print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False) - else: - if cautious: - caution_string = "VECTORING" - else: - caution_string = "NOT_CAUTIOUS" - - print " if (P0(!!%s)) {" % var_name - print " do {" - if bits == 64: - print " u32 bit = findAndClearLSB_64(&%s);" % (var_name) - else: - print " u32 bit = findAndClearLSB_32(&%s);" % (var_name) - print " u32 byte = 
bit / %d + %d;" % (self.num_buckets, iter*16 + offset) - print " u32 idx = bit %% %d;" % self.num_buckets - print " u32 cf = confBase[idx];" - print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);" - print " if (!(fdrc->groups & *control))" - print " continue;" - print """ - CautionReason reason = %s; - CONF_TYPE v; - const u8 * confirm_loc = ptr + byte - 7; - if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { - v = lv_u64a(confirm_loc, buf, buf + len); - } else { // r == VECTORING, confirm_loc < buf - u64a histBytes = a->histBytes; - v = lv_u64a_ce(confirm_loc, buf, buf + len); - // stitch together v (which doesn't move) and history (which does) - u32 overhang = buf - confirm_loc; - histBytes >>= 64 - (overhang * 8); - v |= histBytes; - }""" % (caution_string) - if self.num_masks == 1: - print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);" - else: - print " confWithBitMany(fdrc, a, ptr - buf + byte, %s, control, &last_match, v);" % (caution_string) - print " } while(P0(!!%s));" % var_name - print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {" - print " *a->groups = controlVal;" - print " return HWLM_TERMINATED;" - print " }" - print " }" - - def produce_needed_temporaries(self, max_iterations): - print " m128 p_mask;" - for iter in range(0, max_iterations): - print " m128 val_%d;" % iter - print " m128 val_%d_lo;" % iter - print " m128 val_%d_hi;" % iter - for x in range(self.num_masks): - print " m128 res_%d_%d;" % (iter, x) - if x != 0: - print " m128 res_shifted_%d_%d;" % (iter, x) - print " m128 r_%d;" % iter - print "#ifdef ARCH_64_BIT" - print " u64a r_%d_lopart;" % iter - print " u64a r_%d_hipart;" % iter - print "#else" - print " u32 r_%d_part1;" % iter - print " u32 r_%d_part2;" % iter - print " u32 r_%d_part3;" % iter - print " u32 r_%d_part4;" % iter - print "#endif" - - def produce_one_iteration_state_calc(self, iter, effective_num_iterations, - cautious, save_old): - if cautious: - print " val_%d = 
vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks) - else: - print " val_%d = load128(ptr + %d);" % (iter, iter*16) - print " val_%d_lo = and128(val_%d, lomask);" % (iter, iter) - print " val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter) - print " val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter) - print - for x in range(self.num_masks): - print Template(""" - res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2] , val_${ITER}_lo), - pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x) - if x != 0: - if iter == 0: - print " res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x) - else: - print " res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x) - if x != 0 and iter == effective_num_iterations - 1 and save_old: - print " res_old_%d = res_%d_%d;" % (x, iter, x) - print - if cautious: - print " r_%d = and128(res_%d_0, p_mask);" % (iter, iter) - else: - print " r_%d = res_%d_0;" % (iter, iter) - for x in range(1, self.num_masks): - print " r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x) - print - - def produce_one_iteration_confirm(self, iter, confirmCautious): - setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter), - (8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ] - - setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter), - (4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter), - (8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter), - (12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ] - - print " if (P0(isnonzero128(r_%d))) {" % (iter) - print "#ifdef ARCH_64_BIT" - for (off, val, init) in setup64: - print " %s = %s;" % (val, init) - for (off, val, init) in setup64: - self.produce_confirm(iter, val, off, 64, cautious = confirmCautious) - print "#else" - for (off, val, init) in setup32: - print " %s 
= %s;" % (val, init) - for (off, val, init) in setup32: - self.produce_confirm(iter, val, off, 32, cautious = confirmCautious) - print "#endif" - print " }" - - def produce_one_iteration(self, iter, effective_num_iterations, cautious = False, - confirmCautious = True, save_old = True): - self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old) - self.produce_one_iteration_confirm(iter, confirmCautious) - - def produce_code(self): - print self.produce_header(visible = True, header_only = False) - print """ - const u8 * buf = a->buf; - const size_t len = a->len; - const u8 * ptr = buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t * control = &controlVal; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 * tryFloodDetect = a->firstFloodDetect; - const struct FDRConfirm *fdrc; - u32 last_match = (u32)-1; -""" - print - - self.produce_needed_temporaries(self.num_iterations) - print - - print " const struct Teddy * teddy = (const struct Teddy *)fdr;" - print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));" - print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks - print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);" - print " const size_t iterBytes = %d;" % (self.num_iterations * 16) - - print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \ - ' buf, len, a->start_offset);' - print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \ - ' mainStart);' - - for x in range(self.num_masks): - if (x != 0): - print " m128 res_old_%d = ones128();" % x - print " m128 lomask = set16x8(0xf);" - - print " if (ptr < mainStart) {" - print " ptr = mainStart - 16;" - self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True) - print " ptr += 16;" - print " }" - - print " if (ptr + 16 < buf + len) {" - self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, 
save_old = True) - print " ptr += 16;" - print " }" - - print """ - for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - if (P0(ptr > tryFloodDetect)) { - tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes); - if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { - *a->groups = controlVal; - return HWLM_TERMINATED; - } - } -""" - for iter in range(self.num_iterations): - self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False) - - print " }" - - print " for (; ptr < buf + len; ptr += 16) {" - self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True) - print " }" - - print """ - *a->groups = controlVal; - return HWLM_SUCCESS; -} -""" - - def produce_compile_call(self): - packed_str = { False : "false", True : "true"}[self.packed] - print " { %d, %s, %d, %d, %s, %d, %d }," % ( - self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str, - self.conf_pull_back, self.conf_top_level_split) - - def get_name(self): - if self.packed: - pck_string = "_pck" - else: - pck_string = "" - - if self.num_buckets == 16: - type_string = "_fat" - else: - type_string = "" - - return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string) - - def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8): - self.arch = arch - self.packed = packed - self.num_masks = num_masks - self.num_buckets = num_buckets - self.num_iterations = 2 - - if packed: - self.conf_top_level_split = 32 - else: - self.conf_top_level_split = 1 - self.conf_pull_back = 0 - -class MTFat(MT): - def produce_needed_temporaries(self, max_iterations): - print " m256 p_mask;" - for iter in range(0, max_iterations): - print " m256 val_%d;" % iter - print " m256 val_%d_lo;" % iter - print " m256 val_%d_hi;" % iter - for x in range(self.num_masks): - print " m256 res_%d_%d;" % (iter, x) - if x != 0: - 
print " m256 res_shifted_%d_%d;" % (iter, x) - print " m256 r_%d;" % iter - print "#ifdef ARCH_64_BIT" - print " u64a r_%d_part1;" % iter - print " u64a r_%d_part2;" % iter - print " u64a r_%d_part3;" % iter - print " u64a r_%d_part4;" % iter - print "#else" - print " u32 r_%d_part1;" % iter - print " u32 r_%d_part2;" % iter - print " u32 r_%d_part3;" % iter - print " u32 r_%d_part4;" % iter - print " u32 r_%d_part5;" % iter - print " u32 r_%d_part6;" % iter - print " u32 r_%d_part7;" % iter - print " u32 r_%d_part8;" % iter - print "#endif" - - def produce_code(self): - print self.produce_header(visible = True, header_only = False) - print """ - const u8 * buf = a->buf; - const size_t len = a->len; - const u8 * ptr = buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t * control = &controlVal; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 * tryFloodDetect = a->firstFloodDetect; - const struct FDRConfirm *fdrc; - u32 last_match = (u32)-1; -""" - print - - self.produce_needed_temporaries(self.num_iterations) - print - - print " const struct Teddy * teddy = (const struct Teddy *)fdr;" - print " const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));" - print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks - print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);" - print " const size_t iterBytes = %d;" % (self.num_iterations * 16) - - print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \ - ' buf, len, a->start_offset);' - print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \ - ' mainStart);' - - for x in range(self.num_masks): - if (x != 0): - print " m256 res_old_%d = ones256();" % x - print " m256 lomask = set32x8(0xf);" - - print " if (ptr < mainStart) {" - print " ptr = mainStart - 16;" - self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True) - print " ptr += 16;" - print " }" - - print " 
if (ptr + 16 < buf + len) {" - self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True) - print " ptr += 16;" - print " }" - - print """ - for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - if (P0(ptr > tryFloodDetect)) { - tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes); - if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { - *a->groups = controlVal; - return HWLM_TERMINATED; - } - } -""" - - for iter in range(self.num_iterations): - self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False) - - print " }" - - print " for (; ptr < buf + len; ptr += 16) {" - self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True) - print " }" - - print """ - *a->groups = controlVal; - return HWLM_SUCCESS; -} -""" - - def produce_one_iteration_state_calc(self, iter, effective_num_iterations, - cautious, save_old): - if cautious: - print " val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks) - else: - print " val_%d = load2x128(ptr + %d);" % (iter, iter*16) - print " val_%d_lo = and256(val_%d, lomask);" % (iter, iter) - print " val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter) - print " val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter) - print - for x in range(self.num_masks): - print Template(""" - res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2] , val_${ITER}_lo), - vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x) - if x != 0: - if iter == 0: - print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x) - else: - print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x) - if x != 0 and iter == effective_num_iterations - 1 and save_old: - print " res_old_%d = res_%d_%d;" % (x, iter, x) - print 
- if cautious: - print " r_%d = and256(res_%d_0, p_mask);" % (iter, iter) - else: - print " r_%d = res_%d_0;" % (iter, iter) - for x in range(1, self.num_masks): - print " r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x) - print - - def produce_one_iteration_confirm(self, iter, confirmCautious): - setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"), - (4, "r_%d_part2" % iter, "extract64from256(r, 1);\n r = interleave256hi(r_%d, r_swap)" % (iter)), - (8, "r_%d_part3" % iter, "extractlow64from256(r)"), - (12, "r_%d_part4" % iter, "extract64from256(r, 1)") ] - - setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"), - (2, "r_%d_part2" % iter, "extract32from256(r, 1)"), - (4, "r_%d_part3" % iter, "extract32from256(r, 2)"), - (6, "r_%d_part4" % iter, "extract32from256(r, 3);\n r = interleave256hi(r_%d, r_swap)" % (iter)), - (8, "r_%d_part5" % iter, "extractlow32from256(r)"), - (10, "r_%d_part6" % iter, "extract32from256(r, 1)"), - (12, "r_%d_part7" % iter, "extract32from256(r, 2)"), - (14, "r_%d_part8" % iter, "extract32from256(r, 3)") ] - - print " if (P0(isnonzero256(r_%d))) {" % (iter) - print " m256 r_swap = swap128in256(r_%d);" % (iter) - print " m256 r = interleave256lo(r_%d, r_swap);" % (iter) - print "#ifdef ARCH_64_BIT" - for (off, val, init) in setup64: - print " %s = %s;" % (val, init) - - for (off, val, init) in setup64: - self.produce_confirm(iter, val, off, 64, cautious = confirmCautious) - print "#else" - for (off, val, init) in setup32: - print " %s = %s;" % (val, init) - - for (off, val, init) in setup32: - self.produce_confirm(iter, val, off, 32, cautious = confirmCautious) - print "#endif" - print " }" - -class MTFast: - def produce_header(self, visible, header_only = False): - s = "" - if not visible: - s += "static never_inline" - s += """ -hwlm_error_t %s(UNUSED const struct FDR *fdr, - UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name() - if header_only: - s += ";" - else: - s += "{" - s += "\n" - 
return s - - def produce_guard(self): - print self.arch.get_guard() - - def produce_zero_alternative(self): - print """ -#else -#define %s 0 -#endif -""" % self.get_name() - - def close_guard(self): - print "#endif" - - def produce_confirm(self, cautious): - if cautious: - cautious_str = "VECTORING" - else: - cautious_str = "NOT_CAUTIOUS" - - print " for (u32 i = 0; i < arrCnt; i++) {" - print " u32 byte = bitArr[i] / 8;" - if self.packed: - conf_split_mask = IntegerType(32).constant_to_string( - self.conf_top_level_split - 1) - print " u32 bitRem = bitArr[i] % 8;" - print " u32 confSplit = *(ptr+byte) & 0x1f;" - print " u32 idx = confSplit * %d + bitRem;" % self.num_buckets - print " u32 cf = confBase[idx];" - print " if (!cf)" - print " continue;" - print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);" - print " if (!(fdrc->groups & *control))" - print " continue;" - print """ - CautionReason reason = %s; - CONF_TYPE v; - const u8 * confirm_loc = ptr + byte - 7; - if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { - v = lv_u64a(confirm_loc, buf, buf + len); - } else { // r == VECTORING, confirm_loc < buf - u64a histBytes = a->histBytes; - v = lv_u64a_ce(confirm_loc, buf, buf + len); - // stitch together v (which doesn't move) and history (which does) - u32 overhang = buf - confirm_loc; - histBytes >>= 64 - (overhang * 8); - v |= histBytes; - }""" % (cautious_str) - print " confWithBit(fdrc, a, ptr - buf + byte, 0, control, &last_match, v);" - else: - print " u32 cf = confBase[bitArr[i] % 8];" - print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);" - print """ - CautionReason reason = %s; - CONF_TYPE v; - const u8 * confirm_loc = ptr + byte - 7; - if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { - v = lv_u64a(confirm_loc, buf, buf + len); - } else { // r == VECTORING, confirm_loc < buf - u64a histBytes = a->histBytes; - v = lv_u64a_ce(confirm_loc, buf, buf + len); - // stitch together v (which doesn't move) 
and history (which does) - u32 overhang = buf - confirm_loc; - histBytes >>= 64 - (overhang * 8); - v |= histBytes; - }""" % (cautious_str) - print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);" - print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {" - print " *a->groups = controlVal;" - print " return HWLM_TERMINATED;" - print " }" - print " }" - - def produce_needed_temporaries(self, max_iterations): - print " u32 arrCnt;" - print " u16 bitArr[512];" - print " m256 p_mask;" - print " m256 val_0;" - print " m256 val_0_lo;" - print " m256 val_0_hi;" - print " m256 res_0;" - print " m256 res_1;" - print " m128 lo_part;" - print " m128 hi_part;" - print "#ifdef ARCH_64_BIT" - print " u64a r_0_part;" - print "#else" - print " u32 r_0_part;" - print "#endif" - - def produce_bit_scan(self, offset, bits): - print " while (P0(!!r_0_part)) {" - if bits == 64: - print " bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset) - else: - print " bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset) - print " }" - - def produce_bit_check_128(self, var_name, offset): - print " if (P0(isnonzero128(%s))) {" % (var_name) - print "#ifdef ARCH_64_BIT" - print " r_0_part = movq(%s);" % (var_name) - self.produce_bit_scan(offset, 64) - print " r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name) - self.produce_bit_scan(offset + 1, 64) - print "#else" - print " r_0_part = movd(%s);" % (var_name) - self.produce_bit_scan(offset * 2, 32) - for step in range(1, 4): - print " r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4) - self.produce_bit_scan(offset * 2 + step, 32) - print "#endif" - print " }" - - def produce_bit_check_256(self, iter, single_iter, cautious): - print " if (P0(isnonzero256(res_%d))) {" % (iter) - if single_iter: - print " arrCnt = 0;" - print " lo_part = cast256to128(res_%d);" % (iter) - print " hi_part = cast256to128(swap128in256(res_%d));" % (iter) - 
self.produce_bit_check_128("lo_part", iter * 4) - self.produce_bit_check_128("hi_part", iter * 4 + 2) - if single_iter: - self.produce_confirm(cautious) - print " }" - - def produce_one_iteration_state_calc(self, iter, cautious): - if cautious: - print " val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32) - else: - print " val_0 = load256(ptr + %d);" % (iter * 32) - print " val_0_lo = and256(val_0, lomask);" - print " val_0_hi = rshift4x64(val_0, 4);" - print " val_0_hi = and256(val_0_hi, lomask);" - print " res_%d = and256(vpshufb(maskLo , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter) - if cautious: - print " res_%d = and256(res_%d, p_mask);" % (iter, iter) - - def produce_code(self): - print self.produce_header(visible = True, header_only = False) - print """ - const u8 * buf = a->buf; - const size_t len = a->len; - const u8 * ptr = buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t * control = &controlVal; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 * tryFloodDetect = a->firstFloodDetect; - const struct FDRConfirm *fdrc; - u32 last_match = (u32)-1; -""" - print - - self.produce_needed_temporaries(self.num_iterations) - - print " const struct Teddy * teddy = (const struct Teddy *)fdr;" - print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));" - print " const m256 maskLo = set2x128(maskBase[0]);" - print " const m256 maskHi = set2x128(maskBase[1]);" - print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);" - print " const u8 * mainStart = ROUNDUP_PTR(ptr, 32);" - print " const size_t iterBytes = %d;" % (self.num_iterations * 32) - - print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \ - ' buf, len, a->start_offset);' - print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \ - ' mainStart);' - print " const m256 lomask = set32x8(0xf);" - - print " if (ptr < 
mainStart) {" - print " ptr = mainStart - 32;" - self.produce_one_iteration_state_calc(iter = 0, cautious = True) - self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True) - print " ptr += 32;" - print " }" - - print " if (ptr + 32 < buf + len) {" - self.produce_one_iteration_state_calc(iter = 0, cautious = False) - self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True) - print " ptr += 32;" - print " }" - print """ - for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - if (P0(ptr > tryFloodDetect)) { - tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes); - if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { - *a->groups = controlVal; - return HWLM_TERMINATED; - } - } -""" - - for iter in range (0, self.num_iterations): - self.produce_one_iteration_state_calc(iter = iter, cautious = False) - print " arrCnt = 0;" - for iter in range (0, self.num_iterations): - self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False) - self.produce_confirm(cautious = False) - print " }" - - print " for (; ptr < buf + len; ptr += 32) {" - self.produce_one_iteration_state_calc(iter = 0, cautious = True) - self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True) - print " }" - - print """ - *a->groups = controlVal; - return HWLM_SUCCESS; -} -""" - - def get_name(self): - if self.packed: - pck_string = "_pck" - else: - pck_string = "" - return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, pck_string) - - def produce_compile_call(self): - packed_str = { False : "false", True : "true"}[self.packed] - print " { %d, %s, %d, %d, %s, %d, %d }," % ( - self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str, - self.conf_pull_back, self.conf_top_level_split) - - def __init__(self, arch, packed = False): - self.arch = arch - self.packed = packed - self.num_masks = 1 - self.num_buckets = 8 - 
self.num_iterations = 2 - - self.conf_top_level_split = 1 - self.conf_pull_back = 0 - if packed: - self.conf_top_level_split = 32 - else: - self.conf_top_level_split = 1 - self.conf_pull_back = 0 diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c new file mode 100644 index 00000000..33dd8a30 --- /dev/null +++ b/src/fdr/teddy_avx2.c @@ -0,0 +1,1110 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: AVX2 engine runtime. 
+ */ + +#include "fdr_internal.h" +#include "flood_runtime.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "teddy_runtime_common.h" +#include "util/simd_utils.h" +#include "util/simd_utils_ssse3.h" + +#if defined(__AVX2__) + +static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 
0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} +}; + +#ifdef ARCH_64_BIT +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(isnonzero256(var))) { \ + m256 swap = swap128in256(var); \ + m256 r = interleave256lo(var, swap); \ + u64a part1 = extractlow64from256(r); \ + u64a part2 = extract64from256(r, 1); \ + r = interleave256hi(var, swap); \ + u64a part3 = extractlow64from256(r); \ + u64a part4 = extract64from256(r, 1); \ + if (unlikely(part1)) { \ + conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part2)) { \ + conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part3)) { \ + conf_fn(&part3, bucket, offset + 8, confBase, reason, 
a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part4)) { \ + conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + } \ +} while (0); +#else +/* + * 32-bit variant of CONFIRM_FAT_TEDDY: the 256-bit candidate vector "var" is + * split into eight u32 parts and conf_fn is called for every non-zero part, + * with the offset argument advancing by 2 per part. Each conf_fn call is + * followed by CHECK_HWLM_TERMINATE_MATCHING; the check after part2 used to be + * missing, unlike every other part here and in the 64-bit variant. Expands in + * a scope that provides confBase, a, ptr, control and last_match. + */ +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(isnonzero256(var))) { \ + m256 swap = swap128in256(var); \ + m256 r = interleave256lo(var, swap); \ + u32 part1 = extractlow32from256(r); \ + u32 part2 = extract32from256(r, 1); \ + u32 part3 = extract32from256(r, 2); \ + u32 part4 = extract32from256(r, 3); \ + r = interleave256hi(var, swap); \ + u32 part5 = extractlow32from256(r); \ + u32 part6 = extract32from256(r, 1); \ + u32 part7 = extract32from256(r, 2); \ + u32 part8 = extract32from256(r, 3); \ + if (unlikely(part1)) { \ + conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part2)) { \ + conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part3)) { \ + conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part4)) { \ + conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part5)) { \ + conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part6)) { \ + conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part7)) { \ + conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \ + control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + if (unlikely(part8)) { \ + conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \ + 
control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + } \ +} while (0); +#endif +/* + * If "var" is non-zero, records the set bits of its low and high 128-bit + * halves in bitArr through bit_array_fast_teddy (the high half is passed + * offset + 2) and calls conf_fn once per recorded entry, with + * CHECK_HWLM_TERMINATE_MATCHING after each call. Expands in a scope that + * provides bitArr, confBase, a, ptr, control and last_match. + */ +#define CONFIRM_FAST_TEDDY(var, offset, reason, conf_fn) \ +do { \ + if (unlikely(isnonzero256(var))) { \ + u32 arrCnt = 0; \ + m128 lo = cast256to128(var); \ + m128 hi = cast256to128(swap128in256(var)); \ + bit_array_fast_teddy(lo, bitArr, &arrCnt, offset); \ + bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2); \ + for (u32 i = 0; i < arrCnt; i++) { \ + conf_fn(bitArr[i], confBase, reason, a, ptr, control, \ + &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ + } \ +} while (0); +/* + * Runs vectoredLoad128 and returns set2x128() of the loaded value; *p_mask + * receives set2x128() of the 128-bit mask produced by that load. + * NOTE(review): set2x128 presumably copies the m128 into both halves - confirm. + */ +static really_inline +m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + m128 p_mask128; + m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, + len_history, nMasks)); + *p_mask = set2x128(p_mask128); + return ret; +} + +/* + * \brief Copy a block of [0,31] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad256. + */ +static really_inline +void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. 
*/ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* Perform copy with two overlapping 8-byte chunks. */ + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + default: + /* Perform copy with two overlapping 16-byte chunks. */ + assert(len < 32); + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + } +} +/* + * Loads up to 32 bytes for position ptr and writes a matching mask to *p_mask. + * With ptr >= lo and at least 32 bytes before hi this is a plain loadu256; + * otherwise only the bytes that lie between lo and hi are copied into a local + * buffer with copyRuntBlock256 and the mask is selected from p_mask_arr256. + * Bytes of the local buffer that are not copied are left unwritten. + */ +static really_inline +m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history) { + union { + u8 val8[32]; + m256 val256; + } u; + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 32) { + *p_mask = load256(p_mask_arr256[32] + 32); + return loadu256(ptr); + } + *p_mask = load256(p_mask_arr256[avail] + 32); + copy_start = 0; + copy_len = avail; + } else { + // ptr lies before lo: the first (lo - ptr) bytes are outside the buffer. + // NOTE(review): the loop below starts at i == start, where ptr + i == lo, + // so as written it copies no history bytes; confirm this is intended. + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start; ptr + i < lo; i++) { + u.val8[i] = buf_history[len_history - (lo - (ptr + i))]; + } + uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); + *p_mask = loadu256(p_mask_arr256[end - start] + 32 - start); + copy_start = i; + copy_len = end - i; + } + + // Runt block from the buffer. 
+ copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val256; +} +/* + * Confirms one fast-teddy candidate. "bits" is decoded as byte offset + * (bits / 8) and confBase index (bits % 8); the FDRConfirm found at that + * confBase entry is passed to confWithBit1 together with getConfVal's value. + */ +static really_inline +void do_confWithBit1_fast_teddy(u16 bits, const u32 *confBase, + CautionReason reason, + const struct FDR_Runtime_Args *a, + const u8 *ptr, hwlmcb_rv_t *control, + u32 *last_match) { + u32 byte = bits / 8; + u32 cf = confBase[bits % 8]; + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + u64a confVal = getConfVal(a, ptr, byte, reason); + confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match, confVal); +} +/* + * Variant that indexes confBase with (low 5 bits of the candidate byte) * 8 + * + (bits % 8). Returns without confirming when that entry is 0 or when none + * of fdrc->groups is set in *control; otherwise calls confWithBit. + */ +static really_inline +void do_confWithBit_fast_teddy(u16 bits, const u32 *confBase, + CautionReason reason, + const struct FDR_Runtime_Args *a, const u8 *ptr, + hwlmcb_rv_t *control, u32 *last_match) { + u32 byte = bits / 8; + u32 bitRem = bits % 8; + u32 confSplit = *(ptr+byte) & 0x1f; + u32 idx = confSplit * 8 + bitRem; + u32 cf = confBase[idx]; + if (!cf) { + return; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + return; + } + u64a confVal = getConfVal(a, ptr, byte, reason); + confWithBit(fdrc, a, ptr - a->buf + byte, 0, control, last_match, confVal); +} +/* + * Appends one u16 to bitArr (advancing *arrCnt) for every set bit of var: the + * value returned by TEDDY_FIND_AND_CLEAR_LSB for the 64-bit part (32-bit part + * on non-ARCH_64_BIT builds) plus that part's base, which is 64 * offset for + * the lowest part. NOTE(review): TEDDY_FIND_AND_CLEAR_LSB presumably yields + * the index of the lowest set bit and clears it - confirm at its definition. + */ +static really_inline +void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { + if (unlikely(isnonzero128(var))) { +#ifdef ARCH_64_BIT + u64a part_0 = movq(var); + while (unlikely(part_0)) { + bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) + + 64 * (offset); + *arrCnt += 1; + } + u64a part_1 = movq(byteShiftRight128(var, 8)); + while (unlikely(part_1)) { + bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + + 64 * (offset + 1); + *arrCnt += 1; + } +#else + u32 part_0 = movd(var); + while (unlikely(part_0)) { + bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) + + 32 * (offset * 2); + *arrCnt += 1; + } + u32 part_1 = movd(byteShiftRight128(var, 4)); + while (unlikely(part_1)) { + bitArr[*arrCnt] = (u16) 
TEDDY_FIND_AND_CLEAR_LSB(&part_1) + + 32 * (offset * 2 + 1); + *arrCnt += 1; + } + u32 part_2 = movd(byteShiftRight128(var, 8)); + while (unlikely(part_2)) { + bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) + + 32 * (offset * 2 + 2); + *arrCnt += 1; + } + u32 part_3 = movd(byteShiftRight128(var, 12)); + while (unlikely(part_3)) { + bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) + + 32 * (offset * 2 + 3); + *arrCnt += 1; + } +#endif + } +} +/* + * One-mask step: splits every byte of val into its low and high nibble, looks + * the nibbles up with vpshufb in maskBase[0] and maskBase[1], ANDs the two + * lookups and finally ANDs the result with p_mask. + */ +static really_inline +m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift4x64(val, 4), mask); + return and256(and256(vpshufb(maskBase[0*2], lo), + vpshufb(maskBase[0*2+1], hi)), p_mask); +} +/* + * Two-mask step: the m1 result ANDed with the maskBase[2] / maskBase[3] lookup + * combined with the previous call's lookup through vpalignr(res_1, *old_1, + * 16-1). *old_1 is overwritten with the current lookup. p_mask is applied to + * r a second time here although m1 already applied it. + */ +static really_inline +m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask, + m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift4x64(val, 4), mask); + m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val); + + m256 res_1 = and256(vpshufb(maskBase[1*2], lo), + vpshufb(maskBase[1*2+1], hi)); + m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1); + *old_1 = res_1; + return and256(and256(r, p_mask), res_shifted_1); +} +/* + * Three-mask step: the m2 result ANDed with the maskBase[4] / maskBase[5] + * lookup combined with the previous call's lookup through vpalignr(res_2, + * *old_2, 16-2). *old_1 and *old_2 are both updated. + */ +static really_inline +m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, + m256 p_mask, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift4x64(val, 4), mask); + m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val); + + m256 res_2 = and256(vpshufb(maskBase[2*2], lo), + vpshufb(maskBase[2*2+1], hi)); + m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2); + *old_2 = res_2; + return and256(r, res_shifted_2); +} + +static really_inline +m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, + m256 *old_3, m256 p_mask, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift4x64(val, 4), mask); + 
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val); + + m256 res_3 = and256(vpshufb(maskBase[3*2], lo), + vpshufb(maskBase[3*2+1], hi)); + m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3); + *old_3 = res_3; + return and256(r, res_shifted_3); +} +/* + * Fast-teddy one-mask step: nibble-splits val with the caller-supplied mask, + * looks the nibbles up with vpshufb in maskLo / maskHi, ANDs the two lookups + * and then ANDs the result with p_mask. + */ +static really_inline +m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi, + m256 p_mask) { + m256 lo = and256(val, mask); + m256 hi = and256(rshift4x64(val, 4), mask); + m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); + return and256(res, p_mask); +} +/* + * Returns the address immediately after the struct Teddy header, typed as an + * array of m256 masks. + */ +static really_inline +const m256 * getMaskBase_avx2(const struct Teddy *teddy) { + return (const m256 *)((const u8 *)teddy + sizeof(struct Teddy)); +} +/* + * Returns the address that follows the struct Teddy header by + * numMask * 32 * 2 bytes, typed as an array of u32 confirm offsets. + */ +static really_inline +const u32 * getConfBase_avx2(const struct Teddy *teddy, u8 numMask) { + return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + + (numMask*32*2)); +} + +hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 1); + + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); + ptr += 16; + } + + 
if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy); + m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), + load2x128(ptr + 16)); + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 1); + + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, 
VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(), + load2x128(ptr + 16)); + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 1); + m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 2); + + m256 res_old_1 = ones256(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m256 r_0 = 
prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); + m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + load2x128(ptr + 16)); + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 2); + + m256 res_old_1 = ones256(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < 
mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(), + load2x128(ptr + 16)); + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 2); + m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 3); + + m256 
res_old_1 = ones256(); + m256 res_old_2 = ones256(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); + m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones256(), load2x128(ptr + 16)); + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const 
struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 3); + + m256 res_old_1 = ones256(); + m256 res_old_2 = ones256(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones256(), load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + ones256(), load2x128(ptr + 16)); + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 3); + m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2, + p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + 
hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 4); + + m256 res_old_1 = ones256(); + m256 res_old_2 = ones256(); + m256 res_old_3 = ones256(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 4); + m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy); + m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones256(), + load2x128(ptr + 16)); + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy); + } + + for (; ptr < buf_end; ptr += 16) { + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 4); + m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, 
+ &res_old_3, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy); + } + *a->groups = controlVal; + return HWLM_SUCCESS; +} + +hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + const u8 *buf_end = a->buf + a->len; + const u8 *ptr = a->buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 *tryFloodDetect = a->firstFloodDetect; + u32 last_match = (u32)-1; + const struct Teddy *teddy = (const struct Teddy *)fdr; + const size_t iterBytes = 32; + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", + a->buf, a->len, a->start_offset); + + const m256 *maskBase = getMaskBase_avx2(teddy); + const u32 *confBase = getConfBase_avx2(teddy, 4); + + m256 res_old_1 = ones256(); + m256 res_old_2 = ones256(); + m256 res_old_3 = ones256(); + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); + if (ptr < mainStart) { + ptr = mainStart - 16; + m256 p_mask; + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end, + a->buf_history, a->len_history, 4); + m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, p_mask, val_0); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + if (ptr + 16 < buf_end) { + m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy); + ptr += 16; + } + + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + CHECK_FLOOD; + m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + &res_old_3, ones256(), + load2x128(ptr)); + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy); + m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2, + 
&res_old_3, ones256(),
+                                          load2x128(ptr + 16));
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+    }
+
+    for (; ptr < buf_end; ptr += 16) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+                                       a->buf_history, a->len_history, 4);
+        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+                                          &res_old_3, p_mask, val_0);
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+    }
+    *a->groups = controlVal;
+    return HWLM_SUCCESS;
+}
+
+/**
+ * AVX2 Teddy scan using one mask (getConfBase(teddy, 1)); candidates are
+ * confirmed through do_confWithBit1_fast_teddy.
+ *
+ * The buffer is walked from a->buf + a->start_offset to a->buf + a->len in
+ * four phases:
+ *  1. if the start is below mainStart = ROUNDUP_PTR(ptr, 32), one block at
+ *     mainStart - 32 loaded through vectoredLoad256, which also produces
+ *     p_mask;
+ *  2. one plain 32-byte block, taken only when more than 32 bytes remain;
+ *  3. the main loop, iterBytes (64) bytes per iteration as two 32-byte
+ *     blocks, with a prefetch and CHECK_FLOOD at the top of each iteration;
+ *  4. whatever is left, in 32-byte steps, again through vectoredLoad256.
+ *
+ * The locals control, last_match, confBase and bitArr are not referenced
+ * directly in this body; presumably CONFIRM_FAST_TEDDY (defined outside this
+ * hunk) picks them up by name -- TODO confirm.
+ *
+ * \param fdr engine bytecode, reinterpreted as a struct Teddy.
+ * \param a runtime arguments; buf, len, start_offset, buf_history,
+ *          len_history, firstFloodDetect and groups are read here.
+ * \return HWLM_SUCCESS after the whole range has been processed and the
+ *         control value has been stored back to *a->groups. CHECK_FLOOD can
+ *         leave the function early with HWLM_TERMINATED.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
+                                            const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    /* Bytes consumed per main-loop iteration; also read by CHECK_FLOOD. */
+    const size_t iterBytes = 64;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 1);
+
+    /* Each 128-bit mask is duplicated into both halves of a 256-bit value. */
+    const m256 maskLo = set2x128(maskBase[0]);
+    const m256 maskHi = set2x128(maskBase[1]);
+    const m256 mask = set32x8(0xf);
+    u16 bitArr[512];
+
+    /* Phase 1: start below mainStart, so process the block at mainStart - 32. */
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 32;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+        ptr += 32;
+    }
+
+    /* Phase 2: a single full block, confirmed with the VECTORING reason. */
+    if (ptr + 32 < buf_end) {
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+        ptr += 32;
+    }
+
+    /* Phase 3: main loop over whole 64-byte strides. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        /* Prefetch four strides (256 bytes) ahead of the current position. */
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
+
+        m256 val_1 = load256(ptr + 32);
+        m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
+    }
+
+    /* Phase 4: remaining bytes, bounded by buf_end via vectoredLoad256. */
+    for (; ptr < buf_end; ptr += 32) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+    }
+    *a->groups = controlVal;
+    return HWLM_SUCCESS;
+}
+
+/**
+ * AVX2 Teddy scan using one mask; identical in control flow to
+ * fdr_exec_teddy_avx2_msks1_fast, except that candidates are confirmed
+ * through do_confWithBit_fast_teddy instead of do_confWithBit1_fast_teddy.
+ * ("pck" presumably stands for the packed engine variants -- TODO confirm.)
+ *
+ * Phases: an optional block at mainStart - 32 via vectoredLoad256, one plain
+ * 32-byte block when more than 32 bytes remain, the 64-byte main loop with
+ * prefetch and CHECK_FLOOD, then the remainder in 32-byte steps via
+ * vectoredLoad256.
+ *
+ * \param fdr engine bytecode, reinterpreted as a struct Teddy.
+ * \param a runtime arguments; buf, len, start_offset, buf_history,
+ *          len_history, firstFloodDetect and groups are read here.
+ * \return HWLM_SUCCESS after the whole range has been processed and the
+ *         control value has been stored back to *a->groups. CHECK_FLOOD can
+ *         leave the function early with HWLM_TERMINATED.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
+                                                const struct FDR_Runtime_Args *a) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    hwlmcb_rv_t controlVal = *a->groups;
+    hwlmcb_rv_t *control = &controlVal;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = (u32)-1;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    /* Bytes consumed per main-loop iteration; also read by CHECK_FLOOD. */
+    const size_t iterBytes = 64;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy, 1);
+
+    /* Each 128-bit mask is duplicated into both halves of a 256-bit value. */
+    const m256 maskLo = set2x128(maskBase[0]);
+    const m256 maskHi = set2x128(maskBase[1]);
+    const m256 mask = set32x8(0xf);
+    u16 bitArr[512];
+
+    /* Phase 1: start below mainStart, so process the block at mainStart - 32. */
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 32;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+        ptr += 32;
+    }
+
+    /* Phase 2: a single full block, confirmed with the VECTORING reason. */
+    if (ptr + 32 < buf_end) {
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+        ptr += 32;
+    }
+
+    /* Phase 3: main loop over whole 64-byte strides. */
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        /* Prefetch four strides (256 bytes) ahead of the current position. */
+        __builtin_prefetch(ptr + (iterBytes*4));
+        CHECK_FLOOD;
+
+        m256 val_0 = load256(ptr + 0);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
+
+        m256 val_1 = load256(ptr + 32);
+        m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi,
+                                             ones256());
+        CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
+    }
+
+    /* Phase 4: remaining bytes, bounded by buf_end via vectoredLoad256. */
+    for (; ptr < buf_end; ptr += 32) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+                                     buf_end, a->buf_history, a->len_history);
+        m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+                                             p_mask);
+        CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+    }
+    *a->groups = controlVal;
+    return HWLM_SUCCESS;
+}
+
+#endif // __AVX2__
diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp
index ead448a8..d95f4937 100644
--- a/src/fdr/teddy_engine_description.cpp
+++ b/src/fdr/teddy_engine_description.cpp
@@ -64,7 +64,32 @@ bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const
     return false;
 }
 
-#include "teddy_autogen_compiler.cpp"
+/**
+ * Fill \a out with the built-in Teddy engine definitions.
+ *
+ * Any previous contents of \a out are discarded; one element is then
+ * constructed in place from each TeddyEngineDef row, in table order.
+ * Rows 1-10 carry HS_CPU_FEATURES_AVX2 in their second field, rows 11-18
+ * carry 0 there. The meaning of the remaining columns comes from
+ * TeddyEngineDef, which is declared outside this hunk.
+ */
+void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
+    static const TeddyEngineDef defns[] = {
+        { 1, 0 | HS_CPU_FEATURES_AVX2, 1, 8, false, 0, 1 },
+        { 2, 0 | HS_CPU_FEATURES_AVX2, 1, 8, true, 0, 32 },
+        { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false, 0, 1 },
+        { 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true, 0, 32 },
+        { 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false, 0, 1 },
+        { 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true, 0, 32 },
+        { 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false, 0, 1 },
+        { 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true, 0, 32 },
+        { 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false, 0, 1 },
+        { 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true, 0, 32 },
+        { 11, 0, 1, 8, false, 0, 1 },
+        { 12, 0, 1, 8, true, 0, 32 },
+        { 13, 0, 2, 8, false, 0, 1 },
+        { 14, 0, 2, 8, true, 0, 32 },
+        { 15, 0, 3, 8, false, 0, 1 },
+        { 16, 0, 3, 8, true, 0, 32 },
+        { 17, 0, 4, 8, false, 0, 1 },
+        { 18, 0, 4, 8, true, 0, 32 },
+    };
+    out->clear();
+    for (const auto &def : defns) {
+        out->emplace_back(def);
+    }
+}
 
 static
 size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {
diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h
new file mode 100644
index 00000000..c50b4d16
--- /dev/null
+++ b/src/fdr/teddy_runtime_common.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: common runtime procedures.
+ */
+
+#ifndef TEDDY_RUNTIME_COMMON_H_
+#define TEDDY_RUNTIME_COMMON_H_
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/simd_utils.h"
+
+/* Mask table with 17 rows of 32 bytes, defined in another translation unit. */
+extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+
+/*
+ * Word type handed to the confirm helpers and the matching findAndClearLSB_*
+ * primitive: the 64-bit pair when ARCH_64_BIT is defined, the 32-bit pair
+ * otherwise.
+ */
+#ifdef ARCH_64_BIT
+#define TEDDY_CONF_TYPE u64a
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
+#else
+#define TEDDY_CONF_TYPE u32
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
+#endif
+
+/*
+ * Leave the enclosing function with HWLM_TERMINATED once controlVal equals
+ * HWLM_TERMINATE_MATCHING, storing controlVal back to *a->groups first.
+ *
+ * The macro expands in place, so `controlVal` and `a` must be in scope at the
+ * use site. The expansion is one statement with no trailing semicolon, so it
+ * is written as `CHECK_HWLM_TERMINATE_MATCHING;` and stays safe inside an
+ * unbraced if/else.
+ */
+#define CHECK_HWLM_TERMINATE_MATCHING                                       \
+do {                                                                        \
+    if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) {                  \
+        *a->groups = controlVal;                                            \
+        return HWLM_TERMINATED;                                             \
+    }                                                                       \
+} while (0)
+
+/*
+ * Run floodDetect() once ptr has moved past tryFloodDetect, store its result
+ * as the new tryFloodDetect, then apply CHECK_HWLM_TERMINATE_MATCHING (which
+ * may return from the enclosing function).
+ *
+ * Names expected in the enclosing scope: fdr, a, ptr, tryFloodDetect,
+ * floodBackoff, controlVal and iterBytes. ptr, floodBackoff and controlVal
+ * are passed by address and may be updated by floodDetect(). The expansion
+ * has no trailing semicolon; write `CHECK_FLOOD;`.
+ */
+#define CHECK_FLOOD                                                         \
+do {                                                                        \
+    if (unlikely(ptr > tryFloodDetect)) {                                   \
+        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
+                                     &floodBackoff, &controlVal,            \
+                                     iterBytes);                            \
+        CHECK_HWLM_TERMINATE_MATCHING;                                      \
+    }                                                                       \
+} while (0)
+
+/**
+ * \brief Copy a block of [0,15] bytes efficiently.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad128.
+ */
+static really_inline
+void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
+    switch (len) {
+    case 0:
+        break;
+    case 1:
+        *dst = *src;
+        break;
+    case 2:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        break;
+    case 3:
+        /* Two bytes as one 16-bit move, then the third byte on its own. */
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        dst[2] = src[2];
+        break;
+    case 4:
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 5:
+    case 6:
+    case 7:
+        /* Perform copy with two overlapping 4-byte chunks. */
+        /* The first store covers the last four bytes, the second the first
+         * four; for len < 8 the two ranges overlap in the middle. */
+        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 8:
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    default:
+        /* Perform copy with two overlapping 8-byte chunks. */
+        /* Reached for len >= 9; the assert restricts it to 9..15. */
+        assert(len < 16);
+        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    }
+}
+
+/*
+ * Build the 16-byte block that starts at ptr, where ptr may lie before lo and
+ * fewer than 16 bytes may remain before hi. Lanes that are filled neither
+ * from the buffer nor from history stay zero.
+ *
+ *  - ptr >= lo with at least 16 bytes before hi: plain unaligned 16-byte
+ *    load.
+ *  - ptr >= lo with fewer than 16 bytes before hi: only the hi - ptr
+ *    available bytes are copied.
+ *  - ptr < lo: the lanes directly preceding lo are taken from the end of
+ *    buf_history, at most MIN(lo - ptr, len_history, nMasks - 1) of them;
+ *    the bytes from lo up to MIN(ptr + 16, hi) are copied from the buffer.
+ *
+ * *p_mask is loaded from a 16-byte window of p_mask_arr: the row is selected
+ * by the number of buffer bytes placed in the block and, when ptr < lo, the
+ * window is moved back by lo - ptr bytes. The contents of p_mask_arr are
+ * defined elsewhere.
+ */
+// Note: p_mask is an output param that initialises a poison mask.
+static really_inline
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    /* Byte-addressable view of the result vector, zero-initialised. */
+    union {
+        u8 val8[16];
+        m128 val128;
+    } u;
+    u.val128 = zeroes128();
+
+    uintptr_t copy_start;
+    uintptr_t copy_len;
+
+    if (ptr >= lo) {
+        uintptr_t avail = (uintptr_t)(hi - ptr);
+        if (avail >= 16) {
+            /* Whole block lies inside [lo, hi): no staging copy needed. */
+            *p_mask = load128(p_mask_arr[16] + 16);
+            return loadu128(ptr);
+        }
+        *p_mask = load128(p_mask_arr[avail] + 16);
+        copy_start = 0;
+        copy_len = avail;
+    } else {
+        uintptr_t need = MIN((uintptr_t)(lo - ptr),
+                             MIN(len_history, nMasks - 1));
+        uintptr_t start = (uintptr_t)(lo - ptr);
+        uintptr_t i;
+        /* Fill lanes [start - need, start) from the tail of the history
+         * buffer; lane i sits (lo - (ptr + i)) bytes before lo. */
+        for (i = start - need; ptr + i < lo; i++) {
+            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        }
+        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
+        *p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
+        /* The loop leaves i == start, i.e. the lane that corresponds to lo. */
+        copy_start = i;
+        copy_len = end - i;
+    }
+
+    // Runt block from the buffer.
+    copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+    return u.val128;
+}
+
+/*
+ * Fetch the u64a confirm value for a candidate at ptr + byte; the load starts
+ * at confirm_loc = ptr + byte - 7.
+ *
+ * lv_u64a is used when reason is NOT_CAUTIOUS or confirm_loc is not before
+ * a->buf. Otherwise lv_u64a_ce is used and the result is ORed with
+ * a->histBytes shifted right by 64 - 8 * overhang bits, where
+ * overhang = a->buf - confirm_loc. The exact semantics of lv_u64a and
+ * lv_u64a_ce are defined elsewhere.
+ */
+static really_inline
+u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
+                CautionReason reason) {
+    u64a confVal = 0;
+    const u8 *buf = a->buf;
+    size_t len = a->len;
+    const u8 *confirm_loc = ptr + byte - 7;
+    if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
+        confVal = lv_u64a(confirm_loc, buf, buf + len);
+    } else { // reason != NOT_CAUTIOUS and confirm_loc < buf
+        u64a histBytes = a->histBytes;
+        confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
+        // stitch together confVal and history
+        // overhang is at least 1 in this branch, so the shift stays below 64
+        u32 overhang = buf - confirm_loc;
+        histBytes >>= 64 - (overhang * 8);
+        confVal |= histBytes;
+    }
+    return confVal;
+}
+
+/*
+ * Confirm every candidate bit set in *conf, lowest bit first; each bit is
+ * cleared from *conf as it is taken.
+ *
+ * For a bit index `bit`: byte = bit / bucket + offset is the position relative
+ * to ptr and bit % bucket is the bucket. The low five bits of the input byte
+ * at ptr + byte select a group of `bucket` entries in confBase. A zero entry,
+ * or an FDRConfirm whose groups do not intersect *control, is skipped;
+ * otherwise confWithBit() is called with the value from getConfVal().
+ *
+ * Note the do/while: the first bit is extracted before *conf is tested, so
+ * this presumably relies on callers only invoking it with a non-zero *conf.
+ */
+static really_inline
+void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                          const u32 *confBase, CautionReason reason,
+                          const struct FDR_Runtime_Args *a, const u8 *ptr,
+                          hwlmcb_rv_t *control, u32 *last_match) {
+    do {
+        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
+        u32 byte = bit / bucket + offset;
+        u32 bitRem = bit % bucket;
+        u32 confSplit = *(ptr+byte) & 0x1f;
+        u32 idx = confSplit * bucket + bitRem;
+        /* cf is a byte offset from confBase to the FDRConfirm structure. */
+        u32 cf = confBase[idx];
+        if (!cf) {
+            continue;
+        }
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) {
+            continue;
+        }
+        u64a confVal = getConfVal(a, ptr, byte, reason);
+        confWithBit(fdrc, a, ptr - a->buf + byte, 0, control,
+                    last_match, confVal);
+    } while (unlikely(*conf));
+}
+
+/*
+ * Variant of do_confWithBit_teddy that indexes confBase by bucket only
+ * (idx = bit % bucket), does not test the table entry for zero, and confirms
+ * through confWithBit1().
+ *
+ * As above, the first bit is extracted before *conf is tested.
+ */
+static really_inline
+void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                           const u32 *confBase, CautionReason reason,
+                           const struct FDR_Runtime_Args *a, const u8 *ptr,
+                           hwlmcb_rv_t *control, u32 *last_match) {
+    do {
+        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
+        u32 byte = bit / bucket + offset;
+        u32 idx = bit % bucket;
+        /* cf is a byte offset from confBase to the FDRConfirm structure. */
+        u32 cf = confBase[idx];
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) {
+            continue;
+        }
+        u64a confVal = getConfVal(a, ptr, byte, reason);
+        confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
+                     confVal);
+    } while (unlikely(*conf));
+}
+
+/*
+ * Variant of do_confWithBit_teddy that indexes confBase by bucket only
+ * (idx = bit % bucket), does not test the table entry for zero, and confirms
+ * through confWithBitMany(), to which `reason` is also forwarded.
+ *
+ * As above, the first bit is extracted before *conf is tested.
+ */
+static really_inline
+void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                              const u32 *confBase, CautionReason reason,
+                              const struct FDR_Runtime_Args *a, const u8 *ptr,
+                              hwlmcb_rv_t *control, u32 *last_match) {
+    do {
+        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
+        u32 byte = bit / bucket + offset;
+        u32 idx = bit % bucket;
+        /* cf is a byte offset from confBase to the FDRConfirm structure. */
+        u32 cf = confBase[idx];
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) {
+            continue;
+        }
+        u64a confVal = getConfVal(a, ptr, byte, reason);
+        confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
+                        last_match, confVal);
+    } while (unlikely(*conf));
+}
+
+/* Address of the mask data, which begins immediately after struct Teddy. */
+static really_inline
+const m128 * getMaskBase(const struct Teddy *teddy) {
+    return (const m128 *)((const u8 *)teddy + sizeof(struct Teddy));
+}
+
+/*
+ * Address of the confirm offset table: numMask * 32 bytes past the end of
+ * struct Teddy (presumably two m128 values per mask -- TODO confirm).
+ */
+static really_inline
+const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) {
+    return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) +
+                         (numMask*32));
+}
+
+#endif /* TEDDY_RUNTIME_COMMON_H_ */