mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
teddy: remove python codegen, refactor code
Major cleanup of the Teddy runtime code. Removes python code generation, splits AVX2 models into their own file, improves readability.
This commit is contained in:
parent
6899cab370
commit
ed772380c0
4
.gitignore
vendored
4
.gitignore
vendored
@ -46,10 +46,6 @@ sqlite3
|
||||
src/config.h
|
||||
src/config.h.in
|
||||
src/hs_version.h
|
||||
src/fdr/fdr_autogen.c
|
||||
src/fdr/fdr_autogen_compiler.cpp
|
||||
src/fdr/teddy_autogen.c
|
||||
src/fdr/teddy_autogen_compiler.cpp
|
||||
src/parser/Parser.cpp
|
||||
|
||||
# Generated PCRE files
|
||||
|
@ -357,11 +357,6 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
|
||||
|
||||
|
||||
# include the autogen targets
|
||||
add_subdirectory(src/fdr)
|
||||
|
||||
include_directories(${PROJECT_BINARY_DIR}/src/fdr)
|
||||
|
||||
if(NOT WIN32)
|
||||
set(RAGEL_C_FLAGS "-Wno-unused")
|
||||
endif()
|
||||
@ -381,8 +376,6 @@ SET(hs_HEADERS
|
||||
)
|
||||
install(FILES ${hs_HEADERS} DESTINATION include/hs)
|
||||
|
||||
set(fdr_autogen_targets autogen_runtime autogen_teddy_runtime)
|
||||
|
||||
set (hs_exec_SRCS
|
||||
${hs_HEADERS}
|
||||
src/hs_version.h
|
||||
@ -400,7 +393,10 @@ set (hs_exec_SRCS
|
||||
src/fdr/flood_runtime.h
|
||||
src/fdr/fdr_loadval.h
|
||||
src/fdr/teddy.c
|
||||
src/fdr/teddy_avx2.c
|
||||
src/fdr/teddy.h
|
||||
src/fdr/teddy_internal.h
|
||||
src/fdr/teddy_runtime_common.h
|
||||
src/hwlm/hwlm.c
|
||||
src/hwlm/hwlm.h
|
||||
src/hwlm/hwlm_internal.h
|
||||
@ -929,11 +925,9 @@ set (LIB_VERSION ${HS_VERSION})
|
||||
set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION})
|
||||
|
||||
add_library(hs_exec OBJECT ${hs_exec_SRCS})
|
||||
add_dependencies(hs_exec ${fdr_autogen_targets})
|
||||
|
||||
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
|
||||
add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
|
||||
add_dependencies(hs_exec_shared ${fdr_autogen_targets})
|
||||
set_target_properties(hs_exec_shared PROPERTIES
|
||||
POSITION_INDEPENDENT_CODE TRUE)
|
||||
endif()
|
||||
@ -964,7 +958,6 @@ endif()
|
||||
add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>)
|
||||
|
||||
add_dependencies(hs ragel_Parser)
|
||||
add_dependencies(hs autogen_teddy_compiler)
|
||||
|
||||
if (NOT BUILD_SHARED_LIBS)
|
||||
install(TARGETS hs DESTINATION lib)
|
||||
@ -973,7 +966,6 @@ endif()
|
||||
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
|
||||
add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>)
|
||||
add_dependencies(hs_shared ragel_Parser)
|
||||
add_dependencies(hs_shared autogen_compiler autogen_teddy_compiler)
|
||||
set_target_properties(hs_shared PROPERTIES
|
||||
OUTPUT_NAME hs
|
||||
VERSION ${LIB_VERSION}
|
||||
|
@ -1,33 +0,0 @@
|
||||
# The set of rules and other nastiness for generating FDR/Teddy source
|
||||
|
||||
# we need to add these as explicit dependencies
|
||||
set(AUTOGEN_PY_FILES
|
||||
arch.py
|
||||
autogen.py
|
||||
autogen_utils.py
|
||||
teddy_autogen.py
|
||||
)
|
||||
|
||||
function(fdr_autogen type out)
|
||||
add_custom_command (
|
||||
COMMENT "AUTOGEN ${out}"
|
||||
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${out}"
|
||||
COMMAND ${PYTHON} "${CMAKE_CURRENT_SOURCE_DIR}/autogen.py" ${type} > "${CMAKE_CURRENT_BINARY_DIR}/${out}"
|
||||
DEPENDS ${AUTOGEN_PY_FILES}
|
||||
)
|
||||
add_custom_target(autogen_${type} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${out}")
|
||||
endfunction(fdr_autogen)
|
||||
|
||||
#now build the functions
|
||||
fdr_autogen(runtime fdr_autogen.c)
|
||||
fdr_autogen(teddy_runtime teddy_autogen.c)
|
||||
fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
|
||||
|
||||
set(fdr_GENERATED_SRC
|
||||
${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
|
||||
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
|
||||
${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
|
||||
PARENT_SCOPE)
|
||||
|
||||
set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
@ -1,58 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (c) 2015, Intel Corporation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import autogen_utils
|
||||
|
||||
# wrapper for architectures
|
||||
|
||||
class Arch:
|
||||
def __init__(self, name, extensions = []):
|
||||
self.name = name
|
||||
self.extensions = extensions
|
||||
self.target = None
|
||||
|
||||
def get_guard(self):
|
||||
# these defines definitely fall into the "belt-and-suspenders"
|
||||
# category of paranoia
|
||||
if (self.guard_list == []):
|
||||
return "#if 1"
|
||||
|
||||
return "#if " + " && ".join(self.guard_list)
|
||||
|
||||
class X86Arch(Arch):
|
||||
def __init__(self, name, extensions = []):
|
||||
Arch.__init__(self, name, extensions)
|
||||
self.guard_list = [ ]
|
||||
self.target = "0"
|
||||
|
||||
if "AVX2" in extensions:
|
||||
self.target += " | HS_CPU_FEATURES_AVX2"
|
||||
self.guard_list += [ "defined(__AVX2__)" ]
|
||||
|
||||
|
||||
arch_x86_64 = X86Arch("x86_64", extensions = [ ])
|
||||
arch_x86_64_avx2 = X86Arch("x86_64_avx2", extensions = [ "AVX2" ])
|
@ -1,118 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (c) 2015-2016, Intel Corporation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import sys
|
||||
from autogen_utils import *
|
||||
from teddy_autogen import *
|
||||
from arch import *
|
||||
|
||||
# teddy setup
|
||||
|
||||
def build_teddy_matchers():
|
||||
all_matchers = [ ]
|
||||
|
||||
# AVX2
|
||||
all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ]
|
||||
all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ]
|
||||
for n_msk in range(1, 5):
|
||||
all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ]
|
||||
all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ]
|
||||
|
||||
# SSE/SSE2/SSSE3
|
||||
for n_msk in range(1, 5):
|
||||
all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ]
|
||||
all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ]
|
||||
|
||||
return all_matchers
|
||||
|
||||
def produce_teddy_compiles(l):
|
||||
print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {"
|
||||
print " static const TeddyEngineDef defns[] = {"
|
||||
for m in l:
|
||||
m.produce_compile_call()
|
||||
print " };"
|
||||
print " out->clear();"
|
||||
print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
|
||||
print " out->push_back(TeddyEngineDescription(defns[i]));"
|
||||
print " }"
|
||||
print "}"
|
||||
|
||||
# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they
|
||||
# are linked. So we either generate the function or we don't - then at the point of the
|
||||
# header in fdr_autogen.c we either generate the header or we #define the zero.
|
||||
|
||||
def produce_teddy_runtimes(l):
|
||||
# Since we're using -Wmissing-prototypes, we need headers first.
|
||||
for m in l:
|
||||
m.produce_guard()
|
||||
print m.produce_header(visible = True, header_only = True)
|
||||
m.close_guard()
|
||||
|
||||
for m in l:
|
||||
m.produce_guard()
|
||||
m.produce_code()
|
||||
m.close_guard()
|
||||
|
||||
# see produce_teddy_runtimes() comment for the rationale
|
||||
|
||||
def produce_teddy_headers(l):
|
||||
for m in l:
|
||||
m.produce_guard()
|
||||
print m.produce_header(visible = True, header_only = True)
|
||||
m.produce_zero_alternative()
|
||||
|
||||
# general utilities
|
||||
|
||||
def make_fdr_function_pointers(matcher_list):
|
||||
print """
|
||||
typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
|
||||
static FDRFUNCTYPE funcs[] = {
|
||||
"""
|
||||
all_funcs = " fdr_engine_exec,\n"
|
||||
all_funcs += ",\n".join([ " %s" % m.get_name() for m in matcher_list ])
|
||||
print all_funcs
|
||||
print """
|
||||
};
|
||||
"""
|
||||
|
||||
def assign_ids(matcher_list, next_id):
|
||||
for m in matcher_list:
|
||||
m.id = next_id
|
||||
next_id += 1
|
||||
return next_id
|
||||
|
||||
# Main entry point
|
||||
|
||||
tm = build_teddy_matchers()
|
||||
next_id = assign_ids(tm, 1)
|
||||
if sys.argv[1] == "runtime":
|
||||
produce_teddy_headers(tm)
|
||||
make_fdr_function_pointers(tm)
|
||||
elif sys.argv[1] == "teddy_runtime":
|
||||
produce_teddy_runtimes(tm)
|
||||
elif sys.argv[1] == "teddy_compiler":
|
||||
produce_teddy_compiles(tm)
|
@ -1,120 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (c) 2015-2016, Intel Corporation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import sys
|
||||
|
||||
def fail_out(msg = ""):
|
||||
print >>sys.stderr, "Internal failure in autogen.py: " + msg
|
||||
sys.exit(1)
|
||||
|
||||
class IntegerType:
|
||||
def __init__(self, size):
|
||||
self.size = size
|
||||
|
||||
def get_name(self):
|
||||
return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size]
|
||||
|
||||
def size_in_bytes(self):
|
||||
return self.size / 8
|
||||
|
||||
def zero_expression(self):
|
||||
return "0"
|
||||
|
||||
def constant_to_string(self, n):
|
||||
if self.size == 64:
|
||||
suffix = "ULL"
|
||||
else:
|
||||
suffix = ""
|
||||
return "0x%x%s" % (n & ((1 << self.size) - 1), suffix)
|
||||
|
||||
def lowbits(self, n):
|
||||
return (1 << n) - 1
|
||||
|
||||
def highbits(self, n):
|
||||
return ~(self.lowbits(self.size - n))
|
||||
|
||||
def lowbit_mask(self, n):
|
||||
return self.constant_to_string(self.lowbits(n))
|
||||
|
||||
def lowbit_extract_expr(self, expr_string, n):
|
||||
return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
|
||||
|
||||
def flip_lowbits_expr(self, expr_string, n):
|
||||
return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
|
||||
|
||||
def bit_extract_expr(self, expr_string, low, high):
|
||||
lbm = self.lowbit_mask(high - low)
|
||||
return "((%s >> %d) & %s)" % (expr_string, low, lbm)
|
||||
|
||||
# shifts are +ve if left and -ve if right
|
||||
def shift_expr(self, expr_string, n):
|
||||
if n <= -self.size or n >= self.size:
|
||||
return self.zero_expression()
|
||||
elif (n > 0):
|
||||
return "(%s << %d)" % (expr_string, n)
|
||||
elif (n < 0):
|
||||
return "(%s >> %d)" % (expr_string, -n)
|
||||
else:
|
||||
return "(%s)" % (expr_string)
|
||||
|
||||
class SIMDIntegerType(IntegerType):
|
||||
def __init__(self, size):
|
||||
IntegerType.__init__(self, size)
|
||||
|
||||
def zero_expression(self):
|
||||
return "zeroes128()"
|
||||
|
||||
def lowbit_extract_expr(self, expr_string, n):
|
||||
if (n <= 32):
|
||||
tmpType = IntegerType(32)
|
||||
tmpExpr = "movd(%s)" % expr_string
|
||||
elif (32 < n <= 64):
|
||||
tmpType = IntegerType(64)
|
||||
tmpExpr = "movq(%s)" % expr_string
|
||||
return tmpType.lowbit_extract_expr(tmpExpr, n)
|
||||
|
||||
def bit_extract_expr(self, expr_string, low, high, flip):
|
||||
fail_out("Unimplemented bit extract on m128")
|
||||
|
||||
def shift_expr(self, expr_string, n):
|
||||
if n % 8 != 0:
|
||||
fail_out("Trying to shift a m128 by a bit granular value")
|
||||
|
||||
# should check that n is divisible by 8
|
||||
if n <= -self.size or n >= self.size:
|
||||
return self.zero_expression()
|
||||
elif (n > 0):
|
||||
return "byteShiftLeft128(%s, %s)" % (expr_string, n / 8)
|
||||
elif (n < 0):
|
||||
return "byteShiftRight128(%s, %s)" % (expr_string, -n / 8)
|
||||
else:
|
||||
return "(%s)" % (expr_string)
|
||||
|
||||
def lowbit_mask(self, n):
|
||||
if n % 8 != 0:
|
||||
fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
|
||||
return self.shift_expr("ones128()", -(128 - n))
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -55,6 +55,7 @@ public:
|
||||
u32 getNumBuckets() const { return numBuckets; }
|
||||
u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
|
||||
u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
|
||||
void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; }
|
||||
|
||||
bool isValidOnTarget(const target_t &target_in) const;
|
||||
virtual u32 getDefaultFloodSuffixLength() const = 0;
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "fdr_loadval.h"
|
||||
#include "fdr_streaming_runtime.h"
|
||||
#include "flood_runtime.h"
|
||||
#include "teddy.h"
|
||||
#include "teddy_internal.h"
|
||||
#include "util/simd_utils.h"
|
||||
#include "util/simd_utils_ssse3.h"
|
||||
@ -764,7 +765,34 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#include "fdr_autogen.c"
|
||||
#if defined(__AVX2__)
|
||||
#define ONLY_AVX2(func) func
|
||||
#else
|
||||
#define ONLY_AVX2(func) NULL
|
||||
#endif
|
||||
|
||||
typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
|
||||
static const FDRFUNCTYPE funcs[] = {
|
||||
fdr_engine_exec,
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat),
|
||||
ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat),
|
||||
fdr_exec_teddy_msks1,
|
||||
fdr_exec_teddy_msks1_pck,
|
||||
fdr_exec_teddy_msks2,
|
||||
fdr_exec_teddy_msks2_pck,
|
||||
fdr_exec_teddy_msks3,
|
||||
fdr_exec_teddy_msks3_pck,
|
||||
fdr_exec_teddy_msks4,
|
||||
fdr_exec_teddy_msks4_pck,
|
||||
};
|
||||
|
||||
#define FAKE_HISTORY_SIZE 16
|
||||
static const u8 fake_history[FAKE_HISTORY_SIZE];
|
||||
|
@ -57,7 +57,7 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
|
||||
void getFdrDescriptions(vector<FDREngineDescription> *out) {
|
||||
static const FDREngineDef def = {0, 128, 8, 0, 1, 256};
|
||||
out->clear();
|
||||
out->push_back(FDREngineDescription(def));
|
||||
out->emplace_back(def);
|
||||
}
|
||||
|
||||
static
|
||||
|
744
src/fdr/teddy.c
744
src/fdr/teddy.c
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -26,11 +26,19 @@
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
/** \file
|
||||
* \brief Teddy literal matcher: SSSE3 engine runtime.
|
||||
*/
|
||||
|
||||
#include "fdr_internal.h"
|
||||
#include "flood_runtime.h"
|
||||
#include "teddy.h"
|
||||
#include "teddy_internal.h"
|
||||
#include "teddy_runtime_common.h"
|
||||
#include "util/simd_utils.h"
|
||||
#include "util/simd_utils_ssse3.h"
|
||||
|
||||
static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
|
||||
const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
@ -67,178 +75,584 @@ static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
|
||||
};
|
||||
|
||||
// Note: p_mask is an output param that initialises a poison mask.
|
||||
UNUSED static really_inline
|
||||
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
union {
|
||||
u8 val8[16];
|
||||
m128 val128;
|
||||
} u;
|
||||
u.val128 = zeroes128();
|
||||
#ifdef ARCH_64_BIT
|
||||
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(isnonzero128(var))) { \
|
||||
u64a lo = movq(var); \
|
||||
u64a hi = movq(byteShiftRight128(var, 8)); \
|
||||
if (unlikely(lo)) { \
|
||||
conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \
|
||||
control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(hi)) { \
|
||||
conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \
|
||||
control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
} \
|
||||
} while (0);
|
||||
#else
|
||||
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(isnonzero128(var))) { \
|
||||
u32 part1 = movd(var); \
|
||||
u32 part2 = movd(byteShiftRight128(var, 4)); \
|
||||
u32 part3 = movd(byteShiftRight128(var, 8)); \
|
||||
u32 part4 = movd(byteShiftRight128(var, 12)); \
|
||||
if (unlikely(part1)) { \
|
||||
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
|
||||
control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part2)) { \
|
||||
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
|
||||
control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part3)) { \
|
||||
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
|
||||
control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
if (unlikely(part4)) { \
|
||||
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
|
||||
control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
} \
|
||||
} while (0);
|
||||
#endif
|
||||
|
||||
if (ptr >= lo) {
|
||||
u32 avail = (u32)(hi - ptr);
|
||||
if (avail >= 16) {
|
||||
*p_mask = load128((const void*)(p_mask_arr[16] + 16));
|
||||
return loadu128(ptr);
|
||||
}
|
||||
*p_mask = load128((const void*)(p_mask_arr[avail] + 16));
|
||||
for (u32 i = 0; i < avail; i++) {
|
||||
u.val8[i] = ptr[i];
|
||||
}
|
||||
} else {
|
||||
u32 need = MIN((u32)(lo - ptr), MIN(len_history, nMasks - 1));
|
||||
u32 start = (u32)(lo - ptr);
|
||||
u32 i;
|
||||
for (i = start - need; ptr + i < lo; i++) {
|
||||
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
|
||||
}
|
||||
u32 end = MIN(16, (u32)(hi - ptr));
|
||||
*p_mask = loadu128((const void*)(p_mask_arr[end - start] + 16 - start));
|
||||
for (; i < end; i++) {
|
||||
u.val8[i] = ptr[i];
|
||||
}
|
||||
static really_inline
|
||||
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
|
||||
m128 mask = set16x8(0xf);
|
||||
m128 lo = and128(val, mask);
|
||||
m128 hi = and128(rshift2x64(val, 4), mask);
|
||||
return and128(and128(pshufb(maskBase[0*2], lo),
|
||||
pshufb(maskBase[0*2+1], hi)), p_mask);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask,
|
||||
m128 val) {
|
||||
m128 mask = set16x8(0xf);
|
||||
m128 lo = and128(val, mask);
|
||||
m128 hi = and128(rshift2x64(val, 4), mask);
|
||||
m128 r = prep_conf_teddy_m1(maskBase, p_mask, val);
|
||||
|
||||
m128 res_1 = and128(pshufb(maskBase[1*2], lo),
|
||||
pshufb(maskBase[1*2+1], hi));
|
||||
m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
|
||||
*old_1 = res_1;
|
||||
return and128(and128(r, p_mask), res_shifted_1);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
|
||||
m128 p_mask, m128 val) {
|
||||
m128 mask = set16x8(0xf);
|
||||
m128 lo = and128(val, mask);
|
||||
m128 hi = and128(rshift2x64(val, 4), mask);
|
||||
m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val);
|
||||
|
||||
m128 res_2 = and128(pshufb(maskBase[2*2], lo),
|
||||
pshufb(maskBase[2*2+1], hi));
|
||||
m128 res_shifted_2 = palignr(res_2, *old_2, 16-2);
|
||||
*old_2 = res_2;
|
||||
return and128(r, res_shifted_2);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
|
||||
m128 *old_3, m128 p_mask, m128 val) {
|
||||
m128 mask = set16x8(0xf);
|
||||
m128 lo = and128(val, mask);
|
||||
m128 hi = and128(rshift2x64(val, 4), mask);
|
||||
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val);
|
||||
|
||||
m128 res_3 = and128(pshufb(maskBase[3*2], lo),
|
||||
pshufb(maskBase[3*2+1], hi));
|
||||
m128 res_shifted_3 = palignr(res_3, *old_3, 16-3);
|
||||
*old_3 = res_3;
|
||||
return and128(r, res_shifted_3);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 1);
|
||||
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
return u.val128;
|
||||
}
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
UNUSED static really_inline
|
||||
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
m128 p_mask128;
|
||||
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, len_history, nMasks));
|
||||
*p_mask = set2x128(p_mask128);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Byte-mask lookup table read by vectoredLoad256().
 *
 * Row n (n = 0..32) is 64 bytes: 32 zero bytes, then n bytes of 0xff, then
 * (32 - n) zero bytes. Readers take a 32-byte window out of a row: a window
 * starting at offset 32 yields n leading 0xff bytes, while a window starting
 * at offset (32 - start) moves that run `start` bytes further in, with the
 * zero prefix of the row filling the bytes in front of it.
 */
static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
|
||||
};
|
||||
|
||||
|
||||
UNUSED static really_inline
|
||||
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history) {
|
||||
union {
|
||||
u8 val8[32];
|
||||
m256 val256;
|
||||
} u;
|
||||
|
||||
if (ptr >= lo) {
|
||||
u32 avail = (u32)(hi - ptr);
|
||||
if (avail >= 32) {
|
||||
*p_mask = load256((const void*)(p_mask_arr256[32] + 32));
|
||||
return loadu256(ptr);
|
||||
}
|
||||
*p_mask = load256((const void*)(p_mask_arr256[avail] + 32));
|
||||
for (u32 i = 0; i < avail; i++) {
|
||||
u.val8[i] = ptr[i];
|
||||
}
|
||||
} else {
|
||||
// need contains "how many chars to pull from history"
|
||||
// calculate based on what we need, what we have in the buffer
|
||||
// and only what we need to make primary confirm work
|
||||
u32 start = (u32)(lo - ptr);
|
||||
u32 i;
|
||||
for (i = start; ptr + i < lo; i++) {
|
||||
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
|
||||
}
|
||||
u32 end = MIN(32, (u32)(hi - ptr));
|
||||
*p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start));
|
||||
for (; i < end; i++) {
|
||||
u.val8[i] = ptr[i];
|
||||
}
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
return u.val256;
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
#endif // __AVX2__
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 1);
|
||||
|
||||
#define P0(cnd) unlikely(cnd)
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
#include "fdr.h"
|
||||
#include "fdr_internal.h"
|
||||
#include "flood_runtime.h"
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
#include "fdr_confirm.h"
|
||||
#include "fdr_confirm_runtime.h"
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
#include "fdr_loadval.h"
|
||||
#include "util/bitutils.h"
|
||||
#include "teddy_internal.h"
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 1);
|
||||
m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#include "teddy_autogen.c"
|
||||
hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 2);
|
||||
|
||||
m128 res_old_1 = ones128();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
|
||||
load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
|
||||
load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
|
||||
load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 2);
|
||||
|
||||
m128 res_old_1 = ones128();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
|
||||
load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
|
||||
load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
|
||||
load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 2);
|
||||
m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 3);
|
||||
|
||||
m128 res_old_1 = ones128();
|
||||
m128 res_old_2 = ones128();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
ones128(), load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 3);
|
||||
|
||||
m128 res_old_1 = ones128();
|
||||
m128 res_old_2 = ones128();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
ones128(), load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 3);
|
||||
m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
|
||||
p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 4);
|
||||
|
||||
m128 res_old_1 = ones128();
|
||||
m128 res_old_2 = ones128();
|
||||
m128 res_old_3 = ones128();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, ones128(), load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
const u8 *buf_end = a->buf + a->len;
|
||||
const u8 *ptr = a->buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect;
|
||||
u32 last_match = (u32)-1;
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr;
|
||||
const size_t iterBytes = 32;
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
|
||||
a->buf, a->len, a->start_offset);
|
||||
|
||||
const m128 *maskBase = getMaskBase(teddy);
|
||||
const u32 *confBase = getConfBase(teddy, 4);
|
||||
|
||||
m128 res_old_1 = ones128();
|
||||
m128 res_old_2 = ones128();
|
||||
m128 res_old_3 = ones128();
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
|
||||
if (ptr < mainStart) {
|
||||
ptr = mainStart - 16;
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
if (ptr + 16 < buf_end) {
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
ptr += 16;
|
||||
}
|
||||
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
CHECK_FLOOD;
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, ones128(), load128(ptr));
|
||||
CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, ones128(), load128(ptr + 16));
|
||||
CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
for (; ptr < buf_end; ptr += 16) {
|
||||
m128 p_mask;
|
||||
m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
|
||||
a->buf_history, a->len_history, 4);
|
||||
m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
|
||||
&res_old_3, p_mask, val_0);
|
||||
CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
|
||||
}
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
108
src/fdr/teddy.h
Normal file
108
src/fdr/teddy.h
Normal file
@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Teddy literal matcher: function declarations.
|
||||
*/
|
||||
|
||||
#ifndef TEDDY_H_
#define TEDDY_H_

#include "hwlm/hwlm.h"

struct FDR; // forward declaration from fdr_internal.h
struct FDR_Runtime_Args;

/*
 * Every entry point below shares one signature: it takes the engine bytecode
 * and the runtime arguments and returns an hwlm_error_t.
 *
 * NOTE(review): the fdr_exec_s{1,2,4}_w128 names presumably refer to FDR
 * models with stride 1/2/4 and a 128-bit width -- no definition is visible
 * from this header, so confirm before relying on that reading.
 */

hwlm_error_t fdr_exec_s1_w128(const struct FDR *fdr,
                              const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_s2_w128(const struct FDR *fdr,
                              const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_s4_w128(const struct FDR *fdr,
                              const struct FDR_Runtime_Args *a);

/*
 * Teddy models. Naming scheme: "msksN" is the number of masks (1-4) and the
 * "_pck" suffix marks the packed variant of the same mask count.
 */

hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a);

/*
 * AVX2-only models; declared only when the compiler defines __AVX2__.
 * NOTE(review): "_fat" presumably denotes the 16-bucket models and "_fast"
 * a separate single-mask model -- confirm against the AVX2 implementation.
 */
#if defined(__AVX2__)

hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
                                            const struct FDR_Runtime_Args *a);

hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
                                                const struct FDR_Runtime_Args *a);

#endif /* __AVX2__ */

#endif /* TEDDY_H_ */
|
@ -1,773 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (c) 2015-2016, Intel Corporation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import sys
|
||||
from autogen_utils import *
|
||||
from string import Template
|
||||
|
||||
class MT:
    """Code generator for one 128-bit Teddy runtime model.

    Each instance describes a model by architecture object, number of masks,
    number of buckets and whether the confirm table is "packed". The
    ``produce_*`` methods write C source text to stdout; ``produce_header``
    and ``produce_confirm_base`` return text instead.

    The text of every emitted C line is kept exactly as it appears in the
    available source; only the Python structure was restored. Single-argument
    ``print(...)`` calls are used so the module parses on Python 2 and 3 with
    identical output.
    """

    def produce_header(self, visible, header_only = False):
        """Return the C signature for this model.

        visible     -- when false, prefix the signature with
                       "static never_inline".
        header_only -- when true, terminate with ";" (a declaration);
                       otherwise open the function body with "{".
        """
        s = ""
        if not visible:
            s += "static never_inline"
        s += """
hwlm_error_t %s(UNUSED const struct FDR *fdr,
UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
        if header_only:
            s += ";"
        else:
            s += "{"
        s += "\n"
        return s

    def produce_guard(self):
        """Print the preprocessor guard supplied by the architecture object."""
        print(self.arch.get_guard())

    def produce_zero_alternative(self):
        """Print an #else branch that defines this model's name as 0."""
        print("""
#else
#define %s 0
#endif
""" % self.get_name())

    def close_guard(self):
        """Print the #endif that closes produce_guard()."""
        print("#endif")

    def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
        """Return the C confirm loop used by packed models.

        conf_var_name      -- C variable holding the candidate bits.
        conf_var_size      -- 32 or 64; selects findAndClearLSB_<size>.
        offset             -- byte offset added to each candidate position.
        cautious           -- emit VECTORING (true) or NOT_CAUTIOUS (false).
        enable_confirmless -- emit the short-circuit for confirm entries
                              where fdrc->mult is zero.
        do_bailout         -- emit a bounds check that skips candidates
                              outside [start_offset, len).
        """
        if cautious:
            caution_string = "VECTORING"
        else:
            caution_string = "NOT_CAUTIOUS"
        # Mask selecting the low bits of the byte used as confirm split index.
        conf_split_mask = IntegerType(32).constant_to_string(
            self.conf_top_level_split - 1)
        if enable_confirmless:
            quick_check_string = """
if (!fdrc->mult) {
u32 id = fdrc->nBitsOrSoleID;
if ((last_match == id) && (fdrc->flags & NoRepeat))
continue;
last_match = id;
controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
continue;
} """
        else:
            quick_check_string = ""
        if do_bailout:
            bailout_string = """
if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
        else:
            bailout_string = ""

        return Template("""
if (P0(!!$CONFVAR)) {
do {
u32 bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
u32 byte = bit / $NUM_BUCKETS + $OFFSET;
u32 bitRem = bit % $NUM_BUCKETS;
$BAILOUT_STRING
u32 confSplit = *(ptr+byte) & $SPLIT_MASK;
u32 idx = confSplit * $NUM_BUCKETS + bitRem;
u32 cf = confBase[idx];
if (!cf)
continue;
fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
if (!(fdrc->groups & *control))
continue;
$QUICK_CHECK_STRING
CautionReason reason = $CAUTION_STRING;
CONF_TYPE v;
const u8 * confirm_loc = ptr + byte - $CONF_PULL_BACK - 7;
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
v = lv_u64a(confirm_loc, buf, buf + len);
} else { // r == VECTORING, confirm_loc < buf
u64a histBytes = a->histBytes;
v = lv_u64a_ce(confirm_loc, buf, buf + len);
// stitch together v (which doesn't move) and history (which does)
u32 overhang = buf - confirm_loc;
histBytes >>= 64 - (overhang * 8);
v |= histBytes;
}
confWithBit(fdrc, a, ptr - buf + byte, $CONF_PULL_BACK, control, &last_match, v);
} while(P0(!!$CONFVAR));
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
}""").substitute(CONFVAR = conf_var_name,
                 CONFVAR_SIZE = conf_var_size,
                 NUM_BUCKETS = self.num_buckets,
                 OFFSET = offset,
                 SPLIT_MASK = conf_split_mask,
                 QUICK_CHECK_STRING = quick_check_string,
                 BAILOUT_STRING = bailout_string,
                 CAUTION_STRING = caution_string,
                 CONF_PULL_BACK = self.conf_pull_back)

    def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
        """Print the confirm loop for one extracted chunk of candidate bits.

        Packed models delegate to produce_confirm_base(); unpacked models
        index confBase directly by bucket and call confWithBit1 (one mask)
        or confWithBitMany (several masks).
        """
        if self.packed:
            print(self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False))
        else:
            if cautious:
                caution_string = "VECTORING"
            else:
                caution_string = "NOT_CAUTIOUS"

            print(" if (P0(!!%s)) {" % var_name)
            print(" do {")
            if bits == 64:
                print(" u32 bit = findAndClearLSB_64(&%s);" % (var_name))
            else:
                print(" u32 bit = findAndClearLSB_32(&%s);" % (var_name))
            print(" u32 byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset))
            print(" u32 idx = bit %% %d;" % self.num_buckets)
            print(" u32 cf = confBase[idx];")
            print(" fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);")
            print(" if (!(fdrc->groups & *control))")
            print(" continue;")
            print("""
CautionReason reason = %s;
CONF_TYPE v;
const u8 * confirm_loc = ptr + byte - 7;
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
v = lv_u64a(confirm_loc, buf, buf + len);
} else { // r == VECTORING, confirm_loc < buf
u64a histBytes = a->histBytes;
v = lv_u64a_ce(confirm_loc, buf, buf + len);
// stitch together v (which doesn't move) and history (which does)
u32 overhang = buf - confirm_loc;
histBytes >>= 64 - (overhang * 8);
v |= histBytes;
}""" % (caution_string))
            if self.num_masks == 1:
                print(" confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);")
            else:
                print(" confWithBitMany(fdrc, a, ptr - buf + byte, %s, control, &last_match, v);" % (caution_string))
            print(" } while(P0(!!%s));" % var_name)
            print(" if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {")
            print(" *a->groups = controlVal;")
            print(" return HWLM_TERMINATED;")
            print(" }")
            print(" }")

    def produce_needed_temporaries(self, max_iterations):
        """Print declarations for every C temporary the iterations use."""
        print(" m128 p_mask;")
        for iter in range(0, max_iterations):
            print(" m128 val_%d;" % iter)
            print(" m128 val_%d_lo;" % iter)
            print(" m128 val_%d_hi;" % iter)
            for x in range(self.num_masks):
                print(" m128 res_%d_%d;" % (iter, x))
                if x != 0:
                    print(" m128 res_shifted_%d_%d;" % (iter, x))
            print(" m128 r_%d;" % iter)
            print("#ifdef ARCH_64_BIT")
            print(" u64a r_%d_lopart;" % iter)
            print(" u64a r_%d_hipart;" % iter)
            print("#else")
            print(" u32 r_%d_part1;" % iter)
            print(" u32 r_%d_part2;" % iter)
            print(" u32 r_%d_part3;" % iter)
            print(" u32 r_%d_part4;" % iter)
            print("#endif")

    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
                                         cautious, save_old):
        """Print the load and mask-lookup code for one 16-byte block.

        For masks after the first, the result is combined with the previous
        block's result via palignr; on the last iteration the result is saved
        into res_old_<x> when save_old is set.
        """
        if cautious:
            print(" val_%d = vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks))
        else:
            print(" val_%d = load128(ptr + %d);" % (iter, iter*16))
        print(" val_%d_lo = and128(val_%d, lomask);" % (iter, iter))
        print(" val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter))
        print(" val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter))
        print("")
        for x in range(self.num_masks):
            print(Template("""
res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2] , val_${ITER}_lo),
pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x))
            if x != 0:
                if iter == 0:
                    print(" res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x))
                else:
                    print(" res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x))
            if x != 0 and iter == effective_num_iterations - 1 and save_old:
                print(" res_old_%d = res_%d_%d;" % (x, iter, x))
        print("")
        if cautious:
            print(" r_%d = and128(res_%d_0, p_mask);" % (iter, iter))
        else:
            print(" r_%d = res_%d_0;" % (iter, iter))
        for x in range(1, self.num_masks):
            print(" r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x))
        print("")

    def produce_one_iteration_confirm(self, iter, confirmCautious):
        """Print extraction of r_<iter> into scalar parts plus their confirms.

        Two 64-bit parts are emitted under ARCH_64_BIT, four 32-bit parts
        otherwise. Each tuple is (byte offset, C variable, C initialiser).
        """
        setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter),
                    (8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ]

        setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter),
                    (4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter),
                    (8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter),
                    (12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ]

        print(" if (P0(isnonzero128(r_%d))) {" % (iter))
        print("#ifdef ARCH_64_BIT")
        for (off, val, init) in setup64:
            print(" %s = %s;" % (val, init))
        for (off, val, init) in setup64:
            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
        print("#else")
        for (off, val, init) in setup32:
            print(" %s = %s;" % (val, init))
        for (off, val, init) in setup32:
            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
        print("#endif")
        print(" }")

    def produce_one_iteration(self, iter, effective_num_iterations, cautious = False,
                              confirmCautious = True, save_old = True):
        """Print state calculation followed by confirm for one block."""
        self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old)
        self.produce_one_iteration_confirm(iter, confirmCautious)

    def produce_code(self):
        """Print the complete C function for this model.

        Layout of the emitted function: prologue and temporaries, an
        unaligned-head block, one extra cautious-confirm block, the main
        unrolled loop with flood detection, and a cautious tail loop.
        """
        print(self.produce_header(visible = True, header_only = False))
        print("""
const u8 * buf = a->buf;
const size_t len = a->len;
const u8 * ptr = buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t * control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 * tryFloodDetect = a->firstFloodDetect;
const struct FDRConfirm *fdrc;
u32 last_match = (u32)-1;
""")
        print("")

        self.produce_needed_temporaries(self.num_iterations)
        print("")

        print(" const struct Teddy * teddy = (const struct Teddy *)fdr;")
        print(" const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));")
        print(" const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks)
        print(" const u8 * mainStart = ROUNDUP_PTR(ptr, 16);")
        print(" const size_t iterBytes = %d;" % (self.num_iterations * 16))

        print(' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",'
              ' buf, len, a->start_offset);')
        print(' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,'
              ' mainStart);')

        for x in range(self.num_masks):
            if (x != 0):
                print(" m128 res_old_%d = ones128();" % x)
        print(" m128 lomask = set16x8(0xf);")

        print(" if (ptr < mainStart) {")
        print(" ptr = mainStart - 16;")
        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
        print(" ptr += 16;")
        print(" }")

        print(" if (ptr + 16 < buf + len) {")
        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
        print(" ptr += 16;")
        print(" }")

        print("""
for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
if (P0(ptr > tryFloodDetect)) {
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
}
""")
        for iter in range(self.num_iterations):
            self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)

        print(" }")

        print(" for (; ptr < buf + len; ptr += 16) {")
        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
        print(" }")

        print("""
*a->groups = controlVal;
return HWLM_SUCCESS;
}
""")

    def produce_compile_call(self):
        """Print this model's row for the compile-side description table."""
        packed_str = { False : "false", True : "true"}[self.packed]
        print(" { %d, %s, %d, %d, %s, %d, %d }," % (
            self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
            self.conf_pull_back, self.conf_top_level_split))

    def get_name(self):
        """Return the C function name: arch, mask count, "_pck" and "_fat"."""
        if self.packed:
            pck_string = "_pck"
        else:
            pck_string = ""

        # Sixteen-bucket models carry the "_fat" suffix.
        if self.num_buckets == 16:
            type_string = "_fat"
        else:
            type_string = ""

        return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string)

    def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8):
        """Record the model parameters; the main loop is unrolled twice."""
        self.arch = arch
        self.packed = packed
        self.num_masks = num_masks
        self.num_buckets = num_buckets
        self.num_iterations = 2

        if packed:
            self.conf_top_level_split = 32
        else:
            self.conf_top_level_split = 1
        # Read by both packed and unpacked paths, so set unconditionally.
        self.conf_pull_back = 0
|
||||
|
||||
class MTFat(MT):
    """Generator for the 256-bit ("fat") variant of an MT model.

    Overrides the temporaries, state calculation, confirm extraction and
    top-level code emission of MT to use m256 values and the 256-bit helper
    names; everything else is inherited. The text of each emitted C line is
    kept exactly as it appears in the available source, and single-argument
    ``print(...)`` calls keep output identical on Python 2 while also parsing
    on Python 3.
    """

    def produce_needed_temporaries(self, max_iterations):
        """Print m256 temporaries plus the scalar parts of each result."""
        print(" m256 p_mask;")
        for iter in range(0, max_iterations):
            print(" m256 val_%d;" % iter)
            print(" m256 val_%d_lo;" % iter)
            print(" m256 val_%d_hi;" % iter)
            for x in range(self.num_masks):
                print(" m256 res_%d_%d;" % (iter, x))
                if x != 0:
                    print(" m256 res_shifted_%d_%d;" % (iter, x))
            print(" m256 r_%d;" % iter)
            print("#ifdef ARCH_64_BIT")
            print(" u64a r_%d_part1;" % iter)
            print(" u64a r_%d_part2;" % iter)
            print(" u64a r_%d_part3;" % iter)
            print(" u64a r_%d_part4;" % iter)
            print("#else")
            print(" u32 r_%d_part1;" % iter)
            print(" u32 r_%d_part2;" % iter)
            print(" u32 r_%d_part3;" % iter)
            print(" u32 r_%d_part4;" % iter)
            print(" u32 r_%d_part5;" % iter)
            print(" u32 r_%d_part6;" % iter)
            print(" u32 r_%d_part7;" % iter)
            print(" u32 r_%d_part8;" % iter)
            print("#endif")

    def produce_code(self):
        """Print the complete C function for this fat model.

        Same layout as MT.produce_code, but mask and result vectors are m256
        and the confirm table offset accounts for masks of twice the size.
        """
        print(self.produce_header(visible = True, header_only = False))
        print("""
const u8 * buf = a->buf;
const size_t len = a->len;
const u8 * ptr = buf + a->start_offset;
hwlmcb_rv_t controlVal = *a->groups;
hwlmcb_rv_t * control = &controlVal;
u32 floodBackoff = FLOOD_BACKOFF_START;
const u8 * tryFloodDetect = a->firstFloodDetect;
const struct FDRConfirm *fdrc;
u32 last_match = (u32)-1;
""")
        print("")

        self.produce_needed_temporaries(self.num_iterations)
        print("")

        print(" const struct Teddy * teddy = (const struct Teddy *)fdr;")
        print(" const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));")
        print(" const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks)
        print(" const u8 * mainStart = ROUNDUP_PTR(ptr, 16);")
        print(" const size_t iterBytes = %d;" % (self.num_iterations * 16))

        print(' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",'
              ' buf, len, a->start_offset);')
        print(' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,'
              ' mainStart);')

        for x in range(self.num_masks):
            if (x != 0):
                print(" m256 res_old_%d = ones256();" % x)
        print(" m256 lomask = set32x8(0xf);")

        print(" if (ptr < mainStart) {")
        print(" ptr = mainStart - 16;")
        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
        print(" ptr += 16;")
        print(" }")

        print(" if (ptr + 16 < buf + len) {")
        self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
        print(" ptr += 16;")
        print(" }")

        print("""
for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
__builtin_prefetch(ptr + (iterBytes*4));
if (P0(ptr > tryFloodDetect)) {
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
*a->groups = controlVal;
return HWLM_TERMINATED;
}
}
""")

        for iter in range(self.num_iterations):
            self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)

        print(" }")

        print(" for (; ptr < buf + len; ptr += 16) {")
        self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
        print(" }")

        print("""
*a->groups = controlVal;
return HWLM_SUCCESS;
}
""")

    def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
                                         cautious, save_old):
        """Print the 256-bit load and mask-lookup code for one block.

        Mirrors MT.produce_one_iteration_state_calc using load2x128 /
        vectoredLoad2x128, vpshufb, vpalignr and and256.
        """
        if cautious:
            print(" val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks))
        else:
            print(" val_%d = load2x128(ptr + %d);" % (iter, iter*16))
        print(" val_%d_lo = and256(val_%d, lomask);" % (iter, iter))
        print(" val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter))
        print(" val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter))
        print("")
        for x in range(self.num_masks):
            print(Template("""
res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2] , val_${ITER}_lo),
vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x))
            if x != 0:
                if iter == 0:
                    print(" res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x))
                else:
                    print(" res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x))
            if x != 0 and iter == effective_num_iterations - 1 and save_old:
                print(" res_old_%d = res_%d_%d;" % (x, iter, x))
        print("")
        if cautious:
            print(" r_%d = and256(res_%d_0, p_mask);" % (iter, iter))
        else:
            print(" r_%d = res_%d_0;" % (iter, iter))
        for x in range(1, self.num_masks):
            print(" r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x))
        print("")

    def produce_one_iteration_confirm(self, iter, confirmCautious):
        """Print extraction of r_<iter> into scalar parts plus their confirms.

        The result is interleaved with its 128-bit-swapped copy before
        extraction; the initialiser of the middle part also re-interleaves
        the high halves so the remaining parts read from the updated r.
        Each tuple is (byte offset, C variable, C initialiser).
        """
        setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"),
                    (4, "r_%d_part2" % iter, "extract64from256(r, 1);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
                    (8, "r_%d_part3" % iter, "extractlow64from256(r)"),
                    (12, "r_%d_part4" % iter, "extract64from256(r, 1)") ]

        setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"),
                    (2, "r_%d_part2" % iter, "extract32from256(r, 1)"),
                    (4, "r_%d_part3" % iter, "extract32from256(r, 2)"),
                    (6, "r_%d_part4" % iter, "extract32from256(r, 3);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
                    (8, "r_%d_part5" % iter, "extractlow32from256(r)"),
                    (10, "r_%d_part6" % iter, "extract32from256(r, 1)"),
                    (12, "r_%d_part7" % iter, "extract32from256(r, 2)"),
                    (14, "r_%d_part8" % iter, "extract32from256(r, 3)") ]

        print(" if (P0(isnonzero256(r_%d))) {" % (iter))
        print(" m256 r_swap = swap128in256(r_%d);" % (iter))
        print(" m256 r = interleave256lo(r_%d, r_swap);" % (iter))
        print("#ifdef ARCH_64_BIT")
        for (off, val, init) in setup64:
            print(" %s = %s;" % (val, init))

        for (off, val, init) in setup64:
            self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
        print("#else")
        for (off, val, init) in setup32:
            print(" %s = %s;" % (val, init))

        for (off, val, init) in setup32:
            self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
        print("#endif")
        print(" }")
|
||||
|
||||
class MTFast:
|
||||
def produce_header(self, visible, header_only = False):
|
||||
s = ""
|
||||
if not visible:
|
||||
s += "static never_inline"
|
||||
s += """
|
||||
hwlm_error_t %s(UNUSED const struct FDR *fdr,
|
||||
UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
|
||||
if header_only:
|
||||
s += ";"
|
||||
else:
|
||||
s += "{"
|
||||
s += "\n"
|
||||
return s
|
||||
|
||||
def produce_guard(self):
|
||||
print self.arch.get_guard()
|
||||
|
||||
def produce_zero_alternative(self):
|
||||
print """
|
||||
#else
|
||||
#define %s 0
|
||||
#endif
|
||||
""" % self.get_name()
|
||||
|
||||
def close_guard(self):
|
||||
print "#endif"
|
||||
|
||||
def produce_confirm(self, cautious):
|
||||
if cautious:
|
||||
cautious_str = "VECTORING"
|
||||
else:
|
||||
cautious_str = "NOT_CAUTIOUS"
|
||||
|
||||
print " for (u32 i = 0; i < arrCnt; i++) {"
|
||||
print " u32 byte = bitArr[i] / 8;"
|
||||
if self.packed:
|
||||
conf_split_mask = IntegerType(32).constant_to_string(
|
||||
self.conf_top_level_split - 1)
|
||||
print " u32 bitRem = bitArr[i] % 8;"
|
||||
print " u32 confSplit = *(ptr+byte) & 0x1f;"
|
||||
print " u32 idx = confSplit * %d + bitRem;" % self.num_buckets
|
||||
print " u32 cf = confBase[idx];"
|
||||
print " if (!cf)"
|
||||
print " continue;"
|
||||
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
|
||||
print " if (!(fdrc->groups & *control))"
|
||||
print " continue;"
|
||||
print """
|
||||
CautionReason reason = %s;
|
||||
CONF_TYPE v;
|
||||
const u8 * confirm_loc = ptr + byte - 7;
|
||||
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
v = lv_u64a(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
v = lv_u64a_ce(confirm_loc, buf, buf + len);
|
||||
// stitch together v (which doesn't move) and history (which does)
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
v |= histBytes;
|
||||
}""" % (cautious_str)
|
||||
print " confWithBit(fdrc, a, ptr - buf + byte, 0, control, &last_match, v);"
|
||||
else:
|
||||
print " u32 cf = confBase[bitArr[i] % 8];"
|
||||
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
|
||||
print """
|
||||
CautionReason reason = %s;
|
||||
CONF_TYPE v;
|
||||
const u8 * confirm_loc = ptr + byte - 7;
|
||||
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
v = lv_u64a(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
v = lv_u64a_ce(confirm_loc, buf, buf + len);
|
||||
// stitch together v (which doesn't move) and history (which does)
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
v |= histBytes;
|
||||
}""" % (cautious_str)
|
||||
print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);"
|
||||
print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
|
||||
print " *a->groups = controlVal;"
|
||||
print " return HWLM_TERMINATED;"
|
||||
print " }"
|
||||
print " }"
|
||||
|
||||
def produce_needed_temporaries(self, max_iterations):
|
||||
print " u32 arrCnt;"
|
||||
print " u16 bitArr[512];"
|
||||
print " m256 p_mask;"
|
||||
print " m256 val_0;"
|
||||
print " m256 val_0_lo;"
|
||||
print " m256 val_0_hi;"
|
||||
print " m256 res_0;"
|
||||
print " m256 res_1;"
|
||||
print " m128 lo_part;"
|
||||
print " m128 hi_part;"
|
||||
print "#ifdef ARCH_64_BIT"
|
||||
print " u64a r_0_part;"
|
||||
print "#else"
|
||||
print " u32 r_0_part;"
|
||||
print "#endif"
|
||||
|
||||
def produce_bit_scan(self, offset, bits):
|
||||
print " while (P0(!!r_0_part)) {"
|
||||
if bits == 64:
|
||||
print " bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset)
|
||||
else:
|
||||
print " bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset)
|
||||
print " }"
|
||||
|
||||
def produce_bit_check_128(self, var_name, offset):
|
||||
print " if (P0(isnonzero128(%s))) {" % (var_name)
|
||||
print "#ifdef ARCH_64_BIT"
|
||||
print " r_0_part = movq(%s);" % (var_name)
|
||||
self.produce_bit_scan(offset, 64)
|
||||
print " r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name)
|
||||
self.produce_bit_scan(offset + 1, 64)
|
||||
print "#else"
|
||||
print " r_0_part = movd(%s);" % (var_name)
|
||||
self.produce_bit_scan(offset * 2, 32)
|
||||
for step in range(1, 4):
|
||||
print " r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4)
|
||||
self.produce_bit_scan(offset * 2 + step, 32)
|
||||
print "#endif"
|
||||
print " }"
|
||||
|
||||
def produce_bit_check_256(self, iter, single_iter, cautious):
|
||||
print " if (P0(isnonzero256(res_%d))) {" % (iter)
|
||||
if single_iter:
|
||||
print " arrCnt = 0;"
|
||||
print " lo_part = cast256to128(res_%d);" % (iter)
|
||||
print " hi_part = cast256to128(swap128in256(res_%d));" % (iter)
|
||||
self.produce_bit_check_128("lo_part", iter * 4)
|
||||
self.produce_bit_check_128("hi_part", iter * 4 + 2)
|
||||
if single_iter:
|
||||
self.produce_confirm(cautious)
|
||||
print " }"
|
||||
|
||||
def produce_one_iteration_state_calc(self, iter, cautious):
|
||||
if cautious:
|
||||
print " val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32)
|
||||
else:
|
||||
print " val_0 = load256(ptr + %d);" % (iter * 32)
|
||||
print " val_0_lo = and256(val_0, lomask);"
|
||||
print " val_0_hi = rshift4x64(val_0, 4);"
|
||||
print " val_0_hi = and256(val_0_hi, lomask);"
|
||||
print " res_%d = and256(vpshufb(maskLo , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter)
|
||||
if cautious:
|
||||
print " res_%d = and256(res_%d, p_mask);" % (iter, iter)
|
||||
|
||||
def produce_code(self):
|
||||
print self.produce_header(visible = True, header_only = False)
|
||||
print """
|
||||
const u8 * buf = a->buf;
|
||||
const size_t len = a->len;
|
||||
const u8 * ptr = buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t * control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 * tryFloodDetect = a->firstFloodDetect;
|
||||
const struct FDRConfirm *fdrc;
|
||||
u32 last_match = (u32)-1;
|
||||
"""
|
||||
print
|
||||
|
||||
self.produce_needed_temporaries(self.num_iterations)
|
||||
|
||||
print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
|
||||
print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
|
||||
print " const m256 maskLo = set2x128(maskBase[0]);"
|
||||
print " const m256 maskHi = set2x128(maskBase[1]);"
|
||||
print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);"
|
||||
print " const u8 * mainStart = ROUNDUP_PTR(ptr, 32);"
|
||||
print " const size_t iterBytes = %d;" % (self.num_iterations * 32)
|
||||
|
||||
print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
|
||||
' buf, len, a->start_offset);'
|
||||
print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
|
||||
' mainStart);'
|
||||
print " const m256 lomask = set32x8(0xf);"
|
||||
|
||||
print " if (ptr < mainStart) {"
|
||||
print " ptr = mainStart - 32;"
|
||||
self.produce_one_iteration_state_calc(iter = 0, cautious = True)
|
||||
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
|
||||
print " ptr += 32;"
|
||||
print " }"
|
||||
|
||||
print " if (ptr + 32 < buf + len) {"
|
||||
self.produce_one_iteration_state_calc(iter = 0, cautious = False)
|
||||
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
|
||||
print " ptr += 32;"
|
||||
print " }"
|
||||
print """
|
||||
for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
if (P0(ptr > tryFloodDetect)) {
|
||||
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
|
||||
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
for iter in range (0, self.num_iterations):
|
||||
self.produce_one_iteration_state_calc(iter = iter, cautious = False)
|
||||
print " arrCnt = 0;"
|
||||
for iter in range (0, self.num_iterations):
|
||||
self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False)
|
||||
self.produce_confirm(cautious = False)
|
||||
print " }"
|
||||
|
||||
print " for (; ptr < buf + len; ptr += 32) {"
|
||||
self.produce_one_iteration_state_calc(iter = 0, cautious = True)
|
||||
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
|
||||
print " }"
|
||||
|
||||
print """
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
"""
|
||||
|
||||
def get_name(self):
    """Return the C function name for this engine, built from the
    architecture name, the mask count and a "_pck" marker when packed."""
    suffix = "_pck" if self.packed else ""
    return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, suffix)
|
||||
|
||||
def produce_compile_call(self):
|
||||
packed_str = { False : "false", True : "true"}[self.packed]
|
||||
print " { %d, %s, %d, %d, %s, %d, %d }," % (
|
||||
self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
|
||||
self.conf_pull_back, self.conf_top_level_split)
|
||||
|
||||
def __init__(self, arch, packed = False):
    """Set up a single-mask, 8-bucket engine description for arch.

    Packed engines use a confirm top-level split of 32; unpacked engines
    use 1. The confirm pull-back is 0 in both cases.
    """
    self.arch = arch
    self.packed = packed

    self.num_masks, self.num_buckets, self.num_iterations = 1, 8, 2

    self.conf_pull_back = 0
    self.conf_top_level_split = 32 if packed else 1
|
1110
src/fdr/teddy_avx2.c
Normal file
1110
src/fdr/teddy_avx2.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -64,7 +64,32 @@ bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const
|
||||
return false;
|
||||
}
|
||||
|
||||
#include "teddy_autogen_compiler.cpp"
|
||||
/**
 * \brief Fill \a out with the descriptions of every known Teddy engine.
 *
 * Any previous contents of \a out are discarded. Each table row lists, in
 * order: engine id, required CPU feature flags, number of masks, number of
 * buckets, packed flag, confirm pull-back and confirm top-level split.
 */
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
    static const TeddyEngineDef defns[] = {
        // Engines that require AVX2.
        {  1, HS_CPU_FEATURES_AVX2, 1,  8, false, 0,  1 },
        {  2, HS_CPU_FEATURES_AVX2, 1,  8, true,  0, 32 },
        {  3, HS_CPU_FEATURES_AVX2, 1, 16, false, 0,  1 },
        {  4, HS_CPU_FEATURES_AVX2, 1, 16, true,  0, 32 },
        {  5, HS_CPU_FEATURES_AVX2, 2, 16, false, 0,  1 },
        {  6, HS_CPU_FEATURES_AVX2, 2, 16, true,  0, 32 },
        {  7, HS_CPU_FEATURES_AVX2, 3, 16, false, 0,  1 },
        {  8, HS_CPU_FEATURES_AVX2, 3, 16, true,  0, 32 },
        {  9, HS_CPU_FEATURES_AVX2, 4, 16, false, 0,  1 },
        { 10, HS_CPU_FEATURES_AVX2, 4, 16, true,  0, 32 },
        // Engines with no CPU feature requirement.
        { 11, 0, 1, 8, false, 0,  1 },
        { 12, 0, 1, 8, true,  0, 32 },
        { 13, 0, 2, 8, false, 0,  1 },
        { 14, 0, 2, 8, true,  0, 32 },
        { 15, 0, 3, 8, false, 0,  1 },
        { 16, 0, 3, 8, true,  0, 32 },
        { 17, 0, 4, 8, false, 0,  1 },
        { 18, 0, 4, 8, true,  0, 32 },
    };

    out->clear();

    const size_t numDefns = sizeof(defns) / sizeof(defns[0]);
    for (size_t i = 0; i < numDefns; i++) {
        out->emplace_back(defns[i]);
    }
}
|
||||
|
||||
static
|
||||
size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {
|
||||
|
256
src/fdr/teddy_runtime_common.h
Normal file
256
src/fdr/teddy_runtime_common.h
Normal file
@ -0,0 +1,256 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Teddy literal matcher: common runtime procedures.
|
||||
*/
|
||||
|
||||
#ifndef TEDDY_RUNTIME_COMMON_H_
|
||||
#define TEDDY_RUNTIME_COMMON_H_
|
||||
|
||||
#include "fdr_confirm.h"
|
||||
#include "fdr_confirm_runtime.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/bitutils.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
|
||||
|
||||
/*
 * Integer type of a confirm word, and the matching find-and-clear-lowest-
 * set-bit helper, selected by ARCH_64_BIT.
 */
#if defined(ARCH_64_BIT)
#define TEDDY_CONF_TYPE u64a
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
#else /* !ARCH_64_BIT */
#define TEDDY_CONF_TYPE u32
#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
#endif /* ARCH_64_BIT */
|
||||
|
||||
/*
 * Return HWLM_TERMINATED from the enclosing function, after writing
 * controlVal back to *a->groups, if controlVal equals
 * HWLM_TERMINATE_MATCHING.
 *
 * Expects `controlVal` and `a` to be in scope at the point of use.
 *
 * The definition deliberately does NOT end in a semicolon: the caller
 * supplies it (CHECK_HWLM_TERMINATE_MATCHING;), so the macro expands to
 * exactly one statement and is safe inside an unbraced if/else.
 */
#define CHECK_HWLM_TERMINATE_MATCHING                                       \
do {                                                                        \
    if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) {                  \
        *a->groups = controlVal;                                            \
        return HWLM_TERMINATED;                                             \
    }                                                                       \
} while (0)
|
||||
|
||||
/*
 * Once ptr has moved past tryFloodDetect, call floodDetect() and store its
 * result as the new tryFloodDetect, then leave the enclosing function if
 * matching has been terminated.
 *
 * Expects `fdr`, `a`, `ptr`, `tryFloodDetect`, `floodBackoff`, `controlVal`
 * and `iterBytes` to be in scope at the point of use.
 *
 * The definition deliberately does NOT end in a semicolon: the caller
 * supplies it (CHECK_FLOOD;), so the macro expands to exactly one
 * statement and is safe inside an unbraced if/else.
 */
#define CHECK_FLOOD                                                         \
do {                                                                        \
    if (unlikely(ptr > tryFloodDetect)) {                                   \
        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
                                     &floodBackoff, &controlVal,            \
                                     iterBytes);                            \
        CHECK_HWLM_TERMINATE_MATCHING;                                      \
    }                                                                       \
} while (0)
|
||||
|
||||
/*
|
||||
* \brief Copy a block of [0,15] bytes efficiently.
|
||||
*
|
||||
* This function is a workaround intended to stop some compilers from
|
||||
* synthesizing a memcpy function call out of the copy of a small number of
|
||||
* bytes that we do in vectoredLoad128.
|
||||
*/
|
||||
static really_inline
|
||||
void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
|
||||
switch (len) {
|
||||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
*dst = *src;
|
||||
break;
|
||||
case 2:
|
||||
unaligned_store_u16(dst, unaligned_load_u16(src));
|
||||
break;
|
||||
case 3:
|
||||
unaligned_store_u16(dst, unaligned_load_u16(src));
|
||||
dst[2] = src[2];
|
||||
break;
|
||||
case 4:
|
||||
unaligned_store_u32(dst, unaligned_load_u32(src));
|
||||
break;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
/* Perform copy with two overlapping 4-byte chunks. */
|
||||
unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
|
||||
unaligned_store_u32(dst, unaligned_load_u32(src));
|
||||
break;
|
||||
case 8:
|
||||
unaligned_store_u64a(dst, unaligned_load_u64a(src));
|
||||
break;
|
||||
default:
|
||||
/* Perform copy with two overlapping 8-byte chunks. */
|
||||
assert(len < 16);
|
||||
unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
|
||||
unaligned_store_u64a(dst, unaligned_load_u64a(src));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: p_mask is an output param that initialises a poison mask.
|
||||
static really_inline
|
||||
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
union {
|
||||
u8 val8[16];
|
||||
m128 val128;
|
||||
} u;
|
||||
u.val128 = zeroes128();
|
||||
|
||||
uintptr_t copy_start;
|
||||
uintptr_t copy_len;
|
||||
|
||||
if (ptr >= lo) {
|
||||
uintptr_t avail = (uintptr_t)(hi - ptr);
|
||||
if (avail >= 16) {
|
||||
*p_mask = load128(p_mask_arr[16] + 16);
|
||||
return loadu128(ptr);
|
||||
}
|
||||
*p_mask = load128(p_mask_arr[avail] + 16);
|
||||
copy_start = 0;
|
||||
copy_len = avail;
|
||||
} else {
|
||||
uintptr_t need = MIN((uintptr_t)(lo - ptr),
|
||||
MIN(len_history, nMasks - 1));
|
||||
uintptr_t start = (uintptr_t)(lo - ptr);
|
||||
uintptr_t i;
|
||||
for (i = start - need; ptr + i < lo; i++) {
|
||||
u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
|
||||
}
|
||||
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
|
||||
*p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
|
||||
copy_start = i;
|
||||
copy_len = end - i;
|
||||
}
|
||||
|
||||
// Runt block from the buffer.
|
||||
copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);
|
||||
|
||||
return u.val128;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
|
||||
CautionReason reason) {
|
||||
u64a confVal = 0;
|
||||
const u8 *buf = a->buf;
|
||||
size_t len = a->len;
|
||||
const u8 *confirm_loc = ptr + byte - 7;
|
||||
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
confVal = lv_u64a(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
|
||||
// stitch together confVal and history
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
confVal |= histBytes;
|
||||
}
|
||||
return confVal;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
||||
const u32 *confBase, CautionReason reason,
|
||||
const struct FDR_Runtime_Args *a, const u8 *ptr,
|
||||
hwlmcb_rv_t *control, u32 *last_match) {
|
||||
do {
|
||||
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
|
||||
u32 byte = bit / bucket + offset;
|
||||
u32 bitRem = bit % bucket;
|
||||
u32 confSplit = *(ptr+byte) & 0x1f;
|
||||
u32 idx = confSplit * bucket + bitRem;
|
||||
u32 cf = confBase[idx];
|
||||
if (!cf) {
|
||||
continue;
|
||||
}
|
||||
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
|
||||
((const u8 *)confBase + cf);
|
||||
if (!(fdrc->groups & *control)) {
|
||||
continue;
|
||||
}
|
||||
u64a confVal = getConfVal(a, ptr, byte, reason);
|
||||
confWithBit(fdrc, a, ptr - a->buf + byte, 0, control,
|
||||
last_match, confVal);
|
||||
} while (unlikely(*conf));
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
||||
const u32 *confBase, CautionReason reason,
|
||||
const struct FDR_Runtime_Args *a, const u8 *ptr,
|
||||
hwlmcb_rv_t *control, u32 *last_match) {
|
||||
do {
|
||||
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
|
||||
u32 byte = bit / bucket + offset;
|
||||
u32 idx = bit % bucket;
|
||||
u32 cf = confBase[idx];
|
||||
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
|
||||
((const u8 *)confBase + cf);
|
||||
if (!(fdrc->groups & *control)) {
|
||||
continue;
|
||||
}
|
||||
u64a confVal = getConfVal(a, ptr, byte, reason);
|
||||
confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
|
||||
confVal);
|
||||
} while (unlikely(*conf));
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
||||
const u32 *confBase, CautionReason reason,
|
||||
const struct FDR_Runtime_Args *a, const u8 *ptr,
|
||||
hwlmcb_rv_t *control, u32 *last_match) {
|
||||
do {
|
||||
u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
|
||||
u32 byte = bit / bucket + offset;
|
||||
u32 idx = bit % bucket;
|
||||
u32 cf = confBase[idx];
|
||||
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
|
||||
((const u8 *)confBase + cf);
|
||||
if (!(fdrc->groups & *control)) {
|
||||
continue;
|
||||
}
|
||||
u64a confVal = getConfVal(a, ptr, byte, reason);
|
||||
confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
|
||||
last_match, confVal);
|
||||
} while (unlikely(*conf));
|
||||
}
|
||||
|
||||
static really_inline
|
||||
const m128 * getMaskBase(const struct Teddy *teddy) {
|
||||
return (const m128 *)((const u8 *)teddy + sizeof(struct Teddy));
|
||||
}
|
||||
|
||||
static really_inline
|
||||
const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) {
|
||||
return (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) +
|
||||
(numMask*32));
|
||||
}
|
||||
|
||||
#endif /* TEDDY_RUNTIME_COMMON_H_ */
|
Loading…
x
Reference in New Issue
Block a user