fdr: Remove python codegen, add safezones
commit 598f0565cf
parent e86688e313
@@ -964,7 +964,7 @@ endif()
 add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>)
 add_dependencies(hs ragel_Parser)
-add_dependencies(hs autogen_compiler autogen_teddy_compiler)
+add_dependencies(hs autogen_teddy_compiler)

 if (NOT BUILD_SHARED_LIBS)
     install(TARGETS hs DESTINATION lib)
@@ -5,8 +5,6 @@ set(AUTOGEN_PY_FILES
     arch.py
     autogen.py
    autogen_utils.py
-    base_autogen.py
-    fdr_autogen.py
     teddy_autogen.py
     )

@@ -22,18 +20,14 @@ endfunction(fdr_autogen)

 #now build the functions
-fdr_autogen(runtime fdr_autogen.c)
-fdr_autogen(compiler fdr_autogen_compiler.cpp)
 fdr_autogen(teddy_runtime teddy_autogen.c)
 fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)

 set(fdr_GENERATED_SRC
-    ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
-    ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp
     ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
     ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
     PARENT_SCOPE)

 set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
@@ -1,6 +1,6 @@
 #!/usr/bin/python

-# Copyright (c) 2015, Intel Corporation
+# Copyright (c) 2015-2016, Intel Corporation
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -27,41 +27,9 @@

 import sys
 from autogen_utils import *
-from fdr_autogen import *
 from teddy_autogen import *
 from arch import *

-# FDR setup
-
-# these are either produced - if the guard succeeds, or #defined to zeroes.
-# either the function or the zero is fine in our array of function pointers
-
-def produce_fdr_runtimes(l):
-    for m in l:
-        m.produce_code()
-
-def produce_fdr_compiles(l):
-    print "void getFdrDescriptions(vector<FDREngineDescription> *out) {"
-    print "    static const FDREngineDef defns[] = {"
-    for m in l:
-        m.produce_compile_call()
-    print "    };"
-    print "    out->clear();"
-    print "    for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
-    print "        out->push_back(FDREngineDescription(defns[i]));"
-    print "    }"
-    print "}"
-
-def build_fdr_matchers():
-    all_matchers = [ ]
-    strides = [ 1, 2, 4 ]
-
-    common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
-    for s in strides:
-        all_matchers += [ M3(stride = s, **common) ]
-
-    return all_matchers
-
 # teddy setup

 def build_teddy_matchers():
@ -124,7 +92,8 @@ def make_fdr_function_pointers(matcher_list):
|
||||
typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
|
||||
static FDRFUNCTYPE funcs[] = {
|
||||
"""
|
||||
all_funcs = ",\n".join([ " %s" % m.get_name() for m in matcher_list ])
|
||||
all_funcs = " fdr_engine_exec,\n"
|
||||
all_funcs += ",\n".join([ " %s" % m.get_name() for m in matcher_list ])
|
||||
print all_funcs
|
||||
print """
|
||||
};
|
||||
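For reference, a minimal sketch of the C table this hunk now makes the generator emit: the hand-written fdr_engine_exec (added in fdr.c below) becomes the first entry, and only the teddy engines are still appended by the codegen. Everything other than fdr_engine_exec here is illustrative, not taken from the patch.

/*
 * typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr,
 *                                     const struct FDR_Runtime_Args *a);
 * static FDRFUNCTYPE funcs[] = {
 *     fdr_engine_exec,   // single C runtime replacing the generated FDR engines
 *     // ... generated teddy engine entry points follow ...
 * };
 */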
@ -138,16 +107,11 @@ def assign_ids(matcher_list, next_id):
|
||||
|
||||
# Main entry point
|
||||
|
||||
m = build_fdr_matchers()
|
||||
next_id = assign_ids(m, 0)
|
||||
tm = build_teddy_matchers()
|
||||
next_id = assign_ids(tm, next_id)
|
||||
if sys.argv[1] == "compiler":
|
||||
produce_fdr_compiles(m)
|
||||
elif sys.argv[1] == "runtime":
|
||||
produce_fdr_runtimes(m)
|
||||
next_id = assign_ids(tm, 1)
|
||||
if sys.argv[1] == "runtime":
|
||||
produce_teddy_headers(tm)
|
||||
make_fdr_function_pointers(m+tm)
|
||||
make_fdr_function_pointers(tm)
|
||||
elif sys.argv[1] == "teddy_runtime":
|
||||
produce_teddy_runtimes(tm)
|
||||
elif sys.argv[1] == "teddy_compiler":
|
||||
|
@@ -1,6 +1,6 @@
 #!/usr/bin/python

-# Copyright (c) 2015, Intel Corporation
+# Copyright (c) 2015-2016, Intel Corporation
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@ -41,9 +41,6 @@ class IntegerType:
|
||||
def size_in_bytes(self):
|
||||
return self.size / 8
|
||||
|
||||
def isSIMDOnIntel(self):
|
||||
return False
|
||||
|
||||
def zero_expression(self):
|
||||
return "0"
|
||||
|
||||
@ -63,15 +60,9 @@ class IntegerType:
|
||||
def lowbit_mask(self, n):
|
||||
return self.constant_to_string(self.lowbits(n))
|
||||
|
||||
def highbit_mask(self, n):
|
||||
return self.constant_to_string(self.highbits(n))
|
||||
|
||||
def lowbit_extract_expr(self, expr_string, n):
|
||||
return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
|
||||
|
||||
def highbit_extract_expr(self, expr_string, n):
|
||||
return "(%s >> %d)" % (expr_string, self.size - n)
|
||||
|
||||
def flip_lowbits_expr(self, expr_string, n):
|
||||
return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
|
||||
|
||||
@ -90,36 +81,10 @@ class IntegerType:
|
||||
else:
|
||||
return "(%s)" % (expr_string)
|
||||
|
||||
# code is:
|
||||
# "normal" (always between buf and len) - the default
|
||||
# "aligned" (means normal + aligned to a natural boundary)
|
||||
# "cautious_forward" (means may go off the end of buf+len)
|
||||
# "cautious_backwards" (means may go off the start of buf)
|
||||
# "cautious_everywhere" (means may go off both)
|
||||
|
||||
def load_expr_data(self, offset = 0, code = "normal",
|
||||
base_string = "ptr", bounds_lo = "buf", bounds_hi = "buf + len"):
|
||||
if code is "normal":
|
||||
return "lv_%s(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
|
||||
elif code is "aligned":
|
||||
if self.size is 8:
|
||||
fail_out("no aligned byte loads")
|
||||
return "lv_%s_a(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
|
||||
elif code is "cautious_forward":
|
||||
return "lv_%s_cf(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
|
||||
elif code is "cautious_backward":
|
||||
return "lv_%s_cb(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
|
||||
elif code is "cautious_everywhere":
|
||||
return "lv_%s_ce(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi)
|
||||
|
||||
|
||||
class SIMDIntegerType(IntegerType):
|
||||
def __init__(self, size):
|
||||
IntegerType.__init__(self, size)
|
||||
|
||||
def isSIMDOnIntel(self):
|
||||
return True
|
||||
|
||||
def zero_expression(self):
|
||||
return "zeroes128()"
|
||||
|
||||
@ -132,9 +97,6 @@ class SIMDIntegerType(IntegerType):
|
||||
tmpExpr = "movq(%s)" % expr_string
|
||||
return tmpType.lowbit_extract_expr(tmpExpr, n)
|
||||
|
||||
def highbit_extract_expr(self, expr_string, n):
|
||||
fail_out("Unimplemented high bit extract on m128")
|
||||
|
||||
def bit_extract_expr(self, expr_string, low, high, flip):
|
||||
fail_out("Unimplemented bit extract on m128")
|
||||
|
||||
@ -146,9 +108,9 @@ class SIMDIntegerType(IntegerType):
|
||||
if n <= -self.size or n >= self.size:
|
||||
return self.zero_expression()
|
||||
elif (n > 0):
|
||||
return "_mm_slli_si128(%s, %s)" % (expr_string, n / 8)
|
||||
return "byteShiftLeft128(%s, %s)" % (expr_string, n / 8)
|
||||
elif (n < 0):
|
||||
return "_mm_srli_si128(%s, %s)" % (expr_string, -n / 8)
|
||||
return "byteShiftRight128(%s, %s)" % (expr_string, -n / 8)
|
||||
else:
|
||||
return "(%s)" % (expr_string)
|
||||
|
||||
@ -156,130 +118,3 @@ class SIMDIntegerType(IntegerType):
|
||||
if n % 8 != 0:
|
||||
fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
|
||||
return self.shift_expr("ones128()", -(128 - n))
|
||||
|
||||
def getRequiredType(bits):
|
||||
if bits == 128:
|
||||
return SIMDIntegerType(bits)
|
||||
for b in [ 8, 16, 32, 64]:
|
||||
if (bits <= b):
|
||||
return IntegerType(b)
|
||||
return None
|
||||
|
||||
class IntegerVariable:
|
||||
def __init__(self, name, type):
|
||||
self.name = name
|
||||
self.type = type
|
||||
|
||||
def gen_initializer_stmt(self, initialization_string = None):
|
||||
if initialization_string:
|
||||
return "%s %s = %s;" % (self.type.get_name(), self.name, initialization_string)
|
||||
else:
|
||||
return "%s %s;" % (self.type.get_name(), self.name)
|
||||
|
||||
|
||||
class Step:
|
||||
def __init__(self, context, offset = 0):
|
||||
self.context = context
|
||||
self.matcher = context.matcher
|
||||
self.offset = offset
|
||||
self.latency = 1
|
||||
self.dependency_list = []
|
||||
self.latest = None
|
||||
self.context.add_step(self)
|
||||
|
||||
# return a string, complete with indentation
|
||||
def emit(self):
|
||||
indent = " " * (self.offset*2 + self.matcher.default_body_indent)
|
||||
s = "\n".join( [ indent + line for line in self.val.split("\n")] )
|
||||
if self.latest:
|
||||
s += " // " + str(self.debug_step) + " L" + str(self.latency) + " LTST:%d" % self.latest
|
||||
if self.dependency_list:
|
||||
s += " Derps: "
|
||||
for (d,l) in self.dependency_list:
|
||||
s += "%d/%d " % (d.debug_step,l)
|
||||
return s
|
||||
|
||||
def add_dependency(self, step, anti_dependency = False, output_dependency = False):
|
||||
if anti_dependency or output_dependency:
|
||||
self.dependency_list += [ (step, 1) ]
|
||||
else:
|
||||
self.dependency_list += [ (step, step.latency) ]
|
||||
|
||||
def nv(self, type, var_name):
|
||||
return self.context.new_var(self, type, var_name)
|
||||
|
||||
def gv(self, var_name, reader = True, writer = False):
|
||||
return self.context.get_var(self, var_name, reader = reader, writer = writer)
|
||||
|
||||
# utility steps, generic
|
||||
|
||||
class LabelStep(Step):
|
||||
def __init__(self, context, offset = 0, label_prefix = "off"):
|
||||
Step.__init__(self, context, offset)
|
||||
self.val = "%s%d: UNUSED;" % (label_prefix, offset)
|
||||
|
||||
class OpenScopeStep(Step):
|
||||
def __init__(self, context, offset = 0):
|
||||
Step.__init__(self, context, offset)
|
||||
self.val = "{"
|
||||
|
||||
class CloseScopeStep(Step):
|
||||
def __init__(self, context, offset = 0):
|
||||
Step.__init__(self, context, offset)
|
||||
self.val = "}"
|
||||
|
||||
|
||||
class CodeGenContext:
|
||||
def __init__(self, matcher):
|
||||
self.vars = {}
|
||||
self.steps = []
|
||||
self.ctr = 0
|
||||
self.matcher = matcher
|
||||
self.var_writer = {} # var to a single writer
|
||||
self.var_readers = {} # var to a list of all the readers that read the last value
|
||||
|
||||
def new_var(self, step, type, var_name):
|
||||
var = IntegerVariable(var_name, type)
|
||||
self.vars[var_name] = var
|
||||
self.var_writer[var_name] = step
|
||||
return var
|
||||
|
||||
def get_var(self, step, var_name, reader = True, writer = False):
|
||||
if reader:
|
||||
writer_step = self.var_writer[var_name]
|
||||
if writer_step:
|
||||
step.add_dependency(writer_step)
|
||||
self.var_readers.setdefault(var_name, []).append(step)
|
||||
if writer and not reader:
|
||||
if self.var_writer[var_name]:
|
||||
step.add_dependency(self.var_writer[var_name], output_dependency = True)
|
||||
if writer:
|
||||
if self.var_readers.has_key(var_name):
|
||||
for reader in [ r for r in self.var_readers[var_name] if r is not step ]:
|
||||
step.add_dependency(reader, anti_dependency = True)
|
||||
self.var_readers[var_name] = []
|
||||
self.var_writer[var_name] = step
|
||||
return self.vars[var_name]
|
||||
|
||||
def add_step(self, step):
|
||||
self.steps += [ step ]
|
||||
step.debug_step = self.ctr
|
||||
self.ctr += 1
|
||||
|
||||
def dontschedule(self, finals):
|
||||
return "\n".join( [ s.emit() for s in self.steps ] )
|
||||
|
||||
def schedule(self, finals):
|
||||
for f in finals:
|
||||
f.latest = f.latency
|
||||
worklist = finals
|
||||
while worklist:
|
||||
current = worklist[0]
|
||||
worklist = worklist[1:]
|
||||
for (dep, lat) in current.dependency_list:
|
||||
if dep.latest is None or dep.latest < (current.latest + dep.latency):
|
||||
dep.latest = current.latest + lat
|
||||
if dep not in worklist:
|
||||
worklist += [ dep ]
|
||||
self.steps.sort(reverse = True, key = lambda s : s.latest)
|
||||
return "\n".join( [ s.emit() for s in self.steps ] )
|
||||
|
@ -1,167 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (c) 2015, Intel Corporation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import sys
|
||||
from autogen_utils import *
|
||||
from base_autogen import *
|
||||
from string import Template
|
||||
|
||||
class MatcherBase:
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def get_name(self):
|
||||
return "fdr_exec_%03d" % self.id
|
||||
|
||||
def produce_header(self, visible, header_only = False):
|
||||
s = ""
|
||||
if not visible:
|
||||
s += "static never_inline"
|
||||
s += """
|
||||
hwlm_error_t %s(UNUSED const struct FDR *fdr,
|
||||
UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
|
||||
if header_only:
|
||||
s += ";"
|
||||
else:
|
||||
s += "{"
|
||||
s += "\n"
|
||||
return s
|
||||
|
||||
def produce_guard(self):
|
||||
print self.arch.get_guard()
|
||||
|
||||
def produce_zero_alternative(self):
|
||||
print """
|
||||
#else
|
||||
#define %s 0
|
||||
#endif
|
||||
""" % self.get_name()
|
||||
|
||||
# trivial function for documentation/modularity
|
||||
def close_guard(self):
|
||||
print "#endif"
|
||||
|
||||
def produce_common_declarations(self):
|
||||
return """
|
||||
const u8 * buf = a->buf;
|
||||
const size_t len = a->len;
|
||||
const u8 * ptr = buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t * control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 * tryFloodDetect = a->firstFloodDetect;
|
||||
UNUSED u32 bit, bitRem, confSplit, idx;
|
||||
u32 byte, cf;
|
||||
const struct FDRConfirm *fdrc;
|
||||
u32 last_match = (u32)-1;
|
||||
"""
|
||||
|
||||
def produce_continue_check(self):
|
||||
return """if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
"""
|
||||
def produce_flood_check(self):
|
||||
return """
|
||||
if (P0(ptr > tryFloodDetect)) {
|
||||
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
|
||||
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
def produce_footer(self):
|
||||
return """
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
"""
|
||||
|
||||
def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
|
||||
if cautious:
|
||||
caution_string = "VECTORING"
|
||||
else:
|
||||
caution_string = "NOT_CAUTIOUS"
|
||||
conf_split_mask = IntegerType(32).constant_to_string(
|
||||
self.conf_top_level_split - 1)
|
||||
if enable_confirmless:
|
||||
quick_check_string = """
|
||||
if (!fdrc->mult) {
|
||||
u32 id = fdrc->nBitsOrSoleID;
|
||||
if ((last_match == id) && (fdrc->flags & NoRepeat))
|
||||
continue;
|
||||
last_match = id;
|
||||
controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
|
||||
continue;
|
||||
} """
|
||||
else:
|
||||
quick_check_string = ""
|
||||
if do_bailout:
|
||||
bailout_string = """
|
||||
if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
|
||||
else:
|
||||
bailout_string = ""
|
||||
|
||||
return Template("""
|
||||
if (P0(!!$CONFVAR)) {
|
||||
do {
|
||||
bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
|
||||
byte = bit / $NUM_BUCKETS + $OFFSET;
|
||||
bitRem = bit % $NUM_BUCKETS;
|
||||
$BAILOUT_STRING
|
||||
confSplit = *(ptr+byte) & $SPLIT_MASK;
|
||||
idx = confSplit * $NUM_BUCKETS + bitRem;
|
||||
cf = confBase[idx];
|
||||
if (!cf)
|
||||
continue;
|
||||
fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
|
||||
if (!(fdrc->groups & *control))
|
||||
continue;
|
||||
$QUICK_CHECK_STRING
|
||||
confWithBit(fdrc, a, ptr - buf + byte, $CAUTION_STRING, $CONF_PULL_BACK, control, &last_match);
|
||||
} while(P0(!!$CONFVAR));
|
||||
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}""").substitute(CONFVAR = conf_var_name,
|
||||
CONFVAR_SIZE = conf_var_size,
|
||||
NUM_BUCKETS = self.num_buckets,
|
||||
OFFSET = offset,
|
||||
SPLIT_MASK = conf_split_mask,
|
||||
QUICK_CHECK_STRING = quick_check_string,
|
||||
BAILOUT_STRING = bailout_string,
|
||||
CAUTION_STRING = caution_string,
|
||||
CONF_PULL_BACK = self.conf_pull_back)
|
||||
|
||||
|
||||
def indent(block, depth):
|
||||
return "\n".join([ (" " * (4*depth)) + line for line in block.splitlines() ] )
|
src/fdr/fdr.c (760 changed lines)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -26,28 +26,752 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#include "util/simd_utils.h"
-
-#define P0(cnd) unlikely(cnd)
-
 #include "fdr.h"
-#include "fdr_internal.h"
-#include "teddy_internal.h"
-
-#include "flood_runtime.h"
-
 #include "fdr_confirm.h"
 #include "fdr_confirm_runtime.h"
-#include "fdr_streaming_runtime.h"
+#include "fdr_internal.h"
+#include "fdr_loadval.h"
+#include "fdr_streaming_runtime.h"
+#include "flood_runtime.h"
+#include "teddy_internal.h"
+#include "util/simd_utils.h"
+#include "util/simd_utils_ssse3.h"

/** \brief number of bytes processed in each iteration */
|
||||
#define ITER_BYTES 16
|
||||
|
||||
/** \brief total zone buffer size */
|
||||
#define ZONE_TOTAL_SIZE 64
|
||||
|
||||
/** \brief maximum number of allowed zones */
|
||||
#define ZONE_MAX 3
|
||||
|
||||
/** \brief zone information.
|
||||
*
|
||||
* Zone represents a region of data to scan in FDR.
|
||||
*
|
||||
* The incoming buffer is split into multiple zones to ensure two properties:
* 1: that we can read 8 bytes behind to generate a hash safely
* 2: that we can read the byte after the current byte (domain > 8)
|
||||
*/
|
||||
struct zone {
|
||||
/** \brief copied buffer, used only when it is a boundary zone. */
|
||||
u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
|
||||
|
||||
/** \brief shift amount for fdr state to avoid unwanted match. */
|
||||
u8 shift;
|
||||
|
||||
/** \brief if boundary zone, start points into the zone buffer after the
|
||||
* pre-padding. Otherwise, points to the main buffer, appropriately. */
|
||||
const u8 *start;
|
||||
|
||||
/** \brief if boundary zone, end points to the end of zone. Otherwise,
|
||||
* pointer to the main buffer, appropriately. */
|
||||
const u8 *end;
|
||||
|
||||
/** \brief the amount to adjust to go from a pointer in the zones region
|
||||
* (between start and end) to a pointer in the original data buffer. */
|
||||
ptrdiff_t zone_pointer_adjust;
|
||||
|
||||
/** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
* otherwise the end of the zone buf. floodPtr always points inside the same
* buffer as the start pointer. */
|
||||
const u8 *floodPtr;
|
||||
};
|
||||
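A brief illustrative note (not part of the patch) on how zone_pointer_adjust is used: a match candidate found while scanning a boundary zone's copied buffer is translated back into the caller's buffer before confirm/callback, as do_confirm_fdr() does further below.

/*
 *     const u8 *ptr_in_zone = ...;   // somewhere between z->start and z->end
 *     const u8 *ptr_in_main =
 *         (const u8 *)((uintptr_t)ptr_in_zone + z->zone_pointer_adjust);
 *
 * For the main (non-boundary) zone the adjustment is 0, since start/end
 * already point into the caller's buffer.
 */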
|
||||
static
|
||||
const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
|
||||
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
|
||||
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
|
||||
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
|
||||
};
|
||||
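Illustrative note (not part of the patch): a set bit in the running FDR state means "no match may be reported at this position", so OR-ing a row of this table into the first shift bytes of the state suppresses reports over the bytes a boundary zone rescans. fdr_engine_exec() below does exactly this per zone:

/*
 *     state = variable_byte_shift_m128(state, z->shift);
 *     state = or128(state, load128(zone_or_mask[z->shift]));
 */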
|
||||
/* generates an initial state mask based on the last byte-ish of history rather
|
||||
* than being all accepting. If there is no history to consider, the state is
|
||||
* generated based on the minimum length of each bucket in order to prevent
|
||||
* confirms.
|
||||
*/
|
||||
static really_inline
|
||||
m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft,
|
||||
const struct zone *z) {
|
||||
m128 s;
|
||||
if (len_history) {
|
||||
/* +1: the zones ensure that we can read the byte at z->end */
|
||||
u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
|
||||
tmp &= fdr->domainMask;
|
||||
s = *((const m128 *)ft + tmp);
|
||||
s = shiftRight8Bits(s);
|
||||
} else {
|
||||
s = fdr->start;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
|
||||
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
|
||||
u64a *conf8, m128 *s) {
|
||||
/* +1: the zones ensure that we can read the byte at z->end */
|
||||
|
||||
u64a current_data_0;
|
||||
u64a current_data_8;
|
||||
|
||||
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
|
||||
u64a v7 = (lv_u16(itPtr + 7, start_ptr, end_ptr + 1) << 1) &
|
||||
domain_mask_adjusted;
|
||||
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
|
||||
u64a v1 = (current_data_0 >> 7) & domain_mask_adjusted;
|
||||
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
|
||||
u64a v3 = (current_data_0 >> 23) & domain_mask_adjusted;
|
||||
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
|
||||
u64a v5 = (current_data_0 >> 39) & domain_mask_adjusted;
|
||||
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
|
||||
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
|
||||
u64a v15 = (lv_u16(itPtr + 15, start_ptr, end_ptr + 1) << 1) &
|
||||
domain_mask_adjusted;
|
||||
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
|
||||
u64a v9 = (current_data_8 >> 7) & domain_mask_adjusted;
|
||||
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
|
||||
u64a v11 = (current_data_8 >> 23) & domain_mask_adjusted;
|
||||
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
|
||||
u64a v13 = (current_data_8 >> 39) & domain_mask_adjusted;
|
||||
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
|
||||
|
||||
m128 st0 = *(const m128 *)(ft + v0*8);
|
||||
m128 st1 = *(const m128 *)(ft + v1*8);
|
||||
m128 st2 = *(const m128 *)(ft + v2*8);
|
||||
m128 st3 = *(const m128 *)(ft + v3*8);
|
||||
m128 st4 = *(const m128 *)(ft + v4*8);
|
||||
m128 st5 = *(const m128 *)(ft + v5*8);
|
||||
m128 st6 = *(const m128 *)(ft + v6*8);
|
||||
m128 st7 = *(const m128 *)(ft + v7*8);
|
||||
m128 st8 = *(const m128 *)(ft + v8*8);
|
||||
m128 st9 = *(const m128 *)(ft + v9*8);
|
||||
m128 st10 = *(const m128 *)(ft + v10*8);
|
||||
m128 st11 = *(const m128 *)(ft + v11*8);
|
||||
m128 st12 = *(const m128 *)(ft + v12*8);
|
||||
m128 st13 = *(const m128 *)(ft + v13*8);
|
||||
m128 st14 = *(const m128 *)(ft + v14*8);
|
||||
m128 st15 = *(const m128 *)(ft + v15*8);
|
||||
|
||||
st1 = byteShiftLeft128(st1, 1);
|
||||
st2 = byteShiftLeft128(st2, 2);
|
||||
st3 = byteShiftLeft128(st3, 3);
|
||||
st4 = byteShiftLeft128(st4, 4);
|
||||
st5 = byteShiftLeft128(st5, 5);
|
||||
st6 = byteShiftLeft128(st6, 6);
|
||||
st7 = byteShiftLeft128(st7, 7);
|
||||
st9 = byteShiftLeft128(st9, 1);
|
||||
st10 = byteShiftLeft128(st10, 2);
|
||||
st11 = byteShiftLeft128(st11, 3);
|
||||
st12 = byteShiftLeft128(st12, 4);
|
||||
st13 = byteShiftLeft128(st13, 5);
|
||||
st14 = byteShiftLeft128(st14, 6);
|
||||
st15 = byteShiftLeft128(st15, 7);
|
||||
|
||||
*s = or128(*s, st0);
|
||||
*s = or128(*s, st1);
|
||||
*s = or128(*s, st2);
|
||||
*s = or128(*s, st3);
|
||||
*s = or128(*s, st4);
|
||||
*s = or128(*s, st5);
|
||||
*s = or128(*s, st6);
|
||||
*s = or128(*s, st7);
|
||||
*conf0 = movq(*s);
|
||||
*s = byteShiftRight128(*s, 8);
|
||||
*conf0 ^= ~0ULL;
|
||||
|
||||
*s = or128(*s, st8);
|
||||
*s = or128(*s, st9);
|
||||
*s = or128(*s, st10);
|
||||
*s = or128(*s, st11);
|
||||
*s = or128(*s, st12);
|
||||
*s = or128(*s, st13);
|
||||
*s = or128(*s, st14);
|
||||
*s = or128(*s, st15);
|
||||
*conf8 = movq(*s);
|
||||
*s = byteShiftRight128(*s, 8);
|
||||
*conf8 ^= ~0ULL;
|
||||
}
|
||||
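A rough scalar outline (illustrative, not part of the patch) of what get_conf_stride_1 above and the stride-2/4 variants below compute for one 16-byte block:

/*
 *     for (i = 0; i < 8; i += N)
 *         s |= reach[hash(itPtr + i)] << i;      // byte-shift into position i
 *     conf0 = ~low64(s);                         // set bits = candidate matches
 *     s >>= 8 bytes;                             // drop the consumed half
 *     for (i = 8; i < 16; i += N)
 *         s |= reach[hash(itPtr + i)] << (i - 8);
 *     conf8 = ~low64(s);
 *     s >>= 8 bytes;
 *
 * Here hash(p) is the domainMask-masked value of the two bytes starting at p,
 * and reach[] entries are 16-byte per-bucket "cannot match" masks (indexed as
 * ft + v*8, where the "<< 1" folded into v supplies the other factor of two
 * for 16-byte m128 entries). Stride 2 and 4 simply skip intermediate positions.
 */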
|
||||
static really_inline
|
||||
void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
|
||||
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
|
||||
u64a *conf8, m128 *s) {
|
||||
u64a current_data_0;
|
||||
u64a current_data_8;
|
||||
|
||||
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
|
||||
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
|
||||
u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted;
|
||||
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
|
||||
u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted;
|
||||
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
|
||||
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
|
||||
u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted;
|
||||
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
|
||||
u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted;
|
||||
|
||||
m128 st0 = *(const m128 *)(ft + v0*8);
|
||||
m128 st2 = *(const m128 *)(ft + v2*8);
|
||||
m128 st4 = *(const m128 *)(ft + v4*8);
|
||||
m128 st6 = *(const m128 *)(ft + v6*8);
|
||||
m128 st8 = *(const m128 *)(ft + v8*8);
|
||||
m128 st10 = *(const m128 *)(ft + v10*8);
|
||||
m128 st12 = *(const m128 *)(ft + v12*8);
|
||||
m128 st14 = *(const m128 *)(ft + v14*8);
|
||||
|
||||
st2 = byteShiftLeft128(st2, 2);
|
||||
st4 = byteShiftLeft128(st4, 4);
|
||||
st6 = byteShiftLeft128(st6, 6);
|
||||
st10 = byteShiftLeft128(st10, 2);
|
||||
st12 = byteShiftLeft128(st12, 4);
|
||||
st14 = byteShiftLeft128(st14, 6);
|
||||
|
||||
*s = or128(*s, st0);
|
||||
*s = or128(*s, st2);
|
||||
*s = or128(*s, st4);
|
||||
*s = or128(*s, st6);
|
||||
*conf0 = movq(*s);
|
||||
*s = byteShiftRight128(*s, 8);
|
||||
*conf0 ^= ~0ULL;
|
||||
|
||||
*s = or128(*s, st8);
|
||||
*s = or128(*s, st10);
|
||||
*s = or128(*s, st12);
|
||||
*s = or128(*s, st14);
|
||||
*conf8 = movq(*s);
|
||||
*s = byteShiftRight128(*s, 8);
|
||||
*conf8 ^= ~0ULL;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr,
|
||||
u64a domain_mask_adjusted, const u8 *ft, u64a *conf0,
|
||||
u64a *conf8, m128 *s) {
|
||||
u64a current_data_0;
|
||||
u64a current_data_8;
|
||||
|
||||
current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr);
|
||||
u64a v0 = (current_data_0 << 1) & domain_mask_adjusted;
|
||||
u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted;
|
||||
current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr);
|
||||
u64a v8 = (current_data_8 << 1) & domain_mask_adjusted;
|
||||
u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted;
|
||||
|
||||
m128 st0 = *(const m128 *)(ft + v0*8);
|
||||
m128 st4 = *(const m128 *)(ft + v4*8);
|
||||
m128 st8 = *(const m128 *)(ft + v8*8);
|
||||
m128 st12 = *(const m128 *)(ft + v12*8);
|
||||
|
||||
st4 = byteShiftLeft128(st4, 4);
|
||||
st12 = byteShiftLeft128(st12, 4);
|
||||
|
||||
*s = or128(*s, st0);
|
||||
*s = or128(*s, st4);
|
||||
*conf0 = movq(*s);
|
||||
*s = byteShiftRight128(*s, 8);
|
||||
*conf0 ^= ~0ULL;
|
||||
|
||||
*s = or128(*s, st8);
|
||||
*s = or128(*s, st12);
|
||||
*conf8 = movq(*s);
|
||||
*s = byteShiftRight128(*s, 8);
|
||||
*conf8 ^= ~0ULL;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal,
|
||||
const u32 *confBase, const struct FDR_Runtime_Args *a,
|
||||
const u8 *ptr, hwlmcb_rv_t *control, u32 *last_match_id,
|
||||
struct zone *z) {
|
||||
const u8 bucket = 8;
|
||||
const u8 pullback = 1;
|
||||
|
||||
if (likely(!*conf)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* ptr is currently referring to a location in the zone's buffer, we also
|
||||
* need a pointer in the original, main buffer for the final string compare.
|
||||
*/
|
||||
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
|
||||
|
||||
const u8 *confLoc = ptr;
|
||||
|
||||
do {
|
||||
u32 bit = findAndClearLSB_64(conf);
|
||||
u32 byte = bit / bucket + offset;
|
||||
u32 bitRem = bit % bucket;
|
||||
u32 confSplit = *(ptr + byte);
|
||||
u32 idx = confSplit * bucket + bitRem;
|
||||
u32 cf = confBase[idx];
|
||||
if (!cf) {
|
||||
continue;
|
||||
}
|
||||
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
|
||||
((const u8 *)confBase + cf);
|
||||
if (!(fdrc->groups & *control)) {
|
||||
continue;
|
||||
}
|
||||
if (!fdrc->mult) {
|
||||
u32 id = fdrc->nBitsOrSoleID;
|
||||
if ((*last_match_id == id) && (fdrc->flags & NoRepeat)) {
|
||||
continue;
|
||||
}
|
||||
*last_match_id = id;
|
||||
*controlVal = a->cb(ptr_main + byte - a->buf,
|
||||
ptr_main + byte - a->buf, id, a->ctxt);
|
||||
continue;
|
||||
}
|
||||
u64a confVal = *(const u64a *)(confLoc + byte - sizeof(u64a));
|
||||
confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback,
|
||||
control, last_match_id, confVal);
|
||||
} while (unlikely(!!*conf));
|
||||
}
|
||||
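Illustrative decode (not part of the patch) of the conf words consumed above: each set bit identifies a (byte position, bucket) pair within the half-block covered by that conf word:

/*
 *     bit    = findAndClearLSB_64(&conf);
 *     byte   = bit / 8 + offset;   // offset is 0 or 8 within the 16-byte block
 *     bucket = bit % 8;            // which of the 8 FDR buckets fired
 */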
|
||||
static really_inline
|
||||
void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
|
||||
#ifdef DEBUG
|
||||
DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
|
||||
DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
|
||||
z->start, z->end, z->shift);
|
||||
DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n",
|
||||
z->zone_pointer_adjust, z->floodPtr);
|
||||
DEBUG_PRINTF("zone buf:");
|
||||
for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) {
|
||||
if (i % 8 == 0) {
|
||||
printf("_");
|
||||
}
|
||||
if (z->buf[i]) {
|
||||
printf("%02x", z->buf[i]);
|
||||
} else {
|
||||
printf("..");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Updates attributes for non-boundary region zone.
|
||||
*/
|
||||
static really_inline
|
||||
void createMainZone(const u8 *flood, const u8 *begin, const u8 *end,
|
||||
struct zone *z) {
|
||||
z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */
|
||||
z->start = begin;
|
||||
z->end = end;
|
||||
z->floodPtr = flood;
|
||||
z->shift = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Create zone for short cases (<= ITER_BYTES).
|
||||
*
|
||||
* For this case we need to copy everything into the zone's internal buffer.
|
||||
*
|
||||
* We need to ensure that we run over real data if it exists (in history or
|
||||
* before zone begin). We also need to ensure 8 bytes before any data being
|
||||
* matched can be read (to perform a conf hash).
|
||||
*
|
||||
* We also need to ensure that the data at z->end can be read.
|
||||
*
|
||||
* Hence, the zone consists of:
|
||||
* 16 bytes of history,
|
||||
* 1 - 24 bytes of data from the buffer (ending at end),
|
||||
* 1 byte of final padding
|
||||
*/
|
||||
static really_inline
|
||||
void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
|
||||
const u8 *end, struct zone *z) {
|
||||
/* the floodPtr for BOUNDARY zones is set to the end of the zone buf to
* avoid flood checks inside boundary zones. */
|
||||
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
|
||||
|
||||
ptrdiff_t z_len = end - begin;
|
||||
assert(z_len > 0);
|
||||
assert(z_len <= ITER_BYTES);
|
||||
|
||||
z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */
|
||||
|
||||
static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */
|
||||
|
||||
/* we are guaranteed to always have 16 initialised bytes at the end of
|
||||
* the history buffer (they may be garbage coming from the stream state
|
||||
* preceding hbuf, but bytes that don't correspond to actual history
|
||||
* shouldn't affect computations). */
|
||||
*(m128 *)z->buf = loadu128(hend - sizeof(m128));
|
||||
|
||||
/* The amount of data we have to copy from main buffer. */
|
||||
size_t copy_len = MIN((size_t)(end - buf),
|
||||
ITER_BYTES + sizeof(CONF_TYPE));
|
||||
|
||||
u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET;
|
||||
switch (copy_len) {
|
||||
case 1:
|
||||
*zone_data = *(end - 1);
|
||||
break;
|
||||
case 2:
|
||||
*(u16 *)zone_data = unaligned_load_u16(end - 2);
|
||||
break;
|
||||
case 3:
|
||||
*(u16 *)zone_data = unaligned_load_u16(end - 3);
|
||||
*(zone_data + 2) = *(end - 1);
|
||||
break;
|
||||
case 4:
|
||||
*(u32 *)zone_data = unaligned_load_u32(end - 4);
|
||||
break;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
/* perform copy with 2 overlapping 4-byte chunks from buf. */
|
||||
*(u32 *)zone_data = unaligned_load_u32(end - copy_len);
|
||||
unaligned_store_u32(zone_data + copy_len - sizeof(u32),
|
||||
unaligned_load_u32(end - sizeof(u32)));
|
||||
break;
|
||||
case 8:
|
||||
*(u64a *)zone_data = unaligned_load_u64a(end - 8);
|
||||
break;
|
||||
case 9:
|
||||
case 10:
|
||||
case 11:
|
||||
case 12:
|
||||
case 13:
|
||||
case 14:
|
||||
case 15:
|
||||
/* perform copy with 2 overlapping 8-byte chunks from buf. */
|
||||
*(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
|
||||
unaligned_store_u64a(zone_data + copy_len - sizeof(u64a),
|
||||
unaligned_load_u64a(end - sizeof(u64a)));
|
||||
break;
|
||||
case 16:
|
||||
/* copy 16-bytes from buf. */
|
||||
*(m128 *)zone_data = loadu128(end - 16);
|
||||
break;
|
||||
default:
|
||||
assert(copy_len <= sizeof(m128) + sizeof(u64a));
|
||||
|
||||
/* perform copy with (potentially overlapping) 8-byte and 16-byte chunks.
|
||||
*/
|
||||
*(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
|
||||
storeu128(zone_data + copy_len - sizeof(m128),
|
||||
loadu128(end - sizeof(m128)));
|
||||
break;
|
||||
}
|
||||
|
||||
/* set the start and end location of the zone buf
|
||||
* to be scanned */
|
||||
u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len;
|
||||
assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES);
|
||||
|
||||
/* copy the post-padding byte; this is required for domain > 8 due to
|
||||
* overhang */
|
||||
*z_end = 0;
|
||||
|
||||
z->end = z_end;
|
||||
z->start = z_end - ITER_BYTES;
|
||||
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
|
||||
assert(z->start + z->shift == z_end - z_len);
|
||||
}
|
||||
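Worked example (illustrative) of the overlapping-copy cases above: for copy_len == 6, the two unaligned 4-byte operations cover bytes [0..3] and [2..5] of the region ending at end, so all six bytes are copied without a byte loop:

/*
 *     *(u32 *)zone_data = unaligned_load_u32(end - 6);                  // bytes 0..3
 *     unaligned_store_u32(zone_data + 2, unaligned_load_u32(end - 4));  // bytes 2..5
 */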
|
||||
/**
|
||||
* \brief Create a zone for the start region.
|
||||
*
|
||||
* This function requires that there is > ITER_BYTES of data in the buffer to
|
||||
* scan. The start zone itself is always responsible for scanning exactly
|
||||
* ITER_BYTES of data - there are no warmup/junk bytes scanned.
|
||||
*
|
||||
* This zone ensures that the byte at z->end can be read and corresponds to
|
||||
* the next byte of data.
|
||||
*
|
||||
* 8 bytes of history data are provided before z->start to allow proper hash
* generation in streaming mode. If buf != begin, up to 8 bytes of data
* prior to begin are also provided.
|
||||
*
|
||||
* Although we are not interested in bare literals which start before begin
|
||||
* if buf != begin, lookarounds associated with the literal may require
|
||||
* the data prior to begin for hash purposes.
|
||||
*/
|
||||
static really_inline
|
||||
void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
|
||||
struct zone *z) {
|
||||
assert(ITER_BYTES == sizeof(m128));
|
||||
assert(sizeof(CONF_TYPE) == 8);
|
||||
static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE);
|
||||
|
||||
const u8 *end = begin + ITER_BYTES;
|
||||
|
||||
/* set floodPtr to the end of zone buf to avoid checks in start zone */
|
||||
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
|
||||
|
||||
z->shift = 0; /* we are processing ITER_BYTES of real data */
|
||||
|
||||
/* we are guaranteed to always have 16 initialised bytes at the end of the
|
||||
* history buffer (they may be garbage coming from the stream state
|
||||
* preceding hbuf, but bytes that don't correspond to actual history
|
||||
* shouldn't affect computations). However, for start zones, history is only
|
||||
* required for conf hash purposes so we only need 8 bytes */
|
||||
unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a)));
|
||||
|
||||
/* The amount of data we have to copy from main buffer. */
|
||||
size_t copy_len = MIN((size_t)(end - buf),
|
||||
ITER_BYTES + sizeof(CONF_TYPE));
|
||||
assert(copy_len >= 16);
|
||||
|
||||
/* copy the post-padding byte; this is required for domain > 8 due to
* overhang. The start zone requires that there is data after the zone, so
* it is safe to dereference end */
|
||||
z->buf[ZONE_START_BEGIN + copy_len] = *end;
|
||||
|
||||
/* set the start and end location of the zone buf to be scanned */
|
||||
u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len;
|
||||
z->end = z_end;
|
||||
z->start = z_end - ITER_BYTES;
|
||||
|
||||
/* copy the first 8 bytes of the valid region */
|
||||
unaligned_store_u64a(z->buf + ZONE_START_BEGIN,
|
||||
unaligned_load_u64a(end - copy_len));
|
||||
|
||||
/* copy the last 16 bytes, may overlap with the previous 8 byte write */
|
||||
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
|
||||
|
||||
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
|
||||
}
|
||||
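Resulting start-zone buffer layout (an illustrative summary of the copies above, with ZONE_START_BEGIN == 8 and 16 <= copy_len <= 24):

/*
 *     z->buf[0 .. 7]              last 8 bytes of history (conf hash only)
 *     z->buf[8 .. 8 + copy_len)   copy_len bytes ending at `end`; may reach up
 *                                 to 8 bytes before `begin` when buf != begin
 *     z->buf[8 + copy_len]        post-padding byte copied from *end
 */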
|
||||
/**
|
||||
* \brief Create a zone for the end region.
|
||||
*
|
||||
* This function requires that there is > ITER_BYTES of data in the buffer to
|
||||
* scan. The end zone, however, is only responsible for scanning the <=
* ITER_BYTES rump of data. The end zone is required to handle a full ITER_BYTES
|
||||
* iteration as the main loop cannot handle the last byte of the buffer.
|
||||
*
|
||||
* This zone ensures that the byte at z->end can be read by filling it with a
|
||||
* padding character.
|
||||
*
|
||||
* Up to 8 bytes of data prior to begin are also provided for the purposes of
* generating hashes. History is not copied, as all locations which require
* history for generating a hash are the responsibility of the start zone.
|
||||
*/
|
||||
static really_inline
|
||||
void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
|
||||
struct zone *z) {
|
||||
/* the floodPtr for BOUNDARY zones is set to the end of the zone buf to
* avoid flood checks inside boundary zones. */
|
||||
z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
|
||||
|
||||
ptrdiff_t z_len = end - begin;
|
||||
assert(z_len > 0);
|
||||
assert(z_len <= ITER_BYTES);
|
||||
|
||||
z->shift = ITER_BYTES - z_len;
|
||||
|
||||
/* The amount of data we have to copy from main buffer. */
|
||||
size_t copy_len = MIN((size_t)(end - buf),
|
||||
ITER_BYTES + sizeof(CONF_TYPE));
|
||||
assert(copy_len >= 16);
|
||||
|
||||
/* copy the post-padding byte; this is required for domain > 8 due to
|
||||
* overhang */
|
||||
z->buf[copy_len] = 0;
|
||||
|
||||
/* set the start and end location of the zone buf
|
||||
* to be scanned */
|
||||
u8 *z_end = z->buf + copy_len;
|
||||
z->end = z_end;
|
||||
z->start = z_end - ITER_BYTES;
|
||||
assert(z->start + z->shift == z_end - z_len);
|
||||
|
||||
/* copy the first 8 bytes of the valid region */
|
||||
unaligned_store_u64a(z->buf, unaligned_load_u64a(end - copy_len));
|
||||
|
||||
/* copy the last 16 bytes, may overlap with the previous 8 byte write */
|
||||
storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
|
||||
|
||||
z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Prepare zones.
|
||||
*
|
||||
* This function prepares zones with the actual buffer and some padded bytes.
* The actual ITER_BYTES bytes in each zone are preceded by main buf and/or
* history buf and followed by padding bytes, possibly from main buf,
* if available.
|
||||
*/
|
||||
static really_inline
|
||||
size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
|
||||
size_t start, const u8 *flood, struct zone *zoneArr) {
|
||||
const u8 *ptr = buf + start;
|
||||
size_t remaining = len - start;
|
||||
|
||||
if (remaining <= ITER_BYTES) {
|
||||
/* enough bytes to make only one zone */
|
||||
createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* enough bytes to make more than one zone */
|
||||
|
||||
size_t numZone = 0;
|
||||
createStartZone(buf, hend, ptr, &zoneArr[numZone++]);
|
||||
ptr += ITER_BYTES;
|
||||
|
||||
assert(ptr < buf + len);
|
||||
|
||||
/* find maximum buffer location that the main zone can scan
|
||||
* - must be a multiple of ITER_BYTES, and
|
||||
* - cannot contain the last byte (due to overhang)
|
||||
*/
|
||||
const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 1, ITER_BYTES);
|
||||
assert(main_end >= ptr);
|
||||
|
||||
/* create a zone if multiple of ITER_BYTES are found */
|
||||
if (main_end != ptr) {
|
||||
createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
|
||||
ptr = main_end;
|
||||
}
|
||||
/* create a zone with rest of the data from the main buffer */
|
||||
createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]);
|
||||
return numZone;
|
||||
}
|
||||
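Worked example (illustrative): scanning a 100-byte buffer from start == 0 with ITER_BYTES == 16 yields three zones:

/*
 *     start zone : bytes [0, 16)    copied into z->buf with 8 bytes of history
 *     main zone  : bytes [16, 96)   scanned in place; 96 == ROUNDDOWN_N(99, 16)
 *     end zone   : bytes [96, 100)  copied into z->buf, shift = 16 - 4 = 12
 *
 * A buffer with <= ITER_BYTES remaining bytes gets a single short zone instead.
 */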
|
||||
#define INVALID_MATCH_ID (~0U)
|
||||
|
||||
#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
|
||||
do { \
|
||||
const u8 *tryFloodDetect = zz->floodPtr; \
|
||||
const u8 *start_ptr = zz->start; \
|
||||
const u8 *end_ptr = zz->end; \
|
||||
\
|
||||
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
|
||||
itPtr += ITER_BYTES) { \
|
||||
if (unlikely(itPtr > tryFloodDetect)) { \
|
||||
tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
|
||||
&floodBackoff, &controlVal, \
|
||||
ITER_BYTES); \
|
||||
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
|
||||
return HWLM_TERMINATED; \
|
||||
} \
|
||||
} \
|
||||
__builtin_prefetch(itPtr + (ITER_BYTES*4)); \
|
||||
u64a conf0; \
|
||||
u64a conf8; \
|
||||
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \
|
||||
ft, &conf0, &conf8, &s); \
|
||||
do_confirm_fdr(&conf0, 0, &controlVal, confBase, a, itPtr, \
|
||||
control, &last_match_id, zz); \
|
||||
do_confirm_fdr(&conf8, 8, &controlVal, confBase, a, itPtr, \
|
||||
control, &last_match_id, zz); \
|
||||
if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \
|
||||
return HWLM_TERMINATED; \
|
||||
} \
|
||||
} /* end for loop */ \
|
||||
} while (0) \
|
||||
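Design note (an inference, not stated in the patch): FDR_MAIN_LOOP takes the per-stride conf function as a macro argument so that each case of the stride switch in fdr_engine_exec below compiles to a loop with that function fully inlined. A sketch of one use, with the flood-detection and termination checks elided:

/*
 *     FDR_MAIN_LOOP(z, state, get_conf_stride_1);
 *     // roughly expands to:
 *     //   for (itPtr = z->start; itPtr + ITER_BYTES <= z->end; itPtr += ITER_BYTES) {
 *     //       get_conf_stride_1(itPtr, z->start, z->end, domain_mask_adjusted,
 *     //                         ft, &conf0, &conf8, &state);
 *     //       do_confirm_fdr(&conf0, 0, ...);
 *     //       do_confirm_fdr(&conf8, 8, ...);
 *     //   }
 */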
|
||||
static never_inline
|
||||
hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a) {
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t *control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
u32 last_match_id = INVALID_MATCH_ID;
|
||||
u64a domain_mask_adjusted = fdr->domainMask << 1;
|
||||
u8 stride = fdr->stride;
|
||||
const u8 *ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
|
||||
const u32 *confBase = (const u32 *)(ft + fdr->tabSize);
|
||||
struct zone zones[ZONE_MAX];
|
||||
assert(fdr->domain > 8 && fdr->domain < 16);
|
||||
|
||||
size_t numZone = prepareZones(a->buf, a->len,
|
||||
a->buf_history + a->len_history,
|
||||
a->start_offset, a->firstFloodDetect, zones);
|
||||
assert(numZone <= ZONE_MAX);
|
||||
m128 state = getInitState(fdr, a->len_history, ft, &zones[0]);
|
||||
|
||||
for (size_t curZone = 0; curZone < numZone; curZone++) {
|
||||
struct zone *z = &zones[curZone];
|
||||
dumpZoneInfo(z, curZone);
|
||||
|
||||
/* When a zone contains less data than is processed in an iteration
|
||||
* of FDR_MAIN_LOOP(), we need to scan over some extra data.
|
||||
*
|
||||
* We have chosen to scan this extra data at the start of the
|
||||
* iteration. The extra data is either data we have already scanned or
|
||||
* garbage (if it is earlier than offset 0).
|
||||
*
|
||||
* As a result we need to shift the incoming state back so that it will
|
||||
* properly line up with the data being scanned.
|
||||
*
|
||||
* We also need to forbid reporting any matches in the data being
|
||||
* rescanned as they have already been reported (or are over garbage but
|
||||
* later stages should also provide that safety guarantee).
|
||||
*/
|
||||
|
||||
u8 shift = z->shift;
|
||||
|
||||
state = variable_byte_shift_m128(state, shift);
|
||||
|
||||
state = or128(state, load128(zone_or_mask[shift]));
|
||||
|
||||
switch (stride) {
|
||||
case 1:
|
||||
FDR_MAIN_LOOP(z, state, get_conf_stride_1);
|
||||
break;
|
||||
case 2:
|
||||
FDR_MAIN_LOOP(z, state, get_conf_stride_2);
|
||||
break;
|
||||
case 4:
|
||||
FDR_MAIN_LOOP(z, state, get_conf_stride_4);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#include "fdr_autogen.c"
|
||||
|
||||
#define FAKE_HISTORY_SIZE 16
|
||||
static const u8 fake_history[FAKE_HISTORY_SIZE];
|
||||
|
||||
hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t start,
|
||||
HWLMCallback cb, void *ctxt, hwlm_group_t groups) {
|
||||
|
||||
hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
|
||||
size_t start, HWLMCallback cb, void *ctxt,
|
||||
hwlm_group_t groups) {
|
||||
const struct FDR_Runtime_Args a = {
|
||||
buf,
|
||||
len,
|
||||
@ -73,7 +797,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t st
|
||||
hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
|
||||
size_t hlen, const u8 *buf, size_t len,
|
||||
size_t start, HWLMCallback cb, void *ctxt,
|
||||
hwlm_group_t groups, u8 * stream_state) {
|
||||
hwlm_group_t groups, u8 *stream_state) {
|
||||
struct FDR_Runtime_Args a = {
|
||||
buf,
|
||||
len,
|
||||
@ -86,9 +810,9 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
|
||||
ctxt,
|
||||
&groups,
|
||||
nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
|
||||
hbuf ? CONF_LOADVAL_CALL_CAUTIOUS(hbuf + hlen - 8, hbuf, hbuf + hlen)
|
||||
: (u64a)0
|
||||
|
||||
/* we are guaranteed to always have 16 initialised bytes at the end of
|
||||
* the history buffer (they may be garbage). */
|
||||
hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0
|
||||
};
|
||||
fdrUnpackState(fdr, &a, stream_state);
|
||||
|
||||
|
@ -1,564 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (c) 2015, Intel Corporation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import sys
|
||||
from autogen_utils import *
|
||||
from base_autogen import *
|
||||
from string import Template
|
||||
|
||||
class OrStep(Step):
|
||||
def __init__(self, context, offset, width):
|
||||
Step.__init__(self, context, offset)
|
||||
s_var = self.gv("st%d" % offset)
|
||||
if width < 128:
|
||||
self.val = "s |= %s;" % s_var.name
|
||||
else:
|
||||
self.val = "s = or%d(s, %s);" % (width, s_var.name)
|
||||
|
||||
class ShiftStateStep(Step):
|
||||
def __init__(self, context, offset = 0, stride_used = 1):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
state = m.state_variable
|
||||
shift_distance = -1 * stride_used * m.num_buckets
|
||||
self.val = "%s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
|
||||
|
||||
class BulkLoadStep(Step):
|
||||
def __init__(self, context, offset, size, define_var = True, aligned = True):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
self.latency = 4
|
||||
blt = m.bulk_load_type
|
||||
if aligned:
|
||||
init_string = blt.load_expr_data(self.offset, code = "aligned")
|
||||
else:
|
||||
init_string = blt.load_expr_data(self.offset)
|
||||
|
||||
var_name = "current_data_%d" % offset
|
||||
if define_var:
|
||||
lb_var = self.nv(blt, var_name)
|
||||
self.val = lb_var.gen_initializer_stmt(init_string)
|
||||
else:
|
||||
lb_var = self.gv(var_name, reader = False, writer = True)
|
||||
self.val = "%s = %s;" % (var_name, init_string)
|
||||
|
||||
class ValueExtractStep(Step):
|
||||
def __init__(self, context, offset, sub_load_cautious = False):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
self.latency = 2
|
||||
dsb = m.datasize_bytes
|
||||
modval = offset % dsb
|
||||
|
||||
if modval == dsb - 1:
|
||||
# Case 1: reading more than one byte over the end of the bulk load
|
||||
|
||||
self.latency = 4
|
||||
if sub_load_cautious:
|
||||
code_string = "cautious_forward"
|
||||
else:
|
||||
code_string = "normal"
|
||||
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
|
||||
temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
|
||||
else:
|
||||
# Case 2: reading a value that can be found entirely in the current register
|
||||
if m.fdr2_force_naive_load:
|
||||
load_string = m.single_load_type.load_expr_data(self.offset, "normal")
|
||||
temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust)
|
||||
else:
|
||||
lb_var = self.gv("current_data_%d" % (offset - modval))
|
||||
if modval == 0:
|
||||
# Case 2a: value is at LSB end of the register and must be left-
|
||||
# shifted into place if there is a "reach_shift_adjust" required
|
||||
temp_string = "(%s << %d)" % (lb_var.name, m.reach_shift_adjust)
|
||||
else:
|
||||
# Case 2b: value is in the middle of the register and will be
|
||||
# right-shifted into place (adjusted by "reach_shift_adjust")
|
||||
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
|
||||
|
||||
|
||||
init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
|
||||
v_var = self.nv(m.value_extract_type, "v%d" % offset)
|
||||
self.val = v_var.gen_initializer_stmt(init_string)
|
||||
|
||||
class TableLookupStep(Step):
|
||||
def __init__(self, context, reach_multiplier, offset = 0):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
self.latency = 4
|
||||
v_var = self.gv("v%d" % offset)
|
||||
s_var = self.nv(m.state_type, "st%d" % offset)
|
||||
init_string = "*(const %s *)(ft + %s*%dU)" % ( m.state_type.get_name(),
|
||||
v_var.name, reach_multiplier)
|
||||
self.val = s_var.gen_initializer_stmt(init_string)
|
||||
|
||||
class ShiftReachMaskStep(Step):
|
||||
def __init__(self, context, offset):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
extr = m.extract_frequency
|
||||
modval = offset % extr
|
||||
s_var = self.gv("st%d" % offset, writer = True)
|
||||
self.val = "%s = %s;" % (s_var.name, s_var.type.shift_expr(s_var.name, modval * m.num_buckets))
|
||||
|
||||
class ConfExtractStep(Step):
|
||||
def __init__(self, context, offset):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
if m.state_type.isSIMDOnIntel():
|
||||
self.latency = 2
|
||||
init_string = m.state_type.lowbit_extract_expr("s", m.extract_size)
|
||||
extr_var = self.nv(m.extr_type, "extr%d" % offset)
|
||||
self.val = extr_var.gen_initializer_stmt(init_string)
|
||||
|
||||
class ConfAccumulateStep(Step):
|
||||
def __init__(self, context, extract_offset, conf_offset, define_var = True):
|
||||
Step.__init__(self, context, extract_offset)
|
||||
m = self.matcher
|
||||
extr_var = self.gv("extr%d" % extract_offset)
|
||||
extr_var_cast = "((%s)%s)" % (m.conf_type.get_name(), extr_var.name)
|
||||
if extract_offset == conf_offset:
|
||||
# create conf_var as a straight copy of extr
|
||||
if define_var:
|
||||
conf_var = self.nv(m.conf_type, "conf%d" % conf_offset)
|
||||
self.val = conf_var.gen_initializer_stmt(extr_var_cast)
|
||||
else:
|
||||
conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
|
||||
self.val = "%s = %s;" % (conf_var.name, extr_var_cast)
|
||||
else:
|
||||
# shift extr_var and insert/OR it in conf_var
|
||||
conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True)
|
||||
shift_dist = (extract_offset - conf_offset) * m.num_buckets
|
||||
self.val = "%s |= %s;" % (conf_var.name, m.conf_type.shift_expr(extr_var_cast, shift_dist))
|
||||
self.latency = 2
|
||||
|
||||
class ConfirmFlipStep(Step):
|
||||
def __init__(self, context, offset):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
conf_var = self.gv("conf%d" % self.offset, writer = True)
|
||||
self.val = "%s = %s;" % (conf_var.name,
|
||||
conf_var.type.flip_lowbits_expr(conf_var.name, self.matcher.confirm_frequency * m.num_buckets))
|
||||
|
||||
class ConfirmStep(Step):
|
||||
def __init__(self, context, offset, cautious = False):
|
||||
Step.__init__(self, context, offset)
|
||||
m = self.matcher
|
||||
conf_var = self.gv("conf%d" % offset, writer = True)
|
||||
self.val = m.produce_confirm_base(conf_var.name, conf_var.type.size, offset, cautious,
|
||||
enable_confirmless = m.stride == 1, do_bailout = False)
|
||||
|
||||
class M3(MatcherBase):
|
||||
def produce_compile_call(self):
|
||||
print " { %d, %d, %d, %d, %s, %d, %d }," % (
|
||||
self.id, self.state_width, self.num_buckets,
|
||||
self.stride,
|
||||
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
|
||||
|
||||
def produce_main_loop(self, switch_variant = False):
|
||||
stride_offsets = xrange(0, self.loop_bytes, self.stride)
|
||||
stride_offsetSet = set(stride_offsets)
|
||||
so_steps_last_block = []
|
||||
sh = None
|
||||
last_confirm = None
|
||||
ctxt = CodeGenContext(self)
|
||||
|
||||
if switch_variant:
|
||||
print " ptr -= (iterBytes - dist);"
|
||||
print " { " # need an extra scope around switch variant to stop its globals escaping
|
||||
else:
|
||||
print " if (doMainLoop) {"
|
||||
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
|
||||
print self.produce_flood_check()
|
||||
print " __builtin_prefetch(ptr + (iterBytes*4));"
|
||||
print " assert(((size_t)ptr % START_MOD) == 0);"
|
||||
|
||||
|
||||
# just do globally for now
|
||||
if switch_variant:
|
||||
subsidiary_load_cautious = True
|
||||
confirm_cautious = True
|
||||
else:
|
||||
subsidiary_load_cautious = False
|
||||
confirm_cautious = False
|
||||
|
||||
if not self.fdr2_force_naive_load:
|
||||
bulk_load_steps = [ off for off in range(self.loop_bytes)
|
||||
if off % self.datasize_bytes == 0 and
|
||||
(set(range(off, off + self.datasize_bytes - 1)) & stride_offsetSet)]
|
||||
else:
|
||||
bulk_load_steps = []
|
||||
|
||||
confirm_steps = [ off for off in range(self.loop_bytes) if off % self.confirm_frequency == 0 ]
|
||||
|
||||
for off in bulk_load_steps:
|
||||
lb_var = ctxt.new_var(None, self.bulk_load_type, "current_data_%d" % off)
|
||||
print " " + lb_var.gen_initializer_stmt()
|
||||
|
||||
|
||||
for off in confirm_steps:
|
||||
var_name = "conf%d" % off
|
||||
conf_def_var = ctxt.new_var(None, self.conf_type, var_name)
|
||||
if switch_variant:
|
||||
init_string = "(%s)-1" % self.conf_type.get_name()
|
||||
else:
|
||||
init_string = ""
|
||||
print " " + conf_def_var.gen_initializer_stmt(init_string)
|
||||
|
||||
if switch_variant:
|
||||
print " switch(iterBytes - dist) {"
|
||||
for i in range(0, self.loop_bytes):
|
||||
print " case %d:" % i
|
||||
|
||||
# init and poison conf; over-precise but harmless
|
||||
conf_id = (i / self.confirm_frequency) * self.confirm_frequency
|
||||
if i % self.confirm_frequency:
|
||||
conf_fixup_bits = self.conf_type.size - (self.num_buckets * (i % self.confirm_frequency))
|
||||
print " conf%d >>= %d;" % (conf_id, conf_fixup_bits)
|
||||
else:
|
||||
print " conf%d = 0;" % conf_id
|
||||
|
||||
# init state
|
||||
state_fixup = i % self.extract_frequency
|
||||
state = self.state_variable
|
||||
shift_distance = self.num_buckets * state_fixup
|
||||
if state_fixup:
|
||||
print " %s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance))
|
||||
if self.state_width < 128:
|
||||
print " %s |= %s;" % (state.name, state.type.lowbit_mask(shift_distance))
|
||||
else:
|
||||
print " %s = or%d(%s, %s);" % (state.name, self.state_width, state.name, state.type.lowbit_mask(shift_distance))
|
||||
|
||||
if not self.fdr2_force_naive_load:
|
||||
# init current_data (could poison it in some cases)
|
||||
load_mod = i % self.datasize_bytes
|
||||
load_offset = i - load_mod
|
||||
if load_mod:
|
||||
# not coming in on an even boundary means having to do a load var
|
||||
# actually, there are a bunch of things we can do on this bulk load
|
||||
# to avoid having to be 'cautious_backwards' but I'm not completely
|
||||
# sure they are good ideas
|
||||
init_string = self.bulk_load_type.load_expr_data(load_offset,
|
||||
code = "cautious_backward")
|
||||
var_name = "current_data_%d" % load_offset
|
||||
lb_var = ctxt.get_var(None, var_name, reader = False, writer = True)
|
||||
print " %s = %s;" % (lb_var.name, init_string)
|
||||
|
||||
print " goto off%d;" % i
|
||||
print " case %d: goto skipSwitch;" % self.loop_bytes
|
||||
print " }"
|
||||
print " {"
|
||||
|
||||
|
||||
for off in range(self.loop_bytes):
|
||||
# X_mod is the offset we're up to relative to the last X operation
|
||||
# X_offset is which of the last X operations matches this iteration
|
||||
|
||||
if (switch_variant):
|
||||
LabelStep(ctxt, off)
|
||||
|
||||
if off in bulk_load_steps:
|
||||
if not self.fdr2_force_naive_load:
|
||||
BulkLoadStep(ctxt, off, self.datasize, define_var = False, aligned = not switch_variant)
|
||||
|
||||
if off in stride_offsets:
|
||||
if switch_variant:
|
||||
OpenScopeStep(ctxt, off)
|
||||
ValueExtractStep(ctxt, off, sub_load_cautious = subsidiary_load_cautious)
|
||||
TableLookupStep(ctxt, self.reach_mult, off)
|
||||
if off % self.extract_frequency:
|
||||
ShiftReachMaskStep(ctxt, off)
|
||||
so = OrStep(ctxt, off, self.state_width)
|
||||
if switch_variant:
|
||||
CloseScopeStep(ctxt, off)
|
||||
if sh != None:
|
||||
so.add_dependency(sh)
|
||||
so_steps_last_block += [ so ]
|
||||
|
||||
extract_mod = off % self.extract_frequency
|
||||
extract_offset = off - extract_mod
|
||||
extract_ready = extract_mod == self.extract_frequency - 1
|
||||
if extract_ready:
|
||||
if switch_variant:
|
||||
OpenScopeStep(ctxt, off)
|
||||
ex = ConfExtractStep(ctxt, extract_offset)
|
||||
ConfAccumulateStep(ctxt, extract_offset, confirm_offset, define_var = False)
|
||||
for so_step in so_steps_last_block:
|
||||
ex.add_dependency(so_step)
|
||||
if switch_variant:
|
||||
CloseScopeStep(ctxt, off)
|
||||
so_steps_last_block = []
|
||||
sh = ShiftStateStep(ctxt, extract_offset, stride_used = self.extract_frequency)
|
||||
sh.add_dependency(ex)
|
||||
|
||||
confirm_mod = off % self.confirm_frequency
|
||||
confirm_offset = off - confirm_mod
|
||||
confirm_ready = confirm_mod == self.confirm_frequency - 1
|
||||
if confirm_ready:
|
||||
cflip = ConfirmFlipStep(ctxt, confirm_offset)
|
||||
cf = ConfirmStep(ctxt, confirm_offset, cautious = confirm_cautious )
|
||||
if last_confirm:
|
||||
cf.add_dependency(last_confirm)
|
||||
last_confirm = cf
|
||||
|
||||
|
||||
if not switch_variant:
|
||||
print ctxt.schedule([ last_confirm, sh ])
|
||||
else:
|
||||
print ctxt.dontschedule([ last_confirm, sh ])
|
||||
|
||||
if switch_variant:
|
||||
print "skipSwitch:;"
|
||||
print " ptr += iterBytes;"
|
||||
print " }" # close extra scope around switch variant
|
||||
print " }"
|
||||
|
||||
|
||||
def produce_init_state(self):
|
||||
state = self.state_variable
|
||||
s_type = self.state_type
|
||||
shift_distance = -1 * self.num_buckets
|
||||
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
|
||||
|
||||
s = Template("""
|
||||
$TYPENAME s;
|
||||
if (a->len_history) {
|
||||
u32 tmp = 0;
|
||||
if (a->start_offset == 0) {
|
||||
tmp = a->buf_history[a->len_history - 1];
|
||||
tmp |= (a->buf[0] << 8);
|
||||
} else {
|
||||
tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
|
||||
}
|
||||
tmp &= fdr->domainMask;
|
||||
s = *((const $TYPENAME *)ft + tmp);
|
||||
$SHIFT_EXPR;
|
||||
} else {
|
||||
s = *(const $TYPENAME *)&fdr->start;
|
||||
}
|
||||
""").substitute(TYPENAME = s_type.get_name(),
|
||||
ZERO_EXPR = s_type.zero_expression(),
|
||||
SHIFT_EXPR = shift_expr)
|
||||
return s
|
||||
|
||||
def produce_code(self):
|
||||
|
||||
loop_read_behind = 0
|
||||
loop_read_ahead = self.loop_bytes + 1
|
||||
|
||||
# we set up mask and shift stuff for extracting our masks from registers
|
||||
#
|
||||
# we have a choice as to whether to mask out the value early or
|
||||
# extract the value (shift first) then mask it
|
||||
#
|
||||
# Intel has a free scaling factor from 1/2/4/8 so we want to combine
|
||||
# the extra needed shift for SSE registers with the mask operation
|
||||
|
||||
ssb = self.state_type.size / 8 # state size in bytes
|
||||
|
||||
# Intel path
|
||||
if ssb == 16:
|
||||
# obscure corner - we don't have the room in the register to
|
||||
# do this for all values so we don't. domain==16 is pretty
|
||||
# bad anyhow, of course
|
||||
self.reach_mult = 8
|
||||
else:
|
||||
self.reach_mult = ssb
|
||||
|
||||
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
|
||||
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
|
||||
|
||||
print self.produce_header(visible = False)
|
||||
|
||||
print "// ",
|
||||
print " Arch: " + self.arch.name,
|
||||
print " State type: " + self.state_type.get_name(),
|
||||
print " Num buckets: %d" % self.num_buckets,
|
||||
print " Stride: %d" % self.stride
|
||||
|
||||
print self.produce_common_declarations()
|
||||
|
||||
print " assert(fdr->domain > 8 && fdr->domain < 16);"
|
||||
print
|
||||
print " u64a domain_mask = fdr->domainMask;"
|
||||
print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
|
||||
print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
|
||||
print self.produce_init_state()
|
||||
print " const size_t iterBytes = %d;" % self.loop_bytes
|
||||
print " const size_t START_MOD = %d;" % self.datasize_bytes
|
||||
print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
|
||||
|
||||
print """
|
||||
while (ptr < buf + len) {
|
||||
|
||||
u8 doMainLoop = 1;
|
||||
size_t remaining = len - (ptr - buf);
|
||||
size_t dist;
|
||||
if (remaining <= iterBytes) {
|
||||
dist = remaining; // once through the switch and we're done
|
||||
} else if (remaining < 2 * iterBytes) {
|
||||
// nibble some stuff off the front, skip the main loop,
|
||||
// then come back here
|
||||
dist = iterBytes; // maybe could be cleverer
|
||||
} else {
|
||||
// now, we need to see if we can make it to a main loop iteration
|
||||
// if so, we need to ensure that the main loop iteration is aligned
|
||||
// to a START_MOD boundary and i >= 8 so we can read ptr + i - 8
|
||||
|
||||
// see if we can do it - if not, just switch the main loop off,
|
||||
// eat iterBytes in cautious mode, and come back to this loop
|
||||
|
||||
const u8 * target = MAX(buf + 8, ptr);
|
||||
target = ROUNDUP_PTR(target, START_MOD);
|
||||
dist = target - ptr;
|
||||
if (dist > iterBytes) {
|
||||
doMainLoop = 0;
|
||||
dist = iterBytes;
|
||||
}
|
||||
}
|
||||
"""
|
||||
self.produce_main_loop(switch_variant = True)
|
||||
self.produce_main_loop(switch_variant = False)
|
||||
print """
|
||||
}
|
||||
"""
|
||||
print self.produce_footer()
|
||||
|
||||
def get_name(self):
|
||||
return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
|
||||
|
||||
def __init__(self, state_width, stride,
|
||||
arch,
|
||||
table_state_width = None,
|
||||
num_buckets = 8,
|
||||
extract_frequency = None,
|
||||
confirm_frequency = None):
|
||||
|
||||
# First - set up the values that are fundamental to how this matcher will operate
|
||||
self.arch = arch
|
||||
|
||||
# get the width of the state width on which we operate internally
|
||||
if state_width not in [ 128 ]:
|
||||
fail_out("Unknown state width: %d" % state_width)
|
||||
self.state_width = state_width
|
||||
self.state_type = getRequiredType(self.state_width)
|
||||
self.state_variable = IntegerVariable("s", self.state_type)
|
||||
|
||||
table_state_width = state_width
|
||||
self.table_state_width = state_width
|
||||
self.table_state_type = getRequiredType(self.table_state_width)
|
||||
|
||||
# this is the load type required for domain [9:15] if we want to
|
||||
# load it one at a time
|
||||
self.single_load_type = IntegerType(16)
|
||||
|
||||
# stride is the frequency with which we make data-driven
|
||||
# accesses to our reach table
|
||||
if stride not in [ 1, 2, 4, 8]:
|
||||
fail_out("Unsupported stride: %d" % stride)
|
||||
if stride * num_buckets > state_width:
|
||||
fail_out("Stride %d is too big for the number of buckets %d given state width %d\n" % (stride, num_buckets, state_width))
|
||||
self.stride = stride
|
||||
|
||||
if num_buckets != 8:
|
||||
fail_out("Unsupported number of buckets: %d" % num_buckets)
|
||||
if state_width % num_buckets and state_width == 128:
|
||||
fail_out("Bucket scheme requires bit-shifts on m128 (failing)")
|
||||
self.num_buckets = num_buckets
|
||||
|
||||
# Second - set up derived or optimization values - these can be
|
||||
# overridden by arguments that are passed in
|
||||
|
||||
self.datasize = 64
|
||||
self.bulk_load_type = IntegerType(self.datasize)
|
||||
self.datasize_bytes = self.datasize/8
|
||||
|
||||
self.value_extract_type = IntegerType(self.datasize)
|
||||
|
||||
self.fdr2_force_naive_load = False # disable everywhere for trunk
|
||||
|
||||
# extract frequency is how frequently (in bytes) we destructively shift
|
||||
# our state value after having pulled out that many bytes into a
|
||||
# confirm register (of one sort or another).
|
||||
# none means a default value - datasize, our biggest easily available GPR
|
||||
if extract_frequency is None:
|
||||
extract_frequency = self.datasize_bytes
|
||||
self.extract_frequency = extract_frequency
|
||||
self.extract_size = self.extract_frequency*self.num_buckets
|
||||
if extract_frequency < stride:
|
||||
fail_out("Can't extract at extract frequency %d with stride %d" % (extract_frequency, stride))
|
||||
if extract_frequency not in [ None, 1, 2, 4, 8, 16]:
|
||||
fail_out("Weird extract frequency: %d" % extract_frequency)
|
||||
|
||||
if self.extract_size <= 32:
|
||||
self.extr_type = IntegerType(32)
|
||||
elif self.extract_size <= 64:
|
||||
self.extr_type = IntegerType(64)
|
||||
else:
|
||||
fail_out("Implausible size %d required for confirm extract step" % size)
|
||||
|
||||
# extract_frequency is how often we pull out our state and place
|
||||
# it somewhere in a lossless fashion
|
||||
# confirm_frequency, on the other hand, is how frequently we
|
||||
# take the state extracted by extract_frequency and cobble it
|
||||
# together into a matching loop
|
||||
# confirm_frequency must be a multiple of extract_frequency
|
||||
# and must fit into a fast register; for now; we're going to
|
||||
# stay in the GPR domain
|
||||
if confirm_frequency is None:
|
||||
confirm_frequency = self.extract_frequency
|
||||
self.confirm_frequency = confirm_frequency
|
||||
if confirm_frequency % self.extract_frequency:
|
||||
fail_out("Confirm frequency %d must be evenly divisible by extract_frequency %d" % (confirm_frequency, self.extract_frequency))
|
||||
|
||||
self.conf_size = self.confirm_frequency * self.num_buckets
|
||||
if self.conf_size <= 32:
|
||||
self.conf_type = IntegerType(32)
|
||||
elif self.conf_size <= 64:
|
||||
self.conf_type = IntegerType(64)
|
||||
else:
|
||||
fail_out("Implausible size %d required for confirm accumulate step" % self.conf_size)
|
||||
|
||||
# how many bytes in flight at once
|
||||
self.loop_bytes = 16
|
||||
|
||||
# confirm configuration
|
||||
|
||||
# how many entries in the top-level confirm table - 256 means
|
||||
# complete split on the last character
|
||||
self.conf_top_level_split = 256
|
||||
|
||||
# how much we 'pull back' in confirm - this is obviously related
|
||||
# to the first level conf but we will keep two separate parameters
|
||||
# for this to avoid the risk of conflating these
|
||||
self.conf_pull_back = 1
|
||||
|
||||
if self.conf_pull_back > 0 and self.conf_top_level_split < 256:
|
||||
fail_out("Pull back distance %d not supported by top level split %d" % (self.conf_pull_back, self.conf_top_level_split))
|
||||
|
||||
# minor stuff
|
||||
self.default_body_indent = 8
|
@ -187,9 +187,9 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
|
||||
/* we are allowing domains 9 to 15 only */
|
||||
assert(eng.bits > 8 && eng.bits < 16);
|
||||
fdr->domain = eng.bits;
|
||||
fdr->schemeWidthByte = eng.schemeWidth / 8;
|
||||
fdr->domainMask = (1 << eng.bits) - 1;
|
||||
fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
|
||||
fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8);
|
||||
fdr->stride = eng.stride;
|
||||
|
||||
if (link.first) {
|
||||
fdr->link = verify_u32(ptr - fdr_base);
|
||||
@ -544,6 +544,7 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
|
||||
// temporary hack for unit testing
|
||||
if (hint != HINT_INVALID) {
|
||||
des->bits = 9;
|
||||
des->stride = 1;
|
||||
}
|
||||
|
||||
FDRCompiler fc(lits, *des, make_small);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -36,144 +36,121 @@
|
||||
#include "util/bitutils.h"
|
||||
#include "util/compare.h"
|
||||
|
||||
#define CONF_LOADVAL_CALL lv_u64a
|
||||
#define CONF_LOADVAL_CALL_CAUTIOUS lv_u64a_ce
|
||||
|
||||
// this is ordinary confirmation function which runs through
|
||||
// the whole confirmation procedure
|
||||
static really_inline
|
||||
void confWithBit(const struct FDRConfirm * fdrc,
|
||||
const struct FDR_Runtime_Args * a,
|
||||
size_t i,
|
||||
CautionReason r,
|
||||
u32 pullBackAmount,
|
||||
hwlmcb_rv_t *control,
|
||||
u32 * last_match) {
|
||||
void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a,
|
||||
size_t i, u32 pullBackAmount, hwlmcb_rv_t *control,
|
||||
u32 *last_match, u64a conf_key) {
|
||||
assert(i < a->len);
|
||||
assert(ISALIGNED(fdrc));
|
||||
|
||||
const u8 * buf = a->buf;
|
||||
const size_t len = a->len;
|
||||
|
||||
CONF_TYPE v;
|
||||
const u8 * confirm_loc = buf + i - pullBackAmount - 7;
|
||||
if (likely(r == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
v = CONF_LOADVAL_CALL(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
v = CONF_LOADVAL_CALL_CAUTIOUS(confirm_loc, buf, buf + len);
|
||||
// stitch together v (which doesn't move) and history (which does)
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
v |= histBytes;
|
||||
u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
|
||||
fdrc->nBitsOrSoleID);
|
||||
u32 start = getConfirmLitIndex(fdrc)[c];
|
||||
if (likely(!start)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u32 c = CONF_HASH_CALL(v, fdrc->andmsk, fdrc->mult, fdrc->nBitsOrSoleID);
|
||||
u32 start = getConfirmLitIndex(fdrc)[c];
|
||||
if (P0(start)) {
|
||||
const struct LitInfo *l =
|
||||
(const struct LitInfo *)((const u8 *)fdrc + start);
|
||||
const struct LitInfo *li
|
||||
= (const struct LitInfo *)((const u8 *)fdrc + start);
|
||||
|
||||
u8 oldNext; // initialized in loop
|
||||
do {
|
||||
assert(ISALIGNED(l));
|
||||
u8 oldNext; // initialized in loop
|
||||
do {
|
||||
assert(ISALIGNED(li));
|
||||
|
||||
if (P0( (v & l->msk) != l->v)) {
|
||||
if (unlikely((conf_key & li->msk) != li->v)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((*last_match == li->id) && (li->flags & NoRepeat)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
const u8 *loc = buf + i - li->size + 1 - pullBackAmount;
|
||||
|
||||
u8 caseless = li->flags & Caseless;
|
||||
if (loc < buf) {
|
||||
u32 full_overhang = buf - loc;
|
||||
|
||||
const u8 *history = caseless ? a->buf_history_nocase
|
||||
: a->buf_history;
|
||||
size_t len_history = caseless ? a->len_history_nocase
|
||||
: a->len_history;
|
||||
|
||||
// can't do a vectored confirm either if we don't have
|
||||
// the bytes
|
||||
if (full_overhang > len_history) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((*last_match == l->id) && (l->flags & NoRepeat)) {
|
||||
goto out;
|
||||
// as for the regular case, no need to do a full confirm if
|
||||
// we're a short literal
|
||||
if (unlikely(li->size > sizeof(CONF_TYPE))) {
|
||||
const u8 *s1 = li->s;
|
||||
const u8 *s2 = s1 + full_overhang;
|
||||
const u8 *loc1 = history + len_history - full_overhang;
|
||||
const u8 *loc2 = buf;
|
||||
size_t size1 = MIN(full_overhang, li->size - sizeof(CONF_TYPE));
|
||||
size_t wind_size2_back = sizeof(CONF_TYPE) + full_overhang;
|
||||
size_t size2 = wind_size2_back > li->size ?
|
||||
0 : li->size - wind_size2_back;
|
||||
|
||||
if (cmpForward(loc1, s1, size1, caseless)) {
|
||||
goto out;
|
||||
}
|
||||
if (cmpForward(loc2, s2, size2, caseless)) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
} else { // NON-VECTORING PATH
|
||||
|
||||
const u8 * loc = buf + i - l->size + 1 - pullBackAmount;
|
||||
// if string < conf_type we don't need regular string cmp
|
||||
if (unlikely(li->size > sizeof(CONF_TYPE))) {
|
||||
if (cmpForward(loc, li->s, li->size - sizeof(CONF_TYPE),
|
||||
caseless)) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
u8 caseless = l->flags & Caseless;
|
||||
if (loc < buf) {
|
||||
u32 full_overhang = buf - loc;
|
||||
if (unlikely(!(li->groups & *control))) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
const u8 * history = (caseless) ?
|
||||
a->buf_history_nocase : a->buf_history;
|
||||
size_t len_history = (caseless) ?
|
||||
a->len_history_nocase : a->len_history;
|
||||
|
||||
// can't do a vectored confirm either if we don't have
|
||||
// the bytes
|
||||
if (unlikely(li->flags & ComplexConfirm)) {
|
||||
const u8 *loc2 = buf + i - li->extended_size + 1 - pullBackAmount;
|
||||
if (loc2 < buf) {
|
||||
u32 full_overhang = buf - loc2;
|
||||
size_t len_history = caseless ? a->len_history_nocase
|
||||
: a->len_history;
|
||||
if (full_overhang > len_history) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
// as for the regular case, no need to do a full confirm if
|
||||
// we're a short literal
|
||||
if (unlikely(l->size > sizeof(CONF_TYPE))) {
|
||||
const u8 * s1 = l->s;
|
||||
const u8 * s2 = s1 + full_overhang;
|
||||
const u8 * loc1 = history + len_history - full_overhang;
|
||||
const u8 * loc2 = buf;
|
||||
size_t size1 = MIN(full_overhang,
|
||||
l->size - sizeof(CONF_TYPE));
|
||||
size_t wind_size2_back = sizeof(CONF_TYPE) +
|
||||
full_overhang;
|
||||
size_t size2 = wind_size2_back > l->size ?
|
||||
0 : l->size - wind_size2_back;
|
||||
|
||||
if (cmpForward(loc1, s1, size1, caseless)) {
|
||||
goto out;
|
||||
}
|
||||
if (cmpForward(loc2, s2, size2, caseless)) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
} else { // NON-VECTORING PATH
|
||||
|
||||
// if string < conf_type we don't need regular string cmp
|
||||
if (unlikely(l->size > sizeof(CONF_TYPE))) {
|
||||
if (cmpForward(loc, l->s, l->size - sizeof(CONF_TYPE), caseless)) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (P0(!(l->groups & *control))) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (unlikely(l->flags & ComplexConfirm)) {
|
||||
const u8 * loc2 = buf + i - l->extended_size + 1 - pullBackAmount;
|
||||
if (loc2 < buf) {
|
||||
u32 full_overhang = buf - loc2;
|
||||
size_t len_history = (caseless) ?
|
||||
a->len_history_nocase : a->len_history;
|
||||
if (full_overhang > len_history) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*last_match = l->id;
|
||||
*control = a->cb(loc - buf, i, l->id, a->ctxt);
|
||||
out:
|
||||
oldNext = l->next; // oldNext is either 0 or an 'adjust' value
|
||||
l = (const struct LitInfo*)((const u8 *)l + oldNext + l->size);
|
||||
} while (oldNext);
|
||||
}
|
||||
*last_match = li->id;
|
||||
*control = a->cb(loc - buf, i, li->id, a->ctxt);
|
||||
out:
|
||||
oldNext = li->next; // oldNext is either 0 or an 'adjust' value
|
||||
li = (const struct LitInfo *)((const u8 *)li + oldNext + li->size);
|
||||
} while (oldNext);
|
||||
}
|
||||
|
||||
// 'light-weight' confirmation function which is used by 1-mask Teddy;
|
||||
// in the 'confirmless' case it simply calls the callback function,
|
||||
// otherwise it calls the 'confWithBit' function for the full confirmation procedure
|
||||
static really_inline
|
||||
void confWithBit1(const struct FDRConfirm * fdrc,
|
||||
const struct FDR_Runtime_Args * a,
|
||||
size_t i,
|
||||
CautionReason r,
|
||||
hwlmcb_rv_t *control,
|
||||
u32 * last_match) {
|
||||
void confWithBit1(const struct FDRConfirm *fdrc,
|
||||
const struct FDR_Runtime_Args *a, size_t i,
|
||||
hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
|
||||
assert(i < a->len);
|
||||
assert(ISALIGNED(fdrc));
|
||||
|
||||
if (unlikely(fdrc->mult)) {
|
||||
confWithBit(fdrc, a, i, r, 0, control, last_match);
|
||||
confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
|
||||
return;
|
||||
} else {
|
||||
u32 id = fdrc->nBitsOrSoleID;
|
||||
@ -190,12 +167,9 @@ void confWithBit1(const struct FDRConfirm * fdrc,
|
||||
// In the 'confirmless' case it makes a fast 32-bit comparison,
|
||||
// otherwise it calls the 'confWithBit' function for the full confirmation procedure
|
||||
static really_inline
|
||||
void confWithBitMany(const struct FDRConfirm * fdrc,
|
||||
const struct FDR_Runtime_Args * a,
|
||||
size_t i,
|
||||
CautionReason r,
|
||||
hwlmcb_rv_t *control,
|
||||
u32 * last_match) {
|
||||
void confWithBitMany(const struct FDRConfirm *fdrc,
|
||||
const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
|
||||
hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
|
||||
assert(i < a->len);
|
||||
assert(ISALIGNED(fdrc));
|
||||
|
||||
@ -204,7 +178,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc,
|
||||
}
|
||||
|
||||
if (unlikely(fdrc->mult)) {
|
||||
confWithBit(fdrc, a, i, r, 0, control, last_match);
|
||||
confWithBit(fdrc, a, i, 0, control, last_match, conf_key);
|
||||
return;
|
||||
} else {
|
||||
const u32 id = fdrc->nBitsOrSoleID;
|
||||
@ -215,7 +189,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc,
|
||||
}
|
||||
|
||||
if (r == VECTORING && len > i - a->start_offset) {
|
||||
if (len > (i + a->len_history)) {
|
||||
if (len > i + a->len_history) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -68,8 +68,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
|
||||
}
|
||||
|
||||
if (isTeddy) {
|
||||
unique_ptr<TeddyEngineDescription> des =
|
||||
getTeddyDescription(fdr->engineID);
|
||||
auto des = getTeddyDescription(fdr->engineID);
|
||||
if (des) {
|
||||
fprintf(f, " masks %u\n", des->numMasks);
|
||||
fprintf(f, " buckets %u\n", des->getNumBuckets());
|
||||
@ -78,16 +77,8 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
|
||||
fprintf(f, " <unknown engine>\n");
|
||||
}
|
||||
} else {
|
||||
unique_ptr<FDREngineDescription> des =
|
||||
getFdrDescription(fdr->engineID);
|
||||
if (des) {
|
||||
fprintf(f, " domain %u\n", des->bits);
|
||||
fprintf(f, " stride %u\n", des->stride);
|
||||
fprintf(f, " buckets %u\n", des->getNumBuckets());
|
||||
fprintf(f, " width %u\n", des->schemeWidth);
|
||||
} else {
|
||||
fprintf(f, " <unknown engine>\n");
|
||||
}
|
||||
fprintf(f, " domain %u\n", fdr->domain);
|
||||
fprintf(f, " stride %u\n", fdr->stride);
|
||||
}
|
||||
|
||||
fprintf(f, " strings ???\n");
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -42,13 +42,11 @@ using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
#include "fdr_autogen_compiler.cpp"
|
||||
|
||||
FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
|
||||
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
|
||||
def.numBuckets, def.confirmPullBackDistance,
|
||||
def.confirmTopLevelSplit),
|
||||
schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
|
||||
schemeWidth(def.schemeWidth), stride(0), bits(0) {}
|
||||
|
||||
u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
|
||||
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
|
||||
@ -56,6 +54,12 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
|
||||
return ((getSchemeWidth() + getNumBuckets() - 1) / getNumBuckets()) + 1;
|
||||
}
|
||||
|
||||
void getFdrDescriptions(vector<FDREngineDescription> *out) {
|
||||
static const FDREngineDef def = {0, 128, 8, 0, 1, 256};
|
||||
out->clear();
|
||||
out->push_back(FDREngineDescription(def));
|
||||
}
|
||||
|
||||
static
|
||||
u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
|
||||
u32 desiredStride = 1; // always our safe fallback
|
||||
@ -108,32 +112,33 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
|
||||
FDREngineDescription *best = nullptr;
|
||||
u32 best_score = 0;
|
||||
|
||||
FDREngineDescription &eng = allDescs[0];
|
||||
|
||||
for (u32 domain = 9; domain <= 15; domain++) {
|
||||
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
|
||||
for (size_t stride = 1; stride <= 4; stride *= 2) {
|
||||
// to make sure that domains >=14 have stride 1 according to origin
|
||||
if (domain > 13 && engineID > 0) {
|
||||
if (domain > 13 && stride > 1) {
|
||||
continue;
|
||||
}
|
||||
FDREngineDescription &eng = allDescs[engineID];
|
||||
if (!eng.isValidOnTarget(target)) {
|
||||
continue;
|
||||
}
|
||||
if (msl < eng.stride) {
|
||||
if (msl < stride) {
|
||||
continue;
|
||||
}
|
||||
|
||||
u32 score = 100;
|
||||
|
||||
score -= absdiff(desiredStride, eng.stride);
|
||||
score -= absdiff(desiredStride, stride);
|
||||
|
||||
if (eng.stride <= desiredStride) {
|
||||
score += eng.stride;
|
||||
if (stride <= desiredStride) {
|
||||
score += stride;
|
||||
}
|
||||
|
||||
u32 effLits = vl.size(); /* * desiredStride;*/
|
||||
u32 ideal;
|
||||
if (effLits < eng.getNumBuckets()) {
|
||||
if (eng.stride == 1) {
|
||||
if (stride == 1) {
|
||||
ideal = 8;
|
||||
} else {
|
||||
ideal = 10;
|
||||
@ -158,27 +163,28 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
|
||||
ideal -= 2;
|
||||
}
|
||||
|
||||
if (eng.stride > 1) {
|
||||
if (stride > 1) {
|
||||
ideal++;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("effLits %u\n", effLits);
|
||||
|
||||
if (target.is_atom_class() && !make_small && effLits < 4000) {
|
||||
/* Unless it is a very heavy case, we want to build smaller tables
|
||||
* on lightweight machines due to their small caches. */
|
||||
/* Unless it is a very heavy case, we want to build smaller
|
||||
* tables on lightweight machines due to their small caches. */
|
||||
ideal -= 2;
|
||||
}
|
||||
|
||||
score -= absdiff(ideal, domain);
|
||||
|
||||
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
|
||||
DEBUG_PRINTF("fdr %u: width=%u, domain=%u, buckets=%u, stride=%zu "
|
||||
"-> score=%u\n",
|
||||
eng.getID(), eng.schemeWidth, eng.bits,
|
||||
eng.getNumBuckets(), eng.stride, score);
|
||||
eng.getID(), eng.schemeWidth, domain,
|
||||
eng.getNumBuckets(), stride, score);
|
||||
|
||||
if (!best || score > best_score) {
|
||||
eng.bits = domain;
|
||||
eng.stride = stride;
|
||||
best = &eng;
|
||||
best_score = score;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -42,7 +42,6 @@ struct FDREngineDef {
|
||||
u32 id;
|
||||
u32 schemeWidth;
|
||||
u32 numBuckets;
|
||||
u32 stride;
|
||||
u64a cpu_features;
|
||||
u32 confirmPullBackDistance;
|
||||
u32 confirmTopLevelSplit;
|
||||
@ -73,7 +72,6 @@ chooseEngine(const target_t &target, const std::vector<hwlmLiteral> &vl,
|
||||
bool make_small);
|
||||
std::unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID);
|
||||
void getFdrDescriptions(std::vector<FDREngineDescription> *out);
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
#endif
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -76,17 +76,17 @@ struct FDR {
|
||||
* structures (spillover strings and hash table) if we're a secondary
|
||||
* structure. */
|
||||
u32 link;
|
||||
u8 domain; /* dynamic domain info */
|
||||
u8 schemeWidthByte; /* scheme width in bytes */
|
||||
u8 stride; /* stride - how frequently the data is consulted by the first
|
||||
* stage matcher */
|
||||
u8 domain; /* number of bits used to index into main FDR table. This value
|
||||
* is used only for debugging/asserts. */
|
||||
u16 domainMask; /* pre-computed domain mask */
|
||||
u32 tabSize; /* pre-computed hashtable size in bytes */
|
||||
u32 pad1;
|
||||
u32 pad;
|
||||
|
||||
union {
|
||||
u32 s_u32;
|
||||
u64a s_u64a;
|
||||
m128 s_m128;
|
||||
} start;
|
||||
m128 start; /* initial start state to use at offset 0. The state has been set
|
||||
* up based on the min length of buckets to reduce the need for
|
||||
* pointless confirms. */
|
||||
};
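A minimal sketch (not part of the patch; the helper name and the explicit little-endian two-byte read are illustrative, and the project's own headers are assumed for u8/u16, struct FDR and ROUNDUP_16) of how the fields above are consumed by the generated runtime: the reach table sits 16-byte aligned immediately after this header, domainMask keeps only the low `domain` bits of a 16-bit read from the input, and each table entry is schemeWidth/8 bytes wide. The real kernels fold the entry-size multiply into the value-extract shift (reach_shift_adjust in the generator above) rather than multiplying here.

/* illustrative only; relies on the project's own types and macros */
static const void *fdr_reach_entry(const struct FDR *fdr, const u8 *ptr) {
    /* reach table starts 16-byte aligned, straight after the FDR header */
    const u8 *ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
    /* keep the low 'domain' bits of the two bytes ending at ptr[1] */
    u16 v = (u16)((ptr[1] << 8) | ptr[0]) & fdr->domainMask;
    /* one 16-byte (m128) entry per domain value for the 128-bit scheme */
    return ft + (size_t)v * 16;
}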
|
||||
|
||||
/** \brief FDR runtime arguments.
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -37,7 +37,12 @@
|
||||
#define MAKE_LOADVAL(type, name) \
|
||||
static really_inline type name (const u8 * ptr, UNUSED const u8 * lo, UNUSED const u8 * hi)
|
||||
|
||||
#define NORMAL_SAFE(type) assert(ptr >= lo && (ptr + sizeof(type) - 1) < hi)
|
||||
#define NORMAL_SAFE(type) \
|
||||
do { \
|
||||
assert(ptr >= lo); \
|
||||
assert(ptr + sizeof(type) - 1 < hi); \
|
||||
} while(0)
|
||||
|
||||
#define ALIGNED_SAFE(type) NORMAL_SAFE(type); assert(((size_t)ptr % sizeof(type)) == 0);
|
||||
// these ones need asserts to test the property that we're not handling dynamically
|
||||
#define CAUTIOUS_FORWARD_SAFE(type) assert(ptr >= lo)
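The rewritten NORMAL_SAFE above now expands to two asserts, so it is wrapped in do { ... } while(0) to keep it a single statement. A small standalone sketch (hypothetical macro and function names, not project code) of why that matters for unbraced if/else callers:

#include <assert.h>

typedef unsigned long long u64a; /* stand-in for the project's u64a */

#define SAFE_ONE_STMT(type)                          \
    do {                                             \
        assert(ptr >= lo);                           \
        assert(ptr + sizeof(type) - 1 < hi);         \
    } while (0)

static void check(const unsigned char *ptr, const unsigned char *lo,
                  const unsigned char *hi, int checks_wanted) {
    /* the macro expands to exactly one statement, so this unbraced if/else
     * parses as intended; two bare asserts plus the caller's semicolon would
     * leave the else without a matching if (a compile error) */
    if (checks_wanted)
        SAFE_ONE_STMT(u64a);
    else
        (void)ptr;
}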
|
||||
|
@ -1,6 +1,6 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (c) 2015, Intel Corporation
|
||||
# Copyright (c) 2015-2016, Intel Corporation
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
@ -27,19 +27,110 @@
|
||||
|
||||
import sys
|
||||
from autogen_utils import *
|
||||
from base_autogen import *
|
||||
from string import Template
|
||||
|
||||
class MT(MatcherBase):
|
||||
class MT:
|
||||
def produce_header(self, visible, header_only = False):
|
||||
s = ""
|
||||
if not visible:
|
||||
s += "static never_inline"
|
||||
s += """
|
||||
hwlm_error_t %s(UNUSED const struct FDR *fdr,
|
||||
UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
|
||||
if header_only:
|
||||
s += ";"
|
||||
else:
|
||||
s += "{"
|
||||
s += "\n"
|
||||
return s
|
||||
|
||||
def produce_guard(self):
|
||||
print self.arch.get_guard()
|
||||
|
||||
def produce_zero_alternative(self):
|
||||
print """
|
||||
#else
|
||||
#define %s 0
|
||||
#endif
|
||||
""" % self.get_name()
|
||||
|
||||
def close_guard(self):
|
||||
print "#endif"
|
||||
|
||||
def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
|
||||
if cautious:
|
||||
caution_string = "VECTORING"
|
||||
else:
|
||||
caution_string = "NOT_CAUTIOUS"
|
||||
conf_split_mask = IntegerType(32).constant_to_string(
|
||||
self.conf_top_level_split - 1)
|
||||
if enable_confirmless:
|
||||
quick_check_string = """
|
||||
if (!fdrc->mult) {
|
||||
u32 id = fdrc->nBitsOrSoleID;
|
||||
if ((last_match == id) && (fdrc->flags & NoRepeat))
|
||||
continue;
|
||||
last_match = id;
|
||||
controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
|
||||
continue;
|
||||
} """
|
||||
else:
|
||||
quick_check_string = ""
|
||||
if do_bailout:
|
||||
bailout_string = """
|
||||
if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
|
||||
else:
|
||||
bailout_string = ""
|
||||
|
||||
return Template("""
|
||||
if (P0(!!$CONFVAR)) {
|
||||
do {
|
||||
u32 bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
|
||||
u32 byte = bit / $NUM_BUCKETS + $OFFSET;
|
||||
u32 bitRem = bit % $NUM_BUCKETS;
|
||||
$BAILOUT_STRING
|
||||
u32 confSplit = *(ptr+byte) & $SPLIT_MASK;
|
||||
u32 idx = confSplit * $NUM_BUCKETS + bitRem;
|
||||
u32 cf = confBase[idx];
|
||||
if (!cf)
|
||||
continue;
|
||||
fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
|
||||
if (!(fdrc->groups & *control))
|
||||
continue;
|
||||
$QUICK_CHECK_STRING
|
||||
CautionReason reason = $CAUTION_STRING;
|
||||
CONF_TYPE v;
|
||||
const u8 * confirm_loc = ptr + byte - $CONF_PULL_BACK - 7;
|
||||
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
v = lv_u64a(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
v = lv_u64a_ce(confirm_loc, buf, buf + len);
|
||||
// stitch together v (which doesn't move) and history (which does)
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
v |= histBytes;
|
||||
}
|
||||
confWithBit(fdrc, a, ptr - buf + byte, $CONF_PULL_BACK, control, &last_match, v);
|
||||
} while(P0(!!$CONFVAR));
|
||||
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}""").substitute(CONFVAR = conf_var_name,
|
||||
CONFVAR_SIZE = conf_var_size,
|
||||
NUM_BUCKETS = self.num_buckets,
|
||||
OFFSET = offset,
|
||||
SPLIT_MASK = conf_split_mask,
|
||||
QUICK_CHECK_STRING = quick_check_string,
|
||||
BAILOUT_STRING = bailout_string,
|
||||
CAUTION_STRING = caution_string,
|
||||
CONF_PULL_BACK = self.conf_pull_back)
|
||||
|
||||
def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
|
||||
if self.packed:
|
||||
print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False)
|
||||
else:
|
||||
if self.num_masks == 1:
|
||||
conf_func = "confWithBit1"
|
||||
else:
|
||||
conf_func = "confWithBitMany"
|
||||
|
||||
if cautious:
|
||||
caution_string = "VECTORING"
|
||||
else:
|
||||
@ -48,16 +139,33 @@ class MT(MatcherBase):
|
||||
print " if (P0(!!%s)) {" % var_name
|
||||
print " do {"
|
||||
if bits == 64:
|
||||
print " bit = findAndClearLSB_64(&%s);" % (var_name)
|
||||
print " u32 bit = findAndClearLSB_64(&%s);" % (var_name)
|
||||
else:
|
||||
print " bit = findAndClearLSB_32(&%s);" % (var_name)
|
||||
print " byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
|
||||
print " idx = bit %% %d;" % self.num_buckets
|
||||
print " cf = confBase[idx];"
|
||||
print " u32 bit = findAndClearLSB_32(&%s);" % (var_name)
|
||||
print " u32 byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
|
||||
print " u32 idx = bit %% %d;" % self.num_buckets
|
||||
print " u32 cf = confBase[idx];"
|
||||
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
|
||||
print " if (!(fdrc->groups & *control))"
|
||||
print " continue;"
|
||||
print " %s(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % (conf_func, caution_string)
|
||||
print """
|
||||
CautionReason reason = %s;
|
||||
CONF_TYPE v;
|
||||
const u8 * confirm_loc = ptr + byte - 7;
|
||||
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
v = lv_u64a(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
v = lv_u64a_ce(confirm_loc, buf, buf + len);
|
||||
// stitch together v (which doesn't move) and history (which does)
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
v |= histBytes;
|
||||
}""" % (caution_string)
|
||||
if self.num_masks == 1:
|
||||
print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);"
|
||||
else:
|
||||
print " confWithBitMany(fdrc, a, ptr - buf + byte, %s, control, &last_match, v);" % (caution_string)
|
||||
print " } while(P0(!!%s));" % var_name
|
||||
print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
|
||||
print " *a->groups = controlVal;"
|
||||
@ -146,7 +254,17 @@ class MT(MatcherBase):
|
||||
|
||||
def produce_code(self):
|
||||
print self.produce_header(visible = True, header_only = False)
|
||||
print self.produce_common_declarations()
|
||||
print """
|
||||
const u8 * buf = a->buf;
|
||||
const size_t len = a->len;
|
||||
const u8 * ptr = buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t * control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 * tryFloodDetect = a->firstFloodDetect;
|
||||
const struct FDRConfirm *fdrc;
|
||||
u32 last_match = (u32)-1;
|
||||
"""
|
||||
print
|
||||
|
||||
self.produce_needed_temporaries(self.num_iterations)
|
||||
@ -179,10 +297,17 @@ class MT(MatcherBase):
|
||||
print " ptr += 16;"
|
||||
print " }"
|
||||
|
||||
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
|
||||
print " __builtin_prefetch(ptr + (iterBytes*4));"
|
||||
print self.produce_flood_check()
|
||||
|
||||
print """
|
||||
for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
if (P0(ptr > tryFloodDetect)) {
|
||||
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
|
||||
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
"""
|
||||
for iter in range(self.num_iterations):
|
||||
self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)
|
||||
|
||||
@ -192,7 +317,11 @@ class MT(MatcherBase):
|
||||
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
|
||||
print " }"
|
||||
|
||||
print self.produce_footer()
|
||||
print """
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
"""
|
||||
|
||||
def produce_compile_call(self):
|
||||
packed_str = { False : "false", True : "true"}[self.packed]
|
||||
@ -256,7 +385,17 @@ class MTFat(MT):
|
||||
|
||||
def produce_code(self):
|
||||
print self.produce_header(visible = True, header_only = False)
|
||||
print self.produce_common_declarations()
|
||||
print """
|
||||
const u8 * buf = a->buf;
|
||||
const size_t len = a->len;
|
||||
const u8 * ptr = buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t * control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 * tryFloodDetect = a->firstFloodDetect;
|
||||
const struct FDRConfirm *fdrc;
|
||||
u32 last_match = (u32)-1;
|
||||
"""
|
||||
print
|
||||
|
||||
self.produce_needed_temporaries(self.num_iterations)
|
||||
@ -289,9 +428,17 @@ class MTFat(MT):
|
||||
print " ptr += 16;"
|
||||
print " }"
|
||||
|
||||
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
|
||||
print " __builtin_prefetch(ptr + (iterBytes*4));"
|
||||
print self.produce_flood_check()
|
||||
print """
|
||||
for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
if (P0(ptr > tryFloodDetect)) {
|
||||
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
|
||||
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
for iter in range(self.num_iterations):
|
||||
self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)
|
||||
@ -302,7 +449,11 @@ class MTFat(MT):
|
||||
self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
|
||||
print " }"
|
||||
|
||||
print self.produce_footer()
|
||||
print """
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
"""
|
||||
|
||||
def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
|
||||
cautious, save_old):
|
||||
@ -367,7 +518,33 @@ class MTFat(MT):
|
||||
print "#endif"
|
||||
print " }"
|
||||
|
||||
class MTFast(MatcherBase):
|
||||
class MTFast:
|
||||
def produce_header(self, visible, header_only = False):
|
||||
s = ""
|
||||
if not visible:
|
||||
s += "static never_inline"
|
||||
s += """
|
||||
hwlm_error_t %s(UNUSED const struct FDR *fdr,
|
||||
UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
|
||||
if header_only:
|
||||
s += ";"
|
||||
else:
|
||||
s += "{"
|
||||
s += "\n"
|
||||
return s
|
||||
|
||||
def produce_guard(self):
|
||||
print self.arch.get_guard()
|
||||
|
||||
def produce_zero_alternative(self):
|
||||
print """
|
||||
#else
|
||||
#define %s 0
|
||||
#endif
|
||||
""" % self.get_name()
|
||||
|
||||
def close_guard(self):
|
||||
print "#endif"
|
||||
|
||||
def produce_confirm(self, cautious):
|
||||
if cautious:
|
||||
@ -376,24 +553,52 @@ class MTFast(MatcherBase):
|
||||
cautious_str = "NOT_CAUTIOUS"
|
||||
|
||||
print " for (u32 i = 0; i < arrCnt; i++) {"
|
||||
print " byte = bitArr[i] / 8;"
|
||||
print " u32 byte = bitArr[i] / 8;"
|
||||
if self.packed:
|
||||
conf_split_mask = IntegerType(32).constant_to_string(
|
||||
self.conf_top_level_split - 1)
|
||||
print " bitRem = bitArr[i] % 8;"
|
||||
print " confSplit = *(ptr+byte) & 0x1f;"
|
||||
print " idx = confSplit * %d + bitRem;" % self.num_buckets
|
||||
print " cf = confBase[idx];"
|
||||
print " u32 bitRem = bitArr[i] % 8;"
|
||||
print " u32 confSplit = *(ptr+byte) & 0x1f;"
|
||||
print " u32 idx = confSplit * %d + bitRem;" % self.num_buckets
|
||||
print " u32 cf = confBase[idx];"
|
||||
print " if (!cf)"
|
||||
print " continue;"
|
||||
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
|
||||
print " if (!(fdrc->groups & *control))"
|
||||
print " continue;"
|
||||
print " confWithBit(fdrc, a, ptr - buf + byte, %s, 0, control, &last_match);" % cautious_str
|
||||
print """
|
||||
CautionReason reason = %s;
|
||||
CONF_TYPE v;
|
||||
const u8 * confirm_loc = ptr + byte - 7;
|
||||
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
v = lv_u64a(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
v = lv_u64a_ce(confirm_loc, buf, buf + len);
|
||||
// stitch together v (which doesn't move) and history (which does)
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
v |= histBytes;
|
||||
}""" % (cautious_str)
|
||||
print " confWithBit(fdrc, a, ptr - buf + byte, 0, control, &last_match, v);"
|
||||
else:
|
||||
print " cf = confBase[bitArr[i] % 8];"
|
||||
print " u32 cf = confBase[bitArr[i] % 8];"
|
||||
print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
|
||||
print " confWithBit1(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % cautious_str
|
||||
print """
|
||||
CautionReason reason = %s;
|
||||
CONF_TYPE v;
|
||||
const u8 * confirm_loc = ptr + byte - 7;
|
||||
if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
|
||||
v = lv_u64a(confirm_loc, buf, buf + len);
|
||||
} else { // r == VECTORING, confirm_loc < buf
|
||||
u64a histBytes = a->histBytes;
|
||||
v = lv_u64a_ce(confirm_loc, buf, buf + len);
|
||||
// stitch together v (which doesn't move) and history (which does)
|
||||
u32 overhang = buf - confirm_loc;
|
||||
histBytes >>= 64 - (overhang * 8);
|
||||
v |= histBytes;
|
||||
}""" % (cautious_str)
|
||||
print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);"
|
||||
print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
|
||||
print " *a->groups = controlVal;"
|
||||
print " return HWLM_TERMINATED;"
|
||||
@ -467,7 +672,17 @@ class MTFast(MatcherBase):
|
||||
|
||||
def produce_code(self):
|
||||
print self.produce_header(visible = True, header_only = False)
|
||||
print self.produce_common_declarations()
|
||||
print """
|
||||
const u8 * buf = a->buf;
|
||||
const size_t len = a->len;
|
||||
const u8 * ptr = buf + a->start_offset;
|
||||
hwlmcb_rv_t controlVal = *a->groups;
|
||||
hwlmcb_rv_t * control = &controlVal;
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START;
|
||||
const u8 * tryFloodDetect = a->firstFloodDetect;
|
||||
const struct FDRConfirm *fdrc;
|
||||
u32 last_match = (u32)-1;
|
||||
"""
|
||||
print
|
||||
|
||||
self.produce_needed_temporaries(self.num_iterations)
|
||||
@ -498,9 +713,18 @@ class MTFast(MatcherBase):
|
||||
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
|
||||
print " ptr += 32;"
|
||||
print " }"
|
||||
print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {"
|
||||
print " __builtin_prefetch(ptr + (iterBytes*4));"
|
||||
print self.produce_flood_check()
|
||||
print """
|
||||
for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
|
||||
__builtin_prefetch(ptr + (iterBytes*4));
|
||||
if (P0(ptr > tryFloodDetect)) {
|
||||
tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
|
||||
if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
|
||||
*a->groups = controlVal;
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
for iter in range (0, self.num_iterations):
|
||||
self.produce_one_iteration_state_calc(iter = iter, cautious = False)
|
||||
print " arrCnt = 0;"
|
||||
@ -514,7 +738,11 @@ class MTFast(MatcherBase):
|
||||
self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
|
||||
print " }"
|
||||
|
||||
print self.produce_footer()
|
||||
print """
|
||||
*a->groups = controlVal;
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
"""
|
||||
|
||||
def get_name(self):
|
||||
if self.packed:
|
||||
|
@ -450,11 +450,19 @@ void maintainHistoryBuffer(const struct RoseEngine *rose, char *state,
|
||||
|
||||
static really_inline
|
||||
void init_stream(struct hs_stream *s, const struct RoseEngine *rose) {
|
||||
char *state = getMultiState(s);
|
||||
|
||||
// Make absolutely sure that the 16 bytes leading up to the end of the
|
||||
// history buffer are initialised, as we rely on this (regardless of the
|
||||
// actual values used) in FDR.
|
||||
char *hist_end = state + rose->stateOffsets.history + rose->historyRequired;
|
||||
assert(hist_end - 16 >= (const char *)s);
|
||||
unaligned_store_u64a(hist_end - 16, 0xDEADDEADDEADDEADull);
|
||||
unaligned_store_u64a(hist_end - 8, 0xDEADDEADDEADDEADull);
|
||||
|
||||
s->rose = rose;
|
||||
s->offset = 0;
|
||||
|
||||
char *state = getMultiState(s);
|
||||
|
||||
setStreamStatus(state, 0);
|
||||
roseInitState(rose, state);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -403,8 +403,11 @@ TEST_P(FDRFloodp, WithMask) {
|
||||
TEST_P(FDRFloodp, StreamingMask) {
|
||||
const u32 hint = GetParam();
|
||||
SCOPED_TRACE(hint);
|
||||
const size_t fake_history_size = 16;
|
||||
const vector<u8> fake_history(fake_history_size, 0);
|
||||
const size_t dataSize = 1024;
|
||||
vector<u8> data(dataSize);
|
||||
vector<u8> tempdata(dataSize + fake_history_size); // headroom
|
||||
u8 c = '\0';
|
||||
|
||||
while (1) {
|
||||
@ -487,18 +490,28 @@ TEST_P(FDRFloodp, StreamingMask) {
|
||||
|
||||
for (u32 streamChunk = 1; streamChunk <= 16; streamChunk *= 2) {
|
||||
matchesCounts.clear();
|
||||
fdrStatus = fdrExecStreaming(fdr.get(), nullptr, 0, &data[0], streamChunk,
|
||||
0, countCallback, &matchesCounts, HWLM_ALL_GROUPS, nullptr);
|
||||
const u8 *d = data.data();
|
||||
// reference past the end of fake history to allow headroom
|
||||
const u8 *fhist = fake_history.data() + fake_history_size;
|
||||
fdrStatus = fdrExecStreaming(fdr.get(), fhist, 0, d, streamChunk, 0,
|
||||
countCallback, &matchesCounts,
|
||||
HWLM_ALL_GROUPS, nullptr);
|
||||
ASSERT_EQ(0, fdrStatus);
|
||||
for (u32 j = streamChunk; j < dataSize; j += streamChunk) {
|
||||
if (j < 8) {
|
||||
fdrStatus = fdrExecStreaming(fdr.get(), &data[0], j,
|
||||
&data[0] + j, streamChunk, 0, countCallback,
|
||||
&matchesCounts, HWLM_ALL_GROUPS, nullptr);
|
||||
if (j < 16) {
|
||||
/* allow 16 bytes headroom on read to avoid invalid
|
||||
* memory read during the FDR zone creation.*/
|
||||
memset(tempdata.data(), c, dataSize + fake_history_size);
|
||||
const u8 *tmp_d = tempdata.data() + fake_history_size;
|
||||
fdrStatus = fdrExecStreaming(fdr.get(), tmp_d, j, tmp_d + j,
|
||||
streamChunk, 0, countCallback,
|
||||
&matchesCounts,
|
||||
HWLM_ALL_GROUPS, nullptr);
|
||||
} else {
|
||||
fdrStatus = fdrExecStreaming(fdr.get(), &data[0] + j - 8,
|
||||
8, &data[0] + j, streamChunk, 0, countCallback,
|
||||
&matchesCounts, HWLM_ALL_GROUPS, nullptr);
|
||||
fdrStatus = fdrExecStreaming(fdr.get(), d + j - 8, 8, d + j,
|
||||
streamChunk, 0, countCallback,
|
||||
&matchesCounts,
|
||||
HWLM_ALL_GROUPS, nullptr);
|
||||
}
|
||||
ASSERT_EQ(0, fdrStatus);
|
||||
}
|
||||
|