diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e118dd0..ad7bb3f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -964,7 +964,7 @@ endif() add_library(hs STATIC ${hs_SRCS} $) add_dependencies(hs ragel_Parser) -add_dependencies(hs autogen_compiler autogen_teddy_compiler) +add_dependencies(hs autogen_teddy_compiler) if (NOT BUILD_SHARED_LIBS) install(TARGETS hs DESTINATION lib) diff --git a/src/fdr/CMakeLists.txt b/src/fdr/CMakeLists.txt index 1436c3fc..7bbf82ff 100644 --- a/src/fdr/CMakeLists.txt +++ b/src/fdr/CMakeLists.txt @@ -5,8 +5,6 @@ set(AUTOGEN_PY_FILES arch.py autogen.py autogen_utils.py - base_autogen.py - fdr_autogen.py teddy_autogen.py ) @@ -22,18 +20,14 @@ endfunction(fdr_autogen) #now build the functions fdr_autogen(runtime fdr_autogen.c) -fdr_autogen(compiler fdr_autogen_compiler.cpp) fdr_autogen(teddy_runtime teddy_autogen.c) fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp) set(fdr_GENERATED_SRC ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c - ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen_compiler.cpp ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp PARENT_SCOPE) set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE) include_directories(${CMAKE_CURRENT_BINARY_DIR}) - - diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py index e5b4f39e..a8510487 100755 --- a/src/fdr/autogen.py +++ b/src/fdr/autogen.py @@ -1,6 +1,6 @@ #!/usr/bin/python -# Copyright (c) 2015, Intel Corporation +# Copyright (c) 2015-2016, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -27,41 +27,9 @@ import sys from autogen_utils import * -from fdr_autogen import * from teddy_autogen import * from arch import * -# FDR setup - -# these are either produced - if the guard succeeds, or #defined to zeroes. 
-# either the function or the zero is fine in our array of function pointers - -def produce_fdr_runtimes(l): - for m in l: - m.produce_code() - -def produce_fdr_compiles(l): - print "void getFdrDescriptions(vector *out) {" - print " static const FDREngineDef defns[] = {" - for m in l: - m.produce_compile_call() - print " };" - print " out->clear();" - print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {" - print " out->push_back(FDREngineDescription(defns[i]));" - print " }" - print "}" - -def build_fdr_matchers(): - all_matchers = [ ] - strides = [ 1, 2, 4 ] - - common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 } - for s in strides: - all_matchers += [ M3(stride = s, **common) ] - - return all_matchers - # teddy setup def build_teddy_matchers(): @@ -124,7 +92,8 @@ def make_fdr_function_pointers(matcher_list): typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a); static FDRFUNCTYPE funcs[] = { """ - all_funcs = ",\n".join([ " %s" % m.get_name() for m in matcher_list ]) + all_funcs = " fdr_engine_exec,\n" + all_funcs += ",\n".join([ " %s" % m.get_name() for m in matcher_list ]) print all_funcs print """ }; @@ -138,16 +107,11 @@ def assign_ids(matcher_list, next_id): # Main entry point -m = build_fdr_matchers() -next_id = assign_ids(m, 0) tm = build_teddy_matchers() -next_id = assign_ids(tm, next_id) -if sys.argv[1] == "compiler": - produce_fdr_compiles(m) -elif sys.argv[1] == "runtime": - produce_fdr_runtimes(m) +next_id = assign_ids(tm, 1) +if sys.argv[1] == "runtime": produce_teddy_headers(tm) - make_fdr_function_pointers(m+tm) + make_fdr_function_pointers(tm) elif sys.argv[1] == "teddy_runtime": produce_teddy_runtimes(tm) elif sys.argv[1] == "teddy_compiler": diff --git a/src/fdr/autogen_utils.py b/src/fdr/autogen_utils.py index e3679ad9..3544bc7b 100755 --- a/src/fdr/autogen_utils.py +++ b/src/fdr/autogen_utils.py @@ -1,6 +1,6 @@ #!/usr/bin/python -# Copyright (c) 2015, Intel Corporation +# Copyright (c) 2015-2016, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -41,9 +41,6 @@ class IntegerType: def size_in_bytes(self): return self.size / 8 - def isSIMDOnIntel(self): - return False - def zero_expression(self): return "0" @@ -63,15 +60,9 @@ class IntegerType: def lowbit_mask(self, n): return self.constant_to_string(self.lowbits(n)) - def highbit_mask(self, n): - return self.constant_to_string(self.highbits(n)) - def lowbit_extract_expr(self, expr_string, n): return "(%s & %s)" % ( expr_string, self.lowbit_mask(n)) - def highbit_extract_expr(self, expr_string, n): - return "(%s >> %d)" % (expr_string, self.size - n) - def flip_lowbits_expr(self, expr_string, n): return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n)) @@ -90,36 +81,10 @@ class IntegerType: else: return "(%s)" % (expr_string) - # code is: - # "normal" (always between buf and len) - the default - # "aligned" (means normal + aligned to a natural boundary) - # "cautious_forward" (means may go off the end of buf+len) - # "cautious_backwards" (means may go off the start of buf) - # "cautious_everywhere" (means may go off both) - - def load_expr_data(self, offset = 0, code = "normal", - base_string = "ptr", bounds_lo = "buf", bounds_hi = "buf + len"): - if code is "normal": - return "lv_%s(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi) - elif code is "aligned": - if self.size is 8: - 
fail_out("no aligned byte loads") - return "lv_%s_a(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi) - elif code is "cautious_forward": - return "lv_%s_cf(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi) - elif code is "cautious_backward": - return "lv_%s_cb(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi) - elif code is "cautious_everywhere": - return "lv_%s_ce(%s + %d, %s, %s)" % (self.get_name(), base_string, offset, bounds_lo, bounds_hi) - - class SIMDIntegerType(IntegerType): def __init__(self, size): IntegerType.__init__(self, size) - def isSIMDOnIntel(self): - return True - def zero_expression(self): return "zeroes128()" @@ -132,9 +97,6 @@ class SIMDIntegerType(IntegerType): tmpExpr = "movq(%s)" % expr_string return tmpType.lowbit_extract_expr(tmpExpr, n) - def highbit_extract_expr(self, expr_string, n): - fail_out("Unimplemented high bit extract on m128") - def bit_extract_expr(self, expr_string, low, high, flip): fail_out("Unimplemented bit extract on m128") @@ -146,9 +108,9 @@ class SIMDIntegerType(IntegerType): if n <= -self.size or n >= self.size: return self.zero_expression() elif (n > 0): - return "_mm_slli_si128(%s, %s)" % (expr_string, n / 8) + return "byteShiftLeft128(%s, %s)" % (expr_string, n / 8) elif (n < 0): - return "_mm_srli_si128(%s, %s)" % (expr_string, -n / 8) + return "byteShiftRight128(%s, %s)" % (expr_string, -n / 8) else: return "(%s)" % (expr_string) @@ -156,130 +118,3 @@ class SIMDIntegerType(IntegerType): if n % 8 != 0: fail_out("Trying to make a lowbit mask in a m128 by a bit granular value") return self.shift_expr("ones128()", -(128 - n)) - -def getRequiredType(bits): - if bits == 128: - return SIMDIntegerType(bits) - for b in [ 8, 16, 32, 64]: - if (bits <= b): - return IntegerType(b) - return None - -class IntegerVariable: - def __init__(self, name, type): - self.name = name - self.type = type - - def gen_initializer_stmt(self, initialization_string = None): - if initialization_string: - return "%s %s = %s;" % (self.type.get_name(), self.name, initialization_string) - else: - return "%s %s;" % (self.type.get_name(), self.name) - - -class Step: - def __init__(self, context, offset = 0): - self.context = context - self.matcher = context.matcher - self.offset = offset - self.latency = 1 - self.dependency_list = [] - self.latest = None - self.context.add_step(self) - - # return a string, complete with indentation - def emit(self): - indent = " " * (self.offset*2 + self.matcher.default_body_indent) - s = "\n".join( [ indent + line for line in self.val.split("\n")] ) - if self.latest: - s += " // " + str(self.debug_step) + " L" + str(self.latency) + " LTST:%d" % self.latest - if self.dependency_list: - s += " Derps: " - for (d,l) in self.dependency_list: - s += "%d/%d " % (d.debug_step,l) - return s - - def add_dependency(self, step, anti_dependency = False, output_dependency = False): - if anti_dependency or output_dependency: - self.dependency_list += [ (step, 1) ] - else: - self.dependency_list += [ (step, step.latency) ] - - def nv(self, type, var_name): - return self.context.new_var(self, type, var_name) - - def gv(self, var_name, reader = True, writer = False): - return self.context.get_var(self, var_name, reader = reader, writer = writer) - -# utility steps, generic - -class LabelStep(Step): - def __init__(self, context, offset = 0, label_prefix = "off"): - Step.__init__(self, context, offset) - self.val = "%s%d: UNUSED;" % (label_prefix, offset) - -class 
OpenScopeStep(Step): - def __init__(self, context, offset = 0): - Step.__init__(self, context, offset) - self.val = "{" - -class CloseScopeStep(Step): - def __init__(self, context, offset = 0): - Step.__init__(self, context, offset) - self.val = "}" - - -class CodeGenContext: - def __init__(self, matcher): - self.vars = {} - self.steps = [] - self.ctr = 0 - self.matcher = matcher - self.var_writer = {} # var to a single writer - self.var_readers = {} # var to a list of all the readers that read the last value - - def new_var(self, step, type, var_name): - var = IntegerVariable(var_name, type) - self.vars[var_name] = var - self.var_writer[var_name] = step - return var - - def get_var(self, step, var_name, reader = True, writer = False): - if reader: - writer_step = self.var_writer[var_name] - if writer_step: - step.add_dependency(writer_step) - self.var_readers.setdefault(var_name, []).append(step) - if writer and not reader: - if self.var_writer[var_name]: - step.add_dependency(self.var_writer[var_name], output_dependency = True) - if writer: - if self.var_readers.has_key(var_name): - for reader in [ r for r in self.var_readers[var_name] if r is not step ]: - step.add_dependency(reader, anti_dependency = True) - self.var_readers[var_name] = [] - self.var_writer[var_name] = step - return self.vars[var_name] - - def add_step(self, step): - self.steps += [ step ] - step.debug_step = self.ctr - self.ctr += 1 - - def dontschedule(self, finals): - return "\n".join( [ s.emit() for s in self.steps ] ) - - def schedule(self, finals): - for f in finals: - f.latest = f.latency - worklist = finals - while worklist: - current = worklist[0] - worklist = worklist[1:] - for (dep, lat) in current.dependency_list: - if dep.latest is None or dep.latest < (current.latest + dep.latency): - dep.latest = current.latest + lat - if dep not in worklist: - worklist += [ dep ] - self.steps.sort(reverse = True, key = lambda s : s.latest) - return "\n".join( [ s.emit() for s in self.steps ] ) diff --git a/src/fdr/base_autogen.py b/src/fdr/base_autogen.py deleted file mode 100644 index c9cf1b37..00000000 --- a/src/fdr/base_autogen.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2015, Intel Corporation -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys -from autogen_utils import * -from base_autogen import * -from string import Template - -class MatcherBase: - - def __init__(self): - pass - - def get_name(self): - return "fdr_exec_%03d" % self.id - - def produce_header(self, visible, header_only = False): - s = "" - if not visible: - s += "static never_inline" - s += """ -hwlm_error_t %s(UNUSED const struct FDR *fdr, - UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name() - if header_only: - s += ";" - else: - s += "{" - s += "\n" - return s - - def produce_guard(self): - print self.arch.get_guard() - - def produce_zero_alternative(self): - print """ -#else -#define %s 0 -#endif -""" % self.get_name() - - # trivial function for documentation/modularity - def close_guard(self): - print "#endif" - - def produce_common_declarations(self): - return """ - const u8 * buf = a->buf; - const size_t len = a->len; - const u8 * ptr = buf + a->start_offset; - hwlmcb_rv_t controlVal = *a->groups; - hwlmcb_rv_t * control = &controlVal; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 * tryFloodDetect = a->firstFloodDetect; - UNUSED u32 bit, bitRem, confSplit, idx; - u32 byte, cf; - const struct FDRConfirm *fdrc; - u32 last_match = (u32)-1; -""" - - def produce_continue_check(self): - return """if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { - *a->groups = controlVal; - return HWLM_TERMINATED; -} -""" - def produce_flood_check(self): - return """ - if (P0(ptr > tryFloodDetect)) { - tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes); - if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { - *a->groups = controlVal; - return HWLM_TERMINATED; - } - } -""" - - def produce_footer(self): - return """ - *a->groups = controlVal; - return HWLM_SUCCESS; -} -""" - - def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False): - if cautious: - caution_string = "VECTORING" - else: - caution_string = "NOT_CAUTIOUS" - conf_split_mask = IntegerType(32).constant_to_string( - self.conf_top_level_split - 1) - if enable_confirmless: - quick_check_string = """ - if (!fdrc->mult) { - u32 id = fdrc->nBitsOrSoleID; - if ((last_match == id) && (fdrc->flags & NoRepeat)) - continue; - last_match = id; - controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt); - continue; - } """ - else: - quick_check_string = "" - if do_bailout: - bailout_string = """ - if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;""" - else: - bailout_string = "" - - return Template(""" -if (P0(!!$CONFVAR)) { - do { - bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR); - byte = bit / $NUM_BUCKETS + $OFFSET; - bitRem = bit % $NUM_BUCKETS; - $BAILOUT_STRING - confSplit = *(ptr+byte) & $SPLIT_MASK; - idx = confSplit * $NUM_BUCKETS + bitRem; - cf = confBase[idx]; - if (!cf) - continue; - fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) - continue; - $QUICK_CHECK_STRING - 
confWithBit(fdrc, a, ptr - buf + byte, $CAUTION_STRING, $CONF_PULL_BACK, control, &last_match); - } while(P0(!!$CONFVAR)); - if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { - *a->groups = controlVal; - return HWLM_TERMINATED; - } -}""").substitute(CONFVAR = conf_var_name, - CONFVAR_SIZE = conf_var_size, - NUM_BUCKETS = self.num_buckets, - OFFSET = offset, - SPLIT_MASK = conf_split_mask, - QUICK_CHECK_STRING = quick_check_string, - BAILOUT_STRING = bailout_string, - CAUTION_STRING = caution_string, - CONF_PULL_BACK = self.conf_pull_back) - - -def indent(block, depth): - return "\n".join([ (" " * (4*depth)) + line for line in block.splitlines() ] ) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index f83a4265..c955680b 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,28 +26,752 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "util/simd_utils.h" - -#define P0(cnd) unlikely(cnd) - #include "fdr.h" -#include "fdr_internal.h" -#include "teddy_internal.h" - -#include "flood_runtime.h" - #include "fdr_confirm.h" #include "fdr_confirm_runtime.h" -#include "fdr_streaming_runtime.h" +#include "fdr_internal.h" #include "fdr_loadval.h" +#include "fdr_streaming_runtime.h" +#include "flood_runtime.h" +#include "teddy_internal.h" +#include "util/simd_utils.h" +#include "util/simd_utils_ssse3.h" + +/** \brief number of bytes processed in each iteration */ +#define ITER_BYTES 16 + +/** \brief total zone buffer size */ +#define ZONE_TOTAL_SIZE 64 + +/** \brief maximum number of allowed zones */ +#define ZONE_MAX 3 + +/** \brief zone information. + * + * Zone represents a region of data to scan in FDR. + * + * The incoming buffer is to split in multiple zones to ensure two properties: + * 1: that we can read 8? bytes behind to generate a hash safely + * 2: that we can read the byte after the current byte (domain > 8) + */ +struct zone { + /** \brief copied buffer, used only when it is a boundary zone. */ + u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE]; + + /** \brief shift amount for fdr state to avoid unwanted match. */ + u8 shift; + + /** \brief if boundary zone, start points into the zone buffer after the + * pre-padding. Otherwise, points to the main buffer, appropriately. */ + const u8 *start; + + /** \brief if boundary zone, end points to the end of zone. Otherwise, + * pointer to the main buffer, appropriately. */ + const u8 *end; + + /** \brief the amount to adjust to go from a pointer in the zones region + * (between start and end) to a pointer in the original data buffer. */ + ptrdiff_t zone_pointer_adjust; + + /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones, + * otherwise end of the zone buf. floodPtr always points inside the same + * buffer as the start pointe. 
*/ + const u8 *floodPtr; +}; + +static +const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } +}; + +/* generates an initial state mask based on the last byte-ish of history rather + * than being all accepting. If there is no history to consider, the state is + * generated based on the minimum length of each bucket in order to prevent + * confirms. 
+ */ +static really_inline +m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft, + const struct zone *z) { + m128 s; + if (len_history) { + /* +1: the zones ensure that we can read the byte at z->end */ + u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); + tmp &= fdr->domainMask; + s = *((const m128 *)ft + tmp); + s = shiftRight8Bits(s); + } else { + s = fdr->start; + } + return s; +} + +static really_inline +void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, + u64a domain_mask_adjusted, const u8 *ft, u64a *conf0, + u64a *conf8, m128 *s) { + /* +1: the zones ensure that we can read the byte at z->end */ + + u64a current_data_0; + u64a current_data_8; + + current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr); + u64a v7 = (lv_u16(itPtr + 7, start_ptr, end_ptr + 1) << 1) & + domain_mask_adjusted; + u64a v0 = (current_data_0 << 1) & domain_mask_adjusted; + u64a v1 = (current_data_0 >> 7) & domain_mask_adjusted; + u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted; + u64a v3 = (current_data_0 >> 23) & domain_mask_adjusted; + u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted; + u64a v5 = (current_data_0 >> 39) & domain_mask_adjusted; + u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted; + current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr); + u64a v15 = (lv_u16(itPtr + 15, start_ptr, end_ptr + 1) << 1) & + domain_mask_adjusted; + u64a v8 = (current_data_8 << 1) & domain_mask_adjusted; + u64a v9 = (current_data_8 >> 7) & domain_mask_adjusted; + u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted; + u64a v11 = (current_data_8 >> 23) & domain_mask_adjusted; + u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted; + u64a v13 = (current_data_8 >> 39) & domain_mask_adjusted; + u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted; + + m128 st0 = *(const m128 *)(ft + v0*8); + m128 st1 = *(const m128 *)(ft + v1*8); + m128 st2 = *(const m128 *)(ft + v2*8); + m128 st3 = *(const m128 *)(ft + v3*8); + m128 st4 = *(const m128 *)(ft + v4*8); + m128 st5 = *(const m128 *)(ft + v5*8); + m128 st6 = *(const m128 *)(ft + v6*8); + m128 st7 = *(const m128 *)(ft + v7*8); + m128 st8 = *(const m128 *)(ft + v8*8); + m128 st9 = *(const m128 *)(ft + v9*8); + m128 st10 = *(const m128 *)(ft + v10*8); + m128 st11 = *(const m128 *)(ft + v11*8); + m128 st12 = *(const m128 *)(ft + v12*8); + m128 st13 = *(const m128 *)(ft + v13*8); + m128 st14 = *(const m128 *)(ft + v14*8); + m128 st15 = *(const m128 *)(ft + v15*8); + + st1 = byteShiftLeft128(st1, 1); + st2 = byteShiftLeft128(st2, 2); + st3 = byteShiftLeft128(st3, 3); + st4 = byteShiftLeft128(st4, 4); + st5 = byteShiftLeft128(st5, 5); + st6 = byteShiftLeft128(st6, 6); + st7 = byteShiftLeft128(st7, 7); + st9 = byteShiftLeft128(st9, 1); + st10 = byteShiftLeft128(st10, 2); + st11 = byteShiftLeft128(st11, 3); + st12 = byteShiftLeft128(st12, 4); + st13 = byteShiftLeft128(st13, 5); + st14 = byteShiftLeft128(st14, 6); + st15 = byteShiftLeft128(st15, 7); + + *s = or128(*s, st0); + *s = or128(*s, st1); + *s = or128(*s, st2); + *s = or128(*s, st3); + *s = or128(*s, st4); + *s = or128(*s, st5); + *s = or128(*s, st6); + *s = or128(*s, st7); + *conf0 = movq(*s); + *s = byteShiftRight128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st9); + *s = or128(*s, st10); + *s = or128(*s, st11); + *s = or128(*s, st12); + *s = or128(*s, st13); + *s = or128(*s, st14); + *s = or128(*s, st15); + *conf8 = movq(*s); + *s = byteShiftRight128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline 
+void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, + u64a domain_mask_adjusted, const u8 *ft, u64a *conf0, + u64a *conf8, m128 *s) { + u64a current_data_0; + u64a current_data_8; + + current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr); + u64a v0 = (current_data_0 << 1) & domain_mask_adjusted; + u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted; + u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted; + u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted; + current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr); + u64a v8 = (current_data_8 << 1) & domain_mask_adjusted; + u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted; + u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted; + u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted; + + m128 st0 = *(const m128 *)(ft + v0*8); + m128 st2 = *(const m128 *)(ft + v2*8); + m128 st4 = *(const m128 *)(ft + v4*8); + m128 st6 = *(const m128 *)(ft + v6*8); + m128 st8 = *(const m128 *)(ft + v8*8); + m128 st10 = *(const m128 *)(ft + v10*8); + m128 st12 = *(const m128 *)(ft + v12*8); + m128 st14 = *(const m128 *)(ft + v14*8); + + st2 = byteShiftLeft128(st2, 2); + st4 = byteShiftLeft128(st4, 4); + st6 = byteShiftLeft128(st6, 6); + st10 = byteShiftLeft128(st10, 2); + st12 = byteShiftLeft128(st12, 4); + st14 = byteShiftLeft128(st14, 6); + + *s = or128(*s, st0); + *s = or128(*s, st2); + *s = or128(*s, st4); + *s = or128(*s, st6); + *conf0 = movq(*s); + *s = byteShiftRight128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st10); + *s = or128(*s, st12); + *s = or128(*s, st14); + *conf8 = movq(*s); + *s = byteShiftRight128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, + u64a domain_mask_adjusted, const u8 *ft, u64a *conf0, + u64a *conf8, m128 *s) { + u64a current_data_0; + u64a current_data_8; + + current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr); + u64a v0 = (current_data_0 << 1) & domain_mask_adjusted; + u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted; + current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr); + u64a v8 = (current_data_8 << 1) & domain_mask_adjusted; + u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted; + + m128 st0 = *(const m128 *)(ft + v0*8); + m128 st4 = *(const m128 *)(ft + v4*8); + m128 st8 = *(const m128 *)(ft + v8*8); + m128 st12 = *(const m128 *)(ft + v12*8); + + st4 = byteShiftLeft128(st4, 4); + st12 = byteShiftLeft128(st12, 4); + + *s = or128(*s, st0); + *s = or128(*s, st4); + *conf0 = movq(*s); + *s = byteShiftRight128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st12); + *conf8 = movq(*s); + *s = byteShiftRight128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *controlVal, + const u32 *confBase, const struct FDR_Runtime_Args *a, + const u8 *ptr, hwlmcb_rv_t *control, u32 *last_match_id, + struct zone *z) { + const u8 bucket = 8; + const u8 pullback = 1; + + if (likely(!*conf)) { + return; + } + + /* ptr is currently referring to a location in the zone's buffer, we also + * need a pointer in the original, main buffer for the final string compare. 
+ */ + const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); + + const u8 *confLoc = ptr; + + do { + u32 bit = findAndClearLSB_64(conf); + u32 byte = bit / bucket + offset; + u32 bitRem = bit % bucket; + u32 confSplit = *(ptr + byte); + u32 idx = confSplit * bucket + bitRem; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + if (!fdrc->mult) { + u32 id = fdrc->nBitsOrSoleID; + if ((*last_match_id == id) && (fdrc->flags & NoRepeat)) { + continue; + } + *last_match_id = id; + *controlVal = a->cb(ptr_main + byte - a->buf, + ptr_main + byte - a->buf, id, a->ctxt); + continue; + } + u64a confVal = *(const u64a *)(confLoc + byte - sizeof(u64a)); + confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback, + control, last_match_id, confVal); + } while (unlikely(!!*conf)); +} + +static really_inline +void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) { +#ifdef DEBUG + DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf); + DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n", + z->start, z->end, z->shift); + DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n", + z->zone_pointer_adjust, z->floodPtr); + DEBUG_PRINTF("zone buf:"); + for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) { + if (i % 8 == 0) { + printf("_"); + } + if (z->buf[i]) { + printf("%02x", z->buf[i]); + } else { + printf(".."); + } + } + printf("\n"); +#endif +}; + +/** + * \brief Updates attributes for non-boundary region zone. + */ +static really_inline +void createMainZone(const u8 *flood, const u8 *begin, const u8 *end, + struct zone *z) { + z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */ + z->start = begin; + z->end = end; + z->floodPtr = flood; + z->shift = 0; +} + +/** + * \brief Create zone for short cases (<= ITER_BYTES). + * + * For this case we need to copy everything into the zone's internal buffer. + * + * We need to ensure that we run over real data if it exists (in history or + * before zone begin). We also need to ensure 8 bytes before any data being + * matched can be read (to perform a conf hash). + * + * We also need to ensure that the data at z->end can be read. + * + * Hence, the zone consists of: + * 16 bytes of history, + * 1 - 24 bytes of data form the buffer (ending at end), + * 1 byte of final padding + */ +static really_inline +void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin, + const u8 *end, struct zone *z) { + /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid + * the checks in boundary zone. */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + ptrdiff_t z_len = end - begin; + assert(z_len > 0); + assert(z_len <= ITER_BYTES); + + z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */ + + static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */ + + /* we are guaranteed to always have 16 initialised bytes at the end of + * the history buffer (they may be garbage coming from the stream state + * preceding hbuf, but bytes that don't correspond to actual history + * shouldn't affect computations). */ + *(m128 *)z->buf = loadu128(hend - sizeof(m128)); + + /* The amount of data we have to copy from main buffer. 
*/ + size_t copy_len = MIN((size_t)(end - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + + u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET; + switch (copy_len) { + case 1: + *zone_data = *(end - 1); + break; + case 2: + *(u16 *)zone_data = unaligned_load_u16(end - 2); + break; + case 3: + *(u16 *)zone_data = unaligned_load_u16(end - 3); + *(zone_data + 2) = *(end - 1); + break; + case 4: + *(u32 *)zone_data = unaligned_load_u32(end - 4); + break; + case 5: + case 6: + case 7: + /* perform copy with 2 overlapping 4-byte chunks from buf. */ + *(u32 *)zone_data = unaligned_load_u32(end - copy_len); + unaligned_store_u32(zone_data + copy_len - sizeof(u32), + unaligned_load_u32(end - sizeof(u32))); + break; + case 8: + *(u64a *)zone_data = unaligned_load_u64a(end - 8); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* perform copy with 2 overlapping 8-byte chunks from buf. */ + *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); + unaligned_store_u64a(zone_data + copy_len - sizeof(u64a), + unaligned_load_u64a(end - sizeof(u64a))); + break; + case 16: + /* copy 16-bytes from buf. */ + *(m128 *)zone_data = loadu128(end - 16); + break; + default: + assert(copy_len <= sizeof(m128) + sizeof(u64a)); + + /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks. + */ + *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); + storeu128(zone_data + copy_len - sizeof(m128), + loadu128(end - sizeof(m128))); + break; + } + + /* set the start and end location of the zone buf + * to be scanned */ + u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len; + assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang */ + *z_end = 0; + + z->end = z_end; + z->start = z_end - ITER_BYTES; + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); + assert(z->start + z->shift == z_end - z_len); +} + +/** + * \brief Create a zone for the start region. + * + * This function requires that there is > ITER_BYTES of data in the buffer to + * scan. The start zone itself is always responsible for scanning exactly + * ITER_BYTES of data - there are no warmup/junk bytes scanned. + * + * This zone ensures that the byte at z->end can be read and corresponds to + * the next byte of data. + * + * 8 bytes of history data are provided before z->start to allow proper hash + * generation in streaming mode. If buf != begin, upto 8 bytes of data + * prior to begin is also provided. + * + * Although we are not interested in bare literals which start before begin + * if buf != begin, lookarounds associated with the literal may require + * the data prior to begin for hash purposes. + */ +static really_inline +void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin, + struct zone *z) { + assert(ITER_BYTES == sizeof(m128)); + assert(sizeof(CONF_TYPE) == 8); + static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE); + + const u8 *end = begin + ITER_BYTES; + + /* set floodPtr to the end of zone buf to avoid checks in start zone */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + z->shift = 0; /* we are processing ITER_BYTES of real data */ + + /* we are guaranteed to always have 16 initialised bytes at the end of the + * history buffer (they may be garbage coming from the stream state + * preceding hbuf, but bytes that don't correspond to actual history + * shouldn't affect computations). 
However, for start zones, history is only + * required for conf hash purposes so we only need 8 bytes */ + unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a))); + + /* The amount of data we have to copy from main buffer. */ + size_t copy_len = MIN((size_t)(end - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + assert(copy_len >= 16); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang. The start requires that there is data after the zone so it + * it safe to dereference end */ + z->buf[ZONE_START_BEGIN + copy_len] = *end; + + /* set the start and end location of the zone buf to be scanned */ + u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len; + z->end = z_end; + z->start = z_end - ITER_BYTES; + + /* copy the first 8 bytes of the valid region */ + unaligned_store_u64a(z->buf + ZONE_START_BEGIN, + unaligned_load_u64a(end - copy_len)); + + /* copy the last 16 bytes, may overlap with the previous 8 byte write */ + storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); +} + +/** + * \brief Create a zone for the end region. + * + * This function requires that there is > ITER_BYTES of data in the buffer to + * scan. The end zone, however, is only responsible for a scanning the <= + * ITER_BYTES rump of data. The end zone is required to handle a full ITER_BYTES + * iteration as the main loop cannot handle the last byte of the buffer. + * + * This zone ensures that the byte at z->end can be read by filling it with a + * padding character. + * + * Upto 8 bytes of data prior to begin is also provided for the purposes of + * generating hashes. History is not copied, as all locations which require + * history for generating a hash are the responsiblity of the start zone. + */ +static really_inline +void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, + struct zone *z) { + /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid + * the checks in boundary zone. */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + ptrdiff_t z_len = end - begin; + assert(z_len > 0); + assert(z_len <= ITER_BYTES); + + z->shift = ITER_BYTES - z_len; + + /* The amount of data we have to copy from main buffer. */ + size_t copy_len = MIN((size_t)(end - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + assert(copy_len >= 16); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang */ + z->buf[copy_len] = 0; + + /* set the start and end location of the zone buf + * to be scanned */ + u8 *z_end = z->buf + copy_len; + z->end = z_end; + z->start = z_end - ITER_BYTES; + assert(z->start + z->shift == z_end - z_len); + + /* copy the first 8 bytes of the valid region */ + unaligned_store_u64a(z->buf, unaligned_load_u64a(end - copy_len)); + + /* copy the last 16 bytes, may overlap with the previous 8 byte write */ + storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); +} + +/** + * \brief Prepare zones. + * + * This function prepares zones with actual buffer and some padded bytes. + * The actual ITER_BYTES bytes in zone is preceded by main buf and/or + * history buf and succeeded by padded bytes possibly from main buf, + * if available. 
+ */ +static really_inline +size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, + size_t start, const u8 *flood, struct zone *zoneArr) { + const u8 *ptr = buf + start; + size_t remaining = len - start; + + if (remaining <= ITER_BYTES) { + /* enough bytes to make only one zone */ + createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]); + return 1; + } + + /* enough bytes to make more than one zone */ + + size_t numZone = 0; + createStartZone(buf, hend, ptr, &zoneArr[numZone++]); + ptr += ITER_BYTES; + + assert(ptr < buf + len); + + /* find maximum buffer location that the main zone can scan + * - must be a multiple of ITER_BYTES, and + * - cannot contain the last byte (due to overhang) + */ + const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 1, ITER_BYTES); + assert(main_end >= ptr); + + /* create a zone if multiple of ITER_BYTES are found */ + if (main_end != ptr) { + createMainZone(flood, ptr, main_end, &zoneArr[numZone++]); + ptr = main_end; + } + /* create a zone with rest of the data from the main buffer */ + createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]); + return numZone; +} + +#define INVALID_MATCH_ID (~0U) + +#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ + do { \ + const u8 *tryFloodDetect = zz->floodPtr; \ + const u8 *start_ptr = zz->start; \ + const u8 *end_ptr = zz->end; \ + \ + for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ + itPtr += ITER_BYTES) { \ + if (unlikely(itPtr > tryFloodDetect)) { \ + tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ + &floodBackoff, &controlVal, \ + ITER_BYTES); \ + if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ + } \ + __builtin_prefetch(itPtr + (ITER_BYTES*4)); \ + u64a conf0; \ + u64a conf8; \ + get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \ + ft, &conf0, &conf8, &s); \ + do_confirm_fdr(&conf0, 0, &controlVal, confBase, a, itPtr, \ + control, &last_match_id, zz); \ + do_confirm_fdr(&conf8, 8, &controlVal, confBase, a, itPtr, \ + control, &last_match_id, zz); \ + if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ + } /* end for loop */ \ + } while (0) \ + +static never_inline +hwlm_error_t fdr_engine_exec(const struct FDR *fdr, + const struct FDR_Runtime_Args *a) { + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t *control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + u32 last_match_id = INVALID_MATCH_ID; + u64a domain_mask_adjusted = fdr->domainMask << 1; + u8 stride = fdr->stride; + const u8 *ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR)); + const u32 *confBase = (const u32 *)(ft + fdr->tabSize); + struct zone zones[ZONE_MAX]; + assert(fdr->domain > 8 && fdr->domain < 16); + + size_t numZone = prepareZones(a->buf, a->len, + a->buf_history + a->len_history, + a->start_offset, a->firstFloodDetect, zones); + assert(numZone <= ZONE_MAX); + m128 state = getInitState(fdr, a->len_history, ft, &zones[0]); + + for (size_t curZone = 0; curZone < numZone; curZone++) { + struct zone *z = &zones[curZone]; + dumpZoneInfo(z, curZone); + + /* When a zone contains less data than is processed in an iteration + * of FDR_MAIN_LOOP(), we need to scan over some extra data. + * + * We have chosen to scan this extra data at the start of the + * iteration. The extra data is either data we have already scanned or + * garbage (if it is earlier than offset 0), + * + * As a result we need to shift the incoming state back so that it will + * properly line up with the data being scanned. 
+ * + * We also need to forbid reporting any matches in the data being + * rescanned as they have already been reported (or are over garbage but + * later stages should also provide that safety guarantee). + */ + + u8 shift = z->shift; + + state = variable_byte_shift_m128(state, shift); + + state = or128(state, load128(zone_or_mask[shift])); + + switch (stride) { + case 1: + FDR_MAIN_LOOP(z, state, get_conf_stride_1); + break; + case 2: + FDR_MAIN_LOOP(z, state, get_conf_stride_2); + break; + case 4: + FDR_MAIN_LOOP(z, state, get_conf_stride_4); + break; + default: + break; + } + } + + return HWLM_SUCCESS; +} + #include "fdr_autogen.c" #define FAKE_HISTORY_SIZE 16 static const u8 fake_history[FAKE_HISTORY_SIZE]; -hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t start, - HWLMCallback cb, void *ctxt, hwlm_group_t groups) { - +hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, void *ctxt, + hwlm_group_t groups) { const struct FDR_Runtime_Args a = { buf, len, @@ -73,7 +797,7 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, size_t st hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, size_t hlen, const u8 *buf, size_t len, size_t start, HWLMCallback cb, void *ctxt, - hwlm_group_t groups, u8 * stream_state) { + hwlm_group_t groups, u8 *stream_state) { struct FDR_Runtime_Args a = { buf, len, @@ -86,9 +810,9 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, ctxt, &groups, nextFloodDetect(buf, len, FLOOD_BACKOFF_START), - hbuf ? CONF_LOADVAL_CALL_CAUTIOUS(hbuf + hlen - 8, hbuf, hbuf + hlen) - : (u64a)0 - + /* we are guaranteed to always have 16 initialised bytes at the end of + * the history buffer (they may be garbage). */ + hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0 }; fdrUnpackState(fdr, &a, stream_state); diff --git a/src/fdr/fdr_autogen.py b/src/fdr/fdr_autogen.py deleted file mode 100755 index 748d811f..00000000 --- a/src/fdr/fdr_autogen.py +++ /dev/null @@ -1,564 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2015, Intel Corporation -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys -from autogen_utils import * -from base_autogen import * -from string import Template - -class OrStep(Step): - def __init__(self, context, offset, width): - Step.__init__(self, context, offset) - s_var = self.gv("st%d" % offset) - if width < 128: - self.val = "s |= %s;" % s_var.name - else: - self.val = "s = or%d(s, %s);" % (width, s_var.name) - -class ShiftStateStep(Step): - def __init__(self, context, offset = 0, stride_used = 1): - Step.__init__(self, context, offset) - m = self.matcher - state = m.state_variable - shift_distance = -1 * stride_used * m.num_buckets - self.val = "%s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance)) - -class BulkLoadStep(Step): - def __init__(self, context, offset, size, define_var = True, aligned = True): - Step.__init__(self, context, offset) - m = self.matcher - self.latency = 4 - blt = m.bulk_load_type - if aligned: - init_string = blt.load_expr_data(self.offset, code = "aligned") - else: - init_string = blt.load_expr_data(self.offset) - - var_name = "current_data_%d" % offset - if define_var: - lb_var = self.nv(blt, var_name) - self.val = lb_var.gen_initializer_stmt(init_string) - else: - lb_var = self.gv(var_name, reader = False, writer = True) - self.val = "%s = %s;" % (var_name, init_string) - -class ValueExtractStep(Step): - def __init__(self, context, offset, sub_load_cautious = False): - Step.__init__(self, context, offset) - m = self.matcher - self.latency = 2 - dsb = m.datasize_bytes - modval = offset % dsb - - if modval == dsb - 1: - # Case 1: reading more than one byte over the end of the bulk load - - self.latency = 4 - if sub_load_cautious: - code_string = "cautious_forward" - else: - code_string = "normal" - load_string = m.single_load_type.load_expr_data(self.offset, code_string) - temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust) - else: - # Case 2: reading a value that can be found entirely in the current register - if m.fdr2_force_naive_load: - load_string = m.single_load_type.load_expr_data(self.offset, "normal") - temp_string = "(%s << %d)" % (load_string, m.reach_shift_adjust) - else: - lb_var = self.gv("current_data_%d" % (offset - modval)) - if modval == 0: - # Case 2a: value is at LSB end of the register and must be left- - # shifted into place if there is a "reach_shift_adjust" required - temp_string = "(%s << %d)" % (lb_var.name, m.reach_shift_adjust) - else: - # Case 2b: value is in the middle of the register and will be - # right-shifted into place (adjusted by "reach_shift_adjust") - temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust) - - - init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust) - v_var = self.nv(m.value_extract_type, "v%d" % offset) - self.val = v_var.gen_initializer_stmt(init_string) - -class TableLookupStep(Step): - def __init__(self, context, reach_multiplier, offset = 0): - Step.__init__(self, context, offset) - m = self.matcher - self.latency = 4 - v_var = self.gv("v%d" % offset) 
- s_var = self.nv(m.state_type, "st%d" % offset) - init_string = "*(const %s *)(ft + %s*%dU)" % ( m.state_type.get_name(), - v_var.name, reach_multiplier) - self.val = s_var.gen_initializer_stmt(init_string) - -class ShiftReachMaskStep(Step): - def __init__(self, context, offset): - Step.__init__(self, context, offset) - m = self.matcher - extr = m.extract_frequency - modval = offset % extr - s_var = self.gv("st%d" % offset, writer = True) - self.val = "%s = %s;" % (s_var.name, s_var.type.shift_expr(s_var.name, modval * m.num_buckets)) - -class ConfExtractStep(Step): - def __init__(self, context, offset): - Step.__init__(self, context, offset) - m = self.matcher - if m.state_type.isSIMDOnIntel(): - self.latency = 2 - init_string = m.state_type.lowbit_extract_expr("s", m.extract_size) - extr_var = self.nv(m.extr_type, "extr%d" % offset) - self.val = extr_var.gen_initializer_stmt(init_string) - -class ConfAccumulateStep(Step): - def __init__(self, context, extract_offset, conf_offset, define_var = True): - Step.__init__(self, context, extract_offset) - m = self.matcher - extr_var = self.gv("extr%d" % extract_offset) - extr_var_cast = "((%s)%s)" % (m.conf_type.get_name(), extr_var.name) - if extract_offset == conf_offset: - # create conf_var as a straight copy of extr - if define_var: - conf_var = self.nv(m.conf_type, "conf%d" % conf_offset) - self.val = conf_var.gen_initializer_stmt(extr_var_cast) - else: - conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True) - self.val = "%s = %s;" % (conf_var.name, extr_var_cast) - else: - # shift extr_var and insert/OR it in conf_var - conf_var = self.gv("conf%d" % conf_offset, writer = True, reader = True) - shift_dist = (extract_offset - conf_offset) * m.num_buckets - self.val = "%s |= %s;" % (conf_var.name, m.conf_type.shift_expr(extr_var_cast, shift_dist)) - self.latency = 2 - -class ConfirmFlipStep(Step): - def __init__(self, context, offset): - Step.__init__(self, context, offset) - m = self.matcher - conf_var = self.gv("conf%d" % self.offset, writer = True) - self.val = "%s = %s;" % (conf_var.name, - conf_var.type.flip_lowbits_expr(conf_var.name, self.matcher.confirm_frequency * m.num_buckets)) - -class ConfirmStep(Step): - def __init__(self, context, offset, cautious = False): - Step.__init__(self, context, offset) - m = self.matcher - conf_var = self.gv("conf%d" % offset, writer = True) - self.val = m.produce_confirm_base(conf_var.name, conf_var.type.size, offset, cautious, - enable_confirmless = m.stride == 1, do_bailout = False) - -class M3(MatcherBase): - def produce_compile_call(self): - print " { %d, %d, %d, %d, %s, %d, %d }," % ( - self.id, self.state_width, self.num_buckets, - self.stride, - self.arch.target, self.conf_pull_back, self.conf_top_level_split) - - def produce_main_loop(self, switch_variant = False): - stride_offsets = xrange(0, self.loop_bytes, self.stride) - stride_offsetSet = set(stride_offsets) - so_steps_last_block = [] - sh = None - last_confirm = None - ctxt = CodeGenContext(self) - - if switch_variant: - print " ptr -= (iterBytes - dist);" - print " { " # need an extra scope around switch variant to stop its globals escaping - else: - print " if (doMainLoop) {" - print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {" - print self.produce_flood_check() - print " __builtin_prefetch(ptr + (iterBytes*4));" - print " assert(((size_t)ptr % START_MOD) == 0);" - - - # just do globally for now - if switch_variant: - subsidiary_load_cautious = True - confirm_cautious = True - else: - 
subsidiary_load_cautious = False - confirm_cautious = False - - if not self.fdr2_force_naive_load: - bulk_load_steps = [ off for off in range(self.loop_bytes) - if off % self.datasize_bytes == 0 and - (set(range(off, off + self.datasize_bytes - 1)) & stride_offsetSet)] - else: - bulk_load_steps = [] - - confirm_steps = [ off for off in range(self.loop_bytes) if off % self.confirm_frequency == 0 ] - - for off in bulk_load_steps: - lb_var = ctxt.new_var(None, self.bulk_load_type, "current_data_%d" % off) - print " " + lb_var.gen_initializer_stmt() - - - for off in confirm_steps: - var_name = "conf%d" % off - conf_def_var = ctxt.new_var(None, self.conf_type, var_name) - if switch_variant: - init_string = "(%s)-1" % self.conf_type.get_name() - else: - init_string = "" - print " " + conf_def_var.gen_initializer_stmt(init_string) - - if switch_variant: - print " switch(iterBytes - dist) {" - for i in range(0, self.loop_bytes): - print " case %d:" % i - - # init and poison conf; over-precise but harmless - conf_id = (i / self.confirm_frequency) * self.confirm_frequency - if i % self.confirm_frequency: - conf_fixup_bits = self.conf_type.size - (self.num_buckets * (i % self.confirm_frequency)) - print " conf%d >>= %d;" % (conf_id, conf_fixup_bits) - else: - print " conf%d = 0;" % conf_id - - # init state - state_fixup = i % self.extract_frequency - state = self.state_variable - shift_distance = self.num_buckets * state_fixup - if state_fixup: - print " %s = %s;" % (state.name, state.type.shift_expr(state.name, shift_distance)) - if self.state_width < 128: - print " %s |= %s;" % (state.name, state.type.lowbit_mask(shift_distance)) - else: - print " %s = or%d(%s, %s);" % (state.name, self.state_width, state.name, state.type.lowbit_mask(shift_distance)) - - if not self.fdr2_force_naive_load: - # init current_data (could poison it in some cases) - load_mod = i % self.datasize_bytes - load_offset = i - load_mod - if load_mod: - # not coming in on an even boundary means having to do a load var - # actually, there are a bunch of things we can do on this bulk load - # to avoid having to be 'cautious_backwards' but I'm not completely - # sure they are good ideas - init_string = self.bulk_load_type.load_expr_data(load_offset, - code = "cautious_backward") - var_name = "current_data_%d" % load_offset - lb_var = ctxt.get_var(None, var_name, reader = False, writer = True) - print " %s = %s;" % (lb_var.name, init_string) - - print " goto off%d;" % i - print " case %d: goto skipSwitch;" % self.loop_bytes - print " }" - print " {" - - - for off in range(self.loop_bytes): - # X_mod is the offset we're up to relative to the last X operation - # X_offset is which of the last X operations matches this iteration - - if (switch_variant): - LabelStep(ctxt, off) - - if off in bulk_load_steps: - if not self.fdr2_force_naive_load: - BulkLoadStep(ctxt, off, self.datasize, define_var = False, aligned = not switch_variant) - - if off in stride_offsets: - if switch_variant: - OpenScopeStep(ctxt, off) - ValueExtractStep(ctxt, off, sub_load_cautious = subsidiary_load_cautious) - TableLookupStep(ctxt, self.reach_mult, off) - if off % self.extract_frequency: - ShiftReachMaskStep(ctxt, off) - so = OrStep(ctxt, off, self.state_width) - if switch_variant: - CloseScopeStep(ctxt, off) - if sh != None: - so.add_dependency(sh) - so_steps_last_block += [ so ] - - extract_mod = off % self.extract_frequency - extract_offset = off - extract_mod - extract_ready = extract_mod == self.extract_frequency - 1 - if extract_ready: - if 
switch_variant: - OpenScopeStep(ctxt, off) - ex = ConfExtractStep(ctxt, extract_offset) - ConfAccumulateStep(ctxt, extract_offset, confirm_offset, define_var = False) - for so_step in so_steps_last_block: - ex.add_dependency(so_step) - if switch_variant: - CloseScopeStep(ctxt, off) - so_steps_last_block = [] - sh = ShiftStateStep(ctxt, extract_offset, stride_used = self.extract_frequency) - sh.add_dependency(ex) - - confirm_mod = off % self.confirm_frequency - confirm_offset = off - confirm_mod - confirm_ready = confirm_mod == self.confirm_frequency - 1 - if confirm_ready: - cflip = ConfirmFlipStep(ctxt, confirm_offset) - cf = ConfirmStep(ctxt, confirm_offset, cautious = confirm_cautious ) - if last_confirm: - cf.add_dependency(last_confirm) - last_confirm = cf - - - if not switch_variant: - print ctxt.schedule([ last_confirm, sh ]) - else: - print ctxt.dontschedule([ last_confirm, sh ]) - - if switch_variant: - print "skipSwitch:;" - print " ptr += iterBytes;" - print " }" # close extra scope around switch variant - print " }" - - - def produce_init_state(self): - state = self.state_variable - s_type = self.state_type - shift_distance = -1 * self.num_buckets - shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance)) - - s = Template(""" - $TYPENAME s; - if (a->len_history) { - u32 tmp = 0; - if (a->start_offset == 0) { - tmp = a->buf_history[a->len_history - 1]; - tmp |= (a->buf[0] << 8); - } else { - tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len); - } - tmp &= fdr->domainMask; - s = *((const $TYPENAME *)ft + tmp); - $SHIFT_EXPR; - } else { - s = *(const $TYPENAME *)&fdr->start; - } -""").substitute(TYPENAME = s_type.get_name(), - ZERO_EXPR = s_type.zero_expression(), - SHIFT_EXPR = shift_expr) - return s - - def produce_code(self): - - loop_read_behind = 0 - loop_read_ahead = self.loop_bytes + 1 - - # we set up mask and shift stuff for extracting our masks from registers - # - # we have a choice as to whether to mask out the value early or - # extract the value (shift first) then mask it - # - # Intel has a free scaling factor from 1/2/4/8 so we want to combine - # the extra needed shift for SSE registers with the mask operation - - ssb = self.state_type.size / 8 # state size in bytes - - # Intel path - if ssb == 16: - # obscure corner - we don't have the room in the register to - # do this for all values so we don't. 
domain==16 is pretty - # bad anyhow, of course - self.reach_mult = 8 - else: - self.reach_mult = ssb - - shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 } - self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ] - - print self.produce_header(visible = False) - - print "// ", - print " Arch: " + self.arch.name, - print " State type: " + self.state_type.get_name(), - print " Num buckets: %d" % self.num_buckets, - print " Stride: %d" % self.stride - - print self.produce_common_declarations() - - print " assert(fdr->domain > 8 && fdr->domain < 16);" - print - print " u64a domain_mask = fdr->domainMask;" - print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));" - print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);" - print self.produce_init_state() - print " const size_t iterBytes = %d;" % self.loop_bytes - print " const size_t START_MOD = %d;" % self.datasize_bytes - print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead - - print """ - while (ptr < buf + len) { - - u8 doMainLoop = 1; - size_t remaining = len - (ptr - buf); - size_t dist; - if (remaining <= iterBytes) { - dist = remaining; // once through the switch and we're done - } else if (remaining < 2 * iterBytes) { - // nibble some stuff off the front, skip the main loop, - // then come back here - dist = iterBytes; // maybe could be cleverer - } else { - // now, we need to see if we can make it to a main loop iteration - // if so, we need to ensure that the main loop iteration is aligned - // to a START_MOD boundary and i >= 8 so we can read ptr + i - 8 - - // see if we can do it - if not, just switch the main loop off, - // eat iterBytes in cautious mode, and come back to this loop - - const u8 * target = MAX(buf + 8, ptr); - target = ROUNDUP_PTR(target, START_MOD); - dist = target - ptr; - if (dist > iterBytes) { - doMainLoop = 0; - dist = iterBytes; - } - } -""" - self.produce_main_loop(switch_variant = True) - self.produce_main_loop(switch_variant = False) - print """ - } -""" - print self.produce_footer() - - def get_name(self): - return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width) - - def __init__(self, state_width, stride, - arch, - table_state_width = None, - num_buckets = 8, - extract_frequency = None, - confirm_frequency = None): - - # First - set up the values that are fundamental to how this matcher will operate - self.arch = arch - - # get the width of the state width on which we operate internally - if state_width not in [ 128 ]: - fail_out("Unknown state width: %d" % state_width) - self.state_width = state_width - self.state_type = getRequiredType(self.state_width) - self.state_variable = IntegerVariable("s", self.state_type) - - table_state_width = state_width - self.table_state_width = state_width - self.table_state_type = getRequiredType(self.table_state_width) - - # this is the load type required for domain [9:15] if we want to - # load it one at a time - self.single_load_type = IntegerType(16) - - # stride is the frequency with which we make data-driven - # accesses to our reach table - if stride not in [ 1, 2, 4, 8]: - fail_out("Unsupported stride: %d" % stride) - if stride * num_buckets > state_width: - fail_out("Stride %d is too big for the number of buckets %d given state width %d\n" % (stride, num_buckets, state_width)) - self.stride = stride - - if num_buckets != 8: - fail_out("Unsupported number of buckets: %d" % num_buckets) - if state_width % num_buckets and state_width == 128: - fail_out("Bucket scheme requires bit-shifts on m128 
(failing)") - self.num_buckets = num_buckets - - # Second - set up derived or optimization values - these can be - # overridden by arguments that are passed in - - self.datasize = 64 - self.bulk_load_type = IntegerType(self.datasize) - self.datasize_bytes = self.datasize/8 - - self.value_extract_type = IntegerType(self.datasize) - - self.fdr2_force_naive_load = False # disable everywhere for trunk - - # extract frequency is how frequently (in bytes) we destructively shift - # our state value after having pulled out that many bytes into a - # confirm register (of one sort or another). - # none means a default value - datasize, our biggest easily available GPR - if extract_frequency is None: - extract_frequency = self.datasize_bytes - self.extract_frequency = extract_frequency - self.extract_size = self.extract_frequency*self.num_buckets - if extract_frequency < stride: - fail_out("Can't extract at extract frequency %d with stride %d" % (extract_frequency, stride)) - if extract_frequency not in [ None, 1, 2, 4, 8, 16]: - fail_out("Weird extract frequency: %d" % extract_frequency) - - if self.extract_size <= 32: - self.extr_type = IntegerType(32) - elif self.extract_size <= 64: - self.extr_type = IntegerType(64) - else: - fail_out("Implausible size %d required for confirm extract step" % size) - - # extract_frequency is how often we pull out our state and place - # it somewhere in a lossless fashion - # confirm_frequency, on the other hand, is how frequently we - # take the state extracted by extract_frequency and cobble it - # together into a matching loop - # confirm_frequency must be a multiple of extract_frequency - # and must fit into a fast register; for now; we're going to - # stay in the GPR domain - if confirm_frequency is None: - confirm_frequency = self.extract_frequency - self.confirm_frequency = confirm_frequency - if confirm_frequency % self.extract_frequency: - fail_out("Confirm frequency %d must be evenly divisible by extract_frequency %d" % (confirm_frequency, self.extract_frequency)) - - self.conf_size = self.confirm_frequency * self.num_buckets - if self.conf_size <= 32: - self.conf_type = IntegerType(32) - elif self.conf_size <= 64: - self.conf_type = IntegerType(64) - else: - fail_out("Implausible size %d required for confirm accumulate step" % self.conf_size) - - # how many bytes in flight at once - self.loop_bytes = 16 - - # confirm configuration - - # how many entries in the top-level confirm table - 256 means - # complete split on the last character - self.conf_top_level_split = 256 - - # how much we 'pull back' in confirm - this is obviously related - # to the first level conf but we will keep two separate paramters - # for this to avoid the risk of conflating these - self.conf_pull_back = 1 - - if self.conf_pull_back > 0 and self.conf_top_level_split < 256: - fail_out("Pull back distance %d not supported by top level split %d" % (self.conf_pull_back, self.conf_top_level_split)) - - # minor stuff - self.default_body_indent = 8 diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index ccf177f0..0c4ef35d 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -187,9 +187,9 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { /* we are allowing domains 9 to 15 only */ assert(eng.bits > 8 && eng.bits < 16); fdr->domain = eng.bits; - fdr->schemeWidthByte = eng.schemeWidth / 8; fdr->domainMask = (1 << eng.bits) - 1; - fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte; + fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8); + fdr->stride = 
eng.stride; if (link.first) { fdr->link = verify_u32(ptr - fdr_base); @@ -544,6 +544,7 @@ fdrBuildTableInternal(const vector &lits, bool make_small, // temporary hack for unit testing if (hint != HINT_INVALID) { des->bits = 9; + des->stride = 1; } FDRCompiler fc(lits, *des, make_small); diff --git a/src/fdr/fdr_confirm_runtime.h b/src/fdr/fdr_confirm_runtime.h index a77a8b89..9b1df593 100644 --- a/src/fdr/fdr_confirm_runtime.h +++ b/src/fdr/fdr_confirm_runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,144 +36,121 @@ #include "util/bitutils.h" #include "util/compare.h" -#define CONF_LOADVAL_CALL lv_u64a -#define CONF_LOADVAL_CALL_CAUTIOUS lv_u64a_ce - // this is ordinary confirmation function which runs through // the whole confirmation procedure static really_inline -void confWithBit(const struct FDRConfirm * fdrc, - const struct FDR_Runtime_Args * a, - size_t i, - CautionReason r, - u32 pullBackAmount, - hwlmcb_rv_t *control, - u32 * last_match) { +void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a, + size_t i, u32 pullBackAmount, hwlmcb_rv_t *control, + u32 *last_match, u64a conf_key) { assert(i < a->len); assert(ISALIGNED(fdrc)); const u8 * buf = a->buf; - const size_t len = a->len; - - CONF_TYPE v; - const u8 * confirm_loc = buf + i - pullBackAmount - 7; - if (likely(r == NOT_CAUTIOUS || confirm_loc >= buf)) { - v = CONF_LOADVAL_CALL(confirm_loc, buf, buf + len); - } else { // r == VECTORING, confirm_loc < buf - u64a histBytes = a->histBytes; - v = CONF_LOADVAL_CALL_CAUTIOUS(confirm_loc, buf, buf + len); - // stitch together v (which doesn't move) and history (which does) - u32 overhang = buf - confirm_loc; - histBytes >>= 64 - (overhang * 8); - v |= histBytes; + u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult, + fdrc->nBitsOrSoleID); + u32 start = getConfirmLitIndex(fdrc)[c]; + if (likely(!start)) { + return; } - u32 c = CONF_HASH_CALL(v, fdrc->andmsk, fdrc->mult, fdrc->nBitsOrSoleID); - u32 start = getConfirmLitIndex(fdrc)[c]; - if (P0(start)) { - const struct LitInfo *l = - (const struct LitInfo *)((const u8 *)fdrc + start); + const struct LitInfo *li + = (const struct LitInfo *)((const u8 *)fdrc + start); - u8 oldNext; // initialized in loop - do { - assert(ISALIGNED(l)); + u8 oldNext; // initialized in loop + do { + assert(ISALIGNED(li)); - if (P0( (v & l->msk) != l->v)) { + if (unlikely((conf_key & li->msk) != li->v)) { + goto out; + } + + if ((*last_match == li->id) && (li->flags & NoRepeat)) { + goto out; + } + + const u8 *loc = buf + i - li->size + 1 - pullBackAmount; + + u8 caseless = li->flags & Caseless; + if (loc < buf) { + u32 full_overhang = buf - loc; + + const u8 *history = caseless ? a->buf_history_nocase + : a->buf_history; + size_t len_history = caseless ? 
a->len_history_nocase + : a->len_history; + + // can't do a vectored confirm either if we don't have + // the bytes + if (full_overhang > len_history) { goto out; } - if ((*last_match == l->id) && (l->flags & NoRepeat)) { - goto out; + // as for the regular case, no need to do a full confirm if + // we're a short literal + if (unlikely(li->size > sizeof(CONF_TYPE))) { + const u8 *s1 = li->s; + const u8 *s2 = s1 + full_overhang; + const u8 *loc1 = history + len_history - full_overhang; + const u8 *loc2 = buf; + size_t size1 = MIN(full_overhang, li->size - sizeof(CONF_TYPE)); + size_t wind_size2_back = sizeof(CONF_TYPE) + full_overhang; + size_t size2 = wind_size2_back > li->size ? + 0 : li->size - wind_size2_back; + + if (cmpForward(loc1, s1, size1, caseless)) { + goto out; + } + if (cmpForward(loc2, s2, size2, caseless)) { + goto out; + } } + } else { // NON-VECTORING PATH - const u8 * loc = buf + i - l->size + 1 - pullBackAmount; + // if string < conf_type we don't need regular string cmp + if (unlikely(li->size > sizeof(CONF_TYPE))) { + if (cmpForward(loc, li->s, li->size - sizeof(CONF_TYPE), + caseless)) { + goto out; + } + } + } - u8 caseless = l->flags & Caseless; - if (loc < buf) { - u32 full_overhang = buf - loc; + if (unlikely(!(li->groups & *control))) { + goto out; + } - const u8 * history = (caseless) ? - a->buf_history_nocase : a->buf_history; - size_t len_history = (caseless) ? - a->len_history_nocase : a->len_history; - - // can't do a vectored confirm either if we don't have - // the bytes + if (unlikely(li->flags & ComplexConfirm)) { + const u8 *loc2 = buf + i - li->extended_size + 1 - pullBackAmount; + if (loc2 < buf) { + u32 full_overhang = buf - loc2; + size_t len_history = caseless ? a->len_history_nocase + : a->len_history; if (full_overhang > len_history) { goto out; } - - // as for the regular case, no need to do a full confirm if - // we're a short literal - if (unlikely(l->size > sizeof(CONF_TYPE))) { - const u8 * s1 = l->s; - const u8 * s2 = s1 + full_overhang; - const u8 * loc1 = history + len_history - full_overhang; - const u8 * loc2 = buf; - size_t size1 = MIN(full_overhang, - l->size - sizeof(CONF_TYPE)); - size_t wind_size2_back = sizeof(CONF_TYPE) + - full_overhang; - size_t size2 = wind_size2_back > l->size ? - 0 : l->size - wind_size2_back; - - if (cmpForward(loc1, s1, size1, caseless)) { - goto out; - } - if (cmpForward(loc2, s2, size2, caseless)) { - goto out; - } - } - } else { // NON-VECTORING PATH - - // if string < conf_type we don't need regular string cmp - if (unlikely(l->size > sizeof(CONF_TYPE))) { - if (cmpForward(loc, l->s, l->size - sizeof(CONF_TYPE), caseless)) { - goto out; - } - } } + } - if (P0(!(l->groups & *control))) { - goto out; - } - - if (unlikely(l->flags & ComplexConfirm)) { - const u8 * loc2 = buf + i - l->extended_size + 1 - pullBackAmount; - if (loc2 < buf) { - u32 full_overhang = buf - loc2; - size_t len_history = (caseless) ? 
- a->len_history_nocase : a->len_history; - if (full_overhang > len_history) { - goto out; - } - } - } - - *last_match = l->id; - *control = a->cb(loc - buf, i, l->id, a->ctxt); -out: - oldNext = l->next; // oldNext is either 0 or an 'adjust' value - l = (const struct LitInfo*)((const u8 *)l + oldNext + l->size); - } while (oldNext); - } + *last_match = li->id; + *control = a->cb(loc - buf, i, li->id, a->ctxt); + out: + oldNext = li->next; // oldNext is either 0 or an 'adjust' value + li = (const struct LitInfo *)((const u8 *)li + oldNext + li->size); + } while (oldNext); } // 'light-weight' confirmation function which is used by 1-mask Teddy; // in the 'confirmless' case it simply calls callback function, // otherwise it calls 'confWithBit' function for the full confirmation procedure static really_inline -void confWithBit1(const struct FDRConfirm * fdrc, - const struct FDR_Runtime_Args * a, - size_t i, - CautionReason r, - hwlmcb_rv_t *control, - u32 * last_match) { +void confWithBit1(const struct FDRConfirm *fdrc, + const struct FDR_Runtime_Args *a, size_t i, + hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) { assert(i < a->len); assert(ISALIGNED(fdrc)); if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, r, 0, control, last_match); + confWithBit(fdrc, a, i, 0, control, last_match, conf_key); return; } else { u32 id = fdrc->nBitsOrSoleID; @@ -190,12 +167,9 @@ void confWithBit1(const struct FDRConfirm * fdrc, // In the 'confirmless' case it makes fast 32-bit comparison, // otherwise it calls 'confWithBit' function for the full confirmation procedure static really_inline -void confWithBitMany(const struct FDRConfirm * fdrc, - const struct FDR_Runtime_Args * a, - size_t i, - CautionReason r, - hwlmcb_rv_t *control, - u32 * last_match) { +void confWithBitMany(const struct FDRConfirm *fdrc, + const struct FDR_Runtime_Args *a, size_t i, CautionReason r, + hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) { assert(i < a->len); assert(ISALIGNED(fdrc)); @@ -204,7 +178,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc, } if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, r, 0, control, last_match); + confWithBit(fdrc, a, i, 0, control, last_match, conf_key); return; } else { const u32 id = fdrc->nBitsOrSoleID; @@ -215,7 +189,7 @@ void confWithBitMany(const struct FDRConfirm * fdrc, } if (r == VECTORING && len > i - a->start_offset) { - if (len > (i + a->len_history)) { + if (len > i + a->len_history) { return; } diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp index a141f388..7e794bb3 100644 --- a/src/fdr/fdr_dump.cpp +++ b/src/fdr/fdr_dump.cpp @@ -68,8 +68,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) { } if (isTeddy) { - unique_ptr des = - getTeddyDescription(fdr->engineID); + auto des = getTeddyDescription(fdr->engineID); if (des) { fprintf(f, " masks %u\n", des->numMasks); fprintf(f, " buckets %u\n", des->getNumBuckets()); @@ -78,16 +77,8 @@ void fdrPrintStats(const FDR *fdr, FILE *f) { fprintf(f, " \n"); } } else { - unique_ptr des = - getFdrDescription(fdr->engineID); - if (des) { - fprintf(f, " domain %u\n", des->bits); - fprintf(f, " stride %u\n", des->stride); - fprintf(f, " buckets %u\n", des->getNumBuckets()); - fprintf(f, " width %u\n", des->schemeWidth); - } else { - fprintf(f, " \n"); - } + fprintf(f, " domain %u\n", fdr->domain); + fprintf(f, " stride %u\n", fdr->stride); } fprintf(f, " strings ???\n"); diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp index 5d470c7e..103bc214 100644 --- 
a/src/fdr/fdr_engine_description.cpp +++ b/src/fdr/fdr_engine_description.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,13 +42,11 @@ using namespace std; namespace ue2 { -#include "fdr_autogen_compiler.cpp" - FDREngineDescription::FDREngineDescription(const FDREngineDef &def) : EngineDescription(def.id, targetByArchFeatures(def.cpu_features), def.numBuckets, def.confirmPullBackDistance, def.confirmTopLevelSplit), - schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {} + schemeWidth(def.schemeWidth), stride(0), bits(0) {} u32 FDREngineDescription::getDefaultFloodSuffixLength() const { // rounding up, so that scheme width 32 and 6 buckets is 6 not 5! @@ -56,6 +54,12 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const { return ((getSchemeWidth() + getNumBuckets() - 1) / getNumBuckets()) + 1; } +void getFdrDescriptions(vector *out) { + static const FDREngineDef def = {0, 128, 8, 0, 1, 256}; + out->clear(); + out->push_back(FDREngineDescription(def)); +} + static u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) { u32 desiredStride = 1; // always our safe fallback @@ -108,32 +112,33 @@ unique_ptr chooseEngine(const target_t &target, FDREngineDescription *best = nullptr; u32 best_score = 0; + FDREngineDescription &eng = allDescs[0]; + for (u32 domain = 9; domain <= 15; domain++) { - for (size_t engineID = 0; engineID < allDescs.size(); engineID++) { + for (size_t stride = 1; stride <= 4; stride *= 2) { // to make sure that domains >=14 have stride 1 according to origin - if (domain > 13 && engineID > 0) { + if (domain > 13 && stride > 1) { continue; } - FDREngineDescription &eng = allDescs[engineID]; if (!eng.isValidOnTarget(target)) { continue; } - if (msl < eng.stride) { + if (msl < stride) { continue; } u32 score = 100; - score -= absdiff(desiredStride, eng.stride); + score -= absdiff(desiredStride, stride); - if (eng.stride <= desiredStride) { - score += eng.stride; + if (stride <= desiredStride) { + score += stride; } u32 effLits = vl.size(); /* * desiredStride;*/ u32 ideal; if (effLits < eng.getNumBuckets()) { - if (eng.stride == 1) { + if (stride == 1) { ideal = 8; } else { ideal = 10; @@ -158,27 +163,28 @@ unique_ptr chooseEngine(const target_t &target, ideal -= 2; } - if (eng.stride > 1) { + if (stride > 1) { ideal++; } DEBUG_PRINTF("effLits %u\n", effLits); if (target.is_atom_class() && !make_small && effLits < 4000) { - /* Unless it is a very heavy case, we want to build smaller tables - * on lightweight machines due to their small caches. */ + /* Unless it is a very heavy case, we want to build smaller + * tables on lightweight machines due to their small caches. 
*/ ideal -= 2; } score -= absdiff(ideal, domain); - DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u " + DEBUG_PRINTF("fdr %u: width=%u, domain=%u, buckets=%u, stride=%zu " "-> score=%u\n", - eng.getID(), eng.schemeWidth, eng.bits, - eng.getNumBuckets(), eng.stride, score); + eng.getID(), eng.schemeWidth, domain, + eng.getNumBuckets(), stride, score); if (!best || score > best_score) { eng.bits = domain; + eng.stride = stride; best = &eng; best_score = score; } diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h index 45f64ac0..d4e70d4b 100644 --- a/src/fdr/fdr_engine_description.h +++ b/src/fdr/fdr_engine_description.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,7 +42,6 @@ struct FDREngineDef { u32 id; u32 schemeWidth; u32 numBuckets; - u32 stride; u64a cpu_features; u32 confirmPullBackDistance; u32 confirmTopLevelSplit; @@ -73,7 +72,6 @@ chooseEngine(const target_t &target, const std::vector &vl, bool make_small); std::unique_ptr getFdrDescription(u32 engineID); void getFdrDescriptions(std::vector *out); - } // namespace ue2 #endif diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index 607e039c..cde13f6c 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -76,17 +76,17 @@ struct FDR { * structures (spillover strings and hash table) if we're a secondary * structure. */ u32 link; - u8 domain; /* dynamic domain info */ - u8 schemeWidthByte; /* scheme width in bytes */ + u8 stride; /* stride - how frequently the data is consulted by the first + * stage matcher */ + u8 domain; /* number of bits used to index into main FDR table. This value + * is used only for debugging/asserts. */ u16 domainMask; /* pre-computed domain mask */ u32 tabSize; /* pre-computed hashtable size in bytes */ - u32 pad1; + u32 pad; - union { - u32 s_u32; - u64a s_u64a; - m128 s_m128; - } start; + m128 start; /* initial start state to use at offset 0. The state has been set + * up based on the min length of buckets to reduce the need for + * pointless confirms. */ }; /** \brief FDR runtime arguments.
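
[Note: the following is an illustrative aside, not part of the patch. The fdr_compile.cpp and fdr_internal.h hunks above show how the compiler now precomputes the per-table fields: domainMask is a low-bit mask over the chosen domain and tabSize is the hashtable size in bytes, with the chosen stride stored directly in the FDR header. A minimal, self-contained C sketch of that derivation with a worked example; the names fdr_params and make_params are hypothetical, this is not the real setupFDR().]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;

struct fdr_params {
    u8 stride;      /* how frequently the first-stage matcher consults the data */
    u8 domain;      /* number of bits used to index the main FDR table */
    u16 domainMask; /* pre-computed domain mask */
    u32 tabSize;    /* pre-computed hashtable size in bytes */
};

/* hypothetical helper mirroring the field setup in setupFDR() above */
static struct fdr_params make_params(u32 bits, u32 schemeWidth, u8 stride) {
    assert(bits > 8 && bits < 16);               /* domains 9..15 only */
    struct fdr_params p;
    p.stride = stride;
    p.domain = (u8)bits;
    p.domainMask = (u16)((1u << bits) - 1);      /* e.g. bits=13 -> 0x1fff */
    p.tabSize = (1u << bits) * (schemeWidth / 8); /* 8192 entries * 16 bytes */
    return p;
}

int main(void) {
    /* domain 13, 128-bit scheme, stride 2: mask=0x1fff, tabSize=131072 */
    struct fdr_params p = make_params(13, 128, 2);
    printf("mask=0x%x tabSize=%u stride=%u\n", p.domainMask, p.tabSize, p.stride);
    return 0;
}
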
diff --git a/src/fdr/fdr_loadval.h b/src/fdr/fdr_loadval.h index 95e8981f..37baf823 100644 --- a/src/fdr/fdr_loadval.h +++ b/src/fdr/fdr_loadval.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,7 +37,12 @@ #define MAKE_LOADVAL(type, name) \ static really_inline type name (const u8 * ptr, UNUSED const u8 * lo, UNUSED const u8 * hi) -#define NORMAL_SAFE(type) assert(ptr >= lo && (ptr + sizeof(type) - 1) < hi) +#define NORMAL_SAFE(type) \ + do { \ + assert(ptr >= lo); \ + assert(ptr + sizeof(type) - 1 < hi); \ + } while(0) + #define ALIGNED_SAFE(type) NORMAL_SAFE(type); assert(((size_t)ptr % sizeof(type)) == 0); // these ones need asserts to test the property that we're not handling dynamically #define CAUTIOUS_FORWARD_SAFE(type) assert(ptr >= lo) diff --git a/src/fdr/teddy_autogen.py b/src/fdr/teddy_autogen.py index 21050110..1cada00c 100755 --- a/src/fdr/teddy_autogen.py +++ b/src/fdr/teddy_autogen.py @@ -1,6 +1,6 @@ #!/usr/bin/python -# Copyright (c) 2015, Intel Corporation +# Copyright (c) 2015-2016, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -27,19 +27,110 @@ import sys from autogen_utils import * -from base_autogen import * from string import Template -class MT(MatcherBase): +class MT: + def produce_header(self, visible, header_only = False): + s = "" + if not visible: + s += "static never_inline" + s += """ +hwlm_error_t %s(UNUSED const struct FDR *fdr, + UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name() + if header_only: + s += ";" + else: + s += "{" + s += "\n" + return s + + def produce_guard(self): + print self.arch.get_guard() + + def produce_zero_alternative(self): + print """ +#else +#define %s 0 +#endif +""" % self.get_name() + + def close_guard(self): + print "#endif" + + def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False): + if cautious: + caution_string = "VECTORING" + else: + caution_string = "NOT_CAUTIOUS" + conf_split_mask = IntegerType(32).constant_to_string( + self.conf_top_level_split - 1) + if enable_confirmless: + quick_check_string = """ + if (!fdrc->mult) { + u32 id = fdrc->nBitsOrSoleID; + if ((last_match == id) && (fdrc->flags & NoRepeat)) + continue; + last_match = id; + controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt); + continue; + } """ + else: + quick_check_string = "" + if do_bailout: + bailout_string = """ + if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;""" + else: + bailout_string = "" + + return Template(""" +if (P0(!!$CONFVAR)) { + do { + u32 bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR); + u32 byte = bit / $NUM_BUCKETS + $OFFSET; + u32 bitRem = bit % $NUM_BUCKETS; + $BAILOUT_STRING + u32 confSplit = *(ptr+byte) & $SPLIT_MASK; + u32 idx = confSplit * $NUM_BUCKETS + bitRem; + u32 cf = confBase[idx]; + if (!cf) + continue; + fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) + continue; + $QUICK_CHECK_STRING + CautionReason reason = $CAUTION_STRING; + CONF_TYPE v; + const u8 * confirm_loc = ptr + byte - $CONF_PULL_BACK - 7; + if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { + v = lv_u64a(confirm_loc, buf, buf + len); + } else { // r == 
VECTORING, confirm_loc < buf + u64a histBytes = a->histBytes; + v = lv_u64a_ce(confirm_loc, buf, buf + len); + // stitch together v (which doesn't move) and history (which does) + u32 overhang = buf - confirm_loc; + histBytes >>= 64 - (overhang * 8); + v |= histBytes; + } + confWithBit(fdrc, a, ptr - buf + byte, $CONF_PULL_BACK, control, &last_match, v); + } while(P0(!!$CONFVAR)); + if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { + *a->groups = controlVal; + return HWLM_TERMINATED; + } +}""").substitute(CONFVAR = conf_var_name, + CONFVAR_SIZE = conf_var_size, + NUM_BUCKETS = self.num_buckets, + OFFSET = offset, + SPLIT_MASK = conf_split_mask, + QUICK_CHECK_STRING = quick_check_string, + BAILOUT_STRING = bailout_string, + CAUTION_STRING = caution_string, + CONF_PULL_BACK = self.conf_pull_back) + def produce_confirm(self, iter, var_name, offset, bits, cautious = True): if self.packed: print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False) else: - if self.num_masks == 1: - conf_func = "confWithBit1" - else: - conf_func = "confWithBitMany" - if cautious: caution_string = "VECTORING" else: @@ -48,16 +139,33 @@ class MT(MatcherBase): print " if (P0(!!%s)) {" % var_name print " do {" if bits == 64: - print " bit = findAndClearLSB_64(&%s);" % (var_name) + print " u32 bit = findAndClearLSB_64(&%s);" % (var_name) else: - print " bit = findAndClearLSB_32(&%s);" % (var_name) - print " byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset) - print " idx = bit %% %d;" % self.num_buckets - print " cf = confBase[idx];" + print " u32 bit = findAndClearLSB_32(&%s);" % (var_name) + print " u32 byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset) + print " u32 idx = bit %% %d;" % self.num_buckets + print " u32 cf = confBase[idx];" print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);" print " if (!(fdrc->groups & *control))" print " continue;" - print " %s(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % (conf_func, caution_string) + print """ + CautionReason reason = %s; + CONF_TYPE v; + const u8 * confirm_loc = ptr + byte - 7; + if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { + v = lv_u64a(confirm_loc, buf, buf + len); + } else { // r == VECTORING, confirm_loc < buf + u64a histBytes = a->histBytes; + v = lv_u64a_ce(confirm_loc, buf, buf + len); + // stitch together v (which doesn't move) and history (which does) + u32 overhang = buf - confirm_loc; + histBytes >>= 64 - (overhang * 8); + v |= histBytes; + }""" % (caution_string) + if self.num_masks == 1: + print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);" + else: + print " confWithBitMany(fdrc, a, ptr - buf + byte, %s, control, &last_match, v);" % (caution_string) print " } while(P0(!!%s));" % var_name print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {" print " *a->groups = controlVal;" @@ -146,7 +254,17 @@ class MT(MatcherBase): def produce_code(self): print self.produce_header(visible = True, header_only = False) - print self.produce_common_declarations() + print """ + const u8 * buf = a->buf; + const size_t len = a->len; + const u8 * ptr = buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t * control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 * tryFloodDetect = a->firstFloodDetect; + const struct FDRConfirm *fdrc; + u32 last_match = (u32)-1; +""" print self.produce_needed_temporaries(self.num_iterations) @@ -179,10 +297,17 @@ class MT(MatcherBase): print " 
ptr += 16;" print " }" - print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {" - print " __builtin_prefetch(ptr + (iterBytes*4));" - print self.produce_flood_check() - + print """ + for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + if (P0(ptr > tryFloodDetect)) { + tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes); + if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { + *a->groups = controlVal; + return HWLM_TERMINATED; + } + } +""" for iter in range(self.num_iterations): self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False) @@ -192,7 +317,11 @@ class MT(MatcherBase): self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True) print " }" - print self.produce_footer() + print """ + *a->groups = controlVal; + return HWLM_SUCCESS; +} +""" def produce_compile_call(self): packed_str = { False : "false", True : "true"}[self.packed] @@ -256,7 +385,17 @@ class MTFat(MT): def produce_code(self): print self.produce_header(visible = True, header_only = False) - print self.produce_common_declarations() + print """ + const u8 * buf = a->buf; + const size_t len = a->len; + const u8 * ptr = buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t * control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 * tryFloodDetect = a->firstFloodDetect; + const struct FDRConfirm *fdrc; + u32 last_match = (u32)-1; +""" print self.produce_needed_temporaries(self.num_iterations) @@ -289,9 +428,17 @@ class MTFat(MT): print " ptr += 16;" print " }" - print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {" - print " __builtin_prefetch(ptr + (iterBytes*4));" - print self.produce_flood_check() + print """ + for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + if (P0(ptr > tryFloodDetect)) { + tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes); + if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { + *a->groups = controlVal; + return HWLM_TERMINATED; + } + } +""" for iter in range(self.num_iterations): self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False) @@ -302,7 +449,11 @@ class MTFat(MT): self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True) print " }" - print self.produce_footer() + print """ + *a->groups = controlVal; + return HWLM_SUCCESS; +} +""" def produce_one_iteration_state_calc(self, iter, effective_num_iterations, cautious, save_old): @@ -367,7 +518,33 @@ class MTFat(MT): print "#endif" print " }" -class MTFast(MatcherBase): +class MTFast: + def produce_header(self, visible, header_only = False): + s = "" + if not visible: + s += "static never_inline" + s += """ +hwlm_error_t %s(UNUSED const struct FDR *fdr, + UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name() + if header_only: + s += ";" + else: + s += "{" + s += "\n" + return s + + def produce_guard(self): + print self.arch.get_guard() + + def produce_zero_alternative(self): + print """ +#else +#define %s 0 +#endif +""" % self.get_name() + + def close_guard(self): + print "#endif" def produce_confirm(self, cautious): if cautious: @@ -376,24 +553,52 @@ class MTFast(MatcherBase): cautious_str = "NOT_CAUTIOUS" print " for (u32 i = 0; i < arrCnt; i++) {" - print " byte = bitArr[i] / 8;" + print " u32 byte = bitArr[i] / 8;" if self.packed: conf_split_mask = 
IntegerType(32).constant_to_string( self.conf_top_level_split - 1) - print " bitRem = bitArr[i] % 8;" - print " confSplit = *(ptr+byte) & 0x1f;" - print " idx = confSplit * %d + bitRem;" % self.num_buckets - print " cf = confBase[idx];" + print " u32 bitRem = bitArr[i] % 8;" + print " u32 confSplit = *(ptr+byte) & 0x1f;" + print " u32 idx = confSplit * %d + bitRem;" % self.num_buckets + print " u32 cf = confBase[idx];" print " if (!cf)" print " continue;" print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);" print " if (!(fdrc->groups & *control))" print " continue;" - print " confWithBit(fdrc, a, ptr - buf + byte, %s, 0, control, &last_match);" % cautious_str + print """ + CautionReason reason = %s; + CONF_TYPE v; + const u8 * confirm_loc = ptr + byte - 7; + if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { + v = lv_u64a(confirm_loc, buf, buf + len); + } else { // r == VECTORING, confirm_loc < buf + u64a histBytes = a->histBytes; + v = lv_u64a_ce(confirm_loc, buf, buf + len); + // stitch together v (which doesn't move) and history (which does) + u32 overhang = buf - confirm_loc; + histBytes >>= 64 - (overhang * 8); + v |= histBytes; + }""" % (cautious_str) + print " confWithBit(fdrc, a, ptr - buf + byte, 0, control, &last_match, v);" else: - print " cf = confBase[bitArr[i] % 8];" + print " u32 cf = confBase[bitArr[i] % 8];" print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);" - print " confWithBit1(fdrc, a, ptr - buf + byte, %s, control, &last_match);" % cautious_str + print """ + CautionReason reason = %s; + CONF_TYPE v; + const u8 * confirm_loc = ptr + byte - 7; + if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { + v = lv_u64a(confirm_loc, buf, buf + len); + } else { // r == VECTORING, confirm_loc < buf + u64a histBytes = a->histBytes; + v = lv_u64a_ce(confirm_loc, buf, buf + len); + // stitch together v (which doesn't move) and history (which does) + u32 overhang = buf - confirm_loc; + histBytes >>= 64 - (overhang * 8); + v |= histBytes; + }""" % (cautious_str) + print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);" print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {" print " *a->groups = controlVal;" print " return HWLM_TERMINATED;" @@ -467,7 +672,17 @@ class MTFast(MatcherBase): def produce_code(self): print self.produce_header(visible = True, header_only = False) - print self.produce_common_declarations() + print """ + const u8 * buf = a->buf; + const size_t len = a->len; + const u8 * ptr = buf + a->start_offset; + hwlmcb_rv_t controlVal = *a->groups; + hwlmcb_rv_t * control = &controlVal; + u32 floodBackoff = FLOOD_BACKOFF_START; + const u8 * tryFloodDetect = a->firstFloodDetect; + const struct FDRConfirm *fdrc; + u32 last_match = (u32)-1; +""" print self.produce_needed_temporaries(self.num_iterations) @@ -498,9 +713,18 @@ class MTFast(MatcherBase): self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True) print " ptr += 32;" print " }" - print " for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {" - print " __builtin_prefetch(ptr + (iterBytes*4));" - print self.produce_flood_check() + print """ + for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) { + __builtin_prefetch(ptr + (iterBytes*4)); + if (P0(ptr > tryFloodDetect)) { + tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes); + if (P0(controlVal == HWLM_TERMINATE_MATCHING)) { + *a->groups = controlVal; + return HWLM_TERMINATED; + } + } +""" + for iter in range (0, 
self.num_iterations): self.produce_one_iteration_state_calc(iter = iter, cautious = False) print " arrCnt = 0;" @@ -514,7 +738,11 @@ class MTFast(MatcherBase): self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True) print " }" - print self.produce_footer() + print """ + *a->groups = controlVal; + return HWLM_SUCCESS; +} +""" def get_name(self): if self.packed: diff --git a/src/runtime.c b/src/runtime.c index cab61227..852eaf92 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -450,11 +450,19 @@ void maintainHistoryBuffer(const struct RoseEngine *rose, char *state, static really_inline void init_stream(struct hs_stream *s, const struct RoseEngine *rose) { + char *state = getMultiState(s); + + // Make absolutely sure that the 16 bytes leading up to the end of the + // history buffer are initialised, as we rely on this (regardless of the + // actual values used) in FDR. + char *hist_end = state + rose->stateOffsets.history + rose->historyRequired; + assert(hist_end - 16 >= (const char *)s); + unaligned_store_u64a(hist_end - 16, 0xDEADDEADDEADDEADull); + unaligned_store_u64a(hist_end - 8, 0xDEADDEADDEADDEADull); + s->rose = rose; s->offset = 0; - char *state = getMultiState(s); - setStreamStatus(state, 0); roseInitState(rose, state); diff --git a/unit/internal/fdr_flood.cpp b/unit/internal/fdr_flood.cpp index bda8c624..68d8f632 100644 --- a/unit/internal/fdr_flood.cpp +++ b/unit/internal/fdr_flood.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -403,8 +403,11 @@ TEST_P(FDRFloodp, WithMask) { TEST_P(FDRFloodp, StreamingMask) { const u32 hint = GetParam(); SCOPED_TRACE(hint); + const size_t fake_history_size = 16; + const vector fake_history(fake_history_size, 0); const size_t dataSize = 1024; vector data(dataSize); + vector tempdata(dataSize + fake_history_size); // headroom u8 c = '\0'; while (1) { @@ -487,18 +490,28 @@ TEST_P(FDRFloodp, StreamingMask) { for (u32 streamChunk = 1; streamChunk <= 16; streamChunk *= 2) { matchesCounts.clear(); - fdrStatus = fdrExecStreaming(fdr.get(), nullptr, 0, &data[0], streamChunk, - 0, countCallback, &matchesCounts, HWLM_ALL_GROUPS, nullptr); + const u8 *d = data.data(); + // reference past the end of fake history to allow headroom + const u8 *fhist = fake_history.data() + fake_history_size; + fdrStatus = fdrExecStreaming(fdr.get(), fhist, 0, d, streamChunk, 0, + countCallback, &matchesCounts, + HWLM_ALL_GROUPS, nullptr); ASSERT_EQ(0, fdrStatus); for (u32 j = streamChunk; j < dataSize; j += streamChunk) { - if (j < 8) { - fdrStatus = fdrExecStreaming(fdr.get(), &data[0], j, - &data[0] + j, streamChunk, 0, countCallback, - &matchesCounts, HWLM_ALL_GROUPS, nullptr); + if (j < 16) { + /* allow 16 bytes headroom on read to avoid invalid + * memory read during the FDR zone creation.*/ + memset(tempdata.data(), c, dataSize + fake_history_size); + const u8 *tmp_d = tempdata.data() + fake_history_size; + fdrStatus = fdrExecStreaming(fdr.get(), tmp_d, j, tmp_d + j, + streamChunk, 0, countCallback, + &matchesCounts, + HWLM_ALL_GROUPS, nullptr); } else { - fdrStatus = fdrExecStreaming(fdr.get(), &data[0] + j - 8, - 8, &data[0] + j, streamChunk, 0, countCallback, - &matchesCounts, HWLM_ALL_GROUPS, nullptr); + fdrStatus = fdrExecStreaming(fdr.get(), d + j - 8, 8, d + j, + streamChunk, 0, countCallback, + &matchesCounts, + HWLM_ALL_GROUPS, 
nullptr); } ASSERT_EQ(0, fdrStatus); }
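
[Note: illustrative aside, not part of the patch. The generated Teddy confirm paths above repeat one small "stitch" idiom whenever the 8-byte confirm word would start before the scanned buffer: the missing leading bytes are borrowed from the newest history bytes and OR-ed into the cautious load, and the stitched word is what now gets passed into confWithBit as conf_key. A standalone, hedged C sketch of that idiom follows; the function name is illustrative, and it assumes the cautious load leaves the out-of-bounds low bytes zeroed.]

#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;
typedef uint64_t u64a;

/* 'partial' is the 8-byte cautious load ending at ptr + byte; its low
 * 'overhang' bytes fall before buf. 'histBytes' is the last 8 bytes of
 * stream history, as carried in FDR_Runtime_Args. */
static u64a stitch_confirm_word(const u8 *buf, const u8 *confirm_loc,
                                u64a partial, u64a histBytes) {
    if (confirm_loc >= buf) {
        return partial;                          /* NOT_CAUTIOUS: load fully in bounds */
    }
    u32 overhang = (u32)(buf - confirm_loc);     /* 1..7 bytes missing at the front */
    histBytes >>= 64 - (overhang * 8);           /* keep only the newest 'overhang' history bytes */
    return partial | histBytes;                  /* fill the hole; in-bounds bytes untouched */
}

[Because the stitched word is now computed at the call site, the runtime confirm in fdr_confirm_runtime.h no longer needs its own CautionReason/pull-back load, which is what the hunks above remove.]
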