diff --git a/src/fdr/autogen.py b/src/fdr/autogen.py index 36e4c16c..e5b4f39e 100755 --- a/src/fdr/autogen.py +++ b/src/fdr/autogen.py @@ -54,16 +54,11 @@ def produce_fdr_compiles(l): def build_fdr_matchers(): all_matchers = [ ] - domains = [8, 10, 11, 12, 13] - big_domains = [ 14, 15 ] + strides = [ 1, 2, 4 ] common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 } - for d in domains: - all_matchers += [ M3(stride = 1, domain = d, **common) ] - all_matchers += [ M3(stride = 2, domain = d, **common) ] - all_matchers += [ M3(stride = 4, domain = d, **common) ] - for d in big_domains: - all_matchers += [ M3(stride = 1, domain = d, **common) ] + for s in strides: + all_matchers += [ M3(stride = s, **common) ] return all_matchers diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 082800f1..f83a4265 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -40,27 +40,6 @@ #include "fdr_confirm_runtime.h" #include "fdr_streaming_runtime.h" #include "fdr_loadval.h" - -static really_inline UNUSED -u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) { - u32 r = 0; - if (a->start_offset == 0) { - if (numBits <= 8) { - r = a->buf_history[a->len_history - 1]; - } else { - r = a->buf_history[a->len_history - 1]; - r |= (a->buf[0] << 8); - } - } else { - if (numBits <= 8) { - r = a->buf[a->start_offset - 1]; - } else { - r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len); - } - } - return r & ((1 << numBits) - 1); -} - #include "fdr_autogen.c" #define FAKE_HISTORY_SIZE 16 diff --git a/src/fdr/fdr_autogen.py b/src/fdr/fdr_autogen.py index 685cca3b..748d811f 100755 --- a/src/fdr/fdr_autogen.py +++ b/src/fdr/fdr_autogen.py @@ -74,12 +74,12 @@ class ValueExtractStep(Step): dsb = m.datasize_bytes modval = offset % dsb - if m.domain > 8 and modval == dsb - 1: + if modval == dsb - 1: # Case 1: reading more than one byte over the end of the bulk load self.latency = 4 if sub_load_cautious: - code_string = "cautious_forward" + 
code_string = "cautious_forward" else: code_string = "normal" load_string = m.single_load_type.load_expr_data(self.offset, code_string) @@ -101,7 +101,7 @@ class ValueExtractStep(Step): temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust) - init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask) + init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust) v_var = self.nv(m.value_extract_type, "v%d" % offset) self.val = v_var.gen_initializer_stmt(init_string) @@ -173,14 +173,10 @@ class ConfirmStep(Step): enable_confirmless = m.stride == 1, do_bailout = False) class M3(MatcherBase): - def get_hash_safety_parameters(self): - h_size = self.single_load_type.size_in_bytes() - return (0, h_size - 1) - def produce_compile_call(self): - print " { %d, %d, %d, %d, %d, %s, %d, %d }," % ( + print " { %d, %d, %d, %d, %s, %d, %d }," % ( self.id, self.state_width, self.num_buckets, - self.stride, self.domain, + self.stride, self.arch.target, self.conf_pull_back, self.conf_top_level_split) def produce_main_loop(self, switch_variant = False): @@ -192,8 +188,8 @@ class M3(MatcherBase): ctxt = CodeGenContext(self) if switch_variant: - print " ptr -= (iterBytes - dist);" - print " { " # need an extra scope around switch variant to stop its globals escaping + print " ptr -= (iterBytes - dist);" + print " { " # need an extra scope around switch variant to stop its globals escaping else: print " if (doMainLoop) {" print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {" @@ -349,25 +345,30 @@ class M3(MatcherBase): shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance)) s = Template(""" - $TYPENAME s; - if (a->len_history) { - u32 tmp = getPreStartVal(a, $DOMAIN); - s = *((const $TYPENAME *)ft + tmp); - $SHIFT_EXPR; - } else { - s = *(const $TYPENAME *)&fdr->start; - } + $TYPENAME s; + if (a->len_history) { + u32 tmp = 0; + if (a->start_offset == 0) { + tmp = a->buf_history[a->len_history - 1]; + 
tmp |= (a->buf[0] << 8); + } else { + tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len); + } + tmp &= fdr->domainMask; + s = *((const $TYPENAME *)ft + tmp); + $SHIFT_EXPR; + } else { + s = *(const $TYPENAME *)&fdr->start; + } """).substitute(TYPENAME = s_type.get_name(), ZERO_EXPR = s_type.zero_expression(), - DOMAIN = self.domain, SHIFT_EXPR = shift_expr) return s def produce_code(self): - (behind, ahead) = self.get_hash_safety_parameters() - loop_read_behind = behind - loop_read_ahead = self.loop_bytes + ahead + loop_read_behind = 0 + loop_read_ahead = self.loop_bytes + 1 # we set up mask and shift stuff for extracting our masks from registers # @@ -380,7 +381,7 @@ class M3(MatcherBase): ssb = self.state_type.size / 8 # state size in bytes # Intel path - if ssb == 16 and self.domain == 16: + if ssb == 16: # obscure corner - we don't have the room in the register to # do this for all values so we don't. domain==16 is pretty # bad anyhow, of course @@ -390,7 +391,6 @@ class M3(MatcherBase): shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 } self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ] - self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust print self.produce_header(visible = False) @@ -398,21 +398,19 @@ class M3(MatcherBase): print " Arch: " + self.arch.name, print " State type: " + self.state_type.get_name(), print " Num buckets: %d" % self.num_buckets, - print " Domain: %d" % self.domain, print " Stride: %d" % self.stride print self.produce_common_declarations() - print - print "\tconst size_t tabSize = %d;" % self.table_size - print """ - const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR)); - const u32 * confBase = (const u32 *)(ft + tabSize); -""" + print " assert(fdr->domain > 8 && fdr->domain < 16);" + print + print " u64a domain_mask = fdr->domainMask;" + print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));" + print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);" 
print self.produce_init_state() - print "\tconst size_t iterBytes = %d;" % self.loop_bytes - print "\tconst size_t START_MOD = %d;" % self.datasize_bytes - print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead + print " const size_t iterBytes = %d;" % self.loop_bytes + print " const size_t START_MOD = %d;" % self.datasize_bytes + print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead print """ while (ptr < buf + len) { @@ -451,9 +449,9 @@ class M3(MatcherBase): print self.produce_footer() def get_name(self): - return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width) + return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width) - def __init__(self, state_width, domain, stride, + def __init__(self, state_width, stride, arch, table_state_width = None, num_buckets = 8, @@ -474,17 +472,9 @@ class M3(MatcherBase): self.table_state_width = state_width self.table_state_type = getRequiredType(self.table_state_width) - # domain is the number of bits that we draw from our input to - # index our 'reach' table - if not 8 <= domain <= 16: - fail_out("Unsupported domain: %d" % domain) - self.domain = domain - # this is the load type required for this domain if we want to + # this is the load type required for domain [9:15] if we want to # load it one at a time - self.single_load_type = getRequiredType(self.domain) - - # table size - self.table_size = 2**domain * table_state_width // 8 + self.single_load_type = IntegerType(16) # stride is the frequency with which we make data-driven # accesses to our reach table diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 8be44370..ccf17626 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -184,6 +184,13 @@ aligned_unique_ptr FDRCompiler::setupFDR(pair link) { ptr += floodControlTmp.second; aligned_free(floodControlTmp.first); + /* we are allowing domains 9 to 15 only */ + assert(eng.bits > 8 && eng.bits < 16); + fdr->domain = 
eng.bits; + fdr->schemeWidthByte = eng.schemeWidth / 8; + fdr->domainMask = (1 << eng.bits) - 1; + fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte; + if (link.first) { fdr->link = verify_u32(ptr - fdr_base); memcpy(ptr, link.first, link.second); @@ -534,6 +541,11 @@ fdrBuildTableInternal(const vector &lits, bool make_small, return nullptr; } + // temporary hack for unit testing + if (hint != HINT_INVALID) { + des->bits = 9; + } + FDRCompiler fc(lits, *des, make_small); return fc.build(link); } diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp index ae246270..158170c2 100644 --- a/src/fdr/fdr_dump.cpp +++ b/src/fdr/fdr_dump.cpp @@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) { unique_ptr des = getFdrDescription(fdr->engineID); if (des) { + fprintf(f, " domain %u\n", des->bits); fprintf(f, " stride %u\n", des->stride); fprintf(f, " buckets %u\n", des->getNumBuckets()); fprintf(f, " width %u\n", des->schemeWidth); diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp index 2a6fda79..5d470c7e 100644 --- a/src/fdr/fdr_engine_description.cpp +++ b/src/fdr/fdr_engine_description.cpp @@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def) : EngineDescription(def.id, targetByArchFeatures(def.cpu_features), def.numBuckets, def.confirmPullBackDistance, def.confirmTopLevelSplit), - schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {} + schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {} u32 FDREngineDescription::getDefaultFloodSuffixLength() const { // rounding up, so that scheme width 32 and 6 buckets is 6 not 5! 
@@ -105,76 +105,83 @@ unique_ptr chooseEngine(const target_t &target, DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl, desiredStride); - const FDREngineDescription *best = nullptr; + FDREngineDescription *best = nullptr; u32 best_score = 0; - for (size_t engineID = 0; engineID < allDescs.size(); engineID++) { - const FDREngineDescription &eng = allDescs[engineID]; - if (!eng.isValidOnTarget(target)) { - continue; - } - if (msl < eng.stride) { - continue; - } - - u32 score = 100; - - score -= absdiff(desiredStride, eng.stride); - - if (eng.stride <= desiredStride) { - score += eng.stride; - } - - u32 effLits = vl.size(); /* * desiredStride;*/ - u32 ideal; - if (effLits < eng.getNumBuckets()) { - if (eng.stride == 1) { - ideal = 8; - } else { - ideal = 10; + for (u32 domain = 9; domain <= 15; domain++) { + for (size_t engineID = 0; engineID < allDescs.size(); engineID++) { + // to make sure that domains >=14 have stride 1 according to origin + if (domain > 13 && engineID > 0) { + continue; + } + FDREngineDescription &eng = allDescs[engineID]; + if (!eng.isValidOnTarget(target)) { + continue; + } + if (msl < eng.stride) { + continue; } - } else if (effLits < 20) { - ideal = 10; - } else if (effLits < 100) { - ideal = 11; - } else if (effLits < 1000) { - ideal = 12; - } else if (effLits < 10000) { - ideal = 13; - } else { - ideal = 15; - } - if (ideal != 8 && eng.schemeWidth == 32) { - ideal += 1; - } + u32 score = 100; - if (make_small) { - ideal -= 2; - } + score -= absdiff(desiredStride, eng.stride); - if (eng.stride > 1) { - ideal++; - } + if (eng.stride <= desiredStride) { + score += eng.stride; + } - DEBUG_PRINTF("effLits %u\n", effLits); + u32 effLits = vl.size(); /* * desiredStride;*/ + u32 ideal; + if (effLits < eng.getNumBuckets()) { + if (eng.stride == 1) { + ideal = 8; + } else { + ideal = 10; + } + } else if (effLits < 20) { + ideal = 10; + } else if (effLits < 100) { + ideal = 11; + } else if (effLits < 1000) { + ideal = 12; + } else 
if (effLits < 10000) { + ideal = 13; + } else { + ideal = 15; + } - if (target.is_atom_class() && !make_small && effLits < 4000) { - /* Unless it is a very heavy case, we want to build smaller tables - * on lightweight machines due to their small caches. */ - ideal -= 2; - } + if (ideal != 8 && eng.schemeWidth == 32) { + ideal += 1; + } - score -= absdiff(ideal, eng.bits); + if (make_small) { + ideal -= 2; + } - DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u " - "-> score=%u\n", - eng.getID(), eng.schemeWidth, eng.bits, - eng.getNumBuckets(), eng.stride, score); + if (eng.stride > 1) { + ideal++; + } - if (!best || score > best_score) { - best = &eng; - best_score = score; + DEBUG_PRINTF("effLits %u\n", effLits); + + if (target.is_atom_class() && !make_small && effLits < 4000) { + /* Unless it is a very heavy case, we want to build smaller tables + * on lightweight machines due to their small caches. */ + ideal -= 2; + } + + score -= absdiff(ideal, domain); + + DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u " + "-> score=%u\n", + eng.getID(), eng.schemeWidth, eng.bits, + eng.getNumBuckets(), eng.stride, score); + + if (!best || score > best_score) { + eng.bits = domain; + best = &eng; + best_score = score; + } } } diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h index d936095b..45f64ac0 100644 --- a/src/fdr/fdr_engine_description.h +++ b/src/fdr/fdr_engine_description.h @@ -43,7 +43,6 @@ struct FDREngineDef { u32 schemeWidth; u32 numBuckets; u32 stride; - u32 bits; u64a cpu_features; u32 confirmPullBackDistance; u32 confirmTopLevelSplit; diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index 6c722777..607e039c 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -76,9 +76,11 @@ struct FDR { * structures (spillover strings and hash table) if we're a secondary * structure.
*/ u32 link; + u8 domain; /* dynamic domain info */ + u8 schemeWidthByte; /* scheme width in bytes */ + u16 domainMask; /* pre-computed domain mask */ + u32 tabSize; /* pre-computed hashtable size in bytes */ u32 pad1; - u32 pad2; - u32 pad3; union { u32 s_u32;