FDR runtime simplification

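Removed static specialisation of domains: the domain (9 to 15 bits) is now selected when the engine is chosen and recorded in the FDR structure (domain, domainMask, tabSize), so a single generated runtime per stride serves every domain.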
This commit is contained in:
Mohammad Abdul Awal 2015-11-17 17:50:23 +00:00 committed by Matthew Barr
parent abbd548899
commit 313822c157
8 changed files with 124 additions and 139 deletions

View File

@ -54,16 +54,11 @@ def produce_fdr_compiles(l):
# Build and return the list of M3 matcher configurations to generate.
# NOTE(review): this listing has lost its indentation and appears to show
# both the per-domain loops and the per-stride loop of a diff side by side;
# confirm which of the loops below are live before relying on it.
def build_fdr_matchers():
all_matchers = [ ]
domains = [8, 10, 11, 12, 13]
big_domains = [ 14, 15 ]
strides = [ 1, 2, 4 ]
# Keyword arguments shared by every M3 constructed below.
common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
# Per-domain variants: presumably strides 1, 2 and 4 for each entry of `domains`.
for d in domains:
all_matchers += [ M3(stride = 1, domain = d, **common) ]
all_matchers += [ M3(stride = 2, domain = d, **common) ]
all_matchers += [ M3(stride = 4, domain = d, **common) ]
# Entries of `big_domains` only get a stride-1 variant.
for d in big_domains:
all_matchers += [ M3(stride = 1, domain = d, **common) ]
# Domain-independent variants: one M3 per stride, with no `domain` argument.
for s in strides:
all_matchers += [ M3(stride = s, **common) ]
return all_matchers

View File

@ -40,27 +40,6 @@
#include "fdr_confirm_runtime.h"
#include "fdr_streaming_runtime.h"
#include "fdr_loadval.h"
/**
 * Fetch the value that sits just before the scan start position, truncated
 * to the low numBits bits.
 *
 * When start_offset is zero the preceding byte is the last byte of the
 * history buffer; for numBits > 8 the first byte of the main buffer is placed
 * in bits 8..15. When start_offset is non-zero the value is read from the
 * main buffer at start_offset - 1: a single byte for numBits <= 8, otherwise
 * via lv_u16 (presumably a bounds-aware 16-bit load; see fdr_loadval.h).
 *
 * a:       runtime arguments providing buf, len, buf_history, len_history
 *          and start_offset.
 * numBits: number of low-order bits to keep in the result.
 */
static really_inline UNUSED
u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
    const int wide = numBits > 8; /* more than one byte is needed */
    u32 val;

    if (a->start_offset != 0) {
        /* Everything required lives in the main buffer. */
        if (wide) {
            val = lv_u16(a->buf + a->start_offset - 1, a->buf,
                         a->buf + a->len);
        } else {
            val = a->buf[a->start_offset - 1];
        }
    } else {
        /* The preceding byte is the final byte of history. */
        val = a->buf_history[a->len_history - 1];
        if (wide) {
            val |= (a->buf[0] << 8);
        }
    }

    /* Keep only the requested number of low bits. */
    return val & ((1 << numBits) - 1);
}
#include "fdr_autogen.c"
#define FAKE_HISTORY_SIZE 16

View File

@ -74,12 +74,12 @@ class ValueExtractStep(Step):
dsb = m.datasize_bytes
modval = offset % dsb
if m.domain > 8 and modval == dsb - 1:
if modval == dsb - 1:
# Case 1: reading more than one byte over the end of the bulk load
self.latency = 4
if sub_load_cautious:
code_string = "cautious_forward"
code_string = "cautious_forward"
else:
code_string = "normal"
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
@ -101,7 +101,7 @@ class ValueExtractStep(Step):
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
v_var = self.nv(m.value_extract_type, "v%d" % offset)
self.val = v_var.gen_initializer_stmt(init_string)
@ -173,14 +173,10 @@ class ConfirmStep(Step):
enable_confirmless = m.stride == 1, do_bailout = False)
class M3(MatcherBase):
def get_hash_safety_parameters(self):
h_size = self.single_load_type.size_in_bytes()
return (0, h_size - 1)
def produce_compile_call(self):
print " { %d, %d, %d, %d, %d, %s, %d, %d }," % (
print " { %d, %d, %d, %d, %s, %d, %d }," % (
self.id, self.state_width, self.num_buckets,
self.stride, self.domain,
self.stride,
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
def produce_main_loop(self, switch_variant = False):
@ -192,8 +188,8 @@ class M3(MatcherBase):
ctxt = CodeGenContext(self)
if switch_variant:
print " ptr -= (iterBytes - dist);"
print " { " # need an extra scope around switch variant to stop its globals escaping
print " ptr -= (iterBytes - dist);"
print " { " # need an extra scope around switch variant to stop its globals escaping
else:
print " if (doMainLoop) {"
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
@ -349,25 +345,30 @@ class M3(MatcherBase):
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
s = Template("""
$TYPENAME s;
if (a->len_history) {
u32 tmp = getPreStartVal(a, $DOMAIN);
s = *((const $TYPENAME *)ft + tmp);
$SHIFT_EXPR;
} else {
s = *(const $TYPENAME *)&fdr->start;
}
$TYPENAME s;
if (a->len_history) {
u32 tmp = 0;
if (a->start_offset == 0) {
tmp = a->buf_history[a->len_history - 1];
tmp |= (a->buf[0] << 8);
} else {
tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
}
tmp &= fdr->domainMask;
s = *((const $TYPENAME *)ft + tmp);
$SHIFT_EXPR;
} else {
s = *(const $TYPENAME *)&fdr->start;
}
""").substitute(TYPENAME = s_type.get_name(),
ZERO_EXPR = s_type.zero_expression(),
DOMAIN = self.domain,
SHIFT_EXPR = shift_expr)
return s
def produce_code(self):
(behind, ahead) = self.get_hash_safety_parameters()
loop_read_behind = behind
loop_read_ahead = self.loop_bytes + ahead
loop_read_behind = 0
loop_read_ahead = self.loop_bytes + 1
# we set up mask and shift stuff for extracting our masks from registers
#
@ -380,7 +381,7 @@ class M3(MatcherBase):
ssb = self.state_type.size / 8 # state size in bytes
# Intel path
if ssb == 16 and self.domain == 16:
if ssb == 16:
# obscure corner - we don't have the room in the register to
# do this for all values so we don't. domain==16 is pretty
# bad anyhow, of course
@ -390,7 +391,6 @@ class M3(MatcherBase):
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
print self.produce_header(visible = False)
@ -398,21 +398,19 @@ class M3(MatcherBase):
print " Arch: " + self.arch.name,
print " State type: " + self.state_type.get_name(),
print " Num buckets: %d" % self.num_buckets,
print " Domain: %d" % self.domain,
print " Stride: %d" % self.stride
print self.produce_common_declarations()
print
print "\tconst size_t tabSize = %d;" % self.table_size
print """
const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
const u32 * confBase = (const u32 *)(ft + tabSize);
"""
print " assert(fdr->domain > 8 && fdr->domain < 16);"
print
print " u64a domain_mask = fdr->domainMask;"
print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
print self.produce_init_state()
print "\tconst size_t iterBytes = %d;" % self.loop_bytes
print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print " const size_t iterBytes = %d;" % self.loop_bytes
print " const size_t START_MOD = %d;" % self.datasize_bytes
print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
print """
while (ptr < buf + len) {
@ -451,9 +449,9 @@ class M3(MatcherBase):
print self.produce_footer()
def get_name(self):
return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
def __init__(self, state_width, domain, stride,
def __init__(self, state_width, stride,
arch,
table_state_width = None,
num_buckets = 8,
@ -474,17 +472,9 @@ class M3(MatcherBase):
self.table_state_width = state_width
self.table_state_type = getRequiredType(self.table_state_width)
# domain is the number of bits that we draw from our input to
# index our 'reach' table
if not 8 <= domain <= 16:
fail_out("Unsupported domain: %d" % domain)
self.domain = domain
# this is the load type required for this domain if we want to
# this is the load type required for domain [9:15] if we want to
# load it one at a time
self.single_load_type = getRequiredType(self.domain)
# table size
self.table_size = 2**domain * table_state_width // 8
self.single_load_type = IntegerType(16)
# stride is the frequency with which we make data-driven
# accesses to our reach table

View File

@ -184,6 +184,13 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
ptr += floodControlTmp.second;
aligned_free(floodControlTmp.first);
/* we are allowing domains 9 to 15 only */
assert(eng.bits > 8 && eng.bits < 16);
fdr->domain = eng.bits;
fdr->schemeWidthByte = eng.schemeWidth / 8;
fdr->domainMask = (1 << eng.bits) - 1;
fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
if (link.first) {
fdr->link = verify_u32(ptr - fdr_base);
memcpy(ptr, link.first, link.second);
@ -534,6 +541,11 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
return nullptr;
}
// temporary hack for unit testing
if (hint != HINT_INVALID) {
des->bits = 9;
}
FDRCompiler fc(lits, *des, make_small);
return fc.build(link);
}

View File

@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
unique_ptr<FDREngineDescription> des =
getFdrDescription(fdr->engineID);
if (des) {
fprintf(f, " domain %u\n", des->bits);
fprintf(f, " stride %u\n", des->stride);
fprintf(f, " buckets %u\n", des->getNumBuckets());
fprintf(f, " width %u\n", des->schemeWidth);

View File

@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
def.numBuckets, def.confirmPullBackDistance,
def.confirmTopLevelSplit),
schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
@ -105,76 +105,83 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
desiredStride);
const FDREngineDescription *best = nullptr;
FDREngineDescription *best = nullptr;
u32 best_score = 0;
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
const FDREngineDescription &eng = allDescs[engineID];
if (!eng.isValidOnTarget(target)) {
continue;
}
if (msl < eng.stride) {
continue;
}
u32 score = 100;
score -= absdiff(desiredStride, eng.stride);
if (eng.stride <= desiredStride) {
score += eng.stride;
}
u32 effLits = vl.size(); /* * desiredStride;*/
u32 ideal;
if (effLits < eng.getNumBuckets()) {
if (eng.stride == 1) {
ideal = 8;
} else {
ideal = 10;
for (u32 domain = 9; domain <= 15; domain++) {
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
// to make sure that domains >=14 have stride 1 according to origin
if (domain > 13 && engineID > 0) {
continue;
}
FDREngineDescription &eng = allDescs[engineID];
if (!eng.isValidOnTarget(target)) {
continue;
}
if (msl < eng.stride) {
continue;
}
} else if (effLits < 20) {
ideal = 10;
} else if (effLits < 100) {
ideal = 11;
} else if (effLits < 1000) {
ideal = 12;
} else if (effLits < 10000) {
ideal = 13;
} else {
ideal = 15;
}
if (ideal != 8 && eng.schemeWidth == 32) {
ideal += 1;
}
u32 score = 100;
if (make_small) {
ideal -= 2;
}
score -= absdiff(desiredStride, eng.stride);
if (eng.stride > 1) {
ideal++;
}
if (eng.stride <= desiredStride) {
score += eng.stride;
}
DEBUG_PRINTF("effLits %u\n", effLits);
u32 effLits = vl.size(); /* * desiredStride;*/
u32 ideal;
if (effLits < eng.getNumBuckets()) {
if (eng.stride == 1) {
ideal = 8;
} else {
ideal = 10;
}
} else if (effLits < 20) {
ideal = 10;
} else if (effLits < 100) {
ideal = 11;
} else if (effLits < 1000) {
ideal = 12;
} else if (effLits < 10000) {
ideal = 13;
} else {
ideal = 15;
}
if (target.is_atom_class() && !make_small && effLits < 4000) {
/* Unless it is a very heavy case, we want to build smaller tables
* on lightweight machines due to their small caches. */
ideal -= 2;
}
if (ideal != 8 && eng.schemeWidth == 32) {
ideal += 1;
}
score -= absdiff(ideal, eng.bits);
if (make_small) {
ideal -= 2;
}
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
"-> score=%u\n",
eng.getID(), eng.schemeWidth, eng.bits,
eng.getNumBuckets(), eng.stride, score);
if (eng.stride > 1) {
ideal++;
}
if (!best || score > best_score) {
best = &eng;
best_score = score;
DEBUG_PRINTF("effLits %u\n", effLits);
if (target.is_atom_class() && !make_small && effLits < 4000) {
/* Unless it is a very heavy case, we want to build smaller tables
* on lightweight machines due to their small caches. */
ideal -= 2;
}
score -= absdiff(ideal, domain);
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
"-> score=%u\n",
eng.getID(), eng.schemeWidth, eng.bits,
eng.getNumBuckets(), eng.stride, score);
if (!best || score > best_score) {
eng.bits = domain;
best = &eng;
best_score = score;
}
}
}

View File

@ -43,7 +43,6 @@ struct FDREngineDef {
u32 schemeWidth;
u32 numBuckets;
u32 stride;
u32 bits;
u64a cpu_features;
u32 confirmPullBackDistance;
u32 confirmTopLevelSplit;

View File

@ -76,9 +76,11 @@ struct FDR {
* structures (spillover strings and hash table) if we're a secondary
* structure. */
u32 link;
u8 domain; /* dynamic domain info */
u8 schemeWidthByte; /* scheme width in bytes */
u16 domainMask; /* pre-computed domain mask */
u32 tabSize; /* pre-computed hashtable size in bytes */
u32 pad1;
u32 pad2;
u32 pad3;
union {
u32 s_u32;