mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
FDR runtime simplification
Removed static specialisation of domains.
This commit is contained in:
parent
abbd548899
commit
313822c157
@ -54,16 +54,11 @@ def produce_fdr_compiles(l):
|
||||
|
||||
def build_fdr_matchers():
|
||||
all_matchers = [ ]
|
||||
domains = [8, 10, 11, 12, 13]
|
||||
big_domains = [ 14, 15 ]
|
||||
strides = [ 1, 2, 4 ]
|
||||
|
||||
common = { "state_width" : 128, "num_buckets" : 8, "extract_frequency" : 8, "arch" : arch_x86_64 }
|
||||
for d in domains:
|
||||
all_matchers += [ M3(stride = 1, domain = d, **common) ]
|
||||
all_matchers += [ M3(stride = 2, domain = d, **common) ]
|
||||
all_matchers += [ M3(stride = 4, domain = d, **common) ]
|
||||
for d in big_domains:
|
||||
all_matchers += [ M3(stride = 1, domain = d, **common) ]
|
||||
for s in strides:
|
||||
all_matchers += [ M3(stride = s, **common) ]
|
||||
|
||||
return all_matchers
|
||||
|
||||
|
@ -40,27 +40,6 @@
|
||||
#include "fdr_confirm_runtime.h"
|
||||
#include "fdr_streaming_runtime.h"
|
||||
#include "fdr_loadval.h"
|
||||
|
||||
static really_inline UNUSED
|
||||
u32 getPreStartVal(const struct FDR_Runtime_Args *a, u32 numBits) {
|
||||
u32 r = 0;
|
||||
if (a->start_offset == 0) {
|
||||
if (numBits <= 8) {
|
||||
r = a->buf_history[a->len_history - 1];
|
||||
} else {
|
||||
r = a->buf_history[a->len_history - 1];
|
||||
r |= (a->buf[0] << 8);
|
||||
}
|
||||
} else {
|
||||
if (numBits <= 8) {
|
||||
r = a->buf[a->start_offset - 1];
|
||||
} else {
|
||||
r = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
|
||||
}
|
||||
}
|
||||
return r & ((1 << numBits) - 1);
|
||||
}
|
||||
|
||||
#include "fdr_autogen.c"
|
||||
|
||||
#define FAKE_HISTORY_SIZE 16
|
||||
|
@ -74,12 +74,12 @@ class ValueExtractStep(Step):
|
||||
dsb = m.datasize_bytes
|
||||
modval = offset % dsb
|
||||
|
||||
if m.domain > 8 and modval == dsb - 1:
|
||||
if modval == dsb - 1:
|
||||
# Case 1: reading more than one byte over the end of the bulk load
|
||||
|
||||
self.latency = 4
|
||||
if sub_load_cautious:
|
||||
code_string = "cautious_forward"
|
||||
code_string = "cautious_forward"
|
||||
else:
|
||||
code_string = "normal"
|
||||
load_string = m.single_load_type.load_expr_data(self.offset, code_string)
|
||||
@ -101,7 +101,7 @@ class ValueExtractStep(Step):
|
||||
temp_string = "(%s >> %d)" % (lb_var.name, modval*8 - m.reach_shift_adjust)
|
||||
|
||||
|
||||
init_string = "(%s) & 0x%x" % (temp_string, m.reach_mask)
|
||||
init_string = "(%s) & (domain_mask << %d)" % (temp_string, m.reach_shift_adjust)
|
||||
v_var = self.nv(m.value_extract_type, "v%d" % offset)
|
||||
self.val = v_var.gen_initializer_stmt(init_string)
|
||||
|
||||
@ -173,14 +173,10 @@ class ConfirmStep(Step):
|
||||
enable_confirmless = m.stride == 1, do_bailout = False)
|
||||
|
||||
class M3(MatcherBase):
|
||||
def get_hash_safety_parameters(self):
|
||||
h_size = self.single_load_type.size_in_bytes()
|
||||
return (0, h_size - 1)
|
||||
|
||||
def produce_compile_call(self):
|
||||
print " { %d, %d, %d, %d, %d, %s, %d, %d }," % (
|
||||
print " { %d, %d, %d, %d, %s, %d, %d }," % (
|
||||
self.id, self.state_width, self.num_buckets,
|
||||
self.stride, self.domain,
|
||||
self.stride,
|
||||
self.arch.target, self.conf_pull_back, self.conf_top_level_split)
|
||||
|
||||
def produce_main_loop(self, switch_variant = False):
|
||||
@ -192,8 +188,8 @@ class M3(MatcherBase):
|
||||
ctxt = CodeGenContext(self)
|
||||
|
||||
if switch_variant:
|
||||
print " ptr -= (iterBytes - dist);"
|
||||
print " { " # need an extra scope around switch variant to stop its globals escaping
|
||||
print " ptr -= (iterBytes - dist);"
|
||||
print " { " # need an extra scope around switch variant to stop its globals escaping
|
||||
else:
|
||||
print " if (doMainLoop) {"
|
||||
print " for (; ptr + LOOP_READ_AHEAD < buf + len; ptr += iterBytes) {"
|
||||
@ -349,25 +345,30 @@ class M3(MatcherBase):
|
||||
shift_expr = "%s = %s" % (state.name, state.type.shift_expr(state.name, shift_distance))
|
||||
|
||||
s = Template("""
|
||||
$TYPENAME s;
|
||||
if (a->len_history) {
|
||||
u32 tmp = getPreStartVal(a, $DOMAIN);
|
||||
s = *((const $TYPENAME *)ft + tmp);
|
||||
$SHIFT_EXPR;
|
||||
} else {
|
||||
s = *(const $TYPENAME *)&fdr->start;
|
||||
}
|
||||
$TYPENAME s;
|
||||
if (a->len_history) {
|
||||
u32 tmp = 0;
|
||||
if (a->start_offset == 0) {
|
||||
tmp = a->buf_history[a->len_history - 1];
|
||||
tmp |= (a->buf[0] << 8);
|
||||
} else {
|
||||
tmp = lv_u16(a->buf + a->start_offset - 1, a->buf, a->buf + a->len);
|
||||
}
|
||||
tmp &= fdr->domainMask;
|
||||
s = *((const $TYPENAME *)ft + tmp);
|
||||
$SHIFT_EXPR;
|
||||
} else {
|
||||
s = *(const $TYPENAME *)&fdr->start;
|
||||
}
|
||||
""").substitute(TYPENAME = s_type.get_name(),
|
||||
ZERO_EXPR = s_type.zero_expression(),
|
||||
DOMAIN = self.domain,
|
||||
SHIFT_EXPR = shift_expr)
|
||||
return s
|
||||
|
||||
def produce_code(self):
|
||||
|
||||
(behind, ahead) = self.get_hash_safety_parameters()
|
||||
loop_read_behind = behind
|
||||
loop_read_ahead = self.loop_bytes + ahead
|
||||
loop_read_behind = 0
|
||||
loop_read_ahead = self.loop_bytes + 1
|
||||
|
||||
# we set up mask and shift stuff for extracting our masks from registers
|
||||
#
|
||||
@ -380,7 +381,7 @@ class M3(MatcherBase):
|
||||
ssb = self.state_type.size / 8 # state size in bytes
|
||||
|
||||
# Intel path
|
||||
if ssb == 16 and self.domain == 16:
|
||||
if ssb == 16:
|
||||
# obscure corner - we don't have the room in the register to
|
||||
# do this for all values so we don't. domain==16 is pretty
|
||||
# bad anyhow, of course
|
||||
@ -390,7 +391,6 @@ class M3(MatcherBase):
|
||||
|
||||
shift_amts = { 1 : 0, 2 : 1, 4 : 2, 8 : 3, 16: 4 }
|
||||
self.reach_shift_adjust = shift_amts[ ssb/self.reach_mult ]
|
||||
self.reach_mask = ((1 << self.domain) - 1) << self.reach_shift_adjust
|
||||
|
||||
print self.produce_header(visible = False)
|
||||
|
||||
@ -398,21 +398,19 @@ class M3(MatcherBase):
|
||||
print " Arch: " + self.arch.name,
|
||||
print " State type: " + self.state_type.get_name(),
|
||||
print " Num buckets: %d" % self.num_buckets,
|
||||
print " Domain: %d" % self.domain,
|
||||
print " Stride: %d" % self.stride
|
||||
|
||||
print self.produce_common_declarations()
|
||||
print
|
||||
|
||||
print "\tconst size_t tabSize = %d;" % self.table_size
|
||||
print """
|
||||
const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));
|
||||
const u32 * confBase = (const u32 *)(ft + tabSize);
|
||||
"""
|
||||
print " assert(fdr->domain > 8 && fdr->domain < 16);"
|
||||
print
|
||||
print " u64a domain_mask = fdr->domainMask;"
|
||||
print " const u8 * ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR));"
|
||||
print " const u32 * confBase = (const u32 *)(ft + fdr->tabSize);"
|
||||
print self.produce_init_state()
|
||||
print "\tconst size_t iterBytes = %d;" % self.loop_bytes
|
||||
print "\tconst size_t START_MOD = %d;" % self.datasize_bytes
|
||||
print "\tconst size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
|
||||
print " const size_t iterBytes = %d;" % self.loop_bytes
|
||||
print " const size_t START_MOD = %d;" % self.datasize_bytes
|
||||
print " const size_t LOOP_READ_AHEAD = %d;" % loop_read_ahead
|
||||
|
||||
print """
|
||||
while (ptr < buf + len) {
|
||||
@ -451,9 +449,9 @@ class M3(MatcherBase):
|
||||
print self.produce_footer()
|
||||
|
||||
def get_name(self):
|
||||
return "fdr_exec_%s_d%d_s%d_w%d" % (self.arch.name, self.domain, self.stride, self.state_width)
|
||||
return "fdr_exec_%s_s%d_w%d" % (self.arch.name, self.stride, self.state_width)
|
||||
|
||||
def __init__(self, state_width, domain, stride,
|
||||
def __init__(self, state_width, stride,
|
||||
arch,
|
||||
table_state_width = None,
|
||||
num_buckets = 8,
|
||||
@ -474,17 +472,9 @@ class M3(MatcherBase):
|
||||
self.table_state_width = state_width
|
||||
self.table_state_type = getRequiredType(self.table_state_width)
|
||||
|
||||
# domain is the number of bits that we draw from our input to
|
||||
# index our 'reach' table
|
||||
if not 8 <= domain <= 16:
|
||||
fail_out("Unsupported domain: %d" % domain)
|
||||
self.domain = domain
|
||||
# this is the load type required for this domain if we want to
|
||||
# this is the load type required for domain [9:15] if we want to
|
||||
# load it one at a time
|
||||
self.single_load_type = getRequiredType(self.domain)
|
||||
|
||||
# table size
|
||||
self.table_size = 2**domain * table_state_width // 8
|
||||
self.single_load_type = IntegerType(16)
|
||||
|
||||
# stride is the frequency with which we make data-driven
|
||||
# accesses to our reach table
|
||||
|
@ -184,6 +184,13 @@ aligned_unique_ptr<FDR> FDRCompiler::setupFDR(pair<u8 *, size_t> link) {
|
||||
ptr += floodControlTmp.second;
|
||||
aligned_free(floodControlTmp.first);
|
||||
|
||||
/* we are allowing domains 9 to 15 only */
|
||||
assert(eng.bits > 8 && eng.bits < 16);
|
||||
fdr->domain = eng.bits;
|
||||
fdr->schemeWidthByte = eng.schemeWidth / 8;
|
||||
fdr->domainMask = (1 << eng.bits) - 1;
|
||||
fdr->tabSize = (1 << eng.bits) * fdr->schemeWidthByte;
|
||||
|
||||
if (link.first) {
|
||||
fdr->link = verify_u32(ptr - fdr_base);
|
||||
memcpy(ptr, link.first, link.second);
|
||||
@ -534,6 +541,11 @@ fdrBuildTableInternal(const vector<hwlmLiteral> &lits, bool make_small,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// temporary hack for unit testing
|
||||
if (hint != HINT_INVALID) {
|
||||
des->bits = 9;
|
||||
}
|
||||
|
||||
FDRCompiler fc(lits, *des, make_small);
|
||||
return fc.build(link);
|
||||
}
|
||||
|
@ -81,6 +81,7 @@ void fdrPrintStats(const FDR *fdr, FILE *f) {
|
||||
unique_ptr<FDREngineDescription> des =
|
||||
getFdrDescription(fdr->engineID);
|
||||
if (des) {
|
||||
fprintf(f, " domain %u\n", des->bits);
|
||||
fprintf(f, " stride %u\n", des->stride);
|
||||
fprintf(f, " buckets %u\n", des->getNumBuckets());
|
||||
fprintf(f, " width %u\n", des->schemeWidth);
|
||||
|
@ -48,7 +48,7 @@ FDREngineDescription::FDREngineDescription(const FDREngineDef &def)
|
||||
: EngineDescription(def.id, targetByArchFeatures(def.cpu_features),
|
||||
def.numBuckets, def.confirmPullBackDistance,
|
||||
def.confirmTopLevelSplit),
|
||||
schemeWidth(def.schemeWidth), stride(def.stride), bits(def.bits) {}
|
||||
schemeWidth(def.schemeWidth), stride(def.stride), bits(0) {}
|
||||
|
||||
u32 FDREngineDescription::getDefaultFloodSuffixLength() const {
|
||||
// rounding up, so that scheme width 32 and 6 buckets is 6 not 5!
|
||||
@ -105,76 +105,83 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
|
||||
DEBUG_PRINTF("%zu lits, msl=%zu, desiredStride=%u\n", vl.size(), msl,
|
||||
desiredStride);
|
||||
|
||||
const FDREngineDescription *best = nullptr;
|
||||
FDREngineDescription *best = nullptr;
|
||||
u32 best_score = 0;
|
||||
|
||||
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
|
||||
const FDREngineDescription &eng = allDescs[engineID];
|
||||
if (!eng.isValidOnTarget(target)) {
|
||||
continue;
|
||||
}
|
||||
if (msl < eng.stride) {
|
||||
continue;
|
||||
}
|
||||
|
||||
u32 score = 100;
|
||||
|
||||
score -= absdiff(desiredStride, eng.stride);
|
||||
|
||||
if (eng.stride <= desiredStride) {
|
||||
score += eng.stride;
|
||||
}
|
||||
|
||||
u32 effLits = vl.size(); /* * desiredStride;*/
|
||||
u32 ideal;
|
||||
if (effLits < eng.getNumBuckets()) {
|
||||
if (eng.stride == 1) {
|
||||
ideal = 8;
|
||||
} else {
|
||||
ideal = 10;
|
||||
for (u32 domain = 9; domain <= 15; domain++) {
|
||||
for (size_t engineID = 0; engineID < allDescs.size(); engineID++) {
|
||||
// to make sure that domains >=14 have stride 1 according to origin
|
||||
if (domain > 13 && engineID > 0) {
|
||||
continue;
|
||||
}
|
||||
FDREngineDescription &eng = allDescs[engineID];
|
||||
if (!eng.isValidOnTarget(target)) {
|
||||
continue;
|
||||
}
|
||||
if (msl < eng.stride) {
|
||||
continue;
|
||||
}
|
||||
} else if (effLits < 20) {
|
||||
ideal = 10;
|
||||
} else if (effLits < 100) {
|
||||
ideal = 11;
|
||||
} else if (effLits < 1000) {
|
||||
ideal = 12;
|
||||
} else if (effLits < 10000) {
|
||||
ideal = 13;
|
||||
} else {
|
||||
ideal = 15;
|
||||
}
|
||||
|
||||
if (ideal != 8 && eng.schemeWidth == 32) {
|
||||
ideal += 1;
|
||||
}
|
||||
u32 score = 100;
|
||||
|
||||
if (make_small) {
|
||||
ideal -= 2;
|
||||
}
|
||||
score -= absdiff(desiredStride, eng.stride);
|
||||
|
||||
if (eng.stride > 1) {
|
||||
ideal++;
|
||||
}
|
||||
if (eng.stride <= desiredStride) {
|
||||
score += eng.stride;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("effLits %u\n", effLits);
|
||||
u32 effLits = vl.size(); /* * desiredStride;*/
|
||||
u32 ideal;
|
||||
if (effLits < eng.getNumBuckets()) {
|
||||
if (eng.stride == 1) {
|
||||
ideal = 8;
|
||||
} else {
|
||||
ideal = 10;
|
||||
}
|
||||
} else if (effLits < 20) {
|
||||
ideal = 10;
|
||||
} else if (effLits < 100) {
|
||||
ideal = 11;
|
||||
} else if (effLits < 1000) {
|
||||
ideal = 12;
|
||||
} else if (effLits < 10000) {
|
||||
ideal = 13;
|
||||
} else {
|
||||
ideal = 15;
|
||||
}
|
||||
|
||||
if (target.is_atom_class() && !make_small && effLits < 4000) {
|
||||
/* Unless it is a very heavy case, we want to build smaller tables
|
||||
* on lightweight machines due to their small caches. */
|
||||
ideal -= 2;
|
||||
}
|
||||
if (ideal != 8 && eng.schemeWidth == 32) {
|
||||
ideal += 1;
|
||||
}
|
||||
|
||||
score -= absdiff(ideal, eng.bits);
|
||||
if (make_small) {
|
||||
ideal -= 2;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
|
||||
"-> score=%u\n",
|
||||
eng.getID(), eng.schemeWidth, eng.bits,
|
||||
eng.getNumBuckets(), eng.stride, score);
|
||||
if (eng.stride > 1) {
|
||||
ideal++;
|
||||
}
|
||||
|
||||
if (!best || score > best_score) {
|
||||
best = &eng;
|
||||
best_score = score;
|
||||
DEBUG_PRINTF("effLits %u\n", effLits);
|
||||
|
||||
if (target.is_atom_class() && !make_small && effLits < 4000) {
|
||||
/* Unless it is a very heavy case, we want to build smaller tables
|
||||
* on lightweight machines due to their small caches. */
|
||||
ideal -= 2;
|
||||
}
|
||||
|
||||
score -= absdiff(ideal, domain);
|
||||
|
||||
DEBUG_PRINTF("fdr %u: width=%u, bits=%u, buckets=%u, stride=%u "
|
||||
"-> score=%u\n",
|
||||
eng.getID(), eng.schemeWidth, eng.bits,
|
||||
eng.getNumBuckets(), eng.stride, score);
|
||||
|
||||
if (!best || score > best_score) {
|
||||
eng.bits = domain;
|
||||
best = &eng;
|
||||
best_score = score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -43,7 +43,6 @@ struct FDREngineDef {
|
||||
u32 schemeWidth;
|
||||
u32 numBuckets;
|
||||
u32 stride;
|
||||
u32 bits;
|
||||
u64a cpu_features;
|
||||
u32 confirmPullBackDistance;
|
||||
u32 confirmTopLevelSplit;
|
||||
|
@ -76,9 +76,11 @@ struct FDR {
|
||||
* structures (spillover strings and hash table) if we're a secondary
|
||||
* structure. */
|
||||
u32 link;
|
||||
u8 domain; /* dynamic domain info */
|
||||
u8 schemeWidthByte; /* scheme width in bytes */
|
||||
u16 domainMask; /* pre-computed domain mask */
|
||||
u32 tabSize; /* pre-computed hashtable size in bytes */
|
||||
u32 pad1;
|
||||
u32 pad2;
|
||||
u32 pad3;
|
||||
|
||||
union {
|
||||
u32 s_u32;
|
||||
|
Loading…
x
Reference in New Issue
Block a user