From 39c6a0c7bfea6d8e4378b06bb8d13ec54b827b63 Mon Sep 17 00:00:00 2001 From: Justin Viiret Date: Thu, 7 Jul 2016 10:25:49 +1000 Subject: [PATCH] rose: check literal bounds when building SB table Literals that cannot lead to a report in the first ROSE_SMALL_BLOCK_LEN bytes may be dropped from the small block table. --- src/rose/rose_build_dump.cpp | 20 +---- src/rose/rose_build_matchers.cpp | 136 ++++++++++++++++++++++++++++--- src/rose/rose_build_matchers.h | 8 +- 3 files changed, 133 insertions(+), 31 deletions(-) diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 2c3f326e..fc60af4c 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -458,18 +458,6 @@ void dumpTestLiterals(const string &filename, const vector &lits) { of.close(); } -namespace { -struct LongerThanLimit { - explicit LongerThanLimit(size_t len) : max_len(len) {} - bool operator()(const hwlmLiteral &lit) const { - return lit.s.length() > max_len; - } - - private: - size_t max_len; -}; -} - static void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED); @@ -481,12 +469,10 @@ void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED); dumpTestLiterals(base + "rose_eod_test_literals.txt", lits); - lits = fillHamsterLiteralList(build, ROSE_FLOATING); - auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); + lits = fillHamsterLiteralList(build, ROSE_FLOATING, ROSE_SMALL_BLOCK_LEN); + auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, + ROSE_SMALL_BLOCK_LEN); lits.insert(end(lits), begin(lits2), end(lits2)); - lits.erase(remove_if(lits.begin(), lits.end(), - LongerThanLimit(ROSE_SMALL_BLOCK_LEN)), - lits.end()); dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); } diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 
498af2f0..7b20bd1c 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -38,12 +38,14 @@ #include "hwlm/hwlm_build.h" #include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" +#include "nfa/nfa_api_queue.h" #include "util/charreach_util.h" #include "util/compile_context.h" #include "util/compile_error.h" #include "util/dump_charclass.h" #include "util/report.h" #include "util/report_manager.h" +#include "util/verify_types.h" #include "ue2common.h" #include @@ -519,8 +521,111 @@ bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id, return true; } +static +const raw_puff &getChainedPuff(const RoseBuildImpl &build, + const Report &report) { + DEBUG_PRINTF("chained report, event %u\n", report.onmatch); + + // MPV has already been moved to the outfixes vector. + assert(!build.mpv_outfix); + + auto mpv_outfix_it = find_if( + begin(build.outfixes), end(build.outfixes), + [](const OutfixInfo &outfix) { return outfix.is_nonempty_mpv(); }); + assert(mpv_outfix_it != end(build.outfixes)); + const auto *mpv = mpv_outfix_it->mpv(); + + u32 puff_index = report.onmatch - MQE_TOP_FIRST; + assert(puff_index < mpv->triggered_puffettes.size()); + return mpv->triggered_puffettes.at(puff_index); +} + +/** + * \brief Returns a conservative estimate of the minimum offset at which the + * given literal can lead to a report. + * + * TODO: This could be made more precise by calculating a "distance to accept" + * for every vertex in the graph; right now we're only accurate for leaf nodes. 
+ */ +static +u64a literalMinReportOffset(const RoseBuildImpl &build, + const rose_literal_id &lit, + const rose_literal_info &info) { + const auto &g = build.g; + + const u32 lit_len = verify_u32(lit.elength()); + + u64a lit_min_offset = UINT64_MAX; + + for (const auto &v : info.vertices) { + DEBUG_PRINTF("vertex %zu min_offset=%u\n", g[v].idx, g[v].min_offset); + + u64a vert_offset = g[v].min_offset; + + if (vert_offset >= lit_min_offset) { + continue; + } + + u64a min_offset = UINT64_MAX; + + for (const auto &id : g[v].reports) { + const Report &report = build.rm.getReport(id); + DEBUG_PRINTF("report id %u, min offset=%llu\n", id, + report.minOffset); + if (report.type == INTERNAL_ROSE_CHAIN) { + // This vertex triggers an MPV, which will fire reports after + // repeating for a while. + assert(report.minOffset == 0); // Should not have bounds. + const auto &puff = getChainedPuff(build, report); + DEBUG_PRINTF("chained puff repeats=%u\n", puff.repeats); + const Report &puff_report = build.rm.getReport(puff.report); + DEBUG_PRINTF("puff report %u, min offset=%llu\n", puff.report, + puff_report.minOffset); + min_offset = min(min_offset, max(vert_offset + puff.repeats, + puff_report.minOffset)); + } else { + DEBUG_PRINTF("report min offset=%llu\n", report.minOffset); + min_offset = min(min_offset, max(vert_offset, + report.minOffset)); + } + } + + if (g[v].suffix) { + depth suffix_width = findMinWidth(g[v].suffix, g[v].suffix.top); + assert(suffix_width.is_reachable()); + DEBUG_PRINTF("suffix with width %s\n", suffix_width.str().c_str()); + min_offset = min(min_offset, vert_offset + suffix_width); + } + + if (!isLeafNode(v, g) || min_offset == UINT64_MAX) { + min_offset = vert_offset; + } + + lit_min_offset = min(lit_min_offset, min_offset); + } + + // If this literal is the undelayed literal corresponding to some delayed + // literals, we must take their minimum offsets into account. 
+ for (const u32 &delayed_id : info.delayed_ids) { + const auto &delayed_lit = build.literals.right.at(delayed_id); + const auto &delayed_info = build.literal_info.at(delayed_id); + u64a delayed_min_offset = literalMinReportOffset(build, delayed_lit, + delayed_info); + DEBUG_PRINTF("delayed_id=%u, min_offset = %llu\n", delayed_id, + delayed_min_offset); + lit_min_offset = min(lit_min_offset, delayed_min_offset); + } + + // If we share a vertex with a shorter literal, our min offset might dip + // below the length of this one. + lit_min_offset = max(lit_min_offset, u64a{lit_len}); + + return lit_min_offset; +} + vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table) { + rose_literal_table table, + u32 max_offset) { vector lits; for (const auto &e : build.literals.right) { @@ -546,6 +651,15 @@ vector fillHamsterLiteralList(const RoseBuildImpl &build, DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str()); + if (max_offset != ROSE_BOUND_INF) { + u64a min_report = literalMinReportOffset(build, e.second, info); + if (min_report > max_offset) { + DEBUG_PRINTF("min report offset=%llu exceeds max_offset=%u\n", + min_report, max_offset); + continue; + } + } + const vector &msk = e.second.msk; const vector &cmp = e.second.cmp; @@ -664,7 +778,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto lits = fillHamsterLiteralList(build, ROSE_FLOATING); + auto lits = fillHamsterLiteralList(build, ROSE_FLOATING, + ROSE_SMALL_BLOCK_LEN); if (lits.empty()) { DEBUG_PRINTF("no floating table\n"); return nullptr; @@ -673,8 +788,8 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto anchored_lits = - fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); + auto anchored_lits = fillHamsterLiteralList(build, + ROSE_ANCHORED_SMALL_BLOCK, ROSE_SMALL_BLOCK_LEN); if (anchored_lits.empty()) { DEBUG_PRINTF("no small-block anchored literals\n"); return nullptr; @@ -682,15 
+797,10 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end()); - // Remove literals that are longer than our small block length, as they can - // never match. TODO: improve by removing literals that have a min match - // offset greater than ROSE_SMALL_BLOCK_LEN, which will catch anchored cases - // with preceding dots that put them over the limit. - auto longer_than_limit = [](const hwlmLiteral &lit) { - return lit.s.length() > ROSE_SMALL_BLOCK_LEN; - }; - lits.erase(remove_if(lits.begin(), lits.end(), longer_than_limit), - lits.end()); + // None of our literals should be longer than the small block limit. + assert(all_of(begin(lits), end(lits), [](const hwlmLiteral &lit) { + return lit.s.length() <= ROSE_SMALL_BLOCK_LEN; + })); if (lits.empty()) { DEBUG_PRINTF("no literals shorter than small block len\n"); diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h index 7d5c9283..2a225bf5 100644 --- a/src/rose/rose_build_matchers.h +++ b/src/rose/rose_build_matchers.h @@ -44,8 +44,14 @@ namespace ue2 { struct hwlmLiteral; +/** + * \brief Build up a vector of literals for the given table. + * + * If max_offset is specified (and not ROSE_BOUND_INF), then literals that can + * only lead to a pattern match after max_offset may be excluded. + */ std::vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table); + rose_literal_table table, u32 max_offset = ROSE_BOUND_INF); aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, rose_group *fgroups,