rose: check literal bounds when building SB table

Literals that cannot lead to a report in the first ROSE_SMALL_BLOCK_LEN
bytes may be dropped from the small block table.
This commit is contained in:
Justin Viiret 2016-07-07 10:25:49 +10:00 committed by Matthew Barr
parent 9346a9090e
commit 39c6a0c7bf
3 changed files with 133 additions and 31 deletions

View File

@ -458,18 +458,6 @@ void dumpTestLiterals(const string &filename, const vector<hwlmLiteral> &lits) {
of.close(); of.close();
} }
namespace {
struct LongerThanLimit {
explicit LongerThanLimit(size_t len) : max_len(len) {}
bool operator()(const hwlmLiteral &lit) const {
return lit.s.length() > max_len;
}
private:
size_t max_len;
};
}
static static
void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) {
auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED); auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED);
@ -481,12 +469,10 @@ void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) {
lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED); lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED);
dumpTestLiterals(base + "rose_eod_test_literals.txt", lits); dumpTestLiterals(base + "rose_eod_test_literals.txt", lits);
lits = fillHamsterLiteralList(build, ROSE_FLOATING); lits = fillHamsterLiteralList(build, ROSE_FLOATING, ROSE_SMALL_BLOCK_LEN);
auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK,
ROSE_SMALL_BLOCK_LEN);
lits.insert(end(lits), begin(lits2), end(lits2)); lits.insert(end(lits), begin(lits2), end(lits2));
lits.erase(remove_if(lits.begin(), lits.end(),
LongerThanLimit(ROSE_SMALL_BLOCK_LEN)),
lits.end());
dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits);
} }

View File

@ -38,12 +38,14 @@
#include "hwlm/hwlm_build.h" #include "hwlm/hwlm_build.h"
#include "hwlm/hwlm_literal.h" #include "hwlm/hwlm_literal.h"
#include "nfa/castlecompile.h" #include "nfa/castlecompile.h"
#include "nfa/nfa_api_queue.h"
#include "util/charreach_util.h" #include "util/charreach_util.h"
#include "util/compile_context.h" #include "util/compile_context.h"
#include "util/compile_error.h" #include "util/compile_error.h"
#include "util/dump_charclass.h" #include "util/dump_charclass.h"
#include "util/report.h" #include "util/report.h"
#include "util/report_manager.h" #include "util/report_manager.h"
#include "util/verify_types.h"
#include "ue2common.h" #include "ue2common.h"
#include <iomanip> #include <iomanip>
@ -519,8 +521,111 @@ bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id,
return true; return true;
} }
static
const raw_puff &getChainedPuff(const RoseBuildImpl &build,
const Report &report) {
DEBUG_PRINTF("chained report, event %u\n", report.onmatch);
// MPV has already been moved to the outfixes vector.
assert(!build.mpv_outfix);
auto mpv_outfix_it = find_if(
begin(build.outfixes), end(build.outfixes),
[](const OutfixInfo &outfix) { return outfix.is_nonempty_mpv(); });
assert(mpv_outfix_it != end(build.outfixes));
const auto *mpv = mpv_outfix_it->mpv();
u32 puff_index = report.onmatch - MQE_TOP_FIRST;
assert(puff_index < mpv->triggered_puffettes.size());
return mpv->triggered_puffettes.at(puff_index);
}
/**
* \brief Returns a conservative estimate of the minimum offset at which the
* given literal can lead to a report.
*
* TODO: This could be made more precise by calculating a "distance to accept"
* for every vertex in the graph; right now we're only accurate for leaf nodes.
*/
static
u64a literalMinReportOffset(const RoseBuildImpl &build,
const rose_literal_id &lit,
const rose_literal_info &info) {
const auto &g = build.g;
const u32 lit_len = verify_u32(lit.elength());
u64a lit_min_offset = UINT64_MAX;
for (const auto &v : info.vertices) {
DEBUG_PRINTF("vertex %zu min_offset=%u\n", g[v].idx, g[v].min_offset);
u64a vert_offset = g[v].min_offset;
if (vert_offset >= lit_min_offset) {
continue;
}
u64a min_offset = UINT64_MAX;
for (const auto &id : g[v].reports) {
const Report &report = build.rm.getReport(id);
DEBUG_PRINTF("report id %u, min offset=%llu\n", id,
report.minOffset);
if (report.type == INTERNAL_ROSE_CHAIN) {
// This vertex triggers an MPV, which will fire reports after
// repeating for a while.
assert(report.minOffset == 0); // Should not have bounds.
const auto &puff = getChainedPuff(build, report);
DEBUG_PRINTF("chained puff repeats=%u\n", puff.repeats);
const Report &puff_report = build.rm.getReport(puff.report);
DEBUG_PRINTF("puff report %u, min offset=%llu\n", puff.report,
puff_report.minOffset);
min_offset = min(min_offset, max(vert_offset + puff.repeats,
puff_report.minOffset));
} else {
DEBUG_PRINTF("report min offset=%llu\n", report.minOffset);
min_offset = min(min_offset, max(vert_offset,
report.minOffset));
}
}
if (g[v].suffix) {
depth suffix_width = findMinWidth(g[v].suffix, g[v].suffix.top);
assert(suffix_width.is_reachable());
DEBUG_PRINTF("suffix with width %s\n", suffix_width.str().c_str());
min_offset = min(min_offset, vert_offset + suffix_width);
}
if (!isLeafNode(v, g) || min_offset == UINT64_MAX) {
min_offset = vert_offset;
}
lit_min_offset = min(lit_min_offset, min_offset);
}
// If this literal in the undelayed literal corresponding to some delayed
// literals, we must take their minimum offsets into account.
for (const u32 &delayed_id : info.delayed_ids) {
const auto &delayed_lit = build.literals.right.at(delayed_id);
const auto &delayed_info = build.literal_info.at(delayed_id);
u64a delayed_min_offset = literalMinReportOffset(build, delayed_lit,
delayed_info);
DEBUG_PRINTF("delayed_id=%u, min_offset = %llu\n", delayed_id,
delayed_min_offset);
lit_min_offset = min(lit_min_offset, delayed_min_offset);
}
// If we share a vertex with a shorter literal, our min offset might dip
// below the length of this one.
lit_min_offset = max(lit_min_offset, u64a{lit_len});
return lit_min_offset;
}
vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build, vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build,
rose_literal_table table) { rose_literal_table table,
u32 max_offset) {
vector<hwlmLiteral> lits; vector<hwlmLiteral> lits;
for (const auto &e : build.literals.right) { for (const auto &e : build.literals.right) {
@ -546,6 +651,15 @@ vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build,
DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str()); DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str());
if (max_offset != ROSE_BOUND_INF) {
u64a min_report = literalMinReportOffset(build, e.second, info);
if (min_report > max_offset) {
DEBUG_PRINTF("min report offset=%llu exceeds max_offset=%u\n",
min_report, max_offset);
continue;
}
}
const vector<u8> &msk = e.second.msk; const vector<u8> &msk = e.second.msk;
const vector<u8> &cmp = e.second.cmp; const vector<u8> &cmp = e.second.cmp;
@ -664,7 +778,8 @@ aligned_unique_ptr<HWLM> buildSmallBlockMatcher(const RoseBuildImpl &build,
return nullptr; return nullptr;
} }
auto lits = fillHamsterLiteralList(build, ROSE_FLOATING); auto lits = fillHamsterLiteralList(build, ROSE_FLOATING,
ROSE_SMALL_BLOCK_LEN);
if (lits.empty()) { if (lits.empty()) {
DEBUG_PRINTF("no floating table\n"); DEBUG_PRINTF("no floating table\n");
return nullptr; return nullptr;
@ -673,8 +788,8 @@ aligned_unique_ptr<HWLM> buildSmallBlockMatcher(const RoseBuildImpl &build,
return nullptr; return nullptr;
} }
auto anchored_lits = auto anchored_lits = fillHamsterLiteralList(build,
fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK); ROSE_ANCHORED_SMALL_BLOCK, ROSE_SMALL_BLOCK_LEN);
if (anchored_lits.empty()) { if (anchored_lits.empty()) {
DEBUG_PRINTF("no small-block anchored literals\n"); DEBUG_PRINTF("no small-block anchored literals\n");
return nullptr; return nullptr;
@ -682,15 +797,10 @@ aligned_unique_ptr<HWLM> buildSmallBlockMatcher(const RoseBuildImpl &build,
lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end()); lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end());
// Remove literals that are longer than our small block length, as they can // None of our literals should be longer than the small block limit.
// never match. TODO: improve by removing literals that have a min match assert(all_of(begin(lits), end(lits), [](const hwlmLiteral &lit) {
// offset greater than ROSE_SMALL_BLOCK_LEN, which will catch anchored cases return lit.s.length() <= ROSE_SMALL_BLOCK_LEN;
// with preceding dots that put them over the limit. }));
auto longer_than_limit = [](const hwlmLiteral &lit) {
return lit.s.length() > ROSE_SMALL_BLOCK_LEN;
};
lits.erase(remove_if(lits.begin(), lits.end(), longer_than_limit),
lits.end());
if (lits.empty()) { if (lits.empty()) {
DEBUG_PRINTF("no literals shorter than small block len\n"); DEBUG_PRINTF("no literals shorter than small block len\n");

View File

@ -44,8 +44,14 @@ namespace ue2 {
struct hwlmLiteral; struct hwlmLiteral;
/**
* \brief Build up a vector of literals for the given table.
*
* If max_offset is specified (and not ROSE_BOUND_INF), then literals that can
* only lead to a pattern match after max_offset may be excluded.
*/
std::vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build, std::vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build,
rose_literal_table table); rose_literal_table table, u32 max_offset = ROSE_BOUND_INF);
aligned_unique_ptr<HWLM> buildFloatingMatcher(const RoseBuildImpl &build, aligned_unique_ptr<HWLM> buildFloatingMatcher(const RoseBuildImpl &build,
rose_group *fgroups, rose_group *fgroups,