dfa: prune_overlong -> clear_deeper_reports

Rather than pruning states, we simply clear reports on states that are
too deep and allow Hopcroft minimisation to reduce the size of the DFA
afterwards.
This commit is contained in:
Justin Viiret 2017-04-13 13:18:33 +10:00 committed by Matthew Barr
parent 8650a1a33f
commit 41d7aa8281
3 changed files with 18 additions and 107 deletions

View File

@ -167,70 +167,8 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
return last_d; return last_d;
} }
static bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) {
vector<vector<dstate_id_t>> find_in_edges(const raw_dfa &raw) { DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset);
vector<vector<dstate_id_t>> in_edges(raw.states.size());
flat_set<dstate_id_t> seen;
for (u32 s = 1; s < raw.states.size(); s++) {
seen.clear();
for (u32 j = 0; j < raw.alpha_size; j++) {
dstate_id_t t = raw.states[s].next[j];
if (!seen.insert(t).second) {
continue;
}
in_edges[t].push_back(s);
}
}
return in_edges;
}
/**
 * \brief Compute, for every state, the minimum number of transitions needed
 * to reach a state that has reports.
 *
 * Runs a multi-source breadth-first search backwards over the DFA: every
 * state for which state_has_reports() is true starts at distance 0, and the
 * search then walks \a in_edges to the states that lead into them.
 *
 * \param raw       DFA whose states are examined.
 * \param in_edges  Indexed by state id; in_edges[s] is walked as the list of
 *                  states with a transition into s (the caller passes the
 *                  result of find_in_edges(raw)).
 * \return Vector indexed by state id. An entry is left at ~0U if the search
 *         never reaches that state, i.e. no reporting state is reachable
 *         from it via the supplied edges.
 */
static
vector<u32> calc_min_dist_to_accept(const raw_dfa &raw,
const vector<vector<dstate_id_t>> &in_edges) {
/* ~0U doubles as the "not visited yet" marker */
vector<u32> dist(raw.states.size(), ~0U);
/* seed the queue with every reporting state, each at distance 0 */
deque<dstate_id_t> to_visit;
for (u32 s = 0; s < raw.states.size(); s++) {
if (state_has_reports(raw, s)) {
to_visit.push_back(s);
dist[s] = 0;
}
}
/* bfs; last_d is only read by the asserts below (UNUSED presumably silences
 * the unused-variable warning when asserts are compiled out) */
UNUSED u32 last_d = 0;
while (!to_visit.empty()) {
dstate_id_t s = to_visit.front();
to_visit.pop_front();
/* the dead state is never queued: it is skipped as a predecessor below and
 * is expected not to carry reports */
assert(s != DEAD_STATE);
u32 d = dist[s];
/* FIFO order means states are dequeued in non-decreasing distance */
assert(d >= last_d);
assert(d != ~0U);
for (auto t : in_edges[s]) {
if (t == DEAD_STATE) {
continue;
}
if (dist[t] == ~0U) {
/* first visit gives the shortest distance for t */
to_visit.push_back(t);
dist[t] = d + 1;
} else {
/* already visited: its recorded distance cannot be worse than this path */
assert(dist[t] <= d + 1);
}
}
last_d = d;
}
return dist;
}
bool prune_overlong(raw_dfa &raw, u32 max_offset) {
DEBUG_PRINTF("pruning to at most %u\n", max_offset);
vector<u32> bob_dist; vector<u32> bob_dist;
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist); u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
@ -238,47 +176,18 @@ bool prune_overlong(raw_dfa &raw, u32 max_offset) {
return false; return false;
} }
vector<u32> accept_dist = calc_min_dist_to_accept(raw, find_in_edges(raw)); bool changed = false;
/* look over the states and filter out any which cannot reach a report
* state before max_offset */
vector<dstate_id_t> new_ids(raw.states.size());
vector<dstate> new_states;
u32 count = 1;
new_states.push_back(raw.states[DEAD_STATE]);
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) { for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
if (bob_dist[s] + accept_dist[s] > max_offset) { if (bob_dist[s] > max_offset && state_has_reports(raw, s)) {
DEBUG_PRINTF("pruned %u: bob %u, report %u\n", s, bob_dist[s], DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]);
accept_dist[s]); auto &ds = raw.states[s];
new_ids[s] = DEAD_STATE; ds.reports.clear();
} else { ds.reports_eod.clear();
new_ids[s] = count++; changed = true;
new_states.push_back(raw.states[s]);
assert(new_states.size() == count);
assert(new_ids[s] <= s);
} }
} }
/* swap states */ return changed;
DEBUG_PRINTF("pruned %zu -> %u\n", raw.states.size(), count);
raw.states = std::move(new_states);
new_states.clear();
/* update edges and daddys to refer to the new ids */
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
for (u32 j = 0; j < raw.alpha_size; j++) {
dstate_id_t old_t = raw.states[s].next[j];
raw.states[s].next[j] = new_ids[old_t];
}
raw.states[s].daddy = new_ids[raw.states[s].daddy];
}
/* update specials */
raw.start_floating = new_ids[raw.start_floating];
raw.start_anchored = new_ids[raw.start_anchored];
return true;
} }
set<ReportID> all_reports(const raw_dfa &rdfa) { set<ReportID> all_reports(const raw_dfa &rdfa) {

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2017, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -39,10 +39,12 @@ namespace ue2 {
u32 remove_leading_dots(raw_dfa &raw); u32 remove_leading_dots(raw_dfa &raw);
/** /**
* Prunes any states which cannot be reached within max_offset from start of * \brief Clear reports on any states that are deeper than \a max_offset from
* stream. Returns false if no changes are made to the rdfa * start of stream.
*
* Returns false if no changes are made to the DFA.
*/ */
bool prune_overlong(raw_dfa &raw, u32 max_offset); bool clear_deeper_reports(raw_dfa &raw, u32 max_offset);
std::set<ReportID> all_reports(const raw_dfa &rdfa); std::set<ReportID> all_reports(const raw_dfa &rdfa);
bool has_eod_accepts(const raw_dfa &rdfa); bool has_eod_accepts(const raw_dfa &rdfa);

View File

@ -279,7 +279,7 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) {
return; return;
} }
if (prune_overlong(*r, cc.grey.smallWriteLargestBuffer)) { if (clear_deeper_reports(*r, cc.grey.smallWriteLargestBuffer)) {
minimize_hopcroft(*r, cc.grey); minimize_hopcroft(*r, cc.grey);
} }
@ -725,7 +725,7 @@ bytecode_ptr<NFA> prepEngine(raw_dfa &rdfa, u32 roseQuality,
if (*small_region <= *start_offset) { if (*small_region <= *start_offset) {
return nullptr; return nullptr;
} }
if (prune_overlong(rdfa, *small_region - *start_offset)) { if (clear_deeper_reports(rdfa, *small_region - *start_offset)) {
minimize_hopcroft(rdfa, cc.grey); minimize_hopcroft(rdfa, cc.grey);
if (rdfa.start_anchored == DEAD_STATE) { if (rdfa.start_anchored == DEAD_STATE) {
DEBUG_PRINTF("all patterns pruned out\n"); DEBUG_PRINTF("all patterns pruned out\n");