dfa: prune_overlong -> clear_deeper_reports

Rather than pruning states, we simply clear reports on states that are
too deep and allow Hopcroft minimisation to reduce the size of the DFA
afterwards.
This commit is contained in:
Justin Viiret 2017-04-13 13:18:33 +10:00 committed by Matthew Barr
parent 8650a1a33f
commit 41d7aa8281
3 changed files with 18 additions and 107 deletions

View File

@ -167,70 +167,8 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
return last_d;
}
/*
 * Builds the reverse adjacency of the DFA: preds[t] collects every state s
 * (indices 1 and up; state 0 is skipped, presumably because it is the dead
 * state) that has at least one transition into t. A source state is recorded
 * at most once per target, no matter how many symbols lead there.
 */
static
vector<vector<dstate_id_t>> find_in_edges(const raw_dfa &raw) {
    vector<vector<dstate_id_t>> preds(raw.states.size());

    /* targets already recorded for the source state currently being scanned */
    flat_set<dstate_id_t> targets_done;

    for (u32 src = 1; src < raw.states.size(); src++) {
        targets_done.clear();
        for (u32 sym = 0; sym < raw.alpha_size; sym++) {
            const dstate_id_t dst = raw.states[src].next[sym];
            /* only the first symbol leading to dst adds the edge */
            if (targets_done.insert(dst).second) {
                preds[dst].push_back(src);
            }
        }
    }

    return preds;
}
/*
 * Multi-source breadth-first search run backwards over in_edges. The search
 * is seeded with every state for which state_has_reports() holds (distance
 * 0); each other state receives the minimum number of transitions needed to
 * reach a seed. States the search never reaches keep the value ~0U, and
 * DEAD_STATE is never enqueued as a predecessor.
 */
static
vector<u32> calc_min_dist_to_accept(const raw_dfa &raw,
                        const vector<vector<dstate_id_t>> &in_edges) {
    const u32 unreached = ~0U;
    vector<u32> dist(raw.states.size(), unreached);

    /* seed the work queue with the reporting states */
    deque<dstate_id_t> work;
    for (u32 id = 0; id < raw.states.size(); id++) {
        if (!state_has_reports(raw, id)) {
            continue;
        }
        dist[id] = 0;
        work.push_back(id);
    }

    /* bfs: depths come off the queue in non-decreasing order */
    UNUSED u32 prev_depth = 0;
    for (; !work.empty(); work.pop_front()) {
        const dstate_id_t cur = work.front();
        assert(cur != DEAD_STATE);

        const u32 depth = dist[cur];
        assert(depth >= prev_depth);
        assert(depth != ~0U);

        for (const dstate_id_t pred : in_edges[cur]) {
            if (pred == DEAD_STATE) {
                continue;
            }
            if (dist[pred] != unreached) {
                /* already labelled by an earlier, no-deeper visit */
                assert(dist[pred] <= depth + 1);
                continue;
            }
            dist[pred] = depth + 1;
            work.push_back(pred);
        }

        prev_depth = depth;
    }

    return dist;
}
bool prune_overlong(raw_dfa &raw, u32 max_offset) {
DEBUG_PRINTF("pruning to at most %u\n", max_offset);
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) {
DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset);
vector<u32> bob_dist;
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
@ -238,47 +176,18 @@ bool prune_overlong(raw_dfa &raw, u32 max_offset) {
return false;
}
vector<u32> accept_dist = calc_min_dist_to_accept(raw, find_in_edges(raw));
/* look over the states and filter out any which cannot reach a report
* state before max_offset */
vector<dstate_id_t> new_ids(raw.states.size());
vector<dstate> new_states;
u32 count = 1;
new_states.push_back(raw.states[DEAD_STATE]);
bool changed = false;
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
if (bob_dist[s] + accept_dist[s] > max_offset) {
DEBUG_PRINTF("pruned %u: bob %u, report %u\n", s, bob_dist[s],
accept_dist[s]);
new_ids[s] = DEAD_STATE;
} else {
new_ids[s] = count++;
new_states.push_back(raw.states[s]);
assert(new_states.size() == count);
assert(new_ids[s] <= s);
if (bob_dist[s] > max_offset && state_has_reports(raw, s)) {
DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]);
auto &ds = raw.states[s];
ds.reports.clear();
ds.reports_eod.clear();
changed = true;
}
}
/* swap states */
DEBUG_PRINTF("pruned %zu -> %u\n", raw.states.size(), count);
raw.states = std::move(new_states);
new_states.clear();
/* update edges and daddys to refer to the new ids */
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
for (u32 j = 0; j < raw.alpha_size; j++) {
dstate_id_t old_t = raw.states[s].next[j];
raw.states[s].next[j] = new_ids[old_t];
}
raw.states[s].daddy = new_ids[raw.states[s].daddy];
}
/* update specials */
raw.start_floating = new_ids[raw.start_floating];
raw.start_anchored = new_ids[raw.start_anchored];
return true;
return changed;
}
set<ReportID> all_reports(const raw_dfa &rdfa) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -39,10 +39,12 @@ namespace ue2 {
u32 remove_leading_dots(raw_dfa &raw);
/**
* Prunes any states which cannot be reached within max_offset from start of
* stream. Returns false if no changes are made to the rdfa
* \brief Clear reports on any states that are deeper than \a max_offset from
* start of stream.
*
* Returns false if no changes are made to the DFA.
*/
bool prune_overlong(raw_dfa &raw, u32 max_offset);
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset);
std::set<ReportID> all_reports(const raw_dfa &rdfa);
bool has_eod_accepts(const raw_dfa &rdfa);

View File

@ -279,7 +279,7 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) {
return;
}
if (prune_overlong(*r, cc.grey.smallWriteLargestBuffer)) {
if (clear_deeper_reports(*r, cc.grey.smallWriteLargestBuffer)) {
minimize_hopcroft(*r, cc.grey);
}
@ -725,7 +725,7 @@ bytecode_ptr<NFA> prepEngine(raw_dfa &rdfa, u32 roseQuality,
if (*small_region <= *start_offset) {
return nullptr;
}
if (prune_overlong(rdfa, *small_region - *start_offset)) {
if (clear_deeper_reports(rdfa, *small_region - *start_offset)) {
minimize_hopcroft(rdfa, cc.grey);
if (rdfa.start_anchored == DEAD_STATE) {
DEBUG_PRINTF("all patterns pruned out\n");