mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
dfa: prune_overlong -> clear_deeper_reports
Rather than pruning states, we simply clear reports on states that are too deep and allow Hopcroft minimisation to reduce the size of the DFA afterwards.
This commit is contained in:
parent
8650a1a33f
commit
41d7aa8281
@ -167,70 +167,8 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
|
||||
return last_d;
|
||||
}
|
||||
|
||||
static
|
||||
vector<vector<dstate_id_t>> find_in_edges(const raw_dfa &raw) {
|
||||
vector<vector<dstate_id_t>> in_edges(raw.states.size());
|
||||
flat_set<dstate_id_t> seen;
|
||||
|
||||
for (u32 s = 1; s < raw.states.size(); s++) {
|
||||
seen.clear();
|
||||
for (u32 j = 0; j < raw.alpha_size; j++) {
|
||||
dstate_id_t t = raw.states[s].next[j];
|
||||
if (!seen.insert(t).second) {
|
||||
continue;
|
||||
}
|
||||
in_edges[t].push_back(s);
|
||||
}
|
||||
}
|
||||
|
||||
return in_edges;
|
||||
}
|
||||
|
||||
static
|
||||
vector<u32> calc_min_dist_to_accept(const raw_dfa &raw,
|
||||
const vector<vector<dstate_id_t>> &in_edges) {
|
||||
vector<u32> dist(raw.states.size(), ~0U);
|
||||
|
||||
/* for reporting states to start from */
|
||||
deque<dstate_id_t> to_visit;
|
||||
for (u32 s = 0; s < raw.states.size(); s++) {
|
||||
if (state_has_reports(raw, s)) {
|
||||
to_visit.push_back(s);
|
||||
dist[s] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* bfs */
|
||||
UNUSED u32 last_d = 0;
|
||||
while (!to_visit.empty()) {
|
||||
dstate_id_t s = to_visit.front();
|
||||
to_visit.pop_front();
|
||||
assert(s != DEAD_STATE);
|
||||
|
||||
u32 d = dist[s];
|
||||
assert(d >= last_d);
|
||||
assert(d != ~0U);
|
||||
|
||||
for (auto t : in_edges[s]) {
|
||||
if (t == DEAD_STATE) {
|
||||
continue;
|
||||
}
|
||||
if (dist[t] == ~0U) {
|
||||
to_visit.push_back(t);
|
||||
dist[t] = d + 1;
|
||||
} else {
|
||||
assert(dist[t] <= d + 1);
|
||||
}
|
||||
}
|
||||
|
||||
last_d = d;
|
||||
}
|
||||
|
||||
return dist;
|
||||
}
|
||||
|
||||
bool prune_overlong(raw_dfa &raw, u32 max_offset) {
|
||||
DEBUG_PRINTF("pruning to at most %u\n", max_offset);
|
||||
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) {
|
||||
DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset);
|
||||
vector<u32> bob_dist;
|
||||
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
|
||||
|
||||
@ -238,47 +176,18 @@ bool prune_overlong(raw_dfa &raw, u32 max_offset) {
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<u32> accept_dist = calc_min_dist_to_accept(raw, find_in_edges(raw));
|
||||
|
||||
/* look over the states and filter out any which cannot reach a report
|
||||
* states before max_offset */
|
||||
vector<dstate_id_t> new_ids(raw.states.size());
|
||||
vector<dstate> new_states;
|
||||
u32 count = 1;
|
||||
new_states.push_back(raw.states[DEAD_STATE]);
|
||||
|
||||
bool changed = false;
|
||||
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
|
||||
if (bob_dist[s] + accept_dist[s] > max_offset) {
|
||||
DEBUG_PRINTF("pruned %u: bob %u, report %u\n", s, bob_dist[s],
|
||||
accept_dist[s]);
|
||||
new_ids[s] = DEAD_STATE;
|
||||
} else {
|
||||
new_ids[s] = count++;
|
||||
new_states.push_back(raw.states[s]);
|
||||
assert(new_states.size() == count);
|
||||
assert(new_ids[s] <= s);
|
||||
if (bob_dist[s] > max_offset && state_has_reports(raw, s)) {
|
||||
DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]);
|
||||
auto &ds = raw.states[s];
|
||||
ds.reports.clear();
|
||||
ds.reports_eod.clear();
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* swap states */
|
||||
DEBUG_PRINTF("pruned %zu -> %u\n", raw.states.size(), count);
|
||||
raw.states = std::move(new_states);
|
||||
new_states.clear();
|
||||
|
||||
/* update edges and daddys to refer to the new ids */
|
||||
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
|
||||
for (u32 j = 0; j < raw.alpha_size; j++) {
|
||||
dstate_id_t old_t = raw.states[s].next[j];
|
||||
raw.states[s].next[j] = new_ids[old_t];
|
||||
}
|
||||
raw.states[s].daddy = new_ids[raw.states[s].daddy];
|
||||
}
|
||||
|
||||
/* update specials */
|
||||
raw.start_floating = new_ids[raw.start_floating];
|
||||
raw.start_anchored = new_ids[raw.start_anchored];
|
||||
|
||||
return true;
|
||||
return changed;
|
||||
}
|
||||
|
||||
set<ReportID> all_reports(const raw_dfa &rdfa) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -39,10 +39,12 @@ namespace ue2 {
|
||||
u32 remove_leading_dots(raw_dfa &raw);
|
||||
|
||||
/**
|
||||
* Prunes any states which cannot be reached within max_offset from start of
|
||||
* stream. Returns false if no changes are made to the rdfa
|
||||
* \brief Clear reports on any states that are deeper than \a max_offset from
|
||||
* start of stream.
|
||||
*
|
||||
* Returns false if no changes are made to the DFA.
|
||||
*/
|
||||
bool prune_overlong(raw_dfa &raw, u32 max_offset);
|
||||
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset);
|
||||
|
||||
std::set<ReportID> all_reports(const raw_dfa &rdfa);
|
||||
bool has_eod_accepts(const raw_dfa &rdfa);
|
||||
|
@ -279,7 +279,7 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (prune_overlong(*r, cc.grey.smallWriteLargestBuffer)) {
|
||||
if (clear_deeper_reports(*r, cc.grey.smallWriteLargestBuffer)) {
|
||||
minimize_hopcroft(*r, cc.grey);
|
||||
}
|
||||
|
||||
@ -725,7 +725,7 @@ bytecode_ptr<NFA> prepEngine(raw_dfa &rdfa, u32 roseQuality,
|
||||
if (*small_region <= *start_offset) {
|
||||
return nullptr;
|
||||
}
|
||||
if (prune_overlong(rdfa, *small_region - *start_offset)) {
|
||||
if (clear_deeper_reports(rdfa, *small_region - *start_offset)) {
|
||||
minimize_hopcroft(rdfa, cc.grey);
|
||||
if (rdfa.start_anchored == DEAD_STATE) {
|
||||
DEBUG_PRINTF("all patterns pruned out\n");
|
||||
|
Loading…
x
Reference in New Issue
Block a user