mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
dfa: prune_overlong -> clear_deeper_reports
Rather than pruning states, we simply clear reports on states that are too deep and allow Hopcroft minimisation to reduce the size of the DFA afterwards.
This commit is contained in:
parent
8650a1a33f
commit
41d7aa8281
@ -167,70 +167,8 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
|
|||||||
return last_d;
|
return last_d;
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) {
|
||||||
vector<vector<dstate_id_t>> find_in_edges(const raw_dfa &raw) {
|
DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset);
|
||||||
vector<vector<dstate_id_t>> in_edges(raw.states.size());
|
|
||||||
flat_set<dstate_id_t> seen;
|
|
||||||
|
|
||||||
for (u32 s = 1; s < raw.states.size(); s++) {
|
|
||||||
seen.clear();
|
|
||||||
for (u32 j = 0; j < raw.alpha_size; j++) {
|
|
||||||
dstate_id_t t = raw.states[s].next[j];
|
|
||||||
if (!seen.insert(t).second) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
in_edges[t].push_back(s);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return in_edges;
|
|
||||||
}
|
|
||||||
|
|
||||||
static
|
|
||||||
vector<u32> calc_min_dist_to_accept(const raw_dfa &raw,
|
|
||||||
const vector<vector<dstate_id_t>> &in_edges) {
|
|
||||||
vector<u32> dist(raw.states.size(), ~0U);
|
|
||||||
|
|
||||||
/* for reporting states to start from */
|
|
||||||
deque<dstate_id_t> to_visit;
|
|
||||||
for (u32 s = 0; s < raw.states.size(); s++) {
|
|
||||||
if (state_has_reports(raw, s)) {
|
|
||||||
to_visit.push_back(s);
|
|
||||||
dist[s] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* bfs */
|
|
||||||
UNUSED u32 last_d = 0;
|
|
||||||
while (!to_visit.empty()) {
|
|
||||||
dstate_id_t s = to_visit.front();
|
|
||||||
to_visit.pop_front();
|
|
||||||
assert(s != DEAD_STATE);
|
|
||||||
|
|
||||||
u32 d = dist[s];
|
|
||||||
assert(d >= last_d);
|
|
||||||
assert(d != ~0U);
|
|
||||||
|
|
||||||
for (auto t : in_edges[s]) {
|
|
||||||
if (t == DEAD_STATE) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (dist[t] == ~0U) {
|
|
||||||
to_visit.push_back(t);
|
|
||||||
dist[t] = d + 1;
|
|
||||||
} else {
|
|
||||||
assert(dist[t] <= d + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
last_d = d;
|
|
||||||
}
|
|
||||||
|
|
||||||
return dist;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool prune_overlong(raw_dfa &raw, u32 max_offset) {
|
|
||||||
DEBUG_PRINTF("pruning to at most %u\n", max_offset);
|
|
||||||
vector<u32> bob_dist;
|
vector<u32> bob_dist;
|
||||||
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
|
u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist);
|
||||||
|
|
||||||
@ -238,47 +176,18 @@ bool prune_overlong(raw_dfa &raw, u32 max_offset) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<u32> accept_dist = calc_min_dist_to_accept(raw, find_in_edges(raw));
|
bool changed = false;
|
||||||
|
|
||||||
/* look over the states and filter out any which cannot reach a report
|
|
||||||
* states before max_offset */
|
|
||||||
vector<dstate_id_t> new_ids(raw.states.size());
|
|
||||||
vector<dstate> new_states;
|
|
||||||
u32 count = 1;
|
|
||||||
new_states.push_back(raw.states[DEAD_STATE]);
|
|
||||||
|
|
||||||
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
|
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
|
||||||
if (bob_dist[s] + accept_dist[s] > max_offset) {
|
if (bob_dist[s] > max_offset && state_has_reports(raw, s)) {
|
||||||
DEBUG_PRINTF("pruned %u: bob %u, report %u\n", s, bob_dist[s],
|
DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]);
|
||||||
accept_dist[s]);
|
auto &ds = raw.states[s];
|
||||||
new_ids[s] = DEAD_STATE;
|
ds.reports.clear();
|
||||||
} else {
|
ds.reports_eod.clear();
|
||||||
new_ids[s] = count++;
|
changed = true;
|
||||||
new_states.push_back(raw.states[s]);
|
|
||||||
assert(new_states.size() == count);
|
|
||||||
assert(new_ids[s] <= s);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* swap states */
|
return changed;
|
||||||
DEBUG_PRINTF("pruned %zu -> %u\n", raw.states.size(), count);
|
|
||||||
raw.states = std::move(new_states);
|
|
||||||
new_states.clear();
|
|
||||||
|
|
||||||
/* update edges and daddys to refer to the new ids */
|
|
||||||
for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) {
|
|
||||||
for (u32 j = 0; j < raw.alpha_size; j++) {
|
|
||||||
dstate_id_t old_t = raw.states[s].next[j];
|
|
||||||
raw.states[s].next[j] = new_ids[old_t];
|
|
||||||
}
|
|
||||||
raw.states[s].daddy = new_ids[raw.states[s].daddy];
|
|
||||||
}
|
|
||||||
|
|
||||||
/* update specials */
|
|
||||||
raw.start_floating = new_ids[raw.start_floating];
|
|
||||||
raw.start_anchored = new_ids[raw.start_anchored];
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
set<ReportID> all_reports(const raw_dfa &rdfa) {
|
set<ReportID> all_reports(const raw_dfa &rdfa) {
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015-2016, Intel Corporation
|
* Copyright (c) 2015-2017, Intel Corporation
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@ -39,10 +39,12 @@ namespace ue2 {
|
|||||||
u32 remove_leading_dots(raw_dfa &raw);
|
u32 remove_leading_dots(raw_dfa &raw);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prunes any states which cannot be reached within max_offset from start of
|
* \brief Clear reports on any states that are deeper than \a max_offset from
|
||||||
* stream. Returns false if no changes are made to the rdfa
|
* start of stream.
|
||||||
|
*
|
||||||
|
* Returns false if no changes are made to the DFA.
|
||||||
*/
|
*/
|
||||||
bool prune_overlong(raw_dfa &raw, u32 max_offset);
|
bool clear_deeper_reports(raw_dfa &raw, u32 max_offset);
|
||||||
|
|
||||||
std::set<ReportID> all_reports(const raw_dfa &rdfa);
|
std::set<ReportID> all_reports(const raw_dfa &rdfa);
|
||||||
bool has_eod_accepts(const raw_dfa &rdfa);
|
bool has_eod_accepts(const raw_dfa &rdfa);
|
||||||
|
@ -279,7 +279,7 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prune_overlong(*r, cc.grey.smallWriteLargestBuffer)) {
|
if (clear_deeper_reports(*r, cc.grey.smallWriteLargestBuffer)) {
|
||||||
minimize_hopcroft(*r, cc.grey);
|
minimize_hopcroft(*r, cc.grey);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -725,7 +725,7 @@ bytecode_ptr<NFA> prepEngine(raw_dfa &rdfa, u32 roseQuality,
|
|||||||
if (*small_region <= *start_offset) {
|
if (*small_region <= *start_offset) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
if (prune_overlong(rdfa, *small_region - *start_offset)) {
|
if (clear_deeper_reports(rdfa, *small_region - *start_offset)) {
|
||||||
minimize_hopcroft(rdfa, cc.grey);
|
minimize_hopcroft(rdfa, cc.grey);
|
||||||
if (rdfa.start_anchored == DEAD_STATE) {
|
if (rdfa.start_anchored == DEAD_STATE) {
|
||||||
DEBUG_PRINTF("all patterns pruned out\n");
|
DEBUG_PRINTF("all patterns pruned out\n");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user