diff --git a/src/nfa/mcclellancompile_util.cpp b/src/nfa/mcclellancompile_util.cpp index 7a3ceaf1..17e022fe 100644 --- a/src/nfa/mcclellancompile_util.cpp +++ b/src/nfa/mcclellancompile_util.cpp @@ -167,70 +167,8 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector *dist_in) { return last_d; } -static -vector> find_in_edges(const raw_dfa &raw) { - vector> in_edges(raw.states.size()); - flat_set seen; - - for (u32 s = 1; s < raw.states.size(); s++) { - seen.clear(); - for (u32 j = 0; j < raw.alpha_size; j++) { - dstate_id_t t = raw.states[s].next[j]; - if (!seen.insert(t).second) { - continue; - } - in_edges[t].push_back(s); - } - } - - return in_edges; -} - -static -vector calc_min_dist_to_accept(const raw_dfa &raw, - const vector> &in_edges) { - vector dist(raw.states.size(), ~0U); - - /* for reporting states to start from */ - deque to_visit; - for (u32 s = 0; s < raw.states.size(); s++) { - if (state_has_reports(raw, s)) { - to_visit.push_back(s); - dist[s] = 0; - } - } - - /* bfs */ - UNUSED u32 last_d = 0; - while (!to_visit.empty()) { - dstate_id_t s = to_visit.front(); - to_visit.pop_front(); - assert(s != DEAD_STATE); - - u32 d = dist[s]; - assert(d >= last_d); - assert(d != ~0U); - - for (auto t : in_edges[s]) { - if (t == DEAD_STATE) { - continue; - } - if (dist[t] == ~0U) { - to_visit.push_back(t); - dist[t] = d + 1; - } else { - assert(dist[t] <= d + 1); - } - } - - last_d = d; - } - - return dist; -} - -bool prune_overlong(raw_dfa &raw, u32 max_offset) { - DEBUG_PRINTF("pruning to at most %u\n", max_offset); +bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) { + DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset); vector bob_dist; u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist); @@ -238,47 +176,18 @@ bool prune_overlong(raw_dfa &raw, u32 max_offset) { return false; } - vector accept_dist = calc_min_dist_to_accept(raw, find_in_edges(raw)); - - /* look over the states and filter out any which cannot reach a report - * states before max_offset */ - vector new_ids(raw.states.size()); - vector new_states; - u32 count = 1; - new_states.push_back(raw.states[DEAD_STATE]); - + bool changed = false; for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) { - if (bob_dist[s] + accept_dist[s] > max_offset) { - DEBUG_PRINTF("pruned %u: bob %u, report %u\n", s, bob_dist[s], - accept_dist[s]); - new_ids[s] = DEAD_STATE; - } else { - new_ids[s] = count++; - new_states.push_back(raw.states[s]); - assert(new_states.size() == count); - assert(new_ids[s] <= s); + if (bob_dist[s] > max_offset && state_has_reports(raw, s)) { + DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]); + auto &ds = raw.states[s]; + ds.reports.clear(); + ds.reports_eod.clear(); + changed = true; } } - /* swap states */ - DEBUG_PRINTF("pruned %zu -> %u\n", raw.states.size(), count); - raw.states = std::move(new_states); - new_states.clear(); - - /* update edges and daddys to refer to the new ids */ - for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) { - for (u32 j = 0; j < raw.alpha_size; j++) { - dstate_id_t old_t = raw.states[s].next[j]; - raw.states[s].next[j] = new_ids[old_t]; - } - raw.states[s].daddy = new_ids[raw.states[s].daddy]; - } - - /* update specials */ - raw.start_floating = new_ids[raw.start_floating]; - raw.start_anchored = new_ids[raw.start_anchored]; - - return true; + return changed; } set all_reports(const raw_dfa &rdfa) { diff --git a/src/nfa/mcclellancompile_util.h b/src/nfa/mcclellancompile_util.h index 554c1efd..d681e06b 100644 --- a/src/nfa/mcclellancompile_util.h +++ b/src/nfa/mcclellancompile_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,10 +39,12 @@ namespace ue2 { u32 remove_leading_dots(raw_dfa &raw); /** - * Prunes any states which cannot be reached within max_offset from start of - * stream. Returns false if no changes are made to the rdfa + * \brief Clear reports on any states that are deeper than \a max_offset from + * start of stream. + * + * Returns false if no changes are made to the DFA. */ -bool prune_overlong(raw_dfa &raw, u32 max_offset); +bool clear_deeper_reports(raw_dfa &raw, u32 max_offset); std::set all_reports(const raw_dfa &rdfa); bool has_eod_accepts(const raw_dfa &rdfa); diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index a27db736..bcdd12bb 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -279,7 +279,7 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) { return; } - if (prune_overlong(*r, cc.grey.smallWriteLargestBuffer)) { + if (clear_deeper_reports(*r, cc.grey.smallWriteLargestBuffer)) { minimize_hopcroft(*r, cc.grey); } @@ -725,7 +725,7 @@ bytecode_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, if (*small_region <= *start_offset) { return nullptr; } - if (prune_overlong(rdfa, *small_region - *start_offset)) { + if (clear_deeper_reports(rdfa, *small_region - *start_offset)) { minimize_hopcroft(rdfa, cc.grey); if (rdfa.start_anchored == DEAD_STATE) { DEBUG_PRINTF("all patterns pruned out\n");