/* * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "rdfa_merge.h" #include "grey.h" #include "dfa_min.h" #include "mcclellancompile_util.h" #include "rdfa.h" #include "ue2common.h" #include "nfagraph/ng_mcclellan_internal.h" #include "util/container.h" #include "util/determinise.h" #include "util/flat_containers.h" #include "util/report_manager.h" #include "util/unordered.h" #include #include using namespace std; namespace ue2 { #define MAX_DFA_STATES 16383 namespace { class Automaton_Merge { public: using StateSet = vector; using StateMap = ue2_unordered_map; Automaton_Merge(const raw_dfa *rdfa1, const raw_dfa *rdfa2, const ReportManager *rm_in, const Grey &grey_in) : rm(rm_in), grey(grey_in), nfas{rdfa1, rdfa2}, dead(2) { calculateAlphabet(); populateAsFs(); prunable = isPrunable(); } Automaton_Merge(const vector &dfas, const ReportManager *rm_in, const Grey &grey_in) : rm(rm_in), grey(grey_in), nfas(dfas), dead(nfas.size()) { calculateAlphabet(); populateAsFs(); prunable = isPrunable(); } void populateAsFs(void) { bool fs_same = true; bool fs_dead = true; as.resize(nfas.size()); fs.resize(nfas.size()); for (size_t i = 0, end = nfas.size(); i < end; i++) { as[i] = nfas[i]->start_anchored; fs[i] = nfas[i]->start_floating; if (fs[i]) { fs_dead = false; } if (as[i] != fs[i]) { fs_same = false; } } start_anchored = DEAD_STATE + 1; if (fs_same) { start_floating = start_anchored; } else if (fs_dead) { start_floating = DEAD_STATE; } else { start_floating = start_anchored + 1; } } void calculateAlphabet(void) { DEBUG_PRINTF("calculating alphabet\n"); vector esets = {CharReach::dot()}; for (const auto &rdfa : nfas) { DEBUG_PRINTF("...next dfa alphabet\n"); assert(rdfa); const auto &alpha_remap = rdfa->alpha_remap; for (size_t i = 0; i < esets.size(); i++) { assert(esets[i].count()); if (esets[i].count() == 1) { DEBUG_PRINTF("skipping singleton eq set\n"); continue; } CharReach t; u8 leader_s = alpha_remap[esets[i].find_first()]; DEBUG_PRINTF("checking eq set, leader %02hhx \n", leader_s); for (size_t s = esets[i].find_first(); s != CharReach::npos; s = esets[i].find_next(s)) { if (alpha_remap[s] != leader_s) { t.set(s); } } if (t.any() && t != esets[i]) { esets[i] &= ~t; esets.emplace_back(t); } } } // Sort so that our alphabet mapping isn't dependent on the order of // rdfas passed in. sort(esets.begin(), esets.end()); alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha); } bool isPrunable() const { if (!grey.highlanderPruneDFA || !rm) { DEBUG_PRINTF("disabled, or not managed reports\n"); return false; } assert(!nfas.empty()); if (!generates_callbacks(nfas.front()->kind)) { DEBUG_PRINTF("doesn't generate callbacks\n"); return false; } // Collect all reports from all merge candidates. flat_set merge_reports; for (const auto &rdfa : nfas) { insert(&merge_reports, all_reports(*rdfa)); } DEBUG_PRINTF("all reports: %s\n", as_string_list(merge_reports).c_str()); // Return true if they're all exhaustible with the same exhaustion key. u32 ekey = INVALID_EKEY; for (const auto &report_id : merge_reports) { const Report &r = rm->getReport(report_id); if (!isSimpleExhaustible(r)) { DEBUG_PRINTF("report %u not simple exhaustible\n", report_id); return false; } assert(r.ekey != INVALID_EKEY); if (ekey == INVALID_EKEY) { ekey = r.ekey; } else if (ekey != r.ekey) { DEBUG_PRINTF("two different ekeys, %u and %u\n", ekey, r.ekey); return false; } } DEBUG_PRINTF("is prunable\n"); return true; } void transition(const StateSet &in, StateSet *next) { u16 t[ALPHABET_SIZE]; for (u32 i = 0; i < alphasize; i++) { next[i].resize(nfas.size()); } for (size_t j = 0, j_end = nfas.size(); j < j_end; j++) { getFullTransitionFromState(*nfas[j], in[j], t); for (u32 i = 0; i < alphasize; i++) { next[i][j] = t[unalpha[i]]; } } } const vector initial() { vector rv = {as}; if (start_floating != DEAD_STATE && start_floating != start_anchored) { rv.emplace_back(fs); } return rv; } private: void reports_i(const StateSet &in, flat_set dstate::*r_set, flat_set &r) const { for (size_t i = 0, end = nfas.size(); i < end; i++) { const auto &rs = nfas[i]->states[in[i]].*r_set; insert(&r, rs); } } public: void reports(const StateSet &in, flat_set &rv) const { reports_i(in, &dstate::reports, rv); } void reportsEod(const StateSet &in, flat_set &rv) const { reports_i(in, &dstate::reports_eod, rv); } bool canPrune(const flat_set &test_reports) const { if (!grey.highlanderPruneDFA || !prunable) { return false; } // Must all be external reports. assert(rm); for (const auto &report_id : test_reports) { if (!isExternalReport(rm->getReport(report_id))) { return false; } } return true; } /** True if the minimization algorithm should be run after merging. */ bool shouldMinimize() const { // We only need to run minimization if our merged DFAs shared a report. flat_set seen_reports; for (const auto &rdfa : nfas) { for (const auto &report_id : all_reports(*rdfa)) { if (!seen_reports.insert(report_id).second) { DEBUG_PRINTF("report %u in several dfas\n", report_id); return true; } } } return false; } private: const ReportManager *rm; const Grey &grey; vector nfas; vector as; vector fs; bool prunable = false; public: std::array alpha; std::array unalpha; u16 alphasize; StateSet dead; u16 start_anchored; u16 start_floating; }; } // namespace unique_ptr mergeTwoDfas(const raw_dfa *d1, const raw_dfa *d2, size_t max_states, const ReportManager *rm, const Grey &grey) { assert(d1 && d2); assert(d1->kind == d2->kind); assert(max_states <= MAX_DFA_STATES); auto rdfa = std::make_unique(d1->kind); Automaton_Merge autom(d1, d2, rm, grey); if (determinise(autom, rdfa->states, max_states)) { rdfa->start_anchored = autom.start_anchored; rdfa->start_floating = autom.start_floating; rdfa->alpha_size = autom.alphasize; rdfa->alpha_remap = autom.alpha; DEBUG_PRINTF("merge succeeded, %zu states\n", rdfa->states.size()); if (autom.shouldMinimize()) { minimize_hopcroft(*rdfa, grey); DEBUG_PRINTF("minimized, %zu states\n", rdfa->states.size()); } return rdfa; } return nullptr; } void mergeDfas(vector> &dfas, size_t max_states, const ReportManager *rm, const Grey &grey) { assert(max_states <= MAX_DFA_STATES); if (dfas.size() <= 1) { return; } DEBUG_PRINTF("before merging, we have %zu dfas\n", dfas.size()); queue> q; for (auto &dfa : dfas) { q.push(move(dfa)); } // All DFAs are now on the queue, so we'll clear the vector and use it for // output from here. dfas.clear(); while (q.size() > 1) { // Attempt to merge the two front elements of the queue. unique_ptr d1 = move(q.front()); q.pop(); unique_ptr d2 = move(q.front()); q.pop(); auto rdfa = mergeTwoDfas(d1.get(), d2.get(), max_states, rm, grey); if (rdfa) { q.push(move(rdfa)); } else { DEBUG_PRINTF("failed to merge\n"); // Put the larger of the two DFAs on the output list, retain the // smaller one on the queue for further merge attempts. if (d2->states.size() > d1->states.size()) { dfas.emplace_back(move(d2)); q.push(move(d1)); } else { dfas.emplace_back(move(d1)); q.push(move(d2)); } } } while (!q.empty()) { dfas.emplace_back(move(q.front())); q.pop(); } DEBUG_PRINTF("after merging, we have %zu dfas\n", dfas.size()); } unique_ptr mergeAllDfas(const vector &dfas, size_t max_states, const ReportManager *rm, const Grey &grey) { assert(max_states <= MAX_DFA_STATES); assert(!dfas.empty()); // All the DFAs should be of the same kind. const auto kind = dfas.front()->kind; assert(all_of(begin(dfas), end(dfas), [&kind](const raw_dfa *rdfa) { return rdfa->kind == kind; })); auto rdfa = std::make_unique(kind); Automaton_Merge n(dfas, rm, grey); DEBUG_PRINTF("merging dfa\n"); if (!determinise(n, rdfa->states, max_states)) { DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states); return nullptr; /* over state limit */ } rdfa->start_anchored = n.start_anchored; rdfa->start_floating = n.start_floating; rdfa->alpha_size = n.alphasize; rdfa->alpha_remap = n.alpha; DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n", rdfa->start_anchored, rdfa->start_floating); if (n.shouldMinimize()) { minimize_hopcroft(*rdfa, grey); DEBUG_PRINTF("minimized, %zu states\n", rdfa->states.size()); } return rdfa; } } // namespace ue2