vectorscan/src/nfa/rdfa_merge.cpp

399 lines
12 KiB
C++

/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "rdfa_merge.h"
#include "grey.h"
#include "dfa_min.h"
#include "mcclellancompile_util.h"
#include "rdfa.h"
#include "ue2common.h"
#include "nfagraph/ng_mcclellan_internal.h"
#include "util/container.h"
#include "util/determinise.h"
#include "util/flat_containers.h"
#include "util/report_manager.h"
#include "util/unordered.h"
#include <algorithm>
#include <queue>
using namespace std;
namespace ue2 {
#define MAX_DFA_STATES 16383
namespace {
/**
 * \brief Automaton description of the product of several raw DFAs.
 *
 * A merged state is a StateSet with one component state per input DFA, stored
 * in the same order as the inputs. Instances are passed to determinise() by
 * the merge functions in this file; the public type aliases, data members and
 * methods are presumably the interface that determinise() relies on, so their
 * names are kept stable.
 */
class Automaton_Merge {
public:
    using StateSet = vector<u16>;
    using StateMap = ue2_unordered_map<StateSet, dstate_id_t>;

    /** Build the product automaton of exactly two DFAs. */
    Automaton_Merge(const raw_dfa *rdfa1, const raw_dfa *rdfa2,
                    const ReportManager *rm_in, const Grey &grey_in)
        : rm(rm_in), grey(grey_in), nfas{rdfa1, rdfa2}, dead(2) {
        calculateAlphabet();
        populateAsFs();
        prunable = isPrunable();
    }

    /** Build the product automaton of an arbitrary list of DFAs. */
    Automaton_Merge(const vector<const raw_dfa *> &dfas,
                    const ReportManager *rm_in, const Grey &grey_in)
        : rm(rm_in), grey(grey_in), nfas(dfas), dead(nfas.size()) {
        calculateAlphabet();
        populateAsFs();
        prunable = isPrunable();
    }

    /**
     * Collect the anchored (as) and floating (fs) start state of every input
     * DFA and derive the start state ids of the merged automaton.
     */
    void populateAsFs(void) {
        bool fs_same = true; // every input has start_anchored == start_floating
        bool fs_dead = true; // every input has a zero start_floating

        as.resize(nfas.size());
        fs.resize(nfas.size());
        for (size_t i = 0, end = nfas.size(); i < end; i++) {
            as[i] = nfas[i]->start_anchored;
            fs[i] = nfas[i]->start_floating;

            if (fs[i]) {
                fs_dead = false;
            }

            if (as[i] != fs[i]) {
                fs_same = false;
            }
        }

        // The anchored start is always the first id after DEAD_STATE; the
        // floating start either shares it, is dead, or takes the next id.
        start_anchored = DEAD_STATE + 1;
        if (fs_same) {
            start_floating = start_anchored;
        } else if (fs_dead) {
            start_floating = DEAD_STATE;
        } else {
            start_floating = start_anchored + 1;
        }
    }

    /**
     * Compute a common alphabet for all input DFAs by repeatedly splitting
     * character sets until every set maps to a single symbol class in each
     * input's alpha_remap, then build alpha/unalpha/alphasize from the result.
     */
    void calculateAlphabet(void) {
        DEBUG_PRINTF("calculating alphabet\n");
        vector<CharReach> esets = {CharReach::dot()};

        for (const auto &rdfa : nfas) {
            DEBUG_PRINTF("...next dfa alphabet\n");
            assert(rdfa);
            const auto &alpha_remap = rdfa->alpha_remap;

            // Index-based loop: esets grows while we iterate, and the sets
            // appended here are themselves examined later in this same pass.
            for (size_t i = 0; i < esets.size(); i++) {
                assert(esets[i].count());
                if (esets[i].count() == 1) {
                    DEBUG_PRINTF("skipping singleton eq set\n");
                    continue;
                }

                CharReach t;
                // Kept at the width of the alphabet tables (u16) so the
                // comparison below never sees a truncated class id.
                const u16 leader_s = alpha_remap[esets[i].find_first()];

                DEBUG_PRINTF("checking eq set, leader %02hx \n", leader_s);

                // Gather every member that this DFA maps differently from
                // the first member of the set.
                for (size_t s = esets[i].find_first(); s != CharReach::npos;
                     s = esets[i].find_next(s)) {
                    if (alpha_remap[s] != leader_s) {
                        t.set(s);
                    }
                }

                if (t.any() && t != esets[i]) {
                    esets[i] &= ~t;
                    esets.emplace_back(t);
                }
            }
        }

        // Sort so that our alphabet mapping isn't dependent on the order of
        // rdfas passed in.
        sort(esets.begin(), esets.end());
        alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
    }

    /**
     * Decide whether pruning may be enabled for this merge: requires the
     * highlanderPruneDFA grey option, a report manager, a callback-generating
     * kind, and that every report of every input is simple-exhaustible with
     * one shared exhaustion key.
     */
    bool isPrunable() const {
        if (!grey.highlanderPruneDFA || !rm) {
            DEBUG_PRINTF("disabled, or not managed reports\n");
            return false;
        }

        assert(!nfas.empty());
        if (!generates_callbacks(nfas.front()->kind)) {
            DEBUG_PRINTF("doesn't generate callbacks\n");
            return false;
        }

        // Collect all reports from all merge candidates.
        flat_set<ReportID> merge_reports;
        for (const auto &rdfa : nfas) {
            insert(&merge_reports, all_reports(*rdfa));
        }

        DEBUG_PRINTF("all reports: %s\n", as_string_list(merge_reports).c_str());

        // Return true if they're all exhaustible with the same exhaustion key.
        u32 ekey = INVALID_EKEY;
        for (const auto &report_id : merge_reports) {
            const Report &r = rm->getReport(report_id);
            if (!isSimpleExhaustible(r)) {
                DEBUG_PRINTF("report %u not simple exhaustible\n", report_id);
                return false;
            }
            assert(r.ekey != INVALID_EKEY);
            if (ekey == INVALID_EKEY) {
                ekey = r.ekey;
            } else if (ekey != r.ekey) {
                DEBUG_PRINTF("two different ekeys, %u and %u\n", ekey, r.ekey);
                return false;
            }
        }

        DEBUG_PRINTF("is prunable\n");
        return true;
    }

    /**
     * Compute the successors of merged state \p in.
     *
     * \param in   one component state per input DFA.
     * \param next array of at least alphasize StateSets; next[i] receives the
     *             successor of \p in on merged symbol class i.
     */
    void transition(const StateSet &in, StateSet *next) {
        u16 t[ALPHABET_SIZE];

        for (u32 i = 0; i < alphasize; i++) {
            next[i].resize(nfas.size());
        }

        for (size_t j = 0, j_end = nfas.size(); j < j_end; j++) {
            // t is presumably filled with DFA j's successors indexed by raw
            // symbol; unalpha[i] selects the entry read for merged class i.
            getFullTransitionFromState(*nfas[j], in[j], t);
            for (u32 i = 0; i < alphasize; i++) {
                next[i][j] = t[unalpha[i]];
            }
        }
    }

    /**
     * Initial merged states: the anchored start, followed by the floating
     * start when it is neither dead nor identical to the anchored one.
     */
    vector<StateSet> initial() const {
        vector<StateSet> rv = {as};
        if (start_floating != DEAD_STATE && start_floating != start_anchored) {
            rv.emplace_back(fs);
        }
        return rv;
    }

private:
    /**
     * Union into \p r the report set selected by \p r_set (a pointer to a
     * dstate member) from each component state of \p in.
     */
    void reports_i(const StateSet &in, flat_set<ReportID> dstate::*r_set,
                   flat_set<ReportID> &r) const {
        for (size_t i = 0, end = nfas.size(); i < end; i++) {
            const auto &rs = nfas[i]->states[in[i]].*r_set;
            insert(&r, rs);
        }
    }

public:
    /** Add to \p rv the dstate::reports of every component of \p in. */
    void reports(const StateSet &in, flat_set<ReportID> &rv) const {
        reports_i(in, &dstate::reports, rv);
    }

    /** Add to \p rv the dstate::reports_eod of every component of \p in. */
    void reportsEod(const StateSet &in, flat_set<ReportID> &rv) const {
        reports_i(in, &dstate::reports_eod, rv);
    }

    /**
     * True if pruning is enabled for this merge and every report in
     * \p test_reports is an external report.
     */
    bool canPrune(const flat_set<ReportID> &test_reports) const {
        if (!grey.highlanderPruneDFA || !prunable) {
            return false;
        }

        // Must all be external reports.
        assert(rm);
        for (const auto &report_id : test_reports) {
            if (!isExternalReport(rm->getReport(report_id))) {
                return false;
            }
        }

        return true;
    }

    /** True if the minimization algorithm should be run after merging. */
    bool shouldMinimize() const {
        // We only need to run minimization if our merged DFAs shared a report.
        flat_set<ReportID> seen_reports;
        for (const auto &rdfa : nfas) {
            for (const auto &report_id : all_reports(*rdfa)) {
                if (!seen_reports.insert(report_id).second) {
                    DEBUG_PRINTF("report %u in several dfas\n", report_id);
                    return true;
                }
            }
        }

        return false;
    }

private:
    const ReportManager *rm;      // may be null; pruning is then disabled
    const Grey &grey;
    vector<const raw_dfa *> nfas; // the DFAs being merged (not owned)
    vector<dstate_id_t> as;       // anchored start state of each input
    vector<dstate_id_t> fs;       // floating start state of each input
    bool prunable = false;        // result of isPrunable(), set in the ctors

public:
    std::array<u16, ALPHABET_SIZE> alpha;   // filled by calculateAlphabet()
    std::array<u16, ALPHABET_SIZE> unalpha; // filled by calculateAlphabet()
    u16 alphasize = 0;                      // set by calculateAlphabet()

    StateSet dead; // one zero-valued component per input DFA

    u16 start_anchored = 0; // set by populateAsFs()
    u16 start_floating = 0; // set by populateAsFs()
};
} // namespace
/**
 * \brief Merge two raw DFAs of the same kind into a single raw DFA.
 *
 * \param d1, d2     the DFAs to merge; both must be non-null.
 * \param max_states state limit handed to determinise(); must not exceed
 *                   MAX_DFA_STATES.
 * \param rm         report manager forwarded to the merge automaton.
 * \param grey       grey-box options forwarded to the merge automaton and to
 *                   minimisation.
 * \return the merged DFA, or nullptr if determinise() did not succeed.
 */
unique_ptr<raw_dfa> mergeTwoDfas(const raw_dfa *d1, const raw_dfa *d2,
                                 size_t max_states, const ReportManager *rm,
                                 const Grey &grey) {
    assert(d1 && d2);
    assert(d1->kind == d2->kind);
    assert(max_states <= MAX_DFA_STATES);

    auto merged = std::make_unique<raw_dfa>(d1->kind);
    Automaton_Merge product(d1, d2, rm, grey);

    // Bail out early when the product automaton could not be determinised.
    if (!determinise(product, merged->states, max_states)) {
        return nullptr;
    }

    // Copy the start states and alphabet computed by the product automaton.
    merged->start_anchored = product.start_anchored;
    merged->start_floating = product.start_floating;
    merged->alpha_size = product.alphasize;
    merged->alpha_remap = product.alpha;
    DEBUG_PRINTF("merge succeeded, %zu states\n", merged->states.size());

    if (product.shouldMinimize()) {
        minimize_hopcroft(*merged, grey);
        DEBUG_PRINTF("minimized, %zu states\n", merged->states.size());
    }

    return merged;
}
/**
 * \brief Greedily merge a collection of raw DFAs in place.
 *
 * The DFAs are moved onto a FIFO queue and the two front entries are merged
 * repeatedly. A successful merge is pushed to the back of the queue. When a
 * merge fails, the DFA with more states is moved to the output and the other
 * one is re-queued for further attempts (on a tie, the first of the pair is
 * output).
 *
 * \param dfas       in: the DFAs to merge; out: the resulting DFAs, with
 *                   those given up on first, followed by whatever remained on
 *                   the queue. Left untouched if it holds at most one DFA.
 * \param max_states state limit for each merge; must not exceed
 *                   MAX_DFA_STATES.
 * \param rm         report manager forwarded to mergeTwoDfas().
 * \param grey       grey-box options forwarded to mergeTwoDfas().
 */
void mergeDfas(vector<unique_ptr<raw_dfa>> &dfas, size_t max_states,
               const ReportManager *rm, const Grey &grey) {
    assert(max_states <= MAX_DFA_STATES);

    if (dfas.size() <= 1) {
        return;
    }

    DEBUG_PRINTF("before merging, we have %zu dfas\n", dfas.size());

    // std::move is spelled out in full throughout: unqualified calls only
    // resolve through ADL / the using-directive and are diagnosed by Clang.
    queue<unique_ptr<raw_dfa>> q;
    for (auto &dfa : dfas) {
        q.push(std::move(dfa));
    }

    // All DFAs are now on the queue, so we'll clear the vector and use it for
    // output from here.
    dfas.clear();

    // Each iteration pops two entries and pushes exactly one back, so the
    // queue shrinks by one per pass.
    while (q.size() > 1) {
        // Attempt to merge the two front elements of the queue.
        unique_ptr<raw_dfa> d1 = std::move(q.front());
        q.pop();
        unique_ptr<raw_dfa> d2 = std::move(q.front());
        q.pop();

        auto rdfa = mergeTwoDfas(d1.get(), d2.get(), max_states, rm, grey);
        if (rdfa) {
            q.push(std::move(rdfa));
        } else {
            DEBUG_PRINTF("failed to merge\n");
            // Put the larger of the two DFAs on the output list, retain the
            // smaller one on the queue for further merge attempts.
            if (d2->states.size() > d1->states.size()) {
                dfas.emplace_back(std::move(d2));
                q.push(std::move(d1));
            } else {
                dfas.emplace_back(std::move(d1));
                q.push(std::move(d2));
            }
        }
    }

    // Flush what is left on the queue to the output.
    while (!q.empty()) {
        dfas.emplace_back(std::move(q.front()));
        q.pop();
    }

    DEBUG_PRINTF("after merging, we have %zu dfas\n", dfas.size());
}
/**
 * \brief Merge every DFA in \p dfas into one raw DFA in a single step.
 *
 * \param dfas       non-empty list of DFAs, all of the same kind.
 * \param max_states state limit handed to determinise(); must not exceed
 *                   MAX_DFA_STATES.
 * \param rm         report manager forwarded to the merge automaton.
 * \param grey       grey-box options forwarded to the merge automaton and to
 *                   minimisation.
 * \return the merged DFA, or nullptr if determinise() did not succeed.
 */
unique_ptr<raw_dfa> mergeAllDfas(const vector<const raw_dfa *> &dfas,
                                 size_t max_states, const ReportManager *rm,
                                 const Grey &grey) {
    assert(max_states <= MAX_DFA_STATES);
    assert(!dfas.empty());

    // The first DFA's kind is the reference: all inputs must match it and the
    // merged DFA is created with it.
    const auto kind = dfas.front()->kind;
    assert(std::all_of(dfas.begin(), dfas.end(),
                       [&kind](const raw_dfa *d) { return d->kind == kind; }));

    auto merged = std::make_unique<raw_dfa>(kind);
    Automaton_Merge product(dfas, rm, grey);

    DEBUG_PRINTF("merging dfa\n");

    if (determinise(product, merged->states, max_states)) {
        // Copy the start states and alphabet computed by the product
        // automaton.
        merged->start_anchored = product.start_anchored;
        merged->start_floating = product.start_floating;
        merged->alpha_size = product.alphasize;
        merged->alpha_remap = product.alpha;

        DEBUG_PRINTF("merged, building impl dfa (a,f) = (%hu,%hu)\n",
                     merged->start_anchored, merged->start_floating);

        if (product.shouldMinimize()) {
            minimize_hopcroft(*merged, grey);
            DEBUG_PRINTF("minimized, %zu states\n", merged->states.size());
        }

        return merged;
    }

    DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states);
    return nullptr; /* over state limit */
}
} // namespace ue2