vectorscan/src/rose/rose_build_anchored.cpp

923 lines
27 KiB
C++

/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "rose_build_anchored.h"
#include "grey.h"
#include "rose_build_impl.h"
#include "rose_build_matchers.h"
#include "rose_internal.h"
#include "ue2common.h"
#include "nfa/dfa_min.h"
#include "nfa/mcclellancompile.h"
#include "nfa/mcclellancompile_util.h"
#include "nfa/nfa_build_util.h"
#include "nfa/rdfa_merge.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_util.h"
#include "nfagraph/ng_mcclellan_internal.h"
#include "util/alloc.h"
#include "util/bitfield.h"
#include "util/charreach.h"
#include "util/compile_context.h"
#include "util/compile_error.h"
#include "util/container.h"
#include "util/determinise.h"
#include "util/flat_containers.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include "util/ue2string.h"
#include "util/unordered.h"
#include "util/verify_types.h"
#include <map>
#include <queue>
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
#define ANCHORED_NFA_STATE_LIMIT 512
#define MAX_DFA_STATES 16000
#define DFA_PAIR_MERGE_THRESHOLD 5000
#define MAX_SMALL_START_REACH 4
#define INIT_STATE (DEAD_STATE + 1)
#define NO_FRAG_ID (~0U)
// Adds a vertex with the given reach.
static
NFAVertex add_vertex(NGHolder &h, const CharReach &cr) {
NFAVertex v = add_vertex(h);
h[v].char_reach = cr;
return v;
}
static
void add_edges(const set<NFAVertex> &parents, NFAVertex v, NGHolder &h) {
for (auto p : parents) {
add_edge(p, v, h);
}
}
static
set<NFAVertex> addDotsToGraph(NGHolder &h, NFAVertex start, u32 min, u32 max,
const CharReach &cr) {
DEBUG_PRINTF("adding [%u, %u] to graph\n", min, max);
u32 i = 0;
set<NFAVertex> curr;
curr.insert(start);
for (; i < min; i++) {
NFAVertex next = add_vertex(h, cr);
add_edges(curr, next, h);
curr.clear();
curr.insert(next);
}
assert(max != ROSE_BOUND_INF);
set<NFAVertex> orig = curr;
for (; i < max; i++) {
NFAVertex next = add_vertex(h, cr);
add_edges(curr, next, h);
curr.clear();
curr.insert(next);
curr.insert(orig.begin(), orig.end());
}
return curr;
}
static
NFAVertex addToGraph(NGHolder &h, const set<NFAVertex> &curr,
const ue2_literal &s) {
DEBUG_PRINTF("adding %s to graph\n", dumpString(s).c_str());
assert(!s.empty());
ue2_literal::const_iterator it = s.begin();
NFAVertex u = add_vertex(h, *it);
add_edges(curr, u, h);
for (++it; it != s.end(); ++it) {
NFAVertex next = add_vertex(h, *it);
add_edge(u, next, h);
u = next;
}
return u;
}
static
void mergeAnchoredDfas(vector<unique_ptr<raw_dfa>> &dfas,
const RoseBuildImpl &build) {
// First, group our DFAs into "small start" and "big start" sets.
vector<unique_ptr<raw_dfa>> small_starts, big_starts;
for (auto &rdfa : dfas) {
u32 start_size = mcclellanStartReachSize(rdfa.get());
if (start_size <= MAX_SMALL_START_REACH) {
small_starts.emplace_back(std::move(rdfa));
} else {
big_starts.emplace_back(std::move(rdfa));
}
}
dfas.clear();
DEBUG_PRINTF("%zu dfas with small starts, %zu dfas with big starts\n",
small_starts.size(), big_starts.size());
mergeDfas(small_starts, MAX_DFA_STATES, nullptr, build.cc.grey);
mergeDfas(big_starts, MAX_DFA_STATES, nullptr, build.cc.grey);
// Rehome our groups into one vector.
for (auto &rdfa : small_starts) {
dfas.emplace_back(std::move(rdfa));
}
for (auto &rdfa : big_starts) {
dfas.emplace_back(std::move(rdfa));
}
// Final test: if we've built two DFAs here that are small enough, we can
// try to merge them.
if (dfas.size() == 2) {
size_t total_states = dfas[0]->states.size() + dfas[1]->states.size();
if (total_states < DFA_PAIR_MERGE_THRESHOLD) {
DEBUG_PRINTF("doing small pair merge\n");
mergeDfas(dfas, MAX_DFA_STATES, nullptr, build.cc.grey);
}
}
}
static
void remapAnchoredReports(raw_dfa &rdfa, const vector<u32> &frag_map) {
for (dstate &ds : rdfa.states) {
assert(ds.reports_eod.empty()); // Not used in anchored matcher.
if (ds.reports.empty()) {
continue;
}
flat_set<ReportID> new_reports;
for (auto id : ds.reports) {
assert(id < frag_map.size());
new_reports.insert(frag_map[id]);
}
ds.reports = std::move(new_reports);
}
}
/**
* \brief Replaces the report ids currently in the dfas (rose graph literal
* ids) with the fragment id for each literal.
*/
static
void remapAnchoredReports(RoseBuildImpl &build, const vector<u32> &frag_map) {
for (auto &m : build.anchored_nfas) {
for (auto &rdfa : m.second) {
assert(rdfa);
remapAnchoredReports(*rdfa, frag_map);
}
}
}
/**
* Returns mapping from literal ids to fragment ids.
*/
static
vector<u32> reverseFragMap(const RoseBuildImpl &build,
const vector<LitFragment> &fragments) {
vector<u32> rev(build.literal_info.size(), NO_FRAG_ID);
for (const auto &f : fragments) {
for (u32 lit_id : f.lit_ids) {
assert(lit_id < rev.size());
rev[lit_id] = f.fragment_id;
}
}
return rev;
}
/**
* \brief Replace the reports (which are literal final_ids) in the given
* raw_dfa with program offsets.
*/
static
void remapIdsToPrograms(const vector<LitFragment> &fragments, raw_dfa &rdfa) {
for (dstate &ds : rdfa.states) {
assert(ds.reports_eod.empty()); // Not used in anchored matcher.
if (ds.reports.empty()) {
continue;
}
flat_set<ReportID> new_reports;
for (auto fragment_id : ds.reports) {
const auto &frag = fragments.at(fragment_id);
new_reports.insert(frag.lit_program_offset);
}
ds.reports = std::move(new_reports);
}
}
static
unique_ptr<NGHolder> populate_holder(const simple_anchored_info &sai,
const flat_set<u32> &exit_ids) {
DEBUG_PRINTF("populating holder for ^.{%u,%u}%s\n", sai.min_bound,
sai.max_bound, dumpString(sai.literal).c_str());
auto h_ptr = make_unique<NGHolder>();
NGHolder &h = *h_ptr;
auto ends = addDotsToGraph(h, h.start, sai.min_bound, sai.max_bound,
CharReach::dot());
NFAVertex v = addToGraph(h, ends, sai.literal);
add_edge(v, h.accept, h);
h[v].reports.insert(exit_ids.begin(), exit_ids.end());
return h_ptr;
}
u32 anchoredStateSize(const anchored_matcher_info &atable) {
const struct anchored_matcher_info *curr = &atable;
// Walk the list until we find the last element; total state size will be
// that engine's state offset plus its state requirement.
while (curr->next_offset) {
curr = (const anchored_matcher_info *)
((const char *)curr + curr->next_offset);
}
const NFA *nfa = (const NFA *)((const char *)curr + sizeof(*curr));
return curr->state_offset + nfa->streamStateSize;
}
namespace {
using nfa_state_set = bitfield<ANCHORED_NFA_STATE_LIMIT>;
struct Holder_StateSet {
Holder_StateSet() : wdelay(0) {}
nfa_state_set wrap_state;
u32 wdelay;
bool operator==(const Holder_StateSet &b) const {
return wdelay == b.wdelay && wrap_state == b.wrap_state;
}
size_t hash() const {
return hash_all(wrap_state, wdelay);
}
};
class Automaton_Holder {
public:
using StateSet = Holder_StateSet;
using StateMap = ue2_unordered_map<StateSet, dstate_id_t>;
explicit Automaton_Holder(const NGHolder &g_in) : g(g_in) {
for (auto v : vertices_range(g)) {
vertexToIndex[v] = indexToVertex.size();
indexToVertex.emplace_back(v);
}
assert(indexToVertex.size() <= ANCHORED_NFA_STATE_LIMIT);
DEBUG_PRINTF("%zu states\n", indexToVertex.size());
init.wdelay = 0;
init.wrap_state.set(vertexToIndex[g.start]);
DEBUG_PRINTF("init wdelay %u\n", init.wdelay);
calculateAlphabet();
cr_by_index = populateCR(g, indexToVertex, alpha);
}
private:
void calculateAlphabet() {
vector<CharReach> esets(1, CharReach::dot());
for (auto v : indexToVertex) {
const CharReach &cr = g[v].char_reach;
for (size_t i = 0; i < esets.size(); i++) {
if (esets[i].count() == 1) {
continue;
}
CharReach t = cr & esets[i];
if (t.any() && t != esets[i]) {
esets[i] &= ~t;
esets.emplace_back(t);
}
}
}
alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha);
}
public:
void transition(const StateSet &in, StateSet *next) {
/* track the dfa state, reset nfa states */
u32 wdelay = in.wdelay ? in.wdelay - 1 : 0;
for (symbol_t s = 0; s < alphasize; s++) {
next[s].wrap_state.reset();
next[s].wdelay = wdelay;
}
nfa_state_set gsucc;
if (wdelay != in.wdelay) {
DEBUG_PRINTF("enabling start\n");
gsucc.set(vertexToIndex[g.startDs]);
}
for (size_t i = in.wrap_state.find_first(); i != nfa_state_set::npos;
i = in.wrap_state.find_next(i)) {
NFAVertex v = indexToVertex[i];
for (auto w : adjacent_vertices_range(v, g)) {
if (!contains(vertexToIndex, w)
|| w == g.accept || w == g.acceptEod) {
continue;
}
if (w == g.startDs) {
continue;
}
gsucc.set(vertexToIndex[w]);
}
}
for (size_t j = gsucc.find_first(); j != nfa_state_set::npos;
j = gsucc.find_next(j)) {
const CharReach &cr = cr_by_index[j];
for (size_t s = cr.find_first(); s != CharReach::npos;
s = cr.find_next(s)) {
next[s].wrap_state.set(j); /* pre alpha'ed */
}
}
next[alpha[TOP]] = in;
}
const vector<StateSet> initial() {
return {init};
}
void reports(const StateSet &in, flat_set<ReportID> &rv) {
rv.clear();
for (size_t i = in.wrap_state.find_first(); i != nfa_state_set::npos;
i = in.wrap_state.find_next(i)) {
NFAVertex v = indexToVertex[i];
if (edge(v, g.accept, g).second) {
assert(!g[v].reports.empty());
insert(&rv, g[v].reports);
} else {
assert(g[v].reports.empty());
}
}
}
void reportsEod(const StateSet &, flat_set<ReportID> &r) {
r.clear();
}
static bool canPrune(const flat_set<ReportID> &) {
/* used by ng_ to prune states after highlander accepts */
return false;
}
private:
const NGHolder &g;
unordered_map<NFAVertex, u32> vertexToIndex;
vector<NFAVertex> indexToVertex;
vector<CharReach> cr_by_index;
StateSet init;
public:
StateSet dead;
array<u16, ALPHABET_SIZE> alpha;
array<u16, ALPHABET_SIZE> unalpha;
u16 alphasize;
};
} // namespace
static
bool check_dupe(const raw_dfa &rdfa,
const vector<unique_ptr<raw_dfa>> &existing, ReportID *remap) {
if (!remap) {
DEBUG_PRINTF("no remap\n");
return false;
}
set<ReportID> rdfa_reports;
for (const auto &ds : rdfa.states) {
rdfa_reports.insert(ds.reports.begin(), ds.reports.end());
}
if (rdfa_reports.size() != 1) {
return false; /* too complicated for now would need mapping TODO */
}
for (const auto &e_rdfa : existing) {
assert(e_rdfa);
const raw_dfa &b = *e_rdfa;
if (rdfa.start_anchored != b.start_anchored ||
rdfa.alpha_size != b.alpha_size ||
rdfa.states.size() != b.states.size() ||
rdfa.alpha_remap != b.alpha_remap) {
continue;
}
set<ReportID> b_reports;
for (u32 i = 0; i < b.states.size(); i++) {
assert(b.states[i].reports_eod.empty());
assert(rdfa.states[i].reports_eod.empty());
if (rdfa.states[i].reports.size() != b.states[i].reports.size()) {
goto next_dfa;
}
b_reports.insert(b.states[i].reports.begin(),
b.states[i].reports.end());
assert(rdfa.states[i].next.size() == b.states[i].next.size());
if (!equal(rdfa.states[i].next.begin(), rdfa.states[i].next.end(),
b.states[i].next.begin())) {
goto next_dfa;
}
}
if (b_reports.size() != 1) {
continue;
}
*remap = *b_reports.begin();
DEBUG_PRINTF("dupe found remapping to %u\n", *remap);
return true;
next_dfa:;
}
return false;
}
static
bool check_dupe_simple(const RoseBuildImpl &build, u32 min_bound, u32 max_bound,
const ue2_literal &lit, ReportID *remap) {
if (!remap) {
DEBUG_PRINTF("no remap\n");
return false;
}
simple_anchored_info sai(min_bound, max_bound, lit);
if (contains(build.anchored_simple, sai)) {
*remap = *build.anchored_simple.at(sai).begin();
return true;
}
return false;
}
static
NFAVertex extractLiteral(const NGHolder &h, ue2_literal *lit) {
vector<NFAVertex> lit_verts;
NFAVertex v = h.accept;
while ((v = getSoleSourceVertex(h, v))) {
const CharReach &cr = h[v].char_reach;
if (cr.count() > 1 && !cr.isCaselessChar()) {
break;
}
lit_verts.emplace_back(v);
}
if (lit_verts.empty()) {
return NGHolder::null_vertex();
}
bool nocase = false;
bool case_set = false;
for (auto it = lit_verts.rbegin(), ite = lit_verts.rend(); it != ite;
++it) {
const CharReach &cr = h[*it].char_reach;
if (cr.isAlpha()) {
bool cr_nocase = cr.count() != 1;
if (case_set && cr_nocase != nocase) {
return NGHolder::null_vertex();
}
case_set = true;
nocase = cr_nocase;
lit->push_back(cr.find_first(), nocase);
} else {
lit->push_back(cr.find_first(), false);
}
}
return lit_verts.back();
}
static
bool isSimple(const NGHolder &h, u32 *min_bound, u32 *max_bound,
ue2_literal *lit, u32 *report) {
assert(!proper_out_degree(h.startDs, h));
assert(in_degree(h.acceptEod, h) == 1);
DEBUG_PRINTF("looking for simple case\n");
NFAVertex lit_head = extractLiteral(h, lit);
if (lit_head == NGHolder::null_vertex()) {
DEBUG_PRINTF("no literal found\n");
return false;
}
const auto &reps = h[*inv_adjacent_vertices(h.accept, h).first].reports;
if (reps.size() != 1) {
return false;
}
*report = *reps.begin();
assert(!lit->empty());
set<NFAVertex> rep_exits;
/* lit should only be connected to dot vertices */
for (auto u : inv_adjacent_vertices_range(lit_head, h)) {
DEBUG_PRINTF("checking %zu\n", h[u].index);
if (!h[u].char_reach.all()) {
return false;
}
if (u != h.start) {
rep_exits.insert(u);
}
}
if (rep_exits.empty()) {
DEBUG_PRINTF("direct anchored\n");
assert(edge(h.start, lit_head, h).second);
*min_bound = 0;
*max_bound = 0;
return true;
}
NFAVertex key = *rep_exits.begin();
// Special-case the check for '^.foo' or '^.?foo'.
if (rep_exits.size() == 1 && edge(h.start, key, h).second &&
out_degree(key, h) == 1) {
DEBUG_PRINTF("one exit\n");
assert(edge(h.start, h.startDs, h).second);
size_t num_enters = out_degree(h.start, h);
if (num_enters == 2) {
DEBUG_PRINTF("^.{1,1} prefix\n");
*min_bound = 1;
*max_bound = 1;
return true;
}
if (num_enters == 3 && edge(h.start, lit_head, h).second) {
DEBUG_PRINTF("^.{0,1} prefix\n");
*min_bound = 0;
*max_bound = 1;
return true;
}
}
vector<GraphRepeatInfo> repeats;
findRepeats(h, 2, &repeats);
vector<GraphRepeatInfo>::const_iterator it;
for (it = repeats.begin(); it != repeats.end(); ++it) {
DEBUG_PRINTF("checking.. %zu verts\n", it->vertices.size());
if (find(it->vertices.begin(), it->vertices.end(), key)
!= it->vertices.end()) {
break;
}
}
if (it == repeats.end()) {
DEBUG_PRINTF("no repeat found\n");
return false;
}
set<NFAVertex> rep_verts;
insert(&rep_verts, it->vertices);
if (!is_subset_of(rep_exits, rep_verts)) {
DEBUG_PRINTF("bad exit check\n");
return false;
}
set<NFAVertex> rep_enters;
insert(&rep_enters, adjacent_vertices(h.start, h));
rep_enters.erase(lit_head);
rep_enters.erase(h.startDs);
if (!is_subset_of(rep_enters, rep_verts)) {
DEBUG_PRINTF("bad entry check\n");
return false;
}
u32 min_b = it->repeatMin;
if (edge(h.start, lit_head, h).second) { /* jump edge */
if (min_b != 1) {
DEBUG_PRINTF("jump edge around repeat with min bound\n");
return false;
}
min_b = 0;
}
*min_bound = min_b;
*max_bound = it->repeatMax;
DEBUG_PRINTF("repeat %u %u before %s\n", *min_bound, *max_bound,
dumpString(*lit).c_str());
return true;
}
static
int finalise_out(RoseBuildImpl &build, const NGHolder &h,
const Automaton_Holder &autom, unique_ptr<raw_dfa> out_dfa,
ReportID *remap) {
u32 min_bound = ~0U;
u32 max_bound = ~0U;
ue2_literal lit;
u32 simple_report = MO_INVALID_IDX;
if (isSimple(h, &min_bound, &max_bound, &lit, &simple_report)) {
assert(simple_report != MO_INVALID_IDX);
if (check_dupe_simple(build, min_bound, max_bound, lit, remap)) {
DEBUG_PRINTF("found duplicate remapping to %u\n", *remap);
return ANCHORED_REMAP;
}
DEBUG_PRINTF("add with report %u\n", simple_report);
build.anchored_simple[simple_anchored_info(min_bound, max_bound, lit)]
.insert(simple_report);
return ANCHORED_SUCCESS;
}
out_dfa->start_anchored = INIT_STATE;
out_dfa->start_floating = DEAD_STATE;
out_dfa->alpha_size = autom.alphasize;
out_dfa->alpha_remap = autom.alpha;
auto hash = hash_dfa_no_reports(*out_dfa);
if (check_dupe(*out_dfa, build.anchored_nfas[hash], remap)) {
return ANCHORED_REMAP;
}
build.anchored_nfas[hash].emplace_back(std::move(out_dfa));
return ANCHORED_SUCCESS;
}
static
int addAutomaton(RoseBuildImpl &build, const NGHolder &h, ReportID *remap) {
if (num_vertices(h) > ANCHORED_NFA_STATE_LIMIT) {
DEBUG_PRINTF("autom bad!\n");
return ANCHORED_FAIL;
}
Automaton_Holder autom(h);
auto out_dfa = std::make_unique<raw_dfa>(NFA_OUTFIX_RAW);
if (determinise(autom, out_dfa->states, MAX_DFA_STATES)) {
return finalise_out(build, h, autom, std::move(out_dfa), remap);
}
DEBUG_PRINTF("determinise failed\n");
return ANCHORED_FAIL;
}
static
void setReports(NGHolder &h, const map<NFAVertex, set<u32>> &reportMap,
const unordered_map<NFAVertex, NFAVertex> &orig_to_copy) {
for (const auto &m : reportMap) {
NFAVertex t = orig_to_copy.at(m.first);
assert(!m.second.empty());
add_edge(t, h.accept, h);
insert(&h[t].reports, m.second);
}
}
int addAnchoredNFA(RoseBuildImpl &build, const NGHolder &wrapper,
const map<NFAVertex, set<u32>> &reportMap) {
NGHolder h;
unordered_map<NFAVertex, NFAVertex> orig_to_copy;
cloneHolder(h, wrapper, &orig_to_copy);
clear_in_edges(h.accept, h);
clear_in_edges(h.acceptEod, h);
add_edge(h.accept, h.acceptEod, h);
clearReports(h);
setReports(h, reportMap, orig_to_copy);
return addAutomaton(build, h, nullptr);
}
int addToAnchoredMatcher(RoseBuildImpl &build, const NGHolder &anchored,
u32 exit_id, ReportID *remap) {
NGHolder h;
cloneHolder(h, anchored);
clearReports(h);
assert(in_degree(h.acceptEod, h) == 1);
for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
h[v].reports.clear();
h[v].reports.insert(exit_id);
}
return addAutomaton(build, h, remap);
}
static
void buildSimpleDfas(const RoseBuildImpl &build, const vector<u32> &frag_map,
vector<unique_ptr<raw_dfa>> *anchored_dfas) {
/* we should have determinised all of these before so there should be no
* chance of failure. */
flat_set<u32> exit_ids;
for (const auto &simple : build.anchored_simple) {
exit_ids.clear();
for (auto lit_id : simple.second) {
assert(lit_id < frag_map.size());
exit_ids.insert(frag_map[lit_id]);
}
auto h = populate_holder(simple.first, exit_ids);
Automaton_Holder autom(*h);
auto rdfa = std::make_unique<raw_dfa>(NFA_OUTFIX_RAW);
UNUSED bool rv = determinise(autom, rdfa->states, MAX_DFA_STATES);
assert(rv);
rdfa->start_anchored = INIT_STATE;
rdfa->start_floating = DEAD_STATE;
rdfa->alpha_size = autom.alphasize;
rdfa->alpha_remap = autom.alpha;
anchored_dfas->emplace_back(std::move(rdfa));
}
}
/**
* Fill the given vector with all of the raw_dfas we need to compile into the
* anchored matcher. Takes ownership of the input structures, clearing them
* from RoseBuildImpl.
*/
static
vector<unique_ptr<raw_dfa>> getAnchoredDfas(RoseBuildImpl &build,
const vector<u32> &frag_map) {
vector<unique_ptr<raw_dfa>> dfas;
// DFAs that already exist as raw_dfas.
for (auto &anch_dfas : build.anchored_nfas) {
for (auto &rdfa : anch_dfas.second) {
dfas.emplace_back(std::move(rdfa));
}
}
build.anchored_nfas.clear();
// DFAs we currently have as simple literals.
if (!build.anchored_simple.empty()) {
buildSimpleDfas(build, frag_map, &dfas);
build.anchored_simple.clear();
}
return dfas;
}
/**
* \brief Builds our anchored DFAs into runtime NFAs.
*
* Constructs a vector of NFA structures and a vector of their start offsets
* (number of dots removed from the prefix) from the raw_dfa structures given.
*
* Note: frees the raw_dfa structures on completion.
*
* \return Total bytes required for the complete anchored matcher.
*/
static
size_t buildNfas(vector<raw_dfa> &anchored_dfas,
vector<bytecode_ptr<NFA>> *nfas,
vector<u32> *start_offset, const CompileContext &cc,
const ReportManager &rm) {
const size_t num_dfas = anchored_dfas.size();
nfas->reserve(num_dfas);
start_offset->reserve(num_dfas);
size_t total_size = 0;
for (auto &rdfa : anchored_dfas) {
u32 removed_dots = remove_leading_dots(rdfa);
start_offset->emplace_back(removed_dots);
minimize_hopcroft(rdfa, cc.grey);
auto nfa = mcclellanCompile(rdfa, cc, rm, false);
if (!nfa) {
assert(0);
throw std::bad_alloc();
}
assert(nfa->length);
total_size += ROUNDUP_CL(sizeof(anchored_matcher_info) + nfa->length);
nfas->emplace_back(std::move(nfa));
}
// We no longer need to keep the raw_dfa structures around.
anchored_dfas.clear();
return total_size;
}
vector<raw_dfa> buildAnchoredDfas(RoseBuildImpl &build,
const vector<LitFragment> &fragments) {
vector<raw_dfa> dfas;
if (build.anchored_nfas.empty() && build.anchored_simple.empty()) {
DEBUG_PRINTF("empty\n");
return dfas;
}
const auto frag_map = reverseFragMap(build, fragments);
remapAnchoredReports(build, frag_map);
auto anch_dfas = getAnchoredDfas(build, frag_map);
mergeAnchoredDfas(anch_dfas, build);
dfas.reserve(anch_dfas.size());
for (auto &rdfa : anch_dfas) {
assert(rdfa);
dfas.emplace_back(std::move(*rdfa));
}
return dfas;
}
bytecode_ptr<anchored_matcher_info>
buildAnchoredMatcher(RoseBuildImpl &build, const vector<LitFragment> &fragments,
vector<raw_dfa> &dfas) {
const CompileContext &cc = build.cc;
if (dfas.empty()) {
DEBUG_PRINTF("empty\n");
return bytecode_ptr<anchored_matcher_info>(nullptr);
}
for (auto &rdfa : dfas) {
remapIdsToPrograms(fragments, rdfa);
}
vector<bytecode_ptr<NFA>> nfas;
vector<u32> start_offset; // start offset for each dfa (dots removed)
size_t total_size = buildNfas(dfas, &nfas, &start_offset, cc, build.rm);
if (total_size > cc.grey.limitRoseAnchoredSize) {
throw ResourceLimitError();
}
auto atable =
make_zeroed_bytecode_ptr<anchored_matcher_info>(total_size, 64);
char *curr = (char *)atable.get();
u32 state_offset = 0;
for (size_t i = 0; i < nfas.size(); i++) {
const NFA *nfa = nfas[i].get();
anchored_matcher_info *ami = (anchored_matcher_info *)curr;
const char *prev_curr = curr;
curr += sizeof(anchored_matcher_info);
memcpy(curr, nfa, nfa->length);
curr += nfa->length;
curr = ROUNDUP_PTR(curr, 64);
if (i + 1 == nfas.size()) {
ami->next_offset = 0U;
} else {
ami->next_offset = verify_u32(curr - prev_curr);
}
ami->state_offset = state_offset;
state_offset += nfa->streamStateSize;
ami->anchoredMinDistance = start_offset[i];
}
DEBUG_PRINTF("success %zu\n", atable.size());
return atable;
}
} // namespace ue2