determinise: use queue, improve api

- Use a queue rather than always building the full vector of state
  sets.
- Make more use of move, emplace, reserve.
- Write directly into dstates argument.
- Return bool rather than int (true on success, false if the state
  limit is exceeded).
This commit is contained in:
Justin Viiret 2017-05-11 17:07:26 +10:00 committed by Matthew Barr
parent 64db576b9e
commit 31141dd35b
5 changed files with 48 additions and 44 deletions

View File

@ -289,7 +289,7 @@ unique_ptr<raw_dfa> mergeTwoDfas(const raw_dfa *d1, const raw_dfa *d2,
auto rdfa = ue2::make_unique<raw_dfa>(d1->kind); auto rdfa = ue2::make_unique<raw_dfa>(d1->kind);
Automaton_Merge autom(d1, d2, rm, grey); Automaton_Merge autom(d1, d2, rm, grey);
if (!determinise(autom, rdfa->states, max_states)) { if (determinise(autom, rdfa->states, max_states)) {
rdfa->start_anchored = autom.start_anchored; rdfa->start_anchored = autom.start_anchored;
rdfa->start_floating = autom.start_floating; rdfa->start_floating = autom.start_floating;
rdfa->alpha_size = autom.alphasize; rdfa->alpha_size = autom.alphasize;
@ -374,7 +374,7 @@ unique_ptr<raw_dfa> mergeAllDfas(const vector<const raw_dfa *> &dfas,
DEBUG_PRINTF("merging dfa\n"); DEBUG_PRINTF("merging dfa\n");
if (determinise(n, rdfa->states, max_states)) { if (!determinise(n, rdfa->states, max_states)) {
DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states); DEBUG_PRINTF("state limit (%zu) exceeded\n", max_states);
return nullptr; /* over state limit */ return nullptr; /* over state limit */
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015-2016, Intel Corporation * Copyright (c) 2015-2017, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -518,7 +518,7 @@ bool doHaig(const NGHolder &g, som_type som,
vector<StateSet> nfa_state_map; vector<StateSet> nfa_state_map;
Auto n(g, som, triggers, unordered_som); Auto n(g, som, triggers, unordered_som);
try { try {
if (determinise(n, rdfa->states, state_limit, &nfa_state_map)) { if (!determinise(n, rdfa->states, state_limit, &nfa_state_map)) {
DEBUG_PRINTF("state limit exceeded\n"); DEBUG_PRINTF("state limit exceeded\n");
return false; return false;
} }
@ -726,9 +726,8 @@ unique_ptr<raw_som_dfa> attemptToMergeHaig(const vector<const raw_som_dfa *> &df
NODE_START, NODE_START,
dfas[0]->stream_som_loc_width); dfas[0]->stream_som_loc_width);
int rv = determinise(n, rdfa->states, limit, &nfa_state_map); if (!determinise(n, rdfa->states, limit, &nfa_state_map)) {
if (rv) { DEBUG_PRINTF("state limit (%u) exceeded\n", limit);
DEBUG_PRINTF("%d:state limit (%u) exceeded\n", rv, limit);
return nullptr; /* over state limit */ return nullptr; /* over state limit */
} }

View File

@ -433,6 +433,7 @@ public:
} }
return allExternalReports(*rm, test_reports); return allExternalReports(*rm, test_reports);
} }
private: private:
const ReportManager *rm; const ReportManager *rm;
public: public:
@ -568,7 +569,7 @@ unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
/* Fast path. Automaton_Graph uses a bitfield internally to represent /* Fast path. Automaton_Graph uses a bitfield internally to represent
* states and is quicker than Automaton_Big. */ * states and is quicker than Automaton_Big. */
Automaton_Graph n(rm, graph, single_trigger, triggers, prunable); Automaton_Graph n(rm, graph, single_trigger, triggers, prunable);
if (determinise(n, rdfa->states, state_limit)) { if (!determinise(n, rdfa->states, state_limit)) {
DEBUG_PRINTF("state limit exceeded\n"); DEBUG_PRINTF("state limit exceeded\n");
return nullptr; /* over state limit */ return nullptr; /* over state limit */
} }
@ -580,7 +581,7 @@ unique_ptr<raw_dfa> buildMcClellan(const NGHolder &graph,
} else { } else {
/* Slow path. Too many states to use Automaton_Graph. */ /* Slow path. Too many states to use Automaton_Graph. */
Automaton_Big n(rm, graph, single_trigger, triggers, prunable); Automaton_Big n(rm, graph, single_trigger, triggers, prunable);
if (determinise(n, rdfa->states, state_limit)) { if (!determinise(n, rdfa->states, state_limit)) {
DEBUG_PRINTF("state limit exceeded\n"); DEBUG_PRINTF("state limit exceeded\n");
return nullptr; /* over state limit */ return nullptr; /* over state limit */
} }

View File

@ -701,8 +701,8 @@ int addAutomaton(RoseBuildImpl &build, const NGHolder &h, ReportID *remap) {
Automaton_Holder autom(h); Automaton_Holder autom(h);
unique_ptr<raw_dfa> out_dfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX_RAW); auto out_dfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX_RAW);
if (!determinise(autom, out_dfa->states, MAX_DFA_STATES)) { if (determinise(autom, out_dfa->states, MAX_DFA_STATES)) {
return finalise_out(build, h, autom, move(out_dfa), remap); return finalise_out(build, h, autom, move(out_dfa), remap);
} }
@ -764,8 +764,8 @@ void buildSimpleDfas(const RoseBuildImpl &build, const vector<u32> &frag_map,
auto h = populate_holder(simple.first, exit_ids); auto h = populate_holder(simple.first, exit_ids);
Automaton_Holder autom(*h); Automaton_Holder autom(*h);
auto rdfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX_RAW); auto rdfa = ue2::make_unique<raw_dfa>(NFA_OUTFIX_RAW);
UNUSED int rv = determinise(autom, rdfa->states, MAX_DFA_STATES); UNUSED bool rv = determinise(autom, rdfa->states, MAX_DFA_STATES);
assert(!rv); assert(rv);
rdfa->start_anchored = INIT_STATE; rdfa->start_anchored = INIT_STATE;
rdfa->start_floating = DEAD_STATE; rdfa->start_floating = DEAD_STATE;
rdfa->alpha_size = autom.alphasize; rdfa->alpha_size = autom.alphasize;

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015, Intel Corporation * Copyright (c) 2015-2017, Intel Corporation
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met: * modification, are permitted provided that the following conditions are met:
@ -38,14 +38,13 @@
#include "container.h" #include "container.h"
#include "ue2common.h" #include "ue2common.h"
#include <array>
#include <algorithm> #include <algorithm>
#include <array>
#include <queue>
#include <vector> #include <vector>
namespace ue2 { namespace ue2 {
#define DETERMINISE_RESERVE_SIZE 10
/* Automaton details: /* Automaton details:
* *
* const vector<StateSet> initial() * const vector<StateSet> initial()
@ -73,42 +72,44 @@ namespace ue2 {
* \param state_limit limit on the number of dfa states to construct * \param state_limit limit on the number of dfa states to construct
* \param statesets_out a mapping from DFA state to the set of NFA states in * \param statesets_out a mapping from DFA state to the set of NFA states in
* the automaton * the automaton
* \return zero on success * \return true on success, false if state limit exceeded
*/ */
template<class Auto, class ds> template<class Auto, class ds>
never_inline never_inline
int determinise(Auto &n, std::vector<ds> &dstates_out, dstate_id_t state_limit, bool determinise(Auto &n, std::vector<ds> &dstates, size_t state_limit,
std::vector<typename Auto::StateSet> *statesets_out = nullptr) { std::vector<typename Auto::StateSet> *statesets_out = nullptr) {
DEBUG_PRINTF("the determinator\n"); DEBUG_PRINTF("the determinator\n");
typedef typename Auto::StateSet StateSet; typedef typename Auto::StateSet StateSet;
typedef typename Auto::StateMap DstateIdMap; typedef typename Auto::StateMap DstateIdMap;
DstateIdMap dstate_ids; DstateIdMap dstate_ids;
std::vector<StateSet> statesets;
const size_t alphabet_size = n.alphasize; const size_t alphabet_size = n.alphasize;
std::vector<ds> dstates; dstates.clear();
dstates.reserve(DETERMINISE_RESERVE_SIZE); dstates.reserve(state_limit);
statesets.reserve(DETERMINISE_RESERVE_SIZE);
dstate_ids[n.dead] = DEAD_STATE; dstate_ids.emplace(n.dead, DEAD_STATE);
dstates.push_back(ds(alphabet_size)); dstates.push_back(ds(alphabet_size));
std::fill_n(dstates[0].next.begin(), alphabet_size, DEAD_STATE); std::fill_n(dstates[0].next.begin(), alphabet_size, DEAD_STATE);
statesets.push_back(n.dead); std::queue<std::pair<StateSet, dstate_id_t>> q;
q.emplace(n.dead, DEAD_STATE);
const std::vector<StateSet> &init = n.initial(); const std::vector<StateSet> &init = n.initial();
for (u32 i = 0; i < init.size(); i++) { for (u32 i = 0; i < init.size(); i++) {
statesets.push_back(init[i]); q.emplace(init[i], dstates.size());
assert(!contains(dstate_ids, init[i])); assert(!contains(dstate_ids, init[i]));
dstate_ids[init[i]] = dstates.size(); dstate_ids.emplace(init[i], dstates.size());
dstates.push_back(ds(alphabet_size)); dstates.push_back(ds(alphabet_size));
} }
std::vector<StateSet> succs(alphabet_size, n.dead); std::vector<StateSet> succs(alphabet_size, n.dead);
for (dstate_id_t curr_id = DEAD_STATE; curr_id < dstates.size();
curr_id++) { while (!q.empty()) {
StateSet &curr = statesets[curr_id]; auto m = std::move(q.front());
q.pop();
StateSet &curr = m.first;
dstate_id_t curr_id = m.second;
DEBUG_PRINTF("curr: %hu\n", curr_id); DEBUG_PRINTF("curr: %hu\n", curr_id);
@ -139,43 +140,46 @@ int determinise(Auto &n, std::vector<ds> &dstates_out, dstate_id_t state_limit,
if (s && succs[s] == succs[s - 1]) { if (s && succs[s] == succs[s - 1]) {
succ_id = dstates[curr_id].next[s - 1]; succ_id = dstates[curr_id].next[s - 1];
} else { } else {
typename DstateIdMap::const_iterator dstate_id_iter; auto p = dstate_ids.emplace(succs[s], dstates.size());
dstate_id_iter = dstate_ids.find(succs[s]); succ_id = p.first->second;
if (!p.second) { /* succs[s] is already present */
if (dstate_id_iter != dstate_ids.end()) {
succ_id = dstate_id_iter->second;
if (succ_id > curr_id && !dstates[succ_id].daddy if (succ_id > curr_id && !dstates[succ_id].daddy
&& n.unalpha[s] < N_CHARS) { && n.unalpha[s] < N_CHARS) {
dstates[succ_id].daddy = curr_id; dstates[succ_id].daddy = curr_id;
} }
} else { } else {
statesets.push_back(succs[s]);
succ_id = dstates.size();
dstate_ids[succs[s]] = succ_id;
dstates.push_back(ds(alphabet_size)); dstates.push_back(ds(alphabet_size));
dstates.back().daddy = n.unalpha[s] < N_CHARS ? curr_id : 0; dstates.back().daddy = n.unalpha[s] < N_CHARS ? curr_id : 0;
q.emplace(succs[s], succ_id);
} }
DEBUG_PRINTF("-->%hu on %02hx\n", succ_id, n.unalpha[s]); DEBUG_PRINTF("-->%hu on %02hx\n", succ_id, n.unalpha[s]);
} }
if (succ_id >= state_limit) { if (succ_id >= state_limit) {
DEBUG_PRINTF("succ_id %hu >= state_limit %hu\n", DEBUG_PRINTF("succ_id %hu >= state_limit %zu\n",
succ_id, state_limit); succ_id, state_limit);
return -2; dstates.clear();
return false;
} }
dstates[curr_id].next[s] = succ_id; dstates[curr_id].next[s] = succ_id;
} }
} }
dstates_out = dstates; // The dstates vector will persist in the raw_dfa.
dstates.shrink_to_fit();
if (statesets_out) { if (statesets_out) {
statesets_out->swap(statesets); auto &statesets = *statesets_out;
statesets.resize(dstate_ids.size());
for (auto &m : dstate_ids) {
statesets[m.second] = std::move(m.first);
} }
}
DEBUG_PRINTF("ok\n"); DEBUG_PRINTF("ok\n");
return 0; return true;
} }
static inline static inline