mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
ng_find_matches: Simplify and improve performance
Improve performance by using bitsets rather than sets of vertex indices.
This commit is contained in:
parent
9ae908fd11
commit
5dd4aa9c13
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2015, Intel Corporation
|
* Copyright (c) 2015-2016, Intel Corporation
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions are met:
|
* modification, are permitted provided that the following conditions are met:
|
||||||
@ -47,34 +47,59 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace ue2;
|
using namespace ue2;
|
||||||
|
|
||||||
// convenience typedefs
|
namespace {
|
||||||
typedef map<NFAVertex,size_t> SOMMap;
|
|
||||||
typedef set<pair<size_t, size_t> > MatchSet;
|
struct StateSet {
|
||||||
|
explicit StateSet(size_t sz) : s(sz), som(sz, 0) {}
|
||||||
|
boost::dynamic_bitset<> s; // bitset of states that are on
|
||||||
|
vector<size_t> som; // som value for each state
|
||||||
|
};
|
||||||
|
|
||||||
|
using MatchSet = set<pair<size_t, size_t>>;
|
||||||
|
|
||||||
struct fmstate {
|
struct fmstate {
|
||||||
SOMMap states;
|
const size_t num_states; // number of vertices in graph
|
||||||
SOMMap next;
|
StateSet states; // currently active states
|
||||||
size_t offset;
|
StateSet next; // states on after this iteration
|
||||||
unsigned char cur;
|
vector<NFAVertex> vertices; // mapping from index to vertex
|
||||||
unsigned char prev;
|
size_t offset = 0;
|
||||||
|
unsigned char cur = 0;
|
||||||
|
unsigned char prev = 0;
|
||||||
const bool som;
|
const bool som;
|
||||||
const bool utf8;
|
const bool utf8;
|
||||||
const bool allowStartDs;
|
const bool allowStartDs;
|
||||||
const ReportManager &rm;
|
const ReportManager &rm;
|
||||||
|
|
||||||
fmstate(const bool som_in, const bool utf8_in, const bool aSD_in,
|
boost::dynamic_bitset<> accept; // states leading to accept
|
||||||
|
boost::dynamic_bitset<> accept_with_eod; // states leading to accept or eod
|
||||||
|
|
||||||
|
fmstate(const NGHolder &g, bool som_in, bool utf8_in, bool aSD_in,
|
||||||
const ReportManager &rm_in)
|
const ReportManager &rm_in)
|
||||||
: offset(0), cur(0), prev(0), som(som_in), utf8(utf8_in),
|
: num_states(num_vertices(g)), states(num_states), next(num_states),
|
||||||
allowStartDs(aSD_in), rm(rm_in) {}
|
vertices(num_vertices(g), NFAGraph::null_vertex()), som(som_in),
|
||||||
|
utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in), accept(num_states),
|
||||||
|
accept_with_eod(num_states) {
|
||||||
|
// init states
|
||||||
|
states.s.set(g[g.start].index);
|
||||||
|
if (allowStartDs) {
|
||||||
|
states.s.set(g[g.startDs].index);
|
||||||
|
}
|
||||||
|
// fill vertex mapping
|
||||||
|
for (const auto &v : vertices_range(g)) {
|
||||||
|
vertices[g[v].index] = v;
|
||||||
|
}
|
||||||
|
// init accept states
|
||||||
|
for (const auto &u : inv_adjacent_vertices_range(g.accept, g)) {
|
||||||
|
accept.set(g[u].index);
|
||||||
|
}
|
||||||
|
accept_with_eod = accept;
|
||||||
|
for (const auto &u : inv_adjacent_vertices_range(g.acceptEod, g)) {
|
||||||
|
accept_with_eod.set(g[u].index);
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static
|
} // namespace
|
||||||
void initStates(const NGHolder &g, struct fmstate &state) {
|
|
||||||
state.states.insert(make_pair(g.start, 0));
|
|
||||||
if (state.allowStartDs) {
|
|
||||||
state.states.insert(make_pair(g.startDs, 0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static
|
static
|
||||||
bool isWordChar(const unsigned char c) {
|
bool isWordChar(const unsigned char c) {
|
||||||
@ -115,17 +140,9 @@ bool isUtf8CodePoint(const char c) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
bool canReach(const NGHolder &g, const NFAVertex &src, const NFAVertex &dst,
|
bool canReach(const NGHolder &g, const NFAEdge &e,
|
||||||
struct fmstate &state) {
|
struct fmstate &state) {
|
||||||
// find relevant edge and see whether it has asserts
|
auto flags = g[e].assert_flags;
|
||||||
NFAEdge e;
|
|
||||||
bool exists;
|
|
||||||
u32 flags;
|
|
||||||
|
|
||||||
tie(e, exists) = edge(src, dst, g);
|
|
||||||
assert(exists);
|
|
||||||
|
|
||||||
flags = g[e].assert_flags;
|
|
||||||
if (!flags) {
|
if (!flags) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -160,33 +177,35 @@ bool canReach(const NGHolder &g, const NFAVertex &src, const NFAVertex &dst,
|
|||||||
static
|
static
|
||||||
void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
|
void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
|
||||||
bool allowEodMatches) {
|
bool allowEodMatches) {
|
||||||
SOMMap::const_iterator it, ite;
|
auto acc_states = state.states.s;
|
||||||
|
acc_states &= allowEodMatches ? state.accept_with_eod : state.accept;
|
||||||
|
|
||||||
for (it = state.states.begin(), ite = state.states.end(); it != ite; ++it) {
|
for (size_t i = acc_states.find_first(); i != acc_states.npos;
|
||||||
NFAGraph::adjacency_iterator ai, ae;
|
i = acc_states.find_next(i)) {
|
||||||
|
const NFAVertex u = state.vertices[i];
|
||||||
|
const size_t &som_offset = state.states.som[i];
|
||||||
|
|
||||||
// we can't accept anything from startDs in between UTF-8 codepoints
|
// we can't accept anything from startDs in between UTF-8 codepoints
|
||||||
if (state.utf8 && it->first == g.startDs && !isUtf8CodePoint(state.cur)) {
|
if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (tie(ai, ae) = adjacent_vertices(it->first, g); ai != ae; ++ai) {
|
for (const auto &e : out_edges_range(u, g)) {
|
||||||
if (*ai == g.accept || (*ai == g.acceptEod && allowEodMatches)) {
|
NFAVertex v = target(e, g);
|
||||||
|
if (v == g.accept || (v == g.acceptEod && allowEodMatches)) {
|
||||||
// check edge assertions if we are allowed to reach accept
|
// check edge assertions if we are allowed to reach accept
|
||||||
if (!canReach(g, it->first, *ai, state)) {
|
if (!canReach(g, e, state)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
DEBUG_PRINTF("match found at %zu\n", state.offset);
|
DEBUG_PRINTF("match found at %zu\n", state.offset);
|
||||||
|
|
||||||
assert(!g[it->first].reports.empty());
|
assert(!g[u].reports.empty());
|
||||||
for (const auto &report_id :
|
for (const auto &report_id : g[u].reports) {
|
||||||
g[it->first].reports) {
|
|
||||||
const Report &ri = state.rm.getReport(report_id);
|
const Report &ri = state.rm.getReport(report_id);
|
||||||
|
|
||||||
DEBUG_PRINTF("report %u has offset adjustment %d\n",
|
DEBUG_PRINTF("report %u has offset adjustment %d\n",
|
||||||
report_id, ri.offsetAdjust);
|
report_id, ri.offsetAdjust);
|
||||||
matches.insert(
|
matches.emplace(som_offset, state.offset + ri.offsetAdjust);
|
||||||
make_pair(it->second, state.offset + ri.offsetAdjust));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -195,55 +214,57 @@ void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
|
|||||||
|
|
||||||
static
|
static
|
||||||
void step(const NGHolder &g, struct fmstate &state) {
|
void step(const NGHolder &g, struct fmstate &state) {
|
||||||
state.next.clear();
|
state.next.s.reset();
|
||||||
SOMMap::iterator it, ite;
|
|
||||||
|
|
||||||
for (it = state.states.begin(), ite = state.states.end(); it != ite; ++it) {
|
for (size_t i = state.states.s.find_first(); i != state.states.s.npos;
|
||||||
NFAGraph::adjacency_iterator ai, ae;
|
i = state.states.s.find_next(i)) {
|
||||||
|
const NFAVertex &u = state.vertices[i];
|
||||||
|
const size_t &u_som_offset = state.states.som[i];
|
||||||
|
|
||||||
for (tie(ai, ae) = adjacent_vertices(it->first, g); ai != ae; ++ai) {
|
for (const auto &e : out_edges_range(u, g)) {
|
||||||
if (*ai == g.acceptEod) {
|
NFAVertex v = target(e, g);
|
||||||
|
if (v == g.acceptEod) {
|
||||||
// can't know the future: we don't know if we're at EOD.
|
// can't know the future: we don't know if we're at EOD.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (*ai == g.accept) {
|
if (v == g.accept) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!state.allowStartDs && *ai == g.startDs) {
|
if (!state.allowStartDs && v == g.startDs) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const CharReach &cr = g[*ai].char_reach;
|
const CharReach &cr = g[v].char_reach;
|
||||||
|
const size_t v_idx = g[v].index;
|
||||||
|
|
||||||
// check reachability and edge assertions
|
// check reachability and edge assertions
|
||||||
if (cr.test(state.cur) && canReach(g, it->first, *ai, state)) {
|
if (cr.test(state.cur) && canReach(g, e, state)) {
|
||||||
SOMMap::const_iterator ni;
|
|
||||||
size_t next_som;
|
|
||||||
|
|
||||||
// if we aren't in SOM mode, just set every SOM to 0
|
// if we aren't in SOM mode, just set every SOM to 0
|
||||||
if (!state.som) {
|
if (!state.som) {
|
||||||
state.next[*ai] = 0;
|
state.next.s.set(v_idx);
|
||||||
|
state.next.som[v_idx] = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if this is first vertex since start, use current offset as SOM
|
// if this is first vertex since start, use current offset as SOM
|
||||||
if (it->first == g.start || it->first == g.startDs ||
|
size_t next_som;
|
||||||
is_virtual_start(it->first, g)) {
|
if (u == g.start || u == g.startDs || is_virtual_start(u, g)) {
|
||||||
next_som = state.offset;
|
next_som = state.offset;
|
||||||
} else {
|
} else {
|
||||||
// else, inherit SOM from predecessor
|
// else, inherit SOM from predecessor
|
||||||
next_som = it->second;
|
next_som = u_som_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if the vertex is already active
|
// check if the vertex is already active
|
||||||
ni = state.next.find(*ai);
|
|
||||||
|
|
||||||
// if this vertex is not yet active, use current SOM
|
// if this vertex is not yet active, use current SOM
|
||||||
if (ni == state.next.end()) {
|
if (!state.next.s.test(v_idx)) {
|
||||||
state.next[*ai] = next_som;
|
state.next.s.set(v_idx);
|
||||||
|
state.next.som[v_idx] = next_som;
|
||||||
} else {
|
} else {
|
||||||
// else, work out leftmost SOM
|
// else, work out leftmost SOM
|
||||||
state.next[*ai] = min(next_som, ni->second);
|
state.next.som[v_idx] =
|
||||||
|
min(next_som, state.next.som[v_idx]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -251,34 +272,32 @@ void step(const NGHolder &g, struct fmstate &state) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// filter extraneous matches
|
// filter extraneous matches
|
||||||
static void filterMatches(MatchSet &matches) {
|
static
|
||||||
|
void filterMatches(MatchSet &matches) {
|
||||||
set<size_t> eom;
|
set<size_t> eom;
|
||||||
MatchSet::iterator msit;
|
|
||||||
|
|
||||||
// first, collect all end-offset matches
|
// first, collect all end-offset matches
|
||||||
for (msit = matches.begin(); msit != matches.end(); ++msit) {
|
for (const auto &match : matches) {
|
||||||
eom.insert(msit->second);
|
eom.insert(match.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
// now, go through all the end-offsets and filter extra matches
|
// now, go through all the end-offsets and filter extra matches
|
||||||
set<size_t>::const_iterator eomit;
|
for (const auto &elem : eom) {
|
||||||
for (eomit = eom.begin(); eomit != eom.end(); ++eomit) {
|
|
||||||
|
|
||||||
// find minimum SOM for this EOM
|
// find minimum SOM for this EOM
|
||||||
size_t min_som = -1U;
|
size_t min_som = -1U;
|
||||||
for (msit = matches.begin(); msit != matches.end(); ++msit) {
|
for (const auto &match : matches) {
|
||||||
// skip entries with wrong EOM
|
// skip entries with wrong EOM
|
||||||
if (msit->second != *eomit) {
|
if (match.second != elem) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
min_som = min(min_som, msit->first);
|
min_som = min(min_som, match.first);
|
||||||
}
|
}
|
||||||
|
|
||||||
msit = matches.begin();
|
auto msit = matches.begin();
|
||||||
while (msit != matches.end()) {
|
while (msit != matches.end()) {
|
||||||
// skip everything that doesn't match
|
// skip everything that doesn't match
|
||||||
if (msit->second != *eomit || msit->first <= min_som) {
|
if (msit->second != elem || msit->first <= min_som) {
|
||||||
++msit;
|
++msit;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -295,14 +314,13 @@ static void filterMatches(MatchSet &matches) {
|
|||||||
void findMatches(const NGHolder &g, const ReportManager &rm,
|
void findMatches(const NGHolder &g, const ReportManager &rm,
|
||||||
const string &input, MatchSet &matches, const bool notEod,
|
const string &input, MatchSet &matches, const bool notEod,
|
||||||
const bool som, const bool utf8) {
|
const bool som, const bool utf8) {
|
||||||
|
assert(hasCorrectlyNumberedVertices(g));
|
||||||
|
|
||||||
const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
|
const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
|
||||||
|
|
||||||
struct fmstate state(som, utf8, allowStartDs, rm);
|
struct fmstate state(g, som, utf8, allowStartDs, rm);
|
||||||
|
|
||||||
initStates(g, state);
|
for (auto it = input.begin(), ite = input.end(); it != ite; ++it) {
|
||||||
|
|
||||||
string::const_iterator it, ite;
|
|
||||||
for (it = input.begin(), ite = input.end(); it != ite; ++it) {
|
|
||||||
state.offset = distance(input.begin(), it);
|
state.offset = distance(input.begin(), it);
|
||||||
state.cur = *it;
|
state.cur = *it;
|
||||||
|
|
||||||
@ -310,14 +328,15 @@ void findMatches(const NGHolder &g, const ReportManager &rm,
|
|||||||
|
|
||||||
getMatches(g, matches, state, false);
|
getMatches(g, matches, state, false);
|
||||||
|
|
||||||
DEBUG_PRINTF("index %zu, %zu states on\n", state.offset, state.next.size());
|
DEBUG_PRINTF("index %zu, %zu states on\n", state.offset,
|
||||||
if (state.next.empty()) {
|
state.next.s.count());
|
||||||
|
if (state.next.s.empty()) {
|
||||||
if (state.som) {
|
if (state.som) {
|
||||||
filterMatches(matches);
|
filterMatches(matches);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
state.states.swap(state.next);
|
state.states = state.next;
|
||||||
state.prev = state.cur;
|
state.prev = state.cur;
|
||||||
}
|
}
|
||||||
state.offset = input.size();
|
state.offset = input.size();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user