mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
ng_literal_analysis: use ue2_graph
This reduces compile time ~10% on a number of large cases.
This commit is contained in:
parent
32af5fa794
commit
06cde4c94d
@ -40,17 +40,16 @@
|
||||
#include "util/depth.h"
|
||||
#include "util/graph.h"
|
||||
#include "util/graph_range.h"
|
||||
#include "util/ue2_graph.h"
|
||||
#include "util/ue2string.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <queue>
|
||||
|
||||
#include <boost/graph/adjacency_list.hpp>
|
||||
#include <boost/graph/boykov_kolmogorov_max_flow.hpp>
|
||||
|
||||
using namespace std;
|
||||
using boost::vertex_index;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
@ -65,24 +64,29 @@ namespace {
|
||||
/* Small literal graph type used for the suffix tree used in
|
||||
* compressAndScore. */
|
||||
|
||||
|
||||
struct LitGraphVertexProps {
|
||||
LitGraphVertexProps() {}
|
||||
explicit LitGraphVertexProps(const ue2_literal::elem &c_in) : c(c_in) {}
|
||||
LitGraphVertexProps() = default;
|
||||
explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {}
|
||||
ue2_literal::elem c; // string element (char + bool)
|
||||
size_t index; // managed by ue2_graph
|
||||
};
|
||||
|
||||
struct LitGraphEdgeProps {
|
||||
LitGraphEdgeProps() {}
|
||||
LitGraphEdgeProps() = default;
|
||||
explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {}
|
||||
u64a score = NO_LITERAL_AT_EDGE_SCORE;
|
||||
size_t index; /* only initialised when the reverse edges are added. */
|
||||
size_t index; // managed by ue2_graph
|
||||
};
|
||||
|
||||
struct LitGraph
|
||||
: public ue2_graph<LitGraph, LitGraphVertexProps, LitGraphEdgeProps> {
|
||||
|
||||
LitGraph() : root(add_vertex(*this)), sink(add_vertex(*this)) {}
|
||||
|
||||
const vertex_descriptor root;
|
||||
const vertex_descriptor sink;
|
||||
};
|
||||
|
||||
/* keep edgeList = listS as you cannot remove edges if edgeList = vecS */
|
||||
typedef boost::adjacency_list<boost::vecS, boost::vecS, boost::bidirectionalS,
|
||||
LitGraphVertexProps, LitGraphEdgeProps,
|
||||
boost::no_property> LitGraph;
|
||||
typedef LitGraph::vertex_descriptor LitVertex;
|
||||
typedef LitGraph::edge_descriptor LitEdge;
|
||||
|
||||
@ -95,17 +99,16 @@ typedef std::queue<VertexPair> LitVertexQ;
|
||||
|
||||
/** \brief Dump the literal graph in Graphviz format. */
|
||||
static UNUSED
|
||||
void dumpGraph(const char *filename, const LitGraph &lg, const LitVertex &root,
|
||||
const LitVertex &sink) {
|
||||
void dumpGraph(const char *filename, const LitGraph &lg) {
|
||||
ofstream fout(filename);
|
||||
|
||||
fout << "digraph G {" << endl;
|
||||
|
||||
for (auto v : vertices_range(lg)) {
|
||||
fout << boost::get(vertex_index, lg, v);
|
||||
if (v == root) {
|
||||
fout << lg[v].index;
|
||||
if (v == lg.root) {
|
||||
fout << "[label=\"ROOT\"];";
|
||||
} else if (v == sink) {
|
||||
} else if (v == lg.sink) {
|
||||
fout << "[label=\"SINK\"];";
|
||||
} else {
|
||||
ue2_literal s;
|
||||
@ -117,10 +120,9 @@ void dumpGraph(const char *filename, const LitGraph &lg, const LitVertex &root,
|
||||
|
||||
for (const auto &e : edges_range(lg)) {
|
||||
LitVertex u = source(e, lg), v = target(e, lg);
|
||||
fout << boost::get(vertex_index, lg, u) << " -> " <<
|
||||
boost::get(vertex_index, lg, v) <<
|
||||
"[label=\"" << lg[e].score << "\"]" <<
|
||||
";" << endl;
|
||||
fout << lg[u].index << " -> " << lg[v].index << "[label=\""
|
||||
<< lg[e].score << "\"]"
|
||||
<< ";" << endl;
|
||||
}
|
||||
|
||||
fout << "}" << endl;
|
||||
@ -142,11 +144,11 @@ bool allowExpand(size_t numItems, size_t totalPathsSoFar) {
|
||||
}
|
||||
|
||||
static
|
||||
LitVertex addToLitGraph(LitGraph &lg, LitVertex sink,
|
||||
LitVertex pred, const ue2_literal::elem &c) {
|
||||
LitVertex addToLitGraph(LitGraph &lg, LitVertex pred,
|
||||
const ue2_literal::elem &c) {
|
||||
// Check if we already have this in the graph.
|
||||
for (auto v : adjacent_vertices_range(pred, lg)) {
|
||||
if (v == sink) {
|
||||
if (v == lg.sink) {
|
||||
continue;
|
||||
}
|
||||
if (lg[v].c == c) {
|
||||
@ -160,9 +162,10 @@ LitVertex addToLitGraph(LitGraph &lg, LitVertex sink,
|
||||
}
|
||||
|
||||
static
|
||||
void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex sink,
|
||||
LitVertex pred, const CharReach &cr, NFAVertex v) {
|
||||
for (size_t i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
|
||||
void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex pred,
|
||||
const CharReach &cr, NFAVertex v) {
|
||||
for (size_t i = cr.find_first(); i != CharReach::npos;
|
||||
i = cr.find_next(i)) {
|
||||
if (myisupper(i) && cr.test(mytolower(i))) {
|
||||
// ignore upper half of a nocase pair
|
||||
continue;
|
||||
@ -170,14 +173,14 @@ void addToQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex sink,
|
||||
|
||||
bool nocase = myislower(i) && cr.test(mytoupper(i));
|
||||
ue2_literal::elem c((char)i, nocase);
|
||||
LitVertex lv = addToLitGraph(lg, sink, pred, c);
|
||||
LitVertex lv = addToLitGraph(lg, pred, c);
|
||||
workQ.push(VertexPair(lv, v));
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex root,
|
||||
LitVertex sink, const NGHolder &g, const NFAEdge &e) {
|
||||
void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, const NGHolder &g,
|
||||
const NFAEdge &e) {
|
||||
NFAVertex u = source(e, g);
|
||||
NFAVertex v = target(e, g);
|
||||
const CharReach &cr = g[v].char_reach;
|
||||
@ -186,7 +189,7 @@ void initWorkQueue(LitVertexQ &workQ, LitGraph &lg, LitVertex root,
|
||||
return;
|
||||
}
|
||||
|
||||
addToQueue(workQ, lg, sink, root, cr, u);
|
||||
addToQueue(workQ, lg, lg.root, cr, u);
|
||||
}
|
||||
|
||||
static
|
||||
@ -198,7 +201,8 @@ u32 crCardinality(const CharReach &cr) {
|
||||
}
|
||||
|
||||
u32 rv = 0;
|
||||
for (size_t i = cr.find_first(); i != CharReach::npos; i = cr.find_next(i)) {
|
||||
for (size_t i = cr.find_first(); i != CharReach::npos;
|
||||
i = cr.find_next(i)) {
|
||||
if (myisupper(i) && cr.test(mytolower(i))) {
|
||||
// ignore upper half of a nocase pair
|
||||
continue;
|
||||
@ -213,10 +217,10 @@ u32 crCardinality(const CharReach &cr) {
|
||||
* identifying vertices connected to the sink and removing their other
|
||||
* out-edges. */
|
||||
static
|
||||
void filterLitGraph(LitGraph &lg, const LitVertex sink) {
|
||||
for (auto v : inv_adjacent_vertices_range(sink, lg)) {
|
||||
remove_out_edge_if(v, [&lg, &sink](const LitEdge &e) {
|
||||
return target(e, lg) != sink;
|
||||
void filterLitGraph(LitGraph &lg) {
|
||||
for (auto v : inv_adjacent_vertices_range(lg.sink, lg)) {
|
||||
remove_out_edge_if(v, [&lg](const LitEdge &e) {
|
||||
return target(e, lg) != lg.sink;
|
||||
}, lg);
|
||||
}
|
||||
|
||||
@ -229,13 +233,12 @@ void filterLitGraph(LitGraph &lg, const LitVertex sink) {
|
||||
* from each predecessor of the sink (note: it's a suffix tree except for this
|
||||
* convenience) towards the source, storing each string as we go. */
|
||||
static
|
||||
void extractLiterals(const LitGraph &lg, const LitVertex root,
|
||||
const LitVertex sink, set<ue2_literal> &s) {
|
||||
void extractLiterals(const LitGraph &lg, set<ue2_literal> &s) {
|
||||
ue2_literal lit;
|
||||
|
||||
for (auto u : inv_adjacent_vertices_range(sink, lg)) {
|
||||
for (auto u : inv_adjacent_vertices_range(lg.sink, lg)) {
|
||||
lit.clear();
|
||||
while (u != root) {
|
||||
while (u != lg.root) {
|
||||
lit.push_back(lg[u].c);
|
||||
assert(in_degree(u, lg) <= 1);
|
||||
LitGraph::inv_adjacency_iterator ai2, ae2;
|
||||
@ -277,11 +280,9 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e,
|
||||
}
|
||||
|
||||
LitGraph lg;
|
||||
LitVertex root = add_vertex(lg);
|
||||
LitVertex sink = add_vertex(lg);
|
||||
|
||||
LitVertexQ workQ;
|
||||
initWorkQueue(workQ, lg, root, sink, g, e);
|
||||
initWorkQueue(workQ, lg, g, e);
|
||||
|
||||
while (!workQ.empty()) {
|
||||
const LitVertex lv = workQ.front().first;
|
||||
@ -290,18 +291,18 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e,
|
||||
|
||||
u32 cr_card = crCardinality(cr);
|
||||
size_t numItems = cr_card * in_degree(t, g);
|
||||
size_t committed_count = workQ.size() + in_degree(sink, lg) - 1;
|
||||
size_t committed_count = workQ.size() + in_degree(lg.sink, lg) - 1;
|
||||
|
||||
if (g[t].index == NODE_START) {
|
||||
// reached start, add to literal set
|
||||
add_edge_if_not_present(lv, sink, lg);
|
||||
add_edge_if_not_present(lv, lg.sink, lg);
|
||||
goto next_work_elem;
|
||||
}
|
||||
|
||||
// Expand next vertex
|
||||
if (allowExpand(numItems, committed_count)) {
|
||||
for (auto u : inv_adjacent_vertices_range(t, g)) {
|
||||
addToQueue(workQ, lg, sink, lv, cr, u);
|
||||
addToQueue(workQ, lg, lv, cr, u);
|
||||
}
|
||||
goto next_work_elem;
|
||||
}
|
||||
@ -317,21 +318,21 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e,
|
||||
|
||||
bool nocase = myislower(i) && cr.test(mytoupper(i));
|
||||
ue2_literal::elem c((char)i, nocase);
|
||||
LitVertex lt = addToLitGraph(lg, sink, lv, c);
|
||||
add_edge_if_not_present(lt, sink, lg);
|
||||
LitVertex lt = addToLitGraph(lg, lv, c);
|
||||
add_edge_if_not_present(lt, lg.sink, lg);
|
||||
}
|
||||
goto next_work_elem;
|
||||
}
|
||||
|
||||
// add to literal set
|
||||
add_edge_if_not_present(lv, sink, lg);
|
||||
add_edge_if_not_present(lv, lg.sink, lg);
|
||||
next_work_elem:
|
||||
workQ.pop();
|
||||
}
|
||||
|
||||
filterLitGraph(lg, sink);
|
||||
//dumpGraph("litgraph.dot", lg, root, sink);
|
||||
extractLiterals(lg, root, sink, s);
|
||||
filterLitGraph(lg);
|
||||
//dumpGraph("litgraph.dot", lg);
|
||||
extractLiterals(lg, s);
|
||||
|
||||
// Our literal set should contain no literal that is a suffix of another.
|
||||
assert(!hasSuffixLiterals(s));
|
||||
@ -410,16 +411,15 @@ u64a calculateScore(const ue2_literal &s) {
|
||||
|
||||
/** Adds a literal in reverse order, building up a suffix tree. */
|
||||
static
|
||||
void addReversedLiteral(const ue2_literal &lit, LitGraph &lg,
|
||||
const LitVertex &root, const LitVertex &sink) {
|
||||
void addReversedLiteral(const ue2_literal &lit, LitGraph &lg) {
|
||||
DEBUG_PRINTF("literal: '%s'\n", escapeString(lit).c_str());
|
||||
ue2_literal suffix;
|
||||
LitVertex v = root;
|
||||
LitVertex v = lg.root;
|
||||
for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
|
||||
suffix.push_back(*it);
|
||||
LitVertex w;
|
||||
for (auto v2 : adjacent_vertices_range(v, lg)) {
|
||||
if (v2 != sink && lg[v2].c == *it) {
|
||||
if (v2 != lg.sink && lg[v2].c == *it) {
|
||||
w = v2;
|
||||
goto next_char;
|
||||
}
|
||||
@ -431,17 +431,18 @@ next_char:
|
||||
}
|
||||
|
||||
// Wire the last vertex to the sink.
|
||||
add_edge(v, sink, lg);
|
||||
add_edge(v, lg.sink, lg);
|
||||
}
|
||||
|
||||
static
|
||||
void extractLiterals(const vector<LitEdge> &cutset, const LitGraph &lg,
|
||||
const LitVertex &root, set<ue2_literal> &s) {
|
||||
set<ue2_literal> &s) {
|
||||
for (const auto &e : cutset) {
|
||||
LitVertex u = source(e, lg), v = target(e, lg);
|
||||
LitVertex u = source(e, lg);
|
||||
LitVertex v = target(e, lg);
|
||||
ue2_literal lit;
|
||||
lit.push_back(lg[v].c);
|
||||
while (u != root) {
|
||||
while (u != lg.root) {
|
||||
lit.push_back(lg[u].c);
|
||||
assert(in_degree(u, lg) == 1);
|
||||
LitGraph::inv_adjacency_iterator ai, ae;
|
||||
@ -488,10 +489,7 @@ const char *describeColor(boost::default_color_type c) {
|
||||
static
|
||||
vector<LitEdge> add_reverse_edges_and_index(LitGraph &lg) {
|
||||
vector<LitEdge> fwd_edges;
|
||||
|
||||
size_t next_index = 0;
|
||||
for (const auto &e : edges_range(lg)) {
|
||||
lg[e].index = next_index++;
|
||||
fwd_edges.push_back(e);
|
||||
}
|
||||
|
||||
@ -503,9 +501,7 @@ vector<LitEdge> add_reverse_edges_and_index(LitGraph &lg) {
|
||||
|
||||
assert(!edge(v, u, lg).second);
|
||||
|
||||
LitEdge rev = add_edge(v, u, lg).first;
|
||||
lg[rev].score = 0;
|
||||
lg[rev].index = next_index++;
|
||||
LitEdge rev = add_edge(v, u, LitGraphEdgeProps(0), lg).first;
|
||||
rev_map[lg[e].index] = rev;
|
||||
rev_map[lg[rev].index] = e;
|
||||
}
|
||||
@ -514,20 +510,19 @@ vector<LitEdge> add_reverse_edges_and_index(LitGraph &lg) {
|
||||
}
|
||||
|
||||
static
|
||||
void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink,
|
||||
vector<LitEdge> &cutset) {
|
||||
void findMinCut(LitGraph &lg, vector<LitEdge> &cutset) {
|
||||
cutset.clear();
|
||||
|
||||
//dumpGraph("litgraph.dot", lg, root, sink);
|
||||
//dumpGraph("litgraph.dot", lg);
|
||||
|
||||
assert(!in_degree(root, lg));
|
||||
assert(!out_degree(sink, lg));
|
||||
assert(!in_degree(lg.root, lg));
|
||||
assert(!out_degree(lg.sink, lg));
|
||||
size_t num_real_edges = num_edges(lg);
|
||||
|
||||
// Add reverse edges for the convenience of the BGL's max flow algorithm.
|
||||
vector<LitEdge> rev_edges = add_reverse_edges_and_index(lg);
|
||||
|
||||
const auto v_index_map = get(vertex_index, lg);
|
||||
const auto v_index_map = get(&LitGraphVertexProps::index, lg);
|
||||
const auto e_index_map = get(&LitGraphEdgeProps::index, lg);
|
||||
const size_t num_verts = num_vertices(lg);
|
||||
vector<boost::default_color_type> colors(num_verts);
|
||||
@ -542,7 +537,7 @@ void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink,
|
||||
make_iterator_property_map(predecessors.begin(), v_index_map),
|
||||
make_iterator_property_map(colors.begin(), v_index_map),
|
||||
make_iterator_property_map(distances.begin(), v_index_map),
|
||||
v_index_map, root, sink);
|
||||
v_index_map, lg.root, lg.sink);
|
||||
DEBUG_PRINTF("done, flow = %llu\n", flow);
|
||||
|
||||
/* remove reverse edges */
|
||||
@ -555,21 +550,20 @@ void findMinCut(LitGraph &lg, const LitVertex &root, const LitVertex &sink,
|
||||
|
||||
for (const auto &e : edges_range(lg)) {
|
||||
const LitVertex u = source(e, lg), v = target(e, lg);
|
||||
const auto ucolor = colors[boost::get(vertex_index, lg, u)];
|
||||
const auto vcolor = colors[boost::get(vertex_index, lg, v)];
|
||||
const auto ucolor = colors[lg[u].index];
|
||||
const auto vcolor = colors[lg[v].index];
|
||||
|
||||
DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n",
|
||||
boost::get(vertex_index, lg, u), describeColor(ucolor),
|
||||
boost::get(vertex_index, lg, v), describeColor(vcolor),
|
||||
DEBUG_PRINTF("edge %zu:%s -> %zu:%s score %llu\n", lg[u].index,
|
||||
describeColor(ucolor), lg[v].index, describeColor(vcolor),
|
||||
lg[e].score);
|
||||
|
||||
if (ucolor != boost::white_color && vcolor == boost::white_color) {
|
||||
assert(target(e, lg) != sink);
|
||||
assert(v != lg.sink);
|
||||
white_cut.push_back(e);
|
||||
white_flow += lg[e].score;
|
||||
}
|
||||
if (ucolor == boost::black_color && vcolor != boost::black_color) {
|
||||
assert(target(e, lg) != sink);
|
||||
assert(v != lg.sink);
|
||||
black_cut.push_back(e);
|
||||
black_flow += lg[e].score;
|
||||
}
|
||||
@ -609,21 +603,19 @@ u64a compressAndScore(set<ue2_literal> &s) {
|
||||
initialScore);
|
||||
|
||||
LitGraph lg;
|
||||
const LitVertex root = add_vertex(lg);
|
||||
const LitVertex sink = add_vertex(lg);
|
||||
|
||||
for (const auto &lit : s) {
|
||||
addReversedLiteral(lit, lg, root, sink);
|
||||
addReversedLiteral(lit, lg);
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("suffix tree has %zu vertices and %zu edges\n",
|
||||
num_vertices(lg), num_edges(lg));
|
||||
|
||||
vector<LitEdge> cutset;
|
||||
findMinCut(lg, root, sink, cutset);
|
||||
findMinCut(lg, cutset);
|
||||
|
||||
s.clear();
|
||||
extractLiterals(cutset, lg, root, s);
|
||||
extractLiterals(cutset, lg, s);
|
||||
|
||||
u64a score = scoreSet(s);
|
||||
DEBUG_PRINTF("compressed score is %llu\n", score);
|
||||
|
Loading…
x
Reference in New Issue
Block a user