simplify max clique analysis

This commit is contained in:
Xiang Wang 2015-12-02 07:24:57 -05:00 committed by Matthew Barr
parent 8c09d054c9
commit 7bcd2b07c9

View File

@ -64,6 +64,7 @@ using boost::adaptors::map_values;
namespace ue2 { namespace ue2 {
#define CASTLE_MAX_TOPS 32 #define CASTLE_MAX_TOPS 32
#define CLIQUE_GRAPH_MAX_SIZE 1000
static static
u32 depth_to_u32(const depth &d) { u32 depth_to_u32(const depth &d) {
@ -107,209 +108,90 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) {
} }
static static
size_t literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b) { bool literalOverlap(const vector<CharReach> &a, const vector<CharReach> &b,
const size_t dist) {
for (size_t i = 0; i < b.size(); i++) { for (size_t i = 0; i < b.size(); i++) {
if (i > dist) {
return true;
}
size_t overlap_len = b.size() - i; size_t overlap_len = b.size() - i;
if (overlap_len <= a.size()) { if (overlap_len <= a.size()) {
if (matches(a.end() - overlap_len, a.end(), b.begin(), if (matches(a.end() - overlap_len, a.end(), b.begin(),
b.end() - i)) { b.end() - i)) {
return i; return false;
} }
} else { } else {
assert(overlap_len > a.size()); assert(overlap_len > a.size());
if (matches(a.begin(), a.end(), b.end() - i - a.size(), if (matches(a.begin(), a.end(), b.end() - i - a.size(),
b.end() - i)) { b.end() - i)) {
return i; return false;
} }
} }
} }
return b.size(); return b.size() > dist;
} }
// UE-2666 case 1: The problem of find largest exclusive subcastles group
// can be reformulated as finding the largest clique (subgraph where every
// vertex is connected to every other vertex) in the graph. We use an
// approximate algorithm here to find the maximum clique.
// References
// ----------
// [1] Boppana, R., & Halldórsson, M. M. (1992).
// Approximating maximum independent sets by excluding subgraphs.
// BIT Numerical Mathematics, 32(2), 180196. Springer.
// doi:10.1007/BF01994876
// ----------
struct CliqueVertexProps { struct CliqueVertexProps {
CliqueVertexProps() {} CliqueVertexProps() {}
explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {} explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {}
u32 stateId = ~0U; u32 stateId = ~0U;
u32 parentId = ~0U;
bool leftChild = false; /* tells us if it is the left child of its parent */
vector<u32> clique1; /* clique for the left branch */
vector<u32> indepSet1; /* independent set for the left branch */
vector<u32> clique2; /* clique for the right branch */
vector<u32> indepSet2; /* independent set for the right branch */
}; };
typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS, typedef boost::adjacency_list<boost::listS, boost::listS, boost::undirectedS,
CliqueVertexProps> CliqueGraph; CliqueVertexProps> CliqueGraph;
typedef CliqueGraph::vertex_descriptor CliqueVertex; typedef CliqueGraph::vertex_descriptor CliqueVertex;
static
unique_ptr<CliqueGraph> makeCG(const vector<vector<u32>> &exclusiveSet) {
u32 size = exclusiveSet.size();
vector<CliqueVertex> vertices;
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
for (u32 i = 0; i < size; ++i) {
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
vertices.push_back(v);
}
// construct the complement graph, then its maximum independent sets
// are equal to the maximum clique of the original graph
for (u32 i = 0; i < size; ++i) {
CliqueVertex s = vertices[i];
vector<u32> complement(size, 0);
for (u32 j = 0; j < exclusiveSet[i].size(); ++j) {
u32 val = exclusiveSet[i][j];
complement[val] = 1;
}
for (u32 k = i + 1; k < size; ++k) {
if (!complement[k]) {
CliqueVertex d = vertices[k];
add_edge(s, d, *cg);
}
}
}
return cg;
}
static
void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n,
vector<u32> &clique, vector<u32> &indepSet) {
u32 id = cg[n].stateId;
if (cg[n].clique1.size() + 1 > cg[n].clique2.size()) {
cg[n].clique1.push_back(id);
clique.swap(cg[n].clique1);
} else {
clique.swap(cg[n].clique2);
}
if (cg[n].indepSet2.size() + 1 > cg[n].indepSet1.size()) {
cg[n].indepSet2.push_back(id);
indepSet.swap(cg[n].indepSet2);
} else {
indepSet.swap(cg[n].indepSet1);
}
}
static static
void getNeighborInfo(const CliqueGraph &g, vector<u32> &neighbor, void getNeighborInfo(const CliqueGraph &g, vector<u32> &neighbor,
vector<u32> &nonneighbor, const CliqueVertex &cv, const CliqueVertex &cv, const set<u32> &group) {
const set<u32> &group) {
u32 id = g[cv].stateId; u32 id = g[cv].stateId;
ue2::unordered_set<u32> neighborId; ue2::unordered_set<u32> neighborId;
// find neighbors for cv // find neighbors for cv
for (const auto &v : adjacent_vertices_range(cv, g)) { for (const auto &v : adjacent_vertices_range(cv, g)) {
if (g[v].stateId != id && contains(group, g[v].stateId)) { if (g[v].stateId != id && contains(group, g[v].stateId)){
neighbor.push_back(g[v].stateId); neighbor.push_back(g[v].stateId);
neighborId.insert(g[v].stateId); neighborId.insert(g[v].stateId);
} DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId);
}
neighborId.insert(id);
// find non-neighbors for cv
for (const auto &v : vertices_range(g)) {
if (!contains(neighborId, g[v].stateId) &&
contains(group, g[v].stateId)) {
nonneighbor.push_back(g[v].stateId);
} }
} }
} }
static static
void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique, void findCliqueGroup(CliqueGraph &cg, vector<u32> &clique) {
vector<u32> &indepSet) {
stack<vector<u32>> gStack; stack<vector<u32>> gStack;
// create mapping between vertex and id // Create mapping between vertex and id
map<u32, CliqueVertex> vertexMap; map<u32, CliqueVertex> vertexMap;
vector<u32> init; vector<u32> init;
for (auto &v : vertices_range(cg)) { for (const auto &v : vertices_range(cg)) {
vertexMap[cg[v].stateId] = v; vertexMap[cg[v].stateId] = v;
init.push_back(cg[v].stateId); init.push_back(cg[v].stateId);
} }
gStack.push(init); gStack.push(init);
// get the vertex to start from // Get the vertex to start from
set<u32> foundVertexId;
ue2::unordered_set<u32> visitedId;
CliqueGraph::vertex_iterator vi, ve; CliqueGraph::vertex_iterator vi, ve;
tie(vi, ve) = vertices(cg); tie(vi, ve) = vertices(cg);
CliqueVertex start = *vi;
u32 startId = cg[start].stateId;
DEBUG_PRINTF("startId:%u\n", startId);
bool leftChild = false;
u32 prevId = startId;
while (!gStack.empty()) { while (!gStack.empty()) {
const auto &g = gStack.top(); vector<u32> g = gStack.top();
// choose a vertex from the graph
assert(!g.empty());
u32 id = g[0];
CliqueVertex &n = vertexMap.at(id);
vector<u32> neighbor;
vector<u32> nonneighbor;
set<u32> subgraphId(g.begin(), g.end());
getNeighborInfo(cg, neighbor, nonneighbor, n, subgraphId);
if (contains(foundVertexId, id)) {
prevId = id;
// get non-neighbors for right branch
if (visitedId.insert(id).second) {
DEBUG_PRINTF("right branch\n");
if (!nonneighbor.empty()) {
gStack.push(nonneighbor);
leftChild = false;
}
} else {
if (id != startId) {
// both the left and right branches are visited,
// update its parent's clique and independent sets
u32 parentId = cg[n].parentId;
CliqueVertex &parent = vertexMap.at(parentId);
if (cg[n].leftChild) {
updateCliqueInfo(cg, n, cg[parent].clique1,
cg[parent].indepSet1);
} else {
updateCliqueInfo(cg, n, cg[parent].clique2,
cg[parent].indepSet2);
}
}
gStack.pop(); gStack.pop();
}
} else { // Choose a vertex from the graph
foundVertexId.insert(id); u32 id = g[0];
cg[n].leftChild = leftChild; const CliqueVertex &n = vertexMap.at(id);
cg[n].parentId = prevId; clique.push_back(id);
cg[n].clique1.clear(); // Corresponding vertex in the original graph
cg[n].clique2.clear(); vector<u32> neighbor;
cg[n].indepSet1.clear(); set<u32> subgraphId(g.begin(), g.end());
cg[n].indepSet2.clear(); getNeighborInfo(cg, neighbor, n, subgraphId);
// get neighbors for left branch // Get graph consisting of neighbors for left branch
if (!neighbor.empty()) { if (!neighbor.empty()) {
gStack.push(neighbor); gStack.push(neighbor);
leftChild = true;
}
prevId = id;
} }
} }
updateCliqueInfo(cg, start, clique, indepSet);
} }
template<typename Graph> template<typename Graph>
@ -322,9 +204,8 @@ bool graph_empty(const Graph &g) {
static static
vector<u32> removeClique(CliqueGraph &cg) { vector<u32> removeClique(CliqueGraph &cg) {
vector<vector<u32>> cliquesVec(1); vector<vector<u32>> cliquesVec(1);
vector<vector<u32>> indepSetsVec(1);
DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg)); DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg));
findCliqueGroup(cg, cliquesVec[0], indepSetsVec[0]); findCliqueGroup(cg, cliquesVec[0]);
while (!graph_empty(cg)) { while (!graph_empty(cg)) {
const vector<u32> &c = cliquesVec.back(); const vector<u32> &c = cliquesVec.back();
vector<CliqueVertex> dead; vector<CliqueVertex> dead;
@ -341,30 +222,22 @@ vector<u32> removeClique(CliqueGraph &cg) {
break; break;
} }
vector<u32> clique; vector<u32> clique;
vector<u32> indepSet; findCliqueGroup(cg, clique);
findCliqueGroup(cg, clique, indepSet);
cliquesVec.push_back(clique); cliquesVec.push_back(clique);
indepSetsVec.push_back(indepSet);
} }
// get the independent set with max size // get the independent set with max size
size_t max = 0; size_t max = 0;
size_t id = 0; size_t id = 0;
for (size_t j = 0; j < indepSetsVec.size(); ++j) { for (size_t j = 0; j < cliquesVec.size(); ++j) {
if (indepSetsVec[j].size() > max) { if (cliquesVec[j].size() > max) {
max = indepSetsVec[j].size(); max = cliquesVec[j].size();
id = j; id = j;
} }
} }
DEBUG_PRINTF("clique size:%lu\n", indepSetsVec[id].size()); DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size());
return indepSetsVec[id]; return cliquesVec[id];
}
static
vector<u32> findMaxClique(const vector<vector<u32>> &exclusiveSet) {
auto cg = makeCG(exclusiveSet);
return removeClique(*cg);
} }
// if the location of any reset character in one literal are after // if the location of any reset character in one literal are after
@ -378,10 +251,10 @@ bool findExclusivePair(const u32 id1, const u32 id2,
const auto &triggers2 = triggers[id2]; const auto &triggers2 = triggers[id2];
for (u32 i = 0; i < triggers1.size(); ++i) { for (u32 i = 0; i < triggers1.size(); ++i) {
for (u32 j = 0; j < triggers2.size(); ++j) { for (u32 j = 0; j < triggers2.size(); ++j) {
size_t max_overlap1 = literalOverlap(triggers1[i], triggers2[j]); if (!literalOverlap(triggers1[i], triggers2[j],
size_t max_overlap2 = literalOverlap(triggers2[j], triggers1[i]); min_reset_dist[id2][j]) ||
if (max_overlap1 <= min_reset_dist[id2][j] || !literalOverlap(triggers2[j], triggers1[i],
max_overlap2 <= min_reset_dist[id1][i]) { min_reset_dist[id1][i])) {
return false; return false;
} }
} }
@ -397,28 +270,33 @@ vector<u32> checkExclusion(const CharReach &cr,
return group; return group;
} }
vector<vector<size_t> > min_reset_dist; vector<vector<size_t>> min_reset_dist;
// get min reset distance for each repeat // get min reset distance for each repeat
for (auto it = triggers.begin(); it != triggers.end(); it++) { for (auto it = triggers.begin(); it != triggers.end(); it++) {
const vector<size_t> &tmp_dist = minResetDistToEnd(*it, cr); const vector<size_t> &tmp_dist = minResetDistToEnd(*it, cr);
min_reset_dist.push_back(tmp_dist); min_reset_dist.push_back(tmp_dist);
} }
vector<vector<u32>> exclusiveSet; vector<CliqueVertex> vertices;
unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();
for (u32 i = 0; i < triggers.size(); ++i) {
CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
vertices.push_back(v);
}
// find exclusive pair for each repeat // find exclusive pair for each repeat
for (u32 i = 0; i < triggers.size(); ++i) { for (u32 i = 0; i < triggers.size(); ++i) {
vector<u32> repeatIds; CliqueVertex s = vertices[i];
for (u32 j = i + 1; j < triggers.size(); ++j) { for (u32 j = i + 1; j < triggers.size(); ++j) {
if (findExclusivePair(i, j, min_reset_dist, triggers)) { if (findExclusivePair(i, j, min_reset_dist, triggers)) {
repeatIds.push_back(j); CliqueVertex d = vertices[j];
add_edge(s, d, *cg);
} }
} }
exclusiveSet.push_back(repeatIds);
DEBUG_PRINTF("Exclusive pair size:%lu\n", repeatIds.size());
} }
// find the largest exclusive group // find the largest exclusive group
return findMaxClique(exclusiveSet); return removeClique(*cg);
} }
static static
@ -576,7 +454,7 @@ buildCastle(const CastleProto &proto,
repeatInfoPair.push_back(make_pair(min_period, is_reset)); repeatInfoPair.push_back(make_pair(min_period, is_reset));
if (is_reset) { if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) {
candidateTriggers.push_back(triggers.at(top)); candidateTriggers.push_back(triggers.at(top));
candidateRepeats.push_back(i); candidateRepeats.push_back(i);
} }
@ -585,7 +463,7 @@ buildCastle(const CastleProto &proto,
// Case 1: exclusive repeats // Case 1: exclusive repeats
bool exclusive = false; bool exclusive = false;
bool pureExclusive = false; bool pureExclusive = false;
u8 activeIdxSize = 0; u32 activeIdxSize = 0;
set<u32> exclusiveGroup; set<u32> exclusiveGroup;
if (cc.grey.castleExclusive) { if (cc.grey.castleExclusive) {
vector<u32> tmpGroup = checkExclusion(cr, candidateTriggers); vector<u32> tmpGroup = checkExclusion(cr, candidateTriggers);
@ -594,7 +472,7 @@ buildCastle(const CastleProto &proto,
// Case 1: mutual exclusive repeats group found, initialize state // Case 1: mutual exclusive repeats group found, initialize state
// sizes // sizes
exclusive = true; exclusive = true;
activeIdxSize = calcPackedBytes(exclusiveSize); activeIdxSize = calcPackedBytes(numRepeats + 1);
if (exclusiveSize == numRepeats) { if (exclusiveSize == numRepeats) {
pureExclusive = true; pureExclusive = true;
streamStateSize = 0; streamStateSize = 0;
@ -642,7 +520,7 @@ buildCastle(const CastleProto &proto,
c->numRepeats = verify_u32(subs.size()); c->numRepeats = verify_u32(subs.size());
c->exclusive = exclusive; c->exclusive = exclusive;
c->pureExclusive = pureExclusive; c->pureExclusive = pureExclusive;
c->activeIdxSize = activeIdxSize; c->activeIdxSize = verify_u8(activeIdxSize);
writeCastleScanEngine(cr, c); writeCastleScanEngine(cr, c);