From 7bcd2b07c92a56ebc7dd537ccfc361d2e0fa824e Mon Sep 17 00:00:00 2001 From: Xiang Wang Date: Wed, 2 Dec 2015 07:24:57 -0500 Subject: [PATCH] simplify max clique analysis --- src/nfa/castlecompile.cpp | 228 +++++++++----------------------------- 1 file changed, 53 insertions(+), 175 deletions(-) diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index cc5c599b..e5cc9267 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -64,6 +64,7 @@ using boost::adaptors::map_values; namespace ue2 { #define CASTLE_MAX_TOPS 32 +#define CLIQUE_GRAPH_MAX_SIZE 1000 static u32 depth_to_u32(const depth &d) { @@ -107,209 +108,90 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) { } static -size_t literalOverlap(const vector &a, const vector &b) { +bool literalOverlap(const vector &a, const vector &b, + const size_t dist) { for (size_t i = 0; i < b.size(); i++) { + if (i > dist) { + return true; + } size_t overlap_len = b.size() - i; if (overlap_len <= a.size()) { if (matches(a.end() - overlap_len, a.end(), b.begin(), b.end() - i)) { - return i; + return false; } } else { assert(overlap_len > a.size()); if (matches(a.begin(), a.end(), b.end() - i - a.size(), b.end() - i)) { - return i; + return false; } } } - return b.size(); + return b.size() > dist; } -// UE-2666 case 1: The problem of find largest exclusive subcastles group -// can be reformulated as finding the largest clique (subgraph where every -// vertex is connected to every other vertex) in the graph. We use an -// approximate algorithm here to find the maximum clique. -// References -// ---------- -// [1] Boppana, R., & Halldórsson, M. M. (1992). -// Approximating maximum independent sets by excluding subgraphs. -// BIT Numerical Mathematics, 32(2), 180–196. Springer. -// doi:10.1007/BF01994876 -// ---------- - struct CliqueVertexProps { CliqueVertexProps() {} explicit CliqueVertexProps(u32 state_in) : stateId(state_in) {} u32 stateId = ~0U; - u32 parentId = ~0U; - bool leftChild = false; /* tells us if it is the left child of its parent */ - - vector clique1; /* clique for the left branch */ - vector indepSet1; /* independent set for the left branch */ - vector clique2; /* clique for the right branch */ - vector indepSet2; /* independent set for the right branch */ }; typedef boost::adjacency_list CliqueGraph; typedef CliqueGraph::vertex_descriptor CliqueVertex; -static -unique_ptr makeCG(const vector> &exclusiveSet) { - u32 size = exclusiveSet.size(); - - vector vertices; - unique_ptr cg = make_unique(); - for (u32 i = 0; i < size; ++i) { - CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg); - vertices.push_back(v); - } - - // construct the complement graph, then its maximum independent sets - // are equal to the maximum clique of the original graph - for (u32 i = 0; i < size; ++i) { - CliqueVertex s = vertices[i]; - vector complement(size, 0); - for (u32 j = 0; j < exclusiveSet[i].size(); ++j) { - u32 val = exclusiveSet[i][j]; - complement[val] = 1; - } - - for (u32 k = i + 1; k < size; ++k) { - if (!complement[k]) { - CliqueVertex d = vertices[k]; - add_edge(s, d, *cg); - } - } - } - return cg; -} - -static -void updateCliqueInfo(CliqueGraph &cg, const CliqueVertex &n, - vector &clique, vector &indepSet) { - u32 id = cg[n].stateId; - if (cg[n].clique1.size() + 1 > cg[n].clique2.size()) { - cg[n].clique1.push_back(id); - clique.swap(cg[n].clique1); - } else { - clique.swap(cg[n].clique2); - } - - if (cg[n].indepSet2.size() + 1 > cg[n].indepSet1.size()) { - cg[n].indepSet2.push_back(id); - indepSet.swap(cg[n].indepSet2); - } else { - indepSet.swap(cg[n].indepSet1); - } -} - static void getNeighborInfo(const CliqueGraph &g, vector &neighbor, - vector &nonneighbor, const CliqueVertex &cv, - const set &group) { + const CliqueVertex &cv, const set &group) { u32 id = g[cv].stateId; ue2::unordered_set neighborId; // find neighbors for cv for (const auto &v : adjacent_vertices_range(cv, g)) { - if (g[v].stateId != id && contains(group, g[v].stateId)) { + if (g[v].stateId != id && contains(group, g[v].stateId)){ neighbor.push_back(g[v].stateId); neighborId.insert(g[v].stateId); - } - } - - neighborId.insert(id); - // find non-neighbors for cv - for (const auto &v : vertices_range(g)) { - if (!contains(neighborId, g[v].stateId) && - contains(group, g[v].stateId)) { - nonneighbor.push_back(g[v].stateId); + DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId); } } } static -void findCliqueGroup(CliqueGraph &cg, vector &clique, - vector &indepSet) { +void findCliqueGroup(CliqueGraph &cg, vector &clique) { stack> gStack; - // create mapping between vertex and id + // Create mapping between vertex and id map vertexMap; vector init; - for (auto &v : vertices_range(cg)) { + for (const auto &v : vertices_range(cg)) { vertexMap[cg[v].stateId] = v; init.push_back(cg[v].stateId); } gStack.push(init); - // get the vertex to start from - set foundVertexId; - ue2::unordered_set visitedId; + // Get the vertex to start from CliqueGraph::vertex_iterator vi, ve; tie(vi, ve) = vertices(cg); - CliqueVertex start = *vi; - u32 startId = cg[start].stateId; - DEBUG_PRINTF("startId:%u\n", startId); - bool leftChild = false; - u32 prevId = startId; while (!gStack.empty()) { - const auto &g = gStack.top(); + vector g = gStack.top(); + gStack.pop(); - // choose a vertex from the graph - assert(!g.empty()); + // Choose a vertex from the graph u32 id = g[0]; - CliqueVertex &n = vertexMap.at(id); - + const CliqueVertex &n = vertexMap.at(id); + clique.push_back(id); + // Corresponding vertex in the original graph vector neighbor; - vector nonneighbor; set subgraphId(g.begin(), g.end()); - getNeighborInfo(cg, neighbor, nonneighbor, n, subgraphId); - if (contains(foundVertexId, id)) { - prevId = id; - // get non-neighbors for right branch - if (visitedId.insert(id).second) { - DEBUG_PRINTF("right branch\n"); - if (!nonneighbor.empty()) { - gStack.push(nonneighbor); - leftChild = false; - } - } else { - if (id != startId) { - // both the left and right branches are visited, - // update its parent's clique and independent sets - u32 parentId = cg[n].parentId; - CliqueVertex &parent = vertexMap.at(parentId); - if (cg[n].leftChild) { - updateCliqueInfo(cg, n, cg[parent].clique1, - cg[parent].indepSet1); - } else { - updateCliqueInfo(cg, n, cg[parent].clique2, - cg[parent].indepSet2); - } - } - gStack.pop(); - } - } else { - foundVertexId.insert(id); - cg[n].leftChild = leftChild; - cg[n].parentId = prevId; - cg[n].clique1.clear(); - cg[n].clique2.clear(); - cg[n].indepSet1.clear(); - cg[n].indepSet2.clear(); - // get neighbors for left branch - if (!neighbor.empty()) { - gStack.push(neighbor); - leftChild = true; - } - prevId = id; + getNeighborInfo(cg, neighbor, n, subgraphId); + // Get graph consisting of neighbors for left branch + if (!neighbor.empty()) { + gStack.push(neighbor); } } - updateCliqueInfo(cg, start, clique, indepSet); } template @@ -322,9 +204,8 @@ bool graph_empty(const Graph &g) { static vector removeClique(CliqueGraph &cg) { vector> cliquesVec(1); - vector> indepSetsVec(1); DEBUG_PRINTF("graph size:%lu\n", num_vertices(cg)); - findCliqueGroup(cg, cliquesVec[0], indepSetsVec[0]); + findCliqueGroup(cg, cliquesVec[0]); while (!graph_empty(cg)) { const vector &c = cliquesVec.back(); vector dead; @@ -341,30 +222,22 @@ vector removeClique(CliqueGraph &cg) { break; } vector clique; - vector indepSet; - findCliqueGroup(cg, clique, indepSet); + findCliqueGroup(cg, clique); cliquesVec.push_back(clique); - indepSetsVec.push_back(indepSet); } // get the independent set with max size size_t max = 0; size_t id = 0; - for (size_t j = 0; j < indepSetsVec.size(); ++j) { - if (indepSetsVec[j].size() > max) { - max = indepSetsVec[j].size(); + for (size_t j = 0; j < cliquesVec.size(); ++j) { + if (cliquesVec[j].size() > max) { + max = cliquesVec[j].size(); id = j; } } - DEBUG_PRINTF("clique size:%lu\n", indepSetsVec[id].size()); - return indepSetsVec[id]; -} - -static -vector findMaxClique(const vector> &exclusiveSet) { - auto cg = makeCG(exclusiveSet); - return removeClique(*cg); + DEBUG_PRINTF("clique size:%lu\n", cliquesVec[id].size()); + return cliquesVec[id]; } // if the location of any reset character in one literal are after @@ -378,10 +251,10 @@ bool findExclusivePair(const u32 id1, const u32 id2, const auto &triggers2 = triggers[id2]; for (u32 i = 0; i < triggers1.size(); ++i) { for (u32 j = 0; j < triggers2.size(); ++j) { - size_t max_overlap1 = literalOverlap(triggers1[i], triggers2[j]); - size_t max_overlap2 = literalOverlap(triggers2[j], triggers1[i]); - if (max_overlap1 <= min_reset_dist[id2][j] || - max_overlap2 <= min_reset_dist[id1][i]) { + if (!literalOverlap(triggers1[i], triggers2[j], + min_reset_dist[id2][j]) || + !literalOverlap(triggers2[j], triggers1[i], + min_reset_dist[id1][i])) { return false; } } @@ -397,28 +270,33 @@ vector checkExclusion(const CharReach &cr, return group; } - vector > min_reset_dist; + vector> min_reset_dist; // get min reset distance for each repeat for (auto it = triggers.begin(); it != triggers.end(); it++) { const vector &tmp_dist = minResetDistToEnd(*it, cr); min_reset_dist.push_back(tmp_dist); } - vector> exclusiveSet; + vector vertices; + unique_ptr cg = make_unique(); + for (u32 i = 0; i < triggers.size(); ++i) { + CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg); + vertices.push_back(v); + } + // find exclusive pair for each repeat for (u32 i = 0; i < triggers.size(); ++i) { - vector repeatIds; + CliqueVertex s = vertices[i]; for (u32 j = i + 1; j < triggers.size(); ++j) { if (findExclusivePair(i, j, min_reset_dist, triggers)) { - repeatIds.push_back(j); + CliqueVertex d = vertices[j]; + add_edge(s, d, *cg); } } - exclusiveSet.push_back(repeatIds); - DEBUG_PRINTF("Exclusive pair size:%lu\n", repeatIds.size()); } // find the largest exclusive group - return findMaxClique(exclusiveSet); + return removeClique(*cg); } static @@ -576,7 +454,7 @@ buildCastle(const CastleProto &proto, repeatInfoPair.push_back(make_pair(min_period, is_reset)); - if (is_reset) { + if (is_reset && candidateRepeats.size() < CLIQUE_GRAPH_MAX_SIZE) { candidateTriggers.push_back(triggers.at(top)); candidateRepeats.push_back(i); } @@ -585,7 +463,7 @@ buildCastle(const CastleProto &proto, // Case 1: exclusive repeats bool exclusive = false; bool pureExclusive = false; - u8 activeIdxSize = 0; + u32 activeIdxSize = 0; set exclusiveGroup; if (cc.grey.castleExclusive) { vector tmpGroup = checkExclusion(cr, candidateTriggers); @@ -594,7 +472,7 @@ buildCastle(const CastleProto &proto, // Case 1: mutual exclusive repeats group found, initialize state // sizes exclusive = true; - activeIdxSize = calcPackedBytes(exclusiveSize); + activeIdxSize = calcPackedBytes(numRepeats + 1); if (exclusiveSize == numRepeats) { pureExclusive = true; streamStateSize = 0; @@ -642,7 +520,7 @@ buildCastle(const CastleProto &proto, c->numRepeats = verify_u32(subs.size()); c->exclusive = exclusive; c->pureExclusive = pureExclusive; - c->activeIdxSize = activeIdxSize; + c->activeIdxSize = verify_u8(activeIdxSize); writeCastleScanEngine(cr, c);