violet: initial implementation

This commit is contained in:
Alex Coyte
2016-07-15 13:07:00 +10:00
committed by Matthew Barr
parent b13a90e5d2
commit 5c5ec905cc
31 changed files with 3171 additions and 201 deletions

View File

@@ -339,6 +339,12 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e,
g[source(e, g)].index, g[target(e, g)].index, s.size());
}
bool bad_mixed_sensitivity(const ue2_literal &s) {
/* TODO: if the mixed cases is entirely within MAX_MASK2_WIDTH of the end,
* we should be able to handle it */
return mixed_sensitivity(s) && s.length() > MAX_MASK2_WIDTH;
}
static
u64a litUniqueness(const string &s) {
CharReach seen(s);
@@ -624,6 +630,48 @@ u64a compressAndScore(set<ue2_literal> &s) {
return score;
}
/* like compressAndScore, but replaces long mixed sensitivity literals with
* something weaker. */
u64a sanitizeAndCompressAndScore(set<ue2_literal> &lits) {
const size_t maxExploded = 8; // only case-explode this far
/* TODO: the whole compression thing could be made better by systematically
* considering replacing literal sets not just by common suffixes but also
* by nocase literals. */
vector<ue2_literal> replacements;
for (auto it = lits.begin(); it != lits.end();) {
auto jt = it;
++it;
if (!bad_mixed_sensitivity(*jt)) {
continue;
}
/* we have to replace *jt with something... */
ue2_literal s = *jt;
lits.erase(jt);
vector<ue2_literal> exploded;
for (auto cit = caseIterateBegin(s); cit != caseIterateEnd(); ++cit) {
exploded.emplace_back(*cit, false);
if (exploded.size() > maxExploded) {
goto dont_explode;
}
}
insert(&replacements, replacements.end(), exploded);
continue;
dont_explode:
make_nocase(&s);
replacements.push_back(s);
}
insert(&lits, replacements);
return compressAndScore(lits);
}
u64a scoreSet(const set<ue2_literal> &s) {
if (s.empty()) {
return NO_LITERAL_AT_EDGE_SCORE;
@@ -674,7 +722,7 @@ set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAVertex &v,
return s;
}
vector<u64a> scoreEdges(const NGHolder &g) {
vector<u64a> scoreEdges(const NGHolder &g, const flat_set<NFAEdge> &known_bad) {
assert(hasCorrectlyNumberedEdges(g));
vector<u64a> scores(num_edges(g));
@@ -682,8 +730,12 @@ vector<u64a> scoreEdges(const NGHolder &g) {
for (const auto &e : edges_range(g)) {
u32 eidx = g[e].index;
assert(eidx < scores.size());
set<ue2_literal> ls = getLiteralSet(g, e);
scores[eidx] = compressAndScore(ls);
if (contains(known_bad, e)) {
scores[eidx] = NO_LITERAL_AT_EDGE_SCORE;
} else {
set<ue2_literal> ls = getLiteralSet(g, e);
scores[eidx] = compressAndScore(ls);
}
}
return scores;
@@ -842,4 +894,49 @@ bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) {
return true;
}
bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) {
NFAVertex v = g.accept;
for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
NGHolder::inv_adjacency_iterator ai, ae;
tie(ai, ae) = inv_adjacent_vertices(v, g);
if (ai == ae) {
assert(0); // no predecessors?
return false;
}
v = *ai++;
if (ai != ae) {
DEBUG_PRINTF("branch, fail\n");
return false;
}
if (is_special(v, g)) {
DEBUG_PRINTF("special found, fail\n");
return false;
}
const CharReach &cr_g = g[v].char_reach;
const CharReach &cr_l = *it;
if (!cr_l.isSubsetOf(cr_g)) {
/* running over the prefix is needed to prevent false postives */
DEBUG_PRINTF("reach fail\n");
return false;
}
}
// Our last value for v should have only start states for predecessors.
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (!is_any_start(u, g)) {
DEBUG_PRINTF("pred is not start\n");
return false;
}
}
assert(num_vertices(g) == lit.length() + N_SPECIALS);
DEBUG_PRINTF("ok\n");
return true;
}
} // namespace ue2