mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-11-21 03:16:40 +03:00
violet: initial implementation
This commit is contained in:
@@ -339,6 +339,12 @@ void processWorkQueue(const NGHolder &g, const NFAEdge &e,
|
||||
g[source(e, g)].index, g[target(e, g)].index, s.size());
|
||||
}
|
||||
|
||||
bool bad_mixed_sensitivity(const ue2_literal &s) {
|
||||
/* TODO: if the mixed cases is entirely within MAX_MASK2_WIDTH of the end,
|
||||
* we should be able to handle it */
|
||||
return mixed_sensitivity(s) && s.length() > MAX_MASK2_WIDTH;
|
||||
}
|
||||
|
||||
static
|
||||
u64a litUniqueness(const string &s) {
|
||||
CharReach seen(s);
|
||||
@@ -624,6 +630,48 @@ u64a compressAndScore(set<ue2_literal> &s) {
|
||||
return score;
|
||||
}
|
||||
|
||||
/* like compressAndScore, but replaces long mixed sensitivity literals with
|
||||
* something weaker. */
|
||||
u64a sanitizeAndCompressAndScore(set<ue2_literal> &lits) {
|
||||
const size_t maxExploded = 8; // only case-explode this far
|
||||
|
||||
/* TODO: the whole compression thing could be made better by systematically
|
||||
* considering replacing literal sets not just by common suffixes but also
|
||||
* by nocase literals. */
|
||||
|
||||
vector<ue2_literal> replacements;
|
||||
|
||||
for (auto it = lits.begin(); it != lits.end();) {
|
||||
auto jt = it;
|
||||
++it;
|
||||
|
||||
if (!bad_mixed_sensitivity(*jt)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* we have to replace *jt with something... */
|
||||
ue2_literal s = *jt;
|
||||
lits.erase(jt);
|
||||
|
||||
vector<ue2_literal> exploded;
|
||||
for (auto cit = caseIterateBegin(s); cit != caseIterateEnd(); ++cit) {
|
||||
exploded.emplace_back(*cit, false);
|
||||
if (exploded.size() > maxExploded) {
|
||||
goto dont_explode;
|
||||
}
|
||||
}
|
||||
insert(&replacements, replacements.end(), exploded);
|
||||
|
||||
continue;
|
||||
dont_explode:
|
||||
make_nocase(&s);
|
||||
replacements.push_back(s);
|
||||
}
|
||||
|
||||
insert(&lits, replacements);
|
||||
return compressAndScore(lits);
|
||||
}
|
||||
|
||||
u64a scoreSet(const set<ue2_literal> &s) {
|
||||
if (s.empty()) {
|
||||
return NO_LITERAL_AT_EDGE_SCORE;
|
||||
@@ -674,7 +722,7 @@ set<ue2_literal> getLiteralSet(const NGHolder &g, const NFAVertex &v,
|
||||
return s;
|
||||
}
|
||||
|
||||
vector<u64a> scoreEdges(const NGHolder &g) {
|
||||
vector<u64a> scoreEdges(const NGHolder &g, const flat_set<NFAEdge> &known_bad) {
|
||||
assert(hasCorrectlyNumberedEdges(g));
|
||||
|
||||
vector<u64a> scores(num_edges(g));
|
||||
@@ -682,8 +730,12 @@ vector<u64a> scoreEdges(const NGHolder &g) {
|
||||
for (const auto &e : edges_range(g)) {
|
||||
u32 eidx = g[e].index;
|
||||
assert(eidx < scores.size());
|
||||
set<ue2_literal> ls = getLiteralSet(g, e);
|
||||
scores[eidx] = compressAndScore(ls);
|
||||
if (contains(known_bad, e)) {
|
||||
scores[eidx] = NO_LITERAL_AT_EDGE_SCORE;
|
||||
} else {
|
||||
set<ue2_literal> ls = getLiteralSet(g, e);
|
||||
scores[eidx] = compressAndScore(ls);
|
||||
}
|
||||
}
|
||||
|
||||
return scores;
|
||||
@@ -842,4 +894,49 @@ bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool literalIsWholeGraph(const NGHolder &g, const ue2_literal &lit) {
|
||||
NFAVertex v = g.accept;
|
||||
|
||||
for (auto it = lit.rbegin(), ite = lit.rend(); it != ite; ++it) {
|
||||
NGHolder::inv_adjacency_iterator ai, ae;
|
||||
tie(ai, ae) = inv_adjacent_vertices(v, g);
|
||||
if (ai == ae) {
|
||||
assert(0); // no predecessors?
|
||||
return false;
|
||||
}
|
||||
v = *ai++;
|
||||
if (ai != ae) {
|
||||
DEBUG_PRINTF("branch, fail\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (is_special(v, g)) {
|
||||
DEBUG_PRINTF("special found, fail\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const CharReach &cr_g = g[v].char_reach;
|
||||
const CharReach &cr_l = *it;
|
||||
|
||||
if (!cr_l.isSubsetOf(cr_g)) {
|
||||
/* running over the prefix is needed to prevent false postives */
|
||||
DEBUG_PRINTF("reach fail\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Our last value for v should have only start states for predecessors.
|
||||
for (auto u : inv_adjacent_vertices_range(v, g)) {
|
||||
if (!is_any_start(u, g)) {
|
||||
DEBUG_PRINTF("pred is not start\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
assert(num_vertices(g) == lit.length() + N_SPECIALS);
|
||||
|
||||
DEBUG_PRINTF("ok\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace ue2
|
||||
|
||||
Reference in New Issue
Block a user