vectorscan/src/nfagraph/ng_utf8.cpp
2015-10-20 09:13:35 +11:00

306 lines
9.5 KiB
C++

/*
* Copyright (c) 2015, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief UTF-8 transforms and operations.
*/
#include "ng_utf8.h"
#include "ng.h"
#include "ng_prune.h"
#include "ng_util.h"
#include "util/graph_range.h"
#include "util/unicode_def.h"
#include <set>
#include <vector>
using namespace std;
namespace ue2 {
static
void allowIllegal(NGWrapper &w, NFAVertex v, u8 pred_char) {
if (in_degree(v, w) != 1) {
DEBUG_PRINTF("unexpected pred\n");
assert(0); /* should be true due to the early stage of this analysis */
return;
}
CharReach &cr = w[v].char_reach;
if (pred_char == 0xe0) {
assert(cr.isSubsetOf(CharReach(0xa0, 0xbf)));
if (cr == CharReach(0xa0, 0xbf)) {
cr |= CharReach(0x80, 0x9f);
}
} else if (pred_char == 0xf0) {
assert(cr.isSubsetOf(CharReach(0x90, 0xbf)));
if (cr == CharReach(0x90, 0xbf)) {
cr |= CharReach(0x80, 0x8f);
}
} else if (pred_char == 0xf4) {
assert(cr.isSubsetOf(CharReach(0x80, 0x8f)));
if (cr == CharReach(0x80, 0x8f)) {
cr |= CharReach(0x90, 0xbf);
}
} else {
assert(0); /* unexpected pred */
}
}
/** \brief Relax forbidden UTF-8 sequences.
*
* Some byte sequences can not appear in valid UTF-8 as they encode code points
* above \\x{10ffff} or they represent overlong encodings. As we require valid
* UTF-8 input, we have no defined behaviour in these cases, as a result we can
* accept them if it simplifies the graph. */
void relaxForbiddenUtf8(NGWrapper &w) {
if (!w.utf8) {
return;
}
const CharReach e0(0xe0);
const CharReach f0(0xf0);
const CharReach f4(0xf4);
for (auto v : vertices_range(w)) {
const CharReach &cr = w[v].char_reach;
if (cr == e0 || cr == f0 || cr == f4) {
u8 pred_char = cr.find_first();
for (auto t : adjacent_vertices_range(v, w)) {
allowIllegal(w, t, pred_char);
}
}
}
}
static
bool hasPredInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
for (auto u : inv_adjacent_vertices_range(v, g)) {
if (contains(s, u)) {
return true;
}
}
return false;
}
static
bool hasSuccInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
for (auto w : adjacent_vertices_range(v, g)) {
if (contains(s, w)) {
return true;
}
}
return false;
}
static
void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) {
set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */
for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
const CharReach &cr = h[v].char_reach;
if (!isutf8ascii(cr) && !isutf8start(cr)) {
bad.insert(v);
}
}
for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) {
const CharReach &cr = h[v].char_reach;
if (!isutf8ascii(cr) && !isutf8start(cr)) {
bad.insert(v);
}
}
// we want to be careful with asserts connected to starts
// as well as they may not finish a code point
for (auto v : vertices_range(h)) {
if (is_virtual_start(v, h)) {
bad.insert(v);
insert(&bad, adjacent_vertices(v, h));
}
}
/* we cannot handle vertices connected to accept as would report matches in
* the middle of codepoints. acceptEod is not a problem as the input must
* end at a codepoint boundary */
bad.insert(h.accept);
// If we're in SOM mode, we don't want to mess with vertices that have a
// direct edge from startDs.
if (som) {
insert(&bad, adjacent_vertices(h.startDs, h));
}
set<NFAVertex> already_seeds; /* already marked as seeds */
for (auto v : vertices_range(h)) {
const CharReach &cr = h[v].char_reach;
if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) {
continue;
}
if (hasSuccInSet(h, v, bad)) {
continue;
}
// Skip vertices that are directly connected to other vertices already
// in the seeds list: we can't collapse two of these directly next to
// each other.
if (hasPredInSet(h, v, already_seeds) ||
hasSuccInSet(h, v, already_seeds)) {
continue;
}
DEBUG_PRINTF("%u is a seed\n", h[v].index);
seeds->push_back(v);
already_seeds.insert(v);
}
}
static
bool expandCyclic(NGHolder &h, NFAVertex v) {
DEBUG_PRINTF("inspecting %u\n", h[v].index);
bool changes = false;
set<NFAVertex> v_preds;
set<NFAVertex> v_succs;
pred(h, v, &v_preds);
succ(h, v, &v_succs);
set<NFAVertex> start_siblings;
set<NFAVertex> end_siblings;
CharReach &v_cr = h[v].char_reach;
/* We need to find start vertices which have all of our preds.
* As we have a self loop, it must be one of our succs. */
for (auto a : adjacent_vertices_range(v, h)) {
set<NFAVertex> a_preds;
pred(h, a, &a_preds);
if (a_preds == v_preds && isutf8start(h[a].char_reach)) {
DEBUG_PRINTF("%u is a start v\n", h[a].index);
start_siblings.insert(a);
}
}
/* We also need to find full cont vertices which have all our own succs;
* As we have a self loop, it must be one of our preds. */
for (auto a : inv_adjacent_vertices_range(v, h)) {
set<NFAVertex> a_succs;
succ(h, a, &a_succs);
if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) {
DEBUG_PRINTF("%u is a full tail cont\n", h[a].index);
end_siblings.insert(a);
}
}
for (auto s : start_siblings) {
if (out_degree(s, h) != 1) {
continue;
}
const CharReach &cr = h[s].char_reach;
if (cr.isSubsetOf(UTF_TWO_START_CR)) {
if (end_siblings.find(*adjacent_vertices(s, h).first)
== end_siblings.end()) {
DEBUG_PRINTF("%u is odd\n", h[s].index);
continue;
}
} else if (cr.isSubsetOf(UTF_THREE_START_CR)) {
NFAVertex m = *adjacent_vertices(s, h).first;
if (h[m].char_reach != UTF_CONT_CR
|| out_degree(m, h) != 1) {
continue;
}
if (end_siblings.find(*adjacent_vertices(m, h).first)
== end_siblings.end()) {
DEBUG_PRINTF("%u is odd\n", h[s].index);
continue;
}
} else if (cr.isSubsetOf(UTF_FOUR_START_CR)) {
NFAVertex m1 = *adjacent_vertices(s, h).first;
if (h[m1].char_reach != UTF_CONT_CR
|| out_degree(m1, h) != 1) {
continue;
}
NFAVertex m2 = *adjacent_vertices(m1, h).first;
if (h[m2].char_reach != UTF_CONT_CR
|| out_degree(m2, h) != 1) {
continue;
}
if (end_siblings.find(*adjacent_vertices(m2, h).first)
== end_siblings.end()) {
DEBUG_PRINTF("%u is odd\n", h[s].index);
continue;
}
} else {
DEBUG_PRINTF("%u is bad\n", h[s].index);
continue;
}
v_cr |= cr;
clear_vertex(s, h);
changes = true;
}
if (changes) {
v_cr |= UTF_CONT_CR; /* we need to add in cont reach */
v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require
* valid unicode data */
v_cr.set(0xc1);
v_cr |= CharReach(0xf5, 0xff);
}
return changes;
}
/** \brief Contract cycles of UTF-8 code points down to a single cyclic vertex
* where possible, based on the assumption that we will always be matching
* against well-formed input. */
void utf8DotRestoration(NGHolder &h, bool som) {
vector<NFAVertex> seeds; /* cyclic ascii vertices */
findSeeds(h, som, &seeds);
bool changes = false;
for (auto v : seeds) {
changes |= expandCyclic(h, v);
}
if (changes) {
pruneUseless(h);
}
}
} // namespace ue2