mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-11-19 02:30:35 +03:00
Initial commit of Hyperscan
This commit is contained in:
305
src/nfagraph/ng_utf8.cpp
Normal file
305
src/nfagraph/ng_utf8.cpp
Normal file
@@ -0,0 +1,305 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief UTF-8 transforms and operations.
|
||||
*/
|
||||
#include "ng_utf8.h"
|
||||
|
||||
#include "ng.h"
|
||||
#include "ng_prune.h"
|
||||
#include "ng_util.h"
|
||||
#include "util/graph_range.h"
|
||||
#include "util/unicode_def.h"
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace ue2 {
|
||||
|
||||
static
|
||||
void allowIllegal(NGWrapper &w, NFAVertex v, u8 pred_char) {
|
||||
if (in_degree(v, w) != 1) {
|
||||
DEBUG_PRINTF("unexpected pred\n");
|
||||
assert(0); /* should be true due to the early stage of this analysis */
|
||||
return;
|
||||
}
|
||||
|
||||
CharReach &cr = w[v].char_reach;
|
||||
if (pred_char == 0xe0) {
|
||||
assert(cr.isSubsetOf(CharReach(0xa0, 0xbf)));
|
||||
if (cr == CharReach(0xa0, 0xbf)) {
|
||||
cr |= CharReach(0x80, 0x9f);
|
||||
}
|
||||
} else if (pred_char == 0xf0) {
|
||||
assert(cr.isSubsetOf(CharReach(0x90, 0xbf)));
|
||||
if (cr == CharReach(0x90, 0xbf)) {
|
||||
cr |= CharReach(0x80, 0x8f);
|
||||
}
|
||||
} else if (pred_char == 0xf4) {
|
||||
assert(cr.isSubsetOf(CharReach(0x80, 0x8f)));
|
||||
if (cr == CharReach(0x80, 0x8f)) {
|
||||
cr |= CharReach(0x90, 0xbf);
|
||||
}
|
||||
} else {
|
||||
assert(0); /* unexpected pred */
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Relax forbidden UTF-8 sequences.
|
||||
*
|
||||
* Some byte sequences can not appear in valid UTF-8 as they encode code points
|
||||
* above \\x{10ffff} or they represent overlong encodings. As we require valid
|
||||
* UTF-8 input, we have no defined behaviour in these cases, as a result we can
|
||||
* accept them if it simplifies the graph. */
|
||||
void relaxForbiddenUtf8(NGWrapper &w) {
|
||||
if (!w.utf8) {
|
||||
return;
|
||||
}
|
||||
|
||||
const CharReach e0(0xe0);
|
||||
const CharReach f0(0xf0);
|
||||
const CharReach f4(0xf4);
|
||||
|
||||
for (auto v : vertices_range(w)) {
|
||||
const CharReach &cr = w[v].char_reach;
|
||||
if (cr == e0 || cr == f0 || cr == f4) {
|
||||
u8 pred_char = cr.find_first();
|
||||
for (auto t : adjacent_vertices_range(v, w)) {
|
||||
allowIllegal(w, t, pred_char);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
bool hasPredInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
|
||||
for (auto u : inv_adjacent_vertices_range(v, g)) {
|
||||
if (contains(s, u)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static
|
||||
bool hasSuccInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
|
||||
for (auto w : adjacent_vertices_range(v, g)) {
|
||||
if (contains(s, w)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static
|
||||
void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) {
|
||||
set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */
|
||||
for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
|
||||
const CharReach &cr = h[v].char_reach;
|
||||
if (!isutf8ascii(cr) && !isutf8start(cr)) {
|
||||
bad.insert(v);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) {
|
||||
const CharReach &cr = h[v].char_reach;
|
||||
if (!isutf8ascii(cr) && !isutf8start(cr)) {
|
||||
bad.insert(v);
|
||||
}
|
||||
}
|
||||
|
||||
// we want to be careful with asserts connected to starts
|
||||
// as well as they may not finish a code point
|
||||
for (auto v : vertices_range(h)) {
|
||||
if (is_virtual_start(v, h)) {
|
||||
bad.insert(v);
|
||||
insert(&bad, adjacent_vertices(v, h));
|
||||
}
|
||||
}
|
||||
|
||||
/* we cannot handle vertices connected to accept as would report matches in
|
||||
* the middle of codepoints. acceptEod is not a problem as the input must
|
||||
* end at a codepoint boundary */
|
||||
bad.insert(h.accept);
|
||||
|
||||
// If we're in SOM mode, we don't want to mess with vertices that have a
|
||||
// direct edge from startDs.
|
||||
if (som) {
|
||||
insert(&bad, adjacent_vertices(h.startDs, h));
|
||||
}
|
||||
|
||||
set<NFAVertex> already_seeds; /* already marked as seeds */
|
||||
for (auto v : vertices_range(h)) {
|
||||
const CharReach &cr = h[v].char_reach;
|
||||
|
||||
if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (hasSuccInSet(h, v, bad)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip vertices that are directly connected to other vertices already
|
||||
// in the seeds list: we can't collapse two of these directly next to
|
||||
// each other.
|
||||
if (hasPredInSet(h, v, already_seeds) ||
|
||||
hasSuccInSet(h, v, already_seeds)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("%u is a seed\n", h[v].index);
|
||||
seeds->push_back(v);
|
||||
already_seeds.insert(v);
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
bool expandCyclic(NGHolder &h, NFAVertex v) {
|
||||
DEBUG_PRINTF("inspecting %u\n", h[v].index);
|
||||
bool changes = false;
|
||||
|
||||
set<NFAVertex> v_preds;
|
||||
set<NFAVertex> v_succs;
|
||||
pred(h, v, &v_preds);
|
||||
succ(h, v, &v_succs);
|
||||
set<NFAVertex> start_siblings;
|
||||
set<NFAVertex> end_siblings;
|
||||
|
||||
CharReach &v_cr = h[v].char_reach;
|
||||
|
||||
/* We need to find start vertices which have all of our preds.
|
||||
* As we have a self loop, it must be one of our succs. */
|
||||
for (auto a : adjacent_vertices_range(v, h)) {
|
||||
set<NFAVertex> a_preds;
|
||||
pred(h, a, &a_preds);
|
||||
|
||||
if (a_preds == v_preds && isutf8start(h[a].char_reach)) {
|
||||
DEBUG_PRINTF("%u is a start v\n", h[a].index);
|
||||
start_siblings.insert(a);
|
||||
}
|
||||
}
|
||||
|
||||
/* We also need to find full cont vertices which have all our own succs;
|
||||
* As we have a self loop, it must be one of our preds. */
|
||||
for (auto a : inv_adjacent_vertices_range(v, h)) {
|
||||
set<NFAVertex> a_succs;
|
||||
succ(h, a, &a_succs);
|
||||
|
||||
if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) {
|
||||
DEBUG_PRINTF("%u is a full tail cont\n", h[a].index);
|
||||
end_siblings.insert(a);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto s : start_siblings) {
|
||||
if (out_degree(s, h) != 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const CharReach &cr = h[s].char_reach;
|
||||
if (cr.isSubsetOf(UTF_TWO_START_CR)) {
|
||||
if (end_siblings.find(*adjacent_vertices(s, h).first)
|
||||
== end_siblings.end()) {
|
||||
DEBUG_PRINTF("%u is odd\n", h[s].index);
|
||||
continue;
|
||||
}
|
||||
} else if (cr.isSubsetOf(UTF_THREE_START_CR)) {
|
||||
NFAVertex m = *adjacent_vertices(s, h).first;
|
||||
|
||||
if (h[m].char_reach != UTF_CONT_CR
|
||||
|| out_degree(m, h) != 1) {
|
||||
continue;
|
||||
}
|
||||
if (end_siblings.find(*adjacent_vertices(m, h).first)
|
||||
== end_siblings.end()) {
|
||||
DEBUG_PRINTF("%u is odd\n", h[s].index);
|
||||
continue;
|
||||
}
|
||||
} else if (cr.isSubsetOf(UTF_FOUR_START_CR)) {
|
||||
NFAVertex m1 = *adjacent_vertices(s, h).first;
|
||||
|
||||
if (h[m1].char_reach != UTF_CONT_CR
|
||||
|| out_degree(m1, h) != 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
NFAVertex m2 = *adjacent_vertices(m1, h).first;
|
||||
|
||||
if (h[m2].char_reach != UTF_CONT_CR
|
||||
|| out_degree(m2, h) != 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (end_siblings.find(*adjacent_vertices(m2, h).first)
|
||||
== end_siblings.end()) {
|
||||
DEBUG_PRINTF("%u is odd\n", h[s].index);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
DEBUG_PRINTF("%u is bad\n", h[s].index);
|
||||
continue;
|
||||
}
|
||||
|
||||
v_cr |= cr;
|
||||
clear_vertex(s, h);
|
||||
changes = true;
|
||||
}
|
||||
|
||||
if (changes) {
|
||||
v_cr |= UTF_CONT_CR; /* we need to add in cont reach */
|
||||
v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require
|
||||
* valid unicode data */
|
||||
v_cr.set(0xc1);
|
||||
v_cr |= CharReach(0xf5, 0xff);
|
||||
}
|
||||
|
||||
return changes;
|
||||
}
|
||||
|
||||
/** \brief Contract cycles of UTF-8 code points down to a single cyclic vertex
|
||||
* where possible, based on the assumption that we will always be matching
|
||||
* against well-formed input. */
|
||||
void utf8DotRestoration(NGHolder &h, bool som) {
|
||||
vector<NFAVertex> seeds; /* cyclic ascii vertices */
|
||||
findSeeds(h, som, &seeds);
|
||||
|
||||
bool changes = false;
|
||||
for (auto v : seeds) {
|
||||
changes |= expandCyclic(h, v);
|
||||
}
|
||||
|
||||
if (changes) {
|
||||
pruneUseless(h);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ue2
|
||||
Reference in New Issue
Block a user