ng: split NGWrapper into NGHolder, ExpressionInfo

We now use NGHolder for all graph information, while other expression
properties (report, flag information, etc.) go in the new class
ExpressionInfo.
This commit is contained in:
Justin Viiret
2017-03-16 18:18:34 +11:00
committed by Matthew Barr
parent fadfab6d8c
commit 5dfae12a62
41 changed files with 726 additions and 612 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -42,6 +42,8 @@
* word-to-word and word-to-nonword) are dropped.
*/
#include "asserts.h"
#include "compiler/compiler.h"
#include "nfagraph/ng.h"
#include "nfagraph/ng_prune.h"
#include "nfagraph/ng_redundancy.h"
@@ -115,8 +117,8 @@ u32 conjunct(u32 flags1, u32 flags2) {
typedef map<pair<NFAVertex, NFAVertex>, NFAEdge> edge_cache_t;
static
void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache,
u32 &assert_edge_count) {
void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
edge_cache_t &edge_cache, u32 &assert_edge_count) {
DEBUG_PRINTF("replacing assert vertex %zu\n", g[t].index);
const u32 flags = g[t].assert_flags;
@@ -178,8 +180,7 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache,
edge_cache.emplace(cache_key, e);
g[e].assert_flags = flags;
if (++assert_edge_count > MAX_ASSERT_EDGES) {
throw CompileError(g.expressionIndex,
"Pattern is too large.");
throw CompileError(expr.index, "Pattern is too large.");
}
} else {
NFAEdge e = ecit->second;
@@ -200,21 +201,23 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache,
}
static
void setReportId(ReportManager &rm, NGWrapper &g, NFAVertex v, s32 adj) {
void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
NFAVertex v, s32 adj) {
// Don't try and set the report ID of a special vertex.
assert(!is_special(v, g));
// There should be no reports set already.
assert(g[v].reports.empty());
Report r = rm.getBasicInternalReport(g, adj);
Report r = rm.getBasicInternalReport(expr, adj);
g[v].reports.insert(rm.getInternalId(r));
DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj);
}
static
void checkForMultilineStart(ReportManager &rm, NGWrapper &g) {
void checkForMultilineStart(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr) {
vector<NFAEdge> dead;
for (auto v : adjacent_vertices_range(g.start, g)) {
if (!(g[v].assert_flags & POS_FLAG_MULTILINE_START)) {
@@ -238,7 +241,7 @@ void checkForMultilineStart(ReportManager &rm, NGWrapper &g) {
for (const auto &e : dead) {
NFAVertex dummy = add_vertex(g);
g[dummy].char_reach.setall();
setReportId(rm, g, dummy, -1);
setReportId(rm, g, expr, dummy, -1);
add_edge(source(e, g), dummy, g[e], g);
add_edge(dummy, g.accept, g);
}
@@ -263,7 +266,8 @@ bool hasAssertVertices(const NGHolder &g) {
* Remove the horrors that are the temporary assert vertices which arise from
* our construction method. Allows the rest of our code base to live in
* blissful ignorance of their existence. */
void removeAssertVertices(ReportManager &rm, NGWrapper &g) {
void removeAssertVertices(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr) {
size_t num = 0;
DEBUG_PRINTF("before: graph has %zu vertices\n", num_vertices(g));
@@ -285,12 +289,12 @@ void removeAssertVertices(ReportManager &rm, NGWrapper &g) {
for (auto v : vertices_range(g)) {
if (g[v].assert_flags & WORDBOUNDARY_FLAGS) {
replaceAssertVertex(g, v, edge_cache, assert_edge_count);
replaceAssertVertex(g, v, expr, edge_cache, assert_edge_count);
num++;
}
}
checkForMultilineStart(rm, g);
checkForMultilineStart(rm, g, expr);
if (num) {
DEBUG_PRINTF("resolved %zu assert vertices\n", num);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Intel Corporation
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -35,8 +35,9 @@
namespace ue2 {
class ExpressionInfo;
class ReportManager;
class NGWrapper;
class NGHolder;
/** \brief Convert temporary assert vertices (from construction method) to
* edge-based flags.
@@ -44,7 +45,8 @@ class NGWrapper;
* Remove the horrors that are the temporary assert vertices which arise from
* our construction method. Allows the rest of our code base to live in
* blissful ignorance of their existence. */
void removeAssertVertices(ReportManager &rm, NGWrapper &g);
void removeAssertVertices(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr);
} // namespace ue2

View File

@@ -73,7 +73,6 @@ using namespace std;
namespace ue2 {
static
void validateExt(const hs_expr_ext &ext) {
static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET |
@@ -100,26 +99,18 @@ void validateExt(const hs_expr_ext &ext) {
}
ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
unsigned flags, ReportID actionId,
unsigned flags, ReportID report,
const hs_expr_ext *ext)
: utf8(false),
allow_vacuous(flags & HS_FLAG_ALLOWEMPTY),
highlander(flags & HS_FLAG_SINGLEMATCH),
prefilter(flags & HS_FLAG_PREFILTER),
som(SOM_NONE),
index(index_in),
id(actionId),
min_offset(0),
max_offset(MAX_OFFSET),
min_length(0),
edit_distance(0) {
: expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH,
false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET,
0, 0) {
ParseMode mode(flags);
component = parse(expression, mode);
utf8 = mode.utf8; /* utf8 may be set by parse() */
expr.utf8 = mode.utf8; /* utf8 may be set by parse() */
if (utf8 && !isValidUtf8(expression)) {
if (expr.utf8 && !isValidUtf8(expression)) {
throw ParseError("Expression is not valid UTF-8.");
}
@@ -147,7 +138,7 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
// Set SOM type.
if (flags & HS_FLAG_SOM_LEFTMOST) {
som = SOM_LEFT;
expr.som = SOM_LEFT;
}
// Set extended parameters, if we have them.
@@ -156,29 +147,29 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
validateExt(*ext);
if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) {
min_offset = ext->min_offset;
expr.min_offset = ext->min_offset;
}
if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) {
max_offset = ext->max_offset;
expr.max_offset = ext->max_offset;
}
if (ext->flags & HS_EXT_FLAG_MIN_LENGTH) {
min_length = ext->min_length;
expr.min_length = ext->min_length;
}
if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) {
edit_distance = ext->edit_distance;
expr.edit_distance = ext->edit_distance;
}
}
// These are validated in validateExt, so an error will already have been
// thrown if these conditions don't hold.
assert(max_offset >= min_offset);
assert(max_offset >= min_length);
assert(expr.max_offset >= expr.min_offset);
assert(expr.max_offset >= expr.min_length);
// Since prefiltering and SOM aren't supported together, we must squash any
// min_length constraint as well.
if (flags & HS_FLAG_PREFILTER && min_length) {
if (flags & HS_FLAG_PREFILTER && expr.min_length) {
DEBUG_PRINTF("prefiltering mode: squashing min_length constraint\n");
min_length = 0;
expr.min_length = 0;
}
}
@@ -187,25 +178,25 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
* \brief Dumps the parse tree to screen in debug mode and to disk in dump
* mode.
*/
void dumpExpression(UNUSED const ParsedExpression &expr,
void dumpExpression(UNUSED const ParsedExpression &pe,
UNUSED const char *stage, UNUSED const Grey &grey) {
#if defined(DEBUG)
DEBUG_PRINTF("===== Rule ID: %u (internalID: %u) =====\n", expr.id,
expr.index);
DEBUG_PRINTF("===== Rule ID: %u (expression index: %u) =====\n",
pe.expr.report, pe.expr.index);
ostringstream debug_tree;
dumpTree(debug_tree, expr.component.get());
dumpTree(debug_tree, pe.component.get());
printf("%s\n", debug_tree.str().c_str());
#endif // DEBUG
#if defined(DUMP_SUPPORT)
if (grey.dumpFlags & Grey::DUMP_PARSE) {
stringstream ss;
ss << grey.dumpPath << "Expr_" << expr.index << "_componenttree_"
ss << grey.dumpPath << "Expr_" << pe.expr.index << "_componenttree_"
<< stage << ".txt";
ofstream out(ss.str().c_str());
out << "Component Tree for " << expr.id << endl;
dumpTree(out, expr.component.get());
if (expr.utf8) {
out << "Component Tree for " << pe.expr.report << endl;
dumpTree(out, pe.component.get());
if (pe.expr.utf8) {
out << "UTF8 mode" << endl;
}
}
@@ -215,13 +206,13 @@ void dumpExpression(UNUSED const ParsedExpression &expr,
/** \brief Run Component tree optimisations on \a expr. */
static
void optimise(ParsedExpression &expr) {
if (expr.min_length || expr.som) {
void optimise(ParsedExpression &pe) {
if (pe.expr.min_length || pe.expr.som) {
return;
}
DEBUG_PRINTF("optimising\n");
expr.component->optimise(true /* root is connected to sds */);
pe.component->optimise(true /* root is connected to sds */);
}
void addExpression(NG &ng, unsigned index, const char *expression,
@@ -238,34 +229,34 @@ void addExpression(NG &ng, unsigned index, const char *expression,
// Do per-expression processing: errors here will result in an exception
// being thrown up to our caller
ParsedExpression expr(index, expression, flags, id, ext);
dumpExpression(expr, "orig", cc.grey);
ParsedExpression pe(index, expression, flags, id, ext);
dumpExpression(pe, "orig", cc.grey);
// Apply prefiltering transformations if desired.
if (expr.prefilter) {
prefilterTree(expr.component, ParseMode(flags));
dumpExpression(expr, "prefiltered", cc.grey);
if (pe.expr.prefilter) {
prefilterTree(pe.component, ParseMode(flags));
dumpExpression(pe, "prefiltered", cc.grey);
}
// Expressions containing zero-width assertions and other extended pcre
// types aren't supported yet. This call will throw a ParseError exception
// if the component tree contains such a construct.
checkUnsupported(*expr.component);
checkUnsupported(*pe.component);
expr.component->checkEmbeddedStartAnchor(true);
expr.component->checkEmbeddedEndAnchor(true);
pe.component->checkEmbeddedStartAnchor(true);
pe.component->checkEmbeddedEndAnchor(true);
if (cc.grey.optimiseComponentTree) {
optimise(expr);
dumpExpression(expr, "opt", cc.grey);
optimise(pe);
dumpExpression(pe, "opt", cc.grey);
}
DEBUG_PRINTF("component=%p, nfaId=%u, reportId=%u\n",
expr.component.get(), expr.index, expr.id);
pe.component.get(), pe.expr.index, pe.expr.report);
// You can only use the SOM flags if you've also specified an SOM
// precision mode.
if (expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) {
if (pe.expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) {
throw CompileError("To use a SOM expression flag in streaming mode, "
"an SOM precision mode (e.g. "
"HS_MODE_SOM_HORIZON_LARGE) must be specified.");
@@ -273,26 +264,26 @@ void addExpression(NG &ng, unsigned index, const char *expression,
// If this expression is a literal, we can feed it directly to Rose rather
// than building the NFA graph.
if (shortcutLiteral(ng, expr)) {
if (shortcutLiteral(ng, pe)) {
DEBUG_PRINTF("took literal short cut\n");
return;
}
unique_ptr<NGWrapper> g = buildWrapper(ng.rm, cc, expr);
if (!g) {
auto built_expr = buildGraph(ng.rm, cc, pe);
if (!built_expr.g) {
DEBUG_PRINTF("NFA build failed on ID %u, but no exception was "
"thrown.\n", expr.id);
"thrown.\n", pe.expr.report);
throw CompileError("Internal error.");
}
if (!expr.allow_vacuous && matches_everywhere(*g)) {
auto &g = *built_expr.g;
if (!pe.expr.allow_vacuous && matches_everywhere(g)) {
throw CompileError("Pattern matches empty buffer; use "
"HS_FLAG_ALLOWEMPTY to enable support.");
}
if (!ng.addGraph(*g)) {
DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", expr.id);
if (!ng.addGraph(built_expr.expr, g)) {
DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", pe.expr.report);
throw CompileError("Error compiling expression.");
}
}
@@ -453,41 +444,42 @@ bool isSupported(const Component &c) {
}
#endif
unique_ptr<NGWrapper> buildWrapper(ReportManager &rm, const CompileContext &cc,
const ParsedExpression &expr) {
assert(isSupported(*expr.component));
BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc,
const ParsedExpression &pe) {
assert(isSupported(*pe.component));
const unique_ptr<NFABuilder> builder = makeNFABuilder(rm, cc, expr);
const auto builder = makeNFABuilder(rm, cc, pe);
assert(builder);
// Set up START and ACCEPT states; retrieve the special states
const auto bs = makeGlushkovBuildState(*builder, expr.prefilter);
const auto bs = makeGlushkovBuildState(*builder, pe.expr.prefilter);
// Map position IDs to characters/components
expr.component->notePositions(*bs);
pe.component->notePositions(*bs);
// Wire the start dotstar state to the firsts
connectInitialStates(*bs, expr);
connectInitialStates(*bs, pe);
DEBUG_PRINTF("wire up body of expr\n");
// Build the rest of the FOLLOW set
vector<PositionInfo> initials = {builder->getStartDotStar(),
builder->getStart()};
expr.component->buildFollowSet(*bs, initials);
pe.component->buildFollowSet(*bs, initials);
// Wire the lasts to the accept state
connectFinalStates(*bs, expr);
connectFinalStates(*bs, pe);
// Create our edges
bs->buildEdges();
auto g = builder->getGraph();
assert(g);
BuiltExpression built_expr = builder->getGraph();
assert(built_expr.g);
dumpDotWrapper(*g, "00_before_asserts", cc.grey);
removeAssertVertices(rm, *g);
dumpDotWrapper(*built_expr.g, built_expr.expr, "00_before_asserts",
cc.grey);
removeAssertVertices(rm, *built_expr.g, built_expr.expr);
return g;
return built_expr;
}
} // namespace ue2

View File

@@ -35,8 +35,8 @@
#include "ue2common.h"
#include "database.h"
#include "compiler/expression_info.h"
#include "parser/Component.h"
#include "som/som.h"
#include <memory>
#include <boost/core/noncopyable.hpp>
@@ -50,35 +50,32 @@ struct CompileContext;
struct Grey;
struct target_t;
class NG;
class NGHolder;
class ReportManager;
class NGWrapper;
/** Class gathering together the pieces of a parsed expression.
* Note: Owns the provided component.
*/
/** \brief Class gathering together the pieces of a parsed expression. */
class ParsedExpression : boost::noncopyable {
public:
ParsedExpression(unsigned index, const char *expression, unsigned flags,
ReportID actionId, const hs_expr_ext *ext = nullptr);
ReportID report, const hs_expr_ext *ext = nullptr);
bool utf8; //!< UTF-8 mode flag specified
/** \brief Expression information (from flags, extparam etc) */
ExpressionInfo expr;
/** \brief root node of parsed component tree. */
std::unique_ptr<ue2::Component> component;
/** \brief Root node of parsed component tree. */
std::unique_ptr<Component> component;
};
const bool allow_vacuous; //!< HS_FLAG_ALLOWEMPTY specified
const bool highlander; //!< HS_FLAG_SINGLEMATCH specified
const bool prefilter; //!< HS_FLAG_PREFILTER specified
som_type som; //!< chosen SOM mode, or SOM_NONE
/**
* \brief Class gathering together the pieces of an expression that has been
* built into an NFA graph.
*/
struct BuiltExpression {
/** \brief Expression information (from flags, extparam etc) */
ExpressionInfo expr;
/** \brief index in expressions array passed to \ref hs_compile_multi */
const unsigned index;
const ReportID id; //!< user-specified pattern ID
u64a min_offset; //!< 0 if not used
u64a max_offset; //!< MAX_OFFSET if not used
u64a min_length; //!< 0 if not used
u32 edit_distance; //!< 0 if not used
/** \brief Built Glushkov NFA graph. */
std::unique_ptr<NGHolder> g;
};
/**
@@ -95,12 +92,12 @@ public:
* @param ext
* Struct containing extra parameters for this expression, or NULL if
* none.
* @param actionId
* @param report
* The identifier to associate with the expression; returned by engine on
* match.
*/
void addExpression(NG &ng, unsigned index, const char *expression,
unsigned flags, const hs_expr_ext *ext, ReportID actionId);
unsigned flags, const hs_expr_ext *ext, ReportID report);
/**
* Build a Hyperscan database out of the expressions we've been given. A
@@ -128,9 +125,8 @@ struct hs_database *build(NG &ng, unsigned int *length);
* @return
* nullptr on error.
*/
std::unique_ptr<NGWrapper> buildWrapper(ReportManager &rm,
const CompileContext &cc,
const ParsedExpression &expr);
BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc,
const ParsedExpression &expr);
/**
* Build a platform_t out of a target_t.

View File

@@ -0,0 +1,102 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* \file
* \brief ExpressionInfo class for storing the properties of an expression.
*/
#ifndef COMPILER_EXPRESSION_INFO_H
#define COMPILER_EXPRESSION_INFO_H
#include "ue2common.h"
#include "som/som.h"
namespace ue2 {
/**
 * \brief Properties of an expression.
 *
 * Bundles the per-expression settings (index, report ID, flag-derived
 * booleans, SOM type and extended parameters). All members are public and
 * are set once by the constructor from its arguments.
 */
class ExpressionInfo {
public:
    /**
     * \brief Index of the expression.
     *
     * Used:
     * - later on, when reporting errors;
     * - to identify parts of an expression in highlander mode.
     */
    unsigned int index;

    /** \brief User-specified report ID. */
    ReportID report;

    /** \brief Set if a vacuous pattern is permitted. (HS_FLAG_ALLOWEMPTY) */
    bool allow_vacuous;

    /** \brief Set for a "highlander" (single match) pattern.
     * (HS_FLAG_SINGLEMATCH) */
    bool highlander;

    /** \brief Set for a UTF-8 pattern. (HS_FLAG_UTF8) */
    bool utf8;

    /** \brief Set for a prefiltering pattern. (HS_FLAG_PREFILTER) */
    bool prefilter;

    /** \brief Requested start-of-match type; SOM_NONE if none. */
    som_type som;

    /** \brief Extended parameter: minimum match offset. 0 if not used. */
    u64a min_offset;

    /**
     * \brief Extended parameter: maximum match offset.
     * MAX_OFFSET if not used.
     */
    u64a max_offset;

    /** \brief Extended parameter: minimum match length. 0 if not used. */
    u64a min_length;

    /**
     * \brief Extended parameter: edit distance for approximate matching.
     * 0 if not used.
     */
    u32 edit_distance;

    /**
     * \brief Construct with every property given explicitly.
     *
     * Each argument is copied into the member of the same name without the
     * "_in" suffix. The initialisers are listed in member declaration order.
     */
    ExpressionInfo(unsigned int index_in, bool allow_vacuous_in,
                   bool highlander_in, bool utf8_in, bool prefilter_in,
                   som_type som_in, ReportID report_in, u64a min_offset_in,
                   u64a max_offset_in, u64a min_length_in,
                   u32 edit_distance_in)
        : index(index_in),
          report(report_in),
          allow_vacuous(allow_vacuous_in),
          highlander(highlander_in),
          utf8(utf8_in),
          prefilter(prefilter_in),
          som(som_in),
          min_offset(min_offset_in),
          max_offset(max_offset_in),
          min_length(min_length_in),
          edit_distance(edit_distance_in) {}
};
}
#endif // COMPILER_EXPRESSION_INFO_H