mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-10-01 03:57:43 +03:00
hscollider: tool for testing Hyperscan match behaviour against PCRE
This commit is contained in:
513
tools/hscollider/GroundTruth.cpp
Normal file
513
tools/hscollider/GroundTruth.cpp
Normal file
@@ -0,0 +1,513 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ExpressionParser.h"
|
||||
#include "expressions.h"
|
||||
#include "GroundTruth.h"
|
||||
#include "pcre_util.h"
|
||||
|
||||
#include "hs_compile.h" // for hs_expr_ext
|
||||
#include "ue2common.h"
|
||||
#include "parser/control_verbs.h"
|
||||
#include "parser/Parser.h"
|
||||
#include "parser/parse_error.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/unicode_def.h"
|
||||
#include "util/unordered.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <pcre.h>
|
||||
|
||||
/* -X, -Y support
|
||||
* as PCRE performance is `non-linear' and these options add a large amount of
|
||||
* scanning, the following short cuts are used:
|
||||
* 1: the suffix is not scanned - we are more interested in the matches from
|
||||
* the original corpora.
|
||||
* 2: only the last 50 bytes of the prefix are scanned. This may lead to some
|
||||
* minor correctness issues for a few patterns.
|
||||
*/
|
||||
|
||||
using namespace std;
|
||||
using namespace ue2;
|
||||
|
||||
// We store matches in a hash table as we're likely to see lots of them. These
|
||||
// are moved into a ResultSet at the end.
|
||||
using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
|
||||
|
||||
namespace {
|
||||
struct CalloutContext {
|
||||
explicit CalloutContext(ostream &os) : out(os) {}
|
||||
ostream &out;
|
||||
PcreMatchSet matches;
|
||||
};
|
||||
}
|
||||
|
||||
static
|
||||
int pcreCallOut(pcre_callout_block *block) {
|
||||
assert(block);
|
||||
assert(block->callout_data);
|
||||
CalloutContext *ctx = static_cast<CalloutContext *>(block->callout_data);
|
||||
|
||||
if (echo_matches) {
|
||||
ctx->out << "PCRE Match @ (" << block->start_match << ","
|
||||
<< block->current_position << ")" << endl;
|
||||
}
|
||||
|
||||
unsigned int from = block->start_match;
|
||||
unsigned int to = block->current_position;
|
||||
assert(from <= to);
|
||||
|
||||
ctx->matches.insert(make_pair(from, to));
|
||||
return 1;
|
||||
}
|
||||
|
||||
static
|
||||
bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
|
||||
bool *prefilter, bool *som, hs_expr_ext *ext) {
|
||||
string regex;
|
||||
unsigned int hs_flags = 0;
|
||||
if (!readExpression(expr, regex, &hs_flags, ext)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
expr.swap(regex);
|
||||
|
||||
if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (force_utf8) {
|
||||
*flags |= PCRE_UTF8;
|
||||
}
|
||||
|
||||
if (force_prefilter) {
|
||||
*prefilter = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static
|
||||
string pcreErrStr(int err) {
|
||||
switch (err) {
|
||||
case PCRE_ERROR_NOMATCH:
|
||||
return "PCRE_ERROR_NOMATCH";
|
||||
case PCRE_ERROR_NULL:
|
||||
return "PCRE_ERROR_NULL";
|
||||
case PCRE_ERROR_BADOPTION:
|
||||
return "PCRE_ERROR_BADOPTION";
|
||||
case PCRE_ERROR_BADMAGIC:
|
||||
return "PCRE_ERROR_BADMAGIC";
|
||||
#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
|
||||
case PCRE_ERROR_UNKNOWN_OPCODE:
|
||||
return "PCRE_ERROR_UNKNOWN_OPCODE";
|
||||
#else
|
||||
case PCRE_ERROR_UNKNOWN_NODE:
|
||||
return "PCRE_ERROR_UNKNOWN_NODE";
|
||||
#endif
|
||||
case PCRE_ERROR_NOMEMORY:
|
||||
return "PCRE_ERROR_NOMEMORY";
|
||||
case PCRE_ERROR_NOSUBSTRING:
|
||||
return "PCRE_ERROR_NOSUBSTRING";
|
||||
case PCRE_ERROR_MATCHLIMIT:
|
||||
return "PCRE_ERROR_MATCHLIMIT";
|
||||
case PCRE_ERROR_CALLOUT:
|
||||
return "PCRE_ERROR_CALLOUT";
|
||||
case PCRE_ERROR_BADUTF8:
|
||||
return "PCRE_ERROR_BADUTF8";
|
||||
case PCRE_ERROR_BADUTF8_OFFSET:
|
||||
return "PCRE_ERROR_BADUTF8_OFFSET";
|
||||
case PCRE_ERROR_PARTIAL:
|
||||
return "PCRE_ERROR_PARTIAL";
|
||||
case PCRE_ERROR_BADPARTIAL:
|
||||
return "PCRE_ERROR_BADPARTIAL";
|
||||
case PCRE_ERROR_INTERNAL:
|
||||
return "PCRE_ERROR_INTERNAL";
|
||||
case PCRE_ERROR_BADCOUNT:
|
||||
return "PCRE_ERROR_BADCOUNT";
|
||||
#if defined(PCRE_ERROR_RECURSIONLIMIT)
|
||||
case PCRE_ERROR_RECURSIONLIMIT:
|
||||
return "PCRE_ERROR_RECURSIONLIMIT";
|
||||
#endif
|
||||
case PCRE_ERROR_DFA_UITEM:
|
||||
return "PCRE_ERROR_DFA_UITEM";
|
||||
case PCRE_ERROR_DFA_UCOND:
|
||||
return "PCRE_ERROR_DFA_UCOND";
|
||||
case PCRE_ERROR_DFA_UMLIMIT:
|
||||
return "PCRE_ERROR_DFA_UMLIMIT";
|
||||
case PCRE_ERROR_DFA_WSSIZE:
|
||||
return "PCRE_ERROR_DFA_WSSIZE";
|
||||
case PCRE_ERROR_DFA_RECURSE:
|
||||
return "PCRE_ERROR_DFA_RECURSE";
|
||||
default:
|
||||
{
|
||||
ostringstream oss;
|
||||
oss << "Unknown PCRE error (value: " << err << ")";
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Construct a ground-truth engine.
 *
 * \param os              stream used for diagnostic output (e.g. echoed
 *                        matches).
 * \param expr            map from expression ID to expression text.
 * \param limit           value later used for pcre_extra::match_limit.
 * \param limit_recursion value later used for
 *                        pcre_extra::match_limit_recursion.
 */
GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
                         unsigned long int limit,
                         unsigned long int limit_recursion)
    : out(os),
      m_expr(expr),
      matchLimit(limit),
      matchLimitRecursion(limit_recursion) {
}
|
||||
|
||||
/**
 * One-time global setup: installs pcreCallOut() as PCRE's global callout
 * function, since matches are collected through callouts.
 */
void GroundTruth::global_prep() {
    pcre_callout = pcreCallOut;
}
|
||||
|
||||
static
|
||||
void addCallout(string &re) {
|
||||
// If the string begins with "(*UTF8)" or "(*UTF8)(*UCP)", we want to keep
|
||||
// it at the front. We reuse the control verbs mini-parser for this.
|
||||
size_t startpos = 0;
|
||||
try {
|
||||
ue2::ParseMode mode;
|
||||
const char *ptr = ue2::read_control_verbs(
|
||||
re.c_str(), re.c_str() + re.size(), 0, mode);
|
||||
startpos = ptr - re.c_str();
|
||||
} catch (const ue2::ParseError &err) {
|
||||
// fall through
|
||||
}
|
||||
assert(startpos <= re.length());
|
||||
re.insert(startpos, "(?:");
|
||||
// We include a \E to close any open \Q quoted block. If there isn't
|
||||
// one, pcre will ignore the \E.
|
||||
re.append("\\E)(?C)");
|
||||
}
|
||||
|
||||
/**
 * Compile the expression with the given ID using PCRE.
 *
 * \param id           key into the expression map.
 * \param no_callouts  if true, the pattern is compiled as-is; otherwise it is
 *                     wrapped with a trailing callout (see addCallout()).
 * \return the compiled pattern together with its decoded flags and extended
 *         parameters.
 * \throws SoftPcreCompileFailure for edit/Hamming distance requests and for
 *         PCRE compile errors 20 and 25.
 * \throws PcreCompileFailure for an unknown ID, undecodable flags, other
 *         unsupported extended flags, or any other compile/study/fullinfo
 *         failure.
 */
unique_ptr<CompiledPcre>
GroundTruth::compile(unsigned id, bool no_callouts) {
    bool highlander = false;
    bool prefilter = false;
    bool som = false;

    // we can still match approximate matching patterns with PCRE if edit
    // distance 0 is requested
    if (force_edit_distance && edit_distance) {
        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
    }

    ExpressionMap::const_iterator i = m_expr.find(id);
    if (i == m_expr.end()) {
        throw PcreCompileFailure("ID not found in expression map.");
    }

    string re(i->second);

    // Start from a known state so that nothing indeterminate is read below,
    // regardless of which fields the decoder chooses to write.
    unsigned flags = 0;
    hs_expr_ext ext{};

    // Decode the flags
    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
        throw PcreCompileFailure("Unable to decode flags.");
    }

    // filter out flags not supported by PCRE
    const u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
                           HS_EXT_FLAG_MIN_LENGTH;
    if (ext.flags & ~supported) {
        // edit distance is a known unsupported flag, so just throw a soft
        // error
        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
            throw SoftPcreCompileFailure(
                "Edit distance not supported by PCRE.");
        }
        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
            throw SoftPcreCompileFailure(
                "Hamming distance not supported by PCRE.");
        }
        throw PcreCompileFailure("Unsupported extended flags.");
    }

    // SOM flags might be set globally.
    som |= !!somFlags;

    // For traditional Hyperscan, add global callout to pattern.
    if (!no_callouts) {
        addCallout(re);
    }

    // Compile the pattern
    const char *errptr = nullptr;
    int errloc = 0;
    int errcode = 0;

    unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
    compiled->utf8 = flags & PCRE_UTF8;
    compiled->highlander = highlander;
    compiled->prefilter = prefilter;
    compiled->som = som;
    compiled->min_offset = ext.min_offset;
    compiled->max_offset = ext.max_offset;
    compiled->min_length = ext.min_length;
    compiled->expression = i->second; // original PCRE
    flags |= PCRE_NO_AUTO_POSSESS;

    compiled->bytecode =
        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);

    if (!compiled->bytecode || errptr) {
        assert(errcode);
        ostringstream oss;
        oss << "Failed to compile expression '" << re << '\'';
        // Never stream a null pointer: the condition above allows errptr to
        // be null when only the bytecode is missing.
        oss << " (" << (errptr ? errptr : "unknown error") << " at " << errloc
            << ").";
        // 20: "regular expression is too large"
        // 25: "lookbehind assertion is not fixed length"
        if (errcode == 20 || errcode == 25) {
            throw SoftPcreCompileFailure(oss.str());
        }
        throw PcreCompileFailure(oss.str());
    }

    // Study the pattern. The study data is only needed for the fullinfo
    // query below; it is released with PCRE's own deallocator, and the
    // unique_ptr never invokes the deleter when pcre_study() returned null.
    unique_ptr<pcre_extra, decltype(&pcre_free_study)> extra(
        pcre_study(compiled->bytecode, 0, &errptr), &pcre_free_study);
    if (errptr) {
        ostringstream oss;
        oss << "Error studying pattern (" << errptr << ").";
        throw PcreCompileFailure(oss.str());
    }

    int infoRes =
        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
                      &compiled->captureCount);
    if (infoRes < PCRE_ERROR_NOMATCH) {
        ostringstream oss;
        oss << "Error determining number of capturing subpatterns ("
            << pcreErrStr(infoRes) << ").";
        throw PcreCompileFailure(oss.str());
    }

    return compiled;
}
|
||||
|
||||
static
|
||||
void filterLeftmostSom(ResultSet &rs) {
|
||||
if (rs.matches.size() <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
set<u64a> seen; // End offsets.
|
||||
set<MatchResult>::iterator it = rs.matches.begin();
|
||||
while (it != rs.matches.end()) {
|
||||
if (seen.insert(it->to).second) {
|
||||
++it; // First time we've seen this end-offset.
|
||||
} else {
|
||||
rs.matches.erase(it++); // Dupe with a "righter" SOM.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
|
||||
set<MatchResult>::iterator it = rs.matches.begin();
|
||||
while (it != rs.matches.end()) {
|
||||
unsigned int from = it->from, to = it->to;
|
||||
unsigned int len = to - from;
|
||||
if (to < compiled.min_offset || to > compiled.max_offset ||
|
||||
len < compiled.min_length) {
|
||||
rs.matches.erase(it++);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
int scanBasic(const CompiledPcre &compiled, const string &buffer,
|
||||
const pcre_extra &extra, vector<int> &ovector,
|
||||
CalloutContext &ctx) {
|
||||
const size_t prefix_len = g_corpora_prefix.size();
|
||||
const size_t suffix_len = g_corpora_suffix.size();
|
||||
|
||||
size_t begin_offset = prefix_len - MIN(50, prefix_len);
|
||||
size_t real_len = buffer.size();
|
||||
|
||||
if (suffix_len > 2) {
|
||||
real_len -= suffix_len - 2;
|
||||
}
|
||||
|
||||
int flags = suffix_len ? PCRE_NOTEOL : 0;
|
||||
int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
|
||||
begin_offset, flags, &ovector[0], ovector.size());
|
||||
|
||||
if (!g_corpora_prefix.empty()) {
|
||||
PcreMatchSet tmp;
|
||||
tmp.swap(ctx.matches);
|
||||
|
||||
for (const auto &m : tmp) {
|
||||
unsigned from = m.first;
|
||||
unsigned to = m.second;
|
||||
if (to >= prefix_len && to <= buffer.size() - suffix_len) {
|
||||
from = from < prefix_len ? 0 : from - prefix_len;
|
||||
to -= prefix_len;
|
||||
ctx.matches.insert(make_pair(from, to));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static
|
||||
int scanOffset(const CompiledPcre &compiled, const string &buffer,
|
||||
const pcre_extra &extra, vector<int> &ovector,
|
||||
CalloutContext &ctx) {
|
||||
size_t offset = MIN(100, g_streamOffset);
|
||||
assert(offset > 0);
|
||||
|
||||
const string buf(string(offset, '\0') + buffer);
|
||||
|
||||
// First, scan our preamble so that we can discard any matches therein
|
||||
// after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
|
||||
// our expression don't match at the end of the preamble.
|
||||
int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
|
||||
PCRE_NOTEOL, &ovector[0], ovector.size());
|
||||
if (ret < PCRE_ERROR_NOMATCH) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
PcreMatchSet pre_matches;
|
||||
pre_matches.swap(ctx.matches);
|
||||
|
||||
// Real scan.
|
||||
ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
|
||||
&ovector[0], ovector.size());
|
||||
if (ret < PCRE_ERROR_NOMATCH) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Erase any matches due entirely to the preamble.
|
||||
for (const auto &m : pre_matches) {
|
||||
ctx.matches.erase(m);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
|
||||
const string &buffer, ResultSet &rs, string &error) {
|
||||
CalloutContext ctx(out);
|
||||
|
||||
pcre_extra extra;
|
||||
extra.flags = 0;
|
||||
|
||||
// Switch on callouts.
|
||||
extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
|
||||
extra.callout_data = &ctx;
|
||||
|
||||
// Set the match_limit (in order to bound execution time on very complex
|
||||
// patterns)
|
||||
extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
|
||||
extra.match_limit = matchLimit;
|
||||
extra.match_limit_recursion = matchLimitRecursion;
|
||||
|
||||
#ifdef PCRE_NO_START_OPTIMIZE
|
||||
// Switch off optimizations that may result in callouts not occurring.
|
||||
extra.flags |= PCRE_NO_START_OPTIMIZE;
|
||||
#endif
|
||||
|
||||
// Ensure there's enough room in the ovector for the capture groups in this
|
||||
// pattern.
|
||||
int ovecsize = (compiled.captureCount + 1) * 3;
|
||||
ovector.resize(ovecsize);
|
||||
|
||||
int ret;
|
||||
switch (colliderMode) {
|
||||
case MODE_BLOCK:
|
||||
case MODE_STREAMING:
|
||||
case MODE_VECTORED:
|
||||
if (g_streamOffset) {
|
||||
ret = scanOffset(compiled, buffer, extra, ovector, ctx);
|
||||
} else {
|
||||
ret = scanBasic(compiled, buffer, extra, ovector, ctx);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
ret = PCRE_ERROR_NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ret < PCRE_ERROR_NOMATCH) {
|
||||
error = pcreErrStr(ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Move matches into a ResultSet.
|
||||
for (const auto &m : ctx.matches) {
|
||||
unsigned long long from = m.first;
|
||||
unsigned long long to = m.second;
|
||||
|
||||
if (g_streamOffset) {
|
||||
// Subtract stream offset imposed by offset test.
|
||||
unsigned long long offset = min(100ull, g_streamOffset);
|
||||
assert(to >= offset);
|
||||
from -= min(offset, from);
|
||||
to -= offset;
|
||||
}
|
||||
|
||||
rs.addMatch(from, to);
|
||||
}
|
||||
|
||||
// If we have no matches, there's no further work to do.
|
||||
if (rs.matches.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (compiled.som) {
|
||||
filterLeftmostSom(rs);
|
||||
}
|
||||
|
||||
filterExtParams(rs, compiled);
|
||||
|
||||
// If we haven't been asked for SOM, strip the from offsets.
|
||||
if (!compiled.som) {
|
||||
set<MatchResult> endonly;
|
||||
for (const auto &m : rs.matches) {
|
||||
endonly.insert(MatchResult(0, m.to));
|
||||
}
|
||||
rs.matches.swap(endonly);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
Reference in New Issue
Block a user