mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
514 lines
16 KiB
C++
514 lines
16 KiB
C++
/*
|
|
* Copyright (c) 2015-2017, Intel Corporation
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "config.h"
|
|
|
|
#include "common.h"
|
|
#include "ExpressionParser.h"
|
|
#include "expressions.h"
|
|
#include "GroundTruth.h"
|
|
#include "pcre_util.h"
|
|
|
|
#include "hs_compile.h" // for hs_expr_ext
|
|
#include "ue2common.h"
|
|
#include "parser/control_verbs.h"
|
|
#include "parser/Parser.h"
|
|
#include "parser/parse_error.h"
|
|
#include "util/make_unique.h"
|
|
#include "util/unicode_def.h"
|
|
#include "util/unordered.h"
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <ostream>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <pcre.h>
|
|
|
|
/* -X, -Y support
 * as PCRE performance is `non-linear' and these options add a large amount of
 * scanning, the following shortcuts are used:
 *      1: the suffix is not scanned - we are more interested in the matches from
 *         the original corpora.
 *      2: only the last 50 bytes of the prefix are scanned. This may lead to some
 *         minor correctness issues for a few patterns.
 */
|
|
|
|
using namespace std;
|
|
using namespace ue2;
|
|
|
|
// We store matches in a hash table as we're likely to see lots of them. These
|
|
// are moved into a ResultSet at the end.
|
|
using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
|
|
|
|
namespace {
|
|
struct CalloutContext {
|
|
explicit CalloutContext(ostream &os) : out(os) {}
|
|
ostream &out;
|
|
PcreMatchSet matches;
|
|
};
|
|
}
|
|
|
|
static
|
|
int pcreCallOut(pcre_callout_block *block) {
|
|
assert(block);
|
|
assert(block->callout_data);
|
|
CalloutContext *ctx = static_cast<CalloutContext *>(block->callout_data);
|
|
|
|
if (echo_matches) {
|
|
ctx->out << "PCRE Match @ (" << block->start_match << ","
|
|
<< block->current_position << ")" << endl;
|
|
}
|
|
|
|
unsigned int from = block->start_match;
|
|
unsigned int to = block->current_position;
|
|
assert(from <= to);
|
|
|
|
ctx->matches.insert(make_pair(from, to));
|
|
return 1;
|
|
}
|
|
|
|
static
|
|
bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
|
|
bool *prefilter, bool *som, hs_expr_ext *ext) {
|
|
string regex;
|
|
unsigned int hs_flags = 0;
|
|
if (!readExpression(expr, regex, &hs_flags, ext)) {
|
|
return false;
|
|
}
|
|
|
|
expr.swap(regex);
|
|
|
|
if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) {
|
|
return false;
|
|
}
|
|
|
|
if (force_utf8) {
|
|
*flags |= PCRE_UTF8;
|
|
}
|
|
|
|
if (force_prefilter) {
|
|
*prefilter = true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static
|
|
string pcreErrStr(int err) {
|
|
switch (err) {
|
|
case PCRE_ERROR_NOMATCH:
|
|
return "PCRE_ERROR_NOMATCH";
|
|
case PCRE_ERROR_NULL:
|
|
return "PCRE_ERROR_NULL";
|
|
case PCRE_ERROR_BADOPTION:
|
|
return "PCRE_ERROR_BADOPTION";
|
|
case PCRE_ERROR_BADMAGIC:
|
|
return "PCRE_ERROR_BADMAGIC";
|
|
#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
|
|
case PCRE_ERROR_UNKNOWN_OPCODE:
|
|
return "PCRE_ERROR_UNKNOWN_OPCODE";
|
|
#else
|
|
case PCRE_ERROR_UNKNOWN_NODE:
|
|
return "PCRE_ERROR_UNKNOWN_NODE";
|
|
#endif
|
|
case PCRE_ERROR_NOMEMORY:
|
|
return "PCRE_ERROR_NOMEMORY";
|
|
case PCRE_ERROR_NOSUBSTRING:
|
|
return "PCRE_ERROR_NOSUBSTRING";
|
|
case PCRE_ERROR_MATCHLIMIT:
|
|
return "PCRE_ERROR_MATCHLIMIT";
|
|
case PCRE_ERROR_CALLOUT:
|
|
return "PCRE_ERROR_CALLOUT";
|
|
case PCRE_ERROR_BADUTF8:
|
|
return "PCRE_ERROR_BADUTF8";
|
|
case PCRE_ERROR_BADUTF8_OFFSET:
|
|
return "PCRE_ERROR_BADUTF8_OFFSET";
|
|
case PCRE_ERROR_PARTIAL:
|
|
return "PCRE_ERROR_PARTIAL";
|
|
case PCRE_ERROR_BADPARTIAL:
|
|
return "PCRE_ERROR_BADPARTIAL";
|
|
case PCRE_ERROR_INTERNAL:
|
|
return "PCRE_ERROR_INTERNAL";
|
|
case PCRE_ERROR_BADCOUNT:
|
|
return "PCRE_ERROR_BADCOUNT";
|
|
#if defined(PCRE_ERROR_RECURSIONLIMIT)
|
|
case PCRE_ERROR_RECURSIONLIMIT:
|
|
return "PCRE_ERROR_RECURSIONLIMIT";
|
|
#endif
|
|
case PCRE_ERROR_DFA_UITEM:
|
|
return "PCRE_ERROR_DFA_UITEM";
|
|
case PCRE_ERROR_DFA_UCOND:
|
|
return "PCRE_ERROR_DFA_UCOND";
|
|
case PCRE_ERROR_DFA_UMLIMIT:
|
|
return "PCRE_ERROR_DFA_UMLIMIT";
|
|
case PCRE_ERROR_DFA_WSSIZE:
|
|
return "PCRE_ERROR_DFA_WSSIZE";
|
|
case PCRE_ERROR_DFA_RECURSE:
|
|
return "PCRE_ERROR_DFA_RECURSE";
|
|
default:
|
|
{
|
|
ostringstream oss;
|
|
oss << "Unknown PCRE error (value: " << err << ")";
|
|
return oss.str();
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Build a ground-truth engine.
 *
 * \param os              Stream handed to the callout context for echoed
 *                        matches.
 * \param expr            Expression map that compile() looks IDs up in.
 * \param limit           Value used for pcre_extra::match_limit in run().
 * \param limit_recursion Value used for pcre_extra::match_limit_recursion
 *                        in run().
 */
GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
                         unsigned long int limit,
                         unsigned long int limit_recursion)
    : out(os),
      m_expr(expr),
      matchLimit(limit),
      matchLimitRecursion(limit_recursion) {
}
|
|
|
|
/**
 * One-off global setup: point PCRE's global callout hook at pcreCallOut()
 * so that the callouts in compiled patterns reach our handler.
 */
void GroundTruth::global_prep() {
    pcre_callout = pcreCallOut;
}
|
|
|
|
static
|
|
void addCallout(string &re) {
|
|
// If the string begins with "(*UTF8)" or "(*UTF8)(*UCP)", we want to keep
|
|
// it at the front. We reuse the control verbs mini-parser for this.
|
|
size_t startpos = 0;
|
|
try {
|
|
ue2::ParseMode mode;
|
|
const char *ptr = ue2::read_control_verbs(
|
|
re.c_str(), re.c_str() + re.size(), 0, mode);
|
|
startpos = ptr - re.c_str();
|
|
} catch (const ue2::ParseError &err) {
|
|
// fall through
|
|
}
|
|
assert(startpos <= re.length());
|
|
re.insert(startpos, "(?:");
|
|
// We include a \E to close any open \Q quoted block. If there isn't
|
|
// one, pcre will ignore the \E.
|
|
re.append("\\E)(?C)");
|
|
}
|
|
|
|
/**
 * Compile the expression registered under `id` with PCRE.
 *
 * The expression is decoded into a regex, PCRE flags and extended
 * parameters; unless `no_callouts` is set, the regex is wrapped so that a
 * callout follows the whole pattern. The pattern is then compiled, studied,
 * and its capture count recorded.
 *
 * \param id          Key of the expression in the expression map.
 * \param no_callouts When true, the pattern is compiled without the trailing
 *                    callout.
 * \return The compiled pattern together with its decoded settings.
 *
 * \throws SoftPcreCompileFailure when the pattern needs edit or Hamming
 *         distance matching, or when PCRE rejects it as too large or as
 *         having a non-fixed-length lookbehind.
 * \throws PcreCompileFailure for every other failure (unknown ID,
 *         undecodable flags, unsupported extended flags, compile, study or
 *         fullinfo errors).
 */
unique_ptr<CompiledPcre>
GroundTruth::compile(unsigned id, bool no_callouts) {
    // PCRE compile error codes that are reported as soft failures.
    static const int ERR_REGEX_TOO_LARGE = 20; // "regular expression is too large"
    static const int ERR_LOOKBEHIND_NOT_FIXED = 25; // "lookbehind assertion is not fixed length"

    bool highlander = false;
    bool prefilter = false;
    bool som = false;

    // we can still match approximate matching patterns with PCRE if edit
    // distance 0 is requested
    if (force_edit_distance && edit_distance) {
        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
    }

    ExpressionMap::const_iterator i = m_expr.find(id);
    if (i == m_expr.end()) {
        throw PcreCompileFailure("ID not found in expression map.");
    }

    string re(i->second);
    unsigned flags;
    hs_expr_ext ext;

    // Decode the flags
    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
        throw PcreCompileFailure("Unable to decode flags.");
    }

    // filter out flags not supported by PCRE
    u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
                     HS_EXT_FLAG_MIN_LENGTH;
    if (ext.flags & ~supported) {
        // edit distance is a known unsupported flag, so just throw a soft error
        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
            throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
        }
        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
            throw SoftPcreCompileFailure(
                "Hamming distance not supported by PCRE.");
        }
        throw PcreCompileFailure("Unsupported extended flags.");
    }

    // SOM flags might be set globally.
    som |= !!somFlags;

    // For traditional Hyperscan, add global callout to pattern.
    if (!no_callouts) {
        addCallout(re);
    }

    // Compile the pattern
    const char *errptr = nullptr;
    int errloc = 0;
    int errcode = 0;

    unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
    compiled->utf8 = flags & PCRE_UTF8;
    compiled->highlander = highlander;
    compiled->prefilter = prefilter;
    compiled->som = som;
    compiled->min_offset = ext.min_offset;
    compiled->max_offset = ext.max_offset;
    compiled->min_length = ext.min_length;
    compiled->expression = i->second; // original PCRE
    flags |= PCRE_NO_AUTO_POSSESS;

    compiled->bytecode =
        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);

    if (!compiled->bytecode || errptr) {
        assert(errcode);
        ostringstream oss;
        oss << "Failed to compile expression '" << re << '\'';
        oss << " (" << errptr << " at " << errloc << ").";
        if (errcode == ERR_REGEX_TOO_LARGE ||
            errcode == ERR_LOOKBEHIND_NOT_FIXED) {
            throw SoftPcreCompileFailure(oss.str());
        }
        throw PcreCompileFailure(oss.str());
    }

    // Study the pattern. The study block is only needed for the fullinfo
    // query below, and is released with pcre_free_study(), which is the
    // function the PCRE API documents for freeing pcre_study() results
    // (a plain free() bypasses PCRE's own memory management hooks).
    shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
                                 pcre_free_study);
    if (errptr) {
        ostringstream oss;
        oss << "Error studying pattern (" << errptr << ").";
        throw PcreCompileFailure(oss.str());
    }

    int infoRes =
        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
                      &compiled->captureCount);
    if (infoRes < PCRE_ERROR_NOMATCH) {
        ostringstream oss;
        oss << "Error determining number of capturing subpatterns ("
            << pcreErrStr(infoRes) << ").";
        throw PcreCompileFailure(oss.str());
    }

    return compiled;
}
|
|
|
|
static
|
|
void filterLeftmostSom(ResultSet &rs) {
|
|
if (rs.matches.size() <= 1) {
|
|
return;
|
|
}
|
|
|
|
set<u64a> seen; // End offsets.
|
|
set<MatchResult>::iterator it = rs.matches.begin();
|
|
while (it != rs.matches.end()) {
|
|
if (seen.insert(it->to).second) {
|
|
++it; // First time we've seen this end-offset.
|
|
} else {
|
|
rs.matches.erase(it++); // Dupe with a "righter" SOM.
|
|
}
|
|
}
|
|
}
|
|
|
|
static
|
|
void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
|
|
set<MatchResult>::iterator it = rs.matches.begin();
|
|
while (it != rs.matches.end()) {
|
|
unsigned int from = it->from, to = it->to;
|
|
unsigned int len = to - from;
|
|
if (to < compiled.min_offset || to > compiled.max_offset ||
|
|
len < compiled.min_length) {
|
|
rs.matches.erase(it++);
|
|
} else {
|
|
++it;
|
|
}
|
|
}
|
|
}
|
|
|
|
static
|
|
int scanBasic(const CompiledPcre &compiled, const string &buffer,
|
|
const pcre_extra &extra, vector<int> &ovector,
|
|
CalloutContext &ctx) {
|
|
const size_t prefix_len = g_corpora_prefix.size();
|
|
const size_t suffix_len = g_corpora_suffix.size();
|
|
|
|
size_t begin_offset = prefix_len - MIN(50, prefix_len);
|
|
size_t real_len = buffer.size();
|
|
|
|
if (suffix_len > 2) {
|
|
real_len -= suffix_len - 2;
|
|
}
|
|
|
|
int flags = suffix_len ? PCRE_NOTEOL : 0;
|
|
int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
|
|
begin_offset, flags, &ovector[0], ovector.size());
|
|
|
|
if (!g_corpora_prefix.empty()) {
|
|
PcreMatchSet tmp;
|
|
tmp.swap(ctx.matches);
|
|
|
|
for (const auto &m : tmp) {
|
|
unsigned from = m.first;
|
|
unsigned to = m.second;
|
|
if (to >= prefix_len && to <= buffer.size() - suffix_len) {
|
|
from = from < prefix_len ? 0 : from - prefix_len;
|
|
to -= prefix_len;
|
|
ctx.matches.insert(make_pair(from, to));
|
|
}
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static
|
|
int scanOffset(const CompiledPcre &compiled, const string &buffer,
|
|
const pcre_extra &extra, vector<int> &ovector,
|
|
CalloutContext &ctx) {
|
|
size_t offset = MIN(100, g_streamOffset);
|
|
assert(offset > 0);
|
|
|
|
const string buf(string(offset, '\0') + buffer);
|
|
|
|
// First, scan our preamble so that we can discard any matches therein
|
|
// after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
|
|
// our expression don't match at the end of the preamble.
|
|
int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
|
|
PCRE_NOTEOL, &ovector[0], ovector.size());
|
|
if (ret < PCRE_ERROR_NOMATCH) {
|
|
return ret;
|
|
}
|
|
|
|
PcreMatchSet pre_matches;
|
|
pre_matches.swap(ctx.matches);
|
|
|
|
// Real scan.
|
|
ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
|
|
&ovector[0], ovector.size());
|
|
if (ret < PCRE_ERROR_NOMATCH) {
|
|
return ret;
|
|
}
|
|
|
|
// Erase any matches due entirely to the preamble.
|
|
for (const auto &m : pre_matches) {
|
|
ctx.matches.erase(m);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
|
|
const string &buffer, ResultSet &rs, string &error) {
|
|
CalloutContext ctx(out);
|
|
|
|
pcre_extra extra;
|
|
extra.flags = 0;
|
|
|
|
// Switch on callouts.
|
|
extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
|
|
extra.callout_data = &ctx;
|
|
|
|
// Set the match_limit (in order to bound execution time on very complex
|
|
// patterns)
|
|
extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
|
|
extra.match_limit = matchLimit;
|
|
extra.match_limit_recursion = matchLimitRecursion;
|
|
|
|
#ifdef PCRE_NO_START_OPTIMIZE
|
|
// Switch off optimizations that may result in callouts not occurring.
|
|
extra.flags |= PCRE_NO_START_OPTIMIZE;
|
|
#endif
|
|
|
|
// Ensure there's enough room in the ovector for the capture groups in this
|
|
// pattern.
|
|
int ovecsize = (compiled.captureCount + 1) * 3;
|
|
ovector.resize(ovecsize);
|
|
|
|
int ret;
|
|
switch (colliderMode) {
|
|
case MODE_BLOCK:
|
|
case MODE_STREAMING:
|
|
case MODE_VECTORED:
|
|
if (g_streamOffset) {
|
|
ret = scanOffset(compiled, buffer, extra, ovector, ctx);
|
|
} else {
|
|
ret = scanBasic(compiled, buffer, extra, ovector, ctx);
|
|
}
|
|
break;
|
|
default:
|
|
assert(0);
|
|
ret = PCRE_ERROR_NULL;
|
|
break;
|
|
}
|
|
|
|
if (ret < PCRE_ERROR_NOMATCH) {
|
|
error = pcreErrStr(ret);
|
|
return false;
|
|
}
|
|
|
|
// Move matches into a ResultSet.
|
|
for (const auto &m : ctx.matches) {
|
|
unsigned long long from = m.first;
|
|
unsigned long long to = m.second;
|
|
|
|
if (g_streamOffset) {
|
|
// Subtract stream offset imposed by offset test.
|
|
unsigned long long offset = min(100ull, g_streamOffset);
|
|
assert(to >= offset);
|
|
from -= min(offset, from);
|
|
to -= offset;
|
|
}
|
|
|
|
rs.addMatch(from, to);
|
|
}
|
|
|
|
// If we have no matches, there's no further work to do.
|
|
if (rs.matches.empty()) {
|
|
return true;
|
|
}
|
|
|
|
if (compiled.som) {
|
|
filterLeftmostSom(rs);
|
|
}
|
|
|
|
filterExtParams(rs, compiled);
|
|
|
|
// If we haven't been asked for SOM, strip the from offsets.
|
|
if (!compiled.som) {
|
|
set<MatchResult> endonly;
|
|
for (const auto &m : rs.matches) {
|
|
endonly.insert(MatchResult(0, m.to));
|
|
}
|
|
rs.matches.swap(endonly);
|
|
}
|
|
|
|
return true;
|
|
}
|