hscollider: tool for testing Hyperscan match behaviour against PCRE

2026-01-17 16:00:26 +03:00 · 2017-12-12 09:29:20 +11:00
parent fae8d21127
commit 1330265ced
32 changed files with 6960 additions and 0 deletions
--- a/tools/hscollider/GroundTruth.cpp
+++ b/tools/hscollider/GroundTruth.cpp
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common.h"
+#include "ExpressionParser.h"
+#include "expressions.h"
+#include "GroundTruth.h"
+#include "pcre_util.h"
+
+#include "hs_compile.h" // for hs_expr_ext
+#include "ue2common.h"
+#include "parser/control_verbs.h"
+#include "parser/Parser.h"
+#include "parser/parse_error.h"
+#include "util/make_unique.h"
+#include "util/unicode_def.h"
+#include "util/unordered.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <pcre.h>
+
+/* -X, -Y support
+ * As PCRE performance is "non-linear" and these options add a large amount of
+ * scanning, the following shortcuts are used:
+ * 1: the suffix is not scanned beyond its first two bytes - we are more
+ *    interested in the matches from the original corpora.
+ * 2: only the last 50 bytes of the prefix are scanned. This may lead to some
+ *    minor correctness issues for a few patterns.
+ */
+
+using namespace std;
+using namespace ue2;
+
+// Matches are gathered in a hash set of (from, to) offset pairs, as we are
+// likely to see lots of them; run() moves them into a ResultSet at the end.
+using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;
+
+namespace {
+/** Per-scan state handed to pcreCallOut() through the PCRE callout data. */
+struct CalloutContext {
+    explicit CalloutContext(std::ostream &stream) : out(stream) {}
+
+    std::ostream &out;    //!< Where echoed matches are written.
+    PcreMatchSet matches; //!< (from, to) pairs recorded by the callout.
+};
+} // namespace
+
+/**
+ * PCRE callout handler (installed by GroundTruth::global_prep()).
+ *
+ * Records the (start_match, current_position) pair of \p block in the
+ * CalloutContext supplied as callout data, echoing it to the context's
+ * stream first when echo_matches is set.
+ *
+ * \return always 1; per the PCRE callout documentation a positive value makes
+ *         matching fail at this point while other possibilities are still
+ *         tried.
+ */
+static
+int pcreCallOut(pcre_callout_block *block) {
+    assert(block);
+    assert(block->callout_data);
+    auto *context = static_cast<CalloutContext *>(block->callout_data);
+
+    if (echo_matches) {
+        context->out << "PCRE Match @ (" << block->start_match << ","
+                     << block->current_position << ")" << endl;
+    }
+
+    const unsigned int matchStart = block->start_match;
+    const unsigned int matchEnd = block->current_position;
+    assert(matchStart <= matchEnd);
+
+    context->matches.insert(pair<unsigned, unsigned>(matchStart, matchEnd));
+    return 1;
+}
+
+/**
+ * Decodes an expression line into a bare regex plus PCRE options.
+ *
+ * On success \p expr is replaced with the regex produced by readExpression(),
+ * and \p flags, \p highlander, \p prefilter, \p som and \p ext are filled in
+ * by readExpression() / getPcreFlags(). The global force_utf8 and
+ * force_prefilter switches are then applied on top.
+ *
+ * Note that \p expr has already been replaced when getPcreFlags() fails.
+ *
+ * \return false if either readExpression() or getPcreFlags() fails.
+ */
+static
+bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
+                    bool *prefilter, bool *som, hs_expr_ext *ext) {
+    string pattern;
+    unsigned int hyperscanFlags = 0;
+
+    const bool parsed = readExpression(expr, pattern, &hyperscanFlags, ext);
+    if (!parsed) {
+        return false;
+    }
+
+    // Hand the bare regex back to the caller.
+    pattern.swap(expr);
+
+    const bool translated =
+        getPcreFlags(hyperscanFlags, flags, highlander, prefilter, som);
+    if (!translated) {
+        return false;
+    }
+
+    // Global overrides take precedence over per-expression settings.
+    if (force_utf8) {
+        *flags = *flags | PCRE_UTF8;
+    }
+    if (force_prefilter) {
+        *prefilter = true;
+    }
+
+    return true;
+}
+
+/**
+ * Maps a PCRE error code to the name of its PCRE_ERROR_* constant.
+ *
+ * \return the constant's name, or "Unknown PCRE error (value: N)" for codes
+ *         that are not in the table below.
+ */
+static
+string pcreErrStr(int err) {
+    struct ErrorName {
+        int code;
+        const char *name;
+    };
+
+    static const ErrorName knownErrors[] = {
+        {PCRE_ERROR_NOMATCH, "PCRE_ERROR_NOMATCH"},
+        {PCRE_ERROR_NULL, "PCRE_ERROR_NULL"},
+        {PCRE_ERROR_BADOPTION, "PCRE_ERROR_BADOPTION"},
+        {PCRE_ERROR_BADMAGIC, "PCRE_ERROR_BADMAGIC"},
+#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
+        {PCRE_ERROR_UNKNOWN_OPCODE, "PCRE_ERROR_UNKNOWN_OPCODE"},
+#else
+        {PCRE_ERROR_UNKNOWN_NODE, "PCRE_ERROR_UNKNOWN_NODE"},
+#endif
+        {PCRE_ERROR_NOMEMORY, "PCRE_ERROR_NOMEMORY"},
+        {PCRE_ERROR_NOSUBSTRING, "PCRE_ERROR_NOSUBSTRING"},
+        {PCRE_ERROR_MATCHLIMIT, "PCRE_ERROR_MATCHLIMIT"},
+        {PCRE_ERROR_CALLOUT, "PCRE_ERROR_CALLOUT"},
+        {PCRE_ERROR_BADUTF8, "PCRE_ERROR_BADUTF8"},
+        {PCRE_ERROR_BADUTF8_OFFSET, "PCRE_ERROR_BADUTF8_OFFSET"},
+        {PCRE_ERROR_PARTIAL, "PCRE_ERROR_PARTIAL"},
+        {PCRE_ERROR_BADPARTIAL, "PCRE_ERROR_BADPARTIAL"},
+        {PCRE_ERROR_INTERNAL, "PCRE_ERROR_INTERNAL"},
+        {PCRE_ERROR_BADCOUNT, "PCRE_ERROR_BADCOUNT"},
+#if defined(PCRE_ERROR_RECURSIONLIMIT)
+        {PCRE_ERROR_RECURSIONLIMIT, "PCRE_ERROR_RECURSIONLIMIT"},
+#endif
+        {PCRE_ERROR_DFA_UITEM, "PCRE_ERROR_DFA_UITEM"},
+        {PCRE_ERROR_DFA_UCOND, "PCRE_ERROR_DFA_UCOND"},
+        {PCRE_ERROR_DFA_UMLIMIT, "PCRE_ERROR_DFA_UMLIMIT"},
+        {PCRE_ERROR_DFA_WSSIZE, "PCRE_ERROR_DFA_WSSIZE"},
+        {PCRE_ERROR_DFA_RECURSE, "PCRE_ERROR_DFA_RECURSE"},
+    };
+
+    for (const auto &entry : knownErrors) {
+        if (entry.code == err) {
+            return entry.name;
+        }
+    }
+
+    // Not a code we know by name: report the raw value instead.
+    ostringstream oss;
+    oss << "Unknown PCRE error (value: " << err << ")";
+    return oss.str();
+}
+
+/**
+ * Builds a ground-truth generator.
+ *
+ * \param os stream that echoed matches are written to.
+ * \param expr expression map that compile() looks IDs up in.
+ * \param limit value used for pcre_extra::match_limit in run().
+ * \param limit_recursion value used for pcre_extra::match_limit_recursion in
+ *        run().
+ */
+GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
+                         unsigned long int limit,
+                         unsigned long int limit_recursion)
+    : out(os),
+      m_expr(expr),
+      matchLimit(limit),
+      matchLimitRecursion(limit_recursion) {
+}
+
+/** Installs pcreCallOut() as PCRE's global callout function. */
+void GroundTruth::global_prep() {
+    // Matches are collected through PCRE callouts, so hook ours in.
+    pcre_callout = pcreCallOut;
+}
+
+/**
+ * Rewrites \p re in place so that a callout fires after the whole pattern
+ * has matched: "<verbs>(?:<pattern>\E)(?C)".
+ *
+ * Leading control verbs such as "(*UTF8)" or "(*UTF8)(*UCP)" must stay at the
+ * very front, so the non-capturing group is opened after them. If the control
+ * verb mini-parser throws a ParseError, the group is opened at offset 0.
+ */
+static
+void addCallout(string &re) {
+    size_t groupStart = 0;
+    try {
+        const char *const begin = re.c_str();
+        const char *const end = begin + re.size();
+        ue2::ParseMode mode;
+        const char *afterVerbs = ue2::read_control_verbs(begin, end, 0, mode);
+        groupStart = afterVerbs - begin;
+    } catch (const ue2::ParseError &) {
+        // Leave groupStart at zero and carry on.
+    }
+    assert(groupStart <= re.length());
+
+    re.insert(groupStart, "(?:");
+
+    // The \E closes any \Q quoted block left open by the pattern; pcre
+    // ignores it when there is nothing to close.
+    re += "\\E)(?C)";
+}
+
+unique_ptr<CompiledPcre>
+GroundTruth::compile(unsigned id, bool no_callouts) {
+    bool highlander = false;
+    bool prefilter = false;
+    bool som = false;
+
+    // we can still match approximate matching patterns with PCRE if edit
+    // distance 0 is requested
+    if (force_edit_distance && edit_distance) {
+        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
+    }
+
+    ExpressionMap::const_iterator i = m_expr.find(id);
+    if (i == m_expr.end()) {
+        throw PcreCompileFailure("ID not found in expression map.");
+    }
+
+    string re(i->second);
+    unsigned flags;
+    hs_expr_ext ext;
+
+    // Decode the flags
+    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
+        throw PcreCompileFailure("Unable to decode flags.");
+    }
+
+    // filter out flags not supported by PCRE
+    u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
+                     HS_EXT_FLAG_MIN_LENGTH;
+    if (ext.flags & ~supported) {
+        // edit distance is a known unsupported flag, so just throw a soft error
+        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
+            throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
+        }
+        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
+            throw SoftPcreCompileFailure(
+                "Hamming distance not supported by PCRE.");
+        }
+        throw PcreCompileFailure("Unsupported extended flags.");
+    }
+
+    // SOM flags might be set globally.
+    som |= !!somFlags;
+
+    // For traditional Hyperscan, add global callout to pattern.
+    if (!no_callouts) {
+        addCallout(re);
+    }
+
+    // Compile the pattern
+    const char *errptr = nullptr;
+    int errloc = 0;
+    int errcode = 0;
+
+    unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
+    compiled->utf8 = flags & PCRE_UTF8;
+    compiled->highlander = highlander;
+    compiled->prefilter = prefilter;
+    compiled->som = som;
+    compiled->min_offset = ext.min_offset;
+    compiled->max_offset = ext.max_offset;
+    compiled->min_length = ext.min_length;
+    compiled->expression = i->second; // original PCRE
+    flags |= PCRE_NO_AUTO_POSSESS;
+
+    compiled->bytecode =
+        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);
+
+    if (!compiled->bytecode || errptr) {
+        assert(errcode);
+        ostringstream oss;
+        oss << "Failed to compile expression '" << re << '\'';
+        oss << " (" << errptr << " at " << errloc << ").";
+        if (errcode == 20) { // "regular expression is too large"
+            throw SoftPcreCompileFailure(oss.str());
+        } else if (errcode == 25) { // "lookbehind assertion is not fixed length"
+            throw SoftPcreCompileFailure(oss.str());
+        } else {
+            throw PcreCompileFailure(oss.str());
+        }
+    }
+
+    // Study the pattern
+    shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
+                                 free);
+    if (errptr) {
+        ostringstream oss;
+        oss << "Error studying pattern (" << errptr << ").";
+        throw PcreCompileFailure(oss.str());
+    }
+
+    int infoRes =
+        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
+                      &compiled->captureCount);
+    if (infoRes < PCRE_ERROR_NOMATCH) {
+        ostringstream oss;
+        oss << "Error determining number of capturing subpatterns ("
+            << pcreErrStr(infoRes) << ").";
+        throw PcreCompileFailure(oss.str());
+    }
+
+    return compiled;
+}
+
+/**
+ * Keeps only the first match (in set iteration order) for each end offset
+ * and erases the rest.
+ *
+ * NOTE(review): this yields the leftmost start-of-match only if MatchResult
+ * orders by ascending 'from' first - confirm against its operator<.
+ */
+static
+void filterLeftmostSom(ResultSet &rs) {
+    // Zero or one match cannot contain duplicates.
+    if (rs.matches.size() <= 1) {
+        return;
+    }
+
+    set<u64a> endOffsetsSeen;
+    for (auto cur = rs.matches.begin(); cur != rs.matches.end();) {
+        const bool firstWithThisEnd = endOffsetsSeen.insert(cur->to).second;
+        if (firstWithThisEnd) {
+            ++cur;
+        } else {
+            // Same end offset as an earlier entry: drop it.
+            cur = rs.matches.erase(cur);
+        }
+    }
+}
+
+/**
+ * Erases every match that violates the extended parameters of \p compiled:
+ * its end offset must lie within [min_offset, max_offset] and its length
+ * (to - from) must be at least min_length.
+ */
+static
+void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
+    for (auto cur = rs.matches.begin(); cur != rs.matches.end();) {
+        const unsigned int from = cur->from;
+        const unsigned int to = cur->to;
+        const unsigned int len = to - from;
+
+        const bool withinOffsets =
+            to >= compiled.min_offset && to <= compiled.max_offset;
+        const bool longEnough = len >= compiled.min_length;
+
+        if (withinOffsets && longEnough) {
+            ++cur;
+        } else {
+            cur = rs.matches.erase(cur);
+        }
+    }
+}
+
+/**
+ * Scans \p buffer with a single pcre_exec() call, allowing for the global
+ * corpus prefix and suffix (g_corpora_prefix / g_corpora_suffix).
+ *
+ * The scan starts at most 50 bytes before the end of the prefix and stops
+ * two bytes into the suffix; PCRE_NOTEOL is passed whenever a suffix is
+ * present. When a prefix is configured, the recorded matches are reduced to
+ * those ending between the end of the prefix and the start of the suffix,
+ * and their offsets are rebased by the prefix length.
+ *
+ * NOTE(review): with a suffix but no prefix the recorded matches are not
+ * filtered, so a match ending in the first two suffix bytes is kept - confirm
+ * that this is intended.
+ *
+ * \return the pcre_exec() result code.
+ */
+static
+int scanBasic(const CompiledPcre &compiled, const string &buffer,
+              const pcre_extra &extra, vector<int> &ovector,
+              CalloutContext &ctx) {
+    const size_t prefixLen = g_corpora_prefix.size();
+    const size_t suffixLen = g_corpora_suffix.size();
+
+    // Only the tail of the prefix is scanned.
+    const size_t startAt = prefixLen > 50 ? prefixLen - 50 : 0;
+
+    // Hide everything but the first two bytes of the suffix from PCRE.
+    const size_t scanLen =
+        suffixLen > 2 ? buffer.size() - (suffixLen - 2) : buffer.size();
+
+    const int options = suffixLen ? PCRE_NOTEOL : 0;
+    const int rv = pcre_exec(compiled.bytecode, &extra, buffer.c_str(),
+                             scanLen, startAt, options, &ovector[0],
+                             ovector.size());
+
+    if (prefixLen == 0) {
+        return rv;
+    }
+
+    // Keep matches that end within the original corpus and make their
+    // offsets relative to it.
+    const size_t corpusEnd = buffer.size() - suffixLen;
+    PcreMatchSet rebased;
+    for (const auto &m : ctx.matches) {
+        const unsigned rawTo = m.second;
+        if (rawTo < prefixLen || rawTo > corpusEnd) {
+            continue;
+        }
+        const unsigned from = m.first < prefixLen ? 0 : m.first - prefixLen;
+        const unsigned to = rawTo - prefixLen;
+        rebased.insert(make_pair(from, to));
+    }
+    ctx.matches.swap(rebased);
+
+    return rv;
+}
+
+/**
+ * Scans \p buffer behind a preamble of NUL bytes whose length is
+ * g_streamOffset capped at 100.
+ *
+ * The preamble is scanned on its own first; any match it produces is removed
+ * from the matches of the full scan. The recorded offsets still include the
+ * preamble when this function returns.
+ *
+ * \return the result code of the last pcre_exec() call made; if the preamble
+ *         scan fails, the full scan is skipped.
+ */
+static
+int scanOffset(const CompiledPcre &compiled, const string &buffer,
+               const pcre_extra &extra, vector<int> &ovector,
+               CalloutContext &ctx) {
+    const size_t preambleLen = MIN(100, g_streamOffset);
+    assert(preambleLen > 0);
+
+    string padded(preambleLen, '\0');
+    padded += buffer;
+
+    int *const ovec = &ovector[0];
+    const int ovecLen = ovector.size();
+
+    // Pass one: the preamble only. PCRE_NOTEOL stops end-anchors in the
+    // expression from matching at the end of the preamble.
+    int rv = pcre_exec(compiled.bytecode, &extra, padded.c_str(), preambleLen,
+                       0, PCRE_NOTEOL, ovec, ovecLen);
+    if (rv < PCRE_ERROR_NOMATCH) {
+        return rv;
+    }
+
+    // Set the preamble's matches aside and start the real scan from empty.
+    PcreMatchSet preambleMatches;
+    preambleMatches.swap(ctx.matches);
+
+    // Pass two: preamble plus buffer.
+    rv = pcre_exec(compiled.bytecode, &extra, padded.c_str(), padded.size(),
+                   0, 0, ovec, ovecLen);
+    if (rv < PCRE_ERROR_NOMATCH) {
+        return rv;
+    }
+
+    // Discard whatever was due entirely to the preamble.
+    for (const auto &m : preambleMatches) {
+        ctx.matches.erase(m);
+    }
+
+    return rv;
+}
+
+/**
+ * Scans \p buffer with \p compiled and stores the resulting matches in
+ * \p rs.
+ *
+ * Matches are collected through the PCRE callout, shifted back by the stream
+ * offset when one is in use, reduced to one match per end offset for SOM
+ * patterns, and filtered by the pattern's extended parameters. For non-SOM
+ * patterns the start offsets are replaced with zero.
+ *
+ * \param compiled pattern produced by compile().
+ * \param buffer data to scan.
+ * \param rs receives the matches.
+ * \param error receives the PCRE error name on failure.
+ * \return false if PCRE reported an error other than "no match".
+ */
+bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
+                      const string &buffer, ResultSet &rs, string &error) {
+    CalloutContext ctx(out);
+
+    // Value-initialise so that fields not selected through 'flags' never hold
+    // indeterminate values.
+    pcre_extra extra = pcre_extra();
+
+    // Switch on callouts.
+    extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
+    extra.callout_data = &ctx;
+
+    // Set the match_limit (in order to bound execution time on very complex
+    // patterns)
+    extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
+    extra.match_limit = matchLimit;
+    extra.match_limit_recursion = matchLimitRecursion;
+
+    // Note: PCRE_NO_START_OPTIMIZE is deliberately not OR-ed into extra.flags.
+    // It is a bit for the 'options' argument of pcre_compile()/pcre_exec(),
+    // not one of the PCRE_EXTRA_* flags, so setting it here has no effect.
+
+    // Ensure there's enough room in the ovector for the capture groups in this
+    // pattern.
+    int ovecsize = (compiled.captureCount + 1) * 3;
+    ovector.resize(ovecsize);
+
+    int ret;
+    switch (colliderMode) {
+    case MODE_BLOCK:
+    case MODE_STREAMING:
+    case MODE_VECTORED:
+        if (g_streamOffset) {
+            ret = scanOffset(compiled, buffer, extra, ovector, ctx);
+        } else {
+            ret = scanBasic(compiled, buffer, extra, ovector, ctx);
+        }
+        break;
+    default:
+        assert(0);
+        ret = PCRE_ERROR_NULL;
+        break;
+    }
+
+    if (ret < PCRE_ERROR_NOMATCH) {
+        error = pcreErrStr(ret);
+        return false;
+    }
+
+    // Stream offset imposed by the offset test (capped at 100), or zero when
+    // no offset is in use. Computed once rather than per match.
+    const unsigned long long streamShift =
+        g_streamOffset ? min(100ull, g_streamOffset) : 0;
+
+    // Move matches into a ResultSet.
+    for (const auto &m : ctx.matches) {
+        unsigned long long from = m.first;
+        unsigned long long to = m.second;
+
+        if (streamShift) {
+            // Subtract the stream offset; a start offset that lies inside
+            // the offset region is clamped to zero.
+            assert(to >= streamShift);
+            from -= min(streamShift, from);
+            to -= streamShift;
+        }
+
+        rs.addMatch(from, to);
+    }
+
+    // If we have no matches, there's no further work to do.
+    if (rs.matches.empty()) {
+        return true;
+    }
+
+    if (compiled.som) {
+        filterLeftmostSom(rs);
+    }
+
+    filterExtParams(rs, compiled);
+
+    // If we haven't been asked for SOM, strip the from offsets.
+    if (!compiled.som) {
+        set<MatchResult> endonly;
+        for (const auto &m : rs.matches) {
+            endonly.insert(MatchResult(0, m.to));
+        }
+        rs.matches.swap(endonly);
+    }
+
+    return true;
+}