vectorscan/tools/hscollider/GroundTruth.cpp

/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include "common.h"
#include "ExpressionParser.h"
#include "expressions.h"
#include "GroundTruth.h"
#include "pcre_util.h"

#include "hs_compile.h" // for hs_expr_ext
#include "ue2common.h"
#include "parser/control_verbs.h"
#include "parser/Parser.h"
#include "parser/parse_error.h"
#include "util/make_unique.h"
#include "util/unicode_def.h"
#include "util/unordered.h"

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>

#include <pcre.h>

/* -X, -Y support
 * as PCRE performance is `non-linear' and these options add a large amount of
 * scanning, the following short cuts are used:
 * 1: the suffix is not scanned - we are more interested in the matches from
 *    the original corpora.
 * 2: only the last 50 bytes of the prefix is scanned. This may lead to some
 *    minor correctness issues for a few patterns.
 */

using namespace std;
using namespace ue2;

// We store matches in a hash table as we're likely to see lots of them. These
// are moved into a ResultSet at the end.
using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;

namespace {
struct CalloutContext {
    explicit CalloutContext(ostream &os) : out(os) {}
    ostream &out;
    PcreMatchSet matches;
};
}

static
int pcreCallOut(pcre_callout_block *block) {
    assert(block);
    assert(block->callout_data);
    CalloutContext *ctx = static_cast<CalloutContext *>(block->callout_data);

    if (echo_matches) {
        ctx->out << "PCRE Match @ (" << block->start_match << ","
                 << block->current_position << ")" << endl;
    }

    unsigned int from = block->start_match;
    unsigned int to = block->current_position;
    assert(from <= to);

    ctx->matches.insert(make_pair(from, to));
    return 1;
}

static
bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
                    bool *prefilter, bool *som, hs_expr_ext *ext) {
    string regex;
    unsigned int hs_flags = 0;
    if (!readExpression(expr, regex, &hs_flags, ext)) {
        return false;
    }

    expr.swap(regex);

    if (!getPcreFlags(hs_flags, flags, highlander, prefilter, som)) {
        return false;
    }

    if (force_utf8) {
        *flags |= PCRE_UTF8;
    }

    if (force_prefilter) {
        *prefilter = true;
    }

    return true;
}

static
string pcreErrStr(int err) {
    switch (err) {
        case PCRE_ERROR_NOMATCH:
            return "PCRE_ERROR_NOMATCH";
        case PCRE_ERROR_NULL:
            return "PCRE_ERROR_NULL";
        case PCRE_ERROR_BADOPTION:
            return "PCRE_ERROR_BADOPTION";
        case PCRE_ERROR_BADMAGIC:
            return "PCRE_ERROR_BADMAGIC";
#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
        case PCRE_ERROR_UNKNOWN_OPCODE:
            return "PCRE_ERROR_UNKNOWN_OPCODE";
#else
         case PCRE_ERROR_UNKNOWN_NODE:
             return "PCRE_ERROR_UNKNOWN_NODE";
#endif
        case PCRE_ERROR_NOMEMORY:
            return "PCRE_ERROR_NOMEMORY";
        case PCRE_ERROR_NOSUBSTRING:
            return "PCRE_ERROR_NOSUBSTRING";
        case PCRE_ERROR_MATCHLIMIT:
            return "PCRE_ERROR_MATCHLIMIT";
        case PCRE_ERROR_CALLOUT:
            return "PCRE_ERROR_CALLOUT";
        case PCRE_ERROR_BADUTF8:
            return "PCRE_ERROR_BADUTF8";
        case PCRE_ERROR_BADUTF8_OFFSET:
            return "PCRE_ERROR_BADUTF8_OFFSET";
        case PCRE_ERROR_PARTIAL:
            return "PCRE_ERROR_PARTIAL";
        case PCRE_ERROR_BADPARTIAL:
            return "PCRE_ERROR_BADPARTIAL";
        case PCRE_ERROR_INTERNAL:
            return "PCRE_ERROR_INTERNAL";
        case PCRE_ERROR_BADCOUNT:
            return "PCRE_ERROR_BADCOUNT";
#if defined(PCRE_ERROR_RECURSIONLIMIT)
        case PCRE_ERROR_RECURSIONLIMIT:
            return "PCRE_ERROR_RECURSIONLIMIT";
#endif
        case PCRE_ERROR_DFA_UITEM:
            return "PCRE_ERROR_DFA_UITEM";
        case PCRE_ERROR_DFA_UCOND:
            return "PCRE_ERROR_DFA_UCOND";
        case PCRE_ERROR_DFA_UMLIMIT:
            return "PCRE_ERROR_DFA_UMLIMIT";
        case PCRE_ERROR_DFA_WSSIZE:
            return "PCRE_ERROR_DFA_WSSIZE";
        case PCRE_ERROR_DFA_RECURSE:
            return "PCRE_ERROR_DFA_RECURSE";
        default:
            {
                ostringstream oss;
                oss << "Unknown PCRE error (value: " << err << ")";
                return oss.str();
            }
    }
}

GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
                         unsigned long int limit,
                         unsigned long int limit_recursion)
    : out(os), m_expr(expr), matchLimit(limit),
      matchLimitRecursion(limit_recursion) {}

void GroundTruth::global_prep() {
    // We're using pcre callouts
    pcre_callout = &pcreCallOut;
}

static
void addCallout(string &re) {
    // If the string begins with "(*UTF8)" or "(*UTF8)(*UCP)", we want to keep
    // it at the front. We reuse the control verbs mini-parser for this.
    size_t startpos = 0;
    try {
        ue2::ParseMode mode;
        const char *ptr = ue2::read_control_verbs(
            re.c_str(), re.c_str() + re.size(), 0, mode);
        startpos = ptr - re.c_str();
    } catch (const ue2::ParseError &err) {
        // fall through
    }
    assert(startpos <= re.length());
    re.insert(startpos, "(?:");
    // We include a \E to close any open \Q quoted block. If there isn't
    // one, pcre will ignore the \E.
    re.append("\\E)(?C)");
}

unique_ptr<CompiledPcre>
GroundTruth::compile(unsigned id, bool no_callouts) {
    bool highlander = false;
    bool prefilter = false;
    bool som = false;

    // we can still match approximate matching patterns with PCRE if edit
    // distance 0 is requested
    if (force_edit_distance && edit_distance) {
        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
    }

    ExpressionMap::const_iterator i = m_expr.find(id);
    if (i == m_expr.end()) {
        throw PcreCompileFailure("ID not found in expression map.");
    }

    string re(i->second);
    unsigned flags;
    hs_expr_ext ext;

    // Decode the flags
    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som, &ext)) {
        throw PcreCompileFailure("Unable to decode flags.");
    }

    // filter out flags not supported by PCRE
    u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
                     HS_EXT_FLAG_MIN_LENGTH;
    if (ext.flags & ~supported) {
        // edit distance is a known unsupported flag, so just throw a soft error
        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
            throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
        }
        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
            throw SoftPcreCompileFailure(
                "Hamming distance not supported by PCRE.");
        }
        throw PcreCompileFailure("Unsupported extended flags.");
    }

    // SOM flags might be set globally.
    som |= !!somFlags;

    // For traditional Hyperscan, add global callout to pattern.
    if (!no_callouts) {
        addCallout(re);
    }

    // Compile the pattern
    const char *errptr = nullptr;
    int errloc = 0;
    int errcode = 0;

    unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
    compiled->utf8 = flags & PCRE_UTF8;
    compiled->highlander = highlander;
    compiled->prefilter = prefilter;
    compiled->som = som;
    compiled->min_offset = ext.min_offset;
    compiled->max_offset = ext.max_offset;
    compiled->min_length = ext.min_length;
    compiled->expression = i->second; // original PCRE
    flags |= PCRE_NO_AUTO_POSSESS;

    compiled->bytecode =
        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);

    if (!compiled->bytecode || errptr) {
        assert(errcode);
        ostringstream oss;
        oss << "Failed to compile expression '" << re << '\'';
        oss << " (" << errptr << " at " << errloc << ").";
        if (errcode == 20) { // "regular expression is too large"
            throw SoftPcreCompileFailure(oss.str());
        } else if (errcode == 25) { // "lookbehind assertion is not fixed length"
            throw SoftPcreCompileFailure(oss.str());
        } else {
            throw PcreCompileFailure(oss.str());
        }
    }

    // Study the pattern
    shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
                                 free);
    if (errptr) {
        ostringstream oss;
        oss << "Error studying pattern (" << errptr << ").";
        throw PcreCompileFailure(oss.str());
    }

    int infoRes =
        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
                      &compiled->captureCount);
    if (infoRes < PCRE_ERROR_NOMATCH) {
        ostringstream oss;
        oss << "Error determining number of capturing subpatterns ("
            << pcreErrStr(infoRes) << ").";
        throw PcreCompileFailure(oss.str());
    }

    return compiled;
}

static
void filterLeftmostSom(ResultSet &rs) {
    if (rs.matches.size() <= 1) {
        return;
    }

    set<u64a> seen; // End offsets.
    set<MatchResult>::iterator it = rs.matches.begin();
    while (it != rs.matches.end()) {
        if (seen.insert(it->to).second) {
            ++it; // First time we've seen this end-offset.
        } else {
            rs.matches.erase(it++); // Dupe with a "righter" SOM.
        }
    }
}

static
void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
    set<MatchResult>::iterator it = rs.matches.begin();
    while (it != rs.matches.end()) {
        unsigned int from = it->from, to = it->to;
        unsigned int len = to - from;
        if (to < compiled.min_offset || to > compiled.max_offset ||
                len < compiled.min_length) {
            rs.matches.erase(it++);
        } else {
            ++it;
        }
    }
}

static
int scanBasic(const CompiledPcre &compiled, const string &buffer,
              const pcre_extra &extra, vector<int> &ovector,
              CalloutContext &ctx) {
    const size_t prefix_len = g_corpora_prefix.size();
    const size_t suffix_len = g_corpora_suffix.size();

    size_t begin_offset = prefix_len - MIN(50, prefix_len);
    size_t real_len = buffer.size();

    if (suffix_len > 2) {
        real_len -= suffix_len - 2;
    }

    int flags = suffix_len ? PCRE_NOTEOL : 0;
    int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
                        begin_offset, flags, &ovector[0], ovector.size());

    if (!g_corpora_prefix.empty()) {
        PcreMatchSet tmp;
        tmp.swap(ctx.matches);

        for (const auto &m : tmp) {
            unsigned from = m.first;
            unsigned to = m.second;
            if (to >= prefix_len && to <= buffer.size() - suffix_len) {
                from = from < prefix_len ? 0 : from - prefix_len;
                to -= prefix_len;
                ctx.matches.insert(make_pair(from, to));
            }
        }
    }

    return ret;
}

static
int scanOffset(const CompiledPcre &compiled, const string &buffer,
               const pcre_extra &extra, vector<int> &ovector,
               CalloutContext &ctx) {
    size_t offset = MIN(100, g_streamOffset);
    assert(offset > 0);

    const string buf(string(offset, '\0') + buffer);

    // First, scan our preamble so that we can discard any matches therein
    // after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
    // our expression don't match at the end of the preamble.
    int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
                        PCRE_NOTEOL, &ovector[0], ovector.size());
    if (ret < PCRE_ERROR_NOMATCH) {
        return ret;
    }

    PcreMatchSet pre_matches;
    pre_matches.swap(ctx.matches);

    // Real scan.
    ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
                    &ovector[0], ovector.size());
    if (ret < PCRE_ERROR_NOMATCH) {
        return ret;
    }

    // Erase any matches due entirely to the preamble.
    for (const auto &m : pre_matches) {
        ctx.matches.erase(m);
    }

    return ret;
}

bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
                      const string &buffer, ResultSet &rs, string &error) {
    CalloutContext ctx(out);

    pcre_extra extra;
    extra.flags = 0;

    // Switch on callouts.
    extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
    extra.callout_data = &ctx;

    // Set the match_limit (in order to bound execution time on very complex
    // patterns)
    extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
    extra.match_limit = matchLimit;
    extra.match_limit_recursion = matchLimitRecursion;

#ifdef PCRE_NO_START_OPTIMIZE
    // Switch off optimizations that may result in callouts not occurring.
    extra.flags |= PCRE_NO_START_OPTIMIZE;
#endif

    // Ensure there's enough room in the ovector for the capture groups in this
    // pattern.
    int ovecsize = (compiled.captureCount + 1) * 3;
    ovector.resize(ovecsize);

    int ret;
    switch (colliderMode) {
    case MODE_BLOCK:
    case MODE_STREAMING:
    case MODE_VECTORED:
        if (g_streamOffset) {
            ret = scanOffset(compiled, buffer, extra, ovector, ctx);
        } else {
            ret = scanBasic(compiled, buffer, extra, ovector, ctx);
        }
        break;
    default:
        assert(0);
        ret = PCRE_ERROR_NULL;
        break;
    }

    if (ret < PCRE_ERROR_NOMATCH) {
        error = pcreErrStr(ret);
        return false;
    }

    // Move matches into a ResultSet.
    for (const auto &m : ctx.matches) {
        unsigned long long from = m.first;
        unsigned long long to = m.second;

        if (g_streamOffset) {
            // Subtract stream offset imposed by offset test.
            unsigned long long offset = min(100ull, g_streamOffset);
            assert(to >= offset);
            from -= min(offset, from);
            to -= offset;
        }

        rs.addMatch(from, to);
    }

    // If we have no matches, there's no further work to do.
    if (rs.matches.empty()) {
        return true;
    }

    if (compiled.som) {
        filterLeftmostSom(rs);
    }

    filterExtParams(rs, compiled);

    // If we haven't been asked for SOM, strip the from offsets.
    if (!compiled.som) {
        set<MatchResult> endonly;
        for (const auto &m : rs.matches) {
            endonly.insert(MatchResult(0, m.to));
        }
        rs.matches.swap(endonly);
    }

    return true;
}