vectorscan/tools/hscollider/GroundTruth.cpp

/*
 * Copyright (c) 2015-2019, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _WIN32
#define PCRE_STATIC
#endif
#include "config.h"

#include "common.h"
#include "ExpressionParser.h"
#include "expressions.h"
#include "GroundTruth.h"
#include "pcre_util.h"

#include "hs_compile.h" // for hs_expr_ext
#include "ue2common.h"
#include "parser/control_verbs.h"
#include "parser/Parser.h"
#include "parser/parse_error.h"
#include "util/make_unique.h"
#include "util/string_util.h"
#include "util/unicode_def.h"
#include "util/unordered.h"

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>

#include <pcre.h>

/* -X, -Y support
 * As PCRE performance is non-linear and these options add a large amount of
 * scanning, the following shortcuts are used:
 * 1: the suffix is not scanned (apart from its first two bytes) - we are more
 *    interested in the matches from the original corpora.
 * 2: only the last 50 bytes of the prefix are scanned. This may lead to some
 *    minor correctness issues for a few patterns.
 */

using namespace std;
using namespace ue2;

// We store matches in a hash table as we're likely to see lots of them. These
// are moved into a ResultSet at the end. Each entry is a (start offset, end
// offset) pair, in the form recorded by the PCRE callout.
using PcreMatchSet = ue2::ue2_unordered_set<pair<unsigned, unsigned>>;

namespace {

/** \brief Per-scan state handed to the PCRE callout through callout_data. */
struct CalloutContext {
    ostream &out;         //!< Stream that echoed matches are written to.
    PcreMatchSet matches; //!< (start, end) offset pairs seen by the callout.

    explicit CalloutContext(ostream &os) : out(os), matches() {}
};

} // namespace

/**
 * \brief PCRE callout handler that records the match reaching the callout.
 *
 * The CalloutContext is recovered from \c block->callout_data. The pair
 * (start_match, current_position) is inserted into its match set and, when
 * echo_matches is set, also printed to the context's stream.
 *
 * \param block Callout block supplied by PCRE; must carry a CalloutContext.
 * \return Always 1. Per the PCRE callout documentation a positive return
 *         makes matching fail at this point while backtracking continues,
 *         which lets a single pcre_exec() call visit further matches.
 */
static
int pcreCallOut(pcre_callout_block *block) {
    assert(block);
    assert(block->callout_data);
    CalloutContext &context =
        *static_cast<CalloutContext *>(block->callout_data);

    if (echo_matches) {
        context.out << "PCRE Match @ (" << block->start_match << ","
                    << block->current_position << ")" << endl;
    }

    const unsigned int start = block->start_match;
    const unsigned int end = block->current_position;
    assert(start <= end);

    context.matches.insert(make_pair(start, end));
    return 1;
}

/**
 * \brief Split an expression line into its pattern and PCRE-level settings.
 *
 * On entry \p expr holds the raw expression text; it is parsed with
 * readExpression() and then replaced by the bare pattern. The parsed flags
 * are translated via getPcreFlags(), after which the global force_utf8 and
 * force_prefilter overrides are applied.
 *
 * When use_literal_api is set, flags the pure literal API cannot honour are
 * stripped and the global force_utf8 / force_prefilter switches are cleared
 * (a side effect that persists after this call).
 *
 * \param expr In: raw expression. Out: pattern text (swapped in before
 *             getPcreFlags() runs, so it is replaced even if that call fails).
 * \param flags Receives the PCRE option bits.
 * \param highlander,prefilter,som,combination,quiet Filled by getPcreFlags().
 * \param ext Receives the extended parameters parsed by readExpression().
 * \return false if parsing or flag translation fails, true otherwise.
 */
static
bool decodeExprPcre(string &expr, unsigned *flags, bool *highlander,
                    bool *prefilter, bool *som, bool *combination,
                    bool *quiet, hs_expr_ext *ext) {
    unsigned int parsed_flags = 0;
    string pattern;
    if (!readExpression(expr, pattern, &parsed_flags, ext)) {
        return false;
    }

    if (use_literal_api) {
        // Drop everything the pure literal API has no notion of.
        const u32 literal_unsupported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY |
                                        HS_FLAG_UTF8 | HS_FLAG_UCP |
                                        HS_FLAG_PREFILTER;
        parsed_flags &= ~literal_unsupported;
        force_utf8 = false;
        force_prefilter = false;
    }

    // Hand the bare pattern back to the caller.
    expr.swap(pattern);

    const bool translated = getPcreFlags(parsed_flags, flags, highlander,
                                         prefilter, som, combination, quiet);
    if (!translated) {
        return false;
    }

    // Global overrides come last so they win over per-expression flags.
    if (force_utf8) {
        *flags |= PCRE_UTF8;
    }
    if (force_prefilter) {
        *prefilter = true;
    }
    return true;
}

/**
 * \brief Map a PCRE error code to the spelling of its PCRE_ERROR_* constant.
 *
 * \param err Error code as returned by a PCRE API call.
 * \return The constant's name, or "Unknown PCRE error (value: N)" for codes
 *         that are not listed here.
 */
static
string pcreErrStr(int err) {
// Expands to `case X: return "X";`. The `#` operator stringifies the argument
// as written, while the case label uses its expanded (numeric) value.
#define PCRE_ERR_NAME_CASE(code) case code: return #code
    switch (err) {
    PCRE_ERR_NAME_CASE(PCRE_ERROR_NOMATCH);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_NULL);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_BADOPTION);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_BADMAGIC);
#if defined(PCRE_ERROR_UNKNOWN_OPCODE)
    PCRE_ERR_NAME_CASE(PCRE_ERROR_UNKNOWN_OPCODE);
#else
    PCRE_ERR_NAME_CASE(PCRE_ERROR_UNKNOWN_NODE);
#endif
    PCRE_ERR_NAME_CASE(PCRE_ERROR_NOMEMORY);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_NOSUBSTRING);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_MATCHLIMIT);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_CALLOUT);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_BADUTF8);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_BADUTF8_OFFSET);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_PARTIAL);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_BADPARTIAL);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_INTERNAL);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_BADCOUNT);
#if defined(PCRE_ERROR_RECURSIONLIMIT)
    PCRE_ERR_NAME_CASE(PCRE_ERROR_RECURSIONLIMIT);
#endif
    PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_UITEM);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_UCOND);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_UMLIMIT);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_WSSIZE);
    PCRE_ERR_NAME_CASE(PCRE_ERROR_DFA_RECURSE);
    default:
        break;
    }
#undef PCRE_ERR_NAME_CASE

    // Anything not listed above is reported with its numeric value.
    ostringstream unknown;
    unknown << "Unknown PCRE error (value: " << err << ")";
    return unknown.str();
}

/**
 * \brief True for the modes provided by native hyperscan (block, streaming
 * and vectored); false for any other mode value.
 */
static
bool isStandardMode(unsigned int mode) {
    switch (mode) {
    case MODE_BLOCK:
    case MODE_STREAMING:
    case MODE_VECTORED:
        return true;
    default:
        return false;
    }
}

/**
 * \brief Construct a PCRE ground-truth engine.
 *
 * \param os Output stream kept as \c out; run() hands it to the callout
 *           context and to the hybrid scanner for echoing matches.
 * \param expr Expression map kept as \c m_expr; compile() looks IDs up in it.
 * \param limit Kept as \c matchLimit; run() uses it as the PCRE match limit
 *              outside hybrid mode.
 * \param limit_recursion Kept as \c matchLimitRecursion; run() uses it as the
 *                        PCRE recursion limit outside hybrid mode.
 */
GroundTruth::GroundTruth(ostream &os, const ExpressionMap &expr,
                         unsigned long int limit,
                         unsigned long int limit_recursion)
    : out(os), m_expr(expr), matchLimit(limit),
      matchLimitRecursion(limit_recursion) {}

/**
 * \brief One-time global setup: installs the PCRE callout hook when the
 * collider runs in a standard (block/streaming/vectored) mode.
 */
void GroundTruth::global_prep() {
    if (!isStandardMode(colliderMode)) {
        return; // Other modes do not rely on callouts.
    }

    // Standard modes collect their matches through pcre callouts.
    pcre_callout = &pcreCallOut;
}

/**
 * \brief Wrap a pattern as "<verbs>(?:<pattern>\E)(?C)" so that a callout
 * fires whenever the whole pattern has matched.
 *
 * Leading control verbs such as "(*UTF8)" or "(*UTF8)(*UCP)" must remain at
 * the very start of the pattern, so the non-capturing group is opened after
 * them. Their extent is found with the control verbs mini-parser; if that
 * parser throws, the group is opened at offset 0.
 *
 * \param re Pattern to rewrite in place.
 */
static
void addCallout(string &re) {
    size_t verbs_len = 0;
    try {
        ue2::ParseMode mode;
        const char *const begin = re.c_str();
        const char *const after_verbs =
            ue2::read_control_verbs(begin, begin + re.size(), 0, mode);
        verbs_len = after_verbs - begin;
    } catch (const ue2::ParseError &) {
        // Leave verbs_len at zero and carry on.
    }
    assert(verbs_len <= re.length());

    re.insert(verbs_len, "(?:");

    // The \E closes any \Q quoted block that is still open; pcre ignores it
    // when there is none.
    re += "\\E)(?C)";
}

/**
 * \brief Compile expression \p id from the expression map into a CompiledPcre.
 *
 * The expression is decoded into a pattern, PCRE flags and extended
 * parameters. For non-combination expressions in a standard mode the pattern
 * is wrapped with a trailing callout (unless \p no_callouts is set) and then
 * compiled and studied with PCRE. Logical combinations are not compiled with
 * PCRE; they are parsed into the result's \c pl member and returned early.
 *
 * \param id Key of the expression in the expression map.
 * \param no_callouts If true, the callout wrapper is never added.
 * \return The compiled expression.
 * \throws SoftPcreCompileFailure for edit/hamming distance requests and for
 *         PCRE compile error codes 20 and 25.
 * \throws PcreCompileFailure for an unknown ID, undecodable flags, other
 *         unsupported extended flags, makeHex() failure, and any other PCRE
 *         compile, study or fullinfo error.
 */
unique_ptr<CompiledPcre>
GroundTruth::compile(unsigned id, bool no_callouts) {
    bool highlander = false;
    bool prefilter = false;
    bool som = false;
    bool combination = false;
    bool quiet = false;

    // we can still match approximate matching patterns with PCRE if edit
    // distance 0 is requested
    if (force_edit_distance && edit_distance) {
        throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
    }

    ExpressionMap::const_iterator i = m_expr.find(id);
    if (i == m_expr.end()) {
        throw PcreCompileFailure("ID not found in expression map.");
    }

    // Work on a copy; the original text is kept for compiled->expression.
    string re(i->second);
    unsigned flags;
    hs_expr_ext ext;

    // Decode the flags
    if (!decodeExprPcre(re, &flags, &highlander, &prefilter, &som,
                        &combination, &quiet, &ext)) {
        throw PcreCompileFailure("Unable to decode flags.");
    }

    // When hyperscan literal api is on, transfer the regex string into hex.
    if (use_literal_api && !combination) {
        unsigned char *pat
            = reinterpret_cast<unsigned char *>(const_cast<char *>(re.c_str()));
        char *str = makeHex(pat, re.length());
        if (!str) {
            throw PcreCompileFailure("makeHex() malloc failure.");
        }
        re.assign(str);
        // The buffer returned by makeHex() is released with free() here.
        free(str);
    }

    // filter out flags not supported by PCRE
    u64a supported = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET |
                     HS_EXT_FLAG_MIN_LENGTH;
    if (use_literal_api) {
        // The literal API path discards all extended parameters.
        ext.flags &= 0ULL;
        ext.min_offset = 0;
        ext.max_offset = MAX_OFFSET;
        ext.min_length = 0;
        ext.edit_distance = 0;
        ext.hamming_distance = 0;
    }
    if (ext.flags & ~supported) {
        // edit distance is a known unsupported flag, so just throw a soft error
        if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) {
            throw SoftPcreCompileFailure("Edit distance not supported by PCRE.");
        }
        if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
            throw SoftPcreCompileFailure(
                "Hamming distance not supported by PCRE.");
        }
        throw PcreCompileFailure("Unsupported extended flags.");
    }

    // Hybrid mode implies SOM.
    if (colliderMode == MODE_HYBRID) {
        assert(!use_NFA);
        som = true;
    }

    // SOM flags might be set globally.
    som |= !!somFlags;

    // For traditional Hyperscan, add global callout to pattern.
    if (!combination && !no_callouts && isStandardMode(colliderMode)) {
        addCallout(re);
    }

    // Compile the pattern
    const char *errptr = nullptr;
    int errloc = 0;
    int errcode = 0;

    // Record everything the scan and filter stages will need later on.
    unique_ptr<CompiledPcre> compiled = make_unique<CompiledPcre>();
    compiled->utf8 = flags & PCRE_UTF8;
    compiled->highlander = highlander;
    compiled->prefilter = prefilter;
    compiled->som = som;
    compiled->combination = combination;
    compiled->quiet = quiet;
    compiled->min_offset = ext.min_offset;
    compiled->max_offset = ext.max_offset;
    compiled->min_length = ext.min_length;
    compiled->expression = i->second; // original PCRE
    flags |= PCRE_NO_AUTO_POSSESS;

    // Logical combinations have no PCRE bytecode of their own: parse the
    // combination and return without calling pcre_compile2().
    if (compiled->combination) {
        compiled->pl.parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL);
        compiled->pl.logicalKeyRenumber();
        compiled->report = id;
        return compiled;
    }

    compiled->bytecode =
        pcre_compile2(re.c_str(), flags, &errcode, &errptr, &errloc, nullptr);

    if (!compiled->bytecode || errptr) {
        assert(errcode);
        ostringstream oss;
        oss << "Failed to compile expression '" << re << '\'';
        oss << " (" << errptr << " at " << errloc << ").";
        // Codes 20 and 25 are reported as soft failures; anything else is a
        // hard compile failure.
        if (errcode == 20) { // "regular expression is too large"
            throw SoftPcreCompileFailure(oss.str());
        } else if (errcode == 25) { // "lookbehind assertion is not fixed length"
            throw SoftPcreCompileFailure(oss.str());
        } else {
            throw PcreCompileFailure(oss.str());
        }
    }

    // Study the pattern
    // NOTE(review): the study data is released with free(); the PCRE API
    // documentation names pcre_free_study() for this purpose - confirm that
    // plain free() is intended.
    shared_ptr<pcre_extra> extra(pcre_study(compiled->bytecode, 0, &errptr),
                                 free);
    if (errptr) {
        ostringstream oss;
        oss << "Error studying pattern (" << errptr << ").";
        throw PcreCompileFailure(oss.str());
    }

    // The capture count is used by run() to size the ovector.
    int infoRes =
        pcre_fullinfo(compiled->bytecode, extra.get(), PCRE_INFO_CAPTURECOUNT,
                      &compiled->captureCount);
    if (infoRes < PCRE_ERROR_NOMATCH) {
        ostringstream oss;
        oss << "Error determining number of capturing subpatterns ("
            << pcreErrStr(infoRes) << ").";
        throw PcreCompileFailure(oss.str());
    }

    return compiled;
}

/**
 * \brief For each end offset keep only the first match encountered in
 * iteration order and erase the rest.
 *
 * Presumably the ordering of MatchResult places the leftmost start first for
 * a given end, so the survivors are the leftmost-SOM matches - confirm
 * against MatchResult's comparison operator.
 *
 * \param rs Result set to filter in place.
 */
static
void filterLeftmostSom(ResultSet &rs) {
    if (rs.matches.size() < 2) {
        return; // Nothing can be a duplicate.
    }

    set<u64a> ends_seen;
    for (auto it = rs.matches.begin(); it != rs.matches.end();) {
        const bool first_with_this_end = ends_seen.insert(it->to).second;
        if (first_with_this_end) {
            ++it;
        } else {
            // Same end offset as an earlier match: a "righter" SOM, drop it.
            it = rs.matches.erase(it);
        }
    }
}

/**
 * \brief Remove matches that violate the expression's extended parameters.
 *
 * A match is erased when its end offset lies outside
 * [min_offset, max_offset] or when its length (end - start) is shorter than
 * min_length.
 *
 * Offsets are kept at 64-bit width (u64a) for the comparison rather than
 * being narrowed to unsigned int: elsewhere in this file match offsets are
 * handled as u64a / unsigned long long, and narrowing here would silently
 * truncate large offsets before they are compared with the limits.
 *
 * \param rs Result set to filter in place.
 * \param compiled Supplies min_offset, max_offset and min_length.
 */
static
void filterExtParams(ResultSet &rs, const CompiledPcre &compiled) {
    for (auto it = rs.matches.begin(); it != rs.matches.end();) {
        const u64a from = it->from;
        const u64a to = it->to;
        const u64a len = to - from;

        const bool out_of_range =
            to < compiled.min_offset || to > compiled.max_offset;
        const bool too_short = len < compiled.min_length;

        if (out_of_range || too_short) {
            it = rs.matches.erase(it);
        } else {
            ++it;
        }
    }
}

static
int scanBasic(const CompiledPcre &compiled, const string &buffer,
              const pcre_extra &extra, vector<int> &ovector,
              CalloutContext &ctx) {
    const size_t prefix_len = g_corpora_prefix.size();
    const size_t suffix_len = g_corpora_suffix.size();

    size_t begin_offset = prefix_len - MIN(50, prefix_len);
    size_t real_len = buffer.size();

    if (suffix_len > 2) {
        real_len -= suffix_len - 2;
    }

    int flags = suffix_len ? PCRE_NOTEOL : 0;
    int ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), real_len,
                        begin_offset, flags, &ovector[0], ovector.size());

    if (!g_corpora_prefix.empty()) {
        PcreMatchSet tmp;
        tmp.swap(ctx.matches);

        for (const auto &m : tmp) {
            unsigned from = m.first;
            unsigned to = m.second;
            if (to >= prefix_len && to <= buffer.size() - suffix_len) {
                from = from < prefix_len ? 0 : from - prefix_len;
                to -= prefix_len;
                ctx.matches.insert(make_pair(from, to));
            }
        }
    }

    return ret;
}

/**
 * \brief Ask PCRE for the options of the compiled pattern and report whether
 * PCRE_UTF8 is among them.
 *
 * The return value of pcre_fullinfo() is not checked; if the query does not
 * write the options, the zero-initialised value yields false.
 */
static
bool isUtf8(const CompiledPcre &compiled) {
    unsigned long int compile_options = 0;
    pcre_fullinfo(compiled.bytecode, nullptr, PCRE_INFO_OPTIONS,
                  &compile_options);
    return (compile_options & PCRE_UTF8) != 0;
}

/**
 * \brief Turn the first \p ret (start, end) pairs of a PCRE ovector into a
 * CaptureVec.
 *
 * \param ovector Offset vector filled in by pcre_exec().
 * \param ret Positive pcre_exec() return value: the number of pairs to copy.
 * \return The pairs in ovector order, or an empty vector when the global
 *         no_groups switch is set.
 */
static
CaptureVec makeCaptureVec(const vector<int> &ovector, int ret) {
    assert(ret > 0);

    CaptureVec groups;
    if (!no_groups) {
        groups.reserve(ret * 2);
        for (int g = 0; g < ret; ++g) {
            const int slot = g * 2;
            groups.push_back(make_pair(ovector[slot], ovector[slot + 1]));
        }
    }
    return groups;
}

/**
 * \brief Hybrid-mode scan: call pcre_exec() repeatedly, adding each match
 * (with its capture groups) to \p rs and restarting after it.
 *
 * Scanning stops when PCRE reports no match or an error, after the first
 * match of a highlander pattern, or once the next start offset would lie
 * beyond the end of the buffer.
 *
 * \param out Stream matches are echoed to when echo_matches is set.
 * \return The last pcre_exec() return value: PCRE_ERROR_NOMATCH or an error
 *         code if the loop ended on a failed call, otherwise the positive
 *         result of the final successful match.
 */
static
int scanHybrid(const CompiledPcre &compiled, const string &buffer,
               const pcre_extra &extra, vector<int> &ovector,
               ResultSet &rs, ostream &out) {
    const int len = static_cast<int>(buffer.length());
    const bool utf8 = isUtf8(compiled);
    const int exec_options = 0;

    int ret = 0;
    int startoffset = 0;
    for (;;) {
        ret = pcre_exec(compiled.bytecode, &extra, buffer.c_str(), len,
                        startoffset, exec_options, &ovector[0],
                        ovector.size());
        if (ret <= PCRE_ERROR_NOMATCH) {
            return ret;
        }

        const int from = ovector.at(0);
        const int to = ovector.at(1);
        rs.addMatch(from, to, makeCaptureVec(ovector, ret));

        if (echo_matches) {
            out << "PCRE Match @ (" << from << "," << to << ")" << endl;
        }

        // A highlander pattern reports a single match only.
        if (compiled.highlander) {
            break;
        }

        // Resume at the end of this match. A vacuous match must be stepped
        // over so that we always make progress; in UTF-8 mode that step has
        // to land on the start of the next codepoint.
        startoffset = to;
        if (from == to) {
            ++startoffset;
            if (utf8) {
                while (startoffset < len &&
                       ((buffer[startoffset] & 0xc0) == UTF_CONT_BYTE_HEADER)) {
                    ++startoffset;
                }
            }
        }

        if (startoffset > len) {
            break;
        }
    }

    return ret;
}

static
int scanOffset(const CompiledPcre &compiled, const string &buffer,
               const pcre_extra &extra, vector<int> &ovector,
               CalloutContext &ctx) {
    size_t offset = MIN(100, g_streamOffset);
    assert(offset > 0);

    const string buf(string(offset, '\0') + buffer);

    // First, scan our preamble so that we can discard any matches therein
    // after the real scan, later. We use PCRE_NOTEOL so that end-anchors in
    // our expression don't match at the end of the preamble.
    int ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), offset, 0,
                        PCRE_NOTEOL, &ovector[0], ovector.size());
    if (ret < PCRE_ERROR_NOMATCH) {
        return ret;
    }

    PcreMatchSet pre_matches;
    pre_matches.swap(ctx.matches);

    // Real scan.
    ret = pcre_exec(compiled.bytecode, &extra, buf.c_str(), buf.size(), 0, 0,
                    &ovector[0], ovector.size());
    if (ret < PCRE_ERROR_NOMATCH) {
        return ret;
    }

    // Erase any matches due entirely to the preamble.
    for (const auto &m : pre_matches) {
        ctx.matches.erase(m);
    }

    return ret;
}

/**
 * \brief Evaluate the logical operations numbered \p start to \p result and
 * return the value computed for \p result.
 *
 * \param lv Truth values indexed by logical key. The operand entries are
 *           read and each evaluated operation's entry is written.
 * \param comb Logical operations; the operation with id \c i is stored at
 *             index \c i - lkeyCount.
 * \param lkeyCount Number of logical keys that precede the operations.
 * \param start Id of the first operation to evaluate.
 * \param result Id of the last operation to evaluate.
 * \return \c lv[result]: non-zero if the combination is satisfied.
 */
static
char isLogicalCombination(vector<char> &lv, const vector<LogicalOp> &comb,
                          size_t lkeyCount, unsigned start, unsigned result) {
    assert(start <= result);
    for (unsigned key = start; key <= result; key++) {
        const LogicalOp &node = comb[key - lkeyCount];
        assert(key == node.id);
        if (node.op == LOGICAL_OP_NOT) {
            lv[node.id] = !lv[node.ro];
        } else if (node.op == LOGICAL_OP_AND) {
            lv[node.id] = lv[node.lo] & lv[node.ro]; // &&
        } else if (node.op == LOGICAL_OP_OR) {
            lv[node.id] = lv[node.lo] | lv[node.ro]; // ||
        } else {
            assert(0);
        }
    }
    return lv[result];
}

/**
 * \brief Evaluate the combination as isLogicalCombination() does, but return
 * 0 as soon as an operation has a directly referenced sub-expression (an
 * operand id below \p lkeyCount) whose value in \p lv is set.
 *
 * \param lv Truth values indexed by logical key. The operand entries are
 *           read and each evaluated operation's entry is written.
 * \param comb Logical operations; the operation with id \c i is stored at
 *             index \c i - lkeyCount.
 * \param lkeyCount Number of logical keys that precede the operations.
 * \param start Id of the first operation to evaluate.
 * \param result Id of the last operation to evaluate.
 * \return 0 if a referenced sub-expression is set, otherwise \c lv[result].
 */
static
char isPurelyNegativeMatch(vector<char> &lv, const vector<LogicalOp> &comb,
                           size_t lkeyCount, unsigned start, unsigned result) {
    assert(start <= result);

    // True if `key` names a sub-expression (not an operation) that is set.
    const auto matchedLeaf = [&lv, lkeyCount](size_t key) {
        return (key < lkeyCount) && lv[key];
    };

    for (unsigned key = start; key <= result; key++) {
        const LogicalOp &node = comb[key - lkeyCount];
        assert(key == node.id);
        switch (node.op) {
        case LOGICAL_OP_NOT:
            if (matchedLeaf(node.ro)) {
                return 0; // sub-expression not negative
            }
            lv[node.id] = !lv[node.ro];
            break;
        case LOGICAL_OP_AND:
        case LOGICAL_OP_OR:
            if (matchedLeaf(node.lo) || matchedLeaf(node.ro)) {
                return 0; // sub-expression not negative
            }
            if (node.op == LOGICAL_OP_AND) {
                lv[node.id] = lv[node.lo] & lv[node.ro]; // &&
            } else {
                lv[node.id] = lv[node.lo] | lv[node.ro]; // ||
            }
            break;
        default:
            assert(0);
            break;
        }
    }
    return lv[result];
}

/**
 * \brief Scan \p buffer with a compiled expression and collect the expected
 * matches in \p rs.
 *
 * Quiet expressions return immediately with no matches. Logical combinations
 * are evaluated by compiling and running each referenced sub-expression and
 * combining their matches per end offset. All other expressions are scanned
 * with PCRE in the current collider mode, after which the matches are
 * adjusted for the stream offset, filtered (leftmost SOM, extended
 * parameters) and, when SOM was not requested, reduced to end offsets only.
 *
 * \param compiled Expression produced by compile(). The first (unnamed)
 *                 parameter is not used.
 * \param buffer Data to scan.
 * \param rs Receives the matches.
 * \param error Set to the PCRE error name when the PCRE scan fails.
 * \return true on success; false if a sub-expression fails to compile or
 *         run, or if PCRE reports an error.
 */
bool GroundTruth::run(unsigned, const CompiledPcre &compiled,
                      const string &buffer, ResultSet &rs, string &error) {
    // Quiet expressions report nothing.
    if (compiled.quiet) {
        return true;
    }

    if (compiled.combination) {
        // Compile and run sub-expressions, store match results.
        map<unsigned long long, set<MatchResult>> offset_to_matches;
        map<unsigned long long, set<unsigned>> offset_to_lkeys;
        set<unsigned> sub_exps;
        const auto &m_lkey = compiled.pl.getLkeyMap();
        for (const auto &it_lkey : m_lkey) {
            // sub_exps guards against processing the same key twice.
            if (sub_exps.find(it_lkey.first) == sub_exps.end()) {
                sub_exps.emplace(it_lkey.first);
                ResultSet sub_rs(RESULT_FROM_PCRE);
                shared_ptr<CompiledPcre> sub_pcre;
                try {
                    sub_pcre = compile(it_lkey.first);
                }
                // Both failure kinds abort the run; `error` is left as is.
                catch (const SoftPcreCompileFailure &err) {
                    return false;
                }
                catch (const PcreCompileFailure &err) {
                    return false;
                }
                sub_pcre->quiet = false; // force not quiet in sub-exp.
                if (!run(it_lkey.first, *sub_pcre, buffer, sub_rs, error)) {
                    rs.clear();
                    return false;
                }
                // Index the sub-expression's matches by end offset. For a
                // highlander sub-expression only the first match is taken.
                for (const auto &it_mr : sub_rs.matches) {
                    offset_to_matches[it_mr.to].emplace(it_mr);
                    offset_to_lkeys[it_mr.to].emplace(it_lkey.second);
                    if (sub_pcre->highlander) {
                        break;
                    }
                }
            }
        }
        // Calculate rs for combination expression.
        vector<char> lv;
        const auto &comb = compiled.pl.getLogicalTree();
        lv.resize(m_lkey.size() + comb.size());
        const auto &li = compiled.pl.getCombInfoById(compiled.report);
        // Walk the end offsets in ascending order. Entries set in lv are
        // never cleared, so a sub-expression stays matched for all later
        // offsets.
        for (const auto &it : offset_to_lkeys) {
            for (auto report : it.second) {
                lv[report] = 1;
            }
            if (isLogicalCombination(lv, comb, m_lkey.size(),
                                     li.start, li.result)) {
                // Report the sub-expression matches at this offset that fall
                // within the combination's offset bounds.
                for (const auto &mr : offset_to_matches.at(it.first)) {
                    if ((mr.to >= compiled.min_offset) &&
                        (mr.to <= compiled.max_offset)) {
                        rs.addMatch(mr.from, mr.to);
                    }
                }
            }
        }
        // If the purely-negative check holds, add a match spanning the whole
        // buffer (subject to the offset bounds).
        if (isPurelyNegativeMatch(lv, comb, m_lkey.size(),
                                  li.start, li.result)) {
            u64a to = buffer.length();
            if ((to >= compiled.min_offset) && (to <= compiled.max_offset)) {
                rs.addMatch(0, to);
            }
        }
        return true;
    }

    CalloutContext ctx(out);

    // Only `flags` is initialised here; the remaining fields are set below
    // together with the flag bits that announce them.
    pcre_extra extra;
    extra.flags = 0;

    // If running in traditional HyperScan mode, switch on callouts.
    bool usingCallouts = isStandardMode(colliderMode);
    if (usingCallouts) {
        // Switch on callouts.
        extra.flags |= PCRE_EXTRA_CALLOUT_DATA;
        extra.callout_data = &ctx;
    }

    // Set the match_limit (in order to bound execution time on very complex
    // patterns)
    extra.flags |= (PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION);
    if (colliderMode == MODE_HYBRID) {
        // Hybrid mode uses fixed limits instead of the configured ones.
        extra.match_limit = 10000000;
        extra.match_limit_recursion = 1500;
    } else {
        extra.match_limit = matchLimit;
        extra.match_limit_recursion = matchLimitRecursion;
    }

#ifdef PCRE_NO_START_OPTIMIZE
    // Switch off optimizations that may result in callouts not occurring.
    // NOTE(review): the PCRE API documentation lists PCRE_NO_START_OPTIMIZE
    // as an option bit for pcre_compile()/pcre_exec(), not as a
    // pcre_extra.flags bit - confirm that setting it here has the intended
    // effect.
    extra.flags |= PCRE_NO_START_OPTIMIZE;
#endif

    // Ensure there's enough room in the ovector for the capture groups in this
    // pattern.
    int ovecsize = (compiled.captureCount + 1) * 3;
    ovector.resize(ovecsize);

    // Dispatch on the collider mode. The standard modes collect matches in
    // ctx via callouts; hybrid mode adds them to rs directly.
    int ret;
    bool hybrid = false;
    switch (colliderMode) {
    case MODE_BLOCK:
    case MODE_STREAMING:
    case MODE_VECTORED:
        if (g_streamOffset) {
            ret = scanOffset(compiled, buffer, extra, ovector, ctx);
        } else {
            ret = scanBasic(compiled, buffer, extra, ovector, ctx);
        }
        break;
    case MODE_HYBRID:
        ret = scanHybrid(compiled, buffer, extra, ovector, rs, out);
        hybrid = true;
        break;
    default:
        assert(0);
        ret = PCRE_ERROR_NULL;
        break;
    }

    // PCRE_ERROR_NOMATCH itself is not treated as a failure.
    if (ret < PCRE_ERROR_NOMATCH) {
        error = pcreErrStr(ret);
        return false;
    }

    // Move matches into a ResultSet.
    for (const auto &m : ctx.matches) {
        unsigned long long from = m.first;
        unsigned long long to = m.second;

        if (g_streamOffset) {
            // Subtract stream offset imposed by offset test.
            unsigned long long offset = min(100ull, g_streamOffset);
            assert(to >= offset);
            // A start inside the preamble is clamped to 0.
            from -= min(offset, from);
            to -= offset;
        }

        rs.addMatch(from, to);
    }

    // If we have no matches, there's no further work to do.
    if (rs.matches.empty()) {
        return true;
    }

    // Leftmost-SOM filtering is applied only to the non-hybrid scans.
    if (compiled.som && !hybrid) {
        filterLeftmostSom(rs);
    }

    filterExtParams(rs, compiled);

    // If we haven't been asked for SOM, strip the from offsets.
    if (!compiled.som) {
        set<MatchResult> endonly;
        for (const auto &m : rs.matches) {
            endonly.insert(MatchResult(0, m.to));
        }
        rs.matches.swap(endonly);
    }

    return true;
}