Multibyte acceleration compile side

2025-06-28 16:41:01 +03:00 · 2015-12-09 13:38:58 +00:00 · 2015-12-09 13:38:58 +00:00 · 87424713a7
commit 87424713a7
parent 081b3ef369
8 changed files with 1002 additions and 13 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -599,6 +599,8 @@ SET (hs_SRCS
    src/nfa/mpv_internal.h
    src/nfa/mpvcompile.cpp
    src/nfa/mpvcompile.h
+    src/nfa/multiaccel_compilehelper.cpp
+    src/nfa/multiaccel_compilehelper.h
    src/nfa/nfa_api.h
    src/nfa/nfa_api_queue.h
    src/nfa/nfa_api_util.h
--- a/src/nfa/accelcompile.cpp
+++ b/src/nfa/accelcompile.cpp
@ -169,13 +169,285 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
    aux->accel_type = ACCEL_NONE;
 }

+/**
+ * \brief Attempt to build a multibyte acceleration scheme for \p info.
+ *
+ * Schemes are tried in order: vermicelli (a single stop character, or a
+ * caseless pair), then shufti, then truffle. On success aux->accel_type is
+ * set to the variant matching info.ma_type; otherwise it is left as
+ * ACCEL_NONE so that the caller can try other schemes.
+ *
+ * AccelAux is a union: single-length matchers (long, shift and their grab
+ * forms) fill the mverm/mshufti/mtruffle members, while doubleshift matchers
+ * fill the mdverm/mdshufti/mdtruffle members, which carry len1 and len2.
+ * Each branch writes (and logs) exactly one member so that correctness does
+ * not depend on different union members sharing a layout.
+ *
+ * \param info multibyte stop set, offset, lengths and matcher type.
+ * \param aux  output structure; must have accel_type == ACCEL_NONE on entry
+ *             unless info.ma_type is MAT_NONE.
+ */
+static
+void buildAccelMulti(const AccelInfo &info, AccelAux *aux) {
+    if (info.ma_type == MultibyteAccelInfo::MAT_NONE) {
+        DEBUG_PRINTF("no multimatch for us :(\n");
+        return;
+    }
+
+    u32 offset = info.multiaccel_offset;
+    const CharReach &stops = info.multiaccel_stops;
+
+    assert(aux->accel_type == ACCEL_NONE);
+    if (stops.all()) {
+        // every character is a stop: nothing to accelerate over
+        return;
+    }
+
+    size_t outs = stops.count();
+    DEBUG_PRINTF("%zu outs\n", outs);
+    assert(outs && outs < 256);
+
+    // Stage 1: vermicelli, for one stop character or a caseless pair.
+    switch (info.ma_type) {
+    case MultibyteAccelInfo::MAT_LONG:
+        if (outs == 1) {
+            aux->accel_type = ACCEL_MLVERM;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first();
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        if (outs == 2 && stops.isCaselessChar()) {
+            aux->accel_type = ACCEL_MLVERM_NOCASE;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first() & CASE_CLEAR;
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        break;
+    case MultibyteAccelInfo::MAT_LONGGRAB:
+        if (outs == 1) {
+            aux->accel_type = ACCEL_MLGVERM;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first();
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        if (outs == 2 && stops.isCaselessChar()) {
+            aux->accel_type = ACCEL_MLGVERM_NOCASE;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first() & CASE_CLEAR;
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        break;
+    case MultibyteAccelInfo::MAT_SHIFT:
+        if (outs == 1) {
+            aux->accel_type = ACCEL_MSVERM;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first();
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        if (outs == 2 && stops.isCaselessChar()) {
+            aux->accel_type = ACCEL_MSVERM_NOCASE;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first() & CASE_CLEAR;
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        break;
+    case MultibyteAccelInfo::MAT_SHIFTGRAB:
+        if (outs == 1) {
+            aux->accel_type = ACCEL_MSGVERM;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first();
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        if (outs == 2 && stops.isCaselessChar()) {
+            aux->accel_type = ACCEL_MSGVERM_NOCASE;
+            aux->mverm.offset = offset;
+            aux->mverm.c = stops.find_first() & CASE_CLEAR;
+            aux->mverm.len = info.ma_len1;
+            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
+                         aux->mverm.c);
+            return;
+        }
+        break;
+    case MultibyteAccelInfo::MAT_DSHIFT:
+        if (outs == 1) {
+            aux->accel_type = ACCEL_MDSVERM;
+            aux->mdverm.offset = offset;
+            aux->mdverm.c = stops.find_first();
+            aux->mdverm.len1 = info.ma_len1;
+            aux->mdverm.len2 = info.ma_len2;
+            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n",
+                         aux->mdverm.c);
+            return;
+        }
+        if (outs == 2 && stops.isCaselessChar()) {
+            aux->accel_type = ACCEL_MDSVERM_NOCASE;
+            aux->mdverm.offset = offset;
+            aux->mdverm.c = stops.find_first() & CASE_CLEAR;
+            aux->mdverm.len1 = info.ma_len1;
+            aux->mdverm.len2 = info.ma_len2;
+            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
+                         aux->mdverm.c);
+            return;
+        }
+        break;
+    case MultibyteAccelInfo::MAT_DSHIFTGRAB:
+        if (outs == 1) {
+            aux->accel_type = ACCEL_MDSGVERM;
+            aux->mdverm.offset = offset;
+            aux->mdverm.c = stops.find_first();
+            aux->mdverm.len1 = info.ma_len1;
+            aux->mdverm.len2 = info.ma_len2;
+            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n",
+                         aux->mdverm.c);
+            return;
+        }
+        if (outs == 2 && stops.isCaselessChar()) {
+            aux->accel_type = ACCEL_MDSGVERM_NOCASE;
+            aux->mdverm.offset = offset;
+            aux->mdverm.c = stops.find_first() & CASE_CLEAR;
+            aux->mdverm.len1 = info.ma_len1;
+            aux->mdverm.len2 = info.ma_len2;
+            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
+                         aux->mdverm.c);
+            return;
+        }
+        break;
+    default:
+        // shouldn't happen
+        assert(0);
+        return;
+    }
+
+    // Stage 2: shufti. A -1 from shuftiBuildMasks means the stop set could
+    // not be encoded, in which case we break out and try truffle.
+    DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
+
+    switch (info.ma_type) {
+    case MultibyteAccelInfo::MAT_LONG:
+        if (shuftiBuildMasks(stops, &aux->mshufti.lo,
+                &aux->mshufti.hi) == -1) {
+            break;
+        }
+        aux->accel_type = ACCEL_MLSHUFTI;
+        aux->mshufti.offset = offset;
+        aux->mshufti.len = info.ma_len1;
+        return;
+    case MultibyteAccelInfo::MAT_LONGGRAB:
+        if (shuftiBuildMasks(stops, &aux->mshufti.lo,
+                &aux->mshufti.hi) == -1) {
+            break;
+        }
+        aux->accel_type = ACCEL_MLGSHUFTI;
+        aux->mshufti.offset = offset;
+        aux->mshufti.len = info.ma_len1;
+        return;
+    case MultibyteAccelInfo::MAT_SHIFT:
+        if (shuftiBuildMasks(stops, &aux->mshufti.lo,
+                &aux->mshufti.hi) == -1) {
+            break;
+        }
+        aux->accel_type = ACCEL_MSSHUFTI;
+        aux->mshufti.offset = offset;
+        aux->mshufti.len = info.ma_len1;
+        return;
+    case MultibyteAccelInfo::MAT_SHIFTGRAB:
+        if (shuftiBuildMasks(stops, &aux->mshufti.lo,
+                &aux->mshufti.hi) == -1) {
+            break;
+        }
+        aux->accel_type = ACCEL_MSGSHUFTI;
+        aux->mshufti.offset = offset;
+        aux->mshufti.len = info.ma_len1;
+        return;
+    case MultibyteAccelInfo::MAT_DSHIFT:
+        if (shuftiBuildMasks(stops, &aux->mdshufti.lo,
+                &aux->mdshufti.hi) == -1) {
+            break;
+        }
+        aux->accel_type = ACCEL_MDSSHUFTI;
+        aux->mdshufti.offset = offset;
+        aux->mdshufti.len1 = info.ma_len1;
+        aux->mdshufti.len2 = info.ma_len2;
+        return;
+    case MultibyteAccelInfo::MAT_DSHIFTGRAB:
+        if (shuftiBuildMasks(stops, &aux->mdshufti.lo,
+                &aux->mdshufti.hi) == -1) {
+            break;
+        }
+        aux->accel_type = ACCEL_MDSGSHUFTI;
+        aux->mdshufti.offset = offset;
+        aux->mdshufti.len1 = info.ma_len1;
+        aux->mdshufti.len2 = info.ma_len2;
+        return;
+    default:
+        // shouldn't happen
+        assert(0);
+        return;
+    }
+    DEBUG_PRINTF("shufti build failed, falling through\n");
+
+    // Stage 3: truffle, only attempted for a bounded number of stop chars.
+    if (outs <= ACCEL_MAX_STOP_CHAR) {
+        DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
+        switch (info.ma_type) {
+        case MultibyteAccelInfo::MAT_LONG:
+            aux->accel_type = ACCEL_MLTRUFFLE;
+            aux->mtruffle.offset = offset;
+            aux->mtruffle.len = info.ma_len1;
+            truffleBuildMasks(stops, &aux->mtruffle.mask1,
+                              &aux->mtruffle.mask2);
+            break;
+        case MultibyteAccelInfo::MAT_LONGGRAB:
+            aux->accel_type = ACCEL_MLGTRUFFLE;
+            aux->mtruffle.offset = offset;
+            aux->mtruffle.len = info.ma_len1;
+            truffleBuildMasks(stops, &aux->mtruffle.mask1,
+                              &aux->mtruffle.mask2);
+            break;
+        case MultibyteAccelInfo::MAT_SHIFT:
+            aux->accel_type = ACCEL_MSTRUFFLE;
+            aux->mtruffle.offset = offset;
+            aux->mtruffle.len = info.ma_len1;
+            truffleBuildMasks(stops, &aux->mtruffle.mask1,
+                              &aux->mtruffle.mask2);
+            break;
+        case MultibyteAccelInfo::MAT_SHIFTGRAB:
+            aux->accel_type = ACCEL_MSGTRUFFLE;
+            aux->mtruffle.offset = offset;
+            aux->mtruffle.len = info.ma_len1;
+            truffleBuildMasks(stops, &aux->mtruffle.mask1,
+                              &aux->mtruffle.mask2);
+            break;
+        case MultibyteAccelInfo::MAT_DSHIFT:
+            aux->accel_type = ACCEL_MDSTRUFFLE;
+            aux->mdtruffle.offset = offset;
+            aux->mdtruffle.len1 = info.ma_len1;
+            aux->mdtruffle.len2 = info.ma_len2;
+            truffleBuildMasks(stops, &aux->mdtruffle.mask1,
+                              &aux->mdtruffle.mask2);
+            break;
+        case MultibyteAccelInfo::MAT_DSHIFTGRAB:
+            aux->accel_type = ACCEL_MDSGTRUFFLE;
+            aux->mdtruffle.offset = offset;
+            aux->mdtruffle.len1 = info.ma_len1;
+            aux->mdtruffle.len2 = info.ma_len2;
+            truffleBuildMasks(stops, &aux->mdtruffle.mask1,
+                              &aux->mdtruffle.mask2);
+            break;
+        default:
+            // shouldn't happen
+            assert(0);
+            return;
+        }
+        return;
+    }
+
+    DEBUG_PRINTF("unable to accelerate multibyte case with %zu outs\n", outs);
+}
+
+
 bool buildAccelAux(const AccelInfo &info, AccelAux *aux) {
    assert(aux->accel_type == ACCEL_NONE);
    if (info.single_stops.none()) {
        DEBUG_PRINTF("picked red tape\n");
        aux->accel_type = ACCEL_RED_TAPE;
        aux->generic.offset = info.single_offset;
-    } else {
+    }
+    if (aux->accel_type == ACCEL_NONE) {
+        buildAccelMulti(info, aux);
+    }
+    if (aux->accel_type == ACCEL_NONE) {
        buildAccelDouble(info, aux);
    }
    if (aux->accel_type == ACCEL_NONE) {
--- a/src/nfa/accelcompile.h
+++ b/src/nfa/accelcompile.h
@ -32,6 +32,7 @@
 #include "ue2common.h"
 #include "util/charreach.h"
 #include "util/ue2_containers.h"
+#include "nfagraph/ng_limex_accel.h"

 union AccelAux;

@ -39,7 +40,9 @@ namespace ue2 {

 struct AccelInfo {
    AccelInfo() : single_offset(0U), double_offset(0U),
-                  single_stops(CharReach::dot()) {}
+                  single_stops(CharReach::dot()),
+                  multiaccel_offset(0), ma_len1(0), ma_len2(0),
+                  ma_type(MultibyteAccelInfo::MAT_NONE) {}
    u32 single_offset; /**< offset correction to apply to single schemes */
    u32 double_offset; /**< offset correction to apply to double schemes */
    CharReach double_stop1;  /**<  single-byte accel stop literals for double
@ -47,6 +50,11 @@ struct AccelInfo {
    flat_set<std::pair<u8, u8>> double_stop2; /**< double-byte accel stop
                                               * literals */
    CharReach single_stops; /**< escapes for single byte acceleration */
+    u32 multiaccel_offset; /**< offset correction to apply to multibyte schemes */
+    CharReach multiaccel_stops; /**< escapes for multibyte acceleration */
+    u32 ma_len1; /**< multiaccel len1 */
+    u32 ma_len2; /**< multiaccel len2 */
+    MultibyteAccelInfo::multiaccel_type ma_type; /**< multiaccel type */
 };

 bool buildAccelAux(const AccelInfo &info, AccelAux *aux);
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@ -80,9 +80,11 @@ struct precalcAccel {
    CharReach double_cr;
    flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */
    u32 double_offset;
+
+    MultibyteAccelInfo ma_info;
 };

-struct meteor_accel_info {
+struct limex_accel_info {
    ue2::unordered_set<NFAVertex> accelerable;
    map<NFAStateSet, precalcAccel> precalc;
    ue2::unordered_map<NFAVertex, flat_set<NFAVertex> > friends;
@ -162,7 +164,7 @@ struct build_info {
    bool stateCompression;
    const CompileContext &cc;
    u32 num_states;
-    meteor_accel_info accel;
+    limex_accel_info accel;
 };

 // Constants for scoring mechanism
@ -334,12 +336,16 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
 }

 struct AccelBuild {
-    AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0) {}
+    AccelBuild() : v(NFAGraph::null_vertex()), state(0), offset(0), ma_len1(0),
+            ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {}
    NFAVertex v;
    u32 state;
    u32 offset; // offset correction to apply
    CharReach stop1; // single-byte accel stop literals
    flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals
+    u32 ma_len1; // multiaccel len1
+    u32 ma_len2; // multiaccel len2
+    MultibyteAccelInfo::multiaccel_type ma_type; // multiaccel type
 };

 static
@ -354,7 +360,12 @@ void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) {
        build.stop1 = CharReach::dot();
    } else {
        const precalcAccel &precalc = bi.accel.precalc.at(ss);
-        if (precalc.double_lits.empty()) {
+        unsigned ma_len = precalc.ma_info.len1 + precalc.ma_info.len2;
+        if (ma_len >= MULTIACCEL_MIN_LEN) {
+            build.ma_len1 = precalc.ma_info.len1;
+            build.stop1 = precalc.ma_info.cr;
+            build.offset = precalc.ma_info.offset;
+        } else if (precalc.double_lits.empty()) {
            build.stop1 = precalc.single_cr;
            build.offset = precalc.single_offset;
        } else {
@ -534,7 +545,7 @@ void filterAccelStates(NGHolder &g, const map<u32, NFAVertex> &tops,
 }

 static
-bool containsBadSubset(const meteor_accel_info &accel,
+bool containsBadSubset(const limex_accel_info &accel,
                       const NFAStateSet &state_set, const u32 effective_sds) {
    NFAStateSet subset(state_set.size());
    for (size_t j = state_set.find_first(); j != state_set.npos;
@ -559,7 +570,8 @@ void doAccelCommon(NGHolder &g,
                   ue2::unordered_map<NFAVertex, AccelScheme> &accel_map,
                   const ue2::unordered_map<NFAVertex, u32> &state_ids,
                   const map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
-                   const u32 num_states, meteor_accel_info *accel) {
+                   const u32 num_states, limex_accel_info *accel,
+                   const CompileContext &cc) {
    vector<CharReach> refined_cr = reduced_cr(g, br_cyclic);

    vector<NFAVertex> astates;
@ -607,10 +619,22 @@ void doAccelCommon(NGHolder &g,

        DEBUG_PRINTF("accel %u ok with offset %u\n", i, as.offset);

+        // try multibyte acceleration first
+        MultibyteAccelInfo mai = nfaCheckMultiAccel(g, states, cc);
+
        precalcAccel &pa = accel->precalc[state_set];
+        useful |= state_set;
+
+        // if we successfully built a multibyte accel scheme, use that
+        if (mai.type != MultibyteAccelInfo::MAT_NONE) {
+            pa.ma_info = mai;
+
+            DEBUG_PRINTF("multibyte acceleration!\n");
+            continue;
+        }
+
        pa.single_offset = as.offset;
        pa.single_cr = as.cr;
-        useful |= state_set;

        if (states.size() == 1) {
            DoubleAccelInfo b = findBestDoubleAccelInfo(g, states.front());
@ -660,7 +684,7 @@ void fillAccelInfo(build_info &bi) {
    filterAccelStates(bi.h, bi.tops, &bi.accel.accel_map);
    assert(bi.accel.accel_map.size() <= NFA_MAX_ACCEL_STATES);
    doAccelCommon(bi.h, bi.accel.accel_map, bi.state_ids, bi.br_cyclic,
-                  bi.num_states, &bi.accel);
+                  bi.num_states, &bi.accel, bi.cc);
 }

 /** The AccelAux structure has large alignment specified, and this makes some
@ -672,7 +696,7 @@ static
 void buildAccel(const build_info &args, NFAStateSet &accelMask,
                NFAStateSet &accelFriendsMask, AccelAuxVector &auxvec,
                vector<u8> &accelTable) {
-    const meteor_accel_info &accel = args.accel;
+    const limex_accel_info &accel = args.accel;

    // Init, all zeroes.
    accelMask.resize(args.num_states);
@ -737,8 +761,16 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,

        if (contains(accel.precalc, states)) {
            const precalcAccel &precalc = accel.precalc.at(states);
-            ainfo.single_offset = precalc.single_offset;
-            ainfo.single_stops = precalc.single_cr;
+            if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) {
+                ainfo.ma_len1 = precalc.ma_info.len1;
+                ainfo.ma_len2 = precalc.ma_info.len2;
+                ainfo.multiaccel_offset = precalc.ma_info.offset;
+                ainfo.multiaccel_stops = precalc.ma_info.cr;
+                ainfo.ma_type = precalc.ma_info.type;
+            } else {
+                ainfo.single_offset = precalc.single_offset;
+                ainfo.single_stops = precalc.single_cr;
+            }
        }

        buildAccelAux(ainfo, &aux);
--- a/src/nfa/multiaccel_compilehelper.cpp
+++ b/src/nfa/multiaccel_compilehelper.cpp
@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "multiaccel_compilehelper.h"
+
+using namespace std;
+using namespace ue2;
+
+#ifdef DEBUG
+/* printable names, indexed by accel_scheme_state value */
+static const char* state_to_str[] = {
+    "FIRST_RUN",
+    "SECOND_RUN",
+    "WAITING_FOR_GRAB",
+    "FIRST_TAIL",
+    "SECOND_TAIL",
+    "STOPPED",
+    "INVALID"
+};
+/* printable names, indexed by multiaccel_type value; presumably this order
+ * mirrors the enum declaration -- keep the two in sync */
+static const char* type_to_str[] = {
+    "SHIFT",
+    "SHIFTGRAB",
+    "DOUBLESHIFT",
+    "DOUBLESHIFTGRAB",
+    "LONG",
+    "LONGGRAB",
+    "NONE"
+};
+
+/* debug-only: print one scheme's type, state and run/tail lengths */
+static
+void dumpMultiaccelState(const accel_data &d) {
+    const char *type_name = type_to_str[(unsigned) d.type];
+    const char *state_name = state_to_str[(unsigned) d.state];
+    DEBUG_PRINTF("type: %s state: %s len1: %u tlen1: %u len2: %u tlen2: %u\n",
+                 type_name, state_name, d.len1, d.tlen1, d.len2, d.tlen2);
+}
+#endif
+
+/*
+ * Finish matching for one scheme. Depending on where the scheme was
+ * interrupted this leaves it either STOPPED (usable) or INVALID.
+ *
+ * Shift matchers have "tails": once a shift matcher reaches a mid/endpoint it
+ * keeps looking for further matching vertices in order to extend the match.
+ * Take /a{5}ba{3}/ as an example. Without tails, a doubleshift matcher would
+ * be confused by the consecutive a's, parse the pattern as a.{0}a.{0}a (two
+ * shifts by 1) and discard the rest, so the long-grab matcher (a run of a's
+ * followed by a not-a) would be picked instead. With tails, the end of a run
+ * is deferred until the matching characters actually run out, and the same
+ * pattern is parsed by the doubleshift matcher as /a.{3}a.{3}a/ (two shifts
+ * by 4).
+ *
+ * Consequently, a scheme interrupted during its first tail or second run
+ * cannot complete that second run; the second run is dropped here and, for
+ * doubleshift types, the first tail may later be split to stand in for it.
+ * If a doubleshift scheme has no first tail to split, it is invalid.
+ */
+static
+void stop(accel_data &d) {
+    const bool is_dshift = d.type == MultibyteAccelInfo::MAT_DSHIFT ||
+                           d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB;
+
+    switch (d.state) {
+    case STATE_WAITING_FOR_GRAB:
+    case STATE_FIRST_RUN:
+        // only the plain long matcher may end in the middle of its first run
+        d.state = (d.type == MultibyteAccelInfo::MAT_LONG) ? STATE_STOPPED
+                                                           : STATE_INVALID;
+        return;
+    case STATE_FIRST_TAIL:
+    case STATE_SECOND_RUN:
+        if (is_dshift && d.tlen1 == 0) {
+            // can't split an empty tail into a second run
+            d.state = STATE_INVALID;
+            return;
+        }
+        d.len2 = 0;
+        d.state = STATE_STOPPED;
+        return;
+    case STATE_SECOND_TAIL:
+        d.state = STATE_STOPPED;
+        return;
+    case STATE_STOPPED:
+    case STATE_INVALID:
+        // already finished, nothing to do
+        return;
+    }
+}
+
+/*
+ * Fold tail lengths into run lengths for as long as the total stays below
+ * max_len, then mark the scheme INVALID if it is too long, shorter than
+ * MULTIACCEL_MIN_LEN, or a doubleshift type left without a second run.
+ */
+static
+void validate(accel_data &d, unsigned max_len) {
+    const bool is_dshift = d.type == MultibyteAccelInfo::MAT_DSHIFT ||
+                           d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB;
+    // sampled before any field below is modified
+    const bool has_second_run = d.len2 > 0;
+
+    if (has_second_run && d.len1 + d.tlen1 + d.len2 + d.tlen2 < max_len) {
+        // case 1: both runs and both tails fit
+        d.len1 += d.tlen1;
+        d.len2 += d.tlen2;
+        d.tlen1 = 0;
+        d.tlen2 = 0;
+    } else if (has_second_run && d.len1 + d.tlen1 + d.len2 < max_len) {
+        // case 2: all but the second tail fits; keep as much of that tail as
+        // there is room for
+        d.len1 += d.tlen1;
+        d.tlen1 = 0;
+        if (d.tlen2 != 0) {
+            const int spare = max_len - 1 - d.len1 - d.len2;
+            if (spare > 0) {
+                d.len2 += spare;
+            }
+            d.tlen2 = 0;
+        }
+    } else if (d.len1 + d.tlen1 < max_len) {
+        // case 3: only the first run and its tail fit
+        if (is_dshift) {
+            // the tail is split off to become the second run
+            d.len2 = d.tlen1;
+        } else {
+            d.len1 += d.tlen1;
+            d.len2 = 0;
+        }
+        d.tlen1 = 0;
+        d.tlen2 = 0;
+    } else if (d.len1 < max_len) {
+        // case 4: only the first run fits; keep a partial tail if possible
+        if (d.tlen1 != 0) {
+            const int spare = max_len - 1 - d.len1;
+            if (spare > 0) {
+                d.len1 += spare;
+            }
+            d.tlen1 = 0;
+        }
+        d.len2 = 0;
+        d.tlen2 = 0;
+    }
+
+    if (is_dshift && d.len2 == 0) {
+        // a doubleshift matcher without a second run is meaningless
+        d.state = STATE_INVALID;
+    } else if (d.type == MultibyteAccelInfo::MAT_LONG && d.len1 >= max_len) {
+        // a long matcher may simply be truncated
+        d.len1 = max_len - 1;
+    }
+
+    // final sanity checks on the overall length
+    const auto total = d.len1 + d.tlen1 + d.len2 + d.tlen2;
+    if (total >= max_len || total < MULTIACCEL_MIN_LEN) {
+        d.state = STATE_INVALID;
+    }
+}
+
+/*
+ * Grab handling shared by the shift-grab and doubleshift-grab matchers: the
+ * vertex right after the start must be a negative (non-overlapping) match.
+ * Returns true if the current vertex was consumed as the grab (successfully
+ * or not), false if the scheme is not waiting for a grab.
+ */
+static
+bool consumeGrab(accel_data &d, const CharReach &ref_cr,
+                 const CharReach &cur_cr) {
+    if (d.state != STATE_WAITING_FOR_GRAB) {
+        return false;
+    }
+    if ((ref_cr & cur_cr).any()) {
+        d.state = STATE_INVALID;
+    } else {
+        d.state = STATE_FIRST_RUN;
+        d.len1++;
+    }
+    return true;
+}
+
+/* Long matcher: a run of consecutive same-or-subset char-reaches. */
+static
+void matchLong(accel_data &d, const CharReach &ref_cr,
+               const CharReach &cur_cr) {
+    if ((ref_cr & cur_cr) == cur_cr) {
+        d.len1++;
+    } else {
+        d.state = STATE_STOPPED;
+    }
+}
+
+/*
+ * Long-grab matcher: a run of consecutive same-or-subset char-reaches that
+ * must end in a negative match.
+ */
+static
+void matchLongGrab(accel_data &d, const CharReach &ref_cr,
+                   const CharReach &cur_cr) {
+    CharReach overlap = ref_cr & cur_cr;
+    if (overlap == cur_cr) {
+        d.len1++;
+    } else if (!overlap.any()) {
+        /* we grabbed, stop immediately */
+        d.state = STATE_STOPPED;
+    } else {
+        /* partial overlap interrupts the run-n-grab; mark as invalid */
+        d.state = STATE_INVALID;
+    }
+}
+
+/*
+ * Shift matcher: two matches separated by anything. same_reach says whether
+ * the current vertex has exactly the reference reach.
+ */
+static
+void matchShift(accel_data &d, bool same_reach) {
+    switch (d.state) {
+    case STATE_FIRST_RUN:
+        if (same_reach) {
+            d.state = STATE_FIRST_TAIL;
+        } else {
+            // simply advance
+            d.len1++;
+        }
+        break;
+    case STATE_FIRST_TAIL:
+        if (same_reach) {
+            // keep matching tail
+            d.tlen1++;
+        } else {
+            // non-matching char after the tail, so stop
+            d.state = STATE_STOPPED;
+        }
+        break;
+    default:
+        // shouldn't happen
+        assert(0);
+    }
+}
+
+/*
+ * Doubleshift matcher: three matches, each separated by a run of anything,
+ * with a tail after each of the two shifts.
+ */
+static
+void matchDoubleShift(accel_data &d, bool same_reach) {
+    switch (d.state) {
+    case STATE_FIRST_RUN:
+        if (same_reach) {
+            d.state = STATE_FIRST_TAIL;
+            d.len2 = 1; // we're now ready for our second run
+        } else {
+            d.len1++;
+        }
+        break;
+    case STATE_FIRST_TAIL:
+        if (same_reach) {
+            d.tlen1++;
+        } else {
+            // start second run
+            d.state = STATE_SECOND_RUN;
+            d.len2++;
+        }
+        break;
+    case STATE_SECOND_RUN:
+        if (same_reach) {
+            d.state = STATE_SECOND_TAIL;
+        } else {
+            d.len2++;
+        }
+        break;
+    case STATE_SECOND_TAIL:
+        if (same_reach) {
+            d.tlen2++;
+        } else {
+            // stop
+            d.state = STATE_STOPPED;
+        }
+        break;
+    default:
+        // shouldn't happen
+        assert(0);
+    }
+}
+
+/*
+ * Feed the reach of the next vertex (cur_cr) to one scheme, updating its
+ * state and lengths relative to the reference reach ref_cr. The grab variants
+ * behave exactly like their plain counterparts once the grab is consumed.
+ */
+static
+void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) {
+    switch (d.type) {
+    case MultibyteAccelInfo::MAT_LONG:
+        matchLong(d, ref_cr, cur_cr);
+        break;
+
+    case MultibyteAccelInfo::MAT_LONGGRAB:
+        matchLongGrab(d, ref_cr, cur_cr);
+        break;
+
+    case MultibyteAccelInfo::MAT_SHIFTGRAB:
+        if (consumeGrab(d, ref_cr, cur_cr)) {
+            return;
+        }
+        matchShift(d, ref_cr == cur_cr);
+        break;
+
+    case MultibyteAccelInfo::MAT_SHIFT:
+        matchShift(d, ref_cr == cur_cr);
+        break;
+
+    case MultibyteAccelInfo::MAT_DSHIFTGRAB:
+        if (consumeGrab(d, ref_cr, cur_cr)) {
+            return;
+        }
+        matchDoubleShift(d, ref_cr == cur_cr);
+        break;
+
+    case MultibyteAccelInfo::MAT_DSHIFT:
+        matchDoubleShift(d, ref_cr == cur_cr);
+        break;
+
+    default:
+        // shouldn't happen
+        assert(0);
+        break;
+    }
+}
+
+/*
+ * Set up one candidate scheme per multiaccel type (MAT_MAX of them), each
+ * starting with a first-run length of 1. The reference reach, offset and
+ * maximum length are stored for later use.
+ */
+MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, u32 off,
+                                                 unsigned max_len) :
+        cr(ref_cr), offset(off), max_len(max_len) {
+    const int num_types = (int) MultibyteAccelInfo::MAT_MAX;
+    accels.resize(num_types);
+
+    // every scheme starts out live; slot i holds the scheme of type i
+    int type_idx = 0;
+    for (accel_data &ad : accels) {
+        ad.type = (MultibyteAccelInfo::multiaccel_type) type_idx;
+        ad.len1 = 1;
+
+        /* the grab variants of the shift matchers expect their grab
+         * right at the start */
+        const bool starts_with_grab =
+            ad.type == MultibyteAccelInfo::MAT_SHIFTGRAB ||
+            ad.type == MultibyteAccelInfo::MAT_DSHIFTGRAB;
+        ad.state = starts_with_grab ? STATE_WAITING_FOR_GRAB
+                                    : STATE_FIRST_RUN;
+        ++type_idx;
+    }
+}
+
+/* True while at least one scheme is neither STOPPED nor INVALID. */
+bool MultiaccelCompileHelper::canAdvance() {
+    for (const accel_data &ad : accels) {
+        const bool finished = ad.state == STATE_STOPPED ||
+                              ad.state == STATE_INVALID;
+        if (!finished) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Feed the reach of the next vertex to every scheme that is still live. */
+void MultiaccelCompileHelper::advance(const CharReach &cur_cr) {
+    for (accel_data &ad : accels) {
+        const bool live = ad.state != STATE_STOPPED &&
+                          ad.state != STATE_INVALID;
+        if (live) {
+            match(ad, cr, cur_cr);
+#ifdef DEBUG
+            dumpMultiaccelState(ad);
+#endif
+        }
+    }
+}
+
+/*
+ * Stop and validate every candidate scheme, then return the valid one with
+ * the greatest len1 + len2. On a tie the scheme that comes later in accels
+ * wins. Returns a default-constructed MultibyteAccelInfo if no scheme
+ * survived validation.
+ *
+ * Note that this finalises (mutates) all schemes held by the helper.
+ */
+MultibyteAccelInfo MultiaccelCompileHelper::getBestScheme() {
+    int best_len = 0;
+    // Track the winner explicitly instead of inspecting the state of a
+    // default-constructed accel_data, so that "nothing found" does not depend
+    // on how accel_data initialises its members.
+    const accel_data *best = nullptr;
+
+    DEBUG_PRINTF("Stopping multiaccel compile\n");
+
+    for (accel_data &ad : accels) {
+        // stop our matching
+        stop(ad);
+        validate(ad, max_len);
+
+#ifdef DEBUG
+        dumpMultiaccelState(ad);
+#endif
+
+        // skip invalid schemes
+        if (ad.state == STATE_INVALID) {
+            continue;
+        }
+        DEBUG_PRINTF("Marking as viable\n");
+
+        // TODO: relative strengths of accel schemes? maybe e.g. a shorter
+        // long match would in some cases be preferable to a longer
+        // double shift match (for example, depending on length)?
+        int as_len = ad.len1 + ad.len2;
+        if (as_len >= best_len) {
+            DEBUG_PRINTF("Marking as best\n");
+            best_len = as_len;
+            // each element is finalised exactly once, so pointing at it is
+            // equivalent to taking a copy here
+            best = &ad;
+        }
+    }
+
+    if (!best) {
+        // no usable scheme
+        return MultibyteAccelInfo();
+    }
+
+#ifdef DEBUG
+    DEBUG_PRINTF("Picked best multiaccel state:\n");
+    dumpMultiaccelState(*best);
+#endif
+    MultibyteAccelInfo info;
+    info.cr = cr;
+    info.offset = offset;
+    info.len1 = best->len1;
+    info.len2 = best->len2;
+    info.type = best->type;
+    return info;
+}
--- a/src/nfa/multiaccel_compilehelper.h
+++ b/src/nfa/multiaccel_compilehelper.h
@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2015, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MULTIACCELCOMPILE_H_
+#define MULTIACCELCOMPILE_H_
+
+#include "ue2common.h"
+
+#include "nfagraph/ng_limex_accel.h"
+
+#include <vector>
+
+namespace ue2 {
+
+/**
+ * States of the per-scheme state machine used while compiling multibyte
+ * acceleration. Transitions are driven by the match()/stop()/validate()
+ * helpers called from the implementation. A scheme that is in STATE_STOPPED
+ * or STATE_INVALID is no longer fed input by
+ * MultiaccelCompileHelper::advance(), and a scheme in STATE_INVALID is never
+ * returned by MultiaccelCompileHelper::getBestScheme().
+ */
+enum accel_scheme_state {
+    STATE_FIRST_RUN,
+    STATE_SECOND_RUN,
+    STATE_WAITING_FOR_GRAB,
+    STATE_FIRST_TAIL,
+    STATE_SECOND_TAIL,
+    STATE_STOPPED,
+    STATE_INVALID
+};
+
+/**
+ * Progress of one candidate acceleration scheme. A default-constructed
+ * accel_data is of type MAT_NONE, in STATE_INVALID, with all lengths zero.
+ */
+struct accel_data {
+    MultibyteAccelInfo::multiaccel_type type = MultibyteAccelInfo::MAT_NONE;
+    accel_scheme_state state = STATE_INVALID;
+    unsigned len1 = 0; /* length of first run */
+    unsigned len2 = 0; /* length of second run, if present */
+    unsigned tlen1 = 0; /* first tail length */
+    unsigned tlen2 = 0; /* second tail length */
+};
+
+/**
+ * Runs a set of candidate multibyte acceleration schemes in lock-step over a
+ * sequence of character classes and picks the best survivor.
+ *
+ * Typical use: construct, call advance() once per character class for as
+ * long as canAdvance() returns true, then call getBestScheme().
+ */
+class MultiaccelCompileHelper {
+private:
+    /* Held by reference, not by value. NOTE(review): presumably bound to the
+     * constructor argument, in which case that CharReach must outlive this
+     * helper -- confirm against the constructor. */
+    const CharReach &cr;
+    u32 offset;                     /* copied into the returned info */
+    std::vector<accel_data> accels; /* one entry per candidate scheme */
+    unsigned max_len;               /* passed to validate() for each scheme */
+public:
+    MultiaccelCompileHelper(const CharReach &cr, u32 off, unsigned max_len);
+
+    /** True while any scheme is neither stopped nor invalid. */
+    bool canAdvance();
+
+    /** Stops and validates all schemes and returns the best valid one, or a
+     * default-constructed MultibyteAccelInfo if none is valid. */
+    MultibyteAccelInfo getBestScheme();
+
+    /** Feeds the next character class to every scheme that is still live. */
+    void advance(const CharReach &cur_cr);
+};
+
+} // namespace ue2
+
+#endif /* MULTIACCELCOMPILE_H_ */
--- a/src/nfagraph/ng_limex_accel.cpp
+++ b/src/nfagraph/ng_limex_accel.cpp
@ -37,12 +37,15 @@
 #include "ue2common.h"

 #include "nfa/accel.h"
+#include "nfa/multiaccel_compilehelper.h"

 #include "util/bitutils.h" // for CASE_CLEAR
 #include "util/charreach.h"
+#include "util/compile_context.h"
 #include "util/container.h"
 #include "util/dump_charclass.h"
 #include "util/graph_range.h"
+#include "util/target_info.h"

 #include <algorithm>
 #include <map>
@ -647,6 +650,134 @@ NFAVertex get_sds_or_proxy(const NGHolder &g) {
    return g.startDs;
 }

+/* Returns the first vertex adjacent to \a v that is not \a v itself (i.e.
+ * self-loops are skipped), or the null vertex if there is none. */
+static
+NFAVertex find_next(const NFAVertex v, const NGHolder &g) {
+    for (NFAVertex succ : adjacent_vertices_range(v, g)) {
+        if (succ == v) {
+            continue; // ignore the self-loop
+        }
+        return succ;
+    }
+    return NFAGraph::null_vertex();
+}
+
+/** \brief Check if the single state in \a states is a multibyte accelerable
+ * state (for a limex NFA).
+ *
+ * The state must be a self-looping dot vertex with exactly one proper
+ * successor. Any run of dot vertices directly after it is counted as the
+ * scheme's offset; the first non-dot vertex supplies the character class that
+ * the candidate schemes are built for. The chain following it is then fed to
+ * a MultiaccelCompileHelper until the chain branches, loops, reaches a
+ * special vertex, or no candidate scheme can advance.
+ *
+ * \param g      graph containing the vertices.
+ * \param states candidate states; anything other than exactly one state is
+ *               rejected.
+ * \param cc     compile context; its target info selects the maximum length
+ *               (AVX2 vs. SSE).
+ * \return the best scheme found, or a default-constructed MultibyteAccelInfo
+ *         when the state is not accelerable this way.
+ */
+MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g,
+                                      const vector<NFAVertex> &states,
+                                      const CompileContext &cc) {
+    // For a set of states to be accelerable, we basically have to have only
+    // one state to accelerate.
+    if (states.size() != 1) {
+        DEBUG_PRINTF("can't accelerate multiple states\n");
+        return MultibyteAccelInfo();
+    }
+
+    // Get our base vertex
+    NFAVertex v = states[0];
+
+    // We need the base vertex to be a self-looping dotall leading to exactly
+    // one vertex.
+    if (!hasSelfLoop(v, g)) {
+        DEBUG_PRINTF("base vertex has no self-loop\n");
+        return MultibyteAccelInfo();
+    }
+
+    if (!g[v].char_reach.all()) {
+        DEBUG_PRINTF("can't accelerate anything but dot\n");
+        return MultibyteAccelInfo();
+    }
+
+    if (proper_out_degree(v, g) != 1) {
+        DEBUG_PRINTF("can't accelerate states with multiple successors\n");
+        return MultibyteAccelInfo();
+    }
+
+    // find our start vertex
+    // NOTE(review): unlike the vertices visited by the loops below, this
+    // first successor is not checked with is_special() -- confirm that this
+    // is intentional.
+    NFAVertex cur = find_next(v, g);
+    if (cur == NFAGraph::null_vertex()) {
+        DEBUG_PRINTF("invalid start vertex\n");
+        return MultibyteAccelInfo();
+    }
+
+    bool has_offset = false;
+    u32 offset = 0;
+    CharReach cr = g[cur].char_reach;
+
+    // if we start with a dot, we have an offset, so defer figuring out the
+    // real CharReach for this accel scheme
+    if (cr == CharReach::dot()) {
+        has_offset = true;
+        offset = 1;
+    }
+
+    // figure out our offset: walk the chain of dot vertices until the first
+    // non-dot vertex, counting one unit of offset per dot
+    while (has_offset) {
+        // vertices have to have no self loops
+        if (hasSelfLoop(cur, g)) {
+            DEBUG_PRINTF("can't have self-loops\n");
+            return MultibyteAccelInfo();
+        }
+
+        // we have to have exactly 1 successor to have this acceleration scheme
+        if (out_degree(cur, g) != 1) {
+            DEBUG_PRINTF("can't have multiple successors\n");
+            return MultibyteAccelInfo();
+        }
+
+        cur = *adjacent_vertices(cur, g).first;
+
+        // if we met a special vertex, bail out
+        if (is_special(cur, g)) {
+            DEBUG_PRINTF("can't have special vertices\n");
+            return MultibyteAccelInfo();
+        }
+
+        // now, get the real char reach
+        if (g[cur].char_reach != CharReach::dot()) {
+            cr = g[cur].char_reach;
+            has_offset = false;
+        } else {
+            offset++;
+        }
+    }
+
+    // now, fire up the compilation machinery
+    target_t ti = cc.target_info;
+    unsigned max_len = ti.has_avx2() ? MULTIACCEL_MAX_LEN_AVX2
+                                     : MULTIACCEL_MAX_LEN_SSE;
+    MultiaccelCompileHelper mac(cr, offset, max_len);
+
+    // Unlike the offset walk above, running out of chain here is not an
+    // error: we simply stop feeding the helper and take the best scheme
+    // found so far.
+    while (mac.canAdvance()) {
+        // vertices have to have no self loops
+        if (hasSelfLoop(cur, g)) {
+            break;
+        }
+
+        // we have to have exactly 1 successor to have this acceleration scheme
+        if (out_degree(cur, g) != 1) {
+            break;
+        }
+
+        cur = *adjacent_vertices(cur, g).first;
+
+        // if we met a special vertex, bail out
+        if (is_special(cur, g)) {
+            break;
+        }
+
+        mac.advance(g[cur].char_reach);
+    }
+    MultibyteAccelInfo mai = mac.getBestScheme();
+#ifdef DEBUG
+    DEBUG_PRINTF("Multibyte acceleration scheme: type: %u offset: %u lengths: %u,%u\n",
+                 mai.type, mai.offset, mai.len1, mai.len2);
+    for (size_t c = mai.cr.find_first(); c != CharReach::npos; c = mai.cr.find_next(c)) {
+        DEBUG_PRINTF("multibyte accel char: %zu\n", c);
+    }
+#endif
+    return mai;
+}
+
 /** \brief Check if vertex \a v is an accelerable state (for a limex NFA). */
 bool nfaCheckAccel(const NGHolder &g, NFAVertex v,
                   const vector<CharReach> &refined_cr,
--- a/src/nfagraph/ng_limex_accel.h
+++ b/src/nfagraph/ng_limex_accel.h
@ -50,6 +50,12 @@ namespace ue2 {
 #define MAX_MERGED_ACCEL_STOPS 200
 #define ACCEL_MAX_STOP_CHAR 24
 #define ACCEL_MAX_FLOATING_STOP_CHAR 192 /* accelerating sds is important */
+#define MULTIACCEL_MIN_LEN 3 /* presumably the shortest usable run -- TODO confirm at its use site */
+#define MULTIACCEL_MAX_LEN_SSE 15 /* max_len given to the compile helper without AVX2 */
+#define MULTIACCEL_MAX_LEN_AVX2 31 /* max_len given to the compile helper with AVX2 */
+
+// forward-declaration of CompileContext (only taken by reference below)
+struct CompileContext;

 void findAccelFriends(const NGHolder &g, NFAVertex v,
                  const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
@ -65,6 +71,25 @@ struct DoubleAccelInfo {

 DoubleAccelInfo findBestDoubleAccelInfo(const NGHolder &g, NFAVertex v);

+struct MultibyteAccelInfo { /* scheme description returned by nfaCheckMultiAccel() */
+    /* multibyte accel schemes, ordered by strength */
+    enum multiaccel_type {
+        MAT_SHIFT,
+        MAT_SHIFTGRAB,
+        MAT_DSHIFT,
+        MAT_DSHIFTGRAB,
+        MAT_LONG,
+        MAT_LONGGRAB,
+        MAT_MAX, /* number of real scheme types */
+        MAT_NONE = MAT_MAX /* "no scheme"; the default value of type */
+    };
+    CharReach cr; /* character class the scheme was built for */
+    u32 offset = 0; /* number of dot vertices skipped before cr was found */
+    u32 len1 = 0; /* first run length (accel_data::len1 of the chosen scheme) */
+    u32 len2 = 0; /* second run length, if present (accel_data::len2) */
+    multiaccel_type type = MAT_NONE; /* MAT_NONE when nothing was found */
+};
+
 struct AccelScheme {
    AccelScheme(const CharReach &cr_in, u32 offset_in)
        : cr(cr_in), offset(offset_in) {
@ -109,6 +134,11 @@ bool nfaCheckAccel(const NGHolder &g, NFAVertex v,
                   const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
                   AccelScheme *as, bool allow_wide);

+/** \brief Check if the single state in \a verts is a multibyte accelerable
+ * state (for a limex NFA).
+ *
+ * Returns a default-constructed MultibyteAccelInfo (type MAT_NONE) when
+ * \a verts does not hold exactly one vertex or no valid scheme is found. */
+MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g,
+                                      const std::vector<NFAVertex> &verts,
+                                      const CompileContext &cc);
+
 } // namespace ue2

 #endif