De-multiaccel

2025-06-28 16:41:01 +03:00 · 2017-03-31 10:38:03 +11:00 · 2017-03-31 10:38:03 +11:00 · 423569ec82
commit 423569ec82
parent 2b1a7da188
38 changed files with 217 additions and 5293 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -557,25 +557,6 @@ set (hs_exec_SRCS
    src/nfa/mpv.h
    src/nfa/mpv.c
    src/nfa/mpv_internal.h
    src/nfa/multiaccel_common.h
    src/nfa/multiaccel_doubleshift.h
    src/nfa/multiaccel_doubleshiftgrab.h
    src/nfa/multiaccel_long.h
    src/nfa/multiaccel_longgrab.h
    src/nfa/multiaccel_shift.h
    src/nfa/multiaccel_shiftgrab.h
    src/nfa/multishufti.c
    src/nfa/multishufti_avx2.h
    src/nfa/multishufti_sse.h
    src/nfa/multishufti.h
    src/nfa/multitruffle.c
    src/nfa/multitruffle_avx2.h
    src/nfa/multitruffle_sse.h
    src/nfa/multitruffle.h
    src/nfa/multivermicelli.c
    src/nfa/multivermicelli.h
    src/nfa/multivermicelli_sse.h
    src/nfa/multivermicelli_avx2.h
    src/nfa/nfa_api.h
    src/nfa/nfa_api_dispatch.c
    src/nfa/nfa_internal.h
@ -589,13 +570,11 @@ set (hs_exec_SRCS
    src/nfa/sheng_impl.h
    src/nfa/sheng_impl4.h
    src/nfa/sheng_internal.h
    src/nfa/shufti_common.h
    src/nfa/shufti.c
    src/nfa/shufti.h
    src/nfa/tamarama.c
    src/nfa/tamarama.h
    src/nfa/tamarama_internal.h
    src/nfa/truffle_common.h
    src/nfa/truffle.c
    src/nfa/truffle.h
    src/nfa/vermicelli.h
@ -736,8 +715,6 @@ SET (hs_SRCS
    src/nfa/mpv_internal.h
    src/nfa/mpvcompile.cpp
    src/nfa/mpvcompile.h
    src/nfa/multiaccel_compilehelper.cpp
    src/nfa/multiaccel_compilehelper.h
    src/nfa/nfa_api.h
    src/nfa/nfa_api_queue.h
    src/nfa/nfa_api_util.h
--- a/src/nfa/accel.c
+++ b/src/nfa/accel.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,9 +30,6 @@
 #include "shufti.h"
 #include "truffle.h"
 #include "vermicelli.h"
 #include "multishufti.h"
 #include "multitruffle.h"
 #include "multivermicelli.h"
 #include "ue2common.h"
 const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
@ -132,220 +129,6 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
        rv = c_end;
        break;
    /* multibyte matchers */
    case ACCEL_MLVERM:
        DEBUG_PRINTF("accel mlverm %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = long_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MLVERM_NOCASE:
        DEBUG_PRINTF("accel mlverm nc %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = long_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MLGVERM:
        DEBUG_PRINTF("accel mlgverm %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = longgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MLGVERM_NOCASE:
        DEBUG_PRINTF("accel mlgverm nc %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = longgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MSVERM:
        DEBUG_PRINTF("accel msverm %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shift_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MSVERM_NOCASE:
        DEBUG_PRINTF("accel msverm nc %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shift_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MSGVERM:
        DEBUG_PRINTF("accel msgverm %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shiftgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MSGVERM_NOCASE:
        DEBUG_PRINTF("accel msgverm nc %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shiftgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len);
        break;
    case ACCEL_MDSVERM:
        DEBUG_PRINTF("accel mdsverm %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshift_vermicelliExec(accel->mdverm.c, 0, c, c_end,
                                        accel->mdverm.len1, accel->mdverm.len2);
        break;
    case ACCEL_MDSVERM_NOCASE:
        DEBUG_PRINTF("accel mdsverm nc %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshift_vermicelliExec(accel->mdverm.c, 1, c, c_end,
                                        accel->mdverm.len1, accel->mdverm.len2);
        break;
    case ACCEL_MDSGVERM:
        DEBUG_PRINTF("accel mdsgverm %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 0, c, c_end,
                                            accel->mdverm.len1, accel->mdverm.len2);
        break;
    case ACCEL_MDSGVERM_NOCASE:
        DEBUG_PRINTF("accel mdsgverm nc %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 1, c, c_end,
                                            accel->mdverm.len1, accel->mdverm.len2);
        break;
    case ACCEL_MLSHUFTI:
        DEBUG_PRINTF("accel mlshufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = long_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
                             accel->mshufti.len);
        break;
    case ACCEL_MLGSHUFTI:
        DEBUG_PRINTF("accel mlgshufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = longgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
                                 accel->mshufti.len);
        break;
    case ACCEL_MSSHUFTI:
        DEBUG_PRINTF("accel msshufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shift_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
                              accel->mshufti.len);
        break;
    case ACCEL_MSGSHUFTI:
        DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shiftgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end,
                                  accel->mshufti.len);
        break;
    case ACCEL_MDSSHUFTI:
        DEBUG_PRINTF("accel mdsshufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshift_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
                                     accel->mdshufti.len1, accel->mdshufti.len2);
        break;
    case ACCEL_MDSGSHUFTI:
        DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshiftgrab_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end,
                                         accel->mdshufti.len1, accel->mdshufti.len2);
        break;
    case ACCEL_MLTRUFFLE:
        DEBUG_PRINTF("accel mltruffle %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = long_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
                               c, c_end, accel->mtruffle.len);
        break;
    case ACCEL_MLGTRUFFLE:
        DEBUG_PRINTF("accel mlgtruffle %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = longgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
                                   c, c_end, accel->mtruffle.len);
        break;
    case ACCEL_MSTRUFFLE:
        DEBUG_PRINTF("accel mstruffle %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shift_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
                               c, c_end, accel->mtruffle.len);
        break;
    case ACCEL_MSGTRUFFLE:
        DEBUG_PRINTF("accel msgtruffle %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = shiftgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2,
                                   c, c_end, accel->mtruffle.len);
        break;
    case ACCEL_MDSTRUFFLE:
        DEBUG_PRINTF("accel mdstruffle %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshift_truffleExec(accel->mdtruffle.mask1,
                                     accel->mdtruffle.mask2, c, c_end,
                                     accel->mdtruffle.len1,
                                     accel->mdtruffle.len2);
        break;
    case ACCEL_MDSGTRUFFLE:
        DEBUG_PRINTF("accel mdsgtruffle %p %p\n", c, c_end);
        if (c + 15 >= c_end) {
            return c;
        }
        rv = doubleshiftgrab_truffleExec(accel->mdtruffle.mask1,
                                         accel->mdtruffle.mask2, c, c_end,
                                         accel->mdtruffle.len1,
                                         accel->mdtruffle.len2);
        break;
    default:
        assert(!"not here");
--- a/src/nfa/accel.h
+++ b/src/nfa/accel.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -61,36 +61,7 @@ enum AccelType {
    ACCEL_DSHUFTI,
    ACCEL_TRUFFLE,
    ACCEL_RED_TAPE,
    /* multibyte vermicellis */
    ACCEL_MLVERM,
    ACCEL_MLVERM_NOCASE,
    ACCEL_MLGVERM,
    ACCEL_MLGVERM_NOCASE,
    ACCEL_MSVERM,
    ACCEL_MSVERM_NOCASE,
    ACCEL_MSGVERM,
    ACCEL_MSGVERM_NOCASE,
    ACCEL_MDSVERM,
    ACCEL_MDSVERM_NOCASE,
    ACCEL_MDSGVERM,
    ACCEL_MDSGVERM_NOCASE,
    /* multibyte shuftis */
    ACCEL_MLSHUFTI,
    ACCEL_MLGSHUFTI,
    ACCEL_MSSHUFTI,
    ACCEL_MSGSHUFTI,
    ACCEL_MDSSHUFTI,
    ACCEL_MDSGSHUFTI,
    /* multibyte truffles */
    ACCEL_MLTRUFFLE,
    ACCEL_MLGTRUFFLE,
    ACCEL_MSTRUFFLE,
    ACCEL_MSGTRUFFLE,
    ACCEL_MDSTRUFFLE,
    ACCEL_MDSGTRUFFLE,
    /* masked dverm */
    ACCEL_DVERM_MASKED,
 };
 /** \brief Structure for accel framework. */
@ -140,42 +111,12 @@ union AccelAux {
        m128 lo2;
        m128 hi2;
    } dshufti;
    struct {
        u8 accel_type;
        u8 offset;
        m128 lo;
        m128 hi;
        u8 len;
    } mshufti;
    struct {
        u8 accel_type;
        u8 offset;
        m128 lo;
        m128 hi;
        u8 len1;
        u8 len2;
    } mdshufti;
    struct {
        u8 accel_type;
        u8 offset;
        m128 mask1;
        m128 mask2;
    } truffle;
    struct {
        u8 accel_type;
        u8 offset;
        m128 mask1;
        m128 mask2;
        u8 len;
    } mtruffle;
    struct {
        u8 accel_type;
        u8 offset;
        m128 mask1;
        m128 mask2;
        u8 len1;
        u8 len2;
    } mdtruffle;
 };
 /**
--- a/src/nfa/accel_dump.cpp
+++ b/src/nfa/accel_dump.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -93,54 +93,6 @@ const char *accelName(u8 accel_type) {
        return "truffle";
    case ACCEL_RED_TAPE:
        return "red tape";
    case ACCEL_MLVERM:
        return "multibyte long vermicelli";
    case ACCEL_MLVERM_NOCASE:
        return "multibyte long vermicelli nocase";
    case ACCEL_MLGVERM:
        return "multibyte long-grab vermicelli";
    case ACCEL_MLGVERM_NOCASE:
        return "multibyte long-grab vermicelli nocase";
    case ACCEL_MSVERM:
        return "multibyte shift vermicelli";
    case ACCEL_MSVERM_NOCASE:
        return "multibyte shift vermicelli nocase";
    case ACCEL_MSGVERM:
        return "multibyte shift-grab vermicelli";
    case ACCEL_MSGVERM_NOCASE:
        return "multibyte shift-grab vermicelli nocase";
    case ACCEL_MDSVERM:
        return "multibyte doubleshift vermicelli";
    case ACCEL_MDSVERM_NOCASE:
        return "multibyte doubleshift vermicelli nocase";
    case ACCEL_MDSGVERM:
        return "multibyte doubleshift-grab vermicelli";
    case ACCEL_MDSGVERM_NOCASE:
        return "multibyte doubleshift-grab vermicelli nocase";
    case ACCEL_MLSHUFTI:
        return "multibyte long shufti";
    case ACCEL_MLGSHUFTI:
        return "multibyte long-grab shufti";
    case ACCEL_MSSHUFTI:
        return "multibyte shift shufti";
    case ACCEL_MSGSHUFTI:
        return "multibyte shift-grab shufti";
    case ACCEL_MDSSHUFTI:
        return "multibyte doubleshift shufti";
    case ACCEL_MDSGSHUFTI:
        return "multibyte doubleshift-grab shufti";
    case ACCEL_MLTRUFFLE:
        return "multibyte long truffle";
    case ACCEL_MLGTRUFFLE:
        return "multibyte long-grab truffle";
    case ACCEL_MSTRUFFLE:
        return "multibyte shift truffle";
    case ACCEL_MSGTRUFFLE:
        return "multibyte shift-grab truffle";
    case ACCEL_MDSTRUFFLE:
        return "multibyte doubleshift truffle";
    case ACCEL_MDSGTRUFFLE:
        return "multibyte doubleshift-grab truffle";
    default:
        return "unknown!";
    }
@ -283,59 +235,6 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
                             (const u8 *)&accel.truffle.mask2);
        break;
    }
    case ACCEL_MLVERM:
    case ACCEL_MLVERM_NOCASE:
    case ACCEL_MLGVERM:
    case ACCEL_MLGVERM_NOCASE:
    case ACCEL_MSVERM:
    case ACCEL_MSVERM_NOCASE:
    case ACCEL_MSGVERM:
    case ACCEL_MSGVERM_NOCASE:
        fprintf(f, " [\\x%02hhx] len:%u\n", accel.mverm.c, accel.mverm.len);
        break;
    case ACCEL_MDSVERM:
    case ACCEL_MDSVERM_NOCASE:
    case ACCEL_MDSGVERM:
    case ACCEL_MDSGVERM_NOCASE:
        fprintf(f, " [\\x%02hhx] len1:%u len2:%u\n", accel.mdverm.c, accel.mdverm.len1,
                accel.mdverm.len2);
        break;
    case ACCEL_MLSHUFTI:
    case ACCEL_MLGSHUFTI:
    case ACCEL_MSSHUFTI:
    case ACCEL_MSGSHUFTI:
        fprintf(f, " len:%u\n", accel.mshufti.len);
        dumpShuftiMasks(f, (const u8 *)&accel.mshufti.lo,
                        (const u8 *)&accel.mshufti.hi);
        dumpShuftiCharReach(f, (const u8 *)&accel.mshufti.lo,
                            (const u8 *)&accel.mshufti.hi);
        break;
    case ACCEL_MDSSHUFTI:
    case ACCEL_MDSGSHUFTI:
        fprintf(f, " len1:%u len2:%u\n", accel.mdshufti.len1, accel.mdshufti.len2);
        dumpShuftiMasks(f, (const u8 *)&accel.mdshufti.lo,
                        (const u8 *)&accel.mdshufti.hi);
        dumpShuftiCharReach(f, (const u8 *)&accel.mdshufti.lo,
                            (const u8 *)&accel.mdshufti.hi);
        break;
    case ACCEL_MLTRUFFLE:
    case ACCEL_MLGTRUFFLE:
    case ACCEL_MSTRUFFLE:
    case ACCEL_MSGTRUFFLE:
        fprintf(f, " len:%u\n", accel.mtruffle.len);
        dumpTruffleMasks(f, (const u8 *)&accel.mtruffle.mask1,
                         (const u8 *)&accel.mtruffle.mask2);
        dumpTruffleCharReach(f, (const u8 *)&accel.mtruffle.mask1,
                             (const u8 *)&accel.mtruffle.mask2);
        break;
    case ACCEL_MDSTRUFFLE:
    case ACCEL_MDSGTRUFFLE:
        fprintf(f, " len1:%u len2:%u\n", accel.mdtruffle.len1, accel.mdtruffle.len2);
        dumpTruffleMasks(f, (const u8 *)&accel.mdtruffle.mask1,
                         (const u8 *)&accel.mdtruffle.mask2);
        dumpTruffleCharReach(f, (const u8 *)&accel.mdtruffle.mask1,
                             (const u8 *)&accel.mdtruffle.mask2);
        break;
    default:
        fprintf(f, "\n");
        break;
--- a/src/nfa/accelcompile.cpp
+++ b/src/nfa/accelcompile.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -225,274 +225,6 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
    aux->accel_type = ACCEL_NONE;
 }
 static
 void buildAccelMulti(const AccelInfo &info, AccelAux *aux) {
    if (info.ma_type == MultibyteAccelInfo::MAT_NONE) {
        DEBUG_PRINTF("no multimatch for us :(");
        return;
    }
    u32 offset = info.multiaccel_offset;
    const CharReach &stops = info.multiaccel_stops;
    assert(aux->accel_type == ACCEL_NONE);
    if (stops.all()) {
        return;
    }
    size_t outs = stops.count();
    DEBUG_PRINTF("%zu outs\n", outs);
    assert(outs && outs < 256);
    switch (info.ma_type) {
    case MultibyteAccelInfo::MAT_LONG:
        if (outs == 1) {
            aux->accel_type = ACCEL_MLVERM;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first();
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
            return;
        }
        if (outs == 2 && stops.isCaselessChar()) {
            aux->accel_type = ACCEL_MLVERM_NOCASE;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first() & CASE_CLEAR;
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
                         aux->verm.c);
            return;
        }
        break;
    case MultibyteAccelInfo::MAT_LONGGRAB:
        if (outs == 1) {
            aux->accel_type = ACCEL_MLGVERM;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first();
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
            return;
        }
        if (outs == 2 && stops.isCaselessChar()) {
            aux->accel_type = ACCEL_MLGVERM_NOCASE;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first() & CASE_CLEAR;
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
                         aux->verm.c);
            return;
        }
        break;
    case MultibyteAccelInfo::MAT_SHIFT:
        if (outs == 1) {
            aux->accel_type = ACCEL_MSVERM;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first();
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
            return;
        }
        if (outs == 2 && stops.isCaselessChar()) {
            aux->accel_type = ACCEL_MSVERM_NOCASE;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first() & CASE_CLEAR;
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
                         aux->verm.c);
            return;
        }
        break;
    case MultibyteAccelInfo::MAT_SHIFTGRAB:
        if (outs == 1) {
            aux->accel_type = ACCEL_MSGVERM;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first();
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
            return;
        }
        if (outs == 2 && stops.isCaselessChar()) {
            aux->accel_type = ACCEL_MSGVERM_NOCASE;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first() & CASE_CLEAR;
            aux->mverm.len = info.ma_len1;
            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
                         aux->verm.c);
            return;
        }
        break;
    case MultibyteAccelInfo::MAT_DSHIFT:
        if (outs == 1) {
            aux->accel_type = ACCEL_MDSVERM;
            aux->mdverm.offset = offset;
            aux->mdverm.c = stops.find_first();
            aux->mdverm.len1 = info.ma_len1;
            aux->mdverm.len2 = info.ma_len2;
            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
            return;
        }
        if (outs == 2 && stops.isCaselessChar()) {
            aux->accel_type = ACCEL_MDSVERM_NOCASE;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first() & CASE_CLEAR;
            aux->mdverm.len1 = info.ma_len1;
            aux->mdverm.len2 = info.ma_len2;
            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
                         aux->verm.c);
            return;
        }
        break;
    case MultibyteAccelInfo::MAT_DSHIFTGRAB:
        if (outs == 1) {
            aux->accel_type = ACCEL_MDSGVERM;
            aux->mdverm.offset = offset;
            aux->mdverm.c = stops.find_first();
            aux->mdverm.len1 = info.ma_len1;
            aux->mdverm.len2 = info.ma_len2;
            DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c);
            return;
        }
        if (outs == 2 && stops.isCaselessChar()) {
            aux->accel_type = ACCEL_MDSGVERM_NOCASE;
            aux->mverm.offset = offset;
            aux->mverm.c = stops.find_first() & CASE_CLEAR;
            aux->mdverm.len1 = info.ma_len1;
            aux->mdverm.len2 = info.ma_len2;
            DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n",
                         aux->verm.c);
            return;
        }
        break;
    default:
        // shouldn't happen
        assert(0);
        return;
    }
    DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
    switch (info.ma_type) {
    case MultibyteAccelInfo::MAT_LONG:
        if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
                (u8 *)&aux->mshufti.hi) == -1) {
            break;
        }
        aux->accel_type = ACCEL_MLSHUFTI;
        aux->mshufti.offset = offset;
        aux->mshufti.len = info.ma_len1;
        return;
    case MultibyteAccelInfo::MAT_LONGGRAB:
        if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
                (u8 *)&aux->mshufti.hi) == -1) {
            break;
        }
        aux->accel_type = ACCEL_MLGSHUFTI;
        aux->mshufti.offset = offset;
        aux->mshufti.len = info.ma_len1;
        return;
    case MultibyteAccelInfo::MAT_SHIFT:
        if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
                (u8 *)&aux->mshufti.hi) == -1) {
            break;
        }
        aux->accel_type = ACCEL_MSSHUFTI;
        aux->mshufti.offset = offset;
        aux->mshufti.len = info.ma_len1;
        return;
    case MultibyteAccelInfo::MAT_SHIFTGRAB:
        if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo,
                (u8 *)&aux->mshufti.hi) == -1) {
            break;
        }
        aux->accel_type = ACCEL_MSGSHUFTI;
        aux->mshufti.offset = offset;
        aux->mshufti.len = info.ma_len1;
        return;
    case MultibyteAccelInfo::MAT_DSHIFT:
        if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo,
                (u8 *)&aux->mdshufti.hi) == -1) {
            break;
        }
        aux->accel_type = ACCEL_MDSSHUFTI;
        aux->mdshufti.offset = offset;
        aux->mdshufti.len1 = info.ma_len1;
        aux->mdshufti.len2 = info.ma_len2;
        return;
    case MultibyteAccelInfo::MAT_DSHIFTGRAB:
        if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo,
                (u8 *)&aux->mdshufti.hi) == -1) {
            break;
        }
        aux->accel_type = ACCEL_MDSGSHUFTI;
        aux->mdshufti.offset = offset;
        aux->mdshufti.len1 = info.ma_len1;
        aux->mdshufti.len2 = info.ma_len2;
        return;
    default:
        // shouldn't happen
        assert(0);
        return;
    }
    DEBUG_PRINTF("shufti build failed, falling through\n");
    if (outs <= ACCEL_MAX_STOP_CHAR) {
        DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
        switch (info.ma_type) {
        case MultibyteAccelInfo::MAT_LONG:
            aux->accel_type = ACCEL_MLTRUFFLE;
            aux->mtruffle.offset = offset;
            aux->mtruffle.len = info.ma_len1;
            truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
                              (u8 *)&aux->mtruffle.mask2);
            break;
        case MultibyteAccelInfo::MAT_LONGGRAB:
            aux->accel_type = ACCEL_MLGTRUFFLE;
            aux->mtruffle.offset = offset;
            aux->mtruffle.len = info.ma_len1;
            truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
                              (u8 *)&aux->mtruffle.mask2);
            break;
        case MultibyteAccelInfo::MAT_SHIFT:
            aux->accel_type = ACCEL_MSTRUFFLE;
            aux->mtruffle.offset = offset;
            aux->mtruffle.len = info.ma_len1;
            truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
                              (u8 *)&aux->mtruffle.mask2);
            break;
        case MultibyteAccelInfo::MAT_SHIFTGRAB:
            aux->accel_type = ACCEL_MSGTRUFFLE;
            aux->mtruffle.offset = offset;
            aux->mtruffle.len = info.ma_len1;
            truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
                              (u8 *)&aux->mtruffle.mask2);
            break;
        case MultibyteAccelInfo::MAT_DSHIFT:
            aux->accel_type = ACCEL_MDSTRUFFLE;
            aux->mdtruffle.offset = offset;
            aux->mdtruffle.len1 = info.ma_len1;
            aux->mdtruffle.len2 = info.ma_len2;
            truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
                              (u8 *)&aux->mdtruffle.mask2);
            break;
        case MultibyteAccelInfo::MAT_DSHIFTGRAB:
            aux->accel_type = ACCEL_MDSGTRUFFLE;
            aux->mdtruffle.offset = offset;
            aux->mdtruffle.len1 = info.ma_len1;
            aux->mdtruffle.len2 = info.ma_len2;
            truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1,
                              (u8 *)&aux->mdtruffle.mask2);
            break;
        default:
            // shouldn't happen
            assert(0);
            return;
        }
        return;
    }
    DEBUG_PRINTF("unable to accelerate multibyte case with %zu outs\n", outs);
 }
 bool buildAccelAux(const AccelInfo &info, AccelAux *aux) {
    assert(aux->accel_type == ACCEL_NONE);
    if (info.single_stops.none()) {
@ -500,9 +232,6 @@ bool buildAccelAux(const AccelInfo &info, AccelAux *aux) {
        aux->accel_type = ACCEL_RED_TAPE;
        aux->generic.offset = info.single_offset;
    }
    if (aux->accel_type == ACCEL_NONE) {
        buildAccelMulti(info, aux);
    }
    if (aux->accel_type == ACCEL_NONE) {
        buildAccelDouble(info, aux);
    }
--- a/src/nfa/accelcompile.h
+++ b/src/nfa/accelcompile.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -37,30 +37,9 @@ union AccelAux;
 namespace ue2 {
 struct MultibyteAccelInfo {
    /* multibyte accel schemes, ordered by strength */
    enum multiaccel_type {
        MAT_SHIFT,
        MAT_SHIFTGRAB,
        MAT_DSHIFT,
        MAT_DSHIFTGRAB,
        MAT_LONG,
        MAT_LONGGRAB,
        MAT_MAX,
        MAT_NONE = MAT_MAX
    };
    CharReach cr;
    u32 offset = 0;
    u32 len1 = 0;
    u32 len2 = 0;
    multiaccel_type type = MAT_NONE;
 };
 struct AccelInfo {
    AccelInfo() : single_offset(0U), double_offset(0U),
-                  single_stops(CharReach::dot()),
+                  single_stops(CharReach::dot()) {}
                  multiaccel_offset(0), ma_len1(0), ma_len2(0),
                  ma_type(MultibyteAccelInfo::MAT_NONE) {}
    u32 single_offset; /**< offset correction to apply to single schemes */
    u32 double_offset; /**< offset correction to apply to double schemes */
    CharReach double_stop1;  /**<  single-byte accel stop literals for double
@ -68,11 +47,6 @@ struct AccelInfo {
    flat_set<std::pair<u8, u8>> double_stop2; /**< double-byte accel stop
                                               * literals */
    CharReach single_stops; /**< escapes for single byte acceleration */
    u32 multiaccel_offset; /**< offset correction to apply to multibyte schemes */
    CharReach multiaccel_stops; /**< escapes for multibyte acceleration */
    u32 ma_len1; /**< multiaccel len1 */
    u32 ma_len2; /**< multiaccel len2 */
    MultibyteAccelInfo::multiaccel_type ma_type; /**< multiaccel type */
 };
 bool buildAccelAux(const AccelInfo &info, AccelAux *aux);
--- a/src/nfa/limex_accel.c
+++ b/src/nfa/limex_accel.c
@ -39,9 +39,6 @@
 #include "nfa_internal.h"
 #include "shufti.h"
 #include "truffle.h"
 #include "multishufti.h"
 #include "multitruffle.h"
 #include "multivermicelli.h"
 #include "ue2common.h"
 #include "vermicelli.h"
 #include "util/arch.h"
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@ -93,8 +93,6 @@ struct precalcAccel {
    CharReach double_cr;
    flat_set<pair<u8, u8>> double_lits; /* double-byte accel stop literals */
    u32 double_offset;
    MultibyteAccelInfo ma_info;
 };
 struct limex_accel_info {
@ -358,16 +356,12 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
 }
 struct AccelBuild {
-    AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0), ma_len1(0),
+    AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0) {}
            ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {}
    NFAVertex v;
    u32 state;
    u32 offset; // offset correction to apply
    CharReach stop1; // single-byte accel stop literals
    flat_set<pair<u8, u8>> stop2; // double-byte accel stop literals
    u32 ma_len1; // multiaccel len1
    u32 ma_len2; // multiaccel len2
    MultibyteAccelInfo::multiaccel_type ma_type; // multiaccel type
 };
 static
@ -382,12 +376,7 @@ void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) {
        build.stop1 = CharReach::dot();
    } else {
        const precalcAccel &precalc = bi.accel.precalc.at(ss);
-        unsigned ma_len = precalc.ma_info.len1 + precalc.ma_info.len2;
+        if (precalc.double_lits.empty()) {
        if (ma_len >= MULTIACCEL_MIN_LEN) {
            build.ma_len1 = precalc.ma_info.len1;
            build.stop1 = precalc.ma_info.cr;
            build.offset = precalc.ma_info.offset;
        } else if (precalc.double_lits.empty()) {
            build.stop1 = precalc.single_cr;
            build.offset = precalc.single_offset;
        } else {
@ -606,7 +595,6 @@ void fillAccelInfo(build_info &bi) {
    limex_accel_info &accel = bi.accel;
    unordered_map<NFAVertex, AccelScheme> &accel_map = accel.accel_map;
    const map<NFAVertex, BoundedRepeatSummary> &br_cyclic = bi.br_cyclic;
    const CompileContext &cc = bi.cc;
    const unordered_map<NFAVertex, u32> &state_ids = bi.state_ids;
    const u32 num_states = bi.num_states;
@ -663,27 +651,17 @@ void fillAccelInfo(build_info &bi) {
        DEBUG_PRINTF("accel %u ok with offset s%u, d%u\n", i, as.offset,
                     as.double_offset);
        // try multibyte acceleration first
        MultibyteAccelInfo mai = nfaCheckMultiAccel(g, states, cc);
        precalcAccel &pa = accel.precalc[state_set];
        useful |= state_set;
        // if we successfully built a multibyte accel scheme, use that
        if (mai.type != MultibyteAccelInfo::MAT_NONE) {
            pa.ma_info = mai;
            DEBUG_PRINTF("multibyte acceleration!\n");
            continue;
        }
        pa.single_offset = as.offset;
        pa.single_cr = as.cr;
        if (as.double_byte.size() != 0) {
            pa.double_offset = as.double_offset;
            pa.double_lits = as.double_byte;
            pa.double_cr = as.double_cr;
-        };
+        }
        useful |= state_set;
    }
    for (const auto &m : accel_map) {
@ -700,19 +678,8 @@ void fillAccelInfo(build_info &bi) {
        state_set.reset();
        state_set.set(state_id);
        bool is_multi = false;
        auto p_it = accel.precalc.find(state_set);
        if (p_it != accel.precalc.end()) {
            const precalcAccel &pa = p_it->second;
            offset = max(pa.double_offset, pa.single_offset);
            is_multi = pa.ma_info.type != MultibyteAccelInfo::MAT_NONE;
            assert(offset <= MAX_ACCEL_DEPTH);
        }
        accel.accelerable.insert(v);
-        if (!is_multi) {
+        findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
            findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]);
        }
    }
 }
@ -954,16 +921,8 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,
            if (contains(accel.precalc, effective_states)) {
                const auto &precalc = accel.precalc.at(effective_states);
-                if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) {
+                ainfo.single_offset = precalc.single_offset;
-                    ainfo.ma_len1 = precalc.ma_info.len1;
+                ainfo.single_stops = precalc.single_cr;
                    ainfo.ma_len2 = precalc.ma_info.len2;
                    ainfo.multiaccel_offset = precalc.ma_info.offset;
                    ainfo.multiaccel_stops = precalc.ma_info.cr;
                    ainfo.ma_type = precalc.ma_info.type;
                } else {
                    ainfo.single_offset = precalc.single_offset;
                    ainfo.single_stops = precalc.single_cr;
                }
            }
        }
--- a/src/nfa/multiaccel_common.h
+++ b/src/nfa/multiaccel_common.h
@ -1,265 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCEL_COMMON_H_
 #define MULTIACCEL_COMMON_H_
 #include "config.h"
 #include "ue2common.h"
 #include "util/join.h"
 #include "util/bitutils.h"
 /*
 * When doing shifting, remember that the total number of shifts should be n-1
 */
 #define VARISHIFT(src, dst, len) \
    do { \
        (dst) &= (src) >> (len); \
    } while (0)
 #define STATIC_SHIFT1(x) \
    do { \
        (x) &= (x) >> 1; \
    } while (0)
 #define STATIC_SHIFT2(x) \
    do { \
        (x) &= (x) >> 2;\
    } while (0)
 #define STATIC_SHIFT4(x) \
    do { \
        (x) &= (x) >> 4; \
    } while (0)
 #define STATIC_SHIFT8(x) \
    do { \
        (x) &= (x) >> 8; \
    } while (0)
 #define SHIFT1(x) \
    do {} while (0)
 #define SHIFT2(x) \
    do { \
        STATIC_SHIFT1(x); \
    } while (0)
 #define SHIFT3(x) \
    do { \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT1(x); \
    } while (0)
 #define SHIFT4(x) \
    do { \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT5(x) \
    do { \
        SHIFT4(x); \
        STATIC_SHIFT1(x); \
    } while (0)
 #define SHIFT6(x) \
    do { \
        SHIFT4(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT7(x) \
    do { \
        SHIFT4(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT8(x) \
    do { \
        SHIFT4(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT9(x) \
    do { \
        SHIFT8(x); \
        STATIC_SHIFT1(x); \
    } while (0)
 #define SHIFT10(x) \
    do { \
        SHIFT8(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT11(x) \
    do { \
        SHIFT8(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT12(x); \
    do { \
        SHIFT8(x);\
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT13(x); \
    do { \
        SHIFT8(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT14(x) \
    do { \
        SHIFT8(x); \
        STATIC_SHIFT2(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT15(x) \
    do { \
        SHIFT8(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT16(x) \
    do { \
        SHIFT8(x); \
        STATIC_SHIFT8(x); \
    } while (0)
 #define SHIFT17(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT1(x); \
    } while (0)
 #define SHIFT18(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT19(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT20(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT21(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT22(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT2(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT23(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT24(x) \
    do { \
        SHIFT16(x); \
        STATIC_SHIFT8(x); \
    } while (0)
 #define SHIFT25(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT1(x); \
    } while (0)
 #define SHIFT26(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT27(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
    } while (0)
 #define SHIFT28(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT29(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT30(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT2(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT31(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT1(x); \
        STATIC_SHIFT2(x); \
        STATIC_SHIFT4(x); \
    } while (0)
 #define SHIFT32(x) \
    do { \
        SHIFT24(x); \
        STATIC_SHIFT8(x); \
    } while (0)
 /*
 * this function is used by 32-bit multiaccel matchers. 32-bit matchers accept
 * a 32-bit integer as a buffer, where low 16 bits is movemask result and
 * high 16 bits are "don't care" values. this function is not expected to return
 * a result higher than 16.
 */
 static really_inline
 const u8 *match32(const u8 *buf, const u32 z) {
    if (unlikely(z != 0)) {
        u32 pos = ctz32(z);
        assert(pos < 16);
        return buf + pos;
    }
    return NULL;
 }
 /*
 * this function is used by 64-bit multiaccel matchers. 64-bit matchers accept
 * a 64-bit integer as a buffer, where low 32 bits is movemask result and
 * high 32 bits are "don't care" values. this function is not expected to return
 * a result higher than 32.
 */
 static really_inline
 const u8 *match64(const u8 *buf, const u64a z) {
    if (unlikely(z != 0)) {
        u32 pos = ctz64(z);
        assert(pos < 32);
        return buf + pos;
    }
    return NULL;
 }
 #endif /* MULTIACCEL_COMMON_H_ */
--- a/src/nfa/multiaccel_compilehelper.cpp
+++ b/src/nfa/multiaccel_compilehelper.cpp
@ -1,439 +0,0 @@
 /*
 * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "multiaccel_compilehelper.h"
 using namespace std;
 using namespace ue2;
 #ifdef DEBUG
 static const char* state_to_str[] = {
    "FIRST_RUN",
    "SECOND_RUN",
    "WAITING_FOR_GRAB",
    "FIRST_TAIL",
    "SECOND_TAIL",
    "STOPPED",
    "INVALID"
 };
 static const char* type_to_str[] = {
    "SHIFT",
    "SHIFTGRAB",
    "DOUBLESHIFT",
    "DOUBLESHIFTGRAB",
    "LONG",
    "LONGGRAB",
    "NONE"
 };
 static
 void dumpMultiaccelState(const accel_data &d) {
    DEBUG_PRINTF("type: %s state: %s len1: %u tlen1: %u len2: %u tlen2: %u\n",
                 type_to_str[(unsigned) d.type],
                 state_to_str[(unsigned) d.state],
                 d.len1, d.tlen1, d.len2, d.tlen2);
 }
 #endif
 /* stop all the matching. this may render most schemes invalid. */
 static
 void stop(accel_data &d) {
    switch (d.state) {
    case STATE_STOPPED:
    case STATE_INVALID:
        break;
    case STATE_FIRST_TAIL:
    case STATE_SECOND_RUN:
        /*
         * Shift matchers are special case, because they have "tails".
         * When shift matcher reaches a mid/endpoint, tail mode is
         * activated, which looks for more matches to extend the match.
         *
         * For example, consider pattern /a{5}ba{3}/. Under normal circumstances,
         * long-grab matcher will be picked for this pattern (matching a run of a's,
         * followed by a not-a), because doubleshift matcher would be confused by
         * consecutive a's and would parse the pattern as a.{0}a.{0}a (two shifts
         * by 1) and throw out the rest of the pattern.
         *
         * With tails, we defer ending the run until we actually run out of
         * matching characters, so the above pattern will now be parsed by
         * doubleshift matcher as /a.{3}a.{3}a/ (two shifts by 4).
         *
         * So if we are stopping shift matchers, we should check if we aren't in
         * the process of matching first tail or second run. If we are, we can't
         * finish the second run as we are stopping, but we can try and split
         * the first tail instead to obtain a valid second run.
         */
        if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
                 d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.tlen1 == 0) {
            // can't split an empty void...
            d.state = STATE_INVALID;
            break;
        }
        d.len2 = 0;
        d.state = STATE_STOPPED;
        break;
    case STATE_SECOND_TAIL:
        d.state = STATE_STOPPED;
        break;
    case STATE_WAITING_FOR_GRAB:
    case STATE_FIRST_RUN:
        if (d.type == MultibyteAccelInfo::MAT_LONG) {
            d.state = STATE_STOPPED;
        } else {
            d.state = STATE_INVALID;
        }
        break;
    }
 }
 static
 void validate(accel_data &d, unsigned max_len) {
    // try and fit in all our tails
    if (d.len1 + d.tlen1 + d.len2 + d.tlen2 < max_len && d.len2 > 0) {
        // case 1: everything fits in
        d.len1 += d.tlen1;
        d.len2 += d.tlen2;
        d.tlen1 = 0;
        d.tlen2 = 0;
    } else if (d.len1 + d.tlen1 + d.len2 < max_len && d.len2 > 0) {
        // case 2: everything but the second tail fits in
        d.len1 += d.tlen1;
        d.tlen1 = 0;
        // try going for a partial tail
        if (d.tlen2 != 0) {
            int new_tlen2 = max_len - 1 - d.len1 - d.len2;
            if (new_tlen2 > 0) {
                d.len2 += new_tlen2;
            }
            d.tlen2 = 0;
        }
    } else if (d.len1 + d.tlen1 < max_len) {
        // case 3: first run and its tail fits in
        if (d.type == MultibyteAccelInfo::MAT_DSHIFT ||
                 d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
            // split the tail into a second run
            d.len2 = d.tlen1;
        } else {
            d.len1 += d.tlen1;
            d.len2 = 0;
        }
        d.tlen1 = 0;
        d.tlen2 = 0;
    } else if (d.len1 < max_len) {
        // case 4: nothing but the first run fits in
        // try going for a partial tail
        if (d.tlen1 != 0) {
            int new_tlen1 = max_len - 1 - d.len1;
            if (new_tlen1 > 0) {
                d.len1 += new_tlen1;
            }
            d.tlen1 = 0;
        }
        d.len2 = 0;
        d.tlen2 = 0;
    }
    // if we removed our second run, doubleshift matchers are no longer valid
    if ((d.type == MultibyteAccelInfo::MAT_DSHIFT ||
                 d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.len2 == 0) {
        d.state = STATE_INVALID;
    } else if ((d.type == MultibyteAccelInfo::MAT_LONG) && d.len1 >= max_len) {
        // long matchers can just stop whenever they want to
        d.len1 = max_len - 1;
    }
    // now, general sanity checks
    if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) >= max_len) {
        d.state = STATE_INVALID;
    }
    if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) < MULTIACCEL_MIN_LEN) {
        d.state = STATE_INVALID;
    }
 }
 static
 void match(accel_data &d, const CharReach &ref_cr, const CharReach &cur_cr) {
    switch (d.type) {
    case MultibyteAccelInfo::MAT_LONG:
        {
            /*
             * For long matcher, we want lots of consecutive same-or-subset
             * char-reaches
             */
            if ((ref_cr & cur_cr) == cur_cr) {
                d.len1++;
            } else {
                d.state = STATE_STOPPED;
            }
        }
        break;
    case MultibyteAccelInfo::MAT_LONGGRAB:
        {
            /*
             * For long-grab matcher, we want lots of consecutive same-or-subset
             * char-reaches with a negative match in the end.
             */
            if ((ref_cr & cur_cr) == cur_cr) {
                d.len1++;
            } else if (!(ref_cr & cur_cr).any()) {
                /* we grabbed, stop immediately */
                d.state = STATE_STOPPED;
            } else {
                /* our run-n-grab was interrupted; mark as invalid */
                d.state = STATE_INVALID;
            }
        }
        break;
    case MultibyteAccelInfo::MAT_SHIFTGRAB:
        {
            /*
             * For shift-grab matcher, we want two matches separated by anything;
             * however the second vertex *must* be a negative (non-overlapping) match.
             *
             * Shiftgrab matcher is identical to shift except for presence of grab.
             */
            if (d.state == STATE_WAITING_FOR_GRAB) {
                if ((ref_cr & cur_cr).any()) {
                    d.state = STATE_INVALID;
                } else {
                    d.state = STATE_FIRST_RUN;
                    d.len1++;
                }
                return;
            }
        }
        /* no break, falling through */
    case MultibyteAccelInfo::MAT_SHIFT:
        {
            /*
             * For shift-matcher, we want two matches separated by anything.
             */
            if (ref_cr == cur_cr) {
                // keep matching tail
                switch (d.state) {
                case STATE_FIRST_RUN:
                    d.state = STATE_FIRST_TAIL;
                    break;
                case STATE_FIRST_TAIL:
                    d.tlen1++;
                    break;
                default:
                    // shouldn't happen
                    assert(0);
                }
            } else {
                switch (d.state) {
                case STATE_FIRST_RUN:
                    // simply advance
                    d.len1++;
                    break;
                case STATE_FIRST_TAIL:
                    // we found a non-matching char after tail, so stop
                    d.state = STATE_STOPPED;
                    break;
                default:
                    // shouldn't happen
                    assert(0);
                }
            }
        }
        break;
    case MultibyteAccelInfo::MAT_DSHIFTGRAB:
        {
            /*
             * For double shift-grab matcher, we want two matches separated by
             * either negative matches or dots; however the second vertex *must*
             * be a negative match.
             *
             * Doubleshiftgrab matcher is identical to doubleshift except for
             * presence of grab.
             */
            if (d.state == STATE_WAITING_FOR_GRAB) {
                if ((ref_cr & cur_cr).any()) {
                    d.state = STATE_INVALID;
                } else {
                    d.state = STATE_FIRST_RUN;
                    d.len1++;
                }
                return;
            }
        }
        /* no break, falling through */
    case MultibyteAccelInfo::MAT_DSHIFT:
        {
            /*
             * For double shift matcher, we want three matches, each separated
             * by a lot of anything.
             *
             * Doubleshift matcher is complicated by presence of tails.
             */
            if (ref_cr == cur_cr) {
                // decide if we are activating second shift or matching tails
                switch (d.state) {
                case STATE_FIRST_RUN:
                    d.state = STATE_FIRST_TAIL;
                    d.len2 = 1; // we're now ready for our second run
                    break;
                case STATE_FIRST_TAIL:
                    d.tlen1++;
                    break;
                case STATE_SECOND_RUN:
                    d.state = STATE_SECOND_TAIL;
                    break;
                case STATE_SECOND_TAIL:
                    d.tlen2++;
                    break;
                default:
                    // shouldn't happen
                    assert(0);
                }
            } else {
                switch (d.state) {
                case STATE_FIRST_RUN:
                    d.len1++;
                    break;
                case STATE_FIRST_TAIL:
                    // start second run
                    d.state = STATE_SECOND_RUN;
                    d.len2++;
                    break;
                case STATE_SECOND_RUN:
                    d.len2++;
                    break;
                case STATE_SECOND_TAIL:
                    // stop
                    d.state = STATE_STOPPED;
                    break;
                default:
                    // shouldn't happen
                    assert(0);
                }
            }
        }
        break;
    default:
        // shouldn't happen
        assert(0);
        break;
    }
 }
 MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr,
                                                 u32 off, unsigned max_length)
    : cr(ref_cr), offset(off), max_len(max_length) {
    int accel_num = (int) MultibyteAccelInfo::MAT_MAX;
    accels.resize(accel_num);
    // mark everything as valid
    for (int i = 0; i < accel_num; i++) {
        accel_data &ad = accels[i];
        ad.len1 = 1;
        ad.type = (MultibyteAccelInfo::multiaccel_type) i;
        /* for shift-grab matchers, we are waiting for the grab right at the start */
        if (ad.type == MultibyteAccelInfo::MAT_SHIFTGRAB
                || ad.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) {
            ad.state = STATE_WAITING_FOR_GRAB;
        } else {
            ad.state = STATE_FIRST_RUN;
        }
    }
 }
 bool MultiaccelCompileHelper::canAdvance() {
    for (const accel_data &ad : accels) {
        if (ad.state != STATE_STOPPED && ad.state != STATE_INVALID) {
            return true;
        }
    }
    return false;
 }
 void MultiaccelCompileHelper::advance(const CharReach &cur_cr) {
    for (accel_data &ad : accels) {
        if (ad.state == STATE_STOPPED || ad.state == STATE_INVALID) {
            continue;
        }
        match(ad, cr, cur_cr);
 #ifdef DEBUG
        dumpMultiaccelState(ad);
 #endif
    }
 }
 MultibyteAccelInfo MultiaccelCompileHelper::getBestScheme() {
    int best_len = 0;
    accel_data best;
    DEBUG_PRINTF("Stopping multiaccel compile\n");
    for (accel_data &ad : accels) {
        // stop our matching
        stop(ad);
        validate(ad, max_len);
 #ifdef DEBUG
        dumpMultiaccelState(ad);
 #endif
        // skip invalid schemes
        if (ad.state == STATE_INVALID) {
            continue;
        }
        DEBUG_PRINTF("Marking as viable\n");
        // TODO: relative strengths of accel schemes? maybe e.g. a shorter
        // long match would in some cases be preferable to a longer
        // double shift match (for example, depending on length)?
        int as_len = ad.len1 + ad.len2;
        if (as_len >= best_len) {
            DEBUG_PRINTF("Marking as best\n");
            best_len = as_len;
            best = ad;
        }
    }
    // if we found at least one accel scheme, return it
    if (best.state != STATE_INVALID) {
 #ifdef DEBUG
        DEBUG_PRINTF("Picked best multiaccel state:\n");
        dumpMultiaccelState(best);
 #endif
        MultibyteAccelInfo info;
        info.cr = cr;
        info.offset = offset;
        info.len1 = best.len1;
        info.len2 = best.len2;
        info.type = best.type;
        return info;
    }
    return MultibyteAccelInfo();
 }
--- a/src/nfa/multiaccel_compilehelper.h
+++ b/src/nfa/multiaccel_compilehelper.h
@ -1,75 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCELCOMPILE_H_
 #define MULTIACCELCOMPILE_H_
 #include "ue2common.h"
 #include "nfagraph/ng_limex_accel.h"
 #include <vector>
 namespace ue2 {
 /* accel scheme state machine */
 enum accel_scheme_state {
    STATE_FIRST_RUN,
    STATE_SECOND_RUN,
    STATE_WAITING_FOR_GRAB,
    STATE_FIRST_TAIL,
    STATE_SECOND_TAIL,
    STATE_STOPPED,
    STATE_INVALID
 };
 struct accel_data {
    MultibyteAccelInfo::multiaccel_type type = MultibyteAccelInfo::MAT_NONE;
    accel_scheme_state state = STATE_INVALID;
    unsigned len1 = 0; /* length of first run */
    unsigned len2 = 0; /* length of second run, if present */
    unsigned tlen1 = 0; /* first tail length */
    unsigned tlen2 = 0; /* second tail length */
 };
 class MultiaccelCompileHelper {
 private:
    const CharReach &cr;
    u32 offset;
    std::vector<accel_data> accels;
    unsigned max_len;
 public:
    MultiaccelCompileHelper(const CharReach &cr, u32 off, unsigned max_len);
    bool canAdvance();
    MultibyteAccelInfo getBestScheme();
    void advance(const ue2::CharReach &cr);
 };
 }; // namespace
 #endif /* MULTIACCELCOMPILE_H_ */
--- a/src/nfa/multiaccel_doubleshift.h
+++ b/src/nfa/multiaccel_doubleshift.h
@ -1,149 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCEL_DOUBLESHIFT_H_
 #define MULTIACCEL_DOUBLESHIFT_H_
 #include "multiaccel_common.h"
 #define DOUBLESHIFT_MATCH(len, match_t, match_sz) \
    static really_inline \
    const u8 * JOIN4(doubleshiftMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
        if (unlikely(z)) { \
            match_t tmp = z; \
            z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
            tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \
            VARISHIFT(z, z, len); \
            VARISHIFT(tmp, tmp, len2); \
            VARISHIFT(tmp, z, len); \
            return JOIN(match, match_sz)(buf, z); \
        } \
        return NULL; \
    }
 #define DOUBLESHIFT_MATCH_32_DEF(n) \
        DOUBLESHIFT_MATCH(n, u32, 32)
 #define DOUBLESHIFT_MATCH_64_DEF(n) \
        DOUBLESHIFT_MATCH(n, u64a, 64)
 #define DOUBLESHIFT_MATCH_DEF(n) \
    DOUBLESHIFT_MATCH_32_DEF(n) \
    DOUBLESHIFT_MATCH_64_DEF(n)
 DOUBLESHIFT_MATCH_DEF(1)
 DOUBLESHIFT_MATCH_DEF(2)
 DOUBLESHIFT_MATCH_DEF(3)
 DOUBLESHIFT_MATCH_DEF(4)
 DOUBLESHIFT_MATCH_DEF(5)
 DOUBLESHIFT_MATCH_DEF(6)
 DOUBLESHIFT_MATCH_DEF(7)
 DOUBLESHIFT_MATCH_DEF(8)
 DOUBLESHIFT_MATCH_DEF(9)
 DOUBLESHIFT_MATCH_DEF(10)
 DOUBLESHIFT_MATCH_DEF(11)
 DOUBLESHIFT_MATCH_DEF(12)
 DOUBLESHIFT_MATCH_DEF(13)
 DOUBLESHIFT_MATCH_DEF(14)
 DOUBLESHIFT_MATCH_DEF(15)
 DOUBLESHIFT_MATCH_64_DEF(16)
 DOUBLESHIFT_MATCH_64_DEF(17)
 DOUBLESHIFT_MATCH_64_DEF(18)
 DOUBLESHIFT_MATCH_64_DEF(19)
 DOUBLESHIFT_MATCH_64_DEF(20)
 DOUBLESHIFT_MATCH_64_DEF(21)
 DOUBLESHIFT_MATCH_64_DEF(22)
 DOUBLESHIFT_MATCH_64_DEF(23)
 DOUBLESHIFT_MATCH_64_DEF(24)
 DOUBLESHIFT_MATCH_64_DEF(25)
 DOUBLESHIFT_MATCH_64_DEF(26)
 DOUBLESHIFT_MATCH_64_DEF(27)
 DOUBLESHIFT_MATCH_64_DEF(28)
 DOUBLESHIFT_MATCH_64_DEF(29)
 DOUBLESHIFT_MATCH_64_DEF(30)
 DOUBLESHIFT_MATCH_64_DEF(31)
 static
 const UNUSED u8 * (*doubleshift_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
 {
 // skip the first
    0,
    &doubleshiftMatch_32_1,
    &doubleshiftMatch_32_2,
    &doubleshiftMatch_32_3,
    &doubleshiftMatch_32_4,
    &doubleshiftMatch_32_5,
    &doubleshiftMatch_32_6,
    &doubleshiftMatch_32_7,
    &doubleshiftMatch_32_8,
    &doubleshiftMatch_32_9,
    &doubleshiftMatch_32_10,
    &doubleshiftMatch_32_11,
    &doubleshiftMatch_32_12,
    &doubleshiftMatch_32_13,
    &doubleshiftMatch_32_14,
    &doubleshiftMatch_32_15,
 };
 static
 const UNUSED u8 * (*doubleshift_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
 {
 // skip the first
    0,
    &doubleshiftMatch_64_1,
    &doubleshiftMatch_64_2,
    &doubleshiftMatch_64_3,
    &doubleshiftMatch_64_4,
    &doubleshiftMatch_64_5,
    &doubleshiftMatch_64_6,
    &doubleshiftMatch_64_7,
    &doubleshiftMatch_64_8,
    &doubleshiftMatch_64_9,
    &doubleshiftMatch_64_10,
    &doubleshiftMatch_64_11,
    &doubleshiftMatch_64_12,
    &doubleshiftMatch_64_13,
    &doubleshiftMatch_64_14,
    &doubleshiftMatch_64_15,
    &doubleshiftMatch_64_16,
    &doubleshiftMatch_64_17,
    &doubleshiftMatch_64_18,
    &doubleshiftMatch_64_19,
    &doubleshiftMatch_64_20,
    &doubleshiftMatch_64_21,
    &doubleshiftMatch_64_22,
    &doubleshiftMatch_64_23,
    &doubleshiftMatch_64_24,
    &doubleshiftMatch_64_25,
    &doubleshiftMatch_64_26,
    &doubleshiftMatch_64_27,
    &doubleshiftMatch_64_28,
    &doubleshiftMatch_64_29,
    &doubleshiftMatch_64_30,
    &doubleshiftMatch_64_31,
 };
 #endif /* MULTIACCEL_DOUBLESHIFT_H_ */
--- a/src/nfa/multiaccel_doubleshiftgrab.h
+++ b/src/nfa/multiaccel_doubleshiftgrab.h
@ -1,152 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCEL_DOUBLESHIFTGRAB_H_
 #define MULTIACCEL_DOUBLESHIFTGRAB_H_
 #include "multiaccel_common.h"
 #define DOUBLESHIFTGRAB_MATCH(len, match_t, match_sz) \
    static really_inline \
    const u8 * JOIN4(doubleshiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\
        if (unlikely(z)) { \
            match_t neg = ~z; \
            match_t tmp = z; \
            z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
            tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \
            neg |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
            VARISHIFT(z, z, len); \
            VARISHIFT(tmp, tmp, len2); \
            VARISHIFT(neg, z, 1); \
            VARISHIFT(tmp, z, len); \
            return JOIN(match, match_sz)(buf, z); \
        } \
        return NULL; \
    }
 #define DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
        DOUBLESHIFTGRAB_MATCH(n, u32, 32)
 #define DOUBLESHIFTGRAB_MATCH_64_DEF(n) \
        DOUBLESHIFTGRAB_MATCH(n, u64a, 64)
 #define DOUBLESHIFTGRAB_MATCH_DEF(n) \
    DOUBLESHIFTGRAB_MATCH_32_DEF(n) \
    DOUBLESHIFTGRAB_MATCH_64_DEF(n)
 DOUBLESHIFTGRAB_MATCH_DEF(1)
 DOUBLESHIFTGRAB_MATCH_DEF(2)
 DOUBLESHIFTGRAB_MATCH_DEF(3)
 DOUBLESHIFTGRAB_MATCH_DEF(4)
 DOUBLESHIFTGRAB_MATCH_DEF(5)
 DOUBLESHIFTGRAB_MATCH_DEF(6)
 DOUBLESHIFTGRAB_MATCH_DEF(7)
 DOUBLESHIFTGRAB_MATCH_DEF(8)
 DOUBLESHIFTGRAB_MATCH_DEF(9)
 DOUBLESHIFTGRAB_MATCH_DEF(10)
 DOUBLESHIFTGRAB_MATCH_DEF(11)
 DOUBLESHIFTGRAB_MATCH_DEF(12)
 DOUBLESHIFTGRAB_MATCH_DEF(13)
 DOUBLESHIFTGRAB_MATCH_DEF(14)
 DOUBLESHIFTGRAB_MATCH_DEF(15)
 DOUBLESHIFTGRAB_MATCH_64_DEF(16)
 DOUBLESHIFTGRAB_MATCH_64_DEF(17)
 DOUBLESHIFTGRAB_MATCH_64_DEF(18)
 DOUBLESHIFTGRAB_MATCH_64_DEF(19)
 DOUBLESHIFTGRAB_MATCH_64_DEF(20)
 DOUBLESHIFTGRAB_MATCH_64_DEF(21)
 DOUBLESHIFTGRAB_MATCH_64_DEF(22)
 DOUBLESHIFTGRAB_MATCH_64_DEF(23)
 DOUBLESHIFTGRAB_MATCH_64_DEF(24)
 DOUBLESHIFTGRAB_MATCH_64_DEF(25)
 DOUBLESHIFTGRAB_MATCH_64_DEF(26)
 DOUBLESHIFTGRAB_MATCH_64_DEF(27)
 DOUBLESHIFTGRAB_MATCH_64_DEF(28)
 DOUBLESHIFTGRAB_MATCH_64_DEF(29)
 DOUBLESHIFTGRAB_MATCH_64_DEF(30)
 DOUBLESHIFTGRAB_MATCH_64_DEF(31)
 static
 const UNUSED u8 * (*doubleshiftgrab_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) =
 {
 // skip the first
    0,
    &doubleshiftgrabMatch_32_1,
    &doubleshiftgrabMatch_32_2,
    &doubleshiftgrabMatch_32_3,
    &doubleshiftgrabMatch_32_4,
    &doubleshiftgrabMatch_32_5,
    &doubleshiftgrabMatch_32_6,
    &doubleshiftgrabMatch_32_7,
    &doubleshiftgrabMatch_32_8,
    &doubleshiftgrabMatch_32_9,
    &doubleshiftgrabMatch_32_10,
    &doubleshiftgrabMatch_32_11,
    &doubleshiftgrabMatch_32_12,
    &doubleshiftgrabMatch_32_13,
    &doubleshiftgrabMatch_32_14,
    &doubleshiftgrabMatch_32_15,
 };
 static
 const UNUSED u8 * (*doubleshiftgrab_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) =
 {
 // skip the first
    0,
    &doubleshiftgrabMatch_64_1,
    &doubleshiftgrabMatch_64_2,
    &doubleshiftgrabMatch_64_3,
    &doubleshiftgrabMatch_64_4,
    &doubleshiftgrabMatch_64_5,
    &doubleshiftgrabMatch_64_6,
    &doubleshiftgrabMatch_64_7,
    &doubleshiftgrabMatch_64_8,
    &doubleshiftgrabMatch_64_9,
    &doubleshiftgrabMatch_64_10,
    &doubleshiftgrabMatch_64_11,
    &doubleshiftgrabMatch_64_12,
    &doubleshiftgrabMatch_64_13,
    &doubleshiftgrabMatch_64_14,
    &doubleshiftgrabMatch_64_15,
    &doubleshiftgrabMatch_64_16,
    &doubleshiftgrabMatch_64_17,
    &doubleshiftgrabMatch_64_18,
    &doubleshiftgrabMatch_64_19,
    &doubleshiftgrabMatch_64_20,
    &doubleshiftgrabMatch_64_21,
    &doubleshiftgrabMatch_64_22,
    &doubleshiftgrabMatch_64_23,
    &doubleshiftgrabMatch_64_24,
    &doubleshiftgrabMatch_64_25,
    &doubleshiftgrabMatch_64_26,
    &doubleshiftgrabMatch_64_27,
    &doubleshiftgrabMatch_64_28,
    &doubleshiftgrabMatch_64_29,
    &doubleshiftgrabMatch_64_30,
    &doubleshiftgrabMatch_64_31,
 };
 #endif /* MULTIACCEL_DOUBLESHIFTGRAB_H_ */
--- a/src/nfa/multiaccel_long.h
+++ b/src/nfa/multiaccel_long.h
@ -1,145 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCEL_LONG_H_
 #define MULTIACCEL_LONG_H_
 #include "multiaccel_common.h"
 #define LONG_MATCH(len, match_t, match_sz) \
    static really_inline \
    const u8 * JOIN4(longMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
        if (unlikely(z)) { \
            z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \
            JOIN(SHIFT, len)(z); \
            return JOIN(match, match_sz)(buf, z); \
        } \
        return NULL; \
    }
 #define LONG_MATCH_32_DEF(n) \
        LONG_MATCH(n, u32, 32)
 #define LONG_MATCH_64_DEF(n) \
        LONG_MATCH(n, u64a, 64)
 #define LONG_MATCH_DEF(n) \
    LONG_MATCH_32_DEF(n) \
    LONG_MATCH_64_DEF(n)
 LONG_MATCH_DEF(1)
 LONG_MATCH_DEF(2)
 LONG_MATCH_DEF(3)
 LONG_MATCH_DEF(4)
 LONG_MATCH_DEF(5)
 LONG_MATCH_DEF(6)
 LONG_MATCH_DEF(7)
 LONG_MATCH_DEF(8)
 LONG_MATCH_DEF(9)
 LONG_MATCH_DEF(10)
 LONG_MATCH_DEF(11)
 LONG_MATCH_DEF(12)
 LONG_MATCH_DEF(13)
 LONG_MATCH_DEF(14)
 LONG_MATCH_DEF(15)
 LONG_MATCH_64_DEF(16)
 LONG_MATCH_64_DEF(17)
 LONG_MATCH_64_DEF(18)
 LONG_MATCH_64_DEF(19)
 LONG_MATCH_64_DEF(20)
 LONG_MATCH_64_DEF(21)
 LONG_MATCH_64_DEF(22)
 LONG_MATCH_64_DEF(23)
 LONG_MATCH_64_DEF(24)
 LONG_MATCH_64_DEF(25)
 LONG_MATCH_64_DEF(26)
 LONG_MATCH_64_DEF(27)
 LONG_MATCH_64_DEF(28)
 LONG_MATCH_64_DEF(29)
 LONG_MATCH_64_DEF(30)
 LONG_MATCH_64_DEF(31)
 static
 const UNUSED u8 *(*long_match_funcs_32[])(const u8 *buf, u32 z) =
 {
    // skip the first three
     0,
     &longMatch_32_1,
     &longMatch_32_2,
     &longMatch_32_3,
     &longMatch_32_4,
     &longMatch_32_5,
     &longMatch_32_6,
     &longMatch_32_7,
     &longMatch_32_8,
     &longMatch_32_9,
     &longMatch_32_10,
     &longMatch_32_11,
     &longMatch_32_12,
     &longMatch_32_13,
     &longMatch_32_14,
     &longMatch_32_15,
 };
 static
 const UNUSED u8 *(*long_match_funcs_64[])(const u8 *buf, u64a z) =
 {
 // skip the first three
    0,
    &longMatch_64_1,
    &longMatch_64_2,
    &longMatch_64_3,
    &longMatch_64_4,
    &longMatch_64_5,
    &longMatch_64_6,
    &longMatch_64_7,
    &longMatch_64_8,
    &longMatch_64_9,
    &longMatch_64_10,
    &longMatch_64_11,
    &longMatch_64_12,
    &longMatch_64_13,
    &longMatch_64_14,
    &longMatch_64_15,
    &longMatch_64_16,
    &longMatch_64_17,
    &longMatch_64_18,
    &longMatch_64_19,
    &longMatch_64_20,
    &longMatch_64_21,
    &longMatch_64_22,
    &longMatch_64_23,
    &longMatch_64_24,
    &longMatch_64_25,
    &longMatch_64_26,
    &longMatch_64_27,
    &longMatch_64_28,
    &longMatch_64_29,
    &longMatch_64_30,
    &longMatch_64_31,
 };
 #endif /* MULTIACCEL_LONG_H_ */
--- a/src/nfa/multiaccel_longgrab.h
+++ b/src/nfa/multiaccel_longgrab.h
@ -1,148 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCEL_LONGGRAB_H_
 #define MULTIACCEL_LONGGRAB_H_
 #include "multiaccel_common.h"
 #define LONGGRAB_MATCH(len, match_t, match_sz) \
    static really_inline \
    const u8 * JOIN4(longgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \
        if (unlikely(z)) { \
            match_t tmp = ~z; \
            tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
            z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \
            JOIN(SHIFT, len)(z); \
            VARISHIFT(tmp, z, len); \
            return JOIN(match, match_sz)(buf, z); \
        } \
        return NULL; \
    }
 #define LONGGRAB_MATCH_32_DEF(n) \
        LONGGRAB_MATCH(n, u32, 32)
 #define LONGGRAB_MATCH_64_DEF(n) \
        LONGGRAB_MATCH(n, u64a, 64)
 #define LONGGRAB_MATCH_DEF(n) \
    LONGGRAB_MATCH_32_DEF(n) \
    LONGGRAB_MATCH_64_DEF(n)
 LONGGRAB_MATCH_DEF(1)
 LONGGRAB_MATCH_DEF(2)
 LONGGRAB_MATCH_DEF(3)
 LONGGRAB_MATCH_DEF(4)
 LONGGRAB_MATCH_DEF(5)
 LONGGRAB_MATCH_DEF(6)
 LONGGRAB_MATCH_DEF(7)
 LONGGRAB_MATCH_DEF(8)
 LONGGRAB_MATCH_DEF(9)
 LONGGRAB_MATCH_DEF(10)
 LONGGRAB_MATCH_DEF(11)
 LONGGRAB_MATCH_DEF(12)
 LONGGRAB_MATCH_DEF(13)
 LONGGRAB_MATCH_DEF(14)
 LONGGRAB_MATCH_DEF(15)
 LONGGRAB_MATCH_64_DEF(16)
 LONGGRAB_MATCH_64_DEF(17)
 LONGGRAB_MATCH_64_DEF(18)
 LONGGRAB_MATCH_64_DEF(19)
 LONGGRAB_MATCH_64_DEF(20)
 LONGGRAB_MATCH_64_DEF(21)
 LONGGRAB_MATCH_64_DEF(22)
 LONGGRAB_MATCH_64_DEF(23)
 LONGGRAB_MATCH_64_DEF(24)
 LONGGRAB_MATCH_64_DEF(25)
 LONGGRAB_MATCH_64_DEF(26)
 LONGGRAB_MATCH_64_DEF(27)
 LONGGRAB_MATCH_64_DEF(28)
 LONGGRAB_MATCH_64_DEF(29)
 LONGGRAB_MATCH_64_DEF(30)
 LONGGRAB_MATCH_64_DEF(31)
 static
 const UNUSED u8 *(*longgrab_match_funcs_32[])(const u8 *buf, u32 z) =
 {
 // skip the first three
     0,
     &longgrabMatch_32_1,
     &longgrabMatch_32_2,
     &longgrabMatch_32_3,
     &longgrabMatch_32_4,
     &longgrabMatch_32_5,
     &longgrabMatch_32_6,
     &longgrabMatch_32_7,
     &longgrabMatch_32_8,
     &longgrabMatch_32_9,
     &longgrabMatch_32_10,
     &longgrabMatch_32_11,
     &longgrabMatch_32_12,
     &longgrabMatch_32_13,
     &longgrabMatch_32_14,
     &longgrabMatch_32_15,
 };
 static
 const UNUSED u8 *(*longgrab_match_funcs_64[])(const u8 *buf, u64a z) =
 {
 // skip the first three
    0,
    &longgrabMatch_64_1,
    &longgrabMatch_64_2,
    &longgrabMatch_64_3,
    &longgrabMatch_64_4,
    &longgrabMatch_64_5,
    &longgrabMatch_64_6,
    &longgrabMatch_64_7,
    &longgrabMatch_64_8,
    &longgrabMatch_64_9,
    &longgrabMatch_64_10,
    &longgrabMatch_64_11,
    &longgrabMatch_64_12,
    &longgrabMatch_64_13,
    &longgrabMatch_64_14,
    &longgrabMatch_64_15,
    &longgrabMatch_64_16,
    &longgrabMatch_64_17,
    &longgrabMatch_64_18,
    &longgrabMatch_64_19,
    &longgrabMatch_64_20,
    &longgrabMatch_64_21,
    &longgrabMatch_64_22,
    &longgrabMatch_64_23,
    &longgrabMatch_64_24,
    &longgrabMatch_64_25,
    &longgrabMatch_64_26,
    &longgrabMatch_64_27,
    &longgrabMatch_64_28,
    &longgrabMatch_64_29,
    &longgrabMatch_64_30,
    &longgrabMatch_64_31,
 };
 #endif /* MULTIACCEL_LONGGRAB_H_ */
--- a/src/nfa/multiaccel_shift.h
+++ b/src/nfa/multiaccel_shift.h
@ -1,145 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCEL_SHIFT_H_
 #define MULTIACCEL_SHIFT_H_
 #include "multiaccel_common.h"
 #define SHIFT_MATCH(len, match_t, match_sz) \
    static really_inline \
    const u8 * JOIN4(shiftMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\
        if (unlikely(z)) { \
            z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
            VARISHIFT(z, z, len); \
            return JOIN(match, match_sz)(buf, z); \
        } \
        return NULL; \
    }
 #define SHIFT_MATCH_32_DEF(n) \
        SHIFT_MATCH(n, u32, 32)
 #define SHIFT_MATCH_64_DEF(n) \
        SHIFT_MATCH(n, u64a, 64)
 #define SHIFT_MATCH_DEF(n) \
    SHIFT_MATCH_32_DEF(n) \
    SHIFT_MATCH_64_DEF(n)
 SHIFT_MATCH_DEF(1)
 SHIFT_MATCH_DEF(2)
 SHIFT_MATCH_DEF(3)
 SHIFT_MATCH_DEF(4)
 SHIFT_MATCH_DEF(5)
 SHIFT_MATCH_DEF(6)
 SHIFT_MATCH_DEF(7)
 SHIFT_MATCH_DEF(8)
 SHIFT_MATCH_DEF(9)
 SHIFT_MATCH_DEF(10)
 SHIFT_MATCH_DEF(11)
 SHIFT_MATCH_DEF(12)
 SHIFT_MATCH_DEF(13)
 SHIFT_MATCH_DEF(14)
 SHIFT_MATCH_DEF(15)
 SHIFT_MATCH_64_DEF(16)
 SHIFT_MATCH_64_DEF(17)
 SHIFT_MATCH_64_DEF(18)
 SHIFT_MATCH_64_DEF(19)
 SHIFT_MATCH_64_DEF(20)
 SHIFT_MATCH_64_DEF(21)
 SHIFT_MATCH_64_DEF(22)
 SHIFT_MATCH_64_DEF(23)
 SHIFT_MATCH_64_DEF(24)
 SHIFT_MATCH_64_DEF(25)
 SHIFT_MATCH_64_DEF(26)
 SHIFT_MATCH_64_DEF(27)
 SHIFT_MATCH_64_DEF(28)
 SHIFT_MATCH_64_DEF(29)
 SHIFT_MATCH_64_DEF(30)
 SHIFT_MATCH_64_DEF(31)
 static
 const UNUSED u8 * (*shift_match_funcs_32[])(const u8 *buf, u32 z) =
 {
 // skip the first
   0,
   &shiftMatch_32_1,
   &shiftMatch_32_2,
   &shiftMatch_32_3,
   &shiftMatch_32_4,
   &shiftMatch_32_5,
   &shiftMatch_32_6,
   &shiftMatch_32_7,
   &shiftMatch_32_8,
   &shiftMatch_32_9,
   &shiftMatch_32_10,
   &shiftMatch_32_11,
   &shiftMatch_32_12,
   &shiftMatch_32_13,
   &shiftMatch_32_14,
   &shiftMatch_32_15,
 };
 static
 const UNUSED u8 * (*shift_match_funcs_64[])(const u8 *buf, u64a z) =
 {
 // skip the first
    0,
    &shiftMatch_64_1,
    &shiftMatch_64_2,
    &shiftMatch_64_3,
    &shiftMatch_64_4,
    &shiftMatch_64_5,
    &shiftMatch_64_6,
    &shiftMatch_64_7,
    &shiftMatch_64_8,
    &shiftMatch_64_9,
    &shiftMatch_64_10,
    &shiftMatch_64_11,
    &shiftMatch_64_12,
    &shiftMatch_64_13,
    &shiftMatch_64_14,
    &shiftMatch_64_15,
    &shiftMatch_64_16,
    &shiftMatch_64_17,
    &shiftMatch_64_18,
    &shiftMatch_64_19,
    &shiftMatch_64_20,
    &shiftMatch_64_21,
    &shiftMatch_64_22,
    &shiftMatch_64_23,
    &shiftMatch_64_24,
    &shiftMatch_64_25,
    &shiftMatch_64_26,
    &shiftMatch_64_27,
    &shiftMatch_64_28,
    &shiftMatch_64_29,
    &shiftMatch_64_30,
    &shiftMatch_64_31,
 };
 #endif /* MULTIACCEL_SHIFT_H_ */
--- a/src/nfa/multiaccel_shiftgrab.h
+++ b/src/nfa/multiaccel_shiftgrab.h
@ -1,148 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIACCEL_SHIFTGRAB_H_
 #define MULTIACCEL_SHIFTGRAB_H_
 #include "multiaccel_common.h"
 #define SHIFTGRAB_MATCH(len, match_t, match_sz) \
    static really_inline \
    const u8 * JOIN4(shiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\
        if (unlikely(z)) { \
            match_t tmp = ~z; \
            z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \
            tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \
            VARISHIFT(z, z, len); \
            VARISHIFT(tmp, z, 1); \
            return JOIN(match, match_sz)(buf, z); \
        } \
        return NULL; \
    }
 #define SHIFTGRAB_MATCH_32_DEF(n) \
        SHIFTGRAB_MATCH(n, u32, 32)
 #define SHIFTGRAB_MATCH_64_DEF(n) \
        SHIFTGRAB_MATCH(n, u64a, 64)
 #define SHIFTGRAB_MATCH_DEF(n) \
    SHIFTGRAB_MATCH_32_DEF(n) \
    SHIFTGRAB_MATCH_64_DEF(n)
 SHIFTGRAB_MATCH_DEF(1)
 SHIFTGRAB_MATCH_DEF(2)
 SHIFTGRAB_MATCH_DEF(3)
 SHIFTGRAB_MATCH_DEF(4)
 SHIFTGRAB_MATCH_DEF(5)
 SHIFTGRAB_MATCH_DEF(6)
 SHIFTGRAB_MATCH_DEF(7)
 SHIFTGRAB_MATCH_DEF(8)
 SHIFTGRAB_MATCH_DEF(9)
 SHIFTGRAB_MATCH_DEF(10)
 SHIFTGRAB_MATCH_DEF(11)
 SHIFTGRAB_MATCH_DEF(12)
 SHIFTGRAB_MATCH_DEF(13)
 SHIFTGRAB_MATCH_DEF(14)
 SHIFTGRAB_MATCH_DEF(15)
 SHIFTGRAB_MATCH_64_DEF(16)
 SHIFTGRAB_MATCH_64_DEF(17)
 SHIFTGRAB_MATCH_64_DEF(18)
 SHIFTGRAB_MATCH_64_DEF(19)
 SHIFTGRAB_MATCH_64_DEF(20)
 SHIFTGRAB_MATCH_64_DEF(21)
 SHIFTGRAB_MATCH_64_DEF(22)
 SHIFTGRAB_MATCH_64_DEF(23)
 SHIFTGRAB_MATCH_64_DEF(24)
 SHIFTGRAB_MATCH_64_DEF(25)
 SHIFTGRAB_MATCH_64_DEF(26)
 SHIFTGRAB_MATCH_64_DEF(27)
 SHIFTGRAB_MATCH_64_DEF(28)
 SHIFTGRAB_MATCH_64_DEF(29)
 SHIFTGRAB_MATCH_64_DEF(30)
 SHIFTGRAB_MATCH_64_DEF(31)
 static
 const UNUSED u8 * (*shiftgrab_match_funcs_32[])(const u8 *buf, u32 z) =
 {
 // skip the first
    0,
    &shiftgrabMatch_32_1,
    &shiftgrabMatch_32_2,
    &shiftgrabMatch_32_3,
    &shiftgrabMatch_32_4,
    &shiftgrabMatch_32_5,
    &shiftgrabMatch_32_6,
    &shiftgrabMatch_32_7,
    &shiftgrabMatch_32_8,
    &shiftgrabMatch_32_9,
    &shiftgrabMatch_32_10,
    &shiftgrabMatch_32_11,
    &shiftgrabMatch_32_12,
    &shiftgrabMatch_32_13,
    &shiftgrabMatch_32_14,
    &shiftgrabMatch_32_15,
 };
 static
 const UNUSED u8 * (*shiftgrab_match_funcs_64[])(const u8 *buf, u64a z) =
                                                       {
 // skip the first
    0,
    &shiftgrabMatch_64_1,
    &shiftgrabMatch_64_2,
    &shiftgrabMatch_64_3,
    &shiftgrabMatch_64_4,
    &shiftgrabMatch_64_5,
    &shiftgrabMatch_64_6,
    &shiftgrabMatch_64_7,
    &shiftgrabMatch_64_8,
    &shiftgrabMatch_64_9,
    &shiftgrabMatch_64_10,
    &shiftgrabMatch_64_11,
    &shiftgrabMatch_64_12,
    &shiftgrabMatch_64_13,
    &shiftgrabMatch_64_14,
    &shiftgrabMatch_64_15,
    &shiftgrabMatch_64_16,
    &shiftgrabMatch_64_17,
    &shiftgrabMatch_64_18,
    &shiftgrabMatch_64_19,
    &shiftgrabMatch_64_20,
    &shiftgrabMatch_64_21,
    &shiftgrabMatch_64_22,
    &shiftgrabMatch_64_23,
    &shiftgrabMatch_64_24,
    &shiftgrabMatch_64_25,
    &shiftgrabMatch_64_26,
    &shiftgrabMatch_64_27,
    &shiftgrabMatch_64_28,
    &shiftgrabMatch_64_29,
    &shiftgrabMatch_64_30,
    &shiftgrabMatch_64_31,
 };
 #endif /* MULTIACCEL_SHIFTGRAB_H_ */
--- a/src/nfa/multishufti.c
+++ b/src/nfa/multishufti.c
@ -1,115 +0,0 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 /** \file
 * \brief Shufti: character class acceleration.
 *
 * Utilises the SSSE3 pshufb shuffle instruction
 */
 #include "config.h"
 #include "ue2common.h"
 #include "util/arch.h"
 #include "multishufti.h"
 #include "multiaccel_common.h"
 #if !defined(HAVE_AVX2)
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
 #include "multishufti_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO longgrab_
 #include "multiaccel_longgrab.h"
 #include "multishufti_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shift_
 #include "multiaccel_shift.h"
 #include "multishufti_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shiftgrab_
 #include "multiaccel_shiftgrab.h"
 #include "multishufti_sse.h"
 #undef MATCH_ALGO
 #define MULTIACCEL_DOUBLE
 #define MATCH_ALGO doubleshift_
 #include "multiaccel_doubleshift.h"
 #include "multishufti_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO doubleshiftgrab_
 #include "multiaccel_doubleshiftgrab.h"
 #include "multishufti_sse.h"
 #undef MATCH_ALGO
 #undef MULTIACCEL_DOUBLE
 #else
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
 #include "multishufti_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO longgrab_
 #include "multiaccel_longgrab.h"
 #include "multishufti_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shift_
 #include "multiaccel_shift.h"
 #include "multishufti_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shiftgrab_
 #include "multiaccel_shiftgrab.h"
 #include "multishufti_avx2.h"
 #undef MATCH_ALGO
 #define MULTIACCEL_DOUBLE
 #define MATCH_ALGO doubleshift_
 #include "multiaccel_doubleshift.h"
 #include "multishufti_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO doubleshiftgrab_
 #include "multiaccel_doubleshiftgrab.h"
 #include "multishufti_avx2.h"
 #undef MATCH_ALGO
 #undef MULTIACCEL_DOUBLE
 #endif
--- a/src/nfa/multishufti.h
+++ b/src/nfa/multishufti.h
@ -1,70 +0,0 @@
 /*
 * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 /** \file
 * \brief Multishufti: multibyte version of Shufti
 *
 * Utilises the SSSE3 pshufb shuffle instruction
 */
 #ifndef MULTISHUFTI_H
 #define MULTISHUFTI_H
 #include "ue2common.h"
 #include "util/simd_types.h"
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 const u8 *long_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                          const u8 *buf_end, const u8 run_len);
 const u8 *longgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                              const u8 *buf_end, const u8 run_len);
 const u8 *shift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                           const u8 *buf_end, const u8 run_len);
 const u8 *shiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                               const u8 *buf_end, const u8 run_len);
 const u8 *doubleshift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                                 const u8 *buf_end, const u8 run_len,
                                 const u8 run2_len);
 const u8 *doubleshiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                                     const u8 *buf_end, const u8 run_len,
                                     const u8 run2_len);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/src/nfa/multishufti_avx2.h
+++ b/src/nfa/multishufti_avx2.h
@ -1,121 +0,0 @@
 /*
 * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "shufti_common.h"
 #include "ue2common.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 static really_inline
 const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars,
                                     const u8 *buf, const m256 low4bits,
                                     const m256 zeroes, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                     , const u8 run_len2
 #endif
                                     ) {
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
    return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, ~z
 #ifdef MULTIACCEL_DOUBLE
                                                             , run_len2
 #endif
                                                             );
 }
 const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi,
                                       const u8 *buf,
                                       const u8 *buf_end, u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                       , u8 run_len2
 #endif
                                       ) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    // Slow path for small cases.
    if (buf_end - buf < 32) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                             buf, buf_end);
    }
    const m256 zeroes = zeroes256();
    const m256 low4bits = set32x8(0xf);
    const m256 wide_mask_lo = set2x128(mask_lo);
    const m256 wide_mask_hi = set2x128(mask_hi);
    const u8 *rv;
    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);
    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf,
                                            low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
                                            , run_len2
 #endif
                                            );
    if (rv) {
        return rv;
    }
    buf += (32 - min);
    // Unrolling was here, but it wasn't doing anything but taking up space.
    // Reroll FTW.
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, lchars, buf,
                                        low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
                                        , run_len2
 #endif
                                        );
        if (rv) {
            return rv;
        }
        buf += 32;
    }
    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf_end - 32,
                                    low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
                                    , run_len2
 #endif
                                    );
    if (rv) {
        return rv;
    }
    return buf_end;
 }
--- a/src/nfa/multishufti_sse.h
+++ b/src/nfa/multishufti_sse.h
@ -1,265 +0,0 @@
 /*
 * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "shufti_common.h"
 #include "ue2common.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 /* Normal SSSE3 shufti */
 static really_inline
 const u8 *JOIN(MATCH_ALGO, fwdBlock)(m128 mask_lo, m128 mask_hi, m128 chars,
                                     const u8 *buf, const m128 low4bits,
                                     const m128 zeroes, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                     , const u8 run_len2
 #endif
                                             ) {
    // negate first 16 bits
    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes) ^ 0xFFFF;
    return (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])(buf, z
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            );
 }
 /*
 * 16-byte pipeline, for smaller scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, shuftiPipeline16)(m128 mask_lo, m128 mask_hi,
                                             const u8 *buf, const u8 *buf_end,
                                             const m128 low4bits,
                                             const m128 zeroes, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                             , const u8 run_len2
 #endif
                                             ) {
    const u8* ptr, *last_buf;
    u32 last_res;
    // pipeline prologue: scan first 16 bytes
    m128 data = load128(buf);
    u32 z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF;
    last_buf = buf;
    last_res = z;
    buf += 16;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 15 < buf_end; buf += 16) {
        // scan more data
        data = load128(buf);
        z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF;
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_buf = buf;
        last_res = z;
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 /*
 * 32-byte pipeline, for bigger scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, shuftiPipeline32)(m128 mask_lo, m128 mask_hi,
                                             const u8 *buf, const u8 *buf_end,
                                             const m128 low4bits,
                                             const m128 zeroes, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                             , const u8 run_len2
 #endif
                                             ) {
    const u8* ptr, *last_buf;
    u32 res;
    // pipeline prologue: scan first 32 bytes
    m128 data1 = load128(buf);
    u32 z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF;
    m128 data2 = load128(buf + 16);
    u32 z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF;
    // store the results
    u32 last_res = z1 | (z2 << 16);
    last_buf = buf;
    buf += 32;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 31 < buf_end; buf += 32) {
        // scan more data
        data1 = load128(buf);
        z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF;
        data2 = load128(buf + 16);
        z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF;
        res = z1 | (z2 << 16);
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_res = res;
        last_buf = buf;
    }
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    // if we still have some data left, scan it too
    for (; buf + 15 < buf_end; buf += 16) {
        m128 chars = load128(buf);
        ptr = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf,
                low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                );
        if (unlikely(ptr)) {
            return ptr;
        }
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    return NULL;
 }
 const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi,
                                       const u8 *buf,
                                       const u8 *buf_end, u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                       , u8 run_len2
 #endif
                                               ) {
    assert(buf && buf_end);
    assert(buf < buf_end);
    // Slow path for small cases.
    if (buf_end - buf < 16) {
        return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                buf, buf_end);
    }
    const m128 zeroes = zeroes128();
    const m128 low4bits = _mm_set1_epi8(0xf);
    const u8 *rv;
    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);
    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf,
            low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            );
    if (rv) {
        return rv;
    }
    buf += (16 - min);
    // if we have enough data, run bigger pipeline; otherwise run smaller one
    if (buf_end - buf >= 128) {
        rv = JOIN(MATCH_ALGO, shuftiPipeline32)(mask_lo, mask_hi,
                buf, buf_end, low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                );
        if (unlikely(rv)) {
            return rv;
        }
    } else if (buf_end - buf >= 16){
        rv = JOIN(MATCH_ALGO, shuftiPipeline16)(mask_lo, mask_hi,
                buf, buf_end, low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                );
        if (unlikely(rv)) {
            return rv;
        }
    }
    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    chars = loadu128(buf_end - 16);
    rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars,
            buf_end - 16, low4bits, zeroes, run_len
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            );
    if (rv) {
        return rv;
    }
    return buf_end;
 }
--- a/src/nfa/multitruffle.c
+++ b/src/nfa/multitruffle.c
@ -1,111 +0,0 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "config.h"
 #include "ue2common.h"
 #include "util/arch.h"
 #include "multitruffle.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "multiaccel_common.h"
 #if !defined(HAVE_AVX2)
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
 #include "multitruffle_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO longgrab_
 #include "multiaccel_longgrab.h"
 #include "multitruffle_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shift_
 #include "multiaccel_shift.h"
 #include "multitruffle_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shiftgrab_
 #include "multiaccel_shiftgrab.h"
 #include "multitruffle_sse.h"
 #undef MATCH_ALGO
 #define MULTIACCEL_DOUBLE
 #define MATCH_ALGO doubleshift_
 #include "multiaccel_doubleshift.h"
 #include "multitruffle_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO doubleshiftgrab_
 #include "multiaccel_doubleshiftgrab.h"
 #include "multitruffle_sse.h"
 #undef MATCH_ALGO
 #undef MULTIACCEL_DOUBLE
 #else
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
 #include "multitruffle_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO longgrab_
 #include "multiaccel_longgrab.h"
 #include "multitruffle_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shift_
 #include "multiaccel_shift.h"
 #include "multitruffle_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shiftgrab_
 #include "multiaccel_shiftgrab.h"
 #include "multitruffle_avx2.h"
 #undef MATCH_ALGO
 #define MULTIACCEL_DOUBLE
 #define MATCH_ALGO doubleshift_
 #include "multiaccel_doubleshift.h"
 #include "multitruffle_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO doubleshiftgrab_
 #include "multiaccel_doubleshiftgrab.h"
 #include "multitruffle_avx2.h"
 #undef MATCH_ALGO
 #undef MULTIACCEL_DOUBLE
 #endif
--- a/src/nfa/multitruffle.h
+++ b/src/nfa/multitruffle.h
@ -1,73 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTITRUFFLE_H
 #define MULTITRUFFLE_H
 /** \file
 * \brief Multitruffle: multibyte version of Truffle.
 *
 * Utilises the SSSE3 pshufb shuffle instruction
 */
 #include "util/simd_types.h"
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 const u8 *long_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                           const u8 *buf, const u8 *buf_end, const u8 run_len);
 const u8 *longgrab_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                               const u8 *buf, const u8 *buf_end, const u8 run_len);
 const u8 *shift_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                            const u8 *buf, const u8 *buf_end, const u8 run_len);
 const u8 *shiftgrab_truffleExec(m128 shuf_mask_lo_highclear,
                                m128 shuf_mask_lo_highset, const u8 *buf,
                                const u8 *buf_end, const u8 run_len);
 const u8 *doubleshift_truffleExec(m128 shuf_mask_lo_highclear,
                                  m128 shuf_mask_lo_highset, const u8 *buf,
                                  const u8 *buf_end, const u8 run_len,
                                  const u8 run2_len);
 const u8 *doubleshiftgrab_truffleExec(m128 shuf_mask_lo_highclear,
                                      m128 shuf_mask_lo_highset, const u8 *buf,
                                      const u8 *buf_end, const u8 run_len,
                                      const u8 run2_len);
 #ifdef __cplusplus
 }
 #endif
 #endif /* MULTITRUFFLE_H */
--- a/src/nfa/multitruffle_avx2.h
+++ b/src/nfa/multitruffle_avx2.h
@ -1,125 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 /*
 * Matches a byte in a charclass using three shuffles
 */
 #include "config.h"
 #include "ue2common.h"
 #include "multiaccel_common.h"
 /*
 * include "block" function
 */
 #include "truffle_common.h"
 /*
 * single-byte truffle fwd match function, should only be defined when not
 * compiling multiaccel
 */
 static really_inline
 const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                                     m256 v, const u8 *buf, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                     , const u8 run_len2
 #endif
                                     ) {
    u64a z = (u64a) block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
    return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, z ^ 0xFFFFFFFF
 #ifdef MULTIACCEL_DOUBLE
                                                             , run_len2
 #endif
                                                             );
 }
 const u8 *JOIN(MATCH_ALGO, truffleExec)(m128 shuf_mask_lo_highclear,
                                        m128 shuf_mask_lo_highset,
                                        const u8 *buf, const u8 *buf_end, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                        , const u8 run_len2
 #endif
                                        ) {
    DEBUG_PRINTF("run_len %zu\n", buf_end - buf);
    const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
    const m256 wide_set = set2x128(shuf_mask_lo_highset);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;
    if (buf_end - buf < 32) {
        return truffleMini(wide_clear, wide_set, buf, buf_end);
    }
    size_t min = (size_t)buf % 32;
    assert(buf_end - buf >= 32);
    // Preconditioning: most of the time our buffer won't be aligned.
    m256 chars = loadu256(buf);
    rv = JOIN(MATCH_ALGO, fwdBlock)(wide_clear, wide_set, chars, buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                                    , run_len2
 #endif
                                    );
    if (rv) {
        return rv;
    }
    buf += (32 - min);
    const u8 *last_block = buf_end - 32;
    while (buf < last_block) {
        m256 lchars = load256(buf);
        rv = JOIN(MATCH_ALGO, fwdBlock)(wide_clear, wide_set, lchars,
                                        buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                                        , run_len2
 #endif
                                        );
        if (rv) {
            return rv;
        }
        buf += 32;
    }
    // Use an unaligned load to mop up the last 32 bytes and get an accurate
    // picture to buf_end.
    assert(buf <= buf_end && buf >= buf_end - 32);
    chars = loadu256(buf_end - 32);
    rv = JOIN(MATCH_ALGO, fwdBlock)(wide_clear, wide_set, chars,
                                    buf_end - 32, run_len
 #ifdef MULTIACCEL_DOUBLE
                                    , run_len2
 #endif
                                    );
    if (rv) {
        return rv;
    }
    return buf_end;
 }
--- a/src/nfa/multitruffle_sse.h
+++ b/src/nfa/multitruffle_sse.h
@ -1,265 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "config.h"
 #include "ue2common.h"
 #include "multiaccel_common.h"
 /*
 * include "block" function
 */
 #include "truffle_common.h"
 /*
 * single-byte truffle fwd match function, should only be defined when not
 * compiling multiaccel
 */
 static really_inline
 const u8 *JOIN(MATCH_ALGO, fwdBlock)(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                                     m128 v, const u8 *buf, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                     , const u8 run_len2
 #endif
                                     ) {
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v) ^ 0xFFFF;
    return (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])(buf, z
 #ifdef MULTIACCEL_DOUBLE
                                                             , run_len2
 #endif
            );
 }
 /*
 * 16-byte pipeline, for smaller scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, trufflePipeline16)(m128 shuf_mask_lo_highclear,
                                              m128 shuf_mask_lo_highset,
                                              const u8 *buf, const u8 *buf_end,
                                              const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                              , const u8 run_len2
 #endif
                                              ) {
    const u8* ptr, *last_buf;
    u32 last_res;
    // pipeline prologue: scan first 16 bytes
    m128 data = load128(buf);
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data) ^ 0xFFFF;
    last_buf = buf;
    last_res = z;
    buf += 16;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 15 < buf_end; buf += 16) {
        // scan more data
        data = load128(buf);
        z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data) ^ 0xFFFF;
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_buf = buf;
        last_res = z;
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 /*
 * 32-byte pipeline, for bigger scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, trufflePipeline32)(m128 shuf_mask_lo_highclear,
                                              m128 shuf_mask_lo_highset,
                                              const u8 *buf, const u8 *buf_end,
                                              const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                              , const u8 run_len2
 #endif
                                              ) {
    const u8* ptr, *last_buf;
    u32 res;
    // pipeline prologue: scan first 32 bytes
    m128 data1 = load128(buf);
    u32 z1 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data1) ^ 0xFFFF;
    m128 data2 = load128(buf + 16);
    u32 z2 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data2) ^ 0xFFFF;
    // store the results
    u32 last_res = z1 | (z2 << 16);
    last_buf = buf;
    buf += 32;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 31 < buf_end; buf += 32) {
        // scan more data
        data1 = load128(buf);
        z1 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data1) ^ 0xFFFF;
        data2 = load128(buf + 16);
        z2 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data2) ^ 0xFFFF;
        res = z1 | (z2 << 16);
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_res = res;
        last_buf = buf;
    }
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    // if we still have some data left, scan it too
    for (; buf + 15 < buf_end; buf += 16) {
        m128 chars = load128(buf);
        ptr = JOIN(MATCH_ALGO, fwdBlock)(shuf_mask_lo_highclear, shuf_mask_lo_highset,
                                         chars, buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                                         , run_len2
 #endif
                                         );
        if (unlikely(ptr)) {
            return ptr;
        }
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    return NULL;
 }
 const u8 *JOIN(MATCH_ALGO, truffleExec)(m128 shuf_mask_lo_highclear,
                                        m128 shuf_mask_lo_highset,
                                        const u8 *buf, const u8 *buf_end, const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                        , const u8 run_len2
 #endif
                                                ) {
    DEBUG_PRINTF("run_len %zu\n", buf_end - buf);
    assert(buf && buf_end);
    assert(buf < buf_end);
    const u8 *rv;
    if (buf_end - buf < 16) {
        return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, buf_end);
    }
    size_t min = (size_t)buf % 16;
    assert(buf_end - buf >= 16);
    // Preconditioning: most of the time our buffer won't be aligned.
    m128 chars = loadu128(buf);
    rv = JOIN(MATCH_ALGO, fwdBlock)(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                                    , run_len2
 #endif
                                    );
    if (rv) {
        return rv;
    }
    buf += (16 - min);
    // if we have enough data, run bigger pipeline; otherwise run smaller one
    if (buf_end - buf >= 128) {
        rv = JOIN(MATCH_ALGO, trufflePipeline32)(shuf_mask_lo_highclear, shuf_mask_lo_highset,
                                                 buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                                                 , run_len2
 #endif
                                                 );
        if (unlikely(rv)) {
            return rv;
        }
    } else if (buf_end - buf >= 16){
        rv = JOIN(MATCH_ALGO, trufflePipeline16)(shuf_mask_lo_highclear, shuf_mask_lo_highset,
                                                 buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                                                 , run_len2
 #endif
                                                 );
        if (unlikely(rv)) {
            return rv;
        }
    }
    // Use an unaligned load to mop up the last 16 bytes and get an accurate
    // picture to buf_end.
    chars = loadu128(buf_end - 16);
    rv = JOIN(MATCH_ALGO, fwdBlock)(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars,
                                    buf_end - 16, run_len
 #ifdef MULTIACCEL_DOUBLE
                                    , run_len2
 #endif
                                    );
    if (rv) {
        return rv;
    }
    return buf_end;
 }
--- a/src/nfa/multivermicelli.c
+++ b/src/nfa/multivermicelli.c
@ -1,109 +0,0 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "config.h"
 #include "ue2common.h"
 #include "util/arch.h"
 #include "multivermicelli.h"
 #include "multiaccel_common.h"
 #if !defined(HAVE_AVX2)
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
 #include "multivermicelli_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO longgrab_
 #include "multiaccel_longgrab.h"
 #include "multivermicelli_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shift_
 #include "multiaccel_shift.h"
 #include "multivermicelli_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shiftgrab_
 #include "multiaccel_shiftgrab.h"
 #include "multivermicelli_sse.h"
 #undef MATCH_ALGO
 #define MULTIACCEL_DOUBLE
 #define MATCH_ALGO doubleshift_
 #include "multiaccel_doubleshift.h"
 #include "multivermicelli_sse.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO doubleshiftgrab_
 #include "multiaccel_doubleshiftgrab.h"
 #include "multivermicelli_sse.h"
 #undef MATCH_ALGO
 #undef MULTIACCEL_DOUBLE
 #else
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
 #include "multivermicelli_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO longgrab_
 #include "multiaccel_longgrab.h"
 #include "multivermicelli_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shift_
 #include "multiaccel_shift.h"
 #include "multivermicelli_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO shiftgrab_
 #include "multiaccel_shiftgrab.h"
 #include "multivermicelli_avx2.h"
 #undef MATCH_ALGO
 #define MULTIACCEL_DOUBLE
 #define MATCH_ALGO doubleshift_
 #include "multiaccel_doubleshift.h"
 #include "multivermicelli_avx2.h"
 #undef MATCH_ALGO
 #define MATCH_ALGO doubleshiftgrab_
 #include "multiaccel_doubleshiftgrab.h"
 #include "multivermicelli_avx2.h"
 #undef MATCH_ALGO
 #undef MULTIACCEL_DOUBLE
 #endif
--- a/src/nfa/multivermicelli.h
+++ b/src/nfa/multivermicelli.h
@ -1,62 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef MULTIVERMICELLI_H_
 #define MULTIVERMICELLI_H_
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 const u8 *long_vermicelliExec(char c, char nocase, const u8 *buf,
                              const u8 *buf_end, const u8 run_len);
 const u8 *longgrab_vermicelliExec(char c, char nocase, const u8 *buf,
                                  const u8 *buf_end, const u8 run_len);
 const u8 *shift_vermicelliExec(char c, char nocase, const u8 *buf,
                               const u8 *buf_end, const u8 run_len);
 const u8 *shiftgrab_vermicelliExec(char c, char nocase, const u8 *buf,
                                   const u8 *buf_end, const u8 run_len);
 const u8 *doubleshift_vermicelliExec(char c, char nocase, const u8 *buf,
                                     const u8 *buf_end, const u8 run_len,
                                     const u8 run2_len);
 const u8 *doubleshiftgrab_vermicelliExec(char c, char nocase, const u8 *buf,
                                         const u8 *buf_end, const u8 run_len,
                                         const u8 run2_len);
 #ifdef __cplusplus
 }
 #endif
 #endif /* MULTIVERMICELLI_H_ */
--- a/src/nfa/multivermicelli_avx2.h
+++ b/src/nfa/multivermicelli_avx2.h
@ -1,283 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
 #include "multiaccel_common.h"
 static really_inline
 const u8 *JOIN(MATCH_ALGO, vermUnalignNocase)(m256 chars,
                                              const u8 *buf,
                                              const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                              , const u8 run_len2
 #endif
                                              ) {
    m256 casemask = set32x8(CASE_CLEAR);
    const u8 *ptr;
    m256 data = loadu256(buf);
    u32 z = movemask256(eq256(chars, and256(casemask, data)));
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (buf, z
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 static really_inline
 const u8 *JOIN(MATCH_ALGO, vermUnalign)(m256 chars,
                                        const u8 *buf,
                                        const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                        , const u8 run_len2
 #endif
                                        ) {
    const u8 *ptr;
    m256 data = loadu256(buf);
    u32 z = movemask256(eq256(chars, data));
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (buf, z
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 /*
 * 32-byte pipeline
 */
 static really_inline
 const u8 *JOIN(MATCH_ALGO, vermPipeline)(m256 chars,
                                         const u8 *buf,
                                         const u8 *buf_end,
                                         const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                         , const u8 run_len2
 #endif
                                         ) {
    const u8* ptr, *last_buf;
    u32 last_res;
    // pipeline prologue: scan first 32 bytes
    m256 data = load256(buf);
    u32 z = movemask256(eq256(chars, data));
    last_res = z;
    last_buf = buf;
    buf += 32;
    // now, start the pipeline!
    assert((size_t)buf % 32 == 0);
    for (; buf + 31 < buf_end; buf += 32) {
        // scan more data
        data = load256(buf);
        z = movemask256(eq256(chars, data));
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_buf = buf;
        last_res = z;
    }
    assert(buf <= buf_end && buf >= buf_end - 32);
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 /*
 * 32-byte caseless pipeline
 */
 static really_inline
 const u8 *JOIN(MATCH_ALGO, vermPipelineNocase)(m256 chars,
                                               const u8 *buf,
                                               const u8 *buf_end,
                                               const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                               , const u8 run_len2
 #endif
                                               ) {
    m256 casemask = set32x8(CASE_CLEAR);
    const u8* ptr, *last_buf;
    u32 last_res;
    // pipeline prologue: scan first 32 bytes
    m256 data = load256(buf);
    u32 z = movemask256(eq256(chars, and256(casemask, data)));
    last_res = z;
    last_buf = buf;
    buf += 32;
    // now, start the pipeline!
    assert((size_t)buf % 32 == 0);
    for (; buf + 31 < buf_end; buf += 32) {
        // scan more data
        data = load256(buf);
        z = movemask256(eq256(chars, and256(casemask, data)));
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_buf = buf;
        last_res = z;
    }
    assert(buf <= buf_end && buf >= buf_end - 32);
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 const u8 *JOIN(MATCH_ALGO, vermicelliExec)(char c, char nocase,
                                           const u8 *buf,
                                           const u8 *buf_end,
                                           const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                           , const u8 run_len2
 #endif
                                           ) {
    DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n",
                 nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
    assert(buf < buf_end);
    const u8 *ptr;
    // Handle small scans.
    if (buf_end - buf < 32) {
        for (; buf < buf_end; buf++) {
            char cur = (char)*buf;
            if (nocase) {
                cur &= CASE_CLEAR;
            }
            if (cur == c) {
                break;
            }
        }
        return buf;
    }
    m256 chars = set32x8(c); /* nocase already uppercase */
    uintptr_t min = (uintptr_t)buf % 32;
    if (min) {
        ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
                buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
                        buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                        , run_len2
 #endif
                        );
        if (unlikely(ptr)) {
            return ptr;
        }
        buf += 32 - min;
    }
    if (buf_end - buf >= 32){
        ptr = nocase ? JOIN(MATCH_ALGO, vermPipelineNocase)(chars,
                buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                ) : JOIN(MATCH_ALGO, vermPipeline)(chars,
                        buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                        , run_len2
 #endif
                        );
        if (unlikely(ptr)) {
            return ptr;
        }
    }
    // final unaligned scan
    ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
            buf_end - 32, run_len
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
                    buf_end - 32, run_len
 #ifdef MULTIACCEL_DOUBLE
                    , run_len2
 #endif
                    );
    // run our pipeline
    return ptr ? ptr : buf_end;
 }
--- a/src/nfa/multivermicelli_sse.h
+++ b/src/nfa/multivermicelli_sse.h
@ -1,452 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
 #define VERM_BOUNDARY 16
 #define VERM_TYPE m128
 #define VERM_SET_FN set16x8
 #include "multiaccel_common.h"
 static really_inline
 const u8 *JOIN(MATCH_ALGO, vermUnalignNocase)(m128 chars,
                                              const u8 *buf,
                                              const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                              , const u8 run_len2
 #endif
                                              ) {
    m128 casemask = set16x8(CASE_CLEAR);
    const u8 *ptr;
    m128 data = loadu128(buf);
    u32 z = movemask128(eq128(chars, and128(casemask, data)));
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
            (buf, z
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 static really_inline
 const u8 *JOIN(MATCH_ALGO, vermUnalign)(m128 chars,
                                        const u8 *buf,
                                        const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                        , const u8 run_len2
 #endif
                                        ) {
    const u8 *ptr;
    m128 data = loadu128(buf);
    u32 z = movemask128(eq128(chars, data));
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
            (buf, z
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 /*
 * 16-byte pipeline, for smaller scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, vermPipeline16)(m128 chars,
                                                 const u8 *buf,
                                                 const u8 *buf_end,
                                                 const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                                 , const u8 run_len2
 #endif
                                                 ) {
    const u8* ptr, *last_buf;
    u32 last_res;
    // pipeline prologue: scan first 16 bytes
    m128 data = load128(buf);
    u32 z = movemask128(eq128(chars, data));
    last_buf = buf;
    last_res = z;
    buf += 16;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 15 < buf_end; buf += 16) {
        // scan more data
        data = load128(buf);
        z = movemask128(eq128(chars, data));
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_buf = buf;
        last_res = z;
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 /*
 * 16-byte pipeline, for smaller scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, vermPipeline16Nocase)(m128 chars,
                                                         const u8 *buf,
                                                         const u8 *buf_end,
                                                         const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                                         , const u8 run_len2
 #endif
                                                         ) {
    m128 casemask = set16x8(CASE_CLEAR);
    const u8* ptr, *last_buf;
    u32 last_res;
    // pipeline prologue: scan first 16 bytes
    m128 data = load128(buf);
    u32 z = movemask128(eq128(chars, and128(casemask, data)));
    last_buf = buf;
    last_res = z;
    buf += 16;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 15 < buf_end; buf += 16) {
        // scan more data
        data = load128(buf);
        z = movemask128(eq128(chars, and128(casemask, data)));
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_buf = buf;
        last_res = z;
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    return NULL;
 }
 /*
 * 32-byte pipeline, for bigger scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, vermPipeline32)(m128 chars,
                                                   const u8 *buf,
                                                   const u8 *buf_end,
                                                   const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                                   , const u8 run_len2
 #endif
                                                   ) {
    const u8* ptr, *last_buf;
    u32 res;
    // pipeline prologue: scan first 32 bytes
    m128 data1 = load128(buf);
    u32 z1 = movemask128(eq128(chars, data1));
    m128 data2 = load128(buf + 16);
    u32 z2 = movemask128(eq128(chars, data2));
    // store the results
    u32 last_res = z1 | (z2 << VERM_BOUNDARY);
    last_buf = buf;
    buf += 32;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 31 < buf_end; buf += 32) {
        // scan more data
        data1 = load128(buf);
        z1 = movemask128(eq128(chars, data1));
        data2 = load128(buf + 16);
        z2 = movemask128(eq128(chars, data2));
        res = z1 | (z2 << 16);
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_res = res;
        last_buf = buf;
    }
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    // if we still have some data left, scan it too
    if (buf + 15 < buf_end) {
        return JOIN(MATCH_ALGO, vermPipeline16)(chars, buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                                                , run_len2
 #endif
                                                );
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    return NULL;
 }
 /*
 * 32-byte caseless pipeline, for bigger scans
 */
 static
 const u8 *JOIN(MATCH_ALGO, vermPipeline32Nocase)(m128 chars,
                                                         const u8 *buf,
                                                         const u8 *buf_end,
                                                         const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                                         , const u8 run_len2
 #endif
                                                         ) {
    m128 casemask = set16x8(CASE_CLEAR);
    const u8* ptr, *last_buf;
    u32 last_res;
    // pipeline prologue: scan first 32 bytes
    m128 data1 = load128(buf);
    u32 z1 = movemask128(eq128(chars, and128(casemask, data1)));
    m128 data2 = load128(buf + 16);
    u32 z2 = movemask128(eq128(chars, and128(casemask, data2)));
    u32 z = z1 | (z2 << VERM_BOUNDARY);
    last_res = z;
    last_buf = buf;
    buf += 32;
    // now, start the pipeline!
    assert((size_t)buf % 16 == 0);
    for (; buf + 31 < buf_end; buf += 32) {
        // scan more data
        data1 = load128(buf);
        z1 = movemask128(eq128(chars, and128(casemask, data1)));
        data2 = load128(buf + 16);
        z2 = movemask128(eq128(chars, and128(casemask, data2)));
        z = z1 | (z2 << 16);
        // do a comparison on previous result
        ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
                (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
                 , run_len2
 #endif
                 );
        if (unlikely(ptr)) {
            return ptr;
        }
        last_res = z;
        last_buf = buf;
    }
    // epilogue: compare final results
    ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])
            (last_buf, last_res
 #ifdef MULTIACCEL_DOUBLE
             , run_len2
 #endif
             );
    if (unlikely(ptr)) {
        return ptr;
    }
    // if we still have some data left, scan it too
    if (buf + 15 < buf_end) {
        return JOIN(MATCH_ALGO, vermPipeline16Nocase)(chars, buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                                                              , run_len2
 #endif
                                                              );
    }
    assert(buf <= buf_end && buf >= buf_end - 16);
    return NULL;
 }
 const u8 *JOIN(MATCH_ALGO, vermicelliExec)(char c, char nocase,
                                                   const u8 *buf,
                                                   const u8 *buf_end,
                                                   const u8 run_len
 #ifdef MULTIACCEL_DOUBLE
                                                   , const u8 run_len2
 #endif
                                                   ) {
    DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n",
                 nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
    assert(buf < buf_end);
    const u8 *ptr;
    // Handle small scans.
    if (buf_end - buf < VERM_BOUNDARY) {
        for (; buf < buf_end; buf++) {
            char cur = (char)*buf;
            if (nocase) {
                cur &= CASE_CLEAR;
            }
            if (cur == c) {
                break;
            }
        }
        return buf;
    }
    VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */
    uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY;
    if (min) {
        ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
                buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
                        buf, run_len
 #ifdef MULTIACCEL_DOUBLE
                        , run_len2
 #endif
                        );
        if (unlikely(ptr)) {
            return ptr;
        }
        buf += VERM_BOUNDARY - min;
    }
    // if we have enough data, run bigger pipeline; otherwise run smaller one
    if (buf_end - buf >= 128) {
        ptr = nocase ? JOIN(MATCH_ALGO, vermPipeline32Nocase)(chars,
                buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                ) : JOIN(MATCH_ALGO, vermPipeline32)(chars,
                        buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                        , run_len2
 #endif
                        );
        if (unlikely(ptr)) {
            return ptr;
        }
    } else if (buf_end - buf >= 16){
        ptr = nocase ? JOIN(MATCH_ALGO, vermPipeline16Nocase)(chars,
                buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                , run_len2
 #endif
                ) : JOIN(MATCH_ALGO, vermPipeline16)(chars,
                        buf, buf_end, run_len
 #ifdef MULTIACCEL_DOUBLE
                        , run_len2
 #endif
                        );
        if (unlikely(ptr)) {
            return ptr;
        }
    }
    // final unaligned scan
    ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars,
            buf_end - VERM_BOUNDARY, run_len
 #ifdef MULTIACCEL_DOUBLE
            , run_len2
 #endif
            ) : JOIN(MATCH_ALGO, vermUnalign)(chars,
                    buf_end - VERM_BOUNDARY, run_len
 #ifdef MULTIACCEL_DOUBLE
                    , run_len2
 #endif
                    );
    // run our pipeline
    return ptr ? ptr : buf_end;
 }
--- a/src/nfa/shufti.c
+++ b/src/nfa/shufti.c
@ -39,7 +39,52 @@
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
-#include "shufti_common.h"
+#ifdef DEBUG
 #include <ctype.h>
 #define DUMP_MSK(_t)                                \
 static UNUSED                                       \
 void dumpMsk##_t(m##_t msk) {                       \
    u8 * mskAsU8 = (u8 *)&msk;                      \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        for (int j = 0; j < 8; j++) {               \
            if ((c >> (7-j)) & 0x1)                 \
                printf("1");                        \
            else                                    \
                printf("0");                        \
        }                                           \
        printf(" ");                                \
    }                                               \
 }                                                   \
 static UNUSED                                       \
 void dumpMsk##_t##AsChars(m##_t msk) {              \
    u8 * mskAsU8 = (u8 *)&msk;                      \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        if (isprint(c))                             \
            printf("%c",c);                         \
        else                                        \
            printf(".");                            \
    }                                               \
 }
 #endif
 /** \brief Naive byte-by-byte implementation. */
 static really_inline
 const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);
    for (; buf < buf_end; ++buf) {
        u8 c = *buf;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf;
 }
 /** \brief Naive byte-by-byte implementation. */
 static really_inline
@ -59,6 +104,30 @@ const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
 #if !defined(HAVE_AVX2)
 /* Normal SSSE3 shufti */
 #ifdef DEBUG
 DUMP_MSK(128)
 #endif
 #define GET_LO_4(chars) and128(chars, low4bits)
 #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
 static really_inline
 u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
          const m128 compare) {
    m128 c_lo  = pshufb(mask_lo, GET_LO_4(chars));
    m128 c_hi  = pshufb(mask_hi, GET_HI_4(chars));
    m128 t     = and128(c_lo, c_hi);
 #ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars);        printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo);         printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi);         printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t);            printf("\n");
 #endif
    return movemask128(eq128(t, compare));
 }
 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
@ -293,6 +362,31 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
 #else // AVX2 - 256 wide shuftis
 #ifdef DEBUG
 DUMP_MSK(256)
 #endif
 #define GET_LO_4(chars) and256(chars, low4bits)
 #define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)
 static really_inline
 u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
          const m256 compare) {
    m256 c_lo  = vpshufb(mask_lo, GET_LO_4(chars));
    m256 c_hi  = vpshufb(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);
 #ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
 #endif
    return movemask256(eq256(t, compare));
 }
 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
--- a/src/nfa/shufti_common.h
+++ b/src/nfa/shufti_common.h
@ -1,146 +0,0 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef SHUFTI_COMMON_H_
 #define SHUFTI_COMMON_H_
 #include "ue2common.h"
 #include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
 /*
 * Common stuff for all versions of shufti (single, multi and multidouble)
 */
 /** \brief Naive byte-by-byte implementation. */
 static really_inline
 const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf,
                        const u8 *buf_end) {
    assert(buf < buf_end);
    for (; buf < buf_end; ++buf) {
        u8 c = *buf;
        if (lo[c & 0xf] & hi[c >> 4]) {
            break;
        }
    }
    return buf;
 }
 #ifdef DEBUG
 #include <ctype.h>
 #define DUMP_MSK(_t)                                \
 static UNUSED                                       \
 void dumpMsk##_t(m##_t msk) {                       \
    u8 * mskAsU8 = (u8 *)&msk;                      \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        for (int j = 0; j < 8; j++) {               \
            if ((c >> (7-j)) & 0x1)                 \
                printf("1");                        \
            else                                    \
                printf("0");                        \
        }                                           \
        printf(" ");                                \
    }                                               \
 }                                                   \
 static UNUSED                                       \
 void dumpMsk##_t##AsChars(m##_t msk) {              \
    u8 * mskAsU8 = (u8 *)&msk;                      \
    for (unsigned i = 0; i < sizeof(msk); i++) {    \
        u8 c = mskAsU8[i];                          \
        if (isprint(c))                             \
            printf("%c",c);                         \
        else                                        \
            printf(".");                            \
    }                                               \
 }
 #endif
 #if !defined(HAVE_AVX2)
 #ifdef DEBUG
 DUMP_MSK(128)
 #endif
 #define GET_LO_4(chars) and128(chars, low4bits)
 #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
 static really_inline
 u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
          const m128 compare) {
    m128 c_lo  = pshufb(mask_lo, GET_LO_4(chars));
    m128 c_hi  = pshufb(mask_hi, GET_HI_4(chars));
    m128 t     = and128(c_lo, c_hi);
 #ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk128(chars);        printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo);         printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi);         printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk128(t);            printf("\n");
 #endif
    return movemask128(eq128(t, compare));
 }
 #else
 #ifdef DEBUG
 DUMP_MSK(256)
 #endif
 #define GET_LO_4(chars) and256(chars, low4bits)
 #define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)
 static really_inline
 u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
          const m256 compare) {
    m256 c_lo  = vpshufb(mask_lo, GET_LO_4(chars));
    m256 c_hi  = vpshufb(mask_hi, GET_HI_4(chars));
    m256 t = and256(c_lo, c_hi);
 #ifdef DEBUG
    DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n");
    DEBUG_PRINTF("  char: "); dumpMsk256(chars); printf("\n");
    DEBUG_PRINTF("  c_lo: "); dumpMsk256(c_lo); printf("\n");
    DEBUG_PRINTF("  c_hi: "); dumpMsk256(c_hi); printf("\n");
    DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
 #endif
    return movemask256(eq256(t, compare));
 }
 #endif
 #endif /* SHUFTI_COMMON_H_ */
--- a/src/nfa/truffle.c
+++ b/src/nfa/truffle.c
@ -37,8 +37,6 @@
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "truffle_common.h"
 #if !defined(HAVE_AVX2)
 static really_inline
@ -52,6 +50,57 @@ const u8 *lastMatch(const u8 *buf, u32 z) {
    return NULL; // no match
 }
 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    }
    return NULL; // no match
 }
 static really_inline
 u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
    m128 highconst = _mm_set1_epi8(0x80);
    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
    // and now do the real work
    m128 shuf1 = pshufb(shuf_mask_lo_highclear, v);
    m128 t1 = xor128(v, highconst);
    m128 shuf2 = pshufb(shuf_mask_lo_highset, t1);
    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
    m128 shuf3 = pshufb(shuf_mask_hi, t2);
    m128 tmp = and128(or128(shuf1, shuf2), shuf3);
    m128 tmp2 = eq128(tmp, zeroes128());
    u32 z = movemask128(tmp2);
    return z;
 }
 static
 const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);
    m128 chars = zeroes128();
    memcpy(&chars, buf, len);
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // can't be these bytes in z
    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    const u8 *rv = firstMatch(buf, z | mask);
    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
 }
 static really_inline
 const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                   m128 v, const u8 *buf) {
@ -125,7 +174,7 @@ const u8 *truffleRevMini(m128 shuf_mask_lo_highclear,
    m128 chars = zeroes128();
    memcpy(&chars, buf, len);
-    u32 mask = (0xFFFF >> (16 - len)) ^ 0xFFFF;
+    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);
@ -184,6 +233,8 @@ const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
 #else
 // AVX2
 static really_inline
 const u8 *lastMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
@ -195,6 +246,57 @@ const u8 *lastMatch(const u8 *buf, u32 z) {
    return NULL; // no match
 }
 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        return buf + pos;
    }
    return NULL; // no match
 }
 static really_inline
 u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
    m256 highconst = _mm256_set1_epi8(0x80);
    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
    // and now do the real work
    m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v);
    m256 t1 = xor256(v, highconst);
    m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1);
    m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
    m256 shuf3 = vpshufb(shuf_mask_hi, t2);
    m256 tmp = and256(or256(shuf1, shuf2), shuf3);
    m256 tmp2 = eq256(tmp, zeroes256());
    u32 z = movemask256(tmp2);
    return z;
 }
 static
 const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);
    m256 chars = zeroes256();
    memcpy(&chars, buf, len);
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // can't be these bytes in z
    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    const u8 *rv = firstMatch(buf, z | mask);
    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
 }
 static really_inline
 const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                   m256 v, const u8 *buf) {
@ -266,7 +368,7 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
    m256 chars = zeroes256();
    memcpy(&chars, buf, len);
-    u32 mask = (0xFFFFFFFF >> (32 - len)) ^ 0xFFFFFFFF;
+    u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff;
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    const u8 *rv = lastMatch(buf, z | mask);
--- a/src/nfa/truffle_common.h
+++ b/src/nfa/truffle_common.h
@ -1,147 +0,0 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef TRUFFLE_COMMON_H_
 #define TRUFFLE_COMMON_H_
 #include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 /*
 * Common stuff for all versions of truffle (single, multi and multidouble)
 */
 #if !defined(HAVE_AVX2)
 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffff)) {
        u32 pos = ctz32(~z & 0xffff);
        assert(pos < 16);
        return buf + pos;
    }
    return NULL; // no match
 }
 static really_inline
 u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
    m128 highconst = _mm_set1_epi8(0x80);
    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
    // and now do the real work
    m128 shuf1 = pshufb(shuf_mask_lo_highclear, v);
    m128 t1 = xor128(v, highconst);
    m128 shuf2 = pshufb(shuf_mask_lo_highset, t1);
    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
    m128 shuf3 = pshufb(shuf_mask_hi, t2);
    m128 tmp = and128(or128(shuf1, shuf2), shuf3);
    m128 tmp2 = eq128(tmp, zeroes128());
    u32 z = movemask128(tmp2);
    return z;
 }
 static
 const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 16);
    m128 chars = zeroes128();
    memcpy(&chars, buf, len);
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // can't be these bytes in z
    u32 mask = (0xFFFF >> (16 - len)) ^ 0xFFFF;
    const u8 *rv = firstMatch(buf, z| mask);
    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
 }
 #else
 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
    if (unlikely(z != 0xffffffff)) {
        u32 pos = ctz32(~z);
        assert(pos < 32);
        return buf + pos;
    }
    return NULL; // no match
 }
 static really_inline
 u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
    m256 highconst = _mm256_set1_epi8(0x80);
    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
    // and now do the real work
    m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v);
    m256 t1 = xor256(v, highconst);
    m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1);
    m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
    m256 shuf3 = vpshufb(shuf_mask_hi, t2);
    m256 tmp = and256(or256(shuf1, shuf2), shuf3);
    m256 tmp2 = eq256(tmp, zeroes256());
    u32 z = movemask256(tmp2);
    return z;
 }
 static
 const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset,
                       const u8 *buf, const u8 *buf_end) {
    uintptr_t len = buf_end - buf;
    assert(len < 32);
    m256 chars = zeroes256();
    memcpy(&chars, buf, len);
    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
    // can't be these bytes in z
    u32 mask = (0xFFFFFFFF >> (32 - len)) ^ 0xFFFFFFFF;
    const u8 *rv = firstMatch(buf, z | mask);
    if (rv) {
        return rv;
    } else {
        return buf_end;
    }
 }
 #endif
 #endif /* TRUFFLE_COMMON_H_ */
--- a/src/nfagraph/ng_limex_accel.cpp
+++ b/src/nfagraph/ng_limex_accel.cpp
@ -37,7 +37,6 @@
 #include "ue2common.h"
 #include "nfa/accel.h"
 #include "nfa/multiaccel_compilehelper.h"
 #include "util/bitutils.h" // for CASE_CLEAR
 #include "util/charreach.h"
@ -677,134 +676,6 @@ NFAVertex get_sds_or_proxy(const NGHolder &g) {
    return g.startDs;
 }
 static
 NFAVertex find_next(const NFAVertex v, const NGHolder &g) {
    NFAVertex res = NGHolder::null_vertex();
    for (NFAVertex u : adjacent_vertices_range(v, g)) {
        if (u != v) {
            res = u;
            break;
        }
    }
    return res;
 }
 /** \brief Check if vertex \a v is a multi accelerable state (for a limex NFA). */
 MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g,
                                      const vector<NFAVertex> &states,
                                      const CompileContext &cc) {
    // For a set of states to be accelerable, we basically have to have only
    // one state to accelerate.
    if (states.size() != 1) {
        DEBUG_PRINTF("can't accelerate multiple states\n");
        return MultibyteAccelInfo();
    }
    // Get our base vertex
    NFAVertex v = states[0];
    // We need the base vertex to be a self-looping dotall leading to exactly
    // one vertex.
    if (!hasSelfLoop(v, g)) {
        DEBUG_PRINTF("base vertex has self-loop\n");
        return MultibyteAccelInfo();
    }
    if (!g[v].char_reach.all()) {
        DEBUG_PRINTF("can't accelerate anything but dot\n");
        return MultibyteAccelInfo();
    }
    if (proper_out_degree(v, g) != 1) {
        DEBUG_PRINTF("can't accelerate states with multiple successors\n");
        return MultibyteAccelInfo();
    }
    // find our start vertex
    NFAVertex cur = find_next(v, g);
    if (cur == NGHolder::null_vertex()) {
        DEBUG_PRINTF("invalid start vertex\n");
        return MultibyteAccelInfo();
    }
    bool has_offset = false;
    u32 offset = 0;
    CharReach cr = g[cur].char_reach;
    // if we start with a dot, we have an offset, so defer figuring out the
    // real CharReach for this accel scheme
    if (cr == CharReach::dot()) {
        has_offset = true;
        offset = 1;
    }
    // figure out our offset
    while (has_offset) {
        // vertices have to have no self loops
        if (hasSelfLoop(cur, g)) {
            DEBUG_PRINTF("can't have self-loops\n");
            return MultibyteAccelInfo();
        }
        // we have to have exactly 1 successor to have this acceleration scheme
        if (out_degree(cur, g) != 1) {
            DEBUG_PRINTF("can't have multiple successors\n");
            return MultibyteAccelInfo();
        }
        cur = *adjacent_vertices(cur, g).first;
        // if we met a special vertex, bail out
        if (is_special(cur, g)) {
            DEBUG_PRINTF("can't have special vertices\n");
            return MultibyteAccelInfo();
        }
        // now, get the real char reach
        if (g[cur].char_reach != CharReach::dot()) {
            cr = g[cur].char_reach;
            has_offset = false;
        } else {
            offset++;
        }
    }
    // now, fire up the compilation machinery
    target_t ti = cc.target_info;
    unsigned max_len = ti.has_avx2() ? MULTIACCEL_MAX_LEN_AVX2 : MULTIACCEL_MAX_LEN_SSE;
    MultiaccelCompileHelper mac(cr, offset, max_len);
    while (mac.canAdvance()) {
        // vertices have to have no self loops
        if (hasSelfLoop(cur, g)) {
            break;
        }
        // we have to have exactly 1 successor to have this acceleration scheme
        if (out_degree(cur, g) != 1) {
            break;
        }
        cur = *adjacent_vertices(cur, g).first;
        // if we met a special vertex, bail out
        if (is_special(cur, g)) {
            break;
        }
        mac.advance(g[cur].char_reach);
    }
    MultibyteAccelInfo mai = mac.getBestScheme();
 #ifdef DEBUG
    DEBUG_PRINTF("Multibyte acceleration scheme: type: %u offset: %u lengths: %u,%u\n",
                 mai.type, mai.offset, mai.len1, mai.len2);
    for (size_t c = mai.cr.find_first(); c != CharReach::npos; c = mai.cr.find_next(c)) {
        DEBUG_PRINTF("multibyte accel char: %zu\n", c);
    }
 #endif
    return mai;
 }
 /** \brief Check if vertex \a v is an accelerable state (for a limex NFA). */
 bool nfaCheckAccel(const NGHolder &g, NFAVertex v,
                   const vector<CharReach> &refined_cr,
--- a/src/nfagraph/ng_limex_accel.h
+++ b/src/nfagraph/ng_limex_accel.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -51,9 +51,6 @@ namespace ue2 {
 #define MAX_MERGED_ACCEL_STOPS 200
 #define ACCEL_MAX_STOP_CHAR 24
 #define ACCEL_MAX_FLOATING_STOP_CHAR 192 /* accelerating sds is important */
 #define MULTIACCEL_MIN_LEN 3
 #define MULTIACCEL_MAX_LEN_SSE 15
 #define MULTIACCEL_MAX_LEN_AVX2 31
 // forward-declaration of CompileContext
 struct CompileContext;
@ -84,11 +81,6 @@ bool nfaCheckAccel(const NGHolder &g, NFAVertex v,
                   const std::map<NFAVertex, BoundedRepeatSummary> &br_cyclic,
                   AccelScheme *as, bool allow_wide);
 /** \brief Check if vertex \a v is a multi accelerable state (for a limex NFA).
 */
 MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g,
                                      const std::vector<NFAVertex> &verts,
                                      const CompileContext &cc);
 } // namespace ue2
--- a/unit/CMakeLists.txt
+++ b/unit/CMakeLists.txt
@ -52,8 +52,6 @@ set(unit_internal_SOURCES
    internal/limex_nfa.cpp
    internal/masked_move.cpp
    internal/multi_bit.cpp
    internal/multiaccel_matcher.cpp
    internal/multiaccel_shift.cpp
    internal/nfagraph_common.h
    internal/nfagraph_comp.cpp
    internal/nfagraph_equivalence.cpp
--- a/unit/internal/multiaccel_matcher.cpp
+++ b/unit/internal/multiaccel_matcher.cpp
@ -1,301 +0,0 @@
 /*
 * Copyright (c) 2015-2016, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 extern "C" {
 #include "nfa/accel.h" // wrapping in extern C to make sure run_accel works
 }
 #include "config.h"
 #include "src/ue2common.h"
 #include "gtest/gtest.h"
 #include "nfagraph/ng_limex_accel.h"
 #include "nfa/accelcompile.h"
 #include "nfa/multivermicelli.h"
 #include "nfa/multishufti.h"
 #include "nfa/multitruffle.h"
 #include "util/alloc.h"
 #include "util/charreach.h"
 #include <algorithm>
 #include <iostream>
 #include <random>
 #include <string>
 #include <vector>
 using namespace ue2;
 using namespace std;
 using namespace testing;
 // test parameters structure
 struct MultiaccelTestParam {
    string match_pattern;
    u32 match_pattern_start_idx;
    u32 match_idx;
    bool test_all_offsets;
    u8 match_len1;
    u8 match_len2;
    MultibyteAccelInfo::multiaccel_type type;
 };
 // buffer size is constant
 static const u32 BUF_SIZE = 200;
 // strings, out of which CharReach will be generated
 static const string VERM_CR = "a";
 static const string V_NC_CR = "aA";
 static const string SHUF_CR = "abcdefghijklmnopqrstuvwxyz";
 static const string TRUF_CR = "\x11\x22\x33\x44\x55\x66\x77\x88\x99";
 // Parameterized test case for multiaccel patterns.
 class MultiaccelTest : public TestWithParam<MultiaccelTestParam> {
 protected:
    virtual void SetUp() {
        // set up is deferred until the actual test, since we can't compile
        // any accel schemes unless we know CharReach
        const MultiaccelTestParam &p = GetParam();
        // reserve space in our buffer
        buffer = (u8 *)aligned_zmalloc(BUF_SIZE);
        // store the index where we expect to see the match. note that it may
        // be different from where the match pattern has started since we may
        // have a flooded match (i.e. a match preceded by almost-match) or a
        // no-match (in which case "match" index is at the end of the buffer).
        match_idx = p.match_idx;
        // make note if we need to test all offsets - sometimes we don't, for
        // example when testing partial or no-match.
        test_all_offsets = p.test_all_offsets;
    }
    char getChar(const CharReach &cr) {
        assert(cr.count() > 0);
        auto dist = uniform_int_distribution<size_t>(0, cr.count() - 1);
        size_t result = cr.find_nth(dist(prng));
        assert(result != CharReach::npos);
        return (char)result;
    }
    // char generator
    char getChar(const CharReach &cr, bool match) {
        return getChar(match ? cr : ~cr);
    }
    // appends a string with matches/unmatches according to input match pattern
    void getMatch(u8 *result, u32 start, const string &pattern,
                  const CharReach &cr) {
        for (const auto &c : pattern) {
            result[start++] = getChar(cr, c == '1');
        }
    }
    // appends non-matching noise of certain lengths
    void getNoise(u8 *result, u32 start, u32 len, const CharReach &cr) {
        for (unsigned i = 0; i < len; i++) {
            result[start + i] = getChar(cr, false);
        }
    }
    // deferred buffer generation, as we don't know CharReach before we run the test
    void GenerateBuffer(const CharReach &cr) {
        const MultiaccelTestParam &p = GetParam();
        // step 1: fill prefix with non-matching noise
        u32 start = 0;
        getNoise(buffer, start, p.match_pattern_start_idx, cr);
        // step 2: add a match
        start += p.match_pattern_start_idx;
        getMatch(buffer, start, p.match_pattern, cr);
        // step 3: fill in the rest of the buffer with non-matching noise
        start += p.match_pattern.size();
        getNoise(buffer, start, BUF_SIZE - p.match_pattern.size() -
                 p.match_pattern_start_idx, cr);
    }
    // deferred accel scheme generation, as we don't know CharReach before we run the test
    void CompileAccelScheme(const CharReach &cr, AccelAux *aux) {
        const MultiaccelTestParam &p = GetParam();
        AccelInfo ai;
        ai.single_stops = cr; // dummy CharReach to prevent red tape accel
        ai.ma_len1 = p.match_len1;
        ai.ma_len2 = p.match_len2;
        ai.multiaccel_stops = cr;
        ai.ma_type = p.type;
        buildAccelAux(ai, aux);
        // now, verify we've successfully built our accel scheme, *and* that it's
        // a multibyte scheme
        ASSERT_TRUE(aux->accel_type >= ACCEL_MLVERM &&
                    aux->accel_type <= ACCEL_MDSGTRUFFLE);
    }
    virtual void TearDown() {
        aligned_free(buffer);
    }
    // We want our tests to be deterministic, so we use a PRNG in the test
    // fixture.
    mt19937 prng;
    u32 match_idx;
    u8 *buffer;
    bool test_all_offsets;
 };
 static
 void runTest(const u8 *buffer, AccelAux *aux, unsigned match_idx,
             bool test_all_offsets) {
    const u8 *start = buffer;
    const u8 *end = start + BUF_SIZE;
    const u8 *match = start + match_idx;
    // comparing indexes into the buffer is easier to understand than pointers
    if (test_all_offsets) {
        // run_accel can only scan >15 byte buffers
        u32 end_offset = min(match_idx, BUF_SIZE - 15);
        for (unsigned offset = 0; offset < end_offset; offset++) {
            const u8 *ptr = run_accel(aux, (start + offset), end);
            unsigned idx = ptr - start;
            ASSERT_EQ(match_idx, idx);
        }
    } else {
        const u8 *ptr = run_accel(aux, start, end);
        unsigned idx = ptr - start;
        ASSERT_EQ(match_idx, idx);
    }
 }
 TEST_P(MultiaccelTest, TestVermicelli) {
    AccelAux aux = {0};
    CharReach cr(VERM_CR);
    GenerateBuffer(cr);
    CompileAccelScheme(cr, &aux);
    runTest(buffer, &aux, match_idx, test_all_offsets);
 }
 TEST_P(MultiaccelTest, TestVermicelliNocase) {
    AccelAux aux = {0};
    CharReach cr(V_NC_CR);
    GenerateBuffer(cr);
    CompileAccelScheme(cr, &aux);
    runTest(buffer, &aux, match_idx, test_all_offsets);
 }
 TEST_P(MultiaccelTest, TestShufti) {
    AccelAux aux = {0};
    CharReach cr(SHUF_CR);
    GenerateBuffer(cr);
    CompileAccelScheme(cr, &aux);
    runTest(buffer, &aux, match_idx, test_all_offsets);
 }
 TEST_P(MultiaccelTest, TestTruffle) {
    AccelAux aux = {0};
    CharReach cr(TRUF_CR);
    GenerateBuffer(cr);
    CompileAccelScheme(cr, &aux);
    runTest(buffer, &aux, match_idx, test_all_offsets);
 }
 static const MultiaccelTestParam multiaccelTests[] = {
    // long matcher
    // full, partial, flooded, nomatch
    {"11111", 180, 180, true, 5, 0, MultibyteAccelInfo::MAT_LONG},
    {"111", 197, 197, true, 5, 0, MultibyteAccelInfo::MAT_LONG},
    {"1111011111", 177, 182, false, 5, 0, MultibyteAccelInfo::MAT_LONG},
    {"1111011110", 177, 200, false, 5, 0, MultibyteAccelInfo::MAT_LONG},
    // long-grab matcher
    // full, partial, flooded, nomatch
    {"111110", 180, 180, true, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB},
    {"111", 197, 197, true, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB},
    {"11111111110", 177, 182, false, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB},
    {"11110111101", 177, 200, false, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB},
    // shift matcher
    // full, partial, flooded, nomatch
    {"11001", 180, 180, true, 4, 0, MultibyteAccelInfo::MAT_SHIFT},
    {"110", 197, 197, true, 4, 0, MultibyteAccelInfo::MAT_SHIFT},
    {"1001011001", 177, 182, false, 4, 0, MultibyteAccelInfo::MAT_SHIFT},
    {"1101001011", 177, 200, false, 4, 0, MultibyteAccelInfo::MAT_SHIFT},
    // shift-grab matcher
    // full, partial, flooded, nomatch
    {"10111", 180, 180, true, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB},
    {"101", 197, 197, true, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB},
    {"1110010111", 177, 182, false, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB},
    {"1100101100", 177, 200, false, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB},
    // doubleshift matcher
    // full, partial (one and two shifts), flooded, nomatch
    {"110111", 180, 180, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFT},
    {"110", 197, 197, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFT},
    {"1101", 196, 196, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFT},
    {"1100100101", 178, 182, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFT},
    {"1101001101", 177, 200, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFT},
    // doubleshift-grab matcher
    // full, partial (one and two shifts), flooded, nomatch
    {"100101", 180, 180, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB},
    {"100", 197, 197, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB},
    {"1011", 196, 196, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB},
    {"11111101101", 177, 182, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB},
    {"1111110111", 177, 200, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB},
 };
 INSTANTIATE_TEST_CASE_P(Multiaccel, MultiaccelTest, ValuesIn(multiaccelTests));
 // boring stuff for google test
 void PrintTo(const MultiaccelTestParam &p, ::std::ostream *os) {
    *os << "MultiaccelTestParam: " << p.match_pattern;
 }
--- a/unit/internal/multiaccel_shift.cpp
+++ b/unit/internal/multiaccel_shift.cpp
@ -1,81 +0,0 @@
 /*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include "config.h"
 #include "src/ue2common.h"
 #include "gtest/gtest.h"
 #include "nfa/multiaccel_common.h"
 /*
 * Unit tests for the shifters.
 *
 * This is a bit messy, as shifters are macros, so we're using macros to test
 * other macros.
 */
 #define TEST_SHIFT(n) \
    do { \
        u64a val = ((u64a) 1 << n) - 1; \
        JOIN(SHIFT, n)(val); \
        ASSERT_EQ(val, 1); \
    } while (0)
 TEST(MultiaccelShift, StaticShift) {
    TEST_SHIFT(1);
    TEST_SHIFT(2);
    TEST_SHIFT(3);
    TEST_SHIFT(4);
    TEST_SHIFT(5);
    TEST_SHIFT(6);
    TEST_SHIFT(7);
    TEST_SHIFT(8);
    TEST_SHIFT(10);
    TEST_SHIFT(11);
    TEST_SHIFT(12);
    TEST_SHIFT(13);
    TEST_SHIFT(14);
    TEST_SHIFT(15);
    TEST_SHIFT(16);
    TEST_SHIFT(17);
    TEST_SHIFT(18);
    TEST_SHIFT(19);
    TEST_SHIFT(20);
    TEST_SHIFT(21);
    TEST_SHIFT(22);
    TEST_SHIFT(23);
    TEST_SHIFT(24);
    TEST_SHIFT(25);
    TEST_SHIFT(26);
    TEST_SHIFT(27);
    TEST_SHIFT(28);
    TEST_SHIFT(29);
    TEST_SHIFT(30);
    TEST_SHIFT(31);
    TEST_SHIFT(32);
 }