MCSHENG64: extend to 64-state based on mcsheng

This commit is contained in:
Zhu,Wenjun 2020-09-08 14:59:33 +00:00 committed by Konstantinos Margaritis
parent dea7c4dc2e
commit d96f1ab505
15 changed files with 2334 additions and 15 deletions

View File

@ -17,7 +17,7 @@ KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX)
LIBC_SO=$("$@" --print-file-name=libc.so.6)
cp ${KEEPSYMS_IN} ${KEEPSYMS}
# get all symbols from libc and turn them into patterns
nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS}
nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
# build the object
"$@"
# rename the symbols in the object

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -80,5 +80,78 @@ char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest,
#define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL
#if defined(HAVE_AVX512VBMI)
/* 64-8 bit Sheng-McClellan hybrid */
char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state,
const char *streamState, u64a offset,
NfaCallback callback, void *context);
char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report);
char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report,
struct mq *q);
char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_8_queueInitState(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_8_initCompressedState(const struct NFA *n, u64a offset,
void *state, u8 key);
char nfaExecMcSheng64_8_queueCompressState(const struct NFA *nfa,
const struct mq *q, s64a loc);
char nfaExecMcSheng64_8_expandState(const struct NFA *nfa, void *dest,
const void *src, u64a offset, u8 key);
#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL
/* 64-16 bit Sheng-McClellan hybrid */
char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state,
const char *streamState, u64a offset,
NfaCallback callback, void *context);
char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end);
char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report);
char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report,
struct mq *q);
char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_16_queueInitState(const struct NFA *n, struct mq *q);
char nfaExecMcSheng64_16_initCompressedState(const struct NFA *n, u64a offset,
void *state, u8 key);
char nfaExecMcSheng64_16_queueCompressState(const struct NFA *nfa,
const struct mq *q, s64a loc);
char nfaExecMcSheng64_16_expandState(const struct NFA *nfa, void *dest,
const void *src, u64a offset, u8 key);
#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL
#else // !HAVE_AVX512VBMI
/* Without AVX512VBMI the 64-state engines are not built; map every API entry
 * point to the no-impl stub so the NFA dispatch tables still link. */
#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL
#define nfaExecMcSheng64_8_Q NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_Q2 NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_QR NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_inAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_inAnyAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_queueInitState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_queueCompressState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_expandState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_initCompressedState NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_testEOD NFA_API_NO_IMPL
#define nfaExecMcSheng64_8_reportCurrent NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL
#define nfaExecMcSheng64_16_Q NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_Q2 NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_QR NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_inAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_inAnyAccept NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_queueInitState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_queueCompressState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_expandState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_initCompressedState NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_testEOD NFA_API_NO_IMPL
#define nfaExecMcSheng64_16_reportCurrent NFA_API_NO_IMPL
#endif //end of HAVE_AVX512VBMI
#endif

View File

@ -64,7 +64,6 @@
#include <set>
#include <deque>
#include <vector>
#include <boost/range/adaptor/map.hpp>
using namespace std;
@ -244,6 +243,108 @@ void populateBasicInfo(size_t state_size, const dfa_info &info,
}
}
#if defined(HAVE_AVX512VBMI)
static
mstate_aux *getAux64(NFA *n, dstate_id_t i) {
    /* Return the writable aux record for impl state i; the aux array lives
     * inside the NFA blob at offset m->aux_offset. */
    mcsheng64 *impl = (mcsheng64 *)getMutableImplNfa(n);
    char *base = (char *)n;
    mstate_aux *rv = (mstate_aux *)(base + impl->aux_offset) + i;
    assert((const char *)rv < base + impl->length);
    return rv;
}
/* Build the per-character shuffle masks driving the sheng region of the
 * 64-state hybrid: for each alphabet symbol we store an m512 whose byte N
 * holds the successor of internal sheng state N. Also records the sheng
 * region bounds and accel limit in the mcsheng64 header. */
static
void createShuffleMasks64(mcsheng64 *m, const dfa_info &info,
dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info) {
DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end);
assert(sheng_end > DEAD_STATE + 1);
assert(sheng_end <= sizeof(m512) + 1);
vector<array<u8, sizeof(m512)>> masks;
masks.resize(info.alpha_size);
/* -1 to avoid wasting a slot as we do not include dead state */
vector<dstate_id_t> raw_ids;
raw_ids.resize(sheng_end - 1);
/* map internal sheng ids back to the raw dfa states they came from */
for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) {
assert(info.implId(s)); /* should not map to DEAD_STATE */
if (info.is_sheng(s)) {
raw_ids[info.extra[s].sheng_id] = s;
}
}
for (u32 i = 0; i < info.alpha_size; i++) {
if (i == info.alpha_remap[TOP]) {
continue; /* TOP gets no sheng mask */
}
auto &mask = masks[i];
assert(sizeof(mask) == sizeof(m512));
mask.fill(0);
for (dstate_id_t sheng_id = 0; sheng_id < sheng_end - 1; sheng_id++) {
dstate_id_t raw_id = raw_ids[sheng_id];
dstate_id_t next_id = info.implId(info.states[raw_id].next[i]);
if (next_id == DEAD_STATE) {
/* dead state is encoded as the first non-sheng slot */
next_id = sheng_end - 1;
} else if (next_id < sheng_end) {
/* in-sheng successors are stored as internal ids (impl id - 1) */
next_id--;
}
DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, i, next_id);
mask[sheng_id] = verify_u8(next_id);
}
}
/* expand per-symbol masks out to all 256 raw characters via remap */
for (u32 i = 0; i < N_CHARS; i++) {
assert(info.alpha_remap[i] != info.alpha_remap[TOP]);
memcpy((u8 *)&m->sheng_succ_masks[i],
(u8 *)masks[info.alpha_remap[i]].data(), sizeof(m512));
}
m->sheng_end = sheng_end;
m->sheng_accel_limit = sheng_end - 1;
/* lower the accel limit to the first sheng state with an accel scheme */
for (dstate_id_t s : raw_ids) {
if (contains(accel_escape_info, s)) {
LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id);
}
}
}
/* Fill in the generic NFA header plus the common mcsheng64 fields: engine
 * type, character remap table, alphabet shift, offsets, start states and
 * flags. state_size selects the 8- vs 16-bit variant. */
static
void populateBasicInfo64(size_t state_size, const dfa_info &info,
u32 total_size, u32 aux_offset, u32 accel_offset,
u32 accel_count, ReportID arb, bool single, NFA *nfa) {
assert(state_size == sizeof(u16) || state_size == sizeof(u8));
nfa->length = total_size;
nfa->nPositions = info.states.size();
/* stream/scratch state is just the current state id (1 or 2 bytes) */
nfa->scratchStateSize = verify_u32(state_size);
nfa->streamStateSize = verify_u32(state_size);
if (state_size == sizeof(u8)) {
nfa->type = MCSHENG_64_NFA_8;
} else {
nfa->type = MCSHENG_64_NFA_16;
}
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
for (u32 i = 0; i < 256; i++) {
m->remap[i] = verify_u8(info.alpha_remap[i]);
}
m->alphaShift = info.getAlphaShift();
m->length = total_size;
m->aux_offset = aux_offset;
m->accel_offset = accel_offset;
m->arb_report = arb;
m->state_count = verify_u16(info.size());
m->start_anchored = info.implId(info.raw.start_anchored);
m->start_floating = info.implId(info.raw.start_floating);
m->has_accel = accel_count ? 1 : 0;
if (single) {
m->flags |= MCSHENG_FLAG_SINGLE;
}
}
#endif
static
size_t calcShermanRegionSize(const dfa_info &info) {
size_t rv = 0;
@ -272,7 +373,7 @@ void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info,
/* returns false on error */
static
bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end,
dstate_id_t *sherman_base) {
dstate_id_t *sherman_base) {
info.states[0].impl_id = 0; /* dead is always 0 */
vector<dstate_id_t> norm;
@ -382,6 +483,7 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) {
}
#define MAX_SHENG_STATES 16
#define MAX_SHENG64_STATES 64
#define MAX_SHENG_LEAKINESS 0.05
using LeakinessCache = ue2_unordered_map<pair<RdfaVertex, u32>, double>;
@ -435,7 +537,8 @@ double leakiness(const RdfaGraph &g, dfa_info &info,
static
dstate_id_t find_sheng_states(dfa_info &info,
map<dstate_id_t, AccelScheme> &accel_escape_info) {
map<dstate_id_t, AccelScheme> &accel_escape_info,
size_t max_sheng_states) {
RdfaGraph g(info.raw);
auto cyclics = find_vertices_in_cycles(g);
@ -470,7 +573,7 @@ dstate_id_t find_sheng_states(dfa_info &info,
flat_set<dstate_id_t> considered = { DEAD_STATE };
bool seen_back_edge = false;
while (!to_consider.empty()
&& sheng_states.size() < MAX_SHENG_STATES) {
&& sheng_states.size() < max_sheng_states) {
auto v = to_consider.front();
to_consider.pop_front();
if (!considered.insert(g[v].index).second) {
@ -616,6 +719,82 @@ void fill_in_succ_table_16(NFA *nfa, const dfa_info &info,
}
}
#if defined(HAVE_AVX512VBMI)
/* Populate the per-state aux records (reports, tops) and build the accel
 * structures for states that have an escape scheme, carving slots out of
 * the accel region starting at accel_offset. */
static
void fill_in_aux_info64(NFA *nfa, const dfa_info &info,
const map<dstate_id_t, AccelScheme> &accel_escape_info,
u32 accel_offset, UNUSED u32 accel_end_offset,
const vector<u32> &reports,
const vector<u32> &reports_eod,
u32 report_base_offset,
const raw_report_info &ri) {
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
vector<u32> reportOffsets;
ri.fillReportLists(nfa, report_base_offset, reportOffsets);
for (u32 i = 0; i < info.size(); i++) {
u16 impl_id = info.implId(i);
mstate_aux *this_aux = getAux64(nfa, impl_id);
fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets);
if (contains(accel_escape_info, i)) {
/* hand this state the next slot in the accel region */
this_aux->accel_offset = accel_offset;
accel_offset += info.strat.accelSize();
assert(accel_offset <= accel_end_offset);
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
info.strat.buildAccel(i, accel_escape_info.at(i),
(void *)((char *)m + this_aux->accel_offset));
}
}
}
static
u16 get_edge_flags64(NFA *nfa, dstate_id_t target_impl_id) {
    /* Compute the flag bits (ACCEPT/ACCEL) that get OR'd into 16-bit
     * succ-table entries for edges leading to the given impl state. */
    const mstate_aux *aux = getAux64(nfa, target_impl_id);
    u16 rv = 0;
    if (aux->accept) {
        rv |= ACCEPT_FLAG;
    }
    if (aux->accel_offset) {
        rv |= ACCEL_FLAG;
    }
    return rv;
}
/* Write the 16-bit successor table for the normal (non-sheng, non-sherman)
 * states. Entries are impl ids with ACCEPT/ACCEL edge flag bits OR'd in. */
static
void fill_in_succ_table_64_16(NFA *nfa, const dfa_info &info,
dstate_id_t sheng_end,
UNUSED dstate_id_t sherman_base) {
u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng64));
u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8);
for (size_t i = 0; i < info.size(); i++) {
if (!info.is_normal(i)) {
assert(info.implId(i) < sheng_end || info.is_sherman(i));
continue;
}
assert(info.implId(i) < sherman_base);
/* rows are indexed by impl id relative to the end of the sheng region */
u16 normal_id = verify_u16(info.implId(i) - sheng_end);
for (size_t s = 0; s < info.impl_alpha_size; s++) {
dstate_id_t raw_succ = info.states[i].next[s];
u16 &entry = succ_table[((size_t)normal_id << alphaShift) + s];
entry = info.implId(raw_succ);
entry |= get_edge_flags64(nfa, entry);
}
}
}
#endif
#define MAX_SHERMAN_LIST_LEN 8
static
@ -934,6 +1113,162 @@ void fill_in_succ_table_8(NFA *nfa, const dfa_info &info,
}
}
#if defined(HAVE_AVX512VBMI)
/* Serialize the sherman (delta-compressed) states into the sherman region.
 * Each fixed-size entry stores its daddy state and only the (char,
 * successor) pairs that differ from the daddy's transitions. */
static
void fill_in_sherman64(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) {
char *nfa_base = (char *)nfa;
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
char *sherman_table = nfa_base + m->sherman_offset;
assert(ISALIGNED_16(sherman_table));
for (size_t i = 0; i < info.size(); i++) {
if (!info.is_sherman(i)) {
continue;
}
u16 fs = verify_u16(info.implId(i));
DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs);
assert(fs >= sherman_limit);
char *curr_sherman_entry
= sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE;
assert(curr_sherman_entry <= nfa_base + m->length);
/* number of transitions NOT inherited from the daddy state */
u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken);
assert(len <= 9);
dstate_id_t d = info.states[i].daddy;
*(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE;
*(u8 *)(curr_sherman_entry + SHERMAN_LEN_OFFSET) = len;
*(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d);
u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET);
/* first the list of overridden symbols... */
for (u16 s = 0; s < info.impl_alpha_size; s++) {
if (info.states[i].next[s] != info.states[d].next[s]) {
*(chars++) = (u8)s;
}
}
/* ...then, in the same order, their successors (with edge flags) */
u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len));
for (u16 s = 0; s < info.impl_alpha_size; s++) {
if (info.states[i].next[s] != info.states[d].next[s]) {
DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs,
info.implId(d),
info.implId(info.states[i].next[s]));
u16 entry_val = info.implId(info.states[i].next[s]);
entry_val |= get_edge_flags64(nfa, entry_val);
unaligned_store_u16((u8 *)states++, entry_val);
}
}
}
}
/* Build the 16-bit variant of the 64-state sheng/mcclellan hybrid. Layout:
 * NFA header | mcsheng64 | succ table | aux | report lists | accel |
 * sherman region. Returns nullptr if 16-bit state ids cannot be
 * allocated. */
static
bytecode_ptr<NFA> mcsheng64Compile16(dfa_info&info, dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme>&accel_escape_info,
const Grey &grey) {
DEBUG_PRINTF("building mcsheng 64-16\n");
vector<u32> reports; /* index in ri for the appropriate report list */
vector<u32> reports_eod; /* as above */
ReportID arb;
u8 single;
assert(info.getAlphaShift() <= 8);
// Sherman optimization
if (info.impl_alpha_size > 16) {
u16 total_daddy = 0;
for (u32 i = 0; i < info.size(); i++) {
find_better_daddy(info, i,
is_cyclic_near(info.raw, info.raw.start_anchored),
grey);
total_daddy += info.extra[i].daddytaken;
}
DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
info.size() * info.impl_alpha_size, info.size(),
info.impl_alpha_size);
}
u16 sherman_limit;
if (!allocateImplId16(info, sheng_end, &sherman_limit)) {
DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
info.size());
return nullptr;
}
/* normal states sit between the sheng region and the sherman region */
u16 count_real_states = sherman_limit - sheng_end;
auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16)
* count_real_states;
size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size);
size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32);
size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size);
size_t sherman_size = calcShermanRegionSize(info);
size_t total_size = sherman_offset + sherman_size;
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get());
populateBasicInfo64(sizeof(u16), info, total_size, aux_offset, accel_offset,
accel_escape_info.size(), arb, single, nfa.get());
createShuffleMasks64(m, info, sheng_end, accel_escape_info);
/* copy in the mc header information */
m->sherman_offset = sherman_offset;
m->sherman_end = total_size;
m->sherman_limit = sherman_limit;
DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end,
count_real_states, info.size());
fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset,
sherman_offset - sizeof(NFA), reports, reports_eod,
aux_offset + aux_size, *ri);
fill_in_succ_table_64_16(nfa.get(), info, sheng_end, sherman_limit);
fill_in_sherman64(nfa.get(), info, sherman_limit);
return nfa;
}
/* Write the 8-bit successor table for normal states; the 8-bit variant has
 * no sherman states and entries are bare impl ids (no edge flag bits). */
static
void fill_in_succ_table_64_8(NFA *nfa, const dfa_info &info,
dstate_id_t sheng_end) {
u8 *succ_table = (u8 *)nfa + sizeof(NFA) + sizeof(mcsheng64);
u8 alphaShift = info.getAlphaShift();
assert(alphaShift <= 8);
for (size_t i = 0; i < info.size(); i++) {
assert(!info.is_sherman(i));
if (!info.is_normal(i)) {
assert(info.implId(i) < sheng_end);
continue;
}
/* rows are indexed by impl id relative to the end of the sheng region */
u8 normal_id = verify_u8(info.implId(i) - sheng_end);
for (size_t s = 0; s < info.impl_alpha_size; s++) {
dstate_id_t raw_succ = info.states[i].next[s];
succ_table[((size_t)normal_id << alphaShift) + s]
= info.implId(raw_succ);
}
}
}
#endif
static
void allocateImplId8(dfa_info &info, dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info,
@ -1031,6 +1366,60 @@ bytecode_ptr<NFA> mcshengCompile8(dfa_info &info, dstate_id_t sheng_end,
return nfa;
}
#if defined(HAVE_AVX512VBMI)
/* Build the 8-bit variant of the 64-state hybrid. Layout: NFA header |
 * mcsheng64 | succ table | aux | report lists | accel. Simpler than the
 * 16-bit variant as there is no sherman region. */
static
bytecode_ptr<NFA> mcsheng64Compile8(dfa_info &info, dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info) {
DEBUG_PRINTF("building mcsheng 64-8\n");
vector<u32> reports;
vector<u32> reports_eod;
ReportID arb;
u8 single;
auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
size_t normal_count = info.size() - sheng_end;
size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count;
size_t aux_size = sizeof(mstate_aux) * info.size();
size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size);
size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ ri->getReportListSize(), 32);
size_t total_size = accel_offset + accel_size;
DEBUG_PRINTF("aux_size %zu\n", aux_size);
DEBUG_PRINTF("aux_offset %zu\n", aux_offset);
DEBUG_PRINTF("rl size %u\n", ri->getReportListSize());
DEBUG_PRINTF("accel_size %zu\n", accel_size);
DEBUG_PRINTF("accel_offset %zu\n", accel_offset);
DEBUG_PRINTF("total_size %zu\n", total_size);
accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get());
allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8,
&m->accept_limit_8);
populateBasicInfo64(sizeof(u8), info, total_size, aux_offset, accel_offset,
accel_escape_info.size(), arb, single, nfa.get());
createShuffleMasks64(m, info, sheng_end, accel_escape_info);
fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset,
total_size - sizeof(NFA), reports, reports_eod,
aux_offset + aux_size, *ri);
fill_in_succ_table_64_8(nfa.get(), info, sheng_end);
DEBUG_PRINTF("rl size %zu\n", ri->size());
return nfa;
}
#endif
bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm) {
if (!cc.grey.allowMcSheng) {
@ -1050,19 +1439,79 @@ bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
map<dstate_id_t, AccelScheme> accel_escape_info
= info.strat.getAccelInfo(cc.grey);
auto old_states = info.states;
dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info, MAX_SHENG_STATES);
dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info);
if (sheng_end <= DEAD_STATE + 1) {
info.states = old_states;
return nullptr;
}
bytecode_ptr<NFA> nfa;
if (!using8bit) {
nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey);
} else {
nfa = mcshengCompile8(info, sheng_end, accel_escape_info);
}
if (!nfa) {
info.states = old_states;
return nfa;
}
if (has_eod_reports) {
nfa->flags |= NFA_ACCEPTS_EOD;
}
DEBUG_PRINTF("compile done\n");
return nfa;
}
#if defined(HAVE_AVX512VBMI)
bytecode_ptr<NFA> mcshengCompile64(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm) {
if (!cc.grey.allowMcSheng) {
return nullptr;
}
mcclellan_build_strat mbs(raw, rm, false);
dfa_info info(mbs);
bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256;
if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming
* mode with our semantics */
raw.stripExtraEodReports();
}
bool has_eod_reports = raw.hasEodReports();
map<dstate_id_t, AccelScheme> accel_escape_info
= info.strat.getAccelInfo(cc.grey);
bool using64state = false; /*default flag*/
dstate_id_t sheng_end64;
sheng_end64 = find_sheng_states(info, accel_escape_info, MAX_SHENG64_STATES);
if (sheng_end64 <= DEAD_STATE + 1) {
return nullptr;
} else {
using64state = true;
}
bytecode_ptr<NFA> nfa;
if (using64state) {
assert((sheng_end64 > 17) && (sheng_end64 <= 65));
if (!using8bit) {
nfa = mcsheng64Compile16(info, sheng_end64, accel_escape_info, cc.grey);
} else {
assert(using8bit);
nfa = mcsheng64Compile8(info, sheng_end64, accel_escape_info);
assert(nfa);
assert(nfa->type == MCSHENG_64_NFA_8);
}
}
if (!nfa) {
return nfa;
}
@ -1074,6 +1523,7 @@ bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
DEBUG_PRINTF("compile done\n");
return nfa;
}
#endif
bool has_accel_mcsheng(const NFA *) {
return true; /* consider the sheng region as accelerated */

View File

@ -42,7 +42,8 @@ struct raw_dfa;
bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm);
bytecode_ptr<NFA> mcshengCompile64(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm);
bool has_accel_mcsheng(const NFA *nfa);
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -41,3 +41,16 @@ const u64a mcsheng_pext_mask[8] = {
0x00ff00000000000f,
0xff0000000000000f,
};
#if defined(HAVE_AVX512VBMI)
/* PEXT masks for unpacking mcsheng64 transition data from a u64a; the low
 * 6 bits (0x3f) carry the state id -- the 64-state analogue of the 4-bit
 * (0x0f) mcsheng_pext_mask above. Index 0 is unused. */
const u64a mcsheng64_pext_mask[8] = {
0, /* dummy */
0x000000000000ff3f,
0x0000000000ff003f,
0x00000000ff00003f,
0x000000ff0000003f,
0x0000ff000000003f,
0x00ff00000000003f,
0xff0000000000003f,
};
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016-2017, Intel Corporation
* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -174,6 +174,126 @@ void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) {
}
}
#if defined(HAVE_AVX512VBMI)
static
const mstate_aux *getAux64(const NFA *n, dstate_id_t i) {
    /* Fetch the read-only aux record for impl state i from the NFA blob. */
    const mcsheng64 *m = (const mcsheng64 *)getImplNfa(n);
    const char *base = (const char *)n;
    const mstate_aux *rv = (const mstate_aux *)(base + m->aux_offset) + i;
    assert((const char *)rv < base + m->length);
    return rv;
}
/* Reconstruct (for dumping) the full 256-entry successor row of state s by
 * decoding whichever representation the state uses: sheng shuffle mask,
 * 8-bit succ table, or 16-bit succ table plus sherman overrides. */
static
void next_states64(const NFA *n, u16 s, u16 *t) {
const mcsheng64 *m = (const mcsheng64 *)getImplNfa(n);
const mstate_aux *aux = getAux64(n, s);
const u32 as = m->alphaShift;
assert(s != DEAD_STATE);
if (s < m->sheng_end) {
/* sheng region: byte (s - 1) of each per-character mask holds the
 * internal successor id; internal ids are impl ids - 1, with
 * (sheng_end - 1) standing in for the dead state. */
for (u16 c = 0; c < N_CHARS; c++) {
u8 sheng_s = s - 1;
auto trans_for_c = (const char *)&m->sheng_succ_masks[c];
assert(sheng_s < sizeof(m512));
u8 raw_succ = trans_for_c[sheng_s];
if (raw_succ == m->sheng_end - 1) {
t[c] = DEAD_STATE;
} else if (raw_succ < m->sheng_end) {
t[c] = raw_succ + 1;
} else {
t[c] = raw_succ;
}
}
} else if (n->type == MCSHENG_64_NFA_8) {
/* 8-bit variant: plain byte successor table, no flag bits to strip */
const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng64));
for (u16 c = 0; c < N_CHARS; c++) {
u32 normal_id = s - m->sheng_end;
t[c] = succ_table[(normal_id << as) + m->remap[c]];
}
} else {
/* 16-bit variant: sherman states inherit their daddy's row first, then
 * apply their overriding (char, successor) pairs on top. */
u16 base_s = s;
const char *winfo_base = (const char *)n + m->sherman_offset;
const char *state_base
= winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit);
if (s >= m->sherman_limit) {
base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET);
assert(base_s >= m->sheng_end);
}
const u16 *succ_table = (const u16 *)((const char *)m
+ sizeof(mcsheng64));
for (u16 c = 0; c < N_CHARS; c++) {
u32 normal_id = base_s - m->sheng_end;
t[c] = succ_table[(normal_id << as) + m->remap[c]];
}
if (s >= m->sherman_limit) {
UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET);
assert(type == SHERMAN_STATE);
u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base);
const char *chars = state_base + SHERMAN_CHARS_OFFSET;
const u16 *states = (const u16 *)(state_base
+ SHERMAN_STATES_OFFSET(len));
for (u8 i = 0; i < len; i++) {
for (u16 c = 0; c < N_CHARS; c++) {
if (m->remap[c] == chars[i]) {
t[c] = unaligned_load_u16((const u8*)&states[i]);
}
}
}
}
/* strip ACCEPT/ACCEL edge flag bits from the 16-bit entries */
for (u16 c = 0; c < N_CHARS; c++) {
t[c] &= STATE_MASK;
}
}
t[TOP] = aux->top & STATE_MASK;
}
/* Emit dot edges for state i: characters sharing a successor are grouped
 * into a single labelled edge; sheng-to-sheng edges are drawn in red. */
static
void describeEdge64(FILE *f, const mcsheng64 *m, const u16 *t, u16 i) {
for (u16 s = 0; s < N_CHARS; s++) {
if (!t[s]) {
continue;
}
u16 ss;
/* skip if an earlier character already produced this edge */
for (ss = 0; ss < s; ss++) {
if (t[s] == t[ss]) {
break;
}
}
if (ss != s) {
continue;
}
/* gather every character sharing this successor into one reach */
CharReach reach;
for (ss = s; ss < 256; ss++) {
if (t[s] == t[ss]) {
reach.set(ss);
}
}
fprintf(f, "%u -> %u [ ", i, t[s]);
if (i < m->sheng_end && t[s] < m->sheng_end) {
fprintf(f, "color = red, fontcolor = red ");
}
fprintf(f, "label = \"");
describeClass(f, reach, 5, CC_OUT_DOT);
fprintf(f, "\" ];\n");
}
}
#endif
static
void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) {
switch(accel->accel_type) {
@ -256,6 +376,68 @@ void describeNode(const NFA *n, const mcsheng *m, u16 i, FILE *f) {
}
#if defined(HAVE_AVX512VBMI)
/* Emit the dot node for state i together with its decorations: accel info,
 * sheng (red) / accept / sherman styling and start-state markers. */
static
void describeNode64(const NFA *n, const mcsheng64 *m, u16 i, FILE *f) {
const mstate_aux *aux = getAux64(n, i);
bool isSherman = m->sherman_limit && i >= m->sherman_limit;
fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
"label = \"%u%s\" ]; \n", i, i, isSherman ? "w":"");
if (aux->accel_offset) {
dumpAccelDot(f, i, (const union AccelAux *)
((const char *)m + aux->accel_offset));
}
/* sheng states (1 .. sheng_end-1) are drawn red */
if (i && i < m->sheng_end) {
fprintf(f, "%u [color = red, fontcolor = red]; \n", i);
}
if (aux->accept_eod) {
fprintf(f, "%u [ color = darkorchid ];\n", i);
}
if (aux->accept) {
fprintf(f, "%u [ shape = doublecircle ];\n", i);
}
if (aux->top && aux->top != i) {
fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i,
aux->top);
}
if (i == m->start_anchored) {
fprintf(f, "STARTA -> %u [color = blue ]\n", i);
}
if (i == m->start_floating) {
fprintf(f, "STARTF -> %u [color = red ]\n", i);
}
if (isSherman) {
const char *winfo_base = (const char *)n + m->sherman_offset;
const char *state_base
= winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit);
assert(state_base < (const char *)m + m->length - sizeof(NFA));
UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET);
assert(type == SHERMAN_STATE);
fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i);
u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET);
if (daddy) {
fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n",
i, daddy);
}
}
/* group sheng states into their own dot cluster */
if (i && i < m->sheng_end) {
fprintf(f, "subgraph cluster_sheng { %u } \n", i);
}
}
#endif
static
void dumpDotPreambleDfa(FILE *f) {
dumpDotPreamble(f);
@ -392,6 +574,133 @@ void dump_text_8(const NFA *nfa, FILE *f) {
dumpTextReverse(nfa, f);
}
#if defined(HAVE_AVX512VBMI)
static
void dump64_dot_16(const NFA *nfa, FILE *f) {
auto *m = (const mcsheng64 *)getImplNfa(nfa);
dumpDotPreambleDfa(f);
for (u16 i = 1; i < m->state_count; i++) {
describeNode64(nfa, m, i, f);
u16 t[ALPHABET_SIZE];
next_states64(nfa, i, t);
describeEdge64(f, m, t, i);
}
fprintf(f, "}\n");
}
/* Emit a dot graph for the 8-bit variant; the walk is the same as for the
 * 16-bit variant (next_states64 dispatches on nfa->type). */
static
void dump64_dot_8(const NFA *nfa, FILE *f) {
auto m = (const mcsheng64 *)getImplNfa(nfa);
dumpDotPreambleDfa(f);
for (u16 i = 1; i < m->state_count; i++) {
describeNode64(nfa, m, i, f);
u16 t[ALPHABET_SIZE];
next_states64(nfa, i, t);
describeEdge64(f, m, t, i);
}
fprintf(f, "}\n");
}
/* Print the acceleration scheme for every state that has one. */
static
void dumpAccelMasks64(FILE *f, const mcsheng64 *m, const mstate_aux *aux) {
fprintf(f, "\n");
fprintf(f, "Acceleration\n");
fprintf(f, "------------\n");
for (u16 i = 0; i < m->state_count; i++) {
if (!aux[i].accel_offset) {
continue; /* state has no accel scheme */
}
auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset);
fprintf(f, "%05hu ", i);
dumpAccelInfo(f, *accel);
}
}
static
void describeAlphabet64(FILE *f, const mcsheng64 *m) {
    /* Print, for each compressed alphabet symbol, the set of raw characters
     * that remap onto it. */
    map<u8, CharReach> rev;
    for (u16 c = 0; c < N_CHARS; c++) {
        rev[m->remap[c]].clear();
    }
    for (u16 c = 0; c < N_CHARS; c++) {
        rev[m->remap[c]].set(c);
    }
    fprintf(f, "\nAlphabet\n");
    for (const auto &ent : rev) {
        fprintf(f, "%3hhu: ", ent.first);
        describeClass(f, ent.second, 10240, CC_OUT_TEXT);
        fprintf(f, "\n");
    }
    fprintf(f, "\n");
}
/* Print the layout/summary header fields shared by the 8- and 16-bit text
 * dumps: report id, state count, length, start states, flags and the sheng
 * region bounds. */
static
void dumpCommonHeader64(FILE *f, const mcsheng64 *m) {
    fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report,
            m->state_count, m->length);
    fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored,
            m->start_floating);
    /* BUG FIX: '!!' binds tighter than '&', so the original expression
     * '!!(int)m->flags & MCSHENG_FLAG_SINGLE' printed
     * (flags != 0) & MCSHENG_FLAG_SINGLE rather than testing the SINGLE
     * bit itself. Parenthesize the mask before normalising to 0/1. */
    fprintf(f, "single accept: %d, has_accel: %d\n",
            !!(m->flags & MCSHENG_FLAG_SINGLE), m->has_accel);
    fprintf(f, "sheng_end: %hu\n", m->sheng_end);
    fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit);
}
/* Text dump of the 8-bit variant: common header, 8-bit limits, alphabet
 * mapping and accel info, followed by the generic reverse dump. */
static
void dump64_text_8(const NFA *nfa, FILE *f) {
auto m = (const mcsheng64 *)getImplNfa(nfa);
auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
fprintf(f, "mcsheng 64-8\n");
dumpCommonHeader64(f, m);
fprintf(f, "accel_limit: %hu, accept_limit %hu\n", m->accel_limit_8,
m->accept_limit_8);
fprintf(f, "\n");
describeAlphabet64(f, m);
dumpAccelMasks64(f, m, aux);
fprintf(f, "\n");
dumpTextReverse(nfa, f);
}
/* Text dump of the 16-bit variant: common header, sherman region bounds,
 * alphabet mapping and accel info, followed by the generic reverse dump. */
static
void dump64_text_16(const NFA *nfa, FILE *f) {
auto *m = (const mcsheng64 *)getImplNfa(nfa);
auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
fprintf(f, "mcsheng 64-16\n");
dumpCommonHeader64(f, m);
fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit,
(int)m->sherman_end);
fprintf(f, "\n");
describeAlphabet64(f, m);
dumpAccelMasks64(f, m, aux);
fprintf(f, "\n");
dumpTextReverse(nfa, f);
}
#endif
void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCSHENG_NFA_16);
dump_text_16(nfa, StdioFile(base + ".txt", "w"));
@ -404,4 +713,20 @@ void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) {
dump_dot_8(nfa, StdioFile(base + ".dot", "w"));
}
/* Public dump entry point: writes <base>.txt and <base>.dot renderings of a
 * 64-16 engine; a no-op when built without AVX512VBMI support. */
void nfaExecMcSheng64_16_dump(UNUSED const NFA *nfa, UNUSED const string &base) {
#if defined(HAVE_AVX512VBMI)
assert(nfa->type == MCSHENG_64_NFA_16);
dump64_text_16(nfa, StdioFile(base + ".txt", "w"));
dump64_dot_16(nfa, StdioFile(base + ".dot", "w"));
#endif
}
/* Public dump entry point: writes <base>.txt and <base>.dot renderings of a
 * 64-8 engine; a no-op when built without AVX512VBMI support. */
void nfaExecMcSheng64_8_dump(UNUSED const NFA *nfa, UNUSED const string &base) {
#if defined(HAVE_AVX512VBMI)
assert(nfa->type == MCSHENG_64_NFA_8);
dump64_text_8(nfa, StdioFile(base + ".txt", "w"));
dump64_dot_8(nfa, StdioFile(base + ".dot", "w"));
#endif
}
} // namespace ue2

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Intel Corporation
* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -42,7 +42,8 @@ namespace ue2 {
void nfaExecMcSheng8_dump(const struct NFA *nfa, const std::string &base);
void nfaExecMcSheng16_dump(const struct NFA *nfa, const std::string &base);
void nfaExecMcSheng64_8_dump(const struct NFA *nfa, const std::string &base);
void nfaExecMcSheng64_16_dump(const struct NFA *nfa, const std::string &base);
} // namespace ue2
#endif // DUMP_SUPPORT

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016-2018, Intel Corporation
* Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -92,4 +92,35 @@ struct mcsheng {
* representing the data from a u64a. */
extern const u64a mcsheng_pext_mask[8];
#if defined(HAVE_AVX512VBMI)
/* Header for the 64-state sheng/mcclellan hybrid; mirrors struct mcsheng
 * above, but the sheng region uses 64-byte m512 successor masks so up to 64
 * states can live in the sheng part. */
struct mcsheng64 {
u16 state_count; /**< total number of states */
u32 length; /**< length of dfa in bytes */
u16 start_anchored; /**< anchored start state */
u16 start_floating; /**< floating start state */
u32 aux_offset; /**< offset of the aux structures relative to the start of
* the nfa structure */
u32 sherman_offset; /**< offset of array of sherman state offsets the
* state_info structures relative to the start of the
* nfa structure */
u32 sherman_end; /**< offset of the end of the state_info structures
* relative to the start of the nfa structure */
u16 sheng_end; /**< first non-sheng state */
u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of
* internal sheng ids */
u16 accel_limit_8; /**< 8 bit, lowest accelerable state */
u16 accept_limit_8; /**< 8 bit, lowest accept state */
u16 sherman_limit; /**< lowest sherman state */
u8 alphaShift; /**< log2 of the compressed alphabet size; succ table rows
* are 1 << alphaShift entries wide */
u8 flags; /**< MCSHENG_FLAG_* bits (e.g. MCSHENG_FLAG_SINGLE) */
u8 has_accel; /**< 1 iff there are any accel plans */
u8 remap[256]; /**< remaps characters to a smaller alphabet */
ReportID arb_report; /**< one of the accepts that this dfa may raise */
u32 accel_offset; /**< offset of accel structures from start of McClellan */
m512 sheng_succ_masks[N_CHARS]; /**< per-character successor masks for the
* sheng region */
};
extern const u64a mcsheng64_pext_mask[8];
#endif
#endif

View File

@ -78,6 +78,8 @@
DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \
default: \
assert(0); \
}

View File

@ -478,6 +478,37 @@ const nfa_dispatch_fn NFATraits<SHENG_NFA_64>::has_repeats_other_than_firsts = d
const char *NFATraits<SHENG_NFA_64>::name = "Sheng 64";
#endif
/* Traits for the 64-state Sheng-McClellan hybrid with 8-bit states. */
template<> struct NFATraits<MCSHENG_64_NFA_8> {
UNUSED static const char *name;
static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 1; /* 8-bit variant: byte-aligned stream state */
static const bool fast = true;
static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts;
};
/* Reuses the generic mcsheng accel query; no bounded repeats in this engine. */
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_8>::has_accel = has_accel_mcsheng;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_8>::has_repeats = dispatch_false;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_8>::has_repeats_other_than_firsts = dispatch_false;
#if defined(DUMP_SUPPORT)
const char *NFATraits<MCSHENG_64_NFA_8>::name = "Shengy64 McShengFace 8";
#endif
/* Traits for the 64-state Sheng-McClellan hybrid with 16-bit states. */
template<> struct NFATraits<MCSHENG_64_NFA_16> {
UNUSED static const char *name;
static const NFACategory category = NFA_OTHER;
static const u32 stateAlign = 2; /* 16-bit variant: u16-aligned stream state */
static const bool fast = true;
static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats;
static const nfa_dispatch_fn has_repeats_other_than_firsts;
};
/* Reuses the generic mcsheng accel query; no bounded repeats in this engine. */
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_16>::has_accel = has_accel_mcsheng;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_16>::has_repeats = dispatch_false;
const nfa_dispatch_fn NFATraits<MCSHENG_64_NFA_16>::has_repeats_other_than_firsts = dispatch_false;
#if defined(DUMP_SUPPORT)
const char *NFATraits<MCSHENG_64_NFA_16>::name = "Shengy64 McShengFace 16";
#endif
} // namespace
#if defined(DUMP_SUPPORT)

View File

@ -83,6 +83,8 @@ namespace ue2 {
DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \
DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \
DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \
default: \
assert(0); \
}

View File

@ -74,6 +74,8 @@ enum NFAEngineType {
MCSHENG_NFA_16, /**< magic pseudo nfa */
SHENG_NFA_32, /**< magic pseudo nfa */
SHENG_NFA_64, /**< magic pseudo nfa */
MCSHENG_64_NFA_8, /**< magic pseudo nfa */
MCSHENG_64_NFA_16, /**< magic pseudo nfa */
/** \brief bogus NFA - not used */
INVALID_NFA
};
@ -150,7 +152,12 @@ static really_inline int isMcClellanType(u8 t) {
/** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid
* DFA. */
static really_inline int isShengMcClellanType(u8 t) {
/* The 64-state hybrid engine types only exist on AVX512VBMI builds, so
 * they are only recognised there. */
#if defined(HAVE_AVX512VBMI)
switch (t) {
case MCSHENG_NFA_8:
case MCSHENG_NFA_16:
case MCSHENG_64_NFA_8:
case MCSHENG_64_NFA_16:
return 1;
default:
return 0;
}
#else
return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16;
#endif
}
/** \brief True if the given type (from NFA::type) is a Gough DFA. */

View File

@ -632,6 +632,7 @@ bytecode_ptr<NFA> getDfa(raw_dfa &rdfa, bool is_transient,
* bytecode and that they are usually run on small blocks */
dfa = mcshengCompile(rdfa, cc, rm);
}
#if defined(HAVE_AVX512VBMI)
if (!dfa) {
dfa = sheng32Compile(rdfa, cc, rm, false);
@ -639,6 +640,9 @@ bytecode_ptr<NFA> getDfa(raw_dfa &rdfa, bool is_transient,
if (!dfa) {
dfa = sheng64Compile(rdfa, cc, rm, false);
}
if (!dfa && !is_transient) {
dfa = mcshengCompile64(rdfa, cc, rm);
}
#endif
if (!dfa) {
// Sheng wasn't successful, so unleash McClellan!

View File

@ -108,6 +108,12 @@ m128 lshift64_m128(m128 a, unsigned b) {
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
#if defined(HAVE_AVX512)
static really_inline m128 cast512to128(const m512 in) {
// Reinterpret the low 128 bits of a 512-bit vector; a cast, not a move.
return _mm512_castsi512_si128(in);
}
#endif
static really_inline m128 set1_16x8(u8 c) {
// Broadcast byte c to all 16 lanes of a 128-bit vector.
return _mm_set1_epi8(c);
}
@ -165,6 +171,10 @@ m128 load_m128_from_u64a(const u64a *p) {
#endif // !AVX2
static really_inline m128 add128(m128 a, m128 b) {
// Lane-wise 64-bit addition (two u64 lanes per 128-bit vector).
return _mm_add_epi64(a, b);
}
static really_inline m128 and128(m128 a, m128 b) {
// Bitwise AND of two 128-bit vectors.
return _mm_and_si128(a,b);
}
@ -352,6 +362,10 @@ static really_inline m256 ones256(void) {
return rv;
}
static really_inline m256 add256(m256 a, m256 b) {
// Lane-wise 64-bit addition (four u64 lanes per 256-bit vector).
return _mm256_add_epi64(a, b);
}
static really_inline m256 and256(m256 a, m256 b) {
// Bitwise AND of two 256-bit vectors.
return _mm256_and_si256(a, b);
}
@ -562,6 +576,12 @@ static really_inline u32 movd512(const m512 in) {
return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
}
static really_inline u64a movq512(const m512 in) {
// Extract the low 64 bits of a 512-bit vector.
// NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
// so we use a two-step conversion (cast to m128, then extract) instead.
return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
}
static really_inline
m512 pshufb_m512(m512 a, m512 b) {
return _mm512_shuffle_epi8(a, b);
@ -606,6 +626,11 @@ m512 set1_8x64(u64a a) {
return _mm512_set1_epi64(a);
}
static really_inline
m512 set16x32(u32 a) {
// Broadcast a 32-bit value to all 16 lanes of a 512-bit vector.
// NOTE(review): name breaks the set1_<lanes>x<width> convention used by
// set1_16x8/set1_8x64 nearby; a rename to set1_16x32 would be consistent.
return _mm512_set1_epi32(a);
}
static really_inline
m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
@ -624,6 +649,31 @@ m512 set1_4x128(m128 a) {
return _mm512_broadcast_i32x4(a);
}
static really_inline
m512 sadd_u8_m512(m512 a, m512 b) {
// Lane-wise unsigned saturating byte addition (clamps at 0xff).
return _mm512_adds_epu8(a, b);
}
static really_inline
m512 max_u8_m512(m512 a, m512 b) {
// Lane-wise unsigned byte maximum.
return _mm512_max_epu8(a, b);
}
static really_inline
m512 min_u8_m512(m512 a, m512 b) {
// Lane-wise unsigned byte minimum.
return _mm512_min_epu8(a, b);
}
static really_inline
m512 sub_u8_m512(m512 a, m512 b) {
// Lane-wise byte subtraction. _mm512_sub_epi8 is modular, so the result
// is the same for signed and unsigned interpretations (non-saturating).
return _mm512_sub_epi8(a, b);
}
static really_inline
m512 add512(m512 a, m512 b) {
// Lane-wise 64-bit addition (eight u64 lanes), matching add128/add256.
// BUG FIX: _mm512_add_epu64 is not an AVX-512 intrinsic — plain vector
// addition only has the _epi forms; _mm512_add_epi64 performs the same
// modular add for both signed and unsigned lanes.
return _mm512_add_epi64(a, b);
}
static really_inline
m512 and512(m512 a, m512 b) {
return _mm512_and_si512(a, b);