diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52d54955..9062c287 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -448,10 +448,6 @@ set (hs_exec_SRCS
     src/nfa/lbr.h
     src/nfa/lbr_common_impl.h
     src/nfa/lbr_internal.h
-    src/nfa/mcclellan.c
-    src/nfa/mcclellan.h
-    src/nfa/mcclellan_common_impl.h
-    src/nfa/mcclellan_internal.h
     src/nfa/limex_accel.c
     src/nfa/limex_accel.h
     src/nfa/limex_exceptional.h
@@ -470,6 +466,14 @@ set (hs_exec_SRCS
     src/nfa/limex_runtime_impl.h
     src/nfa/limex_shuffle.h
     src/nfa/limex_state_impl.h
+    src/nfa/mcclellan.c
+    src/nfa/mcclellan.h
+    src/nfa/mcclellan_common_impl.h
+    src/nfa/mcclellan_internal.h
+    src/nfa/mcsheng.c
+    src/nfa/mcsheng_data.c
+    src/nfa/mcsheng.h
+    src/nfa/mcsheng_internal.h
     src/nfa/mpv.h
     src/nfa/mpv.c
     src/nfa/mpv_internal.h
@@ -650,6 +654,8 @@ SET (hs_SRCS
     src/nfa/mcclellancompile.h
     src/nfa/mcclellancompile_util.cpp
     src/nfa/mcclellancompile_util.h
+    src/nfa/mcsheng_compile.cpp
+    src/nfa/mcsheng_compile.h
     src/nfa/limex_compile.cpp
     src/nfa/limex_compile.h
     src/nfa/limex_accel.h
@@ -667,6 +673,8 @@ SET (hs_SRCS
     src/nfa/nfa_internal.h
     src/nfa/nfa_kind.h
     src/nfa/rdfa.h
+    src/nfa/rdfa_graph.cpp
+    src/nfa/rdfa_graph.h
     src/nfa/rdfa_merge.cpp
     src/nfa/rdfa_merge.h
     src/nfa/repeat_internal.h
@@ -962,6 +970,8 @@ set(hs_dump_SRCS
     src/nfa/limex_dump.cpp
     src/nfa/mcclellandump.cpp
     src/nfa/mcclellandump.h
+    src/nfa/mcsheng_dump.cpp
+    src/nfa/mcsheng_dump.h
     src/nfa/mpv_dump.cpp
     src/nfa/nfa_dump_api.h
     src/nfa/nfa_dump_dispatch.cpp
diff --git a/src/grey.cpp b/src/grey.cpp
index bad56b56..340a34bf 100644
--- a/src/grey.cpp
+++ b/src/grey.cpp
@@ -51,6 +51,7 @@ Grey::Grey(void) :
                    allowLbr(true),
                    allowMcClellan(true),
                    allowSheng(true),
+                   allowMcSheng(true),
                    allowPuff(true),
                    allowLiteral(true),
                    allowRose(true),
@@ -217,6 +218,7 @@ void applyGreyOverrides(Grey *g, const string &s) {
         G_UPDATE(allowLbr);
         G_UPDATE(allowMcClellan);
         G_UPDATE(allowSheng);
+        G_UPDATE(allowMcSheng);
         G_UPDATE(allowPuff);
         G_UPDATE(allowLiteral);
         G_UPDATE(allowRose);
diff --git a/src/grey.h b/src/grey.h
index 90f5f826..4882af7d 100644
--- a/src/grey.h
+++ b/src/grey.h
@@ -51,6 +51,7 @@ struct Grey {
     bool allowLbr;
     bool allowMcClellan;
     bool allowSheng;
+    bool allowMcSheng;
     bool allowPuff;
     bool allowLiteral;
     bool allowRose;
diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c
index f883973e..c74c7079 100644
--- a/src/nfa/limex_accel.c
+++ b/src/nfa/limex_accel.c
@@ -78,7 +78,7 @@ size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux,
 size_t doAccel32(u32 s, u32 accel, const u8 *accelTable,
                  const union AccelAux *aux, const u8 *input, size_t i,
                  size_t end) {
-    u32 idx = packedExtract32(s, accel);
+    u32 idx = pext32(s, accel);
     return accelScanWrapper(accelTable, aux, input, idx, i, end);
 }
 
@@ -86,14 +86,14 @@ size_t doAccel32(u32 s, u32 accel, const u8 *accelTable,
 size_t doAccel64(u64a s, u64a accel, const u8 *accelTable,
                  const union AccelAux *aux, const u8 *input, size_t i,
                  size_t end) {
-    u32 idx = packedExtract64(s, accel);
+    u32 idx = pext64(s, accel);
     return accelScanWrapper(accelTable, aux, input, idx, i, end);
 }
 #else
 size_t doAccel64(m128 s, m128 accel, const u8 *accelTable,
                  const union AccelAux *aux, const u8 *input, size_t i,
                  size_t end) {
-    u32 idx = packedExtract64(movq(s), movq(accel));
+    u32 idx = pext64(movq(s), movq(accel));
     return accelScanWrapper(accelTable, aux, input, idx, i, end);
 }
 #endif
diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h
index e45e4331..5ca8fce0 100644
--- a/src/nfa/limex_shuffle.h
+++ b/src/nfa/limex_shuffle.h
@@ -41,52 +41,6 @@
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 
-#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__))
-#define HAVE_PEXT
-#endif
-
-static really_inline
-u32 packedExtract32(u32 x, u32 mask) {
-#if defined(HAVE_PEXT)
-    // Intel BMI2 can do this operation in one instruction.
-    return _pext_u32(x, mask);
-#else
-
-    u32 result = 0, num = 1;
-    while (mask != 0) {
-        u32 bit = findAndClearLSB_32(&mask);
-        if (x & (1U << bit)) {
-            assert(num != 0); // more than 32 bits!
-            result |= num;
-        }
-        num <<= 1;
-    }
-    return result;
-#endif
-}
-
-static really_inline
-u32 packedExtract64(u64a x, u64a mask) {
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
-    // Intel BMI2 can do this operation in one instruction.
-    return _pext_u64(x, mask);
-#else
-
-    u32 result = 0, num = 1;
-    while (mask != 0) {
-        u32 bit = findAndClearLSB_64(&mask);
-        if (x & (1ULL << bit)) {
-            assert(num != 0); // more than 32 bits!
-            result |= num;
-        }
-        num <<= 1;
-    }
-    return result;
-#endif
-}
-
-#undef HAVE_PEXT
-
 static really_inline
 u32 packedExtract128(m128 s, const m128 permute, const m128 compare) {
     m128 shuffled = pshufb(s, permute);
diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c
index 63f5f535..584670c2 100644
--- a/src/nfa/mcclellan.c
+++ b/src/nfa/mcclellan.c
@@ -175,7 +175,7 @@ char mcclellanExec16_i(const struct mcclellan *m, u32 *state, const u8 *buf,
         if (mode == STOP_AT_MATCH) {
             *c_final = buf;
         }
-        return MO_CONTINUE_MATCHING;
+        return MO_ALIVE;
     }
 
     u32 s = *state;
@@ -213,7 +213,7 @@ without_accel:
             if (mode == STOP_AT_MATCH) {
                 *state = s & STATE_MASK;
                 *c_final = c - 1;
-                return MO_CONTINUE_MATCHING;
+                return MO_MATCHES_PENDING;
             }
 
             u64a loc = (c - 1) - buf + offAdj + 1;
@@ -221,12 +221,12 @@ without_accel:
             if (single) {
                 DEBUG_PRINTF("reporting %u\n", m->arb_report);
                 if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING; /* termination requested */
+                    return MO_DEAD; /* termination requested */
                 }
             } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0,
                                        &cached_accept_state, &cached_accept_id)
                        == MO_HALT_MATCHING) {
-                return MO_HALT_MATCHING;
+                return MO_DEAD;
             }
         }
 
@@ -265,7 +265,7 @@ with_accel:
             if (mode == STOP_AT_MATCH) {
                 *state = s & STATE_MASK;
                 *c_final = c - 1;
-                return MO_CONTINUE_MATCHING;
+                return MO_MATCHES_PENDING;
             }
 
             u64a loc = (c - 1) - buf + offAdj + 1;
@@ -273,12 +273,12 @@ with_accel:
             if (single) {
                 DEBUG_PRINTF("reporting %u\n", m->arb_report);
                 if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING; /* termination requested */
+                    return MO_DEAD; /* termination requested */
                 }
             } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0,
                                        &cached_accept_state, &cached_accept_id)
                        == MO_HALT_MATCHING) {
-                return MO_HALT_MATCHING;
+                return MO_DEAD;
             }
         }
 
@@ -293,7 +293,7 @@ exit:
     }
     *state = s;
 
-    return MO_CONTINUE_MATCHING;
+    return MO_ALIVE;
 }
 
 static never_inline
@@ -376,7 +376,7 @@ char mcclellanExec8_i(const struct mcclellan *m, u32 *state, const u8 *buf,
                       char single, const u8 **c_final, enum MatchMode mode) {
     if (!len) {
         *c_final = buf;
-        return MO_CONTINUE_MATCHING;
+        return MO_ALIVE;
     }
     u32 s = *state;
     const u8 *c = buf;
@@ -390,8 +390,7 @@ char mcclellanExec8_i(const struct mcclellan *m, u32 *state, const u8 *buf,
     u32 cached_accept_id = 0;
     u32 cached_accept_state = 0;
 
-    DEBUG_PRINTF("accel %hu, accept %hu\n",
-                 m->accel_limit_8, m->accept_limit_8);
+    DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit);
 
     DEBUG_PRINTF("s: %u, len %zu\n", s, len);
 
@@ -417,19 +416,19 @@ without_accel:
                 DEBUG_PRINTF("match - pausing\n");
                 *state = s;
                 *c_final = c - 1;
-                return MO_CONTINUE_MATCHING;
+                return MO_MATCHES_PENDING;
             }
 
             u64a loc = (c - 1) - buf + offAdj + 1;
             if (single) {
                 DEBUG_PRINTF("reporting %u\n", m->arb_report);
                 if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING;
+                    return MO_DEAD;
                 }
             } else if (doComplexReport(cb, ctxt, m, s, loc, 0,
                                        &cached_accept_state, &cached_accept_id)
                        == MO_HALT_MATCHING) {
-                return MO_HALT_MATCHING;
+                return MO_DEAD;
             }
         }
 
@@ -464,19 +463,19 @@ with_accel:
                 DEBUG_PRINTF("match - pausing\n");
                 *state = s;
                 *c_final = c - 1;
-                return MO_CONTINUE_MATCHING;
+                return MO_MATCHES_PENDING;
             }
 
             u64a loc = (c - 1) - buf + offAdj + 1;
             if (single) {
                 DEBUG_PRINTF("reporting %u\n", m->arb_report);
                 if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
-                    return MO_HALT_MATCHING;
+                    return MO_DEAD;
                 }
             } else if (doComplexReport(cb, ctxt, m, s, loc, 0,
                                        &cached_accept_state, &cached_accept_id)
                        == MO_HALT_MATCHING) {
-                return MO_HALT_MATCHING;
+                return MO_DEAD;
             }
         }
 
@@ -488,7 +487,7 @@ exit:
     if (mode == STOP_AT_MATCH) {
         *c_final = c_end;
     }
-    return MO_CONTINUE_MATCHING;
+    return MO_ALIVE;
 }
 
 static never_inline
@@ -576,7 +575,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         q->report_current = 0;
 
         if (rv == MO_HALT_MATCHING) {
-            return MO_HALT_MATCHING;
+            return MO_DEAD;
         }
     }
 
@@ -611,17 +610,20 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
 
         /* do main buffer region */
         const u8 *final_look;
-        if (mcclellanExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp,
-                                 offset + sp, cb, context, single, &final_look,
-                                 mode)
-            == MO_HALT_MATCHING) {
+        char rv = mcclellanExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp,
+                                      offset + sp, cb, context, single,
+                                      &final_look, mode);
+        if (rv == MO_DEAD) {
             *(u16 *)q->state = 0;
-            return 0;
+            return MO_DEAD;
         }
-        if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) {
+        if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) {
             DEBUG_PRINTF("this is as far as we go\n");
-            assert(q->cur);
             DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf);
+
+            assert(q->cur);
+            assert(final_look != cur_buf + local_ep);
+
             q->cur--;
             q->items[q->cur].type = MQE_START;
             q->items[q->cur].location = final_look - cur_buf + 1; /* due to
@@ -630,6 +632,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
             return MO_MATCHES_PENDING;
         }
 
+        assert(rv == MO_ALIVE);
         assert(q->cur);
         if (mode != NO_MATCHES && q->items[q->cur].location > end) {
             DEBUG_PRINTF("this is as far as we go\n");
@@ -662,7 +665,7 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         case MQE_END:
             *(u16 *)q->state = s;
             q->cur++;
-            return s ? MO_ALIVE : 0;
+            return s ? MO_ALIVE : MO_DEAD;
         default:
             assert(!"invalid queue event");
         }
@@ -681,8 +684,8 @@ char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, const u8 *buffer,
 
     if (mcclellanExec16_i(m, &s, buffer, length, offset, cb, context, single,
                           NULL, CALLBACK_OUTPUT)
-        == MO_HALT_MATCHING) {
-        return 0;
+        == MO_DEAD) {
+        return MO_DEAD; /* callback requested termination */
     }
 
     const struct mstate_aux *aux = get_aux(m, s);
@@ -691,7 +694,7 @@ char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, const u8 *buffer,
         doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL);
     }
 
-    return !!s;
+    return s ? MO_ALIVE : MO_DEAD; /* state 0 is the dead state */
 }
 }
 
 static really_inline
@@ -724,7 +727,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         q->report_current = 0;
 
         if (rv == MO_HALT_MATCHING) {
-            return MO_HALT_MATCHING;
+            return MO_DEAD;
         }
     }
 
@@ -760,16 +763,20 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         }
 
         const u8 *final_look;
-        if (mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, offset + sp,
-                                cb, context, single, &final_look, mode)
-            == MO_HALT_MATCHING) {
+        char rv = mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp,
+                                     offset + sp, cb, context, single,
+                                     &final_look, mode);
+        if (rv == MO_DEAD) { /* exec now reports halt as MO_DEAD, as in the 16-bit path */
             *(u8 *)q->state = 0;
-            return 0;
+            return MO_DEAD;
         }
-        if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) {
-            /* found a match */
-            DEBUG_PRINTF("found a match\n");
+        if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) {
+            DEBUG_PRINTF("this is as far as we go\n");
+            DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf);
+
             assert(q->cur);
+            assert(final_look != cur_buf + local_ep);
+
             q->cur--;
             q->items[q->cur].type = MQE_START;
             q->items[q->cur].location = final_look - cur_buf + 1; /* due to
@@ -778,6 +785,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
             return MO_MATCHES_PENDING;
         }
 
+        assert(rv == MO_ALIVE);
         assert(q->cur);
         if (mode != NO_MATCHES && q->items[q->cur].location > end) {
             DEBUG_PRINTF("this is as far as we go\n");
@@ -811,7 +819,7 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         case MQE_END:
             *(u8 *)q->state = s;
             q->cur++;
-            return s ? MO_ALIVE : 0;
+            return s ? MO_ALIVE : MO_DEAD;
         default:
             assert(!"invalid queue event");
         }
@@ -830,8 +838,8 @@ char nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer,
 
     if (mcclellanExec8_i(m, &s, buffer, length, offset, cb, context, single,
                          NULL, CALLBACK_OUTPUT)
-        == MO_HALT_MATCHING) {
-        return 0;
+        == MO_DEAD) {
+        return MO_DEAD;
     }
 
     const struct mstate_aux *aux = get_aux(m, s);
@@ -840,7 +848,7 @@ char nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer,
         doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL);
     }
 
-    return s;
+    return s ? MO_ALIVE : MO_DEAD;
 }
 
 char nfaExecMcClellan8_B(const struct NFA *n, u64a offset, const u8 *buffer,
diff --git a/src/nfa/mcclellan_internal.h b/src/nfa/mcclellan_internal.h
index 4a27aadb..549bccf5 100644
--- a/src/nfa/mcclellan_internal.h
+++ b/src/nfa/mcclellan_internal.h
@@ -71,17 +71,17 @@ struct mcclellan {
     u16 start_floating; /**< floating start state */
     u32 aux_offset; /**< offset of the aux structures relative to the start of
                      *  the nfa structure */
-    u32 sherman_offset; /**< offset of to array of sherman state offsets
-                      * the state_info structures relative to the start of the
-                      * nfa structure */
-    u32 sherman_end; /**< offset of the end of the state_info structures relative
-                   *  to the start of the nfa structure */
+    u32 sherman_offset; /**< offset of array of sherman state offsets the
+                         * state_info structures relative to the start of the
+                         * nfa structure */
+    u32 sherman_end; /**< offset of the end of the state_info structures
+                      *  relative to the start of the nfa structure */
     u16 accel_limit_8; /**< 8 bit, lowest accelerable state */
     u16 accept_limit_8; /**< 8 bit, lowest accept state */
     u16 sherman_limit; /**< lowest sherman state */
     u8  alphaShift;
     u8  flags;
-    u8  has_accel; /**< 1 iff there are any accel planes */
+    u8  has_accel; /**< 1 iff there are any accel plans */
     u8  remap[256]; /**< remaps characters to a smaller alphabet */
     ReportID arb_report; /**< one of the accepts that this dfa may raise */
     u32 accel_offset; /**< offset of the accel structures from start of NFA */
diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp
index 09006d5b..7a73c9d4 100644
--- a/src/nfa/mcclellancompile.cpp
+++ b/src/nfa/mcclellancompile.cpp
@@ -415,9 +415,9 @@ void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info,
                              : info.raw.start_floating);
 }
 
-/* returns non-zero on error */
+/* returns false on error */
 static
-int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) {
+bool allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) {
     info.states[0].impl_id = 0; /* dead is always 0 */
 
     vector<dstate_id_t> norm;
@@ -426,7 +426,7 @@ int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) {
     if (info.size() > (1 << 16)) {
         DEBUG_PRINTF("too many states\n");
         *sherman_base = 0;
-        return 1;
+        return false;
     }
 
     for (u32 i = 1; i < info.size(); i++) {
@@ -452,7 +452,7 @@ int allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) {
     /* Check to see if we haven't over allocated our states */
     DEBUG_PRINTF("next sherman %u masked %u\n", next_sherman,
                  (dstate_id_t)(next_sherman & STATE_MASK));
-    return (next_sherman - 1) != ((next_sherman - 1) & STATE_MASK);
+    return (next_sherman - 1) == ((next_sherman - 1) & STATE_MASK);
 }
 
 static
@@ -470,7 +470,7 @@ aligned_unique_ptr<NFA> mcclellanCompile16(dfa_info &info,
     assert(alphaShift <= 8);
 
     u16 count_real_states;
-    if (allocateFSN16(info, &count_real_states)) {
+    if (!allocateFSN16(info, &count_real_states)) {
         DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
                      info.size());
         return nullptr;
diff --git a/src/nfa/mcclellancompile.h b/src/nfa/mcclellancompile.h
index e6f548a7..8d8dfb19 100644
--- a/src/nfa/mcclellancompile.h
+++ b/src/nfa/mcclellancompile.h
@@ -32,9 +32,7 @@
 #include "accel_dfa_build_strat.h"
 #include "rdfa.h"
 #include "ue2common.h"
-#include "util/accel_scheme.h"
 #include "util/alloc.h"
-#include "util/charreach.h"
 #include "util/ue2_containers.h"
 
 #include <memory>
diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c
new file mode 100644
index 00000000..98db3f0a
--- /dev/null
+++ b/src/nfa/mcsheng.c
@@ -0,0 +1,1406 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mcsheng.h"
+
+#include "accel.h"
+#include "mcsheng_internal.h"
+#include "nfa_api.h"
+#include "nfa_api_queue.h"
+#include "nfa_internal.h"
+#include "util/bitutils.h"
+#include "util/compare.h"
+#include "util/simd_utils.h"
+#include "ue2common.h"
+
+enum MatchMode { /* scan output modes; NOTE(review): users of these values lie outside this view */
+    CALLBACK_OUTPUT, /* presumably: deliver each match through the callback - confirm */
+    STOP_AT_MATCH,   /* presumably: pause at a match so the caller can resume later - confirm */
+    NO_MATCHES       /* presumably: advance state only, raising no reports - confirm */
+};
+
+static really_inline
+const struct mstate_aux *get_aux(const struct mcsheng *m, u32 s) { /* returns the aux entry for state s */
+    const char *nfa = (const char *)m - sizeof(struct NFA); /* m is assumed to sit directly after a struct NFA */
+    const struct mstate_aux *aux
+        = s + (const struct mstate_aux *)(nfa + m->aux_offset); /* aux_offset is applied from that NFA base */
+
+    assert(ISALIGNED(aux));
+    return aux;
+}
+
+static really_inline
+u32 mcshengEnableStarts(const struct mcsheng *m, u32 s) { /* returns the `top` field of state s's aux entry */
+    const struct mstate_aux *aux = get_aux(m, s);
+
+    DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top);
+    return aux->top; /* NOTE(review): presumably the state to enter when a top event arrives - confirm */
+}
+
+static really_inline
+u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
+                u32 as) { /* next state out of a sherman state on remapped char cprime */
+    assert(ISALIGNED_N(sherman_state, 16));
+
+    u8 len = *(const u8 *)(sherman_state + SHERMAN_LEN_OFFSET); /* number of chars stored in this state */
+
+    if (len) {
+        m128 ss_char = load128(sherman_state); /* first 16 bytes: 4 header bytes then stored chars */
+        m128 cur_char = set16x8(cprime);
+
+        u32 z = movemask128(eq128(ss_char, cur_char)); /* presumably one bit per byte equal to cprime */
+
+        /* remove header cruft: type 1, len 1, daddy 2 */
+        z &= ~0xf;
+        z &= (1U << (len + 4)) - 1; /* drop bytes beyond the len stored chars */
+
+        if (z) {
+            u32 i = ctz32(z & ~0xf) - 4; /* index of the lowest matching stored char */
+
+            u32 s_out = unaligned_load_u16((const u8 *)sherman_state
+                                           + SHERMAN_STATES_OFFSET(len)
+                                           + sizeof(u16) * i);
+            DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu s=%u\n", i,
+                         len, cprime, s_out);
+            return s_out;
+        }
+    }
+
+    u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); /* no stored char matched */
+    return succ_table[(daddy << as) + cprime]; /* fall back to daddy's row in succ_table */
+}
+
+static really_inline
+char doComplexReport(NfaCallback cb, void *ctxt, const struct mcsheng *m,
+                     u32 s, u64a loc, char eod, u32 *cached_accept_state,
+                     u32 *cached_accept_id) { /* fires the report(s) of state s at loc through cb */
+    DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n",
+                 s & STATE_MASK, loc, eod);
+
+    if (!eod && s == *cached_accept_state) { /* fast path; the cache pointers are never read when eod is set */
+        if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) {
+            return MO_HALT_MATCHING; /* termination requested */
+        }
+
+        return MO_CONTINUE_MATCHING; /* continue execution */
+    }
+
+    const struct mstate_aux *aux = get_aux(m, s);
+    size_t offset = eod ? aux->accept_eod : aux->accept; /* choose the eod or the normal report list */
+
+    assert(offset);
+    const struct report_list *rl
+        = (const void *)((const char *)m + offset - sizeof(struct NFA)); /* offset is applied from m - sizeof(struct NFA) */
+    assert(ISALIGNED(rl));
+
+    DEBUG_PRINTF("report list size %u\n", rl->count);
+    u32 count = rl->count;
+
+    if (!eod && count == 1) { /* single report: remember it so the next hit on s skips the lookup */
+        *cached_accept_state = s;
+        *cached_accept_id = rl->report[0];
+
+        DEBUG_PRINTF("reporting %u\n", rl->report[0]);
+        if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) {
+            return MO_HALT_MATCHING; /* termination requested */
+        }
+
+        return MO_CONTINUE_MATCHING; /* continue execution */
+    }
+
+    for (u32 i = 0; i < count; i++) { /* eod or multi-report list: fire each entry in order, uncached */
+        DEBUG_PRINTF("reporting %u\n", rl->report[i]);
+        if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) {
+            return MO_HALT_MATCHING; /* termination requested */
+        }
+    }
+
+    return MO_CONTINUE_MATCHING; /* continue execution */
+}
+
+#define SHENG_CHUNK 8
+
+static really_inline
+u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
+            const u8 *hard_c_end, u32 s_in, char do_accel) {
+    assert(s_in < m->sheng_end);
+    assert(s_in); /* should not already be dead */
+    assert(soft_c_end <= hard_c_end);
+    DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
+    m128 s = set16x8(s_in - 1);
+    const u8 *c = *c_inout;
+    const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
+    if (!do_accel) {
+        c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1);
+    }
+    const m128 *masks = m->sheng_masks;
+    u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */
+    u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit;
+
+    /* When we use movd to get a u32 containing our state, it will have 4 lanes
+     * all duplicating the state. We can create versions of our limits with 4
+     * copies to directly compare against, this prevents us generating code to
+     * extract a single copy of the state from the u32 for checking. */
+    u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101;
+
+#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+    u32 sheng_limit_x4 = sheng_limit * 0x01010101;
+    m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
+    m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
+    DEBUG_PRINTF("end %hu, accel %hhu --> limit %hhu\n", sheng_limit,
+                 m->sheng_accel_limit, sheng_stop_limit);
+#endif
+
+#define SHENG_SINGLE_ITER do {                                          \
+        m128 shuffle_mask = masks[*(c++)];                              \
+        s = pshufb(shuffle_mask, s);                                    \
+        u32 s_gpr_x4 = movd(s); /* state replicated in all 4 bytes */   \
+        DEBUG_PRINTF("c %hhu (%c) --> s %hhu\n", c[-1], c[-1], (u8)s_gpr_x4); \
+        if (s_gpr_x4 >= sheng_stop_limit_x4) {                          \
+            s_gpr = s_gpr_x4;                                           \
+            goto exit;                                                  \
+        }                                                               \
+    } while (0)
+
+    u8 s_gpr;
+    while (c < c_end) {
+#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+        /* This version uses pext for efficiently bitbashing out scaled
+         * versions of the bytes to process from a u64a */
+
+        u64a data_bytes = unaligned_load_u64a(c);
+        u64a cc0 = pdep64(data_bytes, 0xff0); /* extract scaled low byte */
+        data_bytes &= ~0xffULL; /* clear low bits for scale space */
+        m128 shuffle_mask0 = load128((const char *)masks + cc0);
+        s = pshufb(shuffle_mask0, s);
+        m128 s_max = s;
+        m128 s_max0 = s_max;
+        DEBUG_PRINTF("c %02llx --> s %hhu\n", cc0 >> 4, movd(s));
+
+#define SHENG_SINGLE_UNROLL_ITER(iter)                                  \
+        assert(iter);                                                   \
+        u64a cc##iter = pext64(data_bytes, mcsheng_pext_mask[iter]);    \
+        assert(cc##iter == (u64a)c[iter] << 4);                         \
+        m128 shuffle_mask##iter = load128((const char *)masks + cc##iter); \
+        s = pshufb(shuffle_mask##iter, s);                              \
+        if (do_accel && iter == 7) {                                    \
+            /* in the final iteration we also have to check against accel */ \
+            m128 s_temp = sadd_u8_m128(s, accel_delta);                 \
+            s_max = max_u8_m128(s_max, s_temp);                         \
+        } else {                                                        \
+            s_max = max_u8_m128(s_max, s);                              \
+        }                                                               \
+        m128 s_max##iter = s_max;                                       \
+        DEBUG_PRINTF("c %02llx --> s %hhu max %hhu\n", cc##iter >> 4,   \
+                     movd(s), movd(s_max));
+
+        SHENG_SINGLE_UNROLL_ITER(1);
+
+        SHENG_SINGLE_UNROLL_ITER(2);
+        SHENG_SINGLE_UNROLL_ITER(3);
+
+        SHENG_SINGLE_UNROLL_ITER(4);
+        SHENG_SINGLE_UNROLL_ITER(5);
+
+        SHENG_SINGLE_UNROLL_ITER(6);
+        SHENG_SINGLE_UNROLL_ITER(7);
+
+        if (movd(s_max7) >= sheng_limit_x4) {
+            DEBUG_PRINTF("exit found\n");
+
+            /* Explicitly check the last byte first: it is the most likely
+             * exit as it also checks for acceleration. */
+            if (movd(s_max6) < sheng_limit_x4) {
+                c += SHENG_CHUNK;
+                s_gpr = movq(s);
+                assert(s_gpr >= sheng_stop_limit);
+                goto exit;
+            }
+
+            /* use shift-xor to create a register containing all of the max
+             * values */
+            m128 blended = rshift64_m128(s_max0, 56);
+            blended = xor128(blended, rshift64_m128(s_max1, 48));
+            blended = xor128(blended, rshift64_m128(s_max2, 40));
+            blended = xor128(blended, rshift64_m128(s_max3, 32));
+            blended = xor128(blended, rshift64_m128(s_max4, 24));
+            blended = xor128(blended, rshift64_m128(s_max5, 16));
+            blended = xor128(blended, rshift64_m128(s_max6, 8));
+            blended = xor128(blended, s);
+            blended = xor128(blended, rshift64_m128(blended, 8));
+            DEBUG_PRINTF("blended %016llx\n", movq(blended));
+
+            m128 final = min_u8_m128(blended, simd_stop_limit);
+            m128 cmp = sub_u8_m128(final, simd_stop_limit);
+            u64a stops = ~movemask128(cmp);
+            assert(stops);
+            u32 earliest = ctz32(stops);
+            DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest);
+            assert(earliest < 8);
+            c += earliest + 1;
+            s_gpr = movq(blended) >> (earliest * 8);
+            assert(s_gpr >= sheng_stop_limit);
+            goto exit;
+        } else {
+            c += SHENG_CHUNK;
+        }
+#else
+        SHENG_SINGLE_ITER;
+        SHENG_SINGLE_ITER;
+        SHENG_SINGLE_ITER;
+        SHENG_SINGLE_ITER;
+
+        SHENG_SINGLE_ITER;
+        SHENG_SINGLE_ITER;
+        SHENG_SINGLE_ITER;
+        SHENG_SINGLE_ITER;
+#endif
+    }
+
+    assert(c_end - c < SHENG_CHUNK);
+    if (c < soft_c_end) {
+        assert(soft_c_end - c < SHENG_CHUNK);
+        switch (soft_c_end - c) {
+        case 7:
+            SHENG_SINGLE_ITER;
+        case 6:
+            SHENG_SINGLE_ITER;
+        case 5:
+            SHENG_SINGLE_ITER;
+        case 4:
+            SHENG_SINGLE_ITER;
+        case 3:
+            SHENG_SINGLE_ITER;
+        case 2:
+            SHENG_SINGLE_ITER;
+        case 1:
+            SHENG_SINGLE_ITER;
+        }
+    }
+
+    assert(c >= soft_c_end);
+
+    s_gpr = movd(s);
+exit:
+    assert(c <= hard_c_end);
+    DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr);
+    assert(c >= soft_c_end || s_gpr >= sheng_stop_limit);
+    /* undo state adjustment to match mcclellan view */
+    if (s_gpr == sheng_limit) {
+        s_gpr = 0;
+    } else if (s_gpr < sheng_limit) {
+        s_gpr++;
+    }
+
+    *c_inout = c;
+    return s_gpr;
+}
+
+/* Returns a pointer to the sherman record belonging to state s.
+ *
+ * Records are SHERMAN_FIXED_SIZE bytes apart, starting at sherman_base_offset
+ * with the record for state sherman_base. m is only consulted by the
+ * assertions, hence the UNUSED marker. */
+static really_inline
+const char *findShermanState(UNUSED const struct mcsheng *m,
+                             const char *sherman_base_offset, u32 sherman_base,
+                             u32 s) {
+    const u32 record_idx = s - sherman_base;
+    const char *record
+        = sherman_base_offset + SHERMAN_FIXED_SIZE * record_idx;
+
+    /* sanity: stay below the bound derived from m->length and make sure the
+     * record carries the sherman type tag */
+    assert(record < (const char *)m + m->length - sizeof(struct NFA));
+    UNUSED u8 tag = *(const u8 *)(record + SHERMAN_TYPE_OFFSET);
+    assert(tag == SHERMAN_STATE);
+
+    return record;
+}
+
+/* Runs the acceleration scheme attached to state s over [c, c_end) and
+ * returns the position at which normal scanning should resume.
+ *
+ * *min_accel_offset is moved forward to the earliest point at which another
+ * acceleration attempt is allowed: the big penalty applies when the scheme
+ * stopped less than BAD_ACCEL_DIST past the previous limit, the small one
+ * otherwise. A limit within ACCEL_MIN_LEN of c_end is pushed to c_end. */
+static really_inline
+const u8 *run_mcsheng_accel(const struct mcsheng *m,
+                            const struct mstate_aux *aux, u32 s,
+                            const u8 **min_accel_offset,
+                            const u8 *c, const u8 *c_end) {
+    DEBUG_PRINTF("skipping\n");
+    const u32 accel_offset = aux[s].accel_offset;
+
+    assert(aux[s].accel_offset);
+    assert(accel_offset >= m->aux_offset);
+    assert(!m->sherman_offset || accel_offset < m->sherman_offset);
+
+    const union AccelAux *scheme
+        = (const void *)((const char *)m + accel_offset);
+    const u8 *resume = run_accel(scheme, c, c_end);
+
+    const u8 *next_attempt = resume < *min_accel_offset + BAD_ACCEL_DIST
+                                 ? resume + BIG_ACCEL_PENALTY
+                                 : resume + SMALL_ACCEL_PENALTY;
+    if (next_attempt >= c_end - ACCEL_MIN_LEN) {
+        next_attempt = c_end;
+    }
+    *min_accel_offset = next_attempt;
+
+    DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n",
+                 resume - c, next_attempt - resume, c_end - resume);
+
+    return resume;
+}
+
+/* Steps the 16-bit table-driven part of the engine over [*c_inout, end).
+ *
+ * Stops when the input runs out, when the state drops below m->sheng_end,
+ * when do_accel is set and the new state carries ACCEL_FLAG, or when mode is
+ * not NO_MATCHES and the new state carries ACCEPT_FLAG. In the two flag
+ * cases the returned state still has its flag bits set; otherwise it has
+ * been masked with STATE_MASK. *c_inout is advanced past the bytes
+ * consumed. */
+static really_inline
+u32 doNormal16(const struct mcsheng *m, const u8 **c_inout, const u8 *end,
+               u32 s, char do_accel, enum MatchMode mode) {
+    const u8 *cur = *c_inout;
+    const u32 sheng_end = m->sheng_end;
+    const u32 sherman_base = m->sherman_limit;
+    const u32 alpha_shift = m->alphaShift;
+    const char *sherman_base_offset
+        = (const char *)m - sizeof(struct NFA) + m->sherman_offset;
+
+    const u16 *succ_table
+        = (const u16 *)((const char *)m + sizeof(struct mcsheng));
+    assert(ISALIGNED_N(succ_table, 2));
+
+    /* Rebase the table so it can be indexed by raw state id instead of an
+     * id relative to sheng_end. Low state ids are never processed here, so
+     * nothing before the real table is read. Note: due to the size of the
+     * sheng tables, the rebased pointer still lies inside the engine. */
+    succ_table -= sheng_end << alpha_shift;
+
+    s &= STATE_MASK;
+
+    while (cur < end && s >= sheng_end) {
+        const u8 cprime = m->remap[*cur];
+        DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *cur,
+                     ourisprint(*cur) ? *cur : '?', cprime, s);
+        if (s >= sherman_base) {
+            const char *sherman_state
+                = findShermanState(m, sherman_base_offset, sherman_base, s);
+            DEBUG_PRINTF("doing sherman (%u)\n", s);
+            s = doSherman16(sherman_state, cprime, succ_table, alpha_shift);
+        } else {
+            DEBUG_PRINTF("doing normal\n");
+            assert(s < m->state_count);
+            s = succ_table[(s << alpha_shift) + cprime];
+        }
+
+        DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK);
+        cur++;
+
+        /* leave with the flag bits intact so the caller can act on them */
+        const int hit_accel = do_accel && (s & ACCEL_FLAG);
+        const int hit_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG);
+        if (hit_accel || hit_accept) {
+            break;
+        }
+
+        s &= STATE_MASK;
+    }
+
+    *c_inout = cur;
+    return s;
+}
+
+static really_inline /* Scans buf[0, len) with the 16-bit McSheng engine.
+ *
+ * m:       engine to run
+ * state:   in/out DFA state; always written back masked with STATE_MASK
+ * offAdj:  added to buffer-relative positions to form reported offsets
+ * cb/ctxt: report callback and its context, used when mode is neither
+ *          STOP_AT_MATCH nor NO_MATCHES
+ * single:  if set, accepts are reported as m->arb_report; otherwise
+ *          doComplexReport() is used
+ * c_final: written only in STOP_AT_MATCH mode: c - 1 (one before the current
+ *          scan position) when a match is pending, else the end of the
+ *          region (buf when len is 0)
+ * mode:    how accepting states are handled
+ *
+ * Returns MO_MATCHES_PENDING when STOP_AT_MATCH mode reaches an accept,
+ * MO_DEAD when a report returns MO_HALT_MATCHING, and MO_ALIVE otherwise.
+ * MO_ALIVE is also returned when the state reaches 0; callers have to look
+ * at *state for that.
+ */
+char mcshengExec16_i(const struct mcsheng *m, u32 *state, const u8 *buf,
+                     size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                     char single, const u8 **c_final, enum MatchMode mode) {
+    assert(ISALIGNED_N(state, 2));
+    if (!len) { /* nothing to scan; *state is left as it was */
+        if (mode == STOP_AT_MATCH) {
+            *c_final = buf;
+        }
+        return MO_ALIVE;
+    }
+
+    u32 s = *state;
+    const u8 *c = buf;
+    const u8 *c_end = buf + len;
+    const u8 sheng_end = m->sheng_end;
+    const struct mstate_aux *aux
+        = (const struct mstate_aux *)((const char *)m + m->aux_offset
+                                      - sizeof(struct NFA));
+
+    s &= STATE_MASK;
+
+    u32 cached_accept_id = 0; /* scratch handed to doComplexReport() */
+    u32 cached_accept_state = 0;
+
+    DEBUG_PRINTF("s: %u, len %zu\n", s, len);
+
+    const u8 *min_accel_offset = c; /* acceleration is not attempted before
+                                     * this position */
+    if (!m->has_accel || len < ACCEL_MIN_LEN) {
+        min_accel_offset = c_end; /* the accelerated loop is never entered */
+        goto without_accel;
+    }
+
+    goto with_accel;
+
+without_accel: /* plain stepping until min_accel_offset is reached */
+    do {
+        assert(c < min_accel_offset);
+        int do_accept;
+        if (!s) {
+            goto exit;
+        } else if (s < sheng_end) { /* states below sheng_end use doSheng */
+            s = doSheng(m, &c, min_accel_offset, c_end, s, 0);
+            do_accept = mode != NO_MATCHES && get_aux(m, s)->accept;
+        } else { /* remaining states use the 16-bit successor table */
+            s = doNormal16(m, &c, min_accel_offset, s, 0, mode);
+
+            do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG);
+        }
+
+        if (do_accept) {
+            if (mode == STOP_AT_MATCH) { /* hand the match back unreported */
+                *state = s & STATE_MASK;
+                *c_final = c - 1;
+                return MO_MATCHES_PENDING;
+            }
+
+            u64a loc = (c - 1) - buf + offAdj + 1; /* adjusted offset of c */
+
+            if (single) {
+                DEBUG_PRINTF("reporting %u\n", m->arb_report);
+                if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+                    return MO_DEAD; /* termination requested */
+                }
+            } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0,
+                                       &cached_accept_state, &cached_accept_id)
+                       == MO_HALT_MATCHING) {
+                return MO_DEAD;
+            }
+        }
+
+        assert(c <= c_end); /* sheng is fuzzy for min_accel_offset: it may
+                             * stop past it, but never past c_end */
+    } while (c < min_accel_offset);
+
+    if (c == c_end) {
+        goto exit;
+    }
+
+with_accel: /* stepping with acceleration attempts enabled */
+    do {
+        assert(c < c_end);
+        int do_accept;
+
+        if (!s) {
+            goto exit;
+        } else if (s < sheng_end) {
+            if (s > m->sheng_accel_limit) { /* sheng state with acceleration */
+                c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end);
+                if (c == c_end) {
+                    goto exit;
+                } else { /* step plainly until the next accel chance */
+                    goto without_accel;
+                }
+            }
+            s = doSheng(m, &c, c_end, c_end, s, 1);
+            do_accept = mode != NO_MATCHES && get_aux(m, s)->accept;
+        } else {
+            if (s & ACCEL_FLAG) { /* flag left set by doNormal16() */
+                DEBUG_PRINTF("skipping\n");
+                s &= STATE_MASK;
+                c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end);
+                if (c == c_end) {
+                    goto exit;
+                } else {
+                    goto without_accel;
+                }
+            }
+
+            s = doNormal16(m, &c, c_end, s, 1, mode);
+            do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG);
+        }
+
+        if (do_accept) { /* same accept handling as in the loop above */
+            if (mode == STOP_AT_MATCH) {
+                *state = s & STATE_MASK;
+                *c_final = c - 1;
+                return MO_MATCHES_PENDING;
+            }
+
+            u64a loc = (c - 1) - buf + offAdj + 1;
+
+            if (single) {
+                DEBUG_PRINTF("reporting %u\n", m->arb_report);
+                if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+                    return MO_DEAD; /* termination requested */
+                }
+            } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0,
+                                       &cached_accept_state, &cached_accept_id)
+                       == MO_HALT_MATCHING) {
+                return MO_DEAD;
+            }
+        }
+
+        assert(c <= c_end);
+    } while (c < c_end);
+
+exit: /* region finished, or state 0 was reached */
+    s &= STATE_MASK;
+
+    if (mode == STOP_AT_MATCH) {
+        *c_final = c_end;
+    }
+    *state = s;
+
+    return MO_ALIVE;
+}
+
+/* Out-of-line instantiation of mcshengExec16_i() for CALLBACK_OUTPUT mode. */
+static never_inline
+char mcshengExec16_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf,
+                        size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                        char single, const u8 **final_point) {
+    const enum MatchMode mode = CALLBACK_OUTPUT;
+
+    return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single,
+                           final_point, mode);
+}
+
+/* Out-of-line instantiation of mcshengExec16_i() for STOP_AT_MATCH mode. */
+static never_inline
+char mcshengExec16_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf,
+                         size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                         char single, const u8 **final_point) {
+    const enum MatchMode mode = STOP_AT_MATCH;
+
+    return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single,
+                           final_point, mode);
+}
+
+/* Out-of-line instantiation of mcshengExec16_i() for NO_MATCHES mode. */
+static never_inline
+char mcshengExec16_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf,
+                        size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                        char single, const u8 **final_point) {
+    const enum MatchMode mode = NO_MATCHES;
+
+    return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single,
+                           final_point, mode);
+}
+
+/* Dispatches to the never_inline 16-bit exec variant matching mode, so the
+ * large scan loop is not inlined into every caller. Any mode other than
+ * CALLBACK_OUTPUT or STOP_AT_MATCH is expected to be NO_MATCHES. */
+static really_inline
+char mcshengExec16_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf,
+                        size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                        char single, const u8 **final_point,
+                        enum MatchMode mode) {
+    switch (mode) {
+    case CALLBACK_OUTPUT:
+        return mcshengExec16_i_cb(m, state, buf, len, offAdj, cb, ctxt,
+                                  single, final_point);
+    case STOP_AT_MATCH:
+        return mcshengExec16_i_sam(m, state, buf, len, offAdj, cb, ctxt,
+                                   single, final_point);
+    default:
+        assert(mode == NO_MATCHES);
+        return mcshengExec16_i_nm(m, state, buf, len, offAdj, cb, ctxt,
+                                  single, final_point);
+    }
+}
+
+/* Steps the 8-bit table-driven part of the engine over [*c_inout, end).
+ *
+ * Stops when the input runs out or the state drops below m->sheng_end. With
+ * do_accel set it also stops once the new state reaches m->accel_limit_8;
+ * without it, and unless mode is NO_MATCHES, it stops once the new state
+ * reaches m->accept_limit_8. *c_inout is advanced past the bytes consumed
+ * and the resulting state is returned. */
+static really_inline
+u32 doNormal8(const struct mcsheng *m, const u8 **c_inout, const u8 *end, u32 s,
+              char do_accel, enum MatchMode mode) {
+    const u8 *cur = *c_inout;
+    const u32 sheng_end = m->sheng_end;
+    const u32 alpha_shift = m->alphaShift;
+
+    /* only one of the two limits is relevant for a given call */
+    const u32 stop_limit = do_accel ? m->accel_limit_8 : m->accept_limit_8;
+    const int may_stop_early = do_accel || mode != NO_MATCHES;
+
+    /* Rebase the table so it can be indexed by raw state id instead of an
+     * id relative to sheng_end. Low state ids are never processed here, so
+     * nothing before the real table is read. Note: due to the size of the
+     * sheng tables, the rebased pointer still lies inside the engine. */
+    const u8 *succ_table = (const u8 *)((const char *)m
+                                        + sizeof(struct mcsheng));
+    succ_table -= sheng_end << alpha_shift;
+
+    assert(s >= sheng_end);
+
+    while (cur < end && s >= sheng_end) {
+        const u8 cprime = m->remap[*cur];
+        DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *cur,
+                     ourisprint(*cur) ? *cur : '?', cprime);
+        s = succ_table[(s << alpha_shift) + cprime];
+
+        DEBUG_PRINTF("s: %u\n", s);
+        cur++;
+
+        if (may_stop_early && s >= stop_limit) {
+            break;
+        }
+    }
+
+    *c_inout = cur;
+    return s;
+}
+
+static really_inline /* Scans buf[0, len) with the 8-bit McSheng engine.
+ *
+ * m:       engine to run
+ * state:   in/out DFA state
+ * offAdj:  added to buffer-relative positions to form reported offsets
+ * cb/ctxt: report callback and its context, used when mode is neither
+ *          STOP_AT_MATCH nor NO_MATCHES
+ * single:  if set, accepts are reported as m->arb_report; otherwise
+ *          doComplexReport() is used
+ * c_final: set to buf when len is 0 (in every mode); otherwise written only
+ *          in STOP_AT_MATCH mode: c - 1 when a match is pending, else the
+ *          end of the region
+ * mode:    how accepting states are handled
+ *
+ * A state is treated as accepting when it is >= m->accept_limit_8.
+ *
+ * Returns MO_MATCHES_PENDING when STOP_AT_MATCH mode reaches an accept,
+ * MO_DEAD when a report returns MO_HALT_MATCHING, and MO_ALIVE otherwise.
+ * MO_ALIVE is also returned when the state reaches 0; callers have to look
+ * at *state for that.
+ */
+char mcshengExec8_i(const struct mcsheng *m, u32 *state, const u8 *buf,
+                    size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                    char single, const u8 **c_final, enum MatchMode mode) {
+    if (!len) { /* nothing to scan; *state is left as it was */
+        *c_final = buf;
+        return MO_ALIVE;
+    }
+    u32 s = *state;
+    const u8 *c = buf;
+    const u8 *c_end = buf + len;
+    const u8 sheng_end = m->sheng_end;
+
+    const struct mstate_aux *aux
+        = (const struct mstate_aux *)((const char *)m + m->aux_offset
+                                      - sizeof(struct NFA));
+    u32 accept_limit = m->accept_limit_8;
+
+    u32 cached_accept_id = 0; /* scratch handed to doComplexReport() */
+    u32 cached_accept_state = 0;
+
+    DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit);
+
+    DEBUG_PRINTF("s: %u, len %zu\n", s, len);
+
+    const u8 *min_accel_offset = c; /* acceleration is not attempted before
+                                     * this position */
+    if (!m->has_accel || len < ACCEL_MIN_LEN) {
+        min_accel_offset = c_end; /* the accelerated loop is never entered */
+        goto without_accel;
+    }
+
+    goto with_accel;
+
+without_accel: /* plain stepping until min_accel_offset is reached */
+    do {
+        assert(c < min_accel_offset);
+        if (!s) {
+            goto exit;
+        } else if (s < sheng_end) { /* states below sheng_end use doSheng */
+            s = doSheng(m, &c, min_accel_offset, c_end, s, 0);
+        } else { /* remaining states use the 8-bit successor table */
+            s = doNormal8(m, &c, min_accel_offset, s, 0, mode);
+            assert(c <= min_accel_offset);
+        }
+
+        if (mode != NO_MATCHES && s >= accept_limit) {
+            if (mode == STOP_AT_MATCH) { /* hand the match back unreported */
+                DEBUG_PRINTF("match - pausing\n");
+                *state = s;
+                *c_final = c - 1;
+                return MO_MATCHES_PENDING;
+            }
+
+            u64a loc = (c - 1) - buf + offAdj + 1; /* adjusted offset of c */
+            if (single) {
+                DEBUG_PRINTF("reporting %u\n", m->arb_report);
+                if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+                    return MO_DEAD;
+                }
+            } else if (doComplexReport(cb, ctxt, m, s, loc, 0,
+                                       &cached_accept_state, &cached_accept_id)
+                       == MO_HALT_MATCHING) {
+                return MO_DEAD;
+            }
+        }
+
+        assert(c <= c_end); /* sheng is fuzzy for min_accel_offset: it may
+                             * stop past it, but never past c_end */
+    } while (c < min_accel_offset);
+
+    if (c == c_end) {
+        goto exit;
+    }
+
+with_accel: /* stepping with acceleration attempts enabled */
+    do {
+        u32 accel_limit = m->accel_limit_8;
+
+        assert(c < c_end);
+        if (!s) {
+            goto exit;
+        } else if (s < sheng_end) {
+            if (s > m->sheng_accel_limit) { /* sheng state with acceleration */
+                c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end);
+                if (c == c_end) {
+                    goto exit;
+                } else { /* step plainly until the next accel chance */
+                    goto without_accel;
+                }
+            }
+            s = doSheng(m, &c, c_end, c_end, s, 1);
+        } else {
+            if (s >= accel_limit && aux[s].accel_offset) { /* state has an
+                                                            * accel scheme */
+                c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end);
+                if (c == c_end) {
+                    goto exit;
+                } else {
+                    goto without_accel;
+                }
+            }
+            s = doNormal8(m, &c, c_end, s, 1, mode);
+        }
+
+        if (mode != NO_MATCHES && s >= accept_limit) { /* same accept handling
+                                                        * as the loop above */
+            if (mode == STOP_AT_MATCH) {
+                DEBUG_PRINTF("match - pausing\n");
+                *state = s;
+                *c_final = c - 1;
+                return MO_MATCHES_PENDING;
+            }
+
+            u64a loc = (c - 1) - buf + offAdj + 1;
+            if (single) {
+                DEBUG_PRINTF("reporting %u\n", m->arb_report);
+                if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+                    return MO_DEAD;
+                }
+            } else if (doComplexReport(cb, ctxt, m, s, loc, 0,
+                                       &cached_accept_state, &cached_accept_id)
+                       == MO_HALT_MATCHING) {
+                return MO_DEAD;
+            }
+        }
+
+        assert(c <= c_end);
+    } while (c < c_end);
+
+exit: /* region finished, or state 0 was reached */
+    *state = s;
+    if (mode == STOP_AT_MATCH) {
+        *c_final = c_end;
+    }
+    return MO_ALIVE;
+}
+
+/* Out-of-line instantiation of mcshengExec8_i() for CALLBACK_OUTPUT mode. */
+static never_inline
+char mcshengExec8_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf,
+                       size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                       char single, const u8 **final_point) {
+    const enum MatchMode mode = CALLBACK_OUTPUT;
+
+    return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single,
+                          final_point, mode);
+}
+
+/* Out-of-line instantiation of mcshengExec8_i() for STOP_AT_MATCH mode. */
+static never_inline
+char mcshengExec8_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf,
+                        size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                        char single, const u8 **final_point) {
+    const enum MatchMode mode = STOP_AT_MATCH;
+
+    return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single,
+                          final_point, mode);
+}
+
+/* Out-of-line instantiation of mcshengExec8_i() for NO_MATCHES mode. */
+static never_inline
+char mcshengExec8_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf,
+                       size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                       char single, const u8 **final_point) {
+    const enum MatchMode mode = NO_MATCHES;
+
+    return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single,
+                          final_point, mode);
+}
+
+/* Dispatches to the never_inline 8-bit exec variant matching mode, so the
+ * large scan loop is not inlined into every caller. Any mode other than
+ * CALLBACK_OUTPUT or STOP_AT_MATCH is expected to be NO_MATCHES. */
+static really_inline
+char mcshengExec8_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf,
+                       size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+                       char single, const u8 **final_point,
+                       enum MatchMode mode) {
+    switch (mode) {
+    case CALLBACK_OUTPUT:
+        return mcshengExec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single,
+                                 final_point);
+    case STOP_AT_MATCH:
+        return mcshengExec8_i_sam(m, state, buf, len, offAdj, cb, ctxt,
+                                  single, final_point);
+    default:
+        assert(mode == NO_MATCHES);
+        return mcshengExec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single,
+                                 final_point);
+    }
+}
+
+/* Fires the end-of-data reports of state s at the given offset, if it has
+ * any. Returns MO_CONTINUE_MATCHING when the state has no accept_eod entry,
+ * otherwise whatever doComplexReport() returns. */
+static really_inline
+char mcshengCheckEOD(const struct NFA *nfa, u32 s, u64a offset,
+                     NfaCallback cb, void *ctxt) {
+    const struct mcsheng *m = getImplNfa(nfa);
+
+    if (get_aux(m, s)->accept_eod) {
+        /* the literal 1 is the eod argument of doComplexReport(); no accept
+         * cache is supplied */
+        return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL);
+    }
+
+    return MO_CONTINUE_MATCHING;
+}
+
+static really_inline /* Queue-driven execution of the 16-bit McSheng engine.
+ *
+ * Resumes from the 16-bit state stored in q->state and scans from the
+ * current queue location to each queued event in turn, applying MQE_TOP
+ * events and finishing on MQE_END.
+ *
+ * offset:  stream offset corresponding to location 0
+ * buffer:  data for locations >= 0
+ * hend:    end of the history buffer; negative locations index back from it
+ * single:  passed through to the scan loop / report code
+ * end:     last location to scan to; ignored when mode is NO_MATCHES
+ * mode:    how accepting states are handled
+ *
+ * Returns MO_DEAD if a report halts matching or the state is 0 at MQE_END,
+ * MO_MATCHES_PENDING if STOP_AT_MATCH mode paused on a match, and MO_ALIVE
+ * otherwise. When stopping early, the queue is rewound and its current item
+ * rewritten as an MQE_START at the stop location.
+ */
+char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
+                          const u8 *hend, NfaCallback cb, void *context,
+                          struct mq *q, char single, s64a end,
+                          enum MatchMode mode) {
+    assert(n->type == MCSHENG_NFA_16);
+    const struct mcsheng *m = getImplNfa(n);
+    s64a sp;
+
+    assert(ISALIGNED_N(q->state, 2));
+    u32 s = *(u16 *)q->state;
+
+    if (q->report_current) { /* report the accepts of the resume state */
+        assert(s);
+        assert(get_aux(m, s)->accept);
+
+        int rv;
+        if (single) {
+            DEBUG_PRINTF("reporting %u\n", m->arb_report);
+            rv = cb(0, q_cur_offset(q), m->arb_report, context);
+        } else {
+            u32 cached_accept_id = 0;
+            u32 cached_accept_state = 0;
+
+            rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0,
+                                 &cached_accept_state, &cached_accept_id);
+        }
+
+        q->report_current = 0;
+
+        if (rv == MO_HALT_MATCHING) {
+            return MO_DEAD;
+        }
+    }
+
+    sp = q_cur_loc(q); /* scan start location; negative means history */
+    q->cur++;
+
+    const u8 *cur_buf = sp < 0 ? hend : buffer;
+
+    assert(q->cur);
+    if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) {
+        /* the first item is already past end: rewrite it as a start at end */
+        DEBUG_PRINTF("this is as far as we go\n");
+        q->cur--;
+        q->items[q->cur].type = MQE_START;
+        q->items[q->cur].location = end;
+        *(u16 *)q->state = s;
+        return MO_ALIVE;
+    }
+
+    while (1) {
+        assert(q->cur < q->end);
+        s64a ep = q->items[q->cur].location; /* location of the next event */
+        if (mode != NO_MATCHES) {
+            ep = MIN(ep, end);
+        }
+
+        assert(ep >= sp);
+
+        s64a local_ep = ep;
+        if (sp < 0) { /* in history: stop at the history/buffer boundary */
+            local_ep = MIN(0, ep);
+        }
+
+        /* scan [sp, local_ep) of the region selected by cur_buf */
+        const u8 *final_look;
+        char rv = mcshengExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp,
+                                     offset + sp, cb, context, single,
+                                     &final_look, mode);
+        if (rv == MO_DEAD) {
+            *(u16 *)q->state = 0;
+            return MO_DEAD;
+        }
+        if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) {
+            DEBUG_PRINTF("this is as far as we go\n");
+            DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf);
+
+            assert(q->cur);
+            assert(final_look != cur_buf + local_ep);
+
+            q->cur--;
+            q->items[q->cur].type = MQE_START;
+            q->items[q->cur].location = final_look - cur_buf + 1; /* due to
+                                                                   * early -1 */
+            *(u16 *)q->state = s;
+            return MO_MATCHES_PENDING;
+        }
+
+        assert(rv == MO_ALIVE);
+        assert(q->cur);
+        if (mode != NO_MATCHES && q->items[q->cur].location > end) {
+            /* the scan was clamped to end before reaching this event */
+            DEBUG_PRINTF("this is as far as we go\n");
+            q->cur--;
+            q->items[q->cur].type = MQE_START;
+            q->items[q->cur].location = end;
+            *(u16 *)q->state = s;
+            return MO_ALIVE;
+        }
+
+        sp = local_ep;
+
+        if (sp == 0) { /* from location 0 on, data comes from buffer */
+            cur_buf = buffer;
+        }
+
+        if (sp != ep) { /* stopped at the boundary; the event is not reached */
+            continue;
+        }
+
+        switch (q->items[q->cur].type) {
+        case MQE_TOP:
+            assert(sp + offset || !s);
+            if (sp + offset == 0) { /* top at stream offset 0 */
+                s = m->start_anchored;
+                break;
+            }
+            s = mcshengEnableStarts(m, s);
+            break;
+        case MQE_END:
+            *(u16 *)q->state = s;
+            q->cur++;
+            return s ? MO_ALIVE : MO_DEAD;
+        default:
+            assert(!"invalid queue event");
+        }
+
+        q->cur++;
+    }
+}
+
+/* Queue-driven execution of the 8-bit McSheng engine.
+ *
+ * Resumes from the 8-bit state stored in q->state and scans from the current
+ * queue location to each queued event in turn, applying MQE_TOP events and
+ * finishing on MQE_END.
+ *
+ * offset:  stream offset corresponding to location 0
+ * buffer:  data for locations >= 0
+ * hend:    end of the history buffer; negative locations index back from it
+ * single:  passed through to the scan loop / report code
+ * end:     last location to scan to; ignored when mode is NO_MATCHES
+ * mode:    how accepting states are handled
+ *
+ * Returns MO_DEAD if a report halts matching or the state is 0 at MQE_END,
+ * MO_MATCHES_PENDING if STOP_AT_MATCH mode paused on a match, and MO_ALIVE
+ * otherwise. When stopping early, the queue is rewound and its current item
+ * rewritten as an MQE_START at the stop location.
+ */
+static really_inline
+char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
+                         const u8 *hend, NfaCallback cb, void *context,
+                         struct mq *q, char single, s64a end,
+                         enum MatchMode mode) {
+    assert(n->type == MCSHENG_NFA_8);
+    const struct mcsheng *m = getImplNfa(n);
+    s64a sp;
+
+    u32 s = *(u8 *)q->state;
+
+    if (q->report_current) {
+        /* report the accepts of the state we are resuming from */
+        assert(s);
+        assert(s >= m->accept_limit_8);
+
+        int rv;
+        if (single) {
+            DEBUG_PRINTF("reporting %u\n", m->arb_report);
+            rv = cb(0, q_cur_offset(q), m->arb_report, context);
+        } else {
+            u32 cached_accept_id = 0;
+            u32 cached_accept_state = 0;
+
+            rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0,
+                                 &cached_accept_state, &cached_accept_id);
+        }
+
+        q->report_current = 0;
+
+        if (rv == MO_HALT_MATCHING) {
+            return MO_DEAD;
+        }
+    }
+
+    /* scan start location; a negative value lies in the history buffer */
+    sp = q_cur_loc(q);
+    q->cur++;
+
+    const u8 *cur_buf = sp < 0 ? hend : buffer;
+
+    if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) {
+        /* the first item is already past end: rewrite it as a start at end */
+        DEBUG_PRINTF("this is as far as we go\n");
+        q->cur--;
+        q->items[q->cur].type = MQE_START;
+        q->items[q->cur].location = end;
+        *(u8 *)q->state = s;
+        return MO_ALIVE;
+    }
+
+    while (1) {
+        /* validate the index before the debug trace dereferences it */
+        assert(q->cur < q->end);
+        DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
+                     q->items[q->cur].type == MQE_END ? "END" : "???",
+                     q->items[q->cur].location + offset);
+        s64a ep = q->items[q->cur].location;
+        if (mode != NO_MATCHES) {
+            ep = MIN(ep, end);
+        }
+
+        assert(ep >= sp);
+
+        /* while in history, never scan across the history/buffer boundary */
+        s64a local_ep = ep;
+        if (sp < 0) {
+            local_ep = MIN(0, ep);
+        }
+
+        /* scan [sp, local_ep) of the region selected by cur_buf */
+        const u8 *final_look;
+        char rv = mcshengExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp,
+                                    offset + sp, cb, context, single,
+                                    &final_look, mode);
+        /* the exec functions report a halt as MO_DEAD, as tested in the
+         * 16-bit variant */
+        if (rv == MO_DEAD) {
+            *(u8 *)q->state = 0;
+            return MO_DEAD;
+        }
+        if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) {
+            DEBUG_PRINTF("this is as far as we go\n");
+            DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf);
+
+            assert(q->cur);
+            assert(final_look != cur_buf + local_ep);
+
+            q->cur--;
+            q->items[q->cur].type = MQE_START;
+            q->items[q->cur].location = final_look - cur_buf + 1; /* due to
+                                                                   * early -1 */
+            *(u8 *)q->state = s;
+            return MO_MATCHES_PENDING;
+        }
+
+        assert(rv == MO_ALIVE);
+        assert(q->cur);
+        if (mode != NO_MATCHES && q->items[q->cur].location > end) {
+            /* the scan was clamped to end before reaching this event */
+            DEBUG_PRINTF("this is as far as we go\n");
+            q->cur--;
+            q->items[q->cur].type = MQE_START;
+            q->items[q->cur].location = end;
+            *(u8 *)q->state = s;
+            return MO_ALIVE;
+        }
+
+        sp = local_ep;
+
+        if (sp == 0) {
+            /* from location 0 on, data comes from the main buffer */
+            cur_buf = buffer;
+        }
+
+        if (sp != ep) {
+            /* stopped at the boundary; the event has not been reached yet */
+            continue;
+        }
+
+        switch (q->items[q->cur].type) {
+        case MQE_TOP:
+            assert(sp + offset || !s);
+            if (sp + offset == 0) {
+                /* top at stream offset 0 */
+                s = (u8)m->start_anchored;
+                break;
+            }
+            s = mcshengEnableStarts(m, s);
+            break;
+        case MQE_END:
+            *(u8 *)q->state = s;
+            q->cur++;
+            return s ? MO_ALIVE : MO_DEAD;
+        default:
+            assert(!"invalid queue event");
+        }
+
+        q->cur++;
+    }
+}
+
+/* Runs the 8-bit engine over queue q up to location end in CALLBACK_OUTPUT
+ * mode, taking buffers, callback and context from the queue itself. */
+char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end) {
+    assert(n->type == MCSHENG_NFA_8);
+    const struct mcsheng *m = getImplNfa(n);
+    const u8 *history_end = q->history + q->hlength;
+
+    return nfaExecMcSheng8_Q2i(n, q->offset, q->buffer, history_end, q->cb,
+                               q->context, q,
+                               m->flags & MCSHENG_FLAG_SINGLE, end,
+                               CALLBACK_OUTPUT);
+}
+
+/* Runs the 16-bit engine over queue q up to location end in CALLBACK_OUTPUT
+ * mode, taking buffers, callback and context from the queue itself. */
+char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end) {
+    assert(n->type == MCSHENG_NFA_16);
+    const struct mcsheng *m = getImplNfa(n);
+    const u8 *history_end = q->history + q->hlength;
+
+    return nfaExecMcSheng16_Q2i(n, q->offset, q->buffer, history_end, q->cb,
+                                q->context, q,
+                                m->flags & MCSHENG_FLAG_SINGLE, end,
+                                CALLBACK_OUTPUT);
+}
+
+/* Reports the accepts of the state stored in q->state at the current queue
+ * offset. Nothing is reported when the state is below m->accept_limit_8.
+ * The callback's return value is ignored; always returns 0. */
+char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q) {
+    const struct mcsheng *m = getImplNfa(n);
+    NfaCallback report_cb = q->cb;
+    void *report_ctxt = q->context;
+    const u32 cur_state = *(u8 *)q->state;
+    const u8 single = m->flags & MCSHENG_FLAG_SINGLE;
+    const u64a offset = q_cur_offset(q);
+    assert(q_cur_type(q) == MQE_START);
+    assert(cur_state);
+
+    if (cur_state < m->accept_limit_8) {
+        return 0;
+    }
+
+    if (single) {
+        DEBUG_PRINTF("reporting %u\n", m->arb_report);
+        report_cb(0, offset, m->arb_report, report_ctxt);
+        return 0;
+    }
+
+    u32 cached_accept_id = 0;
+    u32 cached_accept_state = 0;
+    doComplexReport(report_cb, report_ctxt, m, cur_state, offset, 0,
+                    &cached_accept_state, &cached_accept_id);
+
+    return 0;
+}
+
+/* Reports the accepts of the state stored in q->state at the current queue
+ * offset. Nothing is reported when the state's aux entry has no accept.
+ * The callback's return value is ignored; always returns 0. */
+char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q) {
+    const struct mcsheng *m = getImplNfa(n);
+    NfaCallback report_cb = q->cb;
+    void *report_ctxt = q->context;
+    const u32 cur_state = *(u16 *)q->state;
+    const struct mstate_aux *state_aux = get_aux(m, cur_state);
+    const u8 single = m->flags & MCSHENG_FLAG_SINGLE;
+    const u64a offset = q_cur_offset(q);
+    assert(q_cur_type(q) == MQE_START);
+    DEBUG_PRINTF("state %u\n", cur_state);
+    assert(cur_state);
+
+    if (!state_aux->accept) {
+        return 0;
+    }
+
+    if (single) {
+        DEBUG_PRINTF("reporting %u\n", m->arb_report);
+        report_cb(0, offset, m->arb_report, report_ctxt);
+        return 0;
+    }
+
+    u32 cached_accept_id = 0;
+    u32 cached_accept_state = 0;
+    doComplexReport(report_cb, report_ctxt, m, cur_state, offset, 0,
+                    &cached_accept_state, &cached_accept_id);
+
+    return 0;
+}
+
+/* Returns 1 if the report list referenced by aux->accept contains report,
+ * 0 if it does not or if the state has no accept list at all. */
+static
+char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux,
+                        ReportID report) {
+    assert(m && aux);
+
+    if (aux->accept) {
+        /* aux->accept is an offset to the list, applied here relative to
+         * m less sizeof(struct NFA) */
+        const char *list_pos
+            = (const char *)m + aux->accept - sizeof(struct NFA);
+        const struct report_list *rl = (const struct report_list *)list_pos;
+        assert(ISALIGNED_N(rl, 4));
+
+        DEBUG_PRINTF("report list has %u entries\n", rl->count);
+
+        u32 idx = 0;
+        while (idx < rl->count) {
+            if (rl->report[idx] == report) {
+                return 1;
+            }
+            idx++;
+        }
+    }
+
+    return 0;
+}
+
+/* Returns non-zero if the 8-bit state saved in q->state accepts with the
+ * given report id. */
+char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report,
+                              struct mq *q) {
+    assert(n && q);
+
+    const struct mcsheng *m = getImplNfa(n);
+    const u8 cur_state = *(u8 *)q->state;
+    DEBUG_PRINTF("checking accepts for %hhu\n", cur_state);
+
+    const struct mstate_aux *state_aux = get_aux(m, cur_state);
+    return mcshengHasAccept(m, state_aux, report);
+}
+
+/* Returns 1 if the 8-bit state saved in q->state has any accept, else 0. */
+char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q) {
+    assert(n && q);
+
+    const struct mcsheng *m = getImplNfa(n);
+    const u8 cur_state = *(u8 *)q->state;
+    DEBUG_PRINTF("checking accepts for %hhu\n", cur_state);
+
+    const struct mstate_aux *state_aux = get_aux(m, cur_state);
+    return state_aux->accept != 0;
+}
+
+/* Returns non-zero if the 16-bit state saved in q->state accepts with the
+ * given report id. */
+char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report,
+                               struct mq *q) {
+    assert(n && q);
+
+    const struct mcsheng *m = getImplNfa(n);
+    const u16 cur_state = *(u16 *)q->state;
+    DEBUG_PRINTF("checking accepts for %hu\n", cur_state);
+
+    const struct mstate_aux *state_aux = get_aux(m, cur_state);
+    return mcshengHasAccept(m, state_aux, report);
+}
+
+/* Returns 1 if the 16-bit state saved in q->state has any accept, else 0. */
+char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q) {
+    assert(n && q);
+
+    const struct mcsheng *m = getImplNfa(n);
+    const u16 cur_state = *(u16 *)q->state;
+    DEBUG_PRINTF("checking accepts for %hu\n", cur_state);
+
+    const struct mstate_aux *state_aux = get_aux(m, cur_state);
+    return state_aux->accept != 0;
+}
+
+/* Runs the 8-bit engine over queue q up to location end in STOP_AT_MATCH
+ * mode, taking buffers, callback and context from the queue itself. */
+char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end) {
+    assert(n->type == MCSHENG_NFA_8);
+    const struct mcsheng *m = getImplNfa(n);
+    const u8 *history_end = q->history + q->hlength;
+
+    return nfaExecMcSheng8_Q2i(n, q->offset, q->buffer, history_end, q->cb,
+                               q->context, q,
+                               m->flags & MCSHENG_FLAG_SINGLE, end,
+                               STOP_AT_MATCH);
+}
+
+/* Runs the 16-bit engine over queue q up to location end in STOP_AT_MATCH
+ * mode, taking buffers, callback and context from the queue itself. */
+char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end) {
+    assert(n->type == MCSHENG_NFA_16);
+    const struct mcsheng *m = getImplNfa(n);
+    const u8 *history_end = q->history + q->hlength;
+
+    return nfaExecMcSheng16_Q2i(n, q->offset, q->buffer, history_end, q->cb,
+                                q->context, q,
+                                m->flags & MCSHENG_FLAG_SINGLE, end,
+                                STOP_AT_MATCH);
+}
+
+/* Runs the 8-bit engine over queue q in NO_MATCHES mode, then checks whether
+ * the resulting state accepts with the given report. Returns
+ * MO_MATCHES_PENDING if it does, otherwise the result of the queue run. */
+char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report) {
+    assert(n->type == MCSHENG_NFA_8);
+    const struct mcsheng *m = getImplNfa(n);
+    const u8 *history_end = q->history + q->hlength;
+
+    const char rv = nfaExecMcSheng8_Q2i(n, q->offset, q->buffer, history_end,
+                                        q->cb, q->context, q,
+                                        m->flags & MCSHENG_FLAG_SINGLE,
+                                        0 /* end */, NO_MATCHES);
+    if (!rv) {
+        /* zero result: the accept check is skipped */
+        return rv;
+    }
+
+    return nfaExecMcSheng8_inAccept(n, report, q) ? MO_MATCHES_PENDING : rv;
+}
+
+/* Runs the 16-bit engine over queue q in NO_MATCHES mode with an end
+ * location of 0. If that run returns non-zero and the resulting state accepts
+ * 'report', MO_MATCHES_PENDING is returned; otherwise the engine's own return
+ * value is passed back unchanged. */
+char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report) {
+    assert(n->type == MCSHENG_NFA_16);
+
+    const struct mcsheng *impl = getImplNfa(n);
+    const u8 *history_end = q->history + q->hlength;
+
+    char rv = nfaExecMcSheng16_Q2i(n, q->offset, q->buffer, history_end,
+                                   q->cb, q->context, q,
+                                   impl->flags & MCSHENG_FLAG_SINGLE,
+                                   0 /* end */, NO_MATCHES);
+
+    /* the accept check is only made when the run itself returned non-zero */
+    if (!rv || !nfaExecMcSheng16_inAccept(n, report, q)) {
+        return rv;
+    }
+    return MO_MATCHES_PENDING;
+}
+
+/* Seeds 'state' with the engine's start state: start_floating when 'offset'
+ * is non-zero, start_anchored otherwise. Returns 1 after writing the byte, or
+ * 0 (leaving 'state' untouched) when the chosen start state is 0. */
+char nfaExecMcSheng8_initCompressedState(const struct NFA *nfa, u64a offset,
+                                         void *state, UNUSED u8 key) {
+    const struct mcsheng *impl = getImplNfa(nfa);
+
+    u8 start;
+    if (offset) {
+        start = impl->start_floating;
+    } else {
+        start = impl->start_anchored;
+    }
+
+    if (!start) {
+        return 0;
+    }
+    *(u8 *)state = start;
+    return 1;
+}
+
+/* Seeds 'state' with the engine's start state: start_floating when 'offset'
+ * is non-zero, start_anchored otherwise. Returns 1 after storing the 16-bit
+ * value with an unaligned store, or 0 (leaving 'state' untouched) when the
+ * chosen start state is 0. */
+char nfaExecMcSheng16_initCompressedState(const struct NFA *nfa, u64a offset,
+                                          void *state, UNUSED u8 key) {
+    const struct mcsheng *impl = getImplNfa(nfa);
+
+    u16 start;
+    if (offset) {
+        start = impl->start_floating;
+    } else {
+        start = impl->start_anchored;
+    }
+
+    if (!start) {
+        return 0;
+    }
+    unaligned_store_u16(state, start);
+    return 1;
+}
+
+/* Hands the single state byte at 'state' to mcshengCheckEOD() together with
+ * the offset, callback and context, and returns its result. The stream state
+ * is not consulted. */
+char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state,
+                             UNUSED const char *streamState, u64a offset,
+                             NfaCallback callback, void *context) {
+    const u8 final_state = *(const u8 *)state;
+    return mcshengCheckEOD(nfa, final_state, offset, callback, context);
+}
+
+/* Hands the 16-bit state at 'state' (which must be 2-byte aligned) to
+ * mcshengCheckEOD() together with the offset, callback and context, and
+ * returns its result. The stream state is not consulted. */
+char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state,
+                              UNUSED const char *streamState, u64a offset,
+                              NfaCallback callback, void *context) {
+    assert(ISALIGNED_N(state, 2));
+
+    const u16 final_state = *(const u16 *)state;
+    return mcshengCheckEOD(nfa, final_state, offset, callback, context);
+}
+
+/* Resets the one-byte scratch state of queue q to state 0. Always returns
+ * 0. */
+char nfaExecMcSheng8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {
+    assert(nfa->scratchStateSize == 1);
+
+    u8 *scratch = (u8 *)q->state;
+    *scratch = 0;
+    return 0;
+}
+
+/* Resets the two-byte scratch state of queue q (which must be 2-byte
+ * aligned) to state 0. Always returns 0. */
+char nfaExecMcSheng16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {
+    assert(nfa->scratchStateSize == 2);
+    assert(ISALIGNED_N(q->state, 2));
+
+    u16 *scratch = (u16 *)q->state;
+    *scratch = 0;
+    return 0;
+}
+
+/* Copies the one-byte scratch state (q->state) into the queue's stream state
+ * (q->streamState). Always returns 0. */
+char nfaExecMcSheng8_queueCompressState(UNUSED const struct NFA *nfa,
+                                        const struct mq *q, UNUSED s64a loc) {
+    assert(nfa->scratchStateSize == 1);
+    assert(nfa->streamStateSize == 1);
+
+    const u8 *scratch = (const u8 *)q->state;
+    u8 *stream = (u8 *)q->streamState;
+    *stream = *scratch;
+    return 0;
+}
+
+/* Copies the one-byte state at 'src' to 'dest'. Always returns 0. */
+char nfaExecMcSheng8_expandState(UNUSED const struct NFA *nfa, void *dest,
+                                 const void *src, UNUSED u64a offset,
+                                 UNUSED u8 key) {
+    assert(nfa->scratchStateSize == 1);
+    assert(nfa->streamStateSize == 1);
+
+    const u8 packed = *(const u8 *)src;
+    *(u8 *)dest = packed;
+    return 0;
+}
+
+/* Copies the two-byte scratch state (q->state, 2-byte aligned) into the
+ * queue's stream state using an unaligned store. Always returns 0. */
+char nfaExecMcSheng16_queueCompressState(UNUSED const struct NFA *nfa,
+                                         const struct mq *q,
+                                         UNUSED s64a loc) {
+    assert(nfa->scratchStateSize == 2);
+    assert(nfa->streamStateSize == 2);
+    assert(ISALIGNED_N(q->state, 2));
+
+    const u16 cur_state = *(const u16 *)q->state;
+    unaligned_store_u16(q->streamState, cur_state);
+    return 0;
+}
+
+/* Reads the two-byte state at 'src' with an unaligned load and writes it to
+ * 'dest', which must be 2-byte aligned. Always returns 0. */
+char nfaExecMcSheng16_expandState(UNUSED const struct NFA *nfa, void *dest,
+                                  const void *src, UNUSED u64a offset,
+                                  UNUSED u8 key) {
+    assert(nfa->scratchStateSize == 2);
+    assert(nfa->streamStateSize == 2);
+    assert(ISALIGNED_N(dest, 2));
+
+    const u16 packed = unaligned_load_u16(src);
+    *(u16 *)dest = packed;
+    return 0;
+}
diff --git a/src/nfa/mcsheng.h b/src/nfa/mcsheng.h
new file mode 100644
index 00000000..19fd6961
--- /dev/null
+++ b/src/nfa/mcsheng.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MCSHENG_H
+#define MCSHENG_H
+
+#include "callback.h"
+#include "ue2common.h"
+
+struct mq;
+struct NFA;
+
+/* 8-bit Sheng-McClellan hybrid
+ *
+ * Entry points for NFAs of type MCSHENG_NFA_8. This variant keeps its
+ * current state in a single byte of scratch state and of stream state.
+ *
+ *  - testEOD: runs the engine's end-of-data check on the state byte at
+ *    'state'; streamState is not used.
+ *  - Q2: scans the queue up to 'end' in STOP_AT_MATCH mode.
+ *  - QR: scans the queue in NO_MATCHES mode and returns MO_MATCHES_PENDING
+ *    when the scan returns non-zero and the resulting state accepts 'report'.
+ *  - inAnyAccept: non-zero iff the queue's current state has an accept entry.
+ *  - queueInitState: resets the queue's scratch state to state 0.
+ *  - initCompressedState: writes the floating start state when 'offset' is
+ *    non-zero, else the anchored one; returns 0 without writing if that
+ *    start state is 0.
+ *  - queueCompressState / expandState: copy the state byte from scratch to
+ *    stream state, and from 'src' to 'dest', respectively.
+ *
+ * NOTE(review): Q, reportCurrent and inAccept are not described here; see
+ * their definitions in the engine source for their contracts.
+ */
+
+char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state,
+                             const char *streamState, u64a offset,
+                             NfaCallback callback, void *context);
+char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end);
+char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end);
+char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report);
+char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q);
+char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report,
+                              struct mq *q);
+char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q);
+char nfaExecMcSheng8_queueInitState(const struct NFA *n, struct mq *q);
+char nfaExecMcSheng8_initCompressedState(const struct NFA *n, u64a offset,
+                                         void *state, u8 key);
+char nfaExecMcSheng8_queueCompressState(const struct NFA *nfa,
+                                        const struct mq *q, s64a loc);
+char nfaExecMcSheng8_expandState(const struct NFA *nfa, void *dest,
+                                 const void *src, u64a offset, u8 key);
+/* No B_Reverse or zombie_status function is declared for this engine; the
+ * names are aliased to the NFA_API_*NO_IMPL placeholders, which are defined
+ * outside this header (presumably "not implemented" stubs -- confirm). */
+#define nfaExecMcSheng8_B_Reverse NFA_API_NO_IMPL
+#define nfaExecMcSheng8_zombie_status NFA_API_ZOMBIE_NO_IMPL
+
+/* 16-bit Sheng-McClellan hybrid
+ *
+ * Entry points for NFAs of type MCSHENG_NFA_16. This variant keeps its
+ * current state in two bytes of scratch state and of stream state; the
+ * scratch copy is accessed as an aligned u16, the stream copy through
+ * unaligned loads and stores.
+ *
+ *  - testEOD: runs the engine's end-of-data check on the u16 at 'state'
+ *    (must be 2-byte aligned); streamState is not used.
+ *  - Q2: scans the queue up to 'end' in STOP_AT_MATCH mode.
+ *  - QR: scans the queue in NO_MATCHES mode and returns MO_MATCHES_PENDING
+ *    when the scan returns non-zero and the resulting state accepts 'report'.
+ *  - inAccept: whether the queue's current state accepts 'report'.
+ *  - inAnyAccept: non-zero iff the queue's current state has an accept entry.
+ *  - queueInitState: resets the queue's scratch state to state 0.
+ *  - initCompressedState: writes the floating start state when 'offset' is
+ *    non-zero, else the anchored one; returns 0 without writing if that
+ *    start state is 0.
+ *  - queueCompressState / expandState: copy the state from scratch to stream
+ *    state, and from 'src' to 'dest', respectively.
+ *
+ * NOTE(review): Q and reportCurrent are not described here; see their
+ * definitions in the engine source for their contracts.
+ */
+
+char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state,
+                              const char *streamState, u64a offset,
+                              NfaCallback callback, void *context);
+char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end);
+char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end);
+char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report);
+char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q);
+char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report,
+                               struct mq *q);
+char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q);
+char nfaExecMcSheng16_queueInitState(const struct NFA *n, struct mq *q);
+char nfaExecMcSheng16_initCompressedState(const struct NFA *n, u64a offset,
+                                          void *state, u8 key);
+char nfaExecMcSheng16_queueCompressState(const struct NFA *nfa,
+                                         const struct mq *q, s64a loc);
+char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest,
+                                  const void *src, u64a offset, u8 key);
+/* No B_Reverse or zombie_status function is declared for this engine; the
+ * names are aliased to the NFA_API_*NO_IMPL placeholders, which are defined
+ * outside this header (presumably "not implemented" stubs -- confirm). */
+#define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL
+#define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL
+
+#endif
diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp
new file mode 100644
index 00000000..666c3b1d
--- /dev/null
+++ b/src/nfa/mcsheng_compile.cpp
@@ -0,0 +1,1144 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mcsheng_compile.h"
+
+#include "accel.h"
+#include "accelcompile.h"
+#include "grey.h"
+#include "mcclellancompile.h"
+#include "mcclellancompile_util.h"
+#include "mcsheng_internal.h"
+#include "nfa_internal.h"
+#include "rdfa_graph.h"
+#include "shufticompile.h"
+#include "trufflecompile.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/bitutils.h"
+#include "util/charreach.h"
+#include "util/compare.h"
+#include "util/compile_context.h"
+#include "util/container.h"
+#include "util/graph.h"
+#include "util/graph_range.h"
+#include "util/make_unique.h"
+#include "util/order_check.h"
+#include "util/report_manager.h"
+#include "util/ue2_containers.h"
+#include "util/unaligned.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <deque>
+#include <vector>
+
+#include <boost/range/adaptor/map.hpp>
+
+using namespace std;
+using boost::adaptors::map_keys;
+
+namespace ue2 {
+
+namespace /* anon */ {
+
+/* a candidate sheng region with fewer states than this is rejected */
+#define MIN_SHENG_SIZE 6
+/* sheng_id value of a state which is not part of the sheng region */
+#define INVALID_SHENG_ID 255
+
+/* Per-state bookkeeping kept next to each raw dstate while compiling. */
+struct dstate_extra {
+    /* NOTE(review): not referenced in the part of the file reviewed here */
+    u16 daddytaken{0};
+    /* state is to be stored as a sherman state */
+    bool shermanState{false};
+    /* state lies outside the sheng region but is entered directly from it */
+    bool sheng_succ{false};
+    /* position within the sheng region, or INVALID_SHENG_ID */
+    u8 sheng_id{INVALID_SHENG_ID};
+};
+
+/* Working view of a raw DFA during McSheng construction: references into the
+ * raw_dfa obtained from the build strategy, plus per-state scratch data and
+ * the inverse of the alphabet remapping. */
+struct dfa_info {
+    accel_dfa_build_strat &strat;
+    raw_dfa &raw;
+    vector<dstate> &states; /* alias of raw.states */
+    vector<dstate_extra> extra; /* one entry per raw state */
+    const u16 alpha_size; /* including special symbols */
+    const array<u16, ALPHABET_SIZE> &alpha_remap;
+    vector<CharReach> rev_alpha; /* symbol -> characters remapped onto it */
+    const u16 impl_alpha_size;
+
+    u8 getAlphaShift() const;
+
+    explicit dfa_info(accel_dfa_build_strat &s)
+        : strat(s), raw(s.get_raw()), states(raw.states),
+          extra(raw.states.size()), alpha_size(raw.alpha_size),
+          alpha_remap(raw.alpha_remap),
+          impl_alpha_size(raw.getImplAlphaSize()) {
+        /* invert alpha_remap so each symbol knows the characters it covers */
+        rev_alpha.resize(impl_alpha_size);
+        for (u32 c = 0; c < N_CHARS; c++) {
+            rev_alpha[alpha_remap[c]].set(c);
+        }
+    }
+
+    /* implementation id assigned to the given raw state */
+    dstate_id_t implId(dstate_id_t raw_id) const {
+        const dstate &st = states[raw_id];
+        return st.impl_id;
+    }
+
+    bool is_sherman(dstate_id_t raw_id) const {
+        const dstate_extra &ex = extra[raw_id];
+        return ex.shermanState;
+    }
+
+    /* true once the state has been given a slot in the sheng region */
+    bool is_sheng(dstate_id_t raw_id) const {
+        return INVALID_SHENG_ID != extra[raw_id].sheng_id;
+    }
+
+    bool is_sheng_succ(dstate_id_t raw_id) const {
+        const dstate_extra &ex = extra[raw_id];
+        return ex.sheng_succ;
+    }
+
+    /* states which use the normal transition/successor table */
+    bool is_normal(dstate_id_t raw_id) const {
+        if (raw_id == DEAD_STATE) {
+            return false;
+        }
+        return !(is_sheng(raw_id) || is_sherman(raw_id));
+    }
+
+    /* number of raw states */
+    size_t size() const { return states.size(); }
+};
+
+/* Returns log2 of the implementation alphabet size, rounded up, with a
+ * minimum of 1. */
+u8 dfa_info::getAlphaShift() const {
+    if (impl_alpha_size >= 2) {
+        /* log2 round up */
+        return 32 - clz32(impl_alpha_size - 1);
+    }
+    return 1;
+}
+
+} // namespace
+
+/* Returns a pointer to entry i of the mstate_aux array, which begins
+ * aux_offset bytes from the start of the NFA. */
+static
+mstate_aux *getAux(NFA *n, dstate_id_t i) {
+    mcsheng *m = (mcsheng *)getMutableImplNfa(n);
+    char *nfa_base = (char *)n;
+
+    mstate_aux *entry = (mstate_aux *)(nfa_base + m->aux_offset) + i;
+    /* the entry must start inside the engine's bytecode */
+    assert((const char *)entry < (const char *)n + m->length);
+    return entry;
+}
+
+/* Builds the per-character shuffle masks of the sheng region and records the
+ * region's bounds in m.
+ *
+ * Byte k of the mask for a character holds the successor of the sheng state
+ * with sheng id k: successors inside the region are stored as their sheng id
+ * (impl id - 1), the dead state as sheng_end - 1, and any other state as its
+ * impl id. m->sheng_accel_limit starts at sheng_end - 1 and is then limited
+ * by the sheng id of every sheng state present in accel_escape_info. */
+static
+void createShuffleMasks(mcsheng *m, const dfa_info &info,
+                       dstate_id_t sheng_end,
+                       const map<dstate_id_t, AccelScheme> &accel_escape_info) {
+    DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end);
+    assert(sheng_end > DEAD_STATE + 1);
+    assert(sheng_end <= sizeof(m128) + 1);
+
+    /* -1 to avoid wasting a slot as we do not include dead state */
+    const auto sheng_count = sheng_end - 1;
+
+    /* map each sheng id back to the raw state occupying that slot */
+    vector<dstate_id_t> raw_ids(sheng_count);
+    for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) {
+        assert(info.implId(s)); /* should not map to DEAD_STATE */
+        if (!info.is_sheng(s)) {
+            continue;
+        }
+        raw_ids[info.extra[s].sheng_id] = s;
+    }
+
+    /* one mask per symbol; the TOP symbol's mask is left zeroed */
+    vector<array<u8, sizeof(m128)>> masks(info.alpha_size);
+    const u32 top_sym = info.alpha_remap[TOP];
+    for (u32 sym = 0; sym < info.alpha_size; sym++) {
+        if (sym == top_sym) {
+            continue;
+        }
+        auto &mask = masks[sym];
+        assert(sizeof(mask) == sizeof(m128));
+        mask.fill(0);
+
+        for (dstate_id_t sheng_id = 0; sheng_id < sheng_count; sheng_id++) {
+            const dstate_id_t raw_id = raw_ids[sheng_id];
+            dstate_id_t next_id = info.implId(info.states[raw_id].next[sym]);
+            if (next_id == DEAD_STATE) {
+                next_id = sheng_count;
+            } else if (next_id < sheng_end) {
+                next_id--;
+            }
+            DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, sym, next_id);
+            mask[sheng_id] = verify_u8(next_id);
+        }
+    }
+
+    /* expand the per-symbol masks into the per-character table */
+    for (u32 c = 0; c < N_CHARS; c++) {
+        assert(info.alpha_remap[c] != info.alpha_remap[TOP]);
+        m->sheng_masks[c] = loadu128(masks[info.alpha_remap[c]].data());
+    }
+
+    m->sheng_end = sheng_end;
+    m->sheng_accel_limit = sheng_count;
+    for (dstate_id_t s : raw_ids) {
+        if (contains(accel_escape_info, s)) {
+            LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id);
+        }
+    }
+}
+
+/* Fills in the generic NFA header and the fixed mcsheng header fields.
+ *
+ * state_size must be sizeof(u8) or sizeof(u16) and selects between
+ * MCSHENG_NFA_8 and MCSHENG_NFA_16; it is also used for both the scratch and
+ * the stream state size. has_accel is set iff accel_count is non-zero, and
+ * MCSHENG_FLAG_SINGLE is added to the flags when 'single' is true. */
+static
+void populateBasicInfo(size_t state_size, const dfa_info &info,
+                       u32 total_size, u32 aux_offset, u32 accel_offset,
+                       u32 accel_count, ReportID arb, bool single, NFA *nfa) {
+    const bool is_8bit = state_size == sizeof(u8);
+    assert(is_8bit || state_size == sizeof(u16));
+
+    /* generic NFA header */
+    nfa->length = total_size;
+    nfa->nPositions = info.states.size();
+    nfa->scratchStateSize = verify_u32(state_size);
+    nfa->streamStateSize = verify_u32(state_size);
+    nfa->type = is_8bit ? MCSHENG_NFA_8 : MCSHENG_NFA_16;
+
+    /* engine specific header */
+    mcsheng *m = (mcsheng *)getMutableImplNfa(nfa);
+    for (u32 c = 0; c < 256; c++) {
+        m->remap[c] = verify_u8(info.alpha_remap[c]);
+    }
+    m->alphaShift = info.getAlphaShift();
+    m->length = total_size;
+    m->aux_offset = aux_offset;
+    m->accel_offset = accel_offset;
+    m->arb_report = arb;
+    m->state_count = verify_u16(info.size());
+    m->start_anchored = info.implId(info.raw.start_anchored);
+    m->start_floating = info.implId(info.raw.start_floating);
+    m->has_accel = accel_count != 0;
+
+    if (single) {
+        m->flags |= MCSHENG_FLAG_SINGLE;
+    }
+}
+
+namespace {
+
+/* An ordered set of report values; two lists compare by their sets. */
+struct raw_report_list {
+    flat_set<ReportID> reports;
+
+    /* Copies reports_in, or, when do_remap is set, stores for each id the
+     * value rm.getProgramOffset(id) instead. */
+    raw_report_list(const flat_set<ReportID> &reports_in,
+                    const ReportManager &rm, bool do_remap) {
+        if (!do_remap) {
+            reports = reports_in;
+            return;
+        }
+        for (const auto &report_id : reports_in) {
+            reports.insert(rm.getProgramOffset(report_id));
+        }
+    }
+
+    bool operator<(const raw_report_list &other) const {
+        return reports < other.reports;
+    }
+};
+
+/* raw_report_info implementation backed by a vector of report lists. */
+struct raw_report_info_impl : raw_report_info {
+    vector<raw_report_list> rl; /* the report lists, in output order */
+
+    u32 getReportListSize() const override;
+    size_t size() const override;
+    void fillReportLists(NFA *n, size_t base_offset,
+                         vector<u32> &ro /* out */) const override;
+};
+}
+
+/* Returns the number of bytes needed to store every report list: one
+ * report_list header plus one ReportID per report, summed over rl. */
+u32 raw_report_info_impl::getReportListSize() const {
+    u32 total = 0;
+
+    for (const raw_report_list &entry : rl) {
+        const size_t report_count = entry.reports.size();
+        total += sizeof(report_list) + sizeof(ReportID) * report_count;
+    }
+
+    return total;
+}
+/* Returns the number of report lists held in rl. */
+size_t raw_report_info_impl::size() const {
+    return rl.size();
+}
+
+/* Serialises every report list into the NFA, back to back, starting
+ * base_offset bytes from n. The offset at which each list was written is
+ * appended to ro, in the same order as rl. */
+void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset,
+                                           vector<u32> &ro) const {
+    size_t cursor = base_offset;
+
+    for (const raw_report_list &entry : rl) {
+        ro.push_back(cursor);
+
+        report_list *out_list = (report_list *)((char *)n + cursor);
+        auto *out = out_list->report;
+        for (const ReportID report : entry.reports) {
+            *out++ = report;
+        }
+        out_list->count = verify_u32(entry.reports.size());
+
+        /* step over the header and the reports just written */
+        cursor += sizeof(report_list);
+        cursor += sizeof(ReportID) * entry.reports.size();
+    }
+}
+
+/* Adds the id of every state present in accel_escape_info to
+ * *accel_states. */
+static
+void fillAccelOut(const map<dstate_id_t, AccelScheme> &accel_escape_info,
+                  set<dstate_id_t> *accel_states) {
+    for (const auto &entry : accel_escape_info) {
+        accel_states->insert(entry.first);
+    }
+}
+
+/* Returns the size in bytes of the sherman region: SHERMAN_FIXED_SIZE for
+ * each sherman state, with the total rounded up by ROUNDUP_16. */
+static
+size_t calcShermanRegionSize(const dfa_info &info) {
+    size_t sherman_count = 0;
+    for (size_t i = 0; i < info.size(); i++) {
+        if (info.is_sherman(i)) {
+            sherman_count++;
+        }
+    }
+
+    const size_t raw_bytes = sherman_count * SHERMAN_FIXED_SIZE;
+    return ROUNDUP_16(raw_bytes);
+}
+
+/* Fills in the accept, accept_eod and top fields of the aux record for raw
+ * state i.
+ *
+ * accept / accept_eod are 0 when the state has no (EOD) reports, otherwise
+ * the offset of its report list, looked up via reports[i] / reports_eod[i]
+ * in reportOffsets. top is the impl id reached on the TOP symbol; for raw
+ * state 0 it is the impl id of the floating start state instead. */
+static
+void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info,
+               const vector<u32> &reports, const vector<u32> &reports_eod,
+               const vector<u32> &reportOffsets) {
+    const dstate &raw_state = info.states[i];
+
+    aux->accept = 0;
+    if (!raw_state.reports.empty()) {
+        aux->accept = reportOffsets[reports[i]];
+    }
+
+    aux->accept_eod = 0;
+    if (!raw_state.reports_eod.empty()) {
+        aux->accept_eod = reportOffsets[reports_eod[i]];
+    }
+
+    dstate_id_t top_target = info.raw.start_floating;
+    if (i) {
+        top_target = raw_state.next[info.alpha_remap[TOP]];
+    }
+    aux->top = info.implId(top_target);
+}
+
+/* Assigns impl ids to every state outside the sheng region (sheng states
+ * already have theirs), starting at sheng_end.
+ *
+ * Ids are handed out in this order: normal sheng successors, normal states,
+ * sherman sheng successors, sherman states. *sherman_base receives the first
+ * id past the normal states. If the normal states and sherman sheng
+ * successors together would pass UINT8_MAX, the sherman sheng successors are
+ * demoted to normal states and numbered ahead of the other normal states.
+ *
+ * returns false on error: too many states (with *sherman_base set to 0), or
+ * the last allocated id does not fit within STATE_MASK. */
+static
+bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end,
+                     dstate_id_t *sherman_base) {
+    info.states[0].impl_id = 0; /* dead is always 0 */
+
+    if (info.size() > (1 << 16)) {
+        DEBUG_PRINTF("too many states\n");
+        *sherman_base = 0;
+        return false;
+    }
+
+    /* sort the remaining states into the four numbering groups */
+    vector<dstate_id_t> plain;
+    vector<dstate_id_t> sherman;
+    vector<dstate_id_t> plain_succ;
+    vector<dstate_id_t> sherman_succ;
+    for (u32 i = 1; i < info.size(); i++) {
+        if (info.is_sheng(i)) {
+            continue; /* sheng impl ids have already been allocated */
+        }
+        const bool follows_sheng = info.is_sheng_succ(i);
+        if (info.is_sherman(i)) {
+            (follows_sheng ? sherman_succ : sherman).push_back(i);
+        } else {
+            (follows_sheng ? plain_succ : plain).push_back(i);
+        }
+    }
+
+    dstate_id_t next_norm = sheng_end;
+    for (dstate_id_t s : plain_succ) {
+        info.states[s].impl_id = next_norm++;
+    }
+
+    if (next_norm + plain.size() + sherman_succ.size() > UINT8_MAX) {
+        /* we need to give sheng_succs ids which fit into a u8 -- demote these
+         * to normal states */
+        for (dstate_id_t s : sherman_succ) {
+            info.extra[s].shermanState = false;
+            info.states[s].impl_id = next_norm++;
+        }
+        sherman_succ.clear();
+    }
+
+    for (dstate_id_t s : plain) {
+        info.states[s].impl_id = next_norm++;
+    }
+
+    /* sherman ids follow directly after the normal ones */
+    *sherman_base = next_norm;
+    dstate_id_t next_sherman = next_norm;
+    for (dstate_id_t s : sherman_succ) {
+        info.states[s].impl_id = next_sherman++;
+    }
+    for (dstate_id_t s : sherman) {
+        info.states[s].impl_id = next_sherman++;
+    }
+
+    /* Check to see if we haven't over allocated our states */
+    DEBUG_PRINTF("next sherman %u masked %u\n", next_sherman,
+                 (dstate_id_t)(next_sherman & STATE_MASK));
+    const auto last_id = next_sherman - 1;
+    return last_id == (last_id & STATE_MASK);
+}
+
+typedef RdfaGraph::vertex_descriptor RdfaVertex;
+
+/* Flags as a sheng successor every state that is the target of a non-TOP
+ * transition out of a sheng state and is neither the dead state nor itself a
+ * sheng state.
+ *
+ * Returns true if the number of sheng states plus the number of successors
+ * flagged by this call is below UINT8_MAX, false otherwise. States flagged
+ * before a failure stay flagged. */
+static
+bool mark_sheng_succs(const RdfaGraph &g, dfa_info &info,
+                      const flat_set<RdfaVertex> &sheng_states) {
+    u32 exit_count = 0;
+
+    for (auto v : sheng_states) {
+        dstate_id_t s = g[v].index;
+        for (u32 i = 0; i != info.alpha_size; i++) {
+            if (i == info.alpha_remap[TOP]) {
+                continue;
+            }
+            dstate_id_t next = info.states[s].next[i];
+            /* skip the dead state, sheng states and successors already
+             * counted */
+            if (!next || info.is_sheng(next) || info.is_sheng_succ(next)) {
+                continue;
+            }
+            exit_count++;
+            info.extra[next].sheng_succ = true;
+        }
+    }
+
+    if (exit_count + sheng_states.size() < UINT8_MAX) {
+        return true;
+    } else {
+        /* message is newline-terminated like every other debug message */
+        DEBUG_PRINTF("fail: unable to fit %u exits in byte\n", exit_count);
+        return false;
+    }
+}
+
+/* Returns the set of characters on which raw state u moves to raw state v.
+ * Asserts that at least one such character exists. */
+static
+CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) {
+    CharReach reach;
+    for (u32 sym = 0; sym < info.impl_alpha_size; sym++) {
+        if (info.raw.states[u].next[sym] != v) {
+            continue;
+        }
+        assert(info.rev_alpha[sym].any());
+        reach |= info.rev_alpha[sym];
+    }
+    assert(reach.any());
+    return reach;
+}
+/* Bounds applied by find_sheng_states(): region growth stops once
+ * MAX_SHENG_STATES states have been collected, and a region whose leakiness
+ * from the base cyclic state exceeds MAX_SHENG_LEAKINESS is rejected. */
+#define MAX_SHENG_STATES 16
+#define MAX_SHENG_LEAKINESS 0.05
+
+/**
+ * Returns the proportion of strings of length 'depth' which will leave the
+ * sheng region when starting at state 'u'.
+ *
+ * Edges to the dead state are ignored. An edge leaving the region counts with
+ * its full width (the fraction of characters on the edge); an edge staying in
+ * the region counts with its width times the leakiness of its target at
+ * depth - 1. Results are memoised in 'cache', keyed by (vertex, depth).
+ */
+static
+double leakiness(const RdfaGraph &g, dfa_info &info,
+                 const flat_set<RdfaVertex> &sheng_states, RdfaVertex u,
+                 u32 depth,
+                 unordered_map<pair<RdfaVertex, u32>, double> &cache) {
+    const auto key = make_pair(u, depth);
+    auto cached = cache.find(key);
+    if (cached != cache.end()) {
+        return cached->second;
+    }
+
+    double total = 0;
+    for (RdfaVertex v : adjacent_vertices_range(u, g)) {
+        if (g[v].index == DEAD_STATE) {
+            continue;
+        }
+        double width = get_edge_reach(g[u].index, g[v].index, info).count();
+        width /= N_CHARS;
+
+        double weight = 1; /* an edge out of the region leaks completely */
+        if (contains(sheng_states, v)) {
+            if (depth <= 1) {
+                continue; /* weight = 0 */
+            }
+            weight = leakiness(g, info, sheng_states, v, depth - 1, cache);
+        }
+        total += width * weight;
+    }
+
+    cache[key] = total;
+    DEBUG_PRINTF("%zu [%u] q = %g\n", g[u].index, depth, total);
+    return total;
+}
+
+/**
+ * Returns the proportion of 8 byte strings which will leave the sheng region
+ * when starting at state 'u'. Uses a fresh memoisation cache for the call.
+ */
+static
+double leakiness(const RdfaGraph &g, dfa_info &info,
+                 const flat_set<RdfaVertex> &sheng_states, RdfaVertex u) {
+    unordered_map<pair<RdfaVertex, u32>, double> memo;
+    return leakiness(g, info, sheng_states, u, 8, memo);
+}
+
+/* Chooses the states to be implemented in the sheng region and gives them
+ * their impl ids (starting at DEAD_STATE + 1) and sheng ids (impl id - 1).
+ * States without an accel scheme are numbered first, accelerable ones last.
+ *
+ * The region is grown breadth-first from a "base" cyclic state: the first
+ * non-dead vertex in a cycle whose estimated back-edge reach covers at least
+ * 30 characters. Growth stops at MAX_SHENG_STATES states, and states with
+ * reports are left out when the DFA kind generates callbacks.
+ *
+ * Returns one past the last sheng impl id, or DEAD_STATE if no acceptable
+ * region was found. Note that ids (and sheng successor flags) assigned before
+ * one of the later checks fails are not rolled back. */
+static
+dstate_id_t find_sheng_states(dfa_info &info,
+                             map<dstate_id_t, AccelScheme> &accel_escape_info) {
+    RdfaGraph g(info.raw);
+    auto cyclics = find_vertices_in_cycles(g);
+
+    auto base_cyclic = RdfaGraph::null_vertex();
+    for (const auto &cand : cyclics) {
+        if (g[cand].index == DEAD_STATE) {
+            continue;
+        }
+        DEBUG_PRINTF("considering cyclic %zu\n", g[cand].index);
+        /* get an estimate of stickiness of the cyclic: assume any edges from
+         * states with larger state ids are back edges */
+        CharReach est_back_reach;
+        for (const auto &pred : inv_adjacent_vertices_range(cand, g)) {
+            if (g[pred].index >= g[cand].index) {
+                est_back_reach |= get_edge_reach(g[pred].index, g[cand].index,
+                                                 info);
+            }
+        }
+
+        if (est_back_reach.count() >= 30) {
+            base_cyclic = cand;
+            break;
+        }
+    }
+    if (!base_cyclic) {
+        return DEAD_STATE;
+    }
+
+    /* breadth-first growth of the region from the base cyclic */
+    flat_set<RdfaVertex> sheng_states;
+    deque<RdfaVertex> to_consider = { base_cyclic };
+    flat_set<dstate_id_t> considered = { DEAD_STATE };
+    bool seen_back_edge = false;
+    while (!to_consider.empty()
+           && sheng_states.size() < MAX_SHENG_STATES) {
+        auto v = to_consider.front();
+        to_consider.pop_front();
+        if (!considered.insert(g[v].index).second) {
+            continue; /* already handled */
+        }
+
+        assert(!contains(sheng_states, v));
+
+        if (generates_callbacks(info.raw.kind)
+            && !info.states[g[v].index].reports.empty()) {
+            /* cannot raise callbacks from sheng region */
+            continue;
+        }
+
+        sheng_states.insert(v);
+        for (const auto &succ : adjacent_vertices_range(v, g)) {
+            if (!contains(considered, g[succ].index)) {
+                to_consider.push_back(succ);
+            }
+            if (succ == base_cyclic) {
+                seen_back_edge = true;
+            }
+        }
+    }
+
+    /* hand out ids in two passes over the region: states without an accel
+     * scheme first, then the accelerable ones */
+    dstate_id_t sheng_end = DEAD_STATE + 1;
+    auto allocate_ids = [&](bool accel_pass) {
+        for (auto v : sheng_states) {
+            dstate_id_t s = g[v].index;
+            const bool is_accel = contains(accel_escape_info, s);
+            if (is_accel != accel_pass) {
+                continue;
+            }
+            assert(!accel_pass || !info.states[s].impl_id);
+            info.states[s].impl_id = sheng_end++;
+            info.extra[s].sheng_id = info.states[s].impl_id - 1;
+        }
+    };
+    allocate_ids(false); /* allocate normal ids */
+    allocate_ids(true); /* allocate accel ids */
+
+    if (sheng_states.size() < MIN_SHENG_SIZE) {
+        DEBUG_PRINTF("sheng region too small\n");
+        return DEAD_STATE;
+    }
+
+    if (!seen_back_edge) {
+        DEBUG_PRINTF("did not include cyclic\n");
+        return DEAD_STATE;
+    }
+
+    double leak = leakiness(g, info, sheng_states, base_cyclic);
+    if (leak > MAX_SHENG_LEAKINESS) {
+        DEBUG_PRINTF("too leaky (%g)\n", leak);
+        return DEAD_STATE;
+    }
+
+    if (!mark_sheng_succs(g, info, sheng_states)) {
+        return DEAD_STATE;
+    }
+
+    /* TODO: ensure sufficiently 'sticky' */
+    /* TODO: check not all states accel */
+    DEBUG_PRINTF("sheng_end = %hu\n", sheng_end);
+    return sheng_end;
+}
+
+/* Writes the report lists at report_base_offset and then fills in the aux
+ * record of every state.
+ *
+ * States present in accel_escape_info are additionally given consecutive
+ * accel slots of info.strat.accelSize() bytes starting at accel_offset; the
+ * accel structure itself is built at that offset measured from the mcsheng
+ * header (not from the start of the NFA). accel_end_offset is only used to
+ * assert that the slots do not overrun. */
+static
+void fill_in_aux_info(NFA *nfa, const dfa_info &info,
+                      const map<dstate_id_t, AccelScheme> &accel_escape_info,
+                      u32 accel_offset, UNUSED u32 accel_end_offset,
+                      const vector<u32> &reports,
+                      const vector<u32> &reports_eod,
+                      u32 report_base_offset,
+                      const raw_report_info &ri) {
+    mcsheng *m = (mcsheng *)getMutableImplNfa(nfa);
+
+    vector<u32> report_offsets;
+    ri.fillReportLists(nfa, report_base_offset, report_offsets);
+
+    for (u32 raw_id = 0; raw_id < info.size(); raw_id++) {
+        mstate_aux *aux = getAux(nfa, info.implId(raw_id));
+        fillInAux(aux, raw_id, info, reports, reports_eod, report_offsets);
+
+        auto accel_it = accel_escape_info.find(raw_id);
+        if (accel_it == accel_escape_info.end()) {
+            continue; /* not accelerable */
+        }
+
+        aux->accel_offset = accel_offset;
+        accel_offset += info.strat.accelSize();
+        assert(accel_offset <= accel_end_offset);
+        assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
+        info.strat.buildAccel(raw_id, accel_it->second,
+                              (void *)((char *)m + aux->accel_offset));
+    }
+}
+
+/* Returns the flag bits to OR into a successor table entry pointing at
+ * target_impl_id: ACCEPT_FLAG if the target's aux record has an accept
+ * entry, ACCEL_FLAG if it has an accel offset. */
+static
+u16 get_edge_flags(NFA *nfa, dstate_id_t target_impl_id) {
+    const mstate_aux *target = getAux(nfa, target_impl_id);
+
+    const u16 accept_bit = target->accept ? ACCEPT_FLAG : 0;
+    const u16 accel_bit = target->accel_offset ? ACCEL_FLAG : 0;
+
+    return accept_bit | accel_bit;
+}
+
+/* Fills in the 16-bit successor table, which starts directly after the NFA
+ * and mcsheng headers.
+ *
+ * Only normal states (not dead, sheng or sherman) get a row. Rows are indexed
+ * by impl id - sheng_end and are 1 << alphaShift entries apart; each entry is
+ * the successor's impl id ORed with that successor's edge flags. The aux
+ * records must already be filled in, as the flags are derived from them. */
+static
+void fill_in_succ_table_16(NFA *nfa, const dfa_info &info,
+                           dstate_id_t sheng_end,
+                           UNUSED dstate_id_t sherman_base) {
+    u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng));
+
+    const u8 alpha_shift = info.getAlphaShift();
+    assert(alpha_shift <= 8);
+
+    for (size_t raw_id = 0; raw_id < info.size(); raw_id++) {
+        if (!info.is_normal(raw_id)) {
+            assert(info.implId(raw_id) < sheng_end
+                   || info.is_sherman(raw_id));
+            continue;
+        }
+
+        assert(info.implId(raw_id) < sherman_base);
+        const u16 normal_id = verify_u16(info.implId(raw_id) - sheng_end);
+        u16 *row = succ_table + (normal_id << alpha_shift);
+
+        for (size_t sym = 0; sym < info.impl_alpha_size; sym++) {
+            const u16 succ_impl = info.implId(info.states[raw_id].next[sym]);
+            row[sym] = succ_impl | get_edge_flags(nfa, succ_impl);
+        }
+    }
+}
+
+#define MAX_SHERMAN_LIST_LEN 8 /* cap on max_list_len in find_better_daddy */
+
+/* Inserts 'candidate' into 'dest' only if it is strictly less than 'max'. */
+static
+void addIfEarlier(set<dstate_id_t> &dest, dstate_id_t candidate,
+                  dstate_id_t max) {
+    if (candidate >= max) {
+        return;
+    }
+    dest.insert(candidate);
+}
+
+/* Inserts into 'dest' every successor of 'source' over the first 'alphasize'
+ * symbols whose id is strictly less than curr_id. */
+static
+void addSuccessors(set<dstate_id_t> &dest, const dstate &source,
+                   u16 alphasize, dstate_id_t curr_id) {
+    for (symbol_t sym = 0; sym < alphasize; sym++) {
+        const dstate_id_t succ = source.next[sym];
+        if (succ < curr_id) {
+            dest.insert(succ);
+        }
+    }
+}
+
+#define MAX_SHERMAN_SELF_LOOP 20
+
+/* picks the best daddy for curr_id and decides whether it becomes sherman */
+static
+void find_better_daddy(dfa_info &info, dstate_id_t curr_id,
+                       bool any_cyclic_near_anchored_state, const Grey &grey) {
+    if (!grey.allowShermanStates) {
+        return;
+    }
+
+    const u16 width = sizeof(u16);
+    const u16 alphasize = info.impl_alpha_size;
+
+    if (info.raw.start_anchored != DEAD_STATE
+        && any_cyclic_near_anchored_state
+        && curr_id < alphasize * 3) {
+        /* crude attempt to prevent frequent states from being sherman'ed;
+         * depends on the fact that state numbers are currently in bfs
+         * order */
+        DEBUG_PRINTF("%hu is banned\n", curr_id);
+        return;
+    }
+
+    if (info.raw.start_floating != DEAD_STATE
+        && curr_id >= info.raw.start_floating
+        && curr_id < info.raw.start_floating + alphasize * 3) {
+        /* crude attempt to prevent frequent states from being sherman'ed;
+         * depends on the fact that state numbers are currently in bfs
+         * order */
+        DEBUG_PRINTF("%hu is banned (%hu)\n", curr_id, info.raw.start_floating);
+        return;
+    }
+
+    const u16 full_state_size = width * alphasize;
+    const u16 max_list_len = MIN(MAX_SHERMAN_LIST_LEN,
+                           (full_state_size - 2)/(width + 1));
+    u16 best_score = 0;
+    dstate_id_t best_daddy = 0;
+    dstate &currState = info.states[curr_id];
+
+    set<dstate_id_t> hinted; /* set of states to search for a better daddy */
+    addIfEarlier(hinted, 0, curr_id);
+    addIfEarlier(hinted, info.raw.start_anchored, curr_id);
+    addIfEarlier(hinted, info.raw.start_floating, curr_id);
+
+    dstate_id_t mydaddy = currState.daddy;
+    if (mydaddy) {
+        addIfEarlier(hinted, mydaddy, curr_id);
+        addSuccessors(hinted, info.states[mydaddy], alphasize, curr_id);
+        dstate_id_t mygranddaddy = info.states[mydaddy].daddy;
+        if (mygranddaddy) {
+            addIfEarlier(hinted, mygranddaddy, curr_id);
+            addSuccessors(hinted, info.states[mygranddaddy], alphasize,
+                          curr_id);
+        }
+    }
+
+    for (const dstate_id_t &donor : hinted) {
+        assert(donor < curr_id);
+        u32 score = 0;
+
+        if (!info.is_normal(donor)) {
+            continue;
+        }
+
+        const dstate &donorState = info.states[donor];
+        for (symbol_t s = 0; s < alphasize; s++) {
+            if (currState.next[s] == donorState.next[s]) {
+                score++;
+            }
+        }
+
+        /* prefer lower ids to provide some stability amongst potential
+         * siblings */
+        if (score > best_score || (score == best_score && donor < best_daddy)) {
+            best_daddy = donor;
+            best_score = score;
+
+            if (score == alphasize) {
+                break;
+            }
+        }
+    }
+
+    currState.daddy = best_daddy;
+    info.extra[curr_id].daddytaken = best_score;
+    DEBUG_PRINTF("%hu -> daddy %hu: %u/%u BF\n", curr_id, best_daddy,
+                 best_score, alphasize);
+
+    if (best_daddy == DEAD_STATE) {
+        return; /* No good daddy */
+    }
+
+    if (best_score + max_list_len < alphasize) {
+        return; /* more differing transitions than a sherman list may hold */
+    }
+
+    assert(info.is_normal(currState.daddy));
+
+    u32 self_loop_width = 0;
+    for (unsigned i = 0; i < N_CHARS; i++) {
+        if (currState.next[info.alpha_remap[i]] == curr_id) {
+            self_loop_width++;
+        }
+    }
+
+    if (self_loop_width > MAX_SHERMAN_SELF_LOOP) {
+        DEBUG_PRINTF("%hu is banned wide self loop (%u)\n", curr_id,
+                      self_loop_width);
+        return;
+    }
+
+    if (info.is_sheng(curr_id)) {
+        return;
+    }
+
+    DEBUG_PRINTF("%hu is sherman\n", curr_id);
+    info.extra[curr_id].shermanState = true;
+}
+
+/* true if a live successor of root loops to itself or leads back to root */
+static
+bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) {
+    const symbol_t sym_count = raw.getImplAlphaSize();
+    for (symbol_t a = 0; a < sym_count; a++) {
+        const dstate_id_t child = raw.states[root].next[a];
+        if (child != DEAD_STATE) {
+            const dstate &child_state = raw.states[child];
+            for (symbol_t b = 0; b < sym_count; b++) {
+                const dstate_id_t grandchild = child_state.next[b];
+                if (grandchild == root || grandchild == child) {
+                    return true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
+static
+void fill_in_sherman(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) {
+    char *nfa_base = (char *)nfa;
+    mcsheng *m = (mcsheng *)getMutableImplNfa(nfa);
+    char *sherman_table = nfa_base + m->sherman_offset;
+
+    assert(ISALIGNED_16(sherman_table));
+    for (size_t i = 0; i < info.size(); i++) {
+        if (!info.is_sherman(i)) {
+            continue;
+        }
+        u16 fs = verify_u16(info.implId(i));
+        DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs);
+
+        assert(fs >= sherman_limit);
+
+        char *curr_sherman_entry
+            = sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE;
+        assert(curr_sherman_entry <= nfa_base + m->length);
+
+        u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken);
+        assert(len <= 9);
+        dstate_id_t d = info.states[i].daddy;
+
+        *(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE;
+        *(u8 *)(curr_sherman_entry + SHERMAN_LEN_OFFSET) = len;
+        *(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d);
+        u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET);
+
+        for (u16 s = 0; s < info.impl_alpha_size; s++) {
+            if (info.states[i].next[s] != info.states[d].next[s]) {
+                *(chars++) = (u8)s;
+            }
+        }
+
+        u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len));
+        for (u16 s = 0; s < info.impl_alpha_size; s++) {
+            if (info.states[i].next[s] != info.states[d].next[s]) {
+                DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs,
+                             info.implId(d),
+                             info.implId(info.states[i].next[s]));
+                u16 entry_val = info.implId(info.states[i].next[s]);
+                entry_val |= get_edge_flags(nfa, entry_val);
+                unaligned_store_u16((u8 *)states++, entry_val);
+            }
+        }
+    }
+}
+
+/* builds a 16-bit mcsheng; returns nullptr if impl ids cannot be allocated */
+static
+aligned_unique_ptr<NFA> mcshengCompile16(dfa_info &info, dstate_id_t sheng_end,
+                        const map<dstate_id_t, AccelScheme> &accel_escape_info,
+                        const Grey &grey) {
+    DEBUG_PRINTF("building mcsheng 16\n");
+
+    vector<u32> reports; /* index in ri for the appropriate report list */
+    vector<u32> reports_eod; /* as above */
+    ReportID arb;
+    u8 single;
+
+    assert(info.getAlphaShift() <= 8);
+
+    /* find_better_daddy() never alters transitions, so evaluate this once */
+    const bool cyclic_near_anchored
+        = is_cyclic_near(info.raw, info.raw.start_anchored);
+    u16 total_daddy = 0;
+    for (u32 i = 0; i < info.size(); i++) {
+        find_better_daddy(info, i, cyclic_near_anchored, grey);
+        total_daddy += info.extra[i].daddytaken;
+    }
+
+    DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
+                 info.size() * info.impl_alpha_size, info.size(),
+                 info.impl_alpha_size);
+
+    u16 sherman_limit;
+    if (!allocateImplId16(info, sheng_end, &sherman_limit)) {
+        DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
+                     info.size());
+        return nullptr;
+    }
+    u16 count_real_states = sherman_limit - sheng_end;
+
+    auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
+
+    size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16)
+                     * count_real_states;
+    size_t aux_size = sizeof(mstate_aux) * info.size();
+    size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng) + tran_size);
+    size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
+    size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+                                    + ri->getReportListSize(), 32);
+    size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size);
+    size_t sherman_size = calcShermanRegionSize(info);
+
+    size_t total_size = sherman_offset + sherman_size;
+
+    accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
+    assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
+
+    aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
+    mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get());
+
+    populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset,
+                      accel_escape_info.size(), arb, single, nfa.get());
+    createShuffleMasks(m, info, sheng_end, accel_escape_info);
+
+    /* copy in the mc header information */
+    m->sherman_offset = sherman_offset;
+    m->sherman_end = total_size;
+    m->sherman_limit = sherman_limit;
+
+    DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end,
+                 count_real_states, info.size());
+
+    fill_in_aux_info(nfa.get(), info, accel_escape_info, accel_offset,
+                     sherman_offset - sizeof(NFA), reports, reports_eod,
+                     aux_offset + aux_size, *ri);
+
+    fill_in_succ_table_16(nfa.get(), info, sheng_end, sherman_limit);
+
+    fill_in_sherman(nfa.get(), info, sherman_limit);
+
+    return nfa;
+}
+
+static
+void fill_in_succ_table_8(NFA *nfa, const dfa_info &info,
+                          dstate_id_t sheng_end) {
+    u8 *succ_table = (u8 *)nfa + sizeof(NFA) + sizeof(mcsheng);
+
+    u8 alphaShift = info.getAlphaShift();
+    assert(alphaShift <= 8);
+
+    for (size_t i = 0; i < info.size(); i++) {
+        assert(!info.is_sherman(i));
+        if (!info.is_normal(i)) {
+            assert(info.implId(i) < sheng_end);
+            continue;
+        }
+        u8 normal_id = verify_u8(info.implId(i) - sheng_end);
+
+        for (size_t s = 0; s < info.impl_alpha_size; s++) {
+            dstate_id_t raw_succ = info.states[i].next[s];
+            succ_table[(normal_id << alphaShift) + s] = info.implId(raw_succ);
+        }
+    }
+}
+
+static
+void allocateImplId8(dfa_info &info, dstate_id_t sheng_end,
+                     const map<dstate_id_t, AccelScheme> &accel_escape_info,
+                     u16 *accel_limit, u16 *accept_limit) {
+    info.states[0].impl_id = 0; /* dead is always 0 */
+
+    vector<dstate_id_t> norm;
+    vector<dstate_id_t> accel;
+    vector<dstate_id_t> accept;
+
+    assert(info.size() <= (1 << 8));
+
+    for (u32 i = 1; i < info.size(); i++) {
+        if (info.is_sheng(i)) {
+            continue; /* already allocated */
+        } else if (!info.states[i].reports.empty()) {
+            accept.push_back(i);
+        } else if (contains(accel_escape_info, i)) {
+            accel.push_back(i);
+        } else {
+            norm.push_back(i);
+        }
+    }
+
+    u32 j = sheng_end;
+    for (const dstate_id_t &s : norm) {
+        assert(j <= 256);
+        DEBUG_PRINTF("mapping state %u to %u\n", s, j);
+        info.states[s].impl_id = j++;
+    }
+    *accel_limit = j;
+    for (const dstate_id_t &s : accel) {
+        assert(j <= 256);
+        DEBUG_PRINTF("mapping state %u to %u\n", s, j);
+        info.states[s].impl_id = j++;
+    }
+    *accept_limit = j;
+    for (const dstate_id_t &s : accept) {
+        assert(j <= 256);
+        DEBUG_PRINTF("mapping state %u to %u\n",  s, j);
+        info.states[s].impl_id = j++;
+    }
+}
+
+static
+aligned_unique_ptr<NFA> mcshengCompile8(dfa_info &info, dstate_id_t sheng_end,
+                       const map<dstate_id_t, AccelScheme> &accel_escape_info) {
+    DEBUG_PRINTF("building mcsheng 8\n");
+
+    vector<u32> reports;
+    vector<u32> reports_eod;
+    ReportID arb;
+    u8 single;
+
+    auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
+
+    size_t normal_count = info.size() - sheng_end;
+
+    size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count;
+    size_t aux_size = sizeof(mstate_aux) * info.size();
+    size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng) + tran_size);
+    size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
+    size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+                                     + ri->getReportListSize(), 32);
+    size_t total_size = accel_offset + accel_size;
+
+    DEBUG_PRINTF("aux_size %zu\n", aux_size);
+    DEBUG_PRINTF("aux_offset %zu\n", aux_offset);
+    DEBUG_PRINTF("rl size %u\n", ri->getReportListSize());
+    DEBUG_PRINTF("accel_size %zu\n", accel_size);
+    DEBUG_PRINTF("accel_offset %zu\n", accel_offset);
+    DEBUG_PRINTF("total_size %zu\n", total_size);
+
+    accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
+    assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
+
+    aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
+    mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get());
+
+    allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8,
+                    &m->accept_limit_8);
+
+    populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset,
+                      accel_escape_info.size(), arb, single, nfa.get());
+    createShuffleMasks(m, info, sheng_end, accel_escape_info);
+
+    fill_in_aux_info(nfa.get(), info, accel_escape_info, accel_offset,
+                     total_size - sizeof(NFA), reports, reports_eod,
+                     aux_offset + aux_size, *ri);
+
+    fill_in_succ_table_8(nfa.get(), info, sheng_end);
+
+    DEBUG_PRINTF("rl size %zu\n", ri->size());
+
+    return nfa;
+}
+
+aligned_unique_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
+                                       const ReportManager &rm,
+                                       set<dstate_id_t> *accel_states) {
+    if (!cc.grey.allowMcSheng) {
+        return nullptr;
+    }
+
+    mcclellan_build_strat mbs(raw, rm);
+    dfa_info info(mbs);
+    bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256;
+
+    if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming
+                          * mode with our semantics */
+        raw.stripExtraEodReports();
+    }
+
+    bool has_eod_reports = raw.hasEodReports();
+
+    map<dstate_id_t, AccelScheme> accel_escape_info
+        = info.strat.getAccelInfo(cc.grey);
+
+    dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info);
+    if (sheng_end <= DEAD_STATE + 1) {
+        return nullptr;
+    }
+
+    aligned_unique_ptr<NFA> nfa;
+    if (!using8bit) {
+        nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey);
+    } else {
+        nfa = mcshengCompile8(info, sheng_end, accel_escape_info);
+    }
+
+    if (!nfa) {
+        return nfa;
+    }
+
+    if (has_eod_reports) {
+        nfa->flags |= NFA_ACCEPTS_EOD;
+    }
+
+    if (accel_states) {
+        fillAccelOut(accel_escape_info, accel_states);
+    }
+
+    DEBUG_PRINTF("compile done\n");
+    return nfa;
+}
+
+bool has_accel_mcsheng(const NFA *) {
+    return true; /* consider the sheng region as accelerated */
+}
+
+} // namespace ue2
diff --git a/src/nfa/mcsheng_compile.h b/src/nfa/mcsheng_compile.h
new file mode 100644
index 00000000..24cc66e9
--- /dev/null
+++ b/src/nfa/mcsheng_compile.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MCSHENGCOMPILE_H
+#define MCSHENGCOMPILE_H
+
+#include "accel_dfa_build_strat.h"
+#include "rdfa.h"
+#include "ue2common.h"
+#include "util/alloc.h"
+#include "util/ue2_containers.h"
+
+#include <memory>
+#include <set>
+
+struct NFA;
+
+namespace ue2 {
+
+class ReportManager;
+struct CompileContext;
+
+/* accel_states: (optional) on success, is filled with the set of accelerable
+ * states */
+ue2::aligned_unique_ptr<NFA>
+mcshengCompile(raw_dfa &raw, const CompileContext &cc,
+               const ReportManager &rm,
+               std::set<dstate_id_t> *accel_states = nullptr);
+
+bool has_accel_mcsheng(const NFA *nfa);
+
+} // namespace ue2
+
+#endif
diff --git a/src/nfa/mcsheng_data.c b/src/nfa/mcsheng_data.c
new file mode 100644
index 00000000..eaf3cbbb
--- /dev/null
+++ b/src/nfa/mcsheng_data.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mcsheng_internal.h"
+
+/* This table is in a separate translation unit from mcsheng.c as we want to
+ * prevent the compiler from seeing these constants. We have the load resources
+ * free at runtime to load the masks with no problems. */
+const u64a mcsheng_pext_mask[8] = {
+    0, /* dummy */
+    0x000000000000ff0f,
+    0x0000000000ff000f,
+    0x00000000ff00000f,
+    0x000000ff0000000f,
+    0x0000ff000000000f,
+    0x00ff00000000000f,
+    0xff0000000000000f,
+};
diff --git a/src/nfa/mcsheng_dump.cpp b/src/nfa/mcsheng_dump.cpp
new file mode 100644
index 00000000..f5c058af
--- /dev/null
+++ b/src/nfa/mcsheng_dump.cpp
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "mcsheng_dump.h"
+
+#include "accel.h"
+#include "accel_dump.h"
+#include "nfa_dump_internal.h"
+#include "nfa_internal.h"
+#include "mcsheng_internal.h"
+#include "rdfa.h"
+#include "ue2common.h"
+#include "util/charreach.h"
+#include "util/dump_charclass.h"
+#include "util/dump_util.h"
+#include "util/unaligned.h"
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+
+#ifndef DUMP_SUPPORT
+#error No dump support!
+#endif
+
+using namespace std;
+
+namespace ue2 {
+
+static
+const mstate_aux *getAux(const NFA *n, dstate_id_t i) {
+    auto *m = (const mcsheng *)getImplNfa(n);
+    auto *aux_base = (const mstate_aux *)((const char *)n + m->aux_offset);
+
+    const mstate_aux *aux = aux_base + i;
+
+    assert((const char *)aux < (const char *)n + m->length);
+    return aux;
+}
+
+/* fills t[] with the successor of state s for every byte, plus t[TOP] */
+static
+void next_states(const NFA *n, u16 s, u16 *t) {
+    const mcsheng *m = (const mcsheng *)getImplNfa(n);
+    const mstate_aux *aux = getAux(n, s);
+    const u32 as = m->alphaShift;
+    assert(s != DEAD_STATE);
+
+    if (s < m->sheng_end) {
+        for (u16 c = 0; c < N_CHARS; c++) {
+            u8 sheng_s = s - 1;
+            auto trans_for_c = (const char *)&m->sheng_masks[c];
+            assert(sheng_s < sizeof(m128));
+            u8 raw_succ = trans_for_c[sheng_s];
+            if (raw_succ == m->sheng_end - 1) {
+                t[c] = DEAD_STATE;
+            } else if (raw_succ < m->sheng_end) {
+                t[c] = raw_succ + 1;
+            } else {
+                t[c] = raw_succ;
+            }
+        }
+    } else if (n->type == MCSHENG_NFA_8) {
+        const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng));
+        for (u16 c = 0; c < N_CHARS; c++) {
+            u32 normal_id = s - m->sheng_end;
+            t[c] = succ_table[(normal_id << as) + m->remap[c]];
+        }
+    } else {
+        u16 base_s = s;
+        const char *winfo_base = (const char *)n + m->sherman_offset;
+        const char *state_base
+                = winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit);
+
+        if (s >= m->sherman_limit) {
+            base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET);
+            assert(base_s >= m->sheng_end);
+        }
+
+        const u16 *succ_table = (const u16 *)((const char *)m
+                                              + sizeof(mcsheng));
+        for (u16 c = 0; c < N_CHARS; c++) {
+            u32 normal_id = base_s - m->sheng_end;
+            t[c] = succ_table[(normal_id << as) + m->remap[c]];
+        }
+
+        if (s >= m->sherman_limit) {
+            UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET);
+            assert(type == SHERMAN_STATE);
+            u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base);
+            /* read symbols as u8 so values >= 128 still match remap[] */
+            const u8 *chars = (const u8 *)(state_base + SHERMAN_CHARS_OFFSET);
+            const u16 *states = (const u16 *)(state_base
+                                              + SHERMAN_STATES_OFFSET(len));
+            for (u8 i = 0; i < len; i++) {
+                for (u16 c = 0; c < N_CHARS; c++) {
+                    if (m->remap[c] == chars[i]) {
+                        t[c] = unaligned_load_u16((const u8*)&states[i]);
+                    }
+                }
+            }
+        }
+
+        for (u16 c = 0; c < N_CHARS; c++) {
+            t[c] &= STATE_MASK;
+        }
+    }
+
+    t[TOP] = aux->top & STATE_MASK;
+}
+
+static
+void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) {
+    for (u16 s = 0; s < N_CHARS; s++) {
+        if (!t[s]) {
+            continue;
+        }
+
+        u16 ss;
+        for (ss = 0; ss < s; ss++) {
+            if (t[s] == t[ss]) {
+                break;
+            }
+        }
+
+        if (ss != s) {
+            continue;
+        }
+
+        CharReach reach;
+        for (ss = s; ss < 256; ss++) {
+            if (t[s] == t[ss]) {
+                reach.set(ss);
+            }
+        }
+
+        fprintf(f, "%u -> %u [ ", i, t[s]);
+        if (i < m->sheng_end && t[s] < m->sheng_end) {
+            fprintf(f, "color = red, fontcolor = red ");
+        }
+        fprintf(f, "label = \"");
+        describeClass(f, reach, 5, CC_OUT_DOT);
+
+        fprintf(f, "\" ];\n");
+    }
+}
+
+static
+void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) {
+    switch(accel->accel_type) {
+    case ACCEL_NONE:
+        break;
+    case ACCEL_VERM:
+    case ACCEL_VERM_NOCASE:
+    case ACCEL_DVERM:
+    case ACCEL_DVERM_NOCASE:
+        fprintf(f, "%u [ color = forestgreen style=diagonals];\n", i);
+        break;
+    case ACCEL_SHUFTI:
+    case ACCEL_DSHUFTI:
+    case ACCEL_TRUFFLE:
+        fprintf(f, "%u [ color = darkgreen style=diagonals ];\n", i);
+        break;
+    default:
+        fprintf(f, "%u [ color = yellow style=diagonals ];\n", i);
+        break;
+    }
+}
+
+static
+void describeNode(const NFA *n, const mcsheng *m, u16 i, FILE *f) {
+    const mstate_aux *aux = getAux(n, i);
+
+    bool isSherman = m->sherman_limit && i >= m->sherman_limit;
+
+    fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
+            "label = \"%u%s\" ]; \n", i, i, isSherman ? "w":"");
+
+    if (aux->accel_offset) {
+        dumpAccelDot(f, i, (const union AccelAux *)
+                     ((const char *)m + aux->accel_offset));
+    }
+
+    if (i && i < m->sheng_end) {
+        fprintf(f, "%u [color = red, fontcolor = red]; \n", i);
+    }
+
+    if (aux->accept_eod) {
+        fprintf(f, "%u [ color = darkorchid ];\n", i);
+    }
+
+    if (aux->accept) {
+        fprintf(f, "%u [ shape = doublecircle ];\n", i);
+    }
+
+    if (aux->top && aux->top != i) {
+        fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i,
+                aux->top);
+    }
+
+    if (i == m->start_anchored) {
+        fprintf(f, "STARTA -> %u [color = blue ]\n", i);
+    }
+
+    if (i == m->start_floating) {
+        fprintf(f, "STARTF -> %u [color = red ]\n", i);
+    }
+
+    if (isSherman) {
+        const char *winfo_base = (const char *)n + m->sherman_offset;
+        const char *state_base
+                = winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit);
+        assert(state_base < (const char *)m + m->length - sizeof(NFA));
+        UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET);
+        assert(type == SHERMAN_STATE);
+        fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i);
+        u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET);
+        if (daddy) {
+            fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n",
+                    i, daddy);
+        }
+    }
+
+    if (i && i < m->sheng_end) {
+        fprintf(f, "subgraph cluster_sheng { %u } \n", i);
+    }
+
+}
+
+static
+void dumpDotPreambleDfa(FILE *f) {
+    dumpDotPreamble(f);
+
+    // DFA specific additions.
+    fprintf(f, "STARTF [style=invis];\n");
+    fprintf(f, "STARTA [style=invis];\n");
+    fprintf(f, "0 [style=invis];\n");
+    fprintf(f, "subgraph cluster_sheng { style = dashed }\n");
+}
+
+static
+void dump_dot_16(const NFA *nfa, FILE *f) {
+    auto  *m = (const mcsheng *)getImplNfa(nfa);
+
+    dumpDotPreambleDfa(f);
+
+    for (u16 i = 1; i < m->state_count; i++) {
+        describeNode(nfa, m, i, f);
+
+        u16 t[ALPHABET_SIZE];
+
+        next_states(nfa, i, t);
+
+        describeEdge(f, m, t, i);
+    }
+
+    fprintf(f, "}\n");
+}
+
+static
+void dump_dot_8(const NFA *nfa, FILE *f) {
+    auto m = (const mcsheng *)getImplNfa(nfa);
+
+    dumpDotPreambleDfa(f);
+
+    for (u16 i = 1; i < m->state_count; i++) {
+        describeNode(nfa, m, i, f);
+
+        u16 t[ALPHABET_SIZE];
+
+        next_states(nfa, i, t);
+
+        describeEdge(f, m, t, i);
+    }
+
+    fprintf(f, "}\n");
+}
+
+static
+void dumpAccelMasks(FILE *f, const mcsheng *m, const mstate_aux *aux) {
+    fprintf(f, "\n");
+    fprintf(f, "Acceleration\n");
+    fprintf(f, "------------\n");
+
+    for (u16 i = 0; i < m->state_count; i++) {
+        if (!aux[i].accel_offset) {
+            continue;
+        }
+
+        auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset);
+        fprintf(f, "%05hu ", i);
+        dumpAccelInfo(f, *accel);
+    }
+}
+
+static
+void describeAlphabet(FILE *f, const mcsheng *m) {
+    map<u8, CharReach> rev;
+
+    for (u16 i = 0; i < N_CHARS; i++) {
+        rev[m->remap[i]].clear();
+    }
+
+    for (u16 i = 0; i < N_CHARS; i++) {
+        rev[m->remap[i]].set(i);
+    }
+
+    map<u8, CharReach>::const_iterator it;
+    fprintf(f, "\nAlphabet\n");
+    for (it = rev.begin(); it != rev.end(); ++it) {
+        fprintf(f, "%3hhu: ", it->first);
+        describeClass(f, it->second, 10240, CC_OUT_TEXT);
+        fprintf(f, "\n");
+    }
+    fprintf(f, "\n");
+}
+
+static
+void dumpCommonHeader(FILE *f, const mcsheng *m) {
+    fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report,
+            m->state_count, m->length);
+    fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored,
+            m->start_floating);
+    fprintf(f, "single accept: %d, has_accel: %d\n",
+            !!(m->flags & MCSHENG_FLAG_SINGLE), m->has_accel); /* mask first */
+    fprintf(f, "sheng_end:         %hu\n", m->sheng_end);
+    fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit);
+}
+
+static
+void dump_text_16(const NFA *nfa, FILE *f) {
+    auto *m = (const mcsheng *)getImplNfa(nfa);
+    auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
+
+    fprintf(f, "mcsheng 16\n");
+    dumpCommonHeader(f, m);
+    fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit,
+            (int)m->sherman_end);
+    fprintf(f, "\n");
+
+    describeAlphabet(f, m);
+    dumpAccelMasks(f, m, aux);
+
+    fprintf(f, "\n");
+    dumpTextReverse(nfa, f);
+}
+
+static
+void dump_text_8(const NFA *nfa, FILE *f) {
+    auto m = (const mcsheng *)getImplNfa(nfa);
+    auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
+
+    fprintf(f, "mcsheng 8\n");
+    dumpCommonHeader(f, m);
+    fprintf(f, "accel_limit: %hu, accept_limit %hu\n", m->accel_limit_8,
+            m->accept_limit_8);
+    fprintf(f, "\n");
+
+    describeAlphabet(f, m);
+    dumpAccelMasks(f, m, aux);
+
+    fprintf(f, "\n");
+    dumpTextReverse(nfa, f);
+}
+
+void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) {
+    assert(nfa->type == MCSHENG_NFA_16);
+    FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
+    dump_text_16(nfa, f);
+    fclose(f);
+    f = fopen_or_throw((base + ".dot").c_str(), "w");
+    dump_dot_16(nfa, f);
+    fclose(f);
+}
+
+void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) {
+    assert(nfa->type == MCSHENG_NFA_8);
+    FILE *f = fopen_or_throw((base + ".txt").c_str(), "w");
+    dump_text_8(nfa, f);
+    fclose(f);
+    f = fopen_or_throw((base + ".dot").c_str(), "w");
+    dump_dot_8(nfa, f);
+    fclose(f);
+}
+
+} // namespace ue2
diff --git a/src/nfa/mcsheng_dump.h b/src/nfa/mcsheng_dump.h
new file mode 100644
index 00000000..1b699367
--- /dev/null
+++ b/src/nfa/mcsheng_dump.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MCSHENG_DUMP_H
+#define MCSHENG_DUMP_H
+
+#ifdef DUMP_SUPPORT
+
+#include "rdfa.h"
+
+#include <cstdio>
+#include <string>
+
+struct NFA;
+
+namespace ue2 {
+
+void nfaExecMcSheng8_dump(const struct NFA *nfa, const std::string &base);
+void nfaExecMcSheng16_dump(const struct NFA *nfa, const std::string &base);
+
+} // namespace ue2
+
+#endif // DUMP_SUPPORT
+
+#endif // MCSHENG_DUMP_H
diff --git a/src/nfa/mcsheng_internal.h b/src/nfa/mcsheng_internal.h
new file mode 100644
index 00000000..5ced6f76
--- /dev/null
+++ b/src/nfa/mcsheng_internal.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MCSHENG_INTERNAL_H
+#define MCSHENG_INTERNAL_H
+
+#include "nfa_internal.h"
+#include "ue2common.h"
+#include "util/simd_utils.h"
+
+#define ACCEPT_FLAG 0x8000
+#define ACCEL_FLAG  0x4000
+#define STATE_MASK  0x3fff
+
+#define SHERMAN_STATE 1
+
+#define SHERMAN_TYPE_OFFSET            0
+#define SHERMAN_FIXED_SIZE            32
+
+#define SHERMAN_LEN_OFFSET             1
+#define SHERMAN_DADDY_OFFSET           2
+#define SHERMAN_CHARS_OFFSET           4
+#define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len))
+
+struct report_list {
+    u32 count; /* presumably number of entries in report[] - confirm */
+    ReportID report[]; /* flexible array member */
+};
+
+struct mstate_aux {
+    u32 accept; /* presumably refers to the reports for an accept - confirm */
+    u32 accept_eod; /* presumably as accept, but for end of data - confirm */
+    u16 top; /* presumably the state to move to on a top event - confirm */
+    u32 accel_offset; /* relative to start of struct mcsheng; 0 if no accel */
+};
+
+#define MCSHENG_FLAG_SINGLE 1  /**< we raise only single accept id */
+
+struct mcsheng {
+    u16 state_count; /**< total number of states */
+    u32 length; /**< length of dfa in bytes */
+    u16 start_anchored; /**< anchored start state */
+    u16 start_floating; /**< floating start state */
+    u32 aux_offset; /**< offset of the aux structures relative to the start of
+                     *  the nfa structure */
+    u32 sherman_offset; /**< offset of the sherman state_info structures
+                         * relative to the start of the nfa structure
+                         * (NOTE(review): wording reconstructed; confirm) */
+    u32 sherman_end; /**< offset of the end of the state_info structures
+                      * relative to the start of the nfa structure */
+    u16 sheng_end; /**< first non-sheng state */
+    u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of
+                            * internal sheng ids */
+    u16 accel_limit_8; /**< 8 bit, lowest accelerable state */
+    u16 accept_limit_8; /**< 8 bit, lowest accept state */
+    u16 sherman_limit; /**< lowest sherman state */
+    u8  alphaShift; /**< NOTE(review): presumably log2 of the row stride */
+    u8  flags; /**< MCSHENG_FLAG_* bits */
+    u8  has_accel; /**< 1 iff there are any accel plans */
+    u8  remap[256]; /**< remaps characters to a smaller alphabet */
+    ReportID arb_report; /**< one of the accepts that this dfa may raise */
+    u32 accel_offset; /**< offset of the accel structures from start of NFA */
+    m128 sheng_masks[N_CHARS]; /**< NOTE(review): presumably one per byte */
+};
+
+/* pext masks used by the runtime to access the appropriate copies of bytes
+ * 1..7 of the data loaded from a u64a. */
+extern const u64a mcsheng_pext_mask[8];
+
+#endif
diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c
index d4e9eb78..f4b7552e 100644
--- a/src/nfa/nfa_api_dispatch.c
+++ b/src/nfa/nfa_api_dispatch.c
@@ -41,6 +41,7 @@
 #include "lbr.h"
 #include "limex.h"
 #include "mcclellan.h"
+#include "mcsheng.h"
 #include "mpv.h"
 #include "sheng.h"
 #include "tamarama.h"
@@ -73,6 +74,8 @@
         DISPATCH_CASE(CASTLE_NFA, Castle, dbnt_func);                          \
         DISPATCH_CASE(SHENG_NFA, Sheng, dbnt_func);                            \
         DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func);                      \
+        DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func);                     \
+        DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func);                   \
     default:                                                                   \
         assert(0);                                                             \
     }
diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp
index 3b235bf4..3103cd29 100644
--- a/src/nfa/nfa_build_util.cpp
+++ b/src/nfa/nfa_build_util.cpp
@@ -30,6 +30,7 @@
 
 #include "limex_internal.h"
 #include "mcclellancompile.h"
+#include "mcsheng_compile.h"
 #include "shengcompile.h"
 #include "nfa_internal.h"
 #include "repeat_internal.h"
@@ -413,6 +414,38 @@ const nfa_dispatch_fn NFATraits<TAMARAMA_NFA>::has_repeats_other_than_firsts = d
 const char *NFATraits<TAMARAMA_NFA>::name = "Tamarama";
 #endif
 
+template<> struct NFATraits<MCSHENG_NFA_8> {
+    UNUSED static const char *name; /* only defined under DUMP_SUPPORT */
+    static const NFACategory category = NFA_OTHER;
+    static const u32 stateAlign = 1;
+    static const bool fast = true;
+    static const nfa_dispatch_fn has_accel; /* -> has_accel_mcsheng */
+    static const nfa_dispatch_fn has_repeats; /* -> dispatch_false */
+    static const nfa_dispatch_fn has_repeats_other_than_firsts; /* ditto */
+};
+const nfa_dispatch_fn NFATraits<MCSHENG_NFA_8>::has_accel = has_accel_mcsheng;
+const nfa_dispatch_fn NFATraits<MCSHENG_NFA_8>::has_repeats = dispatch_false;
+const nfa_dispatch_fn NFATraits<MCSHENG_NFA_8>::has_repeats_other_than_firsts = dispatch_false;
+#if defined(DUMP_SUPPORT)
+const char *NFATraits<MCSHENG_NFA_8>::name = "Shengy McShengFace 8";
+#endif
+
+template<> struct NFATraits<MCSHENG_NFA_16> {
+    UNUSED static const char *name; /* only defined under DUMP_SUPPORT */
+    static const NFACategory category = NFA_OTHER;
+    static const u32 stateAlign = 2;
+    static const bool fast = true;
+    static const nfa_dispatch_fn has_accel; /* -> has_accel_mcsheng */
+    static const nfa_dispatch_fn has_repeats; /* -> dispatch_false */
+    static const nfa_dispatch_fn has_repeats_other_than_firsts; /* ditto */
+};
+const nfa_dispatch_fn NFATraits<MCSHENG_NFA_16>::has_accel = has_accel_mcsheng;
+const nfa_dispatch_fn NFATraits<MCSHENG_NFA_16>::has_repeats = dispatch_false;
+const nfa_dispatch_fn NFATraits<MCSHENG_NFA_16>::has_repeats_other_than_firsts = dispatch_false;
+#if defined(DUMP_SUPPORT)
+const char *NFATraits<MCSHENG_NFA_16>::name = "Shengy McShengFace 16";
+#endif
+
 } // namespace
 
 #if defined(DUMP_SUPPORT)
diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp
index 3dea5ef7..5607ed27 100644
--- a/src/nfa/nfa_dump_dispatch.cpp
+++ b/src/nfa/nfa_dump_dispatch.cpp
@@ -39,6 +39,7 @@
 #include "lbr_dump.h"
 #include "limex.h"
 #include "mcclellandump.h"
+#include "mcsheng_dump.h"
 #include "mpv_dump.h"
 #include "shengdump.h"
 #include "tamarama_dump.h"
@@ -78,6 +79,8 @@ namespace ue2 {
         DISPATCH_CASE(CASTLE_NFA, Castle, dbnt_func);                          \
         DISPATCH_CASE(SHENG_NFA, Sheng, dbnt_func);                            \
         DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func);                      \
+        DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func);                     \
+        DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func);                   \
     default:                                                                   \
         assert(0);                                                             \
     }
diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h
index 1ce566ff..9d280822 100644
--- a/src/nfa/nfa_internal.h
+++ b/src/nfa/nfa_internal.h
@@ -70,6 +70,8 @@ enum NFAEngineType {
     CASTLE_NFA,         /**< magic pseudo nfa */
     SHENG_NFA,          /**< magic pseudo nfa */
     TAMARAMA_NFA,       /**< magic nfa container */
+    MCSHENG_NFA_8,      /**< magic pseudo nfa */
+    MCSHENG_NFA_16,     /**< magic pseudo nfa */
     /** \brief bogus NFA - not used */
     INVALID_NFA
 };
@@ -143,6 +145,12 @@ static really_inline int isMcClellanType(u8 t) {
     return t == MCCLELLAN_NFA_8 || t == MCCLELLAN_NFA_16;
 }
 
+/** \brief True if \p t (a value of NFA::type) names one of the hybrid
+ * Sheng-McClellan DFA engine types, MCSHENG_NFA_8 or MCSHENG_NFA_16. */
+static really_inline int isShengMcClellanType(u8 t) {
+    return t == MCSHENG_NFA_16 || t == MCSHENG_NFA_8;
+}
+
 /** \brief True if the given type (from NFA::type) is a Gough DFA. */
 static really_inline int isGoughType(u8 t) {
     return t == GOUGH_NFA_8 || t == GOUGH_NFA_16;
@@ -158,7 +166,16 @@ static really_inline int isShengType(u8 t) {
  * Sheng DFA.
  */
 static really_inline int isDfaType(u8 t) {
-    return isMcClellanType(t) || isGoughType(t) || isShengType(t);
+    return isMcClellanType(t) || isGoughType(t) || isShengType(t)
+        || isShengMcClellanType(t);
+}
+
+static really_inline int isBigDfaType(u8 t) { /* DFA types with _16 suffix */
+    return t == MCCLELLAN_NFA_16 || t == MCSHENG_NFA_16 || t == GOUGH_NFA_16;
+}
+
+static really_inline int isSmallDfaType(u8 t) { /* any DFA type not "big" */
+    return isDfaType(t) && !isBigDfaType(t);
+}
 
 /** \brief True if the given type (from NFA::type) is an NFA. */
diff --git a/src/nfa/rdfa_graph.cpp b/src/nfa/rdfa_graph.cpp
new file mode 100644
index 00000000..2467748b
--- /dev/null
+++ b/src/nfa/rdfa_graph.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "rdfa_graph.h"
+
+#include "rdfa.h"
+#include "util/container.h"
+
+#include <vector>
+
+using namespace std;
+
+namespace ue2 {
+
+RdfaGraph::RdfaGraph(const raw_dfa &rdfa) {
+    /* One vertex per dfa state (vertex index == state id) and one edge per
+     * distinct (state, successor) pair. */
+    RdfaGraph &graph = *this;
+    const size_t state_count = rdfa.states.size();
+    vector<RdfaGraph::vertex_descriptor> state_vertex;
+    state_vertex.reserve(state_count);
+    for (dstate_id_t id = 0; id < state_count; id++) {
+        state_vertex.push_back(add_vertex(graph));
+        assert(graph[state_vertex.back()].index == id);
+    }
+    /* note: the scan stops short of the last symbol (alpha_size - 1) */
+    const symbol_t symbol_end = rdfa.alpha_size - 1;
+    flat_set<dstate_id_t> seen;
+    for (dstate_id_t from = 0; from < state_count; from++) {
+        seen.clear();
+        for (symbol_t sym = 0; sym < symbol_end; sym++) {
+            const dstate_id_t to = rdfa.states[from].next[sym];
+            if (!contains(seen, to)) {
+                DEBUG_PRINTF("%hu->%hu\n", from, to);
+                add_edge(state_vertex[from], state_vertex[to], graph);
+                seen.insert(to);
+            }
+        }
+    }
+}
+
+}
diff --git a/src/nfa/rdfa_graph.h b/src/nfa/rdfa_graph.h
new file mode 100644
index 00000000..6d166c2f
--- /dev/null
+++ b/src/nfa/rdfa_graph.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RDFA_GRAPH_H
+#define RDFA_GRAPH_H
+
+#include "ue2common.h"
+#include "util/ue2_graph.h"
+
+namespace ue2 {
+
+struct raw_dfa;
+
+struct RdfaVertexProps {
+    size_t index = 0; /* presumably assigned by ue2_graph - confirm */
+};
+
+struct RdfaEdgeProps {
+    size_t index = 0; /* presumably assigned by ue2_graph - confirm */
+};
+
+struct RdfaGraph : public ue2_graph<RdfaGraph, RdfaVertexProps, RdfaEdgeProps> {
+    RdfaGraph(const raw_dfa &rdfa); /* builds the graph from rdfa */
+};
+
+
+}
+
+#endif
diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp
index 3902dbaf..a02a9b96 100644
--- a/src/nfa/shengcompile.cpp
+++ b/src/nfa/shengcompile.cpp
@@ -447,9 +447,8 @@ void createShuffleMasks(sheng *s, dfa_info &info,
     }
 }
 
-bool has_accel_sheng(const NFA *nfa) {
-    const sheng *s = (const sheng *)getImplNfa(nfa);
-    return s->flags & SHENG_FLAG_HAS_ACCEL;
+bool has_accel_sheng(const NFA *) {
+    return true; /* consider the sheng region as accelerated */
 }
 
 aligned_unique_ptr<NFA> shengCompile(raw_dfa &raw,
diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp
index 948cd7f1..5252eb18 100644
--- a/src/nfagraph/ng_util.cpp
+++ b/src/nfagraph/ng_util.cpp
@@ -46,7 +46,6 @@
 #include <map>
 #include <set>
 #include <boost/graph/filtered_graph.hpp>
-#include <boost/graph/strong_components.hpp>
 #include <boost/graph/topological_sort.hpp>
 #include <boost/range/adaptor/map.hpp>
 
@@ -54,7 +53,6 @@ using namespace std;
 using boost::default_color_type;
 using boost::make_filtered_graph;
 using boost::make_assoc_property_map;
-using boost::adaptors::map_values;
 
 namespace ue2 {
 
@@ -257,38 +255,6 @@ bool hasBigCycles(const NGHolder &g) {
     return false;
 }
 
-set<NFAVertex> findVerticesInCycles(const NGHolder &g) {
-    map<NFAVertex, size_t> comp_map;
-
-    strong_components(g, make_assoc_property_map(comp_map));
-
-    map<size_t, set<NFAVertex> > comps;
-
-    for (const auto &e : comp_map) {
-        comps[e.second].insert(e.first);
-    }
-
-
-    set<NFAVertex> rv;
-
-    for (const auto &comp : comps | map_values) {
-        /* every vertex in a strongly connected component is reachable from
-         * every other vertex in the component. A vertex is involved in a cycle
-         * therefore if it is in a strongly connected component with more than
-         * one vertex or if it is the only vertex and it has a self loop. */
-        assert(!comp.empty());
-        if (comp.size() > 1) {
-            insert(&rv, comp);
-        }
-        NFAVertex v = *comp.begin();
-        if (hasSelfLoop(v, g)) {
-            rv.insert(v);
-        }
-    }
-
-    return rv;
-}
-
 bool can_never_match(const NGHolder &g) {
     assert(edge(g.accept, g.acceptEod, g).second);
     if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) {
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index f074973d..ef74619d 100644
--- a/src/rose/rose_build_bytecode.cpp
+++ b/src/rose/rose_build_bytecode.cpp
@@ -52,6 +52,7 @@
 #include "nfa/goughcompile.h"
 #include "nfa/mcclellancompile.h"
 #include "nfa/mcclellancompile_util.h"
+#include "nfa/mcsheng_compile.h"
 #include "nfa/nfa_api_queue.h"
 #include "nfa/nfa_build_util.h"
 #include "nfa/nfa_internal.h"
@@ -615,7 +616,7 @@ aligned_unique_ptr<NFA> pickImpl(aligned_unique_ptr<NFA> dfa_impl,
 
     bool d_accel = has_accel(*dfa_impl);
     bool n_accel = has_accel(*nfa_impl);
-    bool d_big = dfa_impl->type == MCCLELLAN_NFA_16;
+    bool d_big = isBigDfaType(dfa_impl->type);
     bool n_vsmall = nfa_impl->nPositions <= 32;
     bool n_br = has_bounded_repeats(*nfa_impl);
     DEBUG_PRINTF("da %d na %d db %d nvs %d nbr %d\n", (int)d_accel,
@@ -666,10 +667,17 @@ buildRepeatEngine(const CastleProto &proto,
 }
 
 static
-aligned_unique_ptr<NFA> getDfa(raw_dfa &rdfa, const CompileContext &cc,
+aligned_unique_ptr<NFA> getDfa(raw_dfa &rdfa, bool is_transient,
+                               const CompileContext &cc,
                                const ReportManager &rm) {
     // Unleash the Sheng!!
     auto dfa = shengCompile(rdfa, cc, rm);
+    if (!dfa && !is_transient) {
+        // Sheng wasn't successful, so try the McSheng hybrid next.
+        /* We don't try the hybrid for transient prefixes due to the extra
+         * bytecode and that they are usually run on small blocks */
+        dfa = mcshengCompile(rdfa, cc, rm);
+    }
     if (!dfa) {
         // Sheng wasn't successful, so unleash McClellan!
         dfa = mcclellanCompile(rdfa, cc, rm);
@@ -697,7 +705,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm,
     }
 
     if (suff.dfa()) {
-        auto d = getDfa(*suff.dfa(), cc, rm);
+        auto d = getDfa(*suff.dfa(), false, cc, rm);
         assert(d);
         return d;
     }
@@ -726,7 +734,7 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm,
             auto rdfa = buildMcClellan(holder, &rm, false, triggers.at(0),
                                        cc.grey);
             if (rdfa) {
-                auto d = getDfa(*rdfa, cc, rm);
+                auto d = getDfa(*rdfa, false, cc, rm);
                 assert(d);
                 if (cc.grey.roseMcClellanSuffix != 2) {
                     n = pickImpl(move(d), move(n));
@@ -846,12 +854,12 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left,
     }
 
     if (left.dfa()) {
-        n = getDfa(*left.dfa(), cc, rm);
+        n = getDfa(*left.dfa(), is_transient, cc, rm);
     } else if (left.graph() && cc.grey.roseMcClellanPrefix == 2 && is_prefix &&
                !is_transient) {
         auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey);
         if (rdfa) {
-            n = getDfa(*rdfa, cc, rm);
+            n = getDfa(*rdfa, is_transient, cc, rm);
             assert(n);
         }
     }
@@ -878,7 +886,7 @@ makeLeftNfa(const RoseBuildImpl &tbi, left_id &left,
         && (!n || !has_bounded_repeats_other_than_firsts(*n) || !is_fast(*n))) {
         auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey);
         if (rdfa) {
-            auto d = getDfa(*rdfa, cc, rm);
+            auto d = getDfa(*rdfa, is_transient, cc, rm);
             assert(d);
             n = pickImpl(move(d), move(n));
         }
@@ -1614,7 +1622,7 @@ public:
 
     aligned_unique_ptr<NFA> operator()(unique_ptr<raw_dfa> &rdfa) const {
         // Unleash the mighty DFA!
-        return getDfa(*rdfa, build.cc, build.rm);
+        return getDfa(*rdfa, false, build.cc, build.rm);
     }
 
     aligned_unique_ptr<NFA> operator()(unique_ptr<raw_som_dfa> &haig) const {
@@ -1642,7 +1650,7 @@ public:
             !has_bounded_repeats_other_than_firsts(*n)) {
             auto rdfa = buildMcClellan(h, &rm, cc.grey);
             if (rdfa) {
-                auto d = getDfa(*rdfa, cc, rm);
+                auto d = getDfa(*rdfa, false, cc, rm);
                 if (d) {
                     n = pickImpl(move(d), move(n));
                 }
diff --git a/src/rose/rose_build_infix.cpp b/src/rose/rose_build_infix.cpp
index f3e7680f..4bbb3525 100644
--- a/src/rose/rose_build_infix.cpp
+++ b/src/rose/rose_build_infix.cpp
@@ -278,7 +278,7 @@ void findCountingMiracleInfo(const left_id &left, const vector<u8> &stopTable,
 
     const NGHolder &g = *left.graph();
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
     if (!proper_out_degree(g.startDs, g)) {
         cyclics.erase(g.startDs);
diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp
index 50ca1d9e..28b885bd 100644
--- a/src/rose/rose_build_misc.cpp
+++ b/src/rose/rose_build_misc.cpp
@@ -1206,7 +1206,7 @@ u32 roseQuality(const RoseEngine *t) {
         }
         const NFA *nfa = (const NFA *)((const char *)atable + sizeof(*atable));
 
-        if (nfa->type != MCCLELLAN_NFA_8) {
+        if (!isSmallDfaType(nfa->type)) {
             DEBUG_PRINTF("m16 atable engine\n");
             return 0;
         }
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index b7a09ca7..d144e879 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -471,4 +471,55 @@ u32 rank_in_mask64(u64a mask, u32 bit) {
     return popcount64(mask);
 }
 
+#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__))
+#define HAVE_PEXT
+#endif
+
+static really_inline
+u32 pext32(u32 x, u32 mask) {
+#if defined(HAVE_PEXT)
+    // Hardware path: BMI2 performs the extract in a single instruction.
+    return _pext_u32(x, mask);
+#else
+    // Software path: consume the set bits of mask one at a time, copying
+    // the matching bit of x into successive low bits of the result.
+    u32 packed = 0;
+    for (u32 out_bit = 1; mask != 0; out_bit <<= 1) {
+        const u32 src = findAndClearLSB_32(&mask);
+        if ((x >> src) & 1U) {
+            assert(out_bit != 0); // more than 32 bits!
+            packed |= out_bit;
+        }
+    }
+    return packed;
+#endif
+}
+
+static really_inline
+u64a pext64(u64a x, u64a mask) {
+#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+    // Intel BMI2 can do this operation in one instruction.
+    return _pext_u64(x, mask);
+#else
+    // Fallback: 64-bit accumulators, so extracts wider than 32 bits survive.
+    u64a result = 0, num = 1;
+    while (mask != 0) {
+        u32 bit = findAndClearLSB_64(&mask);
+        if (x & (1ULL << bit)) {
+            assert(num != 0); // more than 64 bits!
+            result |= num;
+        }
+        num <<= 1;
+    }
+    return result;
+#endif
+}
+
+#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+static really_inline
+u64a pdep64(u64a x, u64a mask) { // hardware-only: no software fallback here
+    return _pdep_u64(x, mask);
+}
+#endif
+
 #endif // BITUTILS_H
diff --git a/src/util/graph.h b/src/util/graph.h
index ae7c2c90..4c2876f1 100644
--- a/src/util/graph.h
+++ b/src/util/graph.h
@@ -39,8 +39,12 @@
 #include "util/ue2_containers.h"
 
 #include <boost/graph/depth_first_search.hpp>
+#include <boost/graph/strong_components.hpp>
+#include <boost/range/adaptor/map.hpp>
 
 #include <algorithm>
+#include <map>
+#include <set>
 #include <utility>
 #include <vector>
 
@@ -140,6 +144,41 @@ void find_unreachable(const Graph &g, const SourceCont &sources, OutCont *out) {
     }
 }
 
+template <class Graph>
+ue2::flat_set<typename Graph::vertex_descriptor>
+find_vertices_in_cycles(const Graph &g) {
+    using vertex_descriptor = typename Graph::vertex_descriptor;
+    using component = std::vector<vertex_descriptor>;
+
+    // Label every vertex with the id of its strongly connected component.
+    std::map<vertex_descriptor, size_t> scc_of;
+    boost::strong_components(g, boost::make_assoc_property_map(scc_of));
+
+    // Invert the labelling: component id -> member vertices.
+    std::map<size_t, component> members;
+    for (const auto &entry : scc_of) {
+        members[entry.second].push_back(entry.first);
+    }
+
+    /* Within a strongly connected component every vertex can reach every
+     * other one, so each member of a multi-vertex component lies on a cycle.
+     * A vertex that is alone in its component is only on a cycle if it has
+     * a self loop. */
+    ue2::flat_set<vertex_descriptor> cyclic;
+    for (const component &comp : members | boost::adaptors::map_values) {
+        assert(!comp.empty());
+        if (comp.size() > 1) {
+            insert(&cyclic, comp);
+        }
+        const vertex_descriptor first = comp.front();
+        if (hasSelfLoop(first, g)) {
+            cyclic.insert(first);
+        }
+    }
+
+    return cyclic;
+}
+
 template <class Graph>
 bool has_parallel_edge(const Graph &g) {
     using vertex_descriptor = typename Graph::vertex_descriptor;
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index 35e1a390..e8676249 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -159,6 +159,10 @@ static really_inline m128 set16x8(u8 c) {
     return _mm_set1_epi8(c);
 }
 
+static really_inline m128 set4x32(u32 c) {
+    return _mm_set1_epi32(c); // broadcast c into all four 32-bit lanes
+}
+
 static really_inline u32 movd(const m128 in) {
     return _mm_cvtsi128_si32(in);
 }
@@ -328,6 +332,25 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
     return pshufb(in, shift_mask);
 }
 
+static really_inline
+m128 max_u8_m128(m128 a, m128 b) {
+    return _mm_max_epu8(a, b); // per-byte unsigned maximum
+}
+
+static really_inline
+m128 min_u8_m128(m128 a, m128 b) {
+    return _mm_min_epu8(a, b); // per-byte unsigned minimum
+}
+
+static really_inline
+m128 sadd_u8_m128(m128 a, m128 b) {
+    return _mm_adds_epu8(a, b); // per-byte unsigned add, saturating at 0xff
+}
+
+static really_inline
+m128 sub_u8_m128(m128 a, m128 b) {
+    return _mm_sub_epi8(a, b); // per-byte subtract; wraps, does not saturate
+}
 
 /****
  **** 256-bit Primitives
diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp
index 4d476932..31aaf17f 100644
--- a/unit/internal/bitutils.cpp
+++ b/unit/internal/bitutils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -436,3 +436,16 @@ TEST(BitUtils, rank_in_mask64) {
     ASSERT_EQ(15, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 31));
     ASSERT_EQ(31, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 63));
 }
+
+#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+TEST(BitUtils, pdep64) {
+    u64a data = 0xF123456789ABCDEF; // low bits are deposited first
+    ASSERT_EQ(0xfULL, pdep64(data, 0xf));
+    ASSERT_EQ(0xefULL, pdep64(data, 0xff));
+    ASSERT_EQ(0xf0ULL, pdep64(data, 0xf0));
+    ASSERT_EQ(0xf000000000000000ULL, pdep64(data, 0xf000000000000000ULL));
+    ASSERT_EQ(0xef0ULL, pdep64(data, 0xff0));
+    ASSERT_EQ(0xef00ULL, pdep64(data, 0xff00));
+    ASSERT_EQ(0xd0e0f00ULL, pdep64(data, 0xf0f0f00));
+}
+#endif
diff --git a/unit/internal/nfagraph_util.cpp b/unit/internal/nfagraph_util.cpp
index 135276dd..b6952f5a 100644
--- a/unit/internal/nfagraph_util.cpp
+++ b/unit/internal/nfagraph_util.cpp
@@ -320,9 +320,9 @@ TEST(NFAGraph, cyclicVerts1) {
     add_edge(a, b, g);
     add_edge(b, a, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts2) {
@@ -341,9 +341,9 @@ TEST(NFAGraph, cyclicVerts2) {
     add_edge(c, d, g);
     add_edge(a, e, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b, c}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b, c}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts3) {
@@ -369,9 +369,9 @@ TEST(NFAGraph, cyclicVerts3) {
     add_edge(f, h, g);
     add_edge(h, h, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b, c, d, e, h}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b, c, d, e, h}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts4) {
@@ -396,9 +396,9 @@ TEST(NFAGraph, cyclicVerts4) {
     add_edge(e, f, g);
     add_edge(f, h, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, a, b, c, d, e}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, a, b, c, d, e}), cyclics);
 }
 
 TEST(NFAGraph, cyclicVerts5) {
@@ -418,7 +418,7 @@ TEST(NFAGraph, cyclicVerts5) {
     add_edge(c, d, g);
     add_edge(e, c, g);
 
-    auto cyclics = findVerticesInCycles(g);
+    auto cyclics = find_vertices_in_cycles(g);
 
-    ASSERT_EQ(set<NFAVertex>({g.startDs, b, c}), cyclics);
+    ASSERT_EQ(flat_set<NFAVertex>({g.startDs, b, c}), cyclics);
 }
diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp
index 614b641d..a4632c36 100644
--- a/unit/internal/shuffle.cpp
+++ b/unit/internal/shuffle.cpp
@@ -54,14 +54,14 @@ TEST(Shuffle, PackedExtract32_1) {
     for (unsigned int i = 0; i < 32; i++) {
         // shuffle a single 1 bit to the front
         u32 mask = 1U << i;
-        EXPECT_EQ(1U, packedExtract32(mask, mask));
-        EXPECT_EQ(1U, packedExtract32(~0U, mask));
+        EXPECT_EQ(1U, pext32(mask, mask));
+        EXPECT_EQ(1U, pext32(~0U, mask));
         // we should get zero out of these cases
-        EXPECT_EQ(0U, packedExtract32(0, mask));
-        EXPECT_EQ(0U, packedExtract32(~mask, mask));
+        EXPECT_EQ(0U, pext32(0, mask));
+        EXPECT_EQ(0U, pext32(~mask, mask));
         // we should get zero out of all the other bit positions
         for (unsigned int j = 0; (j != i && j < 32); j++) {
-            EXPECT_EQ(0U, packedExtract32((1U << j), mask));
+            EXPECT_EQ(0U, pext32((1U << j), mask));
         }
     }
 }
@@ -69,10 +69,10 @@ TEST(Shuffle, PackedExtract32_1) {
 TEST(Shuffle, PackedExtract32_2) {
     // All 32 bits in mask are on
     u32 mask = ~0U;
-    EXPECT_EQ(0U, packedExtract32(0, mask));
-    EXPECT_EQ(mask, packedExtract32(mask, mask));
+    EXPECT_EQ(0U, pext32(0, mask));
+    EXPECT_EQ(mask, pext32(mask, mask));
     for (unsigned int i = 0; i < 32; i++) {
-        EXPECT_EQ(1U << i, packedExtract32(1U << i, mask));
+        EXPECT_EQ(1U << i, pext32(1U << i, mask));
     }
 }
 
@@ -84,16 +84,16 @@ TEST(Shuffle, PackedExtract32_3) {
     }
 
     // Test both cases (all even bits, all odd bits)
-    EXPECT_EQ((1U << 16) - 1, packedExtract32(mask, mask));
-    EXPECT_EQ((1U << 16) - 1, packedExtract32(~mask, ~mask));
-    EXPECT_EQ(0U, packedExtract32(~mask, mask));
-    EXPECT_EQ(0U, packedExtract32(mask, ~mask));
+    EXPECT_EQ((1U << 16) - 1, pext32(mask, mask));
+    EXPECT_EQ((1U << 16) - 1, pext32(~mask, ~mask));
+    EXPECT_EQ(0U, pext32(~mask, mask));
+    EXPECT_EQ(0U, pext32(mask, ~mask));
 
     for (unsigned int i = 0; i < 32; i += 2) {
-        EXPECT_EQ(1U << (i/2), packedExtract32(1U << i, mask));
-        EXPECT_EQ(0U, packedExtract32(1U << i, ~mask));
-        EXPECT_EQ(1U << (i/2), packedExtract32(1U << (i+1), ~mask));
-        EXPECT_EQ(0U, packedExtract32(1U << (i+1), mask));
+        EXPECT_EQ(1U << (i/2), pext32(1U << i, mask));
+        EXPECT_EQ(0U, pext32(1U << i, ~mask));
+        EXPECT_EQ(1U << (i/2), pext32(1U << (i+1), ~mask));
+        EXPECT_EQ(0U, pext32(1U << (i+1), mask));
     }
 }
 
@@ -102,14 +102,14 @@ TEST(Shuffle, PackedExtract64_1) {
     for (unsigned int i = 0; i < 64; i++) {
         // shuffle a single 1 bit to the front
         u64a mask = 1ULL << i;
-        EXPECT_EQ(1U, packedExtract64(mask, mask));
-        EXPECT_EQ(1U, packedExtract64(~0ULL, mask));
+        EXPECT_EQ(1U, pext64(mask, mask));
+        EXPECT_EQ(1U, pext64(~0ULL, mask));
         // we should get zero out of these cases
-        EXPECT_EQ(0U, packedExtract64(0, mask));
-        EXPECT_EQ(0U, packedExtract64(~mask, mask));
+        EXPECT_EQ(0U, pext64(0, mask));
+        EXPECT_EQ(0U, pext64(~mask, mask));
         // we should get zero out of all the other bit positions
         for (unsigned int j = 0; (j != i && j < 64); j++) {
-            EXPECT_EQ(0U, packedExtract64((1ULL << j), mask));
+            EXPECT_EQ(0U, pext64((1ULL << j), mask));
         }
     }
 }
@@ -117,26 +117,26 @@ TEST(Shuffle, PackedExtract64_1) {
 TEST(Shuffle, PackedExtract64_2) {
     // Fill first half of mask
     u64a mask = 0x00000000ffffffffULL;
-    EXPECT_EQ(0U, packedExtract64(0, mask));
-    EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask));
+    EXPECT_EQ(0U, pext64(0, mask));
+    EXPECT_EQ(0xffffffffU, pext64(mask, mask));
     for (unsigned int i = 0; i < 32; i++) {
-        EXPECT_EQ(1U << i, packedExtract64(1ULL << i, mask));
+        EXPECT_EQ(1U << i, pext64(1ULL << i, mask));
     }
 
     // Fill second half of mask
     mask = 0xffffffff00000000ULL;
-    EXPECT_EQ(0U, packedExtract64(0, mask));
-    EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask));
+    EXPECT_EQ(0U, pext64(0, mask));
+    EXPECT_EQ(0xffffffffU, pext64(mask, mask));
     for (unsigned int i = 32; i < 64; i++) {
-        EXPECT_EQ(1U << (i - 32), packedExtract64(1ULL << i, mask));
+        EXPECT_EQ(1U << (i - 32), pext64(1ULL << i, mask));
     }
 
     // Try one in the middle
     mask = 0x0000ffffffff0000ULL;
-    EXPECT_EQ(0U, packedExtract64(0, mask));
-    EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask));
+    EXPECT_EQ(0U, pext64(0, mask));
+    EXPECT_EQ(0xffffffffU, pext64(mask, mask));
     for (unsigned int i = 16; i < 48; i++) {
-        EXPECT_EQ(1U << (i - 16), packedExtract64(1ULL << i, mask));
+        EXPECT_EQ(1U << (i - 16), pext64(1ULL << i, mask));
     }
 }
 
@@ -148,16 +148,16 @@ TEST(Shuffle, PackedExtract64_3) {
     }
 
     // Test both cases (all even bits, all odd bits)
-    EXPECT_EQ(0xffffffffU, packedExtract64(mask, mask));
-    EXPECT_EQ(0xffffffffU, packedExtract64(~mask, ~mask));
-    EXPECT_EQ(0U, packedExtract64(~mask, mask));
-    EXPECT_EQ(0U, packedExtract64(mask, ~mask));
+    EXPECT_EQ(0xffffffffU, pext64(mask, mask));
+    EXPECT_EQ(0xffffffffU, pext64(~mask, ~mask));
+    EXPECT_EQ(0U, pext64(~mask, mask));
+    EXPECT_EQ(0U, pext64(mask, ~mask));
 
     for (unsigned int i = 0; i < 64; i += 2) {
-        EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << i, mask));
-        EXPECT_EQ(0U, packedExtract64(1ULL << i, ~mask));
-        EXPECT_EQ(1U << (i/2), packedExtract64(1ULL << (i+1), ~mask));
-        EXPECT_EQ(0U, packedExtract64(1ULL << (i+1), mask));
+        EXPECT_EQ(1U << (i/2), pext64(1ULL << i, mask));
+        EXPECT_EQ(0U, pext64(1ULL << i, ~mask));
+        EXPECT_EQ(1U << (i/2), pext64(1ULL << (i+1), ~mask));
+        EXPECT_EQ(0U, pext64(1ULL << (i+1), mask));
     }
 }
 
diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index 31d4b925..7b34d92e 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -614,6 +614,12 @@ TEST(SimdUtilsTest, set16x8) {
     }
 }
 
+TEST(SimdUtilsTest, set4x32) { // expects one u32 copied into every slot
+    u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 };
+    m128 simd = set4x32(cmp[0]); // only the first element is passed in
+    ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); // raw byte comparison
+}
+
 #if defined(__AVX2__)
 TEST(SimdUtilsTest, set32x8) {
     char cmp[sizeof(m256)];
@@ -693,4 +699,50 @@ TEST(SimdUtilsTest, variableByteShift128) {
     EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16)));
 }
 
+TEST(SimdUtilsTest, max_u8_m128) { // expects the larger byte in each lane
+    char base1[] = "0123456789ABCDE\xfe"; // 16 characters plus the NUL
+    char base2[] = "!!23455889aBCd\xff\xff";
+    char expec[] = "0123456889aBCd\xff\xff";
+    m128 in1 = loadu128(base1);
+    m128 in2 = loadu128(base2);
+    m128 result = max_u8_m128(in1, in2); // lane 14: 0xff wins over 'E'
+    EXPECT_TRUE(!diff128(result, loadu128(expec))); // passes when diff128 is 0
+}
+
+TEST(SimdUtilsTest, min_u8_m128) { // expects the smaller byte in each lane
+    char base1[] = "0123456789ABCDE\xfe"; // 16 characters plus the NUL
+    char base2[] = "!!23455889aBCd\xff\xff";
+    char expec[] = "!!23455789ABCDE\xfe";
+    m128 in1 = loadu128(base1);
+    m128 in2 = loadu128(base2);
+    m128 result = min_u8_m128(in1, in2); // lane 14: 'E' wins over 0xff
+    EXPECT_TRUE(!diff128(result, loadu128(expec))); // passes when diff128 is 0
+}
+
+TEST(SimdUtilsTest, sadd_u8_m128) { // expects byte sums that clamp at 0xff
+    unsigned char base1[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4',
+                             '1', '2', '3', '4', '1', '2', '3', '4'};
+    unsigned char base2[] = {'a', 0x80, 'b', 'A', 0x10, 0x10, 0x10, 0x10,
+                             0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0};
+    unsigned char expec[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D',
+                             'a', 'b', 'c', 'd', '1', '2', '3', '4'};
+    m128 in1 = loadu128(base1); // lanes 1-2 overflow a byte when summed
+    m128 in2 = loadu128(base2); // lane 3: 'A' + 'A' = 0x82, kept as is
+    m128 result = sadd_u8_m128(in1, in2);
+    EXPECT_TRUE(!diff128(result, loadu128(expec))); // passes when diff128 is 0
+}
+
+TEST(SimdUtilsTest, sub_u8_m128) { // expects base1 - base2 in every byte
+    unsigned char base1[] = {'a', 0xff, 0xff, 0x82, 'A', 'B', 'C', 'D',
+                             'a', 'b', 'c', 'd', '1', '2', '3', '4'};
+    unsigned char base2[] = {0, 0x80, 0xff, 'A', '1', '2', '3', '4',
+                             '1', '2', '3', '4', '1', '2', '3', '4'};
+    unsigned char expec[] = {'a', 0x7f, 0, 'A', 0x10, 0x10, 0x10, 0x10,
+                             0x30, 0x30, 0x30, 0x30, 0, 0, 0, 0};
+    m128 in1 = loadu128(base1); // minuend
+    m128 in2 = loadu128(base2); // subtrahend; never exceeds in1 in any lane
+    m128 result = sub_u8_m128(in1, in2); // so borrow handling is not covered
+    EXPECT_TRUE(!diff128(result, loadu128(expec))); // passes when diff128 is 0
+}
+
 } // namespace