diff --git a/src/nfa/limex_shuffle.hpp b/src/nfa/limex_shuffle.hpp
new file mode 100644
index 00000000..fe303311
--- /dev/null
+++ b/src/nfa/limex_shuffle.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2021, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Naive dynamic shuffles.
+ *
+ * These are written with the assumption that the provided masks are sparsely
+ * populated and never contain more than 32 on bits. Other implementations
+ * will be faster and actually correct if these assumptions don't hold true.
+ */
+
+#ifndef LIMEX_SHUFFLE_HPP
+#define LIMEX_SHUFFLE_HPP
+
+#include "ue2common.h"
+#include "util/arch.h"
+#include "util/bitutils.h"
+#include "util/unaligned.h"
+#include "util/supervector/supervector.hpp"
+
+template <uint16_t S>
+u32 packedExtract(SuperVector<S> s, const SuperVector<S> permute, const SuperVector<S> compare);
+
+
+template <>
+really_really_inline
+u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) {
+    SuperVector<16> shuffled = s.pshufb(permute);
+    SuperVector<16> compared = shuffled & compare;
+    u16 rv = ~compared.eqmask(shuffled);
+    return (u32)rv;
+}
+
+template <>
+really_really_inline
+u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) {
+    SuperVector<32> shuffled = s.pshufb(permute);
+    SuperVector<32> compared = shuffled & compare;
+    u32 rv = ~compared.eqmask(shuffled);
+    return (u32)((rv >> 16) | (rv & 0xffffU));
+}
+
+template <>
+really_really_inline
+u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) {
+    SuperVector<64> shuffled = s.pshufb(permute);
+    SuperVector<64> compared = shuffled & compare;
+    u64a rv = ~compared.eqmask(shuffled);
+    rv = rv >> 32 | rv;
+    return (u32)(((rv >> 16) | rv) & 0xffffU);
+}
+
+
+#endif // LIMEX_SHUFFLE_HPP
\ No newline at end of file
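Reviewer note (not part of the patch): all three specializations perform the same step. They pshufb the state bytes selected by `permute` into the low lanes, AND the result with `compare`, then use the byte-equality mask to flag lanes where masking cleared a bit. A rough scalar model of the 16-byte case, assuming SSSE3 pshufb index semantics and an eqmask() that yields one bit per equal byte lane:

    #include <cstdint>

    // Hypothetical scalar sketch of packedExtract<16>, for review only.
    // Bit `lane` of the result is set when the shuffled byte in that lane
    // has at least one bit outside compare[lane].
    uint32_t packedExtract16_model(const uint8_t s[16], const uint8_t permute[16],
                                   const uint8_t compare[16]) {
        uint16_t rv = 0;
        for (int lane = 0; lane < 16; lane++) {
            // pshufb: an index byte with its high bit set produces 0;
            // otherwise the low four bits select a source byte.
            uint8_t shuffled = (permute[lane] & 0x80) ? 0 : s[permute[lane] & 0x0f];
            uint8_t compared = shuffled & compare[lane];
            if (compared != shuffled) {   // ~eqmask(): masking changed the lane
                rv |= (uint16_t)(1U << lane);
            }
        }
        return rv;
    }

The 32- and 64-byte specializations compute the same per-lane mask and then fold the halves together with the trailing shift/OR; that fold relies on the sparse-mask assumption documented in the file header.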
diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp
index b2316bab..d74509d6 100644
--- a/unit/internal/shuffle.cpp
+++ b/unit/internal/shuffle.cpp
@@ -33,6 +33,9 @@
 #include "util/arch.h"
 #include "util/simd_utils.h"
 #include "nfa/limex_shuffle.h"
+#include "util/supervector/supervector.hpp"
+#include "nfa/limex_shuffle.hpp"
+
 
 namespace {
 
@@ -196,6 +199,26 @@ TEST(Shuffle, PackedExtract128_1) {
     }
 }
 
+TEST(Shuffle, PackedExtract_templatized_128_1) {
+    // Try all possible one-bit masks
+    for (unsigned int i = 0; i < 128; i++) {
+        // shuffle a single 1 bit to the front
+        SuperVector<16> permute = SuperVector<16>::Zeroes();
+        SuperVector<16> compare = SuperVector<16>::Zeroes();
+        build_pshufb_masks_onebit(i, &permute.u.v128[0], &compare.u.v128[0]);
+        EXPECT_EQ(1U, packedExtract<16>(setbit<m128>(i), permute, compare));
+        EXPECT_EQ(1U, packedExtract<16>(SuperVector<16>::Ones(), permute, compare));
+        // we should get zero out of these cases
+        EXPECT_EQ(0U, packedExtract<16>(SuperVector<16>::Zeroes(), permute, compare));
+        EXPECT_EQ(0U, packedExtract<16>(not128(setbit<m128>(i)), permute, compare));
+        // we should get zero out of all the other bit positions
+        for (unsigned int j = 0; (j != i && j < 128); j++) {
+            EXPECT_EQ(0U, packedExtract<16>(setbit<m128>(j), permute, compare));
+        }
+    }
+}
+
+
 #if defined(HAVE_AVX2)
 TEST(Shuffle, PackedExtract256_1) {
     // Try all possible one-bit masks
@@ -214,6 +237,27 @@ TEST(Shuffle, PackedExtract256_1) {
         }
     }
 }
+
+
+TEST(Shuffle, PackedExtract_templatized_256_1) {
+    // Try all possible one-bit masks
+    for (unsigned int i = 0; i < 256; i++) {
+        // shuffle a single 1 bit to the front
+        SuperVector<32> permute = SuperVector<32>::Zeroes();
+        SuperVector<32> compare = SuperVector<32>::Zeroes();
+        build_pshufb_masks_onebit(i, &permute.u.v256[0], &compare.u.v256[0]);
+        EXPECT_EQ(1U, packedExtract<32>(setbit<m256>(i), permute, compare));
+        EXPECT_EQ(1U, packedExtract<32>(SuperVector<32>::Ones(), permute, compare));
+        // we should get zero out of these cases
+        EXPECT_EQ(0U, packedExtract<32>(SuperVector<32>::Zeroes(), permute, compare));
+        EXPECT_EQ(0U, packedExtract<32>(not256(setbit<m256>(i)), permute, compare));
+        // we should get zero out of all the other bit positions
+        for (unsigned int j = 0; (j != i && j < 256); j++) {
+            EXPECT_EQ(0U, packedExtract<32>(setbit<m256>(j), permute, compare));
+        }
+    }
+}
+
 #endif
 
 #if defined(HAVE_AVX512)
@@ -234,5 +278,25 @@ TEST(Shuffle, PackedExtract512_1) {
         }
     }
 }
+
+TEST(Shuffle, PackedExtract_templatized_512_1) {
+    // Try all possible one-bit masks
+    for (unsigned int i = 0; i < 512; i++) {
+        // shuffle a single 1 bit to the front
+        SuperVector<64> permute = SuperVector<64>::Zeroes();
+        SuperVector<64> compare = SuperVector<64>::Zeroes();
+        build_pshufb_masks_onebit(i, &permute.u.v512[0], &compare.u.v512[0]);
+        EXPECT_EQ(1U, packedExtract<64>(setbit<m512>(i), permute, compare));
+        EXPECT_EQ(1U, packedExtract<64>(SuperVector<64>::Ones(), permute, compare));
+        // we should get zero out of these cases
+        EXPECT_EQ(0U, packedExtract<64>(SuperVector<64>::Zeroes(), permute, compare));
+        EXPECT_EQ(0U, packedExtract<64>(not512(setbit<m512>(i)), permute, compare));
+        // we should get zero out of all the other bit positions
+        for (unsigned int j = 0; (j != i && j < 512); j++) {
+            EXPECT_EQ(0U, packedExtract<64>(setbit<m512>(j), permute, compare));
+        }
+    }
+}
+
 #endif
 
 } // namespace
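The new tests reuse the existing build_pshufb_masks_onebit() helper from this file. For reviewers unfamiliar with it, the idea, sketched below from how the non-templatized tests use it, is to build a permute mask that pulls the byte holding the target bit into lane 0 while zeroing every other lane, and a compare mask whose lane 0 is the complement of the target bit; packedExtract then returns 1 exactly when the input had that bit set. This is a hypothetical sketch of the 128-bit contract only (the real helper may differ, and the wider versions must also respect vpshufb's per-128-bit-lane indexing); m128 and u8 are the repo's SIMD/integer types:

    #include <cstring>

    // Hedged sketch of the mask-building helper the tests rely on.
    static void build_pshufb_masks_onebit_sketch(unsigned int bit, m128 *permute,
                                                 m128 *compare) {
        u8 pmsk[16];
        u8 cmsk[16];
        memset(pmsk, 0x80, sizeof(pmsk)); // 0x80 makes pshufb zero the lane
        memset(cmsk, 0, sizeof(cmsk));
        pmsk[0] = bit / 8;                // pull the byte holding `bit` into lane 0
        cmsk[0] = ~(1U << (bit % 8));     // that byte, minus the target bit
        memcpy(permute, pmsk, sizeof(pmsk));
        memcpy(compare, cmsk, sizeof(cmsk));
    }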
diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp
index 261eeac0..e85d815e 100644
--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -290,6 +290,55 @@ TEST(SuperVectorUtilsTest,pshufb128c) {
     }
 }
 
+
+/*Define LSHIFT128_128 macro*/
+#define TEST_LSHIFT128_128(buf, vec, v, l) {                   \
+        auto v_shifted = SP.lshift128(l);                      \
+        for (int i=15; i>=l; --i) {                            \
+            buf[i] = vec[i-l];                                 \
+        }                                                      \
+        for (int i=0; i<l; i++) {                              \
+            buf[i] = 0;                                        \
+        }                                                      \
+        for(int i=0; i<16; i++) {                              \
+            ASSERT_EQ(v_shifted.u.u8[i], buf[i]);              \
+        }                                                      \
+    }
+
+TEST(SuperVectorUtilsTest,LShift128_128c){
+    u8 vec[16];
+    for (int i = 0; i<16; i++ ){ vec[i] = i+1; }
+    auto SP = SuperVector<16>::loadu(vec);
+    u8 buf[16];
+    for (int j = 0; j<16; j++) {
+        TEST_LSHIFT128_128(buf, vec, SP, j);
+    }
+}
+
+/*Define RSHIFT128_128 macro*/
+#define TEST_RSHIFT128_128(buf, vec, v, l) {                   \
+        auto v_shifted = SP.rshift128(l);                      \
+        for (int i=0; i<16-l; i++) {                           \
+            buf[i] = vec[i+l];                                 \
+        }                                                      \
+        for (int i=16-l; i<16; i++) {                          \
+            buf[i] = 0;                                        \
+        }                                                      \
+        for(int i=0; i<16; i++) {                              \
+            ASSERT_EQ(v_shifted.u.u8[i], buf[i]);              \
+        }                                                      \
+    }
+
+TEST(SuperVectorUtilsTest,RShift128_128c){
+    u8 vec[16];
+    for (int i = 0; i<16; i++ ){ vec[i] = i+1; }
+    auto SP = SuperVector<16>::loadu(vec);
+    u8 buf[16];
+    for (int j = 0; j<16; j++) {
+        TEST_RSHIFT128_128(buf, vec, SP, j);
+    }
+}
+
 /*Define ALIGNR128 macro*/
 #define TEST_ALIGNR128(v1, v2, buf, l) {                       \
     auto v_aligned = v2.alignr(v1, l);                         \
@@ -538,7 +587,7 @@ TEST(SuperVectorUtilsTest,LShift256c){
     }
 }
 
-/*
+
 TEST(SuperVectorUtilsTest,LShift64_256c){
     u64a vec[4] = {128, 512, 256, 1024};
     auto SP = SuperVector<32>::loadu(vec);
@@ -560,7 +609,7 @@ TEST(SuperVectorUtilsTest,RShift64_256c){
         }
     }
 }
-*/
+
 
 /*Define RSHIFT256 macro*/
 #define TEST_RSHIFT256(buf, vec, v, l) {                       \
@@ -587,6 +636,62 @@ TEST(SuperVectorUtilsTest,RShift256c){
     }
 }
 
+
+
+
+/*Define LSHIFT128_256 macro*/
+#define TEST_LSHIFT128_256(buf, vec, v, l) {                   \
+        auto v_shifted = SP.lshift128(l);                      \
+        for (int i=15; i>=l; --i) {                            \
+            buf[i] = vec[i-l];                                 \
+            buf[i+16] = vec[(16+i)-l];                         \
+        }                                                      \
+        for (int i=0; i<l; i++) {                              \
+            buf[i] = 0;                                        \
+            buf[i+16] = 0;                                     \
+        }                                                      \
+        for(int i=0; i<32; i++) {                              \
+            ASSERT_EQ(v_shifted.u.u8[i], buf[i]);              \
+        }                                                      \
+    }
+
+TEST(SuperVectorUtilsTest,LShift128_256c){
+    u8 vec[32];
+    for (int i = 0; i<32; i++ ){ vec[i] = i+1; }
+    auto SP = SuperVector<32>::loadu(vec);
+    u8 buf[32];
+    for (int j=0; j<16; j++) {
+        TEST_LSHIFT128_256(buf, vec, SP, j);
+    }
+}
+
+/*Define RSHIFT128_256 macro*/
+#define TEST_RSHIFT128_256(buf, vec, v, l) {                   \
+        auto v_shifted = SP.rshift128(l);                      \
+        for (int i=0; i<16-l; i++) {                           \
+            buf[i] = vec[i+l];                                 \
+            buf[i+16] = vec[(i+16)+l];                         \
+        }                                                      \
+        for (int i=16-l; i<16; i++) {                          \
+            buf[i] = 0;                                        \
+            buf[i+16] = 0;                                     \
+        }                                                      \
+        for(int i=0; i<32; i++) {                              \
+            ASSERT_EQ(v_shifted.u.u8[i], buf[i]);              \
+        }                                                      \
+    }
+
+TEST(SuperVectorUtilsTest,RShift128_256c){
+    u8 vec[32];
+    for (int i = 0; i<32; i++ ){ vec[i] = i+1; }
+    auto SP = SuperVector<32>::loadu(vec);
+    u8 buf[32];
+    for(int j=0; j<16; j++) {
+        TEST_RSHIFT128_256(buf, vec, SP, j);
+    }
+}
+
+
 /*Define ALIGNR256 macro*/
 /*
 #define TEST_ALIGNR256(v1, v2, buf, l) {                       \
@@ -772,13 +877,13 @@ TEST(SuperVectorUtilsTest,OPANDNOT512c){
     }
 }
 
-/*
+
 TEST(SuperVectorUtilsTest,Movemask512c){
     srand (time(NULL));
     u8 vec[64] = {0};
     u64a r = rand() % 100 + 1;
     for(int i=0; i<64; i++) {
-        if (r & (1 << i)) {
+        if (r & (1ULL << i)) {
             vec[i] = 0xff;
         }
     }
@@ -786,16 +891,16 @@ TEST(SuperVectorUtilsTest,Movemask512c){
     u8 vec2[64] = {0};
     u64a mask = SP.movemask();
     for(int i=0; i<64; i++) {
-        if (mask & (1 << i)) {
+        if (mask & (1ULL << i)) {
             vec2[i] = 0xff;
         }
     }
     for (int i=0; i<64; i++){
-        printf("%d) vec =%i , vec2 = %i \n",i,vec[i],vec2[i]);
-        //ASSERT_EQ(vec[i],vec2[i]);
+        //printf("%d) vec =%i , vec2 = %i \n",i,vec[i],vec2[i]);
+        ASSERT_EQ(vec[i],vec2[i]);
     }
 }
-*/
+
 
 TEST(SuperVectorUtilsTest,Eqmask512c){
     srand (time(NULL));
@@ -858,7 +963,7 @@ TEST(SuperVectorUtilsTest,LShift512c){
     }
 }
 
-/*
+
 TEST(SuperVectorUtilsTest,LShift64_512c){
     u64a vec[8] = {32, 64, 128, 256, 512, 512, 256, 1024};
    auto SP = SuperVector<64>::loadu(vec);
@@ -880,7 +985,7 @@ TEST(SuperVectorUtilsTest,RShift64_512c){
         }
     }
 }
-*/
+
 
 /*Define RSHIFT512 macro*/
 #define TEST_RSHIFT512(buf, vec, v, l) {                       \
@@ -906,6 +1011,67 @@ TEST(SuperVectorUtilsTest,RShift512c){
     }
 }
 
+
+/*Define RSHIFT128_512 macro*/
+#define TEST_RSHIFT128_512(buf, vec, v, l) {                   \
+        auto v_shifted = SP.rshift128(l);                      \
+        for (int i=0; i<16-l; i++) {                           \
+            buf[i] = vec[i+l];                                 \
+            buf[i+16] = vec[(i+16)+l];                         \
+            buf[i+32] = vec[(i+32)+l];                         \
+            buf[i+48] = vec[(i+48)+l];                         \
+        }                                                      \
+        for (int i=16-l; i<16; i++) {                          \
+            buf[i] = 0;                                        \
+            buf[i+16] = 0;                                     \
+            buf[i+32] = 0;                                     \
+            buf[i+48] = 0;                                     \
+        }                                                      \
+        for(int i=0; i<64; i++) {                              \
+            ASSERT_EQ(v_shifted.u.u8[i], buf[i]);              \
+        }                                                      \
+    }
+
+TEST(SuperVectorUtilsTest,RShift128_512c){
+    u8 vec[64];
+    for (int i = 0; i<64; i++ ){ vec[i] = i+1; }
+    auto SP = SuperVector<64>::loadu(vec);
+    u8 buf[64] = {1};
+    for(int j=0; j<16; j++){
+        TEST_RSHIFT128_512(buf, vec, SP, j);
+    }
+}
+
+/*Define LSHIFT128_512 macro*/
+#define TEST_LSHIFT128_512(buf, vec, v, l) {                   \
+        auto v_shifted = SP.lshift128(l);                      \
+        for (int i=15; i>=l; --i) {                            \
+            buf[i] = vec[i-l];                                 \
+            buf[i+16] = vec[(i+16)-l];                         \
+            buf[i+32] = vec[(i+32)-l];                         \
+            buf[i+48] = vec[(i+48)-l];                         \
+        }                                                      \
+        for (int i=0; i<l; i++) {                              \
+            buf[i] = 0;                                        \
+            buf[i+16] = 0;                                     \
+            buf[i+32] = 0;                                     \
+            buf[i+48] = 0;                                     \
+        }                                                      \
+        for(int i=0; i<64; i++) {                              \
+            ASSERT_EQ(v_shifted.u.u8[i], buf[i]);              \
+        }                                                      \
+    }
+
+TEST(SuperVectorUtilsTest,LShift128_512c){
+    u8 vec[64];
+    for (int i = 0; i<64; i++ ){ vec[i] = i+1; }
+    auto SP = SuperVector<64>::loadu(vec);
+    u8 buf[64] = {1};
+    for(int j=0; j<16; j++){
+        TEST_LSHIFT128_512(buf, vec, SP, j);
+    }
+}
+
+
 /*Define ALIGNR512 macro*/
 /*
 #define TEST_ALIGNR512(v1, v2, buf, l) {                       \
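Two notes on the supervector.cpp changes. First, lshift128()/rshift128() shift each 128-bit lane independently, which is why the expected-value macros above rebuild the reference buffer in 16-byte strides (buf[i], buf[i+16], buf[i+32], buf[i+48]). Second, the re-enabled Movemask512c test needed the `1ULL` fix to be meaningful: `1 << i` shifts a 32-bit int, which is undefined behaviour for i >= 32, so the upper half of the 64-bit mask was never actually exercised. A minimal illustration of that fix:

    #include <cstdint>

    // `1 << i` promotes to a 32-bit int: for i >= 32 the shift is undefined
    // behaviour and bits 32..63 of a 64-bit mask can never be produced.
    uint64_t nth_bit(unsigned int i) {
        // return 1 << i;   // UB once i >= 32
        return 1ULL << i;   // well-defined for i in [0, 63]
    }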