diff --git a/src/util/simd/arch/arm/impl.cpp b/src/util/simd/arch/arm/impl.cpp
index 75796a4b..fb2138d1 100644
--- a/src/util/simd/arch/arm/impl.cpp
+++ b/src/util/simd/arch/arm/impl.cpp
@@ -148,7 +148,7 @@ really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const b
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const b) const
 {
-    return {vandq_s8(u.v128[0], b.u.v128[0])};
+    return {vorrq_s8(u.v128[0], b.u.v128[0])}; // bitwise OR of both 128-bit registers; the removed line called the AND intrinsic
 }
 
 template <>
@@ -193,31 +193,31 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su
 
 #ifndef HS_OPTIMIZE
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const // whole-register shift: result byte i is source byte i+N, the top N bytes become zero
 {
-	return {vshlq_n_s32(u.v128[0], N)};
+    return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; // NOTE(review): vext needs an immediate lane index in 0..15, but N is a runtime argument and may be 16 - confirm
 }
 #else
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
 	switch(N) {
 	case 0: return *this; break;
-	case 1: return {vshlq_n_s32((int16x8_t) u.v128[0], 1)}; break;
-	case 2: return {vshlq_n_s32((int16x8_t) u.v128[0], 2)}; break;
-	case 3: return {vshlq_n_s32((int16x8_t) u.v128[0], 3)}; break;
-	case 4: return {vshlq_n_s32((int16x8_t) u.v128[0], 4)}; break;
-	case 5: return {vshlq_n_s32((int16x8_t) u.v128[0], 5)}; break;
-	case 6: return {vshlq_n_s32((int16x8_t) u.v128[0], 6)}; break;
-	case 7: return {vshlq_n_s32((int16x8_t) u.v128[0], 7)}; break;
-	case 8: return {vshlq_n_s32((int16x8_t) u.v128[0], 8)}; break;
-	case 9: return {vshlq_n_s32((int16x8_t) u.v128[0], 9)}; break;
-	case 10: return {vshlq_n_s32((int16x8_t) u.v128[0], 10)}; break;
-	case 11: return {vshlq_n_s32((int16x8_t) u.v128[0], 11)}; break;
-	case 12: return {vshlq_n_s32((int16x8_t) u.v128[0], 12)}; break;
-	case 13: return {vshlq_n_s32((int16x8_t) u.v128[0], 13)}; break;
-	case 14: return {vshlq_n_s32((int16x8_t) u.v128[0], 14)}; break;
-	case 15: return {vshlq_n_s32((int16x8_t) u.v128[0], 15)}; break;
+        case 1: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 1)}; break;
+        case 2: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 2)}; break;
+        case 3: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 3)}; break;
+        case 4: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 4)}; break;
+        case 5: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 5)}; break;
+        case 6: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 6)}; break;
+        case 7: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 7)}; break;
+        case 8: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 8)}; break;
+        case 9: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 9)}; break;
+        case 10: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 10)}; break;
+        case 11: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 11)}; break;
+        case 12: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 12)}; break;
+        case 13: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 13)}; break;
+        case 14: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 14)}; break;
+        case 15: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 15)}; break;
 	case 16: return Zeroes(); break;
 	default: break;
 	}
@@ -225,33 +225,34 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 }
 #endif
 
-#ifdef HS_OPTIMIZE
+#ifndef HS_OPTIMIZE
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const // whole-register shift: result byte i is source byte i-N, the low N bytes become zero
 {
-	return {vshrq_n_s32(u.v128[0], N)};
+    return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; // NOTE(review): 16 - N is 16 when N == 0, outside vext's 0..15 immediate range, and N is a runtime argument - confirm
 }
 #else
 template <>
-really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
 	switch(N) {
-	case 0: return {vshrq_n_s32(u.v128[0], 0)}; break;
-	case 1: return {vshrq_n_s32(u.v128[0], 1)}; break;
-	case 2: return {vshrq_n_s32(u.v128[0], 2)}; break;
-	case 3: return {vshrq_n_s32(u.v128[0], 3)}; break;
-	case 4: return {vshrq_n_s32(u.v128[0], 4)}; break;
-	case 5: return {vshrq_n_s32(u.v128[0], 5)}; break;
-	case 6: return {vshrq_n_s32(u.v128[0], 6)}; break;
-	case 7: return {vshrq_n_s32(u.v128[0], 7)}; break;
-	case 8: return {vshrq_n_s32(u.v128[0], 8)}; break;
-	case 9: return {vshrq_n_s32(u.v128[0], 9)}; break;
-	case 10: return {vshrq_n_s32(u.v128[0], 10)}; break;
-	case 11: return {vshrq_n_s32(u.v128[0], 11)}; break;
-	case 12: return {vshrq_n_s32(u.v128[0], 12)}; break;
-	case 13: return {vshrq_n_s32(u.v128[0], 13)}; break;
-	case 14: return {vshrq_n_s32(u.v128[0], 14)}; break;
-	case 15: return {vshrq_n_s32(u.v128[0], 15)}; break;
+	case 0: return *this; break;
+        case 1: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 15)}; break;
+        case 2: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 14)}; break;
+        case 3: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 13)}; break;
+        case 4: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 12)}; break;
+        case 5: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 11)}; break;
+        case 6: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 10)}; break;
+        case 7: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 9)}; break;
+        case 8: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 8)}; break;
+        case 9: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 7)}; break;
+        case 10: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 6)}; break;
+        case 11: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 5)}; break;
+        case 12: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 4)}; break;
+        case 13: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 3)}; break;
+        case 14: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 2)}; break;
+        case 15: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 1)}; break;
+	case 16: return Zeroes(); break;
 	default: break;
 	}
 	return *this;
@@ -286,30 +287,30 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> r, int8_t offset)
 {
-    return {vextq_s8((int16x8_t)u.v128[0], (int16x8_t)r.u.v128[0], offset)};
+    return {vextq_s8((int16x8_t)r.u.v128[0], (int16x8_t)u.v128[0], 16 - offset)}; // top `offset` bytes of r, then the low 16 - offset bytes of *this. NOTE(review): 16 - offset is 16 when offset == 0, outside vext's 0..15 range - confirm
 }
 #else
 template<>
-really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> l, int8_t offset)
+really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> r, int8_t offset)
 {
 	switch(offset) {
 	case 0: return *this; break;
-	case 1: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 1)}; break;
-	case 2: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 2)}; break;
-	case 3: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 3)}; break;
-	case 4: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 4)}; break;
-	case 5: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 5)}; break;
-	case 6: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 6)}; break;
-	case 7: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 7)}; break;
-	case 8: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 8)}; break;
-	case 9: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 9)}; break;
-	case 10: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 10)}; break;
-	case 11: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 11)}; break;
-	case 12: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 12)}; break;
-	case 13: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 13)}; break;
-	case 14: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 14)}; break;
-	case 15: return {vextq_s8((int16x8_t) u.v128[0], (int16x8_t) l.u.v128[0], 15)}; break;
-	case 16: return l; break;
+	case 1: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
+	case 2: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
+	case 3: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
+	case 4: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
+	case 5: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
+	case 6: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
+	case 7: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
+	case 8: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
+	case 9: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
+	case 10: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
+	case 11: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
+	case 12: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
+	case 13: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
+	case 14: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
+	case 15: return {vextq_s8((int16x8_t) r.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;
+	case 16: return r; break;
 	default: break;
 	}
 	return *this;
diff --git a/src/util/simd/types.hpp b/src/util/simd/types.hpp
index 4c948888..5bfd55ec 100644
--- a/src/util/simd/types.hpp
+++ b/src/util/simd/types.hpp
@@ -31,6 +31,7 @@
 #define SIMD_TYPES_HPP
 
 #include <cstdint>
+#include <cstdio>
 
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/simd/arch/x86/types.hpp"
@@ -213,5 +214,38 @@ public:
 #endif
 
 
+/* Debug aid: writes `label`, then each of the S bytes of v as two hex digits, then a newline, to stdout. */
+template <uint16_t S>
+static inline void printv_u8(const char *label, const SuperVector<S> &v) {
+    printf("%s: ", label);
+    for (std::size_t i = 0; i < S; i++) printf("%02x ", v.u.u8[i]);
+    printf("\n");
+}
+
+/* Debug aid: writes `label`, then each 16-bit lane of v as four hex digits, then a newline, to stdout. */
+template <uint16_t S>
+static inline void printv_u16(const char *label, const SuperVector<S> &v) {
+    printf("%s: ", label);
+    for (std::size_t i = 0; i < S / sizeof(v.u.u16[0]); i++) printf("%04x ", v.u.u16[i]); /* unsigned index: the bound is a size_t */
+    printf("\n");
+}
+
+/* Debug aid: writes `label`, then each 32-bit lane of v as eight hex digits, then a newline, to stdout. */
+template <uint16_t S>
+static inline void printv_u32(const char *label, const SuperVector<S> &v) {
+    printf("%s: ", label);
+    for (std::size_t i = 0; i < S / sizeof(v.u.u32[0]); i++) printf("%08x ", v.u.u32[i]); /* unsigned index: the bound is a size_t */
+    printf("\n");
+}
+
+/* Debug aid: writes `label`, then each 64-bit lane of v as sixteen hex digits, then a newline, to stdout. */
+template <uint16_t S>
+static inline void printv_u64(const char *label, const SuperVector<S> &v) {
+    printf("%s: ", label);
+    for (std::size_t i = 0; i < S / sizeof(v.u.u64[0]); i++) printf("%016llx ", static_cast<unsigned long long>(v.u.u64[i])); /* cast so the argument always matches %llx */
+    printf("\n");
+}
+
+
 #endif /* SIMD_TYPES_H */
 
diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp
index 12d9fae0..c6caae6e 100644
--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -38,232 +38,266 @@
 
 
 TEST(SuperVectorUtilsTest, Zero128c) {
-    m128_t zeroes = SuperVector<16>::Zeroes();
-    char buf[16]{0};
-    for(int i=0; i<16; i++){ASSERT_EQ(zeroes.u.s8[i],buf[i]);}
+    auto zeroes = SuperVector<16>::Zeroes(); // vector under test
+    u8 buf[16]{0}; // reference: sixteen zero bytes
+    for(int i=0; i<16; i++) {
+        ASSERT_EQ(zeroes.u.u8[i],buf[i]); // every byte lane must be 0
+    }
 }
 
-
 TEST(SuperVectorUtilsTest, Ones128c) {
-    m128_t ones = SuperVector<16>::Ones();
-    char buf[16];
-    for (int i=0; i<16; i++){buf[i]=0xff;}
-    for(int i=0; i<16; i++){ASSERT_EQ(ones.u.s8[i],buf[i]);}
+    auto ones = SuperVector<16>::Ones(); // vector under test
+    u8 buf[16];
+    for (int i=0; i<16; i++) { buf[i]=0xff; } // reference: sixteen 0xff bytes
+    for(int i=0; i<16; i++) {
+        ASSERT_EQ(ones.u.u8[i],buf[i]); // every byte lane must be 0xff
+    }
 }
 
-
 TEST(SuperVectorUtilsTest, Loadu128c) {
-    char vec[32];
-    for(int i=0; i<32;i++){vec[i]=i;}
-    for(int i=0; i<=16;i++){
-        m128_t SP = SuperVector<16>::loadu(vec+i);
-        for(int j=0; j<16; j++){
-            ASSERT_EQ(SP.u.s8[j],vec[j+i]);
+    u8 vec[32];
+    for(int i=0; i<32;i++) { vec[i]=i; } // byte value == index
+    for(int i=0; i<=16;i++) { // every start offset 0..16 inside the 32-byte buffer
+        auto SP = SuperVector<16>::loadu(vec+i);
+        for(int j=0; j<16; j++) {
+            ASSERT_EQ(SP.u.u8[j],vec[j+i]); // lane j must hold the byte at offset i+j
         }
     }
 }
 
 TEST(SuperVectorUtilsTest, Load128c) {
-    char vec[128] __attribute__((aligned(16)));
-    for(int i=0; i<128;i++){vec[i]=i;}
-    for(int i=0;i<=16;i+=16){
-        m128_t SP = SuperVector<16>::loadu(vec+i);
+    u8 ALIGN_ATTR(16) vec[32]; // 16-byte aligned, so offsets 0 and 16 are both aligned addresses
+    for(int i=0; i<32;i++) { vec[i]=i; } // byte value == index
+    for(int i=0;i<=16;i+=16) { // only the two aligned offsets, 0 and 16
+        auto SP = SuperVector<16>::loadu(vec+i); // NOTE(review): the test is named Load and aligns its buffer, yet it calls loadu() - confirm whether an aligned load was intended
         for(int j=0; j<16; j++){
-            ASSERT_EQ(SP.u.s8[j],vec[j+i]);
+            ASSERT_EQ(SP.u.u8[j],vec[j+i]); // lane j must hold the byte at offset i+j
         }
     }    
 }
 
 TEST(SuperVectorUtilsTest,Equal128c){
-    char vec[32];
+    u8 vec[32];
      for (int i=0; i<32; i++) {vec[i]=i;};
-    m128_t SP1 = SuperVector<16>::loadu(vec);
-    m128_t SP2 = SuperVector<16>::loadu(vec+16);
-    char buf[16]={0};
+    auto SP1 = SuperVector<16>::loadu(vec); // bytes 0..15
+    auto SP2 = SuperVector<16>::loadu(vec+16); // bytes 16..31
+    u8 buf[16]={0}; // NOTE(review): the loop below marks equal lanes with 1, but no lanes match for this data, so only the all-zero result is exercised - confirm the lane value eq() yields on a match
     /*check for equality byte by byte*/
     for (int s=0; s<16; s++){
         if(vec[s]==vec[s+16]){
             buf[s]=1;
         }
     }
-    m128_t SPResult = SP1.eq(SP2);
-    for (int i=0; i<16; i++){ASSERT_EQ(SPResult.u.s8[i],buf[i]);}
+    auto SPResult = SP1.eq(SP2);
+    for (int i=0; i<16; i++) {
+        ASSERT_EQ(SPResult.u.u8[i],buf[i]); // compare as unsigned bytes, matching the u8 reference buffer
+    }
 }
 
 TEST(SuperVectorUtilsTest,And128c){
-    m128_t SPResult = SuperVector<16>::Zeroes() & SuperVector<16>::Ones();
-    for (int i=0; i<16; i++){ASSERT_EQ(SPResult.u.s8[i],0);}
+    auto SPResult = SuperVector<16>::Zeroes() & SuperVector<16>::Ones(); // operator& applied to all-zero and all-one inputs
+    for (int i=0; i<16; i++) {
+        ASSERT_EQ(SPResult.u.u8[i],0); // 0 & 0xff == 0 in every byte
+    }
 }
 
 TEST(SuperVectorUtilsTest,OPAnd128c){
-    m128_t SP1 = SuperVector<16>::Zeroes(); 
-    m128_t SP2 = SuperVector<16>::Ones();
+    auto SP1 = SuperVector<16>::Zeroes(); // all-zero operand
+    auto SP2 = SuperVector<16>::Ones(); // all-one operand, overwritten with the result below
     SP2 = SP2.opand(SP1);
-    for (int i=0; i<16; i++){ASSERT_EQ(SP2.u.s8[i],0);}
+    for (int i=0; i<16; i++) {
+        ASSERT_EQ(SP2.u.u8[i],0); // the test expects every byte of the result to be 0
+    }
 }
 
-
 TEST(SuperVectorUtilsTest,OR128c){
-    m128_t SPResult = SuperVector<16>::Zeroes() | SuperVector<16>::Ones();
-    for (int i=0; i<16; i++){ASSERT_EQ(SPResult.u.s8[i],-1);}
+    auto SPResult = SuperVector<16>::Zeroes() | SuperVector<16>::Ones(); // operator| applied to all-zero and all-one inputs
+    for (int i=0; i<16; i++) {
+        ASSERT_EQ(SPResult.u.u8[i],0xff); // 0 | 0xff == 0xff in every byte
+    }
 }
 
 TEST(SuperVectorUtilsTest,OPANDNOT128c){
-    m128_t SP1 = SuperVector<16>::Zeroes(); 
-    m128_t SP2 = SuperVector<16>::Ones();
+    auto SP1 = SuperVector<16>::Zeroes(); // all-zero operand
+    auto SP2 = SuperVector<16>::Ones(); // all-one operand, overwritten with the result below
     SP2 = SP2.opandnot(SP1);
-    for (int i=0; i<16; i++){ASSERT_EQ(SP2.u.s8[i],0);}
+    for (int i=0; i<16; i++) {
+        ASSERT_EQ(SP2.u.u8[i],0); // read as unsigned bytes like the sibling tests; every byte is expected to be 0
+    }
 }
 
 TEST(SuperVectorUtilsTest,Movemask128c){
-    uint8_t vec[16] = {0,0xff,0xff,3,4,5,6,7,8,9,0xff,11,12,13,14,0xff};
+    u8 vec[16] = { 0, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0, 0, 0, 0xff };
     /*according to the array above the movemask outcome must be the following:
-      10000100000000110 or 0x8406*/
+      1000110000000110 or 0x8c06*/
-    m128_t SP = SuperVector<16>::loadu(vec);
-    int SP_Mask = SP.movemask();
-    ASSERT_EQ(SP_Mask,0x8406);
+    auto SP = SuperVector<16>::loadu(vec);
+    int mask = SP.movemask();
+    ASSERT_EQ(mask, 0x8c06);
 }
 
 TEST(SuperVectorUtilsTest,Eqmask128c){
-    uint8_t vec[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-    uint8_t vec2[16] = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
-    uint8_t vec3[16] = {16,17,3,4,5,6,7,8,1,2,11,12,13,14,15,16};
-    m128_t SP = SuperVector<16>::loadu(vec);
-    m128_t SP1 = SuperVector<16>::loadu(vec);
-    int SP_Mask = SP.eqmask(SP1);
-    /*if masks are equal the outcome is 1111111111111111 or 0xffff*/
-    ASSERT_EQ(SP_Mask,0xffff);
-    SP = SuperVector<16>::loadu(vec);
-    SP1 = SuperVector<16>::loadu(vec2);
-    SP_Mask = SP.eqmask(SP1);
-    ASSERT_EQ(SP_Mask,0);
-    SP = SuperVector<16>::loadu(vec2);
-    SP1 = SuperVector<16>::loadu(vec3);
-    SP_Mask = SP.eqmask(SP1);
-    ASSERT_EQ(SP_Mask,3);
+    u8 vec[16]  = {  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 };
+    u8 vec2[16] = { 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 };
+    u8 vec3[16] = { 16,17, 3, 4, 5, 6, 7, 8, 1, 2,11,12,13,14,15,16 };
+    auto SP = SuperVector<16>::loadu(vec);
+    auto SP1 = SuperVector<16>::loadu(vec2);
+    auto SP2 = SuperVector<16>::loadu(vec3);
+    int mask = SP.eqmask(SP); // a vector compared with itself
+    /*if vectors are equal the mask is 1111111111111111 or 0xffff*/
+    ASSERT_EQ(mask,0xffff);
+    mask = SP.eqmask(SP2); // vec and vec3 share no byte at the same index
+    ASSERT_EQ(mask,0);
+    mask = SP1.eqmask(SP2); // vec2 and vec3 agree only at indices 0 and 1
+    ASSERT_EQ(mask,3); // bits 0 and 1 set
 }
 
 /*Define LSHIFT128 macro*/
-#define TEST_LSHIFT128(l)   {   SP_after_Lshift = SP<<(l);                                              \
-                                buf[l-1]=0;                                                             \
-                                for(int i=0; i<16; i++){ASSERT_EQ(SP_after_Lshift.u.s8[i],buf[i]);}     \
-                            }           
+#define TEST_LSHIFT128(buf, vec, v, l) { /* asserts v << l equals vec moved up by l bytes; buf is scratch */ \
+                                           auto v_shifted = v << (l);                     \
+                                           for (int i=15; i>= l; --i) { /* bytes l..15 take vec[0..15-l] */ \
+                                               buf[i] = vec[i-l];                         \
+                                           }                                              \
+                                           for (int i=0; i<l; i++) { /* the low l bytes are zero-filled */ \
+                                               buf[i] = 0;                                \
+                                           }                                              \
+                                           for(int i=0; i<16; i++) {                      \
+                                               ASSERT_EQ(v_shifted.u.u8[i], buf[i]);      \
+                                           }                                              \
+                                       }
 
 TEST(SuperVectorUtilsTest,LShift128c){
-    char vec[16];
-    for (int i=0; i<16; i++) {vec[i]=0xff;}
-    m128_t SP = SuperVector<16>::loadu(vec);
-    char buf[16];
-    for (int i=0; i<16; i++){buf[i]=0xff;}
-    m128_t SP_after_Lshift = SP<<(0);
-    TEST_LSHIFT128(1)
-    TEST_LSHIFT128(2)
-    TEST_LSHIFT128(3)
-    TEST_LSHIFT128(4)
-    TEST_LSHIFT128(5)
-    TEST_LSHIFT128(6)
-    TEST_LSHIFT128(7)
-    TEST_LSHIFT128(8)
-    TEST_LSHIFT128(9)
-    TEST_LSHIFT128(10)
-    TEST_LSHIFT128(11)
-    TEST_LSHIFT128(12)
-    TEST_LSHIFT128(13)
-    TEST_LSHIFT128(14)
-    TEST_LSHIFT128(15)
-    TEST_LSHIFT128(16)
+    u8 vec[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; // distinct values, so a byte landing in the wrong lane is detected
+    auto SP = SuperVector<16>::loadu(vec);
+    u8 buf[16]; // scratch: the macro fills it with the expected bytes for each shift
+    TEST_LSHIFT128(buf, vec, SP, 0); // shift by 0: expected result equals vec
+    TEST_LSHIFT128(buf, vec, SP, 1);
+    TEST_LSHIFT128(buf, vec, SP, 2);
+    TEST_LSHIFT128(buf, vec, SP, 3);
+    TEST_LSHIFT128(buf, vec, SP, 4);
+    TEST_LSHIFT128(buf, vec, SP, 5);
+    TEST_LSHIFT128(buf, vec, SP, 6);
+    TEST_LSHIFT128(buf, vec, SP, 7);
+    TEST_LSHIFT128(buf, vec, SP, 8);
+    TEST_LSHIFT128(buf, vec, SP, 9);
+    TEST_LSHIFT128(buf, vec, SP, 10);
+    TEST_LSHIFT128(buf, vec, SP, 11);
+    TEST_LSHIFT128(buf, vec, SP, 12);
+    TEST_LSHIFT128(buf, vec, SP, 13);
+    TEST_LSHIFT128(buf, vec, SP, 14);
+    TEST_LSHIFT128(buf, vec, SP, 15);
+    TEST_LSHIFT128(buf, vec, SP, 16); // shift by the full width: expected result is all zero
 }
 
 TEST(SuperVectorUtilsTest,LShift64_128c){
-    u_int64_t vec[2] = {128, 512}; 
-    m128_t SP = SuperVector<16>::loadu(vec);
-    for(int s = 0; s<16; s++){
-        m128_t SP_after_shift = SP.lshift64(s);
-        for (int i=0; i<2; i++){ASSERT_EQ(SP_after_shift.u.u64[i],vec[i]<<s);}
+    u64a vec[2] = {128, 512}; // two 64-bit lanes
+    auto SP = SuperVector<16>::loadu(vec);
+    for(int s = 0; s<16; s++) { // only shift counts 0..15 are exercised
+        auto SP_after_shift = SP.lshift64(s);
+        for (int i=0; i<2; i++) {
+            ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] << s); // each lane must match the scalar shift of the same lane
+        }
     }   
 }
 
 TEST(SuperVectorUtilsTest,RShift64_128c){
-    u_int64_t vec[2] = {128, 512}; 
-    m128_t SP = SuperVector<16>::loadu(vec);
-    for(int s = 0; s<16; s++){
-        m128_t SP_after_shift = SP.rshift64(s);
-        for (int i=0; i<2; i++){ASSERT_EQ(SP_after_shift.u.u64[i],vec[i]>>s);}
+    u64a vec[2] = {128, 512}; // two 64-bit lanes
+    auto SP = SuperVector<16>::loadu(vec);
+    for(int s = 0; s<16; s++) { // only shift counts 0..15 are exercised
+        auto SP_after_shift = SP.rshift64(s);
+        for (int i=0; i<2; i++) {
+            ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] >> s); // each lane must match the scalar shift of the same lane
+        }
     }   
 }
 
-
 /*Define RSHIFT128 macro*/
-#define TEST_RSHIFT128(l)   {   SP_after_Rshift = SP>>(l);                                           \
-                                buf[16-l] = 0;                                                       \
-                                for(int i=0; i<16; i++) {ASSERT_EQ(SP_after_Rshift.u.u8[i],buf[i]);} \
-                            }   
+#define TEST_RSHIFT128(buf, vec, v, l) { /* asserts v >> l equals vec moved down by l bytes; buf is scratch */ \
+                                           auto v_shifted = v >> (l);                     \
+                                           for (int i=0; i<16-l; i++) { /* bytes 0..15-l take vec[l..15] */ \
+                                               buf[i] = vec[i+l];                         \
+                                           }                                              \
+                                           for (int i=16-l; i<16; i++) { /* the top l bytes are zero-filled */ \
+                                               buf[i] = 0;                                \
+                                           }                                              \
+                                           for(int i=0; i<16; i++) {                      \
+                                               ASSERT_EQ(v_shifted.u.u8[i], buf[i]);      \
+                                           }                                              \
+                                       }
 
 TEST(SuperVectorUtilsTest,RShift128c){
-    char vec[16];
-    for (int i=0; i<16; i++) {vec[i]=0xff;}
-    m128_t SP = SuperVector<16>::loadu(vec);
-    uint8_t buf[16];
-    for (int i=0; i<16; i++){buf[i]=0xff;}
-    m128_t SP_after_Rshift = SP>>(0);
-    TEST_RSHIFT128(1)
-    TEST_RSHIFT128(2)
-    TEST_RSHIFT128(3)
-    TEST_RSHIFT128(4)
-    TEST_RSHIFT128(5)
-    TEST_RSHIFT128(6)
-    TEST_RSHIFT128(7)
-    TEST_RSHIFT128(8)
-    TEST_RSHIFT128(9)
-    TEST_RSHIFT128(10)
-    TEST_RSHIFT128(11)
-    TEST_RSHIFT128(12)
-    TEST_RSHIFT128(13)
-    TEST_RSHIFT128(14)
-    TEST_RSHIFT128(15)
-    TEST_RSHIFT128(16)
+    u8 vec[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; // distinct values, so a byte landing in the wrong lane is detected
+    auto SP = SuperVector<16>::loadu(vec);
+    u8 buf[16]; // scratch: the macro fills it with the expected bytes for each shift
+    TEST_RSHIFT128(buf, vec, SP, 0); // shift by 0: expected result equals vec
+    TEST_RSHIFT128(buf, vec, SP, 1);
+    TEST_RSHIFT128(buf, vec, SP, 2);
+    TEST_RSHIFT128(buf, vec, SP, 3);
+    TEST_RSHIFT128(buf, vec, SP, 4);
+    TEST_RSHIFT128(buf, vec, SP, 5);
+    TEST_RSHIFT128(buf, vec, SP, 6);
+    TEST_RSHIFT128(buf, vec, SP, 7);
+    TEST_RSHIFT128(buf, vec, SP, 8);
+    TEST_RSHIFT128(buf, vec, SP, 9);
+    TEST_RSHIFT128(buf, vec, SP, 10);
+    TEST_RSHIFT128(buf, vec, SP, 11);
+    TEST_RSHIFT128(buf, vec, SP, 12);
+    TEST_RSHIFT128(buf, vec, SP, 13);
+    TEST_RSHIFT128(buf, vec, SP, 14);
+    TEST_RSHIFT128(buf, vec, SP, 15);
+    TEST_RSHIFT128(buf, vec, SP, 16); // shift by the full width: expected result is all zero
 }
 
-
-TEST(SuperVectorUtilsTest,pshufbc){
+TEST(SuperVectorUtilsTest,pshufbc) {
     srand (time(NULL));
-    uint8_t vec[16];
-    for (int i=0; i<16; i++){vec[i]=rand() % 100 + 1;;};
-    uint8_t vec2[16];
-    for (int i=0; i<16; i++){vec2[i]=i;};
-    m128_t SP1 = SuperVector<16>::loadu(vec);
-    m128_t SP2 = SuperVector<16>::loadu(vec2);
-    m128_t SResult = SP1.pshufb(SP2);
-    for (int i=0; i<16; i++){ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]);}
+    u8 vec[16];
+    for (int i=0; i<16; i++) {
+        vec[i] = rand() % 100 + 1; // payload in 1..100; seeded from time(), so it differs between runs
+    }
+    u8 vec2[16];
+    for (int i=0; i<16; i++) {
+        vec2[i]=i; // shuffle control: index i selects byte i
+    }
+    auto SP1 = SuperVector<16>::loadu(vec);
+    auto SP2 = SuperVector<16>::loadu(vec2);
+    auto SResult = SP1.pshufb(SP2);
+    for (int i=0; i<16; i++) {
+        ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]); // lane i must hold the payload byte named by control byte i
+    }
 }
 
-
 /*Define ALIGNR128 macro*/
-#define TEST_ALIGNR128(l)       {  SP_test = SP1.alignr(SP,l);                                             \
-                                   for (int i=0; i<16; i++){ASSERT_EQ(SP_test.u.u8[i],vec[i+l]);}          \
-                                }
+#define TEST_ALIGNR128(v1, v2, buf, l) { /* asserts v2.alignr(v1, l) equals buf[16-l .. 31-l] */ \
+                                           auto v_aligned = (v2).alignr((v1), (l));         \
+                                           printv_u8("v1", v1);                             \
+                                           printv_u8("v2", v2);                             \
+                                           printv_u8("v_aligned", v_aligned);               \
+                                           for (size_t i=0; i<16; i++) { /* read the buf argument, not a caller-scope name */ \
+                                               ASSERT_EQ(v_aligned.u.u8[i], (buf)[16 - (l) + i]); \
+                                           }                                                \
+                                       }
 
 TEST(SuperVectorUtilsTest,Alignr128c){
-    uint8_t vec[32];
-    for (int i=0; i<32; i++) {vec[i]=i;}
-    m128_t SP = SuperVector<16>::loadu(vec);
-    m128_t SP1 = SuperVector<16>::loadu(vec+16);
-    m128_t SP_test = SP1.alignr(SP,0);
-    TEST_ALIGNR128(1)
-    TEST_ALIGNR128(2)
-    TEST_ALIGNR128(3)
-    TEST_ALIGNR128(4)
-    TEST_ALIGNR128(5)
-    TEST_ALIGNR128(6)
-    TEST_ALIGNR128(7)
-    TEST_ALIGNR128(8)
-    TEST_ALIGNR128(9)
-    TEST_ALIGNR128(10)
-    TEST_ALIGNR128(11)
-    TEST_ALIGNR128(12)
-    TEST_ALIGNR128(13)
-    TEST_ALIGNR128(14)
-    TEST_ALIGNR128(15)
-    TEST_ALIGNR128(16)
-    
+    u8 vec[32];
+    for (int i=0; i<32; i++) {
+        vec[i]=i; // byte value == index
+    }
+    auto SP1 = SuperVector<16>::loadu(vec); // bytes 0..15
+    auto SP2 = SuperVector<16>::loadu(vec+16); // bytes 16..31
+    TEST_ALIGNR128(SP1, SP2, vec, 0); // offset 0: expects bytes 16..31, i.e. SP2 unchanged
+    TEST_ALIGNR128(SP1, SP2, vec, 1);
+    TEST_ALIGNR128(SP1, SP2, vec, 2);
+    TEST_ALIGNR128(SP1, SP2, vec, 3);
+    TEST_ALIGNR128(SP1, SP2, vec, 4);
+    TEST_ALIGNR128(SP1, SP2, vec, 5);
+    TEST_ALIGNR128(SP1, SP2, vec, 6);
+    TEST_ALIGNR128(SP1, SP2, vec, 7);
+    TEST_ALIGNR128(SP1, SP2, vec, 8);
+    TEST_ALIGNR128(SP1, SP2, vec, 9);
+    TEST_ALIGNR128(SP1, SP2, vec, 10);
+    TEST_ALIGNR128(SP1, SP2, vec, 11);
+    TEST_ALIGNR128(SP1, SP2, vec, 12);
+    TEST_ALIGNR128(SP1, SP2, vec, 13);
+    TEST_ALIGNR128(SP1, SP2, vec, 14);
+    TEST_ALIGNR128(SP1, SP2, vec, 15);
+    TEST_ALIGNR128(SP1, SP2, vec, 16); // offset 16: expects bytes 0..15, i.e. SP1
 }