Merge pull request #119 from VectorCamp/feature/vsx-optimizations

VSX optimizations
2025-06-28 16:41:01 +03:00 · 2022-09-08 13:41:13 +03:00 · 2022-09-08 13:41:13 +03:00 · 0c97e5f2c2
commit 0c97e5f2c2
parent c043730675 e3c237a7e0
9 changed files with 344 additions and 307 deletions
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@ -53,24 +53,6 @@
 #include <string.h> // for memcpy
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
 /** \brief LUT for the mask1bit functions. */
 ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
    ZEROES_32, ZEROES_32,
    ZEROES_31, 0x01, ZEROES_32,
    ZEROES_31, 0x02, ZEROES_32,
    ZEROES_31, 0x04, ZEROES_32,
    ZEROES_31, 0x08, ZEROES_32,
    ZEROES_31, 0x10, ZEROES_32,
    ZEROES_31, 0x20, ZEROES_32,
    ZEROES_31, 0x40, ZEROES_32,
    ZEROES_31, 0x80, ZEROES_32,
    ZEROES_32, ZEROES_32,
 };
 /** \brief Build a 128-bit vector with every bit set. */
 static really_inline m128 ones128(void) {
    /* Splat 0xFF into each of the sixteen byte lanes, then view as m128. */
    int8x16_t all_set = vdupq_n_s8(0xFF);
    return (m128) all_set;
 }
@ -595,9 +577,9 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
 static really_inline
 m128 mask1bit128(unsigned int n) {
    assert(n < sizeof(m128) * 8);
-    u32 mask_idx = ((n % 8) * 64) + 95;
+    static m128 onebit = { 1, 0 };
-    mask_idx -= n / 8;
+    m128 mask = lshiftbyte_m128( onebit, n / 8 );
-    return loadu128(&simd_onebit_masks[mask_idx]);
+    return lshift64_m128( mask, n % 8 );
 }
 // switches on bit N in the given vector.
--- a/src/util/arch/common/simd_utils.h
+++ b/src/util/arch/common/simd_utils.h
@ -88,6 +88,26 @@ static inline void print_m128_2x64(const char *label, m128 vec) {
 #define print_m128_2x64(label, vec) ;
 #endif
 #if !defined(ARCH_IA32) && !defined(ARCH_X86_64)
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
 /**
  * \brief LUT for the mask1bit functions.
  *
  * Layout: ten rows of 64 bytes each (ZEROES_31 expands to 31 zero bytes,
  * ZEROES_32 to 32). The first and last rows are all zero. Each of the eight
  * rows in between holds exactly one non-zero byte, at offset 31 within the
  * row: 0x01, 0x02, ... 0x80, i.e. (1 << k) for k = 0..7.
  *
  * The single-bit byte of row k+1 therefore sits at absolute index
  * (k * 64) + 95. A vector load that starts a few bytes before that index
  * yields a vector with one bit set, surrounded by zeroes, because the
  * neighbouring table bytes on both sides are zero.
  */
 ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
    ZEROES_32, ZEROES_32,
    ZEROES_31, 0x01, ZEROES_32,
    ZEROES_31, 0x02, ZEROES_32,
    ZEROES_31, 0x04, ZEROES_32,
    ZEROES_31, 0x08, ZEROES_32,
    ZEROES_31, 0x10, ZEROES_32,
    ZEROES_31, 0x20, ZEROES_32,
    ZEROES_31, 0x40, ZEROES_32,
    ZEROES_31, 0x80, ZEROES_32,
    ZEROES_32, ZEROES_32,
 };
 #endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64)
 /****
 **** 256-bit Primitives
 ****/
--- a/src/util/arch/ppc64el/simd_utils.h
+++ b/src/util/arch/ppc64el/simd_utils.h
@ -54,34 +54,6 @@ typedef __vector  signed char             int8x16_t;
 typedef unsigned long long int ulong64_t;
 typedef   signed long long int  long64_t;
 /*
 typedef __vector  uint64_t uint64x2_t;
 typedef __vector   int64_t  int64x2_t;
 typedef __vector  uint32_t uint32x4_t;
 typedef __vector   int32_t  int32x4_t;
 typedef __vector  uint16_t uint16x8_t;
 typedef __vector   int16_t  int16x8_t;
 typedef __vector   uint8_t uint8x16_t;
 typedef __vector    int8_t  int8x16_t;*/
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
 /** \brief LUT for the mask1bit functions. */
 ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
    ZEROES_32, ZEROES_32,
    ZEROES_31, 0x01, ZEROES_32,
    ZEROES_31, 0x02, ZEROES_32,
    ZEROES_31, 0x04, ZEROES_32,
    ZEROES_31, 0x08, ZEROES_32,
    ZEROES_31, 0x10, ZEROES_32,
    ZEROES_31, 0x20, ZEROES_32,
    ZEROES_31, 0x40, ZEROES_32,
    ZEROES_31, 0x80, ZEROES_32,
    ZEROES_32, ZEROES_32,
 };
 static really_inline m128 ones128(void) {
    return (m128) vec_splat_u8(-1);
@ -115,10 +87,6 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
    m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b);
    mask = vec_and(not128(mask), movemask);
    m128 sum = vec_sums(mask, zeroes128()); 
    //sum = vec_sld(zeroes128(), sum, 4); 
    //s32 ALIGN_ATTR(16) x;
    //vec_ste(sum, 0, &x);   
    //return x;   // it could be ~(movemask_128(mask)) & 0x;
    return sum[3];
 }
@ -131,10 +99,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
    uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
    mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask);
    m128 sum = vec_sums((m128)mask, zeroes128());
    //sum = vec_sld(zeroes128(), sum, 4);
    //s32 ALIGN_ATTR(16) x;
    //vec_ste(sum, 0, &x);
    //return x;
    return sum[3];
 }
@ -150,46 +114,18 @@ m128 sub_2x64(m128 a, m128 b) {
 static really_really_inline
 m128 lshift_m128(m128 a, unsigned b) {
-    switch(b){
+    if (b == 0) return a;
-    case 1: return vec_sld(a, zeroes128(), 1); break;	    
+    m128 sl = (m128) vec_splats((uint8_t) b << 3);
-    case 2: return vec_sld(a, zeroes128(), 2); break;	    
+    m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl);
-    case 3: return vec_sld(a, zeroes128(), 3); break;	    
+    return result;
    case 4: return vec_sld(a, zeroes128(), 4); break;	    
    case 5: return vec_sld(a, zeroes128(), 5); break;	    
    case 6: return vec_sld(a, zeroes128(), 6); break;	    
    case 7: return vec_sld(a, zeroes128(), 7); break;	    
    case 8: return vec_sld(a, zeroes128(), 8); break;	    
    case 9: return vec_sld(a, zeroes128(), 9); break;	    
    case 10: return vec_sld(a, zeroes128(), 10); break;	    
    case 11: return vec_sld(a, zeroes128(), 11); break;	    
    case 12: return vec_sld(a, zeroes128(), 12); break;	    
    case 13: return vec_sld(a, zeroes128(), 13); break;	    
    case 14: return vec_sld(a, zeroes128(), 14); break;	   
    case 15: return vec_sld(a, zeroes128(), 15); break;
    }	
    return a;
 }
 static really_really_inline
 m128 rshift_m128(m128 a, unsigned b) {
-   switch(b){ 
+    if (b == 0) return a;
-    case 1: return vec_sld(zeroes128(), a, 15); break;	    
+    m128 sl = (m128) vec_splats((uint8_t) b << 3);
-    case 2: return vec_sld(zeroes128(), a, 14); break;	    
+    m128 result = (m128) vec_sro((uint8x16_t) a, (uint8x16_t) sl);
-    case 3: return vec_sld(zeroes128(), a, 13); break;	    
+    return result;
    case 4: return vec_sld(zeroes128(), a, 12); break;	    
    case 5: return vec_sld(zeroes128(), a, 11); break;	    
    case 6: return vec_sld(zeroes128(), a, 10); break;	    
    case 7: return vec_sld(zeroes128(), a, 9); break;	    
    case 8: return vec_sld(zeroes128(), a, 8); break;	    
    case 9: return vec_sld(zeroes128(), a, 7); break;	    
    case 10: return vec_sld(zeroes128(), a, 6); break;	    
    case 11: return vec_sld(zeroes128(), a, 5); break;	    
    case 12: return vec_sld(zeroes128(), a, 4); break;	    
    case 13: return vec_sld(zeroes128(), a, 3); break;	    
    case 14: return vec_sld(zeroes128(), a, 2); break;	    
    case 15: return vec_sld(zeroes128(), a, 1); break;	    
   }
   return a;
 }
 static really_really_inline
@ -212,27 +148,13 @@ static really_inline m128 eq64_m128(m128 a, m128 b) {
   return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
 }
 static really_inline u32 movemask128(m128 a) {
-   uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7));
+   static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-   
+   uint8x16_t bitmask = vec_gb((uint8x16_t) a);
-   uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
+   bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
-   uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
+   u32 movemask;
-   uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
+   vec_ste((uint32x4_t) bitmask, 0, &movemask);
-  
+   return movemask;
   uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14));
   uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
   uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
   uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
   uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
   uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
   uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9);
   uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
   uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
   return s5[0];
 }
 static really_inline m128 set1_16x8(u8 c) {
@ -363,7 +285,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
    return a;
 }
 #define CASE_ALIGN_VECTORS(a, b, offset)  case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break;
 static really_really_inline
@ -392,42 +313,50 @@ m128 palignr_imm(m128 r, m128 l, int offset) {
 static really_really_inline
 m128 palignr(m128 r, m128 l, int offset) {
-#if defined(HS_OPTIMIZE)
+    if (offset == 0) return l;
-    // need a faster way to do this.
+    if (offset == 16) return r;
-    return palignr_imm(r, l, offset);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
-#else
+    if (__builtin_constant_p(offset)) {
-    return palignr_imm(r, l, offset);
+        return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset);
    }
 #endif
    m128 sl = (m128) vec_splats((uint8_t) (offset << 3));
    m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3));
    m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr);
    m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl);
    return or128(lhs, rhs);
 }
 #undef CASE_ALIGN_VECTORS
 static really_really_inline
 m128 rshiftbyte_m128(m128 a, unsigned b) {
-   return rshift_m128(a,b);
+    return palignr_imm(zeroes128(), a, b);
 }
 static really_really_inline
 m128 lshiftbyte_m128(m128 a, unsigned b) {
-   return lshift_m128(a,b);
+    return palignr_imm(a, zeroes128(), 16 - b);
 }
 static really_inline
 m128 variable_byte_shift_m128(m128 in, s32 amount) {
    assert(amount >= -16 && amount <= 16);
-    if (amount < 0){
+    if (amount < 0) {
-	    return palignr_imm(zeroes128(), in, -amount);
+        return rshiftbyte_m128(in, -amount);
-    } else{
+    } else {
-	    return palignr_imm(in, zeroes128(), 16 - amount);
+        return lshiftbyte_m128(in, amount);
    }
 }
 static really_inline
 m128 mask1bit128(unsigned int n) {
    assert(n < sizeof(m128) * 8);
-    u32 mask_idx = ((n % 8) * 64) + 95;
+    static uint64x2_t onebit = { 1, 0 };
-    mask_idx -= n / 8;
+    m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3));
-    return loadu128(&simd_onebit_masks[mask_idx]);
+    m128 bits = (m128) vec_splats((uint8_t) ((n % 8)));
    m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets);
    return (m128) vec_sll((uint8x16_t) mask, (uint8x16_t) bits);
 }
 // switches on bit N in the given vector.
--- a/src/util/arch/x86/simd_utils.h
+++ b/src/util/arch/x86/simd_utils.h
@ -165,8 +165,67 @@ m128 load_m128_from_u64a(const u64a *p) {
    return _mm_set_epi64x(0LL, *p);
 }
-#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
+#define CASE_RSHIFT_VECTOR(a, count)  case count: return _mm_srli_si128((m128)(a), (count)); break;
-#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
+
 /**
  * \brief Shift a 128-bit vector right by a whole number of bytes.
  *
  * \param a           Vector to shift.
  * \param count_immed Number of bytes to shift by.
  * \return \p a unchanged for a count of 0, the shifted vector for counts
  *         1..15, and zeroes128() for any other count that reaches the switch.
  *
  * When the compiler can prove \p count_immed is a compile-time constant the
  * intrinsic is called directly. Otherwise the count is dispatched through a
  * switch so that every _mm_srli_si128 call still receives a literal count.
  */
 static really_inline
 m128 rshiftbyte_m128(const m128 a, int count_immed) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
    // Fast path: constant count, hand it straight to the intrinsic.
    if (__builtin_constant_p(count_immed)) {
        return _mm_srli_si128(a, count_immed);
    }
 #endif
    // Runtime count: one case per byte count, each expanding to a return of
    // _mm_srli_si128 with a literal count (see CASE_RSHIFT_VECTOR).
    switch (count_immed) {
    case 0: return a; break;
    CASE_RSHIFT_VECTOR(a, 1);
    CASE_RSHIFT_VECTOR(a, 2);
    CASE_RSHIFT_VECTOR(a, 3);
    CASE_RSHIFT_VECTOR(a, 4);
    CASE_RSHIFT_VECTOR(a, 5);
    CASE_RSHIFT_VECTOR(a, 6);
    CASE_RSHIFT_VECTOR(a, 7);
    CASE_RSHIFT_VECTOR(a, 8);
    CASE_RSHIFT_VECTOR(a, 9);
    CASE_RSHIFT_VECTOR(a, 10);
    CASE_RSHIFT_VECTOR(a, 11);
    CASE_RSHIFT_VECTOR(a, 12);
    CASE_RSHIFT_VECTOR(a, 13);
    CASE_RSHIFT_VECTOR(a, 14);
    CASE_RSHIFT_VECTOR(a, 15);
    // Negative counts and counts of 16 or more produce an all-zero vector.
    default: return zeroes128(); break;
    }
 }
 #undef CASE_RSHIFT_VECTOR
 #define CASE_LSHIFT_VECTOR(a, count)  case count: return _mm_slli_si128((m128)(a), (count)); break;
 /**
  * \brief Shift a 128-bit vector left by a whole number of bytes.
  *
  * \param a           Vector to shift.
  * \param count_immed Number of bytes to shift by.
  * \return \p a unchanged for a count of 0, the shifted vector for counts
  *         1..15, and zeroes128() for any other count that reaches the switch.
  *
  * When the compiler can prove \p count_immed is a compile-time constant the
  * intrinsic is called directly. Otherwise the count is dispatched through a
  * switch so that every _mm_slli_si128 call still receives a literal count.
  */
 static really_inline
 m128 lshiftbyte_m128(const m128 a, int count_immed) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
    // Fast path: constant count, hand it straight to the intrinsic.
    if (__builtin_constant_p(count_immed)) {
        return _mm_slli_si128(a, count_immed);
    }
 #endif
    // Runtime count: one case per byte count, each expanding to a return of
    // _mm_slli_si128 with a literal count (see CASE_LSHIFT_VECTOR).
    switch (count_immed) {
    case 0: return a; break;
    CASE_LSHIFT_VECTOR(a, 1);
    CASE_LSHIFT_VECTOR(a, 2);
    CASE_LSHIFT_VECTOR(a, 3);
    CASE_LSHIFT_VECTOR(a, 4);
    CASE_LSHIFT_VECTOR(a, 5);
    CASE_LSHIFT_VECTOR(a, 6);
    CASE_LSHIFT_VECTOR(a, 7);
    CASE_LSHIFT_VECTOR(a, 8);
    CASE_LSHIFT_VECTOR(a, 9);
    CASE_LSHIFT_VECTOR(a, 10);
    CASE_LSHIFT_VECTOR(a, 11);
    CASE_LSHIFT_VECTOR(a, 12);
    CASE_LSHIFT_VECTOR(a, 13);
    CASE_LSHIFT_VECTOR(a, 14);
    CASE_LSHIFT_VECTOR(a, 15);
    // Negative counts and counts of 16 or more produce an all-zero vector.
    default: return zeroes128(); break;
    }
 }
 #undef CASE_LSHIFT_VECTOR
 #if defined(HAVE_SSE41)
 #define extract32from128(a, imm) _mm_extract_epi32(a, imm)
@ -255,14 +314,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
    memcpy(&a, ptr, n);
    return a;
 }
 /*
 #ifdef __cplusplus
 extern "C" {
 #endif
 extern const u8 simd_onebit_masks[];
 #ifdef __cplusplus
 }
 #endif*/
 static really_inline
 m128 mask1bit128(unsigned int n) {
@ -330,6 +381,7 @@ m128 palignr_sw(m128 r, m128 l, int offset) {
 	    break;
    }
 }
 #undef CASE_ALIGN_VECTORS
 static really_really_inline
 m128 palignr(m128 r, m128 l, int offset) {
@ -340,7 +392,6 @@ m128 palignr(m128 r, m128 l, int offset) {
 #endif
    return palignr_sw(r, l, offset);
 }
 #undef CASE_ALIGN_VECTORS
 static really_inline
 m128 variable_byte_shift_m128(m128 in, s32 amount) {
--- a/src/util/bitfield.h
+++ b/src/util/bitfield.h
@ -189,10 +189,7 @@ public:
        size_t sum = 0;
        size_t i = 0;
        for (; i + 4 <= num_blocks; i += 4) {
-            sum += popcount64(bits[i]);
+            sum += popcount64x4(&bits[i]);
            sum += popcount64(bits[i + 1]);
            sum += popcount64(bits[i + 2]);
            sum += popcount64(bits[i + 3]);
        }
        for (; i < num_blocks; i++) {
            sum += popcount64(bits[i]);
--- a/src/util/popcount.h
+++ b/src/util/popcount.h
@ -52,6 +52,15 @@ u32 popcount32(u32 x) {
 // #endif
 }
 /**
  * \brief Population count over four consecutive 32-bit words.
  *
  * \param x Pointer to the first of four words; x[0]..x[3] are read.
  * \return Sum of popcount32() over the four words.
  */
 static really_inline
 u32 popcount32x4(u32 const *x) {
    u32 total = 0;
    for (unsigned int k = 0; k < 4; k++) {
        total += popcount32(x[k]);
    }
    return total;
 }
 static really_inline
 u32 popcount64(u64a x) {
    return __builtin_popcountll(x);
@ -73,5 +82,14 @@ u32 popcount64(u64a x) {
 // #endif
 }
 /**
  * \brief Population count over four consecutive 64-bit words.
  *
  * \param x Pointer to the first of four words; x[0]..x[3] are read.
  * \return Sum of popcount64() over the four words.
  */
 static really_inline
 u32 popcount64x4(u64a const *x) {
    // Plain accumulator, matching popcount32x4. A volatile local would force
    // each partial sum through memory, and compound assignment to a volatile
    // operand is deprecated when this header is compiled as C++20.
    u32 sum = popcount64(x[0]);
    sum += popcount64(x[1]);
    sum += popcount64(x[2]);
    sum += popcount64(x[3]);
    return sum;
 }
 #endif /* UTIL_POPCOUNT_H_ */
--- a/src/util/supervector/arch/ppc64el/impl.cpp
+++ b/src/util/supervector/arch/ppc64el/impl.cpp
@ -39,7 +39,7 @@
 #include "util/supervector/supervector.hpp"
 #include <iostream>
-// 128-bit Powerpc64le implementation
+// 128-bit IBM Power VSX implementation
 template<>
 really_inline SuperVector<16>::SuperVector(SuperVector const &other)
@ -47,6 +47,69 @@ really_inline SuperVector<16>::SuperVector(SuperVector const &other)
    u.v128[0] = other.u.v128[0];
 }
 // Converting constructors from native vector types.
 //
 // Each specialization below takes one vector value and stores it in the
 // union member of `u` whose element type matches the argument, always at
 // index 0. No element conversion is performed by these constructors.

 // Boolean byte vector: reinterpreted as unsigned bytes before being stored.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(char __bool __vector v)
 {
    u.u8x16[0] = (uint8x16_t) v;
 };
 // Signed 8-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int8x16_t const v)
 {
    u.s8x16[0] = v;
 };
 // Unsigned 8-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint8x16_t const v)
 {
    u.u8x16[0] = v;
 };
 // Signed 16-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int16x8_t const v)
 {
    u.s16x8[0] = v;
 };
 // Unsigned 16-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint16x8_t const v)
 {
    u.u16x8[0] = v;
 };
 // Signed 32-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int32x4_t const v)
 {
    u.s32x4[0] = v;
 };
 // Unsigned 32-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint32x4_t const v)
 {
    u.u32x4[0] = v;
 };
 // Signed 64-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int64x2_t const v)
 {
    u.s64x2[0] = v;
 };
 // Unsigned 64-bit lanes.
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint64x2_t const v)
 {
    u.u64x2[0] = v;
 };
 template<>
 really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 {
@ -57,69 +120,69 @@ template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s8x16[0] = vec_splats(other);
 }
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint8_t>(other));
+    u.u8x16[0] = vec_splats(static_cast<uint8_t>(other));
 }
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s16x8[0] = vec_splats(other);
 }
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint16_t>(other));
+    u.u16x8[0] = vec_splats(static_cast<uint16_t>(other));
 }
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s32x4[0] = vec_splats(other);
 }
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint32_t>(other));
+    u.u32x4[0] = vec_splats(static_cast<uint32_t>(other));
 }
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
+    u.s64x2[0] = (int64x2_t) vec_splats(static_cast<ulong64_t>(other));
 }
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
+    u.u64x2[0] = (uint64x2_t) vec_splats(static_cast<ulong64_t>(other));
 }
 // Constants
 template<>
 really_inline SuperVector<16> SuperVector<16>::Ones(void)
 {
-    return  {(m128) vec_splat_s8(-1)};
+    return  { vec_splat_s8(-1)};
 }
 template<>
 really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
 {
-    return  {(m128) vec_splat_s8(0)};
+    return  { vec_splat_s8(0) };
 }
 // Methods
@ -133,39 +196,38 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
 {
-    return {vec_and(u.v128[0], b.u.v128[0])};
+    return { vec_and(u.v128[0], b.u.v128[0]) };
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
 {
-    return  {vec_or(u.v128[0], b.u.v128[0])};
+    return  { vec_or(u.v128[0], b.u.v128[0]) };
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
 {
-    return  {(m128) vec_xor(u.v128[0], b.u.v128[0])};
+    return  { vec_xor(u.v128[0], b.u.v128[0]) };
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator!() const
 {
-    return  {(m128) vec_xor(u.v128[0], u.v128[0])};
+    return  { vec_xor(u.v128[0], u.v128[0]) };
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
 {
-   m128 not_res = vec_xor(u.v128[0], (m128)vec_splat_s8(-1));
+   int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1));
-   return {(m128) vec_and(not_res, (m128)b.u.v128[0]) };
+   return { vec_and(not_res, b.u.s8x16[0]) };
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
+    return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
 }
 template <>
@ -177,28 +239,27 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
 { 
-    return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])}; 
+    return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])};
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])};  
+    return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])};
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
 {
-    return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])};  
+    return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])};
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
 {   
-    return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])};   
+    return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])};
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
 {
@ -208,25 +269,12 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
 template <>
 really_inline typename SuperVector<16>::comparemask_type
 SuperVector<16>::comparemask(void) const {
-    uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
+    uint8x16_t bitmask = vec_gb( u.u8x16[0]);
-    
+    static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-    uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
+    bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
-    uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
+    u32 movemask;
-    uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
+    vec_ste((uint32x4_t) bitmask, 0, &movemask);
-    
+    return movemask;
    uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14));
    uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
    uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
    uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
    uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
    uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9);
    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
    uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
    return s5[0];
 }
 template <>
@ -248,35 +296,35 @@ template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
 {
-    return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) }; 
+    return { vec_sl(u.s8x16[0], vec_splat_u8(N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
 {
-    return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) };
+    return { vec_sl(u.s16x8[0], vec_splat_u16(N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
 {
-    return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) };
+    return { vec_sl(u.s32x4[0], vec_splat_u32(N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
 {
-    return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) };
+    return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
 {
-    return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)}; 
+    return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)};
 }
 template <>
@ -290,35 +338,35 @@ template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
 {
-    return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) };
+    return { vec_sr(u.s8x16[0], vec_splat_u8(N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
 {
-    return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) }; 
+    return { vec_sr(u.s16x8[0], vec_splat_u16(N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
 {
-    return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) };
+    return { vec_sr(u.s32x4[0], vec_splat_u32(N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
 {		 
-   return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; 
+   return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) };
 }
 template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
 {   
-    return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) };	
+    return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) };
 }
 template <>
@ -348,50 +396,40 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint8x16_t shift_indices = vec_splats((uint8_t) N);
-    SuperVector result;
+    return { vec_sl(u.u8x16[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
    return result;
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint16x8_t shift_indices = vec_splats((uint16_t) N);
-    SuperVector result; 
+    return { vec_sl(u.u16x8[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
    return result;
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint32x4_t shift_indices = vec_splats((uint32_t) N);
-    SuperVector result;
+    return { vec_sl(u.u32x4[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
    return result;
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint64x2_t shift_indices = vec_splats((ulong64_t) N);
-    SuperVector result;
+    return { vec_sl(u.u64x2[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
    return result;
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    SuperVector sl{N << 3};
-    SuperVector result;
+    return { vec_slo(u.u8x16[0], sl.u.u8x16[0]) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
    return result;
 }
 template <>
@ -404,50 +442,40 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint8x16_t shift_indices = vec_splats((uint8_t) N);
-    SuperVector result;
+    return { vec_sr(u.u8x16[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
    return result;
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint16x8_t shift_indices = vec_splats((uint16_t) N);
-    SuperVector result;
+    return { vec_sr(u.u16x8[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
    return result;
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint32x4_t shift_indices = vec_splats((uint32_t) N);
-    SuperVector result;
+    return { vec_sr(u.u32x4[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
    return result;
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    uint64x2_t shift_indices = vec_splats((ulong64_t) N);
-    SuperVector result;
+    return { vec_sr(u.u64x2[0], shift_indices) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
    return result;
 }
 template <>
-really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
 {
    if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    SuperVector sr{N << 3};
-    SuperVector result;
+    return { vec_sro(u.u8x16[0], sr.u.u8x16[0]) };
    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; });
    return result;
 }
 template <>
@ -459,51 +487,25 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-    switch(N) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
-    case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0], 15)}; break;
+    if (N == 0) return *this;
-    case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0], 14)}; break;
+    if (__builtin_constant_p(N)) {
-    case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0], 13)}; break;
+        return { vec_sld(vec_splat_s8(0),  u.s8x16[0], 16 - N) };
    case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0], 12)}; break;
    case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0], 11)}; break;
    case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0], 10)}; break;
    case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0],  9)}; break;
    case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0],  8)}; break;
    case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0),  u.s8x16[0],  7)}; break;
    case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break;
    case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break;
    case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break;
    case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break;
    case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break;
    case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break;
    case 16: return Zeroes(); break;
    default: break;
    }
-    return *this;
+#endif
    return vshr_128(N);
 }
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-    switch(N) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
-    case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break;
+    if (N == 0) return *this;
-    case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break;
+    if (__builtin_constant_p(N)) {
-    case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break;
+        return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)};
    case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break;
    case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break;
    case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break;
    case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break;
    case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break;
    case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break;
    case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break;
    case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; break;
    case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 12)}; break;
    case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break;
    case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break;
    case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 15)}; break;
    case 16: return Zeroes(); break;
    default: break;
    }
-    return *this;
+#endif
    return vshl_128(N);
 }
 template<>
@ -521,50 +523,39 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
 // loadu: builds a SuperVector<16> from the memory at `ptr` via vec_xl with a
 // zero offset; no alignment check is performed on `ptr` here.
 // ('-' is the removed m128-cast return, '+' the added brace-initialised return.)
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 {
-    return (m128) vec_xl(0, (const long64_t*)ptr);
+    return { vec_xl(0, (const long64_t*)ptr) };
 }
 // load: same vec_xl load as loadu, preceded by a debug-only alignment assertion.
 // ('-' is the removed m128-cast return, '+' the added brace-initialised return.)
 template <>
 really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 {
    // NOTE(review): alignof(SuperVector::size) presumably yields the alignment of
    // the `size` member's type rather than the 16-byte vector width - confirm
    // that this is the alignment the assertion is meant to enforce.
    assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
-    return (m128)  vec_xl(0, (const long64_t*)ptr);
+    return { vec_xl(0, (const long64_t*)ptr) };
 }
 // loadu_maskz: unaligned load of a full vector from `ptr`, AND-ed with the mask
 // produced by Ones_vshr(16 - len). Both intermediates are passed to print8 in
 // the order mask, then data.
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
    // Mask derived from the requested length.
    SuperVector<16> keep = Ones_vshr(16 - len);
    keep.print8("mask");

    // Full 16-byte unaligned load; the mask is applied afterwards.
    SuperVector<16> loaded = loadu(ptr);
    loaded.print8("v");

    return keep & loaded;
 }
 // alignr: combines this vector with `other` according to `offset`.
 //   offset == 0  -> returns `other` unchanged
 //   offset == 16 -> returns *this unchanged
 // The generic tail shifts `other` with vec_sro by offset*8 and this vector with
 // vec_slo by (16 - offset)*8, then ORs the two halves together.
 // NOTE(review): this span is a diff rendering ('-' removed, '+' added; unmarked
 // `case` lines appear to belong to the removed switch). The removed switch passed
 // 16 - offset to vec_sld, while the added __builtin_constant_p path passes
 // `offset` directly - confirm the two are meant to be equivalent.
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {   
-    
+    if (offset == 0) return other;
-    switch(offset) {
+    if (offset == 16) return *this;
-    case 0: return other; break;
+#if defined(HAVE__BUILTIN_CONSTANT_P)
-    case 1: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0], 15)}; break;
+    if (__builtin_constant_p(offset)) {
-    case 2: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0], 14)}; break;
+        return { vec_sld(u.s8x16[0], other.u.s8x16[0], offset) };
    case 3: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0], 13)}; break;
    case 4: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0], 12)}; break;
    case 5: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0], 11)}; break;
    case 6: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0], 10)}; break;
    case 7: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0],  9)}; break;
    case 8: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0],  8)}; break;
    case 9: return {(m128) vec_sld(u.s8x16[0],  other.u.s8x16[0],  7)}; break;
    case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break;
    case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break;
    case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break;
    case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break;
    case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break;
    case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break;
    default: break;
    }
-    return *this;
+#endif
    // Shift amounts are expressed in bits (offset << 3 == offset * 8).
    uint8x16_t sl = vec_splats((uint8_t) (offset << 3));
    uint8x16_t sr = vec_splats((uint8_t) ((16 - offset) << 3));
    uint8x16_t rhs = vec_slo(u.u8x16[0], sr);
    uint8x16_t lhs = vec_sro(other.u.u8x16[0], sl);
    return { vec_or(lhs, rhs) };
 }
 template<>
@ -574,9 +565,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
    /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
       In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane.
       below is the version that is converted from Intel to PPC.  */
-    uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80));
+    uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80));
    uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]);
-    return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask);
+    return { vec_sel(res, vec_splat_u8(0), mask) };
 }
 template<>
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@ -204,7 +204,7 @@ public:
  SuperVector(typename base_type::type const v);
  template<typename T>
-  SuperVector(T other);
+  SuperVector(T const other);
  SuperVector(SuperVector<SIZE/2> const lo, SuperVector<SIZE/2> const hi);
  SuperVector(previous_type const lo, previous_type const hi);
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@ -723,10 +723,59 @@ TEST(SimdUtilsTest, set2x128) {
 }
 #endif
 /* Verifies lshiftbyte_m128(v1, l) byte by byte: after storing the result with
  * storeu128, the first `l` bytes must be zero and byte i (i >= l) must equal
  * buf[i - l].
  *   v1  - the m128 value to shift
  *   buf - the 16 source bytes v1 was loaded from
  *   l   - shift amount in bytes; the visible caller passes 0..15
  * The macro stores into a 16-byte u8 array named `res` that must exist in the
  * calling scope. It reports through EXPECT_EQ rather than <assert.h> assert so
  * the checks are not compiled out when NDEBUG is defined. */
 #define TEST_LSHIFTBYTE128(v1, buf, l) {                                                 \
                                           m128 v_shifted = lshiftbyte_m128((v1), (l));  \
                                           storeu128(res, v_shifted);                    \
                                           int i;                                        \
                                           for (i=0; i < (l); i++) {                     \
                                               EXPECT_EQ(0, (int)res[i]);                \
                                           }                                             \
                                           for (; i < 16; i++) {                         \
                                               EXPECT_EQ((int)(buf)[i - (l)], (int)res[i]); \
                                           }                                             \
                                       }
 // Exercises lshiftbyte_m128 for every shift amount 0..15 on the byte pattern
 // 0, 1, ..., 15, delegating the per-byte checks to TEST_LSHIFTBYTE128.
 TEST(SimdUtilsTest, lshiftbyte128){
    u8 vec[16];   // source bytes; the macro compares results against these
    u8 res[16];   // scratch buffer the macro stores the shifted vector into

    int fill = 0;
    while (fill < 16) {
        vec[fill] = fill;
        ++fill;
    }

    m128 input = loadu128(vec);

    // The loop variable must not be named `i`: the macro declares its own `i`.
    for (int shift = 0; shift < 16; ++shift) {
        TEST_LSHIFTBYTE128(input, vec, shift);
    }
 }
 /* Verifies rshiftbyte_m128(v1, l) byte by byte: after storing the result with
  * storeu128, the last `l` bytes (indices 16 - l .. 15) must be zero and byte i
  * below that must equal buf[i + l].
  *   v1  - the m128 value to shift
  *   buf - the 16 source bytes v1 was loaded from
  *   l   - shift amount in bytes; the visible caller passes 0..15
  * The macro stores into a 16-byte u8 array named `res` that must exist in the
  * calling scope. It reports through EXPECT_EQ rather than <assert.h> assert so
  * the checks are not compiled out when NDEBUG is defined. */
 #define TEST_RSHIFTBYTE128(v1, buf, l) {                                                 \
                                           m128 v_shifted = rshiftbyte_m128((v1), (l));  \
                                           storeu128(res, v_shifted);                    \
                                           int i;                                        \
                                           for (i=15; i >= 16 - (l); i--) {              \
                                               EXPECT_EQ(0, (int)res[i]);                \
                                           }                                             \
                                           for (; i >= 0; i--) {                         \
                                               EXPECT_EQ((int)(buf)[i + (l)], (int)res[i]); \
                                           }                                             \
                                       }
 // Exercises rshiftbyte_m128 for every shift amount 0..15 on the byte pattern
 // 0, 1, ..., 15, delegating the per-byte checks to TEST_RSHIFTBYTE128.
 TEST(SimdUtilsTest, rshiftbyte128){
    u8 vec[16];   // source bytes; the macro compares results against these
    u8 res[16];   // scratch buffer the macro stores the shifted vector into

    int fill = 0;
    while (fill < 16) {
        vec[fill] = fill;
        ++fill;
    }

    m128 input = loadu128(vec);

    // The loop variable must not be named `i`: the macro declares its own `i`.
    for (int shift = 0; shift < 16; ++shift) {
        TEST_RSHIFTBYTE128(input, vec, shift);
    }
 }
 TEST(SimdUtilsTest, variableByteShift128) {
    char base[] = "0123456789ABCDEF";
    m128 in = loadu128(base);
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
                         variable_byte_shift_m128(in, 0)));
    EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
@ -773,7 +822,7 @@ TEST(SimdUtilsTest, variableByteShift128) {
    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
                         variable_byte_shift_m128(in, 10)));
-    EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16)));
+    EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 15), variable_byte_shift_m128(in, 15)));
    EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16)));
 }