Changes/Additions to the SuperVector class: * added ==, !=, >=, >, <=, < operators * reworked shift operators to be more uniform and orthogonal, like the Arm ISA * added Unroller class to handle multiple cases while avoiding code duplication * pshufb method can now optionally emulate Intel behaviour (skipping the emulation saves one instruction).

2026-01-17 16:00:26 +03:00 · 2021-10-03 10:43:13 +00:00
parent e7161fdfec
commit 67e0674df8
3 changed files with 329 additions and 184 deletions
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -37,86 +37,80 @@

 // 128-bit NEON implementation

-template<>
-really_inline SuperVector<16>::SuperVector(SuperVector const &other)
-{
-  u.v128[0] = other.u.v128[0];
-}
-
 template<>
 really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 {
-  u.v128[0] = v;
-};
+    u.v128[0] = v;
+}

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int8x16_t>(int8x16_t const other)
 {
-  u.v128[0] = static_cast<int32x4_t>(other);
+    u.v128[0] = static_cast<m128>(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint8x16_t>(uint8x16_t const other)
 {
-  u.v128[0] = static_cast<int32x4_t>(other);
+    u.v128[0] = static_cast<m128>(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
 {
-  u.v128[0] = vdupq_n_s8(other);
+    u.v128[0] = vdupq_n_s8(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
 {
-  u.v128[0] = vdupq_n_u8(other);
+    u.v128[0] = vdupq_n_u8(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
 {
-  u.v128[0] = vdupq_n_s16(other);
+    u.v128[0] = vdupq_n_s16(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
 {
-  u.v128[0] = vdupq_n_u16(other);
+    u.v128[0] = vdupq_n_u16(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
 {
-  u.v128[0] = vdupq_n_s32(other);
+    u.v128[0] = vdupq_n_s32(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
 {
-  u.v128[0] = vdupq_n_u32(other);
+    u.v128[0] = vdupq_n_u32(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
 {
-  u.v128[0] = vdupq_n_s64(other);
+    u.v128[0] = vdupq_n_s64(other);
 }

 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
 {
-  u.v128[0] = vdupq_n_u64(other);
+    u.v128[0] = vdupq_n_u64(other);
 }

 // Constants
@@ -159,9 +153,9 @@ really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &
 }

 template <>
-really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const &b) const
+really_inline SuperVector<16> SuperVector<16>::operator!() const
 {
-    return {vandq_s8(u.v128[0], b.u.v128[0])};
+    return {vmvnq_s8(u.v128[0])}; // bitwise NOT of the whole vector; this is not a logical (bool) negation
 }

 template <>
@@ -171,56 +165,279 @@ really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b
 }

 template <>
-really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const // returns a per-byte mask vector (vceqq_s8), not a bool
 {
    return {vceqq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])};
 }

+template <>
+really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
+{
+    return !(*this == b); // operator! (vmvnq_s8) flips every bit of the vceqq_s8 equality mask
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
+{
+    return {vcgtq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; // signed 8-bit lane compare: mask of lanes where this > b
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
+{
+    return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; // signed 8-bit lane compare: mask of lanes where this >= b
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
+{
+    return {vcltq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; // signed 8-bit lane compare: mask of lanes where this < b
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
+{
+    return {vcleq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; // signed 8-bit lane compare: mask of lanes where this <= b (vcleq, not vcgeq)
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+{
+    return (*this == b); // named form of operator==, forwarding to it unchanged
+}
+
 template <>
 really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const
 {
-    static const uint8x16_t powers{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
+    SuperVector powers{0x8040201008040201UL}; // replaces the removed 1,2,4,...,128 byte table; presumably selects the uint64_t broadcast constructor (vdupq_n_u64) — confirm

    // Compute the mask from the input
-    uint64x2_t mask  = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint16x8_t)u.v128[0], powers))));
+    uint64x2_t mask  = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint16x8_t)u.v128[0], powers.u.v128[0])))); // AND with the per-byte bit constant, then pairwise-add up to two 64-bit sums
    uint64x2_t mask1 = (m128)vextq_s8(mask, vdupq_n_u8(0), 7);
    mask = vorrq_u8(mask, mask1);

    // Get the resulting bytes
    uint16_t output;
-    vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0);
+    vst1q_lane_u16(&output, (uint16x8_t)mask, 0); // &output is already uint16_t*, so the old cast was redundant
    return static_cast<typename SuperVector<16>::movemask_type>(output);
 }

 template <>
 really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
 {
-  return eq(b).movemask();
+    return eq(b).movemask();
 }

 template <>
-really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
 {
-    switch(N) {
-    case 1: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 1)}; break;
-    case 2: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 2)}; break;
-    case 3: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 3)}; break;
-    case 4: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 4)}; break;
-    case 5: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 5)}; break;
-    case 6: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 6)}; break;
-    case 7: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 7)}; break;
-    case 8: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 8)}; break;
-    case 9: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 9)}; break;
-    case 10: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 10)}; break;
-    case 11: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 11)}; break;
-    case 12: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 12)}; break;
-    case 13: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 13)}; break;
-    case 14: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 14)}; break;
-    case 15: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 15)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
+    return {(m128)vshlq_n_s8(u.v128[0], N)}; // left-shift each 8-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
+{
+    return {(m128)vshlq_n_s16(u.v128[0], N)}; // left-shift each 16-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
+{
+    return {(m128)vshlq_n_s32(u.v128[0], N)}; // left-shift each 32-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
+{
+    return {(m128)vshlq_n_s64(u.v128[0], N)}; // left-shift each 64-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
+{
+    return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; // whole-vector shift by N bytes: N zero bytes enter at byte index 0
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
+{
+    return vshl_128_imm<N>(); // the unsuffixed immediate shift is the whole-vector byte shift
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
+{
+    return {(m128)vshrq_n_s8(u.v128[0], N)}; // signed (sign-propagating) right shift of each 8-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
+{
+    return {(m128)vshrq_n_s16(u.v128[0], N)}; // signed (sign-propagating) right shift of each 16-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
+{
+    return {(m128)vshrq_n_s32(u.v128[0], N)}; // signed (sign-propagating) right shift of each 32-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
+{
+    return {(m128)vshrq_n_s64(u.v128[0], N)}; // signed (sign-propagating) right shift of each 64-bit lane by the compile-time count N
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
+{
+    return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; // whole-vector shift by N bytes: byte N moves to index 0 and N zero bytes fill the top
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
+{
+    return vshr_128_imm<N>(); // the unsuffixed immediate shift is the whole-vector byte shift
+}
+
+#if !defined(HS_OPTIMIZE) // explicit instantiations of the immediate-shift templates, emitted only when HS_OPTIMIZE is not defined
+template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
+#endif
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 8) return Zeroes(); // an 8-bit lane shifted left by its full width or more has no bits left
+    SuperVector result; // always assigned below: N is in 1..7 here
+    Unroller<1, 8>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s8(u.v128[0], n)}; }); // vshlq_n_s8 only takes immediates 0..7, so one candidate per count is expanded (assumes Unroller<B, E> visits [B, E))
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 16) return Zeroes(); // a 16-bit lane shifted left by its full width or more has no bits left
+    SuperVector result; // always assigned below: N is in 1..15 here
+    Unroller<1, 16>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s16(u.v128[0], n)}; }); // vshlq_n_s16 needs an immediate, so one candidate per count is expanded (assumes Unroller<B, E> visits [B, E))
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 32) return Zeroes(); // 32-bit lanes only empty out at a count of 32 or more; a count of 16 must still shift
+    SuperVector result; // always assigned below: N is in 1..31 here
+    Unroller<1, 32>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s32(u.v128[0], n)}; }); // vshlq_n_s32 needs an immediate, so one candidate per count is expanded (assumes Unroller<B, E> visits [B, E))
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 64) return Zeroes(); // 64-bit lanes only empty out at a count of 64 or more; a count of 16 must still shift
+    SuperVector result; // always assigned below: N is in 1..63 here
+    Unroller<1, 64>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s64(u.v128[0], n)}; }); // vshlq_n_s64 needs an immediate, so one candidate per count is expanded (assumes Unroller<B, E> visits [B, E))
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero bytes is the identity
+    if (N == 16) return Zeroes(); // all 16 bytes are shifted out
+    SuperVector result; // NOTE(review): presumably left unassigned when N > 16 — confirm callers never pass that
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - n)}; }); // vextq_s8 needs a compile-time index, so each candidate byte count is expanded and the one equal to N assigns result
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
+{
+    return vshl_128(N); // the unsuffixed runtime shift is the whole-vector byte shift
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 8) return Zeroes(); // full-width (or larger) shift of an 8-bit lane is treated as clearing it, as vshr_16 does at 16
+    SuperVector result; // always assigned below: N is in 1..7 here
+    Unroller<1, 8>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s8(u.v128[0], n)}; }); // immediates above 8 are invalid for vshrq_n_s8; NOTE(review): this is a signed shift, so negative lanes sign-extend — confirm a logical shift was not intended
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 16) return Zeroes(); // full-width or larger counts clear the lanes (previously N > 16 returned an unassigned vector)
+    SuperVector result; // always assigned below: N is in 1..15 here
+    Unroller<1, 16>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s16(u.v128[0], n)}; }); // one candidate per immediate; NOTE(review): signed shift, so negative lanes sign-extend — confirm a logical shift was not intended
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 32) return Zeroes(); // 32-bit lanes only empty out at a count of 32 or more; a count of 16 must still shift
+    SuperVector result; // always assigned below: N is in 1..31 here
+    Unroller<1, 32>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s32(u.v128[0], n)}; }); // one candidate per immediate; NOTE(review): signed shift, so negative lanes sign-extend — confirm a logical shift was not intended
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero is the identity
+    if (N >= 64) return Zeroes(); // 64-bit lanes only empty out at a count of 64 or more; a count of 16 must still shift
+    SuperVector result; // always assigned below: N is in 1..63 here
+    Unroller<1, 64>::iterator([&](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s64(u.v128[0], n)}; }); // one candidate per immediate; NOTE(review): signed shift, so negative lanes sign-extend — confirm a logical shift was not intended
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
+{
+    if (N == 0) return *this; // shifting by zero bytes is the identity
+    if (N == 16) return Zeroes(); // all 16 bytes are shifted out
+    SuperVector result; // NOTE(review): presumably left unassigned when N > 16 — confirm callers never pass that
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), n)}; }); // vextq_s8 needs a compile-time index, so each candidate byte count is expanded and the one equal to N assigns result
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
+{
+    return vshr_128(N); // the unsuffixed runtime shift is the whole-vector byte shift
 }

 #ifdef HS_OPTIMIZE
@@ -233,35 +450,10 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-    return rshift128_var(N);
+    return vshr_128(N);
 }
 #endif

-template <>
-really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const
-{
-    switch(N) {
-    case 1: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 15)}; break;
-    case 2: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 14)}; break;
-    case 3: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 13)}; break;
-    case 4: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 12)}; break;
-    case 5: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 11)}; break;
-    case 6: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 10)}; break;
-    case 7: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 9)}; break;
-    case 8: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 8)}; break;
-    case 9: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 7)}; break;
-    case 10: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 6)}; break;
-    case 11: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 5)}; break;
-    case 12: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 4)}; break;
-    case 13: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 3)}; break;
-    case 14: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 2)}; break;
-    case 15: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 1)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
-}
-
 #ifdef HS_OPTIMIZE
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
@@ -272,10 +464,23 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-    return lshift128_var(N);
+    return vshl_128(N);
 }
 #endif

+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
+{
+    return Ones().vshr_128(N); // Ones() passed through the whole-vector byte shift vshr_128
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
+{
+    return Ones().vshl_128(N); // Ones() passed through the whole-vector byte shift vshl_128
+}
+
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 {
@@ -293,10 +498,10 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
-    SuperVector<16> mask = Ones().rshift128_var(16 -len);
-    mask.print8("mask");
+    SuperVector mask = Ones_vshr(16 -len); // byte mask built by shifting Ones() by (16 - len) bytes; assumes len <= 16 — TODO confirm
+    //mask.print8("mask");
    SuperVector<16> v = loadu(ptr);
-    v.print8("v");
+    //v.print8("v");
    return mask & v;
 }

@@ -314,124 +519,53 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {
-  switch(offset) {
-  case 0: return other; break;
-  case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;
-  case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
-  case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
-  case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
-  case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
-  case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
-  case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
-  case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
-  case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
-  case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
-  case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
-  case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
-  case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
-  case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
-  case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
-  case 16: return *this; break;
-  default: break;
-  }
-  return *this;
+    switch(offset) {
+    case 0: return other; break;
+    case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;
+    case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
+    case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
+    case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
+    case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
+    case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
+    case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
+    case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
+    case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
+    case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
+    case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
+    case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
+    case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
+    case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
+    case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
+    case 16: return *this; break;
+    default: break;
+    }
+    return *this;
 }
 #endif

 template<>
-really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
+{
+    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0])}; // raw NEON table lookup on the indices in b, with no Intel-style index translation
+}
+
+template<>
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
 {
    /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
       In NEON, if >=16, then the result is zero, otherwise it is that lane.
       btranslated is the version that is converted from Intel to NEON.  */
-    int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0], vdupq_n_s8(0x8f));
-    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated)};
+    SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f); // keep only bit 7 and the low nibble of every index byte
+    return pshufb<false>(btranslated); // then run the untranslated NEON lookup on the converted indices
 }

 template<>
 really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
 {
-    SuperVector<16> mask = Ones().rshift128_var(16 -len);
-    return mask & pshufb(b);
+    SuperVector mask = Ones_vshr(16 -len); // byte mask built by shifting Ones() by (16 - len) bytes; assumes len <= 16 — TODO confirm
+    return mask & pshufb<true>(b); // Intel-emulating shuffle, then AND with the mask
 }

-#ifdef HS_OPTIMIZE
-template<>
-really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N)
-{
-  return {(m128)vshlq_n_s64(u.v128[0], N)};
-}
-#else
-template<>
-really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N)
-{
-  switch(N) {
-  case 0: return {(m128)vshlq_n_s64(u.v128[0], 0)}; break;
-  case 1: return {(m128)vshlq_n_s64(u.v128[0], 1)}; break;
-  case 2: return {(m128)vshlq_n_s64(u.v128[0], 2)}; break;
-  case 3: return {(m128)vshlq_n_s64(u.v128[0], 3)}; break;
-  case 4: return {(m128)vshlq_n_s64(u.v128[0], 4)}; break;
-  case 5: return {(m128)vshlq_n_s64(u.v128[0], 5)}; break;
-  case 6: return {(m128)vshlq_n_s64(u.v128[0], 6)}; break;
-  case 7: return {(m128)vshlq_n_s64(u.v128[0], 7)}; break;
-  case 8: return {(m128)vshlq_n_s64(u.v128[0], 8)}; break;
-  case 9: return {(m128)vshlq_n_s64(u.v128[0], 9)}; break;
-  case 10: return {(m128)vshlq_n_s64(u.v128[0], 10)}; break;
-  case 11: return {(m128)vshlq_n_s64(u.v128[0], 11)}; break;
-  case 12: return {(m128)vshlq_n_s64(u.v128[0], 12)}; break;
-  case 13: return {(m128)vshlq_n_s64(u.v128[0], 13)}; break;
-  case 14: return {(m128)vshlq_n_s64(u.v128[0], 14)}; break;
-  case 15: return {(m128)vshlq_n_s64(u.v128[0], 15)}; break;
-  default: break;
-  }
-  return *this;
-}
-#endif
-
-#ifdef HS_OPTIMIZE
-template<>
-really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
-{
-  return {(m128)vshrq_n_s64(u.v128[0], N)};
-}
-#else
-template<>
-really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
-{
-  switch(N) {
-  case 0: return {(m128)vshrq_n_s64(u.v128[0], 0)}; break;
-  case 1: return {(m128)vshrq_n_s64(u.v128[0], 1)}; break;
-  case 2: return {(m128)vshrq_n_s64(u.v128[0], 2)}; break;
-  case 3: return {(m128)vshrq_n_s64(u.v128[0], 3)}; break;
-  case 4: return {(m128)vshrq_n_s64(u.v128[0], 4)}; break;
-  case 5: return {(m128)vshrq_n_s64(u.v128[0], 5)}; break;
-  case 6: return {(m128)vshrq_n_s64(u.v128[0], 6)}; break;
-  case 7: return {(m128)vshrq_n_s64(u.v128[0], 7)}; break;
-  case 8: return {(m128)vshrq_n_s64(u.v128[0], 8)}; break;
-  case 9: return {(m128)vshrq_n_s64(u.v128[0], 9)}; break;
-  case 10: return {(m128)vshrq_n_s64(u.v128[0], 10)}; break;
-  case 11: return {(m128)vshrq_n_s64(u.v128[0], 11)}; break;
-  case 12: return {(m128)vshrq_n_s64(u.v128[0], 12)}; break;
-  case 13: return {(m128)vshrq_n_s64(u.v128[0], 13)}; break;
-  case 14: return {(m128)vshrq_n_s64(u.v128[0], 14)}; break;
-  case 15: return {(m128)vshrq_n_s64(u.v128[0], 15)}; break;
-  default: break;
-  }
-  return *this;
-}
-#endif
-
-template<>
-really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N)
-{
-  return *this << N;
-}
-
-template<>
-really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N)
-{
-  return *this >> N;
-}
-
-
 #endif // SIMD_IMPL_HPP
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -174,8 +174,9 @@ public:
    double   f64[SIZE / sizeof(double)];
  } u;

-  SuperVector() {};
-  SuperVector(SuperVector const &other);
+  constexpr SuperVector() {};
+  constexpr SuperVector(SuperVector const &other)
+  :u(other.u) {};
  SuperVector(typename base_type::type const v);

  template<typename T>
@@ -198,11 +199,20 @@ public:
  SuperVector operator&(SuperVector const &b) const;
  SuperVector operator|(SuperVector const &b) const;
  SuperVector operator^(SuperVector const &b) const;
+  SuperVector operator!() const;
+
+  SuperVector operator==(SuperVector const &b) const;
+  SuperVector operator!=(SuperVector const &b) const;
+  SuperVector operator>(SuperVector const &b) const;
+  SuperVector operator>=(SuperVector const &b) const;
+  SuperVector operator<(SuperVector const &b) const;
+  SuperVector operator<=(SuperVector const &b) const;

  SuperVector opand(SuperVector const &b) const { return *this & b; }
  SuperVector opor (SuperVector const &b) const { return *this | b; }
  SuperVector opxor(SuperVector const &b) const { return *this ^ b; }
  SuperVector opandnot(SuperVector const &b) const;
+  SuperVector opnot() const { return !(*this); }

  SuperVector eq(SuperVector const &b) const;
  SuperVector operator<<(uint8_t const N) const;
@@ -215,6 +225,7 @@ public:
  static SuperVector loadu_maskz(void const *ptr, uint8_t const len);
  SuperVector alignr(SuperVector &other, int8_t offset);

+  template<bool emulateIntel>
  SuperVector pshufb(SuperVector b);
  SuperVector pshufb_maskz(SuperVector b, uint8_t const len);

--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -284,7 +284,7 @@ TEST(SuperVectorUtilsTest,pshufb128c) {
    }
    auto SP1 = SuperVector<16>::loadu(vec);
    auto SP2 = SuperVector<16>::loadu(vec2);
-    auto SResult = SP1.pshufb(SP2);
+    auto SResult = SP1.template pshufb<true>(SP2);
    for (int i=0; i<16; i++) {
        ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]);
    }