From 67e0674df8760f751c019bea9abdc125cd974d1a Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Sun, 3 Oct 2021 10:43:13 +0000
Subject: [PATCH] Changes/Additions to SuperVector class * added
 ==,!=,>=,>,<=,< operators * reworked shift operators to be more uniform and
 orthogonal, like Arm ISA * Added Unroller class to allow handling of multiple
 cases but avoid code duplication * pshufb method can now emulate Intel or not
 (avoids one instruction).

---
 src/util/supervector/arch/arm/impl.cpp | 496 ++++++++++++++++---------
 src/util/supervector/supervector.hpp   |  15 +-
 unit/internal/supervector.cpp          |   2 +-
 3 files changed, 329 insertions(+), 184 deletions(-)
diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp
index 65d0faa5..34e5486d 100644
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -37,86 +37,80 @@
 
 // 128-bit NEON implementation
 
-template<>
-really_inline SuperVector<16>::SuperVector(SuperVector const &other)
-{
-  u.v128[0] = other.u.v128[0];
-}
-
 template<>
 really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 {
-  u.v128[0] = v;
-};
+    u.v128[0] = v;  // store the native vector value directly, no conversion
+}
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int8x16_t>(int8x16_t const other)
 {
-  u.v128[0] = static_cast<int32x4_t>(other);
+    u.v128[0] = static_cast<m128>(other);  // convert the int8x16_t argument to the m128 storage type
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint8x16_t>(uint8x16_t const other)
 {
-  u.v128[0] = static_cast<int32x4_t>(other);
+    u.v128[0] = static_cast<m128>(other);  // convert the uint8x16_t argument to the m128 storage type
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
 {
-  u.v128[0] = vdupq_n_s8(other);
+    u.v128[0] = vdupq_n_s8(other);  // broadcast the scalar into all 16 byte lanes
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
 {
-  u.v128[0] = vdupq_n_u8(other);
+    u.v128[0] = vdupq_n_u8(other);  // broadcast the scalar into all 16 byte lanes
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
 {
-  u.v128[0] = vdupq_n_s16(other);
+    u.v128[0] = vdupq_n_s16(other);  // broadcast the scalar into all 8 16-bit lanes
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
 {
-  u.v128[0] = vdupq_n_u16(other);
+    u.v128[0] = vdupq_n_u16(other);  // broadcast the scalar into all 8 16-bit lanes
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
 {
-  u.v128[0] = vdupq_n_s32(other);
+    u.v128[0] = vdupq_n_s32(other);  // broadcast the scalar into all 4 32-bit lanes
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
 {
-  u.v128[0] = vdupq_n_u32(other);
+    u.v128[0] = vdupq_n_u32(other);  // broadcast the scalar into all 4 32-bit lanes
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
 {
-  u.v128[0] = vdupq_n_s64(other);
+    u.v128[0] = vdupq_n_s64(other);  // broadcast the scalar into both 64-bit lanes
 }
 
 template<>
 template<>
 really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
 {
-  u.v128[0] = vdupq_n_u64(other);
+    u.v128[0] = vdupq_n_u64(other);  // broadcast the scalar into both 64-bit lanes
 }
 
 // Constants
@@ -159,9 +153,9 @@ really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const &b) const
+really_inline SuperVector<16> SuperVector<16>::operator!() const
 {
-    return {vandq_s8(u.v128[0], b.u.v128[0])};
+    return {vmvnq_s8(u.v128[0])};  // bitwise complement of every byte; returns a vector, not a bool
 }
 
 template <>
@@ -171,56 +165,279 @@ really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
 {
     return {vceqq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])};
 }
 
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
+{
+    return !operator==(b);  // complement of the equality mask: 0xFF in every byte lane that differs
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
+{
+    return {vcgtq_s8((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])};  // signed per-byte this > b, 0xFF where true
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
+{
+    return {vcgeq_s8((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])};  // signed per-byte this >= b, 0xFF where true
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
+{
+    return {vcltq_s8((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])};  // signed per-byte this < b, 0xFF where true
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
+{
+    return {vcleq_s8((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])};  // signed per-byte this <= b (was vcgeq_s8, i.e. >=)
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+{
+    return operator==(b);  // named form of operator==, kept for existing callers such as eqmask()
+}
+
 template <>
 really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const
 {
-    static const uint8x16_t powers{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
+    SuperVector powers{static_cast<uint64_t>(0x8040201008040201ULL)};  // explicit uint64_t selects the vdupq_n_u64 ctor on every ABI; bytes 1,2,4..128 per 64-bit half
 
     // Compute the mask from the input
-    uint64x2_t mask  = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint16x8_t)u.v128[0], powers))));
+    uint64x2_t mask  = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint16x8_t)u.v128[0], powers.u.v128[0]))));
     uint64x2_t mask1 = (m128)vextq_s8(mask, vdupq_n_u8(0), 7);
     mask = vorrq_u8(mask, mask1);
 
     // Get the resulting bytes
     uint16_t output;
-    vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0);
+    vst1q_lane_u16(&output, (uint16x8_t)mask, 0);  // 16-bit lane 0: byte 0 = low-half sum, byte 1 = high-half sum
     return static_cast<typename SuperVector<16>::movemask_type>(output);
 }
 
 template <>
 really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
 {
-  return eq(b).movemask();
+    return eq(b).movemask();  // byte-wise compare with b, then pack the result through movemask()
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
 {
-    switch(N) {
-    case 1: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 1)}; break;
-    case 2: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 2)}; break;
-    case 3: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 3)}; break;
-    case 4: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 4)}; break;
-    case 5: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 5)}; break;
-    case 6: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 6)}; break;
-    case 7: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 7)}; break;
-    case 8: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 8)}; break;
-    case 9: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 9)}; break;
-    case 10: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 10)}; break;
-    case 11: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 11)}; break;
-    case 12: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 12)}; break;
-    case 13: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 13)}; break;
-    case 14: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 14)}; break;
-    case 15: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 15)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
+    return {(m128)vshlq_n_s8(u.v128[0], N)};  // left shift by N bits inside each 8-bit lane; N is a compile-time immediate
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
+{
+    return {(m128)vshlq_n_s16((int16x8_t)u.v128[0], N)};  // left shift by N bits inside each 16-bit lane
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
+{
+    return {(m128)vshlq_n_s32((int32x4_t)u.v128[0], N)};  // left shift by N bits inside each 32-bit lane
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
+{
+    return {(m128)vshlq_n_s64((int64x2_t)u.v128[0], N)};  // left shift by N bits inside each 64-bit lane
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
+{
+    return {vextq_s8(vdupq_n_s8(0), (int8x16_t)u.v128[0], 16 - N)};  // move the vector up by N bytes, zero-filling the low N bytes
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
+{
+    return vshl_128_imm<N>();  // the unsuffixed left shift is the whole-vector byte shift
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
+{
+    return {(m128)vshrq_n_s8((int8x16_t)u.v128[0], N)};  // signed (arithmetic) right shift per 8-bit lane — NOTE(review): confirm no caller expects a logical shift
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
+{
+    return {(m128)vshrq_n_s16((int16x8_t)u.v128[0], N)};  // signed (arithmetic) right shift by N bits per 16-bit lane
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
+{
+    return {(m128)vshrq_n_s32((int32x4_t)u.v128[0], N)};  // signed (arithmetic) right shift by N bits per 32-bit lane
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
+{
+    return {(m128)vshrq_n_s64((int64x2_t)u.v128[0], N)};  // signed (arithmetic) right shift by N bits per 64-bit lane
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
+{
+    return {vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), N)};  // move the vector down by N bytes, zero-filling the top N bytes
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
+{
+    return vshr_128_imm<N>();  // the unsuffixed right shift is the whole-vector byte shift
+}
+
+#if !defined(HS_OPTIMIZE)  // explicit instantiations of the immediate-shift templates, emitted only when HS_OPTIMIZE is not defined
+template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
+#endif  // NOTE(review): presumably these are the only N values used outside this file — confirm against callers
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count left shift per 8-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // NOTE(review): vshlq_n_s8 documents immediates 0..7 only, yet n up to 15 is instantiated below — confirm
+    SuperVector result = *this;  // defined fallback: an N that matches no unrolled case returns the input, never an uninitialised vector
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s8((int8x16_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count left shift per 16-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // every bit of each 16-bit lane is shifted out
+    SuperVector result = *this;  // defined fallback: an N that matches no unrolled case returns the input, never an uninitialised vector
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s16((int16x8_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count left shift per 32-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // NOTE(review): a 16-bit shift of a 32-bit lane is not all-zero in general — confirm intent
+    SuperVector result = *this;  // defined fallback: an N that matches no unrolled case returns the input, never an uninitialised vector
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s32((int32x4_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count left shift per 64-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // NOTE(review): a 16-bit shift of a 64-bit lane is not all-zero in general — confirm intent
+    SuperVector result = *this;  // defined fallback, as in the removed lshift64(): unmatched N returns the input unchanged
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s64((int64x2_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count whole-vector shift up by N bytes; zero count is the identity
+    if (N == 16) return Zeroes();  // all 16 bytes shifted out
+    SuperVector result = *this;  // defined fallback, as in the removed lshift128_var(): unmatched N returns the input unchanged
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8(vdupq_n_s8(0), (int8x16_t)v->u.v128[0], 16 - n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
+{
+    return vshl_128(N);  // the unsuffixed runtime left shift is the whole-vector byte shift
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count signed right shift per 8-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // NOTE(review): vshrq_n_s8 documents immediates 1..8 only, yet n up to 15 is instantiated below — confirm
+    SuperVector result = *this;  // defined fallback: an N that matches no unrolled case returns the input, never an uninitialised vector
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s8((int8x16_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count signed right shift per 16-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // NOTE(review): the unrolled cases use an arithmetic shift, which would sign-fill instead — confirm
+    SuperVector result = *this;  // defined fallback: an N that matches no unrolled case returns the input, never an uninitialised vector
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s16((int16x8_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count signed right shift per 32-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // NOTE(review): a 16-bit shift of a 32-bit lane is not all-zero in general — confirm intent
+    SuperVector result = *this;  // defined fallback: an N that matches no unrolled case returns the input, never an uninitialised vector
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s32((int32x4_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count signed right shift per 64-bit lane; zero count is the identity
+    if (N == 16) return Zeroes();  // NOTE(review): a 16-bit shift of a 64-bit lane is not all-zero in general — confirm intent
+    SuperVector result = *this;  // defined fallback, as in the removed rshift64(): unmatched N returns the input unchanged
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s64((int64x2_t)v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
+{
+    if (N == 0) return *this;  // runtime-count whole-vector shift down by N bytes; zero count is the identity
+    if (N == 16) return Zeroes();  // all 16 bytes shifted out
+    SuperVector result = *this;  // defined fallback, as in the removed rshift128_var(): unmatched N returns the input unchanged
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8((int8x16_t)v->u.v128[0], vdupq_n_s8(0), n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
+{
+    return vshr_128(N);  // the unsuffixed runtime right shift is the whole-vector byte shift
 }
 
 #ifdef HS_OPTIMIZE
@@ -233,35 +450,10 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-    return rshift128_var(N);
+    return vshr_128(N);  // runtime N: whole-vector byte shift via the unrolled dispatcher
 }
 #endif
 
-template <>
-really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const
-{
-    switch(N) {
-    case 1: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 15)}; break;
-    case 2: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 14)}; break;
-    case 3: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 13)}; break;
-    case 4: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 12)}; break;
-    case 5: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 11)}; break;
-    case 6: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 10)}; break;
-    case 7: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 9)}; break;
-    case 8: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 8)}; break;
-    case 9: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 7)}; break;
-    case 10: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 6)}; break;
-    case 11: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 5)}; break;
-    case 12: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 4)}; break;
-    case 13: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 3)}; break;
-    case 14: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 2)}; break;
-    case 15: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 1)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
-}
-
 #ifdef HS_OPTIMIZE
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
@@ -272,10 +464,23 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-    return lshift128_var(N);
+    return vshl_128(N);  // runtime N: whole-vector byte shift via the unrolled dispatcher
 }
 #endif
 
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
+{
+    return Ones().vshr_128(N);  // all-ones shifted down N bytes: for N in 1..15 the low 16-N bytes stay 0xFF, the top N are zero
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
+{
+    return Ones().vshl_128(N);  // all-ones shifted up N bytes: for N in 1..15 the low N bytes are zero, the rest stay 0xFF
+}
+
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 {
@@ -293,10 +498,10 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
-    SuperVector<16> mask = Ones().rshift128_var(16 -len);
-    mask.print8("mask");
+    SuperVector mask = Ones_vshr(16 -len);  // 0xFF in the first len bytes, zero above (for len <= 16)
+    //mask.print8("mask");
     SuperVector<16> v = loadu(ptr);
-    v.print8("v");
+    //v.print8("v");
     return mask & v;
 }
 
@@ -314,124 +519,53 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {
-  switch(offset) {
-  case 0: return other; break;
-  case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;
-  case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
-  case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
-  case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
-  case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
-  case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
-  case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
-  case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
-  case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
-  case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
-  case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
-  case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
-  case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
-  case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
-  case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
-  case 16: return *this; break;
-  default: break;
-  }
-  return *this;
+    switch(offset) {  // vextq_s8 needs an immediate, so each runtime offset gets its own case
+    case 0: return other; break;  // no bytes taken from *this
+    case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;  // bytes k..15 of other, then bytes 0..k-1 of *this
+    case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
+    case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
+    case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
+    case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
+    case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
+    case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
+    case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
+    case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
+    case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
+    case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
+    case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
+    case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
+    case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
+    case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
+    case 16: return *this; break;  // no bytes taken from other
+    default: break;
+    }
+    return *this;  // any offset outside 0..16 (including negatives) also yields *this
 }
 #endif
 
 template<>
-really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
+{
+    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0])};  // native NEON table lookup: b holds byte indices, and any index >= 16 yields 0
+}
+
+template<>
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
 {
     /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
        In NEON, if >=16, then the result is zero, otherwise it is that lane.
        btranslated is the version that is converted from Intel to NEON.  */
-    int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0], vdupq_n_s8(0x8f));
-    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated)};
+    SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f);  // keep bit 7 (zeroing flag) and the low 4 index bits, drop bits 4..6
+    return pshufb<false>(btranslated);  // the masked index is either 0..15 or >= 0x80, which NEON maps to zero
 }
 
 template<>
 really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
 {
-    SuperVector<16> mask = Ones().rshift128_var(16 -len);
-    return mask & pshufb(b);
+    SuperVector mask = Ones_vshr(16 -len);  // 0xFF in the first len bytes, zero above (for len <= 16)
+    return mask & pshufb<true>(b);  // Intel-compatible shuffle, then zero every byte past len
 }
 
-#ifdef HS_OPTIMIZE
-template<>
-really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N)
-{
-  return {(m128)vshlq_n_s64(u.v128[0], N)};
-}
-#else
-template<>
-really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N)
-{
-  switch(N) {
-  case 0: return {(m128)vshlq_n_s64(u.v128[0], 0)}; break;
-  case 1: return {(m128)vshlq_n_s64(u.v128[0], 1)}; break;
-  case 2: return {(m128)vshlq_n_s64(u.v128[0], 2)}; break;
-  case 3: return {(m128)vshlq_n_s64(u.v128[0], 3)}; break;
-  case 4: return {(m128)vshlq_n_s64(u.v128[0], 4)}; break;
-  case 5: return {(m128)vshlq_n_s64(u.v128[0], 5)}; break;
-  case 6: return {(m128)vshlq_n_s64(u.v128[0], 6)}; break;
-  case 7: return {(m128)vshlq_n_s64(u.v128[0], 7)}; break;
-  case 8: return {(m128)vshlq_n_s64(u.v128[0], 8)}; break;
-  case 9: return {(m128)vshlq_n_s64(u.v128[0], 9)}; break;
-  case 10: return {(m128)vshlq_n_s64(u.v128[0], 10)}; break;
-  case 11: return {(m128)vshlq_n_s64(u.v128[0], 11)}; break;
-  case 12: return {(m128)vshlq_n_s64(u.v128[0], 12)}; break;
-  case 13: return {(m128)vshlq_n_s64(u.v128[0], 13)}; break;
-  case 14: return {(m128)vshlq_n_s64(u.v128[0], 14)}; break;
-  case 15: return {(m128)vshlq_n_s64(u.v128[0], 15)}; break;
-  default: break;
-  }
-  return *this;
-}
-#endif
-
-#ifdef HS_OPTIMIZE
-template<>
-really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
-{
-  return {(m128)vshrq_n_s64(u.v128[0], N)};
-}
-#else
-template<>
-really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
-{
-  switch(N) {
-  case 0: return {(m128)vshrq_n_s64(u.v128[0], 0)}; break;
-  case 1: return {(m128)vshrq_n_s64(u.v128[0], 1)}; break;
-  case 2: return {(m128)vshrq_n_s64(u.v128[0], 2)}; break;
-  case 3: return {(m128)vshrq_n_s64(u.v128[0], 3)}; break;
-  case 4: return {(m128)vshrq_n_s64(u.v128[0], 4)}; break;
-  case 5: return {(m128)vshrq_n_s64(u.v128[0], 5)}; break;
-  case 6: return {(m128)vshrq_n_s64(u.v128[0], 6)}; break;
-  case 7: return {(m128)vshrq_n_s64(u.v128[0], 7)}; break;
-  case 8: return {(m128)vshrq_n_s64(u.v128[0], 8)}; break;
-  case 9: return {(m128)vshrq_n_s64(u.v128[0], 9)}; break;
-  case 10: return {(m128)vshrq_n_s64(u.v128[0], 10)}; break;
-  case 11: return {(m128)vshrq_n_s64(u.v128[0], 11)}; break;
-  case 12: return {(m128)vshrq_n_s64(u.v128[0], 12)}; break;
-  case 13: return {(m128)vshrq_n_s64(u.v128[0], 13)}; break;
-  case 14: return {(m128)vshrq_n_s64(u.v128[0], 14)}; break;
-  case 15: return {(m128)vshrq_n_s64(u.v128[0], 15)}; break;
-  default: break;
-  }
-  return *this;
-}
-#endif
-
-template<>
-really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N)
-{
-  return *this << N;
-}
-
-template<>
-really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N)
-{
-  return *this >> N;
-}
-
-
 #endif // SIMD_IMPL_HPP
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index 718cd0f6..200783e1 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -174,8 +174,9 @@ public:
     double   f64[SIZE / sizeof(double)];
   } u;
 
-  SuperVector() {};
-  SuperVector(SuperVector const &other);
+  constexpr SuperVector() {}  // leaves the storage union uninitialised
+  constexpr SuperVector(SuperVector const &other)
+  :u(other.u) {}  // copies the whole storage union; defined inline instead of per architecture
   SuperVector(typename base_type::type const v);
 
   template<typename T>
@@ -198,11 +199,20 @@ public:
   SuperVector operator&(SuperVector const &b) const;
   SuperVector operator|(SuperVector const &b) const;
   SuperVector operator^(SuperVector const &b) const;
+  SuperVector operator!() const;  // vector-valued NOT: returns a SuperVector, not a bool
+
+  SuperVector operator==(SuperVector const &b) const;  // comparisons return a mask vector, not a bool
+  SuperVector operator!=(SuperVector const &b) const;
+  SuperVector operator>(SuperVector const &b) const;  // NOTE(review): the NEON impl compares signed bytes — confirm other arches match
+  SuperVector operator>=(SuperVector const &b) const;
+  SuperVector operator<(SuperVector const &b) const;
+  SuperVector operator<=(SuperVector const &b) const;
 
   SuperVector opand(SuperVector const &b) const { return *this & b; }
   SuperVector opor (SuperVector const &b) const { return *this | b; }
   SuperVector opxor(SuperVector const &b) const { return *this ^ b; }
   SuperVector opandnot(SuperVector const &b) const;
+  SuperVector opnot() const { return !(*this); }  // named form of operator!, alongside opand/opor/opxor
 
   SuperVector eq(SuperVector const &b) const;
   SuperVector operator<<(uint8_t const N) const;
@@ -215,6 +225,7 @@ public:
   static SuperVector loadu_maskz(void const *ptr, uint8_t const len);
   SuperVector alignr(SuperVector &other, int8_t offset);
 
+  template<bool emulateIntel>  // true: emulate Intel PSHUFB index handling; false: use the native shuffle as-is (see the NEON impl)
   SuperVector pshufb(SuperVector b);
   SuperVector pshufb_maskz(SuperVector b, uint8_t const len);
 
diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp
index 8b6830f0..16a59046 100644
--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -284,7 +284,7 @@ TEST(SuperVectorUtilsTest,pshufb128c) {
     }
     auto SP1 = SuperVector<16>::loadu(vec);
     auto SP2 = SuperVector<16>::loadu(vec2);
-    auto SResult = SP1.pshufb(SP2);
+    auto SResult = SP1.template pshufb<true>(SP2);
     for (int i=0; i<16; i++) {
         ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]);
     }