diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index 2c4cf3d6..e40b6a38 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -40,83 +40,83 @@ template<> really_inline SuperVector<16>::SuperVector(SuperVector const &other) { - u.v128[0] = other.u.v128[0]; + u.v128[0] = other.u.v128[0]; } template<> really_inline SuperVector<16>::SuperVector(typename base_type::type const v) { - u.v128[0] = v; + u.v128[0] = v; }; template<> template<> really_inline SuperVector<16>::SuperVector(int8x16_t const other) { - u.v128[0] = static_cast(other); + u.v128[0] = static_cast(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint8x16_t const other) { - u.v128[0] = static_cast(other); + u.v128[0] = static_cast(other); } template<> template<> really_inline SuperVector<16>::SuperVector(int8_t const other) { - u.v128[0] = vdupq_n_s8(other); + u.v128[0] = vdupq_n_s8(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { - u.v128[0] = vdupq_n_u8(other); + u.v128[0] = vdupq_n_u8(other); } template<> template<> really_inline SuperVector<16>::SuperVector(int16_t const other) { - u.v128[0] = vdupq_n_s16(other); + u.v128[0] = vdupq_n_s16(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint16_t const other) { - u.v128[0] = vdupq_n_u16(other); + u.v128[0] = vdupq_n_u16(other); } template<> template<> really_inline SuperVector<16>::SuperVector(int32_t const other) { - u.v128[0] = vdupq_n_s32(other); + u.v128[0] = vdupq_n_s32(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { - u.v128[0] = vdupq_n_u32(other); + u.v128[0] = vdupq_n_u32(other); } template<> template<> really_inline SuperVector<16>::SuperVector(int64_t const other) { - u.v128[0] = vdupq_n_s64(other); + u.v128[0] = vdupq_n_s64(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint64_t const other) { - u.v128[0] = vdupq_n_u64(other); + u.v128[0] = vdupq_n_u64(other); } // Constants @@ -195,22 +195,11 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( template <> really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const { - return eq(b).movemask(); + return eq(b).movemask(); } -#ifdef HS_OPTIMIZE template <> -really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const -{ - if (N >= 16) { - return Zeroes(); - } else { - return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; - } -} -#else -template <> -really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const { switch(N) { case 1: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 1)}; break; @@ -233,21 +222,23 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const } return *this; } -#endif #ifdef HS_OPTIMIZE template <> -really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { - if (N == 0) { - return *this; - } else { - return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; - } + return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; } #else template <> -really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ + return rshift128_var(N); +} +#endif + +template <> +really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const { switch(N) { case 1: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 15)}; break; @@ -270,6 +261,19 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const } return *this; } + +#ifdef HS_OPTIMIZE +template <> +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +{ + return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; +} +#else +template <> +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +{ + return lshift128_var(N); +} #endif template <> @@ -289,7 +293,7 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) template <> really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { - SuperVector<16> mask = Ones() >> (16 -len); + SuperVector<16> mask = Ones().rshift128_var(16 -len); mask.print8("mask"); SuperVector<16> v = loadu(ptr); v.print8("v"); @@ -310,27 +314,27 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { - switch(offset) { - case 0: return other; break; - case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break; - case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break; - case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break; - case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break; - case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break; - case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break; - case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break; - case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break; - case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break; - case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break; - case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break; - case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break; - case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break; - case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break; - case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break; - case 16: return *this; break; - default: break; - } - return *this; + switch(offset) { + case 0: return other; break; + case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break; + case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break; + case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break; + case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break; + case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break; + case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break; + case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break; + case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break; + case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break; + case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break; + case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break; + case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break; + case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break; + case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break; + case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break; + case 16: return *this; break; + default: break; + } + return *this; } #endif @@ -348,32 +352,32 @@ really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) template<> really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) { - return {(m128)vshlq_n_s64(u.v128[0], N)}; + return {(m128)vshlq_n_s64(u.v128[0], N)}; } #else template<> really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) { - switch(N) { - case 0: return {(m128)vshlq_n_s64(u.v128[0], 0)}; break; - case 1: return {(m128)vshlq_n_s64(u.v128[0], 1)}; break; - case 2: return {(m128)vshlq_n_s64(u.v128[0], 2)}; break; - case 3: return {(m128)vshlq_n_s64(u.v128[0], 3)}; break; - case 4: return {(m128)vshlq_n_s64(u.v128[0], 4)}; break; - case 5: return {(m128)vshlq_n_s64(u.v128[0], 5)}; break; - case 6: return {(m128)vshlq_n_s64(u.v128[0], 6)}; break; - case 7: return {(m128)vshlq_n_s64(u.v128[0], 7)}; break; - case 8: return {(m128)vshlq_n_s64(u.v128[0], 8)}; break; - case 9: return {(m128)vshlq_n_s64(u.v128[0], 9)}; break; - case 10: return {(m128)vshlq_n_s64(u.v128[0], 10)}; break; - case 11: return {(m128)vshlq_n_s64(u.v128[0], 11)}; break; - case 12: return {(m128)vshlq_n_s64(u.v128[0], 12)}; break; - case 13: return {(m128)vshlq_n_s64(u.v128[0], 13)}; break; - case 14: return {(m128)vshlq_n_s64(u.v128[0], 14)}; break; - case 15: return {(m128)vshlq_n_s64(u.v128[0], 15)}; break; - default: break; - } - return *this; + switch(N) { + case 0: return {(m128)vshlq_n_s64(u.v128[0], 0)}; break; + case 1: return {(m128)vshlq_n_s64(u.v128[0], 1)}; break; + case 2: return {(m128)vshlq_n_s64(u.v128[0], 2)}; break; + case 3: return {(m128)vshlq_n_s64(u.v128[0], 3)}; break; + case 4: return {(m128)vshlq_n_s64(u.v128[0], 4)}; break; + case 5: return {(m128)vshlq_n_s64(u.v128[0], 5)}; break; + case 6: return {(m128)vshlq_n_s64(u.v128[0], 6)}; break; + case 7: return {(m128)vshlq_n_s64(u.v128[0], 7)}; break; + case 8: return {(m128)vshlq_n_s64(u.v128[0], 8)}; break; + case 9: return {(m128)vshlq_n_s64(u.v128[0], 9)}; break; + case 10: return {(m128)vshlq_n_s64(u.v128[0], 10)}; break; + case 11: return {(m128)vshlq_n_s64(u.v128[0], 11)}; break; + case 12: return {(m128)vshlq_n_s64(u.v128[0], 12)}; break; + case 13: return {(m128)vshlq_n_s64(u.v128[0], 13)}; break; + case 14: return {(m128)vshlq_n_s64(u.v128[0], 14)}; break; + case 15: return {(m128)vshlq_n_s64(u.v128[0], 15)}; break; + default: break; + } + return *this; } #endif @@ -381,45 +385,45 @@ really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) template<> really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) { - return {(m128)vshrq_n_s64(u.v128[0], N)}; + return {(m128)vshrq_n_s64(u.v128[0], N)}; } #else template<> really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) { - switch(N) { - case 0: return {(m128)vshrq_n_s64(u.v128[0], 0)}; break; - case 1: return {(m128)vshrq_n_s64(u.v128[0], 1)}; break; - case 2: return {(m128)vshrq_n_s64(u.v128[0], 2)}; break; - case 3: return {(m128)vshrq_n_s64(u.v128[0], 3)}; break; - case 4: return {(m128)vshrq_n_s64(u.v128[0], 4)}; break; - case 5: return {(m128)vshrq_n_s64(u.v128[0], 5)}; break; - case 6: return {(m128)vshrq_n_s64(u.v128[0], 6)}; break; - case 7: return {(m128)vshrq_n_s64(u.v128[0], 7)}; break; - case 8: return {(m128)vshrq_n_s64(u.v128[0], 8)}; break; - case 9: return {(m128)vshrq_n_s64(u.v128[0], 9)}; break; - case 10: return {(m128)vshrq_n_s64(u.v128[0], 10)}; break; - case 11: return {(m128)vshrq_n_s64(u.v128[0], 11)}; break; - case 12: return {(m128)vshrq_n_s64(u.v128[0], 12)}; break; - case 13: return {(m128)vshrq_n_s64(u.v128[0], 13)}; break; - case 14: return {(m128)vshrq_n_s64(u.v128[0], 14)}; break; - case 15: return {(m128)vshrq_n_s64(u.v128[0], 15)}; break; - default: break; - } - return *this; + switch(N) { + case 0: return {(m128)vshrq_n_s64(u.v128[0], 0)}; break; + case 1: return {(m128)vshrq_n_s64(u.v128[0], 1)}; break; + case 2: return {(m128)vshrq_n_s64(u.v128[0], 2)}; break; + case 3: return {(m128)vshrq_n_s64(u.v128[0], 3)}; break; + case 4: return {(m128)vshrq_n_s64(u.v128[0], 4)}; break; + case 5: return {(m128)vshrq_n_s64(u.v128[0], 5)}; break; + case 6: return {(m128)vshrq_n_s64(u.v128[0], 6)}; break; + case 7: return {(m128)vshrq_n_s64(u.v128[0], 7)}; break; + case 8: return {(m128)vshrq_n_s64(u.v128[0], 8)}; break; + case 9: return {(m128)vshrq_n_s64(u.v128[0], 9)}; break; + case 10: return {(m128)vshrq_n_s64(u.v128[0], 10)}; break; + case 11: return {(m128)vshrq_n_s64(u.v128[0], 11)}; break; + case 12: return {(m128)vshrq_n_s64(u.v128[0], 12)}; break; + case 13: return {(m128)vshrq_n_s64(u.v128[0], 13)}; break; + case 14: return {(m128)vshrq_n_s64(u.v128[0], 14)}; break; + case 15: return {(m128)vshrq_n_s64(u.v128[0], 15)}; break; + default: break; + } + return *this; } #endif template<> really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N) { - return *this << N; + return *this << N; } template<> really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N) { - return *this >> N; + return *this >> N; } diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 0017592f..d245e0ca 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -200,6 +200,8 @@ public: SuperVector rshift64(uint8_t const N); SuperVector lshift128(uint8_t const N); SuperVector rshift128(uint8_t const N); + SuperVector lshift128_var(uint8_t const N) const; + SuperVector rshift128_var(uint8_t const N) const; // Constants static SuperVector Ones();