From 8b7ba89cb5173dad9a04ebd01246655f4155427d Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Sat, 9 Oct 2021 00:31:13 +0300 Subject: [PATCH] add x86 vsh* implementations --- src/util/supervector/arch/x86/impl.cpp | 878 +++++++++++++++---------- unit/internal/supervector.cpp | 4 +- 2 files changed, 516 insertions(+), 366 deletions(-) diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 61107d58..eaee7424 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -38,6 +38,7 @@ #include "util/unaligned.h" #include "util/supervector/supervector.hpp" +#if (defined(FAT_RUNTIME) && !defined(HAVE_AVX2) && !defined(HAVE_AVX512)) || (!defined(FAT_RUNTIME) && defined(HAVE_SIMD_128_BITS)) // 128-bit SSE implementation template<> @@ -147,6 +148,12 @@ really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const & return {_mm_xor_si128(u.v128[0], b.u.v128[0])}; } +template <> +really_inline SuperVector<16> SuperVector<16>::operator!() const +{ + return {_mm_xor_si128(u.v128[0], u.v128[0])}; +} + template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { @@ -154,11 +161,47 @@ really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b } template <> -really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const +really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const { return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])}; } +template <> +really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const +{ + return !(*this == b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const +{ + return {_mm_cmpgt_epi8(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const +{ + return {_mm_cmplt_epi8(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const +{ + return !(*this < b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const +{ + return !(*this > b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const +{ + return (*this == b); +} + template <> really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const { @@ -256,6 +299,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_imm() const return vshr_128_imm(); } +#if !defined(HS_OPTIMIZE) template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const; template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const; template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const; @@ -266,6 +310,7 @@ template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const; template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const; template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const; template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; +#endif // template <> // really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const @@ -277,6 +322,11 @@ template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; template <> really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_epi16(u.v128[0], N)}; + } +#endif 
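+    // The guard above is purely an optimisation: when the compiler can prove N
+    // is a compile-time constant, __builtin_constant_p selects the immediate
+    // form of the shift; otherwise execution falls through to the existing
+    // runtime path below. The same pattern is repeated for the other
+    // vshl_*/vshr_* variants in this file.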
if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -287,6 +337,11 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_epi32(u.v128[0], N)}; + } +#endif if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -297,6 +352,11 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_epi64(u.v128[0], N)}; + } +#endif if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -307,6 +367,11 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_si128(u.v128[0], N)}; + } +#endif if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -332,6 +397,11 @@ really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_epi16(u.v128[0], N)}; + } +#endif if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -342,6 +412,11 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_epi32(u.v128[0], N)}; + } +#endif if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -352,6 +427,11 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_epi64(u.v128[0], N)}; + } +#endif if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -362,6 +442,11 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_si128(u.v128[0], N)}; + } +#endif if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; @@ -375,33 +460,27 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const return vshr_128(N); } -#ifdef HS_OPTIMIZE -template <> -really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const -{ - return {_mm_srli_si128(u.v128[0], N)}; -} -#else template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_si128(u.v128[0], N)}; + } +#endif return vshr_128(N); } -#endif -#ifdef HS_OPTIMIZE -template <> -really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const -{ - return {_mm_slli_si128(u.v128[0], N)}; -} -#else template <> 
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_si128(u.v128[0], N)}; + } +#endif return vshl_128(N); } -#endif template<> really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) @@ -475,7 +554,8 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in #endif template<> -really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) { return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; } @@ -487,8 +567,10 @@ really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, u return mask & pshufb(b); } +#endif // !defined(FAT_RUNTIME) && !defined(HAVE_AVX2) + // 256-bit AVX2 implementation -#if defined(HAVE_AVX2) +#if (defined(FAT_RUNTIME) && defined(HAVE_AVX2) && !defined(HAVE_AVX512)) || (!defined(FAT_RUNTIME) && defined(HAVE_AVX2)) template<> really_inline SuperVector<32>::SuperVector(SuperVector const &other) { @@ -615,6 +697,12 @@ really_inline SuperVector<32> SuperVector<32>::operator^(SuperVector<32> const & return {_mm256_xor_si256(u.v256[0], b.u.v256[0])}; } +template <> +really_inline SuperVector<32> SuperVector<32>::operator!() const +{ + return {_mm256_xor_si256(u.v256[0], u.v256[0])}; +} + template <> really_inline SuperVector<32> SuperVector<32>::opandnot(SuperVector<32> const &b) const { @@ -622,11 +710,47 @@ really_inline SuperVector<32> SuperVector<32>::opandnot(SuperVector<32> const &b } template <> -really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) const +really_inline SuperVector<32> SuperVector<32>::operator==(SuperVector<32> const &b) const { return {_mm256_cmpeq_epi8(u.v256[0], b.u.v256[0])}; } +template <> +really_inline SuperVector<32> SuperVector<32>::operator!=(SuperVector<32> const &b) const +{ + return !(*this == b); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator>(SuperVector<32> const &b) const +{ + return {_mm256_cmpgt_epi8(u.v256[0], b.u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator<(SuperVector<32> const &b) const +{ + return (b > *this); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator>=(SuperVector<32> const &b) const +{ + return !(*this < b); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator<=(SuperVector<32> const &b) const +{ + return !(*this > b); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) const +{ + return (*this == b); +} + template <> really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const { @@ -678,9 +802,23 @@ really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const template <> template -really_inline SuperVector<32> SuperVector<32>::vshr_imm() const +really_inline SuperVector<16> SuperVector<32>::vshl_256_imm() const { - return vshr_256_imm(); + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; + if (N == 32) return Zeroes(); + if (N < 16) { + return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; + } else { + return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)}; + } +} + +template <> +template +really_inline SuperVector<32> 
SuperVector<32>::vshl_imm() const +{ + return vshl_256_imm(); } // template <> @@ -713,11 +851,25 @@ really_inline SuperVector<32> SuperVector<32>::vshr_64_imm() const template <> template -really_inline SuperVector<16> SuperVector<32>::vshr_128_imm() const +really_inline SuperVector<32> SuperVector<32>::vshr_128_imm() const { return {_mm256_srli_si256(u.v256[0], N)}; } +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const +{ + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; + if (N == 32) return Zeroes(); + if (N < 16) { + return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; + } else { + return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)}; + } +} + template <> template really_inline SuperVector<32> SuperVector<32>::vshr_imm() const @@ -725,6 +877,7 @@ really_inline SuperVector<32> SuperVector<32>::vshr_imm() const return vshr_256_imm(); } +#if !defined(HS_OPTIMIZE) template SuperVector<32> SuperVector<32>::vshl_16_imm<1>() const; template SuperVector<32> SuperVector<32>::vshl_64_imm<1>() const; template SuperVector<32> SuperVector<32>::vshl_64_imm<4>() const; @@ -735,6 +888,9 @@ template SuperVector<32> SuperVector<32>::vshr_64_imm<1>() const; template SuperVector<32> SuperVector<32>::vshr_64_imm<4>() const; template SuperVector<32> SuperVector<32>::vshr_128_imm<1>() const; template SuperVector<32> SuperVector<32>::vshr_128_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshr_256_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_imm<1>() const; +#endif // template <> // really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const @@ -1026,7 +1182,8 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, in #endif template<> -really_inline SuperVector<32> SuperVector<32>::pshufb(SuperVector<32> b) +template<> +really_inline SuperVector<32> SuperVector<32>::pshufb(SuperVector<32> b) { return {_mm256_shuffle_epi8(u.v256[0], b.u.v256[0])}; } @@ -1152,7 +1309,6 @@ really_inline SuperVector<64> SuperVector<64>::Zeroes(void) return {_mm512_set1_epi8(0)}; } - // Methods template <> really_inline void SuperVector<64>::operator=(SuperVector<64> const &o) @@ -1178,27 +1334,72 @@ really_inline SuperVector<64> SuperVector<64>::operator^(SuperVector<64> const & return {_mm512_xor_si512(u.v512[0], b.u.v512[0])}; } +template <> +really_inline SuperVector<64> SuperVector<64>::operator!() const +{ + return {_mm512_xor_si512(u.v512[0], u.v512[0])}; +} + template <> really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b) const { return {_mm512_andnot_si512(u.v512[0], b.u.v512[0])}; } +template <> +really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const +{ + SuperVector<64>::movemask_type mask = _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const +{ + SuperVector<64>::movemask_type mask = _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const +{ + SuperVector<64>::movemask_type mask = _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + 
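+// Note: the AVX-512 byte comparisons above and below differ from the SSE/AVX2
+// versions in that they produce a 64-bit mask register (__mmask64) rather than
+// a vector, so _mm512_movm_epi8() is used to expand the mask back into a byte
+// vector (0xFF where the predicate holds, 0x00 elsewhere), keeping the
+// operators' return type consistent across the three implementations.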
+template <> +really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const +{ + SuperVector<64>::movemask_type mask = _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const +{ + SuperVector<64>::movemask_type mask = _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const +{ + SuperVector<64>::movemask_type mask = _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + template <> really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) const { - m512_t sp = SuperVector<64>::Zeroes(); - sp.u.v256[0] = _mm256_cmpeq_epi8(u.v256[0], b.u.v256[0]); - sp.u.v256[1] = _mm256_cmpeq_epi8(u.v256[1], b.u.v256[1]); - return {sp.u.v512[0]}; + return (*this == b); } template <> really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const -{ - m512_t msb = SuperVector<64>::dup_u8(0x80); - m512_t mask = msb & *this; - return _mm512_cmpeq_epi8_mask(mask.u.v512[0],msb.u.v512[0]); +{ + __m512i msb = _mm512_set1_epi8(0xFF); + __m512i mask = _mm512_and_si512(msb, u.v512[0]); + return _mm512_cmpeq_epi8_mask(mask, msb); } template <> @@ -1207,6 +1408,283 @@ really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(Su return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); } +// template <> +// template +// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const +// { +// const uint8_t i = N; +// return {_mm_slli_epi8(u.v128[0], i)}; +// } + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_16_imm() const +{ + return {_mm512_slli_epi16(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_32_imm() const +{ + return {_mm512_slli_epi32(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_64_imm() const +{ + return {_mm512_slli_epi64(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_128_imm() const +{ + return {_mm512_bslli_epi128(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_256_imm() const +{ + return {}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_512_imm() const +{ + return {}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_imm() const +{ + return vshl_512_imm(); +} + +// template <> +// template +// really_inline SuperVector<64> SuperVector<64>::vshr_8_imm() const +// { +// return {_mm_srli_epi8(u.v128[0], N)}; +// } + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_16_imm() const +{ + return {_mm512_srli_epi16(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_32_imm() const +{ + return {_mm512_srli_epi32(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_64_imm() const +{ + return {_mm512_srli_epi64(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_128_imm() const +{ + return {_mm512_bsrli_epi128(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const +{ + return {}; +} + +template <> +template +really_inline 
SuperVector<64> SuperVector<64>::vshr_512_imm() const +{ + return {}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_imm() const +{ + return vshr_512_imm(); +} + +#if !defined(HS_OPTIMIZE) +template SuperVector<64> SuperVector<64>::vshl_16_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshl_64_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshl_64_imm<4>() const; +template SuperVector<64> SuperVector<64>::vshl_128_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshl_128_imm<4>() const; +template SuperVector<64> SuperVector<64>::vshr_16_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshr_64_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshr_64_imm<4>() const; +template SuperVector<64> SuperVector<64>::vshr_128_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshr_128_imm<4>() const; +#endif + +// template <> +// really_inline SuperVector<64> SuperVector<64>::vshl_8 (uint8_t const N) const +// { +// Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) return Zeroes(); +// } +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi16(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi32(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi64(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bslli_epi128(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_256(uint8_t const N) const +{ + return vshl_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_512(uint8_t const N) const +{ + return vshl_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl(uint8_t const N) const +{ + return vshl_512(N); +} + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +// { +// SuperVector<16> result; +// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) result = Zeroes(); +// return result; +// } + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi16(v->u.v512[0], 
n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi32(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi64(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bsrli_epi128(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_256(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_512(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr(uint8_t const N) const +{ + return vshr_512(N); +} + +template<> +really_inline SuperVector<64> SuperVector<64>::Ones_vshr(uint8_t const N) +{ + if (N == 0) return Ones(); + if (N >= 32) + return {SuperVector<32>::Ones_vshr(N - 32), SuperVector<32>::Zeroes()}; + else + return {SuperVector<32>::Ones(), SuperVector<32>::Ones_vshr(N)}; +} + +template<> +really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N) +{ + if (N == 0) return Ones(); + if (N >= 32) + return {SuperVector<32>::Zeroes(), SuperVector<32>::Ones_vshl(N - 32)}; + else + return {SuperVector<32>::Ones_vshl(N), SuperVector<32>::Ones()}; +} + template <> really_inline SuperVector<64> SuperVector<64>::operator>>(uint8_t const N) const { @@ -1278,7 +1756,8 @@ really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, uint } template<> -really_inline SuperVector<64> SuperVector<64>::pshufb(SuperVector<64> b) +template<> +really_inline SuperVector<64> SuperVector<64>::pshufb(SuperVector<64> b) { return {_mm512_shuffle_epi8(u.v512[0], b.u.v512[0])}; } @@ -1301,7 +1780,7 @@ really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t template<> really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset) { - if(offset == 0){ + if(offset == 0) { return *this; } else if (offset < 32){ SuperVector<32> lo256 = u.v256[0]; @@ -1323,335 +1802,6 @@ really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t } #endif - -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<64> SuperVector<64>::lshift64(uint8_t const N) -{ - return {_mm512_slli_epi64(u.v512[0], N)}; -} -#else -template<> -really_inline SuperVector<64> SuperVector<64>::lshift64(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm512_slli_epi64(u.v512[0], 1)}; break; - case 2: return {_mm512_slli_epi64(u.v512[0], 2)}; break; - case 3: return {_mm512_slli_epi64(u.v512[0], 3)}; break; - case 4: return {_mm512_slli_epi64(u.v512[0], 4)}; break; - case 5: return {_mm512_slli_epi64(u.v512[0], 5)}; break; - case 6: return {_mm512_slli_epi64(u.v512[0], 6)}; break; - case 7: return 
{_mm512_slli_epi64(u.v512[0], 7)}; break; - case 8: return {_mm512_slli_epi64(u.v512[0], 8)}; break; - case 9: return {_mm512_slli_epi64(u.v512[0], 9)}; break; - case 10: return {_mm512_slli_epi64(u.v512[0], 10)}; break; - case 11: return {_mm512_slli_epi64(u.v512[0], 11)}; break; - case 12: return {_mm512_slli_epi64(u.v512[0], 12)}; break; - case 13: return {_mm512_slli_epi64(u.v512[0], 13)}; break; - case 14: return {_mm512_slli_epi64(u.v512[0], 14)}; break; - case 15: return {_mm512_slli_epi64(u.v512[0], 15)}; break; - case 16: return {_mm512_slli_epi64(u.v512[0], 16)}; break; - case 17: return {_mm512_slli_epi64(u.v512[0], 17)}; break; - case 18: return {_mm512_slli_epi64(u.v512[0], 18)}; break; - case 19: return {_mm512_slli_epi64(u.v512[0], 19)}; break; - case 20: return {_mm512_slli_epi64(u.v512[0], 20)}; break; - case 21: return {_mm512_slli_epi64(u.v512[0], 21)}; break; - case 22: return {_mm512_slli_epi64(u.v512[0], 22)}; break; - case 23: return {_mm512_slli_epi64(u.v512[0], 23)}; break; - case 24: return {_mm512_slli_epi64(u.v512[0], 24)}; break; - case 25: return {_mm512_slli_epi64(u.v512[0], 25)}; break; - case 26: return {_mm512_slli_epi64(u.v512[0], 26)}; break; - case 27: return {_mm512_slli_epi64(u.v512[0], 27)}; break; - case 28: return {_mm512_slli_epi64(u.v512[0], 28)}; break; - case 29: return {_mm512_slli_epi64(u.v512[0], 29)}; break; - case 30: return {_mm512_slli_epi64(u.v512[0], 30)}; break; - case 31: return {_mm512_slli_epi64(u.v512[0], 31)}; break; - case 32: return {_mm512_slli_epi64(u.v512[0], 32)}; break; - case 33: return {_mm512_slli_epi64(u.v512[0], 33)}; break; - case 34: return {_mm512_slli_epi64(u.v512[0], 34)}; break; - case 35: return {_mm512_slli_epi64(u.v512[0], 35)}; break; - case 36: return {_mm512_slli_epi64(u.v512[0], 36)}; break; - case 37: return {_mm512_slli_epi64(u.v512[0], 37)}; break; - case 38: return {_mm512_slli_epi64(u.v512[0], 38)}; break; - case 39: return {_mm512_slli_epi64(u.v512[0], 39)}; break; - case 40: return {_mm512_slli_epi64(u.v512[0], 40)}; break; - case 41: return {_mm512_slli_epi64(u.v512[0], 41)}; break; - case 42: return {_mm512_slli_epi64(u.v512[0], 42)}; break; - case 43: return {_mm512_slli_epi64(u.v512[0], 43)}; break; - case 44: return {_mm512_slli_epi64(u.v512[0], 44)}; break; - case 45: return {_mm512_slli_epi64(u.v512[0], 45)}; break; - case 46: return {_mm512_slli_epi64(u.v512[0], 46)}; break; - case 47: return {_mm512_slli_epi64(u.v512[0], 47)}; break; - case 48: return {_mm512_slli_epi64(u.v512[0], 48)}; break; - case 49: return {_mm512_slli_epi64(u.v512[0], 49)}; break; - case 50: return {_mm512_slli_epi64(u.v512[0], 50)}; break; - case 51: return {_mm512_slli_epi64(u.v512[0], 51)}; break; - case 52: return {_mm512_slli_epi64(u.v512[0], 52)}; break; - case 53: return {_mm512_slli_epi64(u.v512[0], 53)}; break; - case 54: return {_mm512_slli_epi64(u.v512[0], 54)}; break; - case 55: return {_mm512_slli_epi64(u.v512[0], 55)}; break; - case 56: return {_mm512_slli_epi64(u.v512[0], 56)}; break; - case 57: return {_mm512_slli_epi64(u.v512[0], 57)}; break; - case 58: return {_mm512_slli_epi64(u.v512[0], 58)}; break; - case 59: return {_mm512_slli_epi64(u.v512[0], 59)}; break; - case 60: return {_mm512_slli_epi64(u.v512[0], 60)}; break; - case 61: return {_mm512_slli_epi64(u.v512[0], 61)}; break; - case 62: return {_mm512_slli_epi64(u.v512[0], 62)}; break; - case 63: return {_mm512_slli_epi64(u.v512[0], 63)}; break; - case 64: return Zeroes(); - default: break; - } - return *this; -} -#endif - -#ifdef HS_OPTIMIZE 
-template<> -really_inline SuperVector<64> SuperVector<64>::rshift64(uint8_t const N) -{ - return {_mm512_srli_epi64(u.v512[0], N)}; -} -#else -template<> -really_inline SuperVector<64> SuperVector<64>::rshift64(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm512_srli_epi64(u.v512[0], 1)}; break; - case 2: return {_mm512_srli_epi64(u.v512[0], 2)}; break; - case 3: return {_mm512_srli_epi64(u.v512[0], 3)}; break; - case 4: return {_mm512_srli_epi64(u.v512[0], 4)}; break; - case 5: return {_mm512_srli_epi64(u.v512[0], 5)}; break; - case 6: return {_mm512_srli_epi64(u.v512[0], 6)}; break; - case 7: return {_mm512_srli_epi64(u.v512[0], 7)}; break; - case 8: return {_mm512_srli_epi64(u.v512[0], 8)}; break; - case 9: return {_mm512_srli_epi64(u.v512[0], 9)}; break; - case 10: return {_mm512_srli_epi64(u.v512[0], 10)}; break; - case 11: return {_mm512_srli_epi64(u.v512[0], 11)}; break; - case 12: return {_mm512_srli_epi64(u.v512[0], 12)}; break; - case 13: return {_mm512_srli_epi64(u.v512[0], 13)}; break; - case 14: return {_mm512_srli_epi64(u.v512[0], 14)}; break; - case 15: return {_mm512_srli_epi64(u.v512[0], 15)}; break; - case 16: return {_mm512_srli_epi64(u.v512[0], 16)}; break; - case 17: return {_mm512_srli_epi64(u.v512[0], 17)}; break; - case 18: return {_mm512_srli_epi64(u.v512[0], 18)}; break; - case 19: return {_mm512_srli_epi64(u.v512[0], 19)}; break; - case 20: return {_mm512_srli_epi64(u.v512[0], 20)}; break; - case 21: return {_mm512_srli_epi64(u.v512[0], 21)}; break; - case 22: return {_mm512_srli_epi64(u.v512[0], 22)}; break; - case 23: return {_mm512_srli_epi64(u.v512[0], 23)}; break; - case 24: return {_mm512_srli_epi64(u.v512[0], 24)}; break; - case 25: return {_mm512_srli_epi64(u.v512[0], 25)}; break; - case 26: return {_mm512_srli_epi64(u.v512[0], 26)}; break; - case 27: return {_mm512_srli_epi64(u.v512[0], 27)}; break; - case 28: return {_mm512_srli_epi64(u.v512[0], 28)}; break; - case 29: return {_mm512_srli_epi64(u.v512[0], 29)}; break; - case 30: return {_mm512_srli_epi64(u.v512[0], 30)}; break; - case 31: return {_mm512_srli_epi64(u.v512[0], 31)}; break; - case 32: return {_mm512_srli_epi64(u.v512[0], 32)}; break; - case 33: return {_mm512_srli_epi64(u.v512[0], 33)}; break; - case 34: return {_mm512_srli_epi64(u.v512[0], 34)}; break; - case 35: return {_mm512_srli_epi64(u.v512[0], 35)}; break; - case 36: return {_mm512_srli_epi64(u.v512[0], 36)}; break; - case 37: return {_mm512_srli_epi64(u.v512[0], 37)}; break; - case 38: return {_mm512_srli_epi64(u.v512[0], 38)}; break; - case 39: return {_mm512_srli_epi64(u.v512[0], 39)}; break; - case 40: return {_mm512_srli_epi64(u.v512[0], 40)}; break; - case 41: return {_mm512_srli_epi64(u.v512[0], 41)}; break; - case 42: return {_mm512_srli_epi64(u.v512[0], 42)}; break; - case 43: return {_mm512_srli_epi64(u.v512[0], 43)}; break; - case 44: return {_mm512_srli_epi64(u.v512[0], 44)}; break; - case 45: return {_mm512_srli_epi64(u.v512[0], 45)}; break; - case 46: return {_mm512_srli_epi64(u.v512[0], 46)}; break; - case 47: return {_mm512_srli_epi64(u.v512[0], 47)}; break; - case 48: return {_mm512_srli_epi64(u.v512[0], 48)}; break; - case 49: return {_mm512_srli_epi64(u.v512[0], 49)}; break; - case 50: return {_mm512_srli_epi64(u.v512[0], 50)}; break; - case 51: return {_mm512_srli_epi64(u.v512[0], 51)}; break; - case 52: return {_mm512_srli_epi64(u.v512[0], 52)}; break; - case 53: return {_mm512_srli_epi64(u.v512[0], 53)}; break; - case 54: return {_mm512_srli_epi64(u.v512[0], 54)}; break; - 
case 55: return {_mm512_srli_epi64(u.v512[0], 55)}; break; - case 56: return {_mm512_srli_epi64(u.v512[0], 56)}; break; - case 57: return {_mm512_srli_epi64(u.v512[0], 57)}; break; - case 58: return {_mm512_srli_epi64(u.v512[0], 58)}; break; - case 59: return {_mm512_srli_epi64(u.v512[0], 59)}; break; - case 60: return {_mm512_srli_epi64(u.v512[0], 60)}; break; - case 61: return {_mm512_srli_epi64(u.v512[0], 61)}; break; - case 62: return {_mm512_srli_epi64(u.v512[0], 62)}; break; - case 63: return {_mm512_srli_epi64(u.v512[0], 63)}; break; - case 64: return Zeroes(); - default: break; - } - return *this; -} -#endif - -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<64> SuperVector<64>::lshift128(uint8_t const N) -{ - return {_mm512_bslli_epi128(u.v512[0], N)}; -} -#else -template<> -really_inline SuperVector<64> SuperVector<64>::lshift128(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm512_bslli_epi128(u.v512[0], 1)}; break; - case 2: return {_mm512_bslli_epi128(u.v512[0], 2)}; break; - case 3: return {_mm512_bslli_epi128(u.v512[0], 3)}; break; - case 4: return {_mm512_bslli_epi128(u.v512[0], 4)}; break; - case 5: return {_mm512_bslli_epi128(u.v512[0], 5)}; break; - case 6: return {_mm512_bslli_epi128(u.v512[0], 6)}; break; - case 7: return {_mm512_bslli_epi128(u.v512[0], 7)}; break; - case 8: return {_mm512_bslli_epi128(u.v512[0], 8)}; break; - case 9: return {_mm512_bslli_epi128(u.v512[0], 9)}; break; - case 10: return {_mm512_bslli_epi128(u.v512[0], 10)}; break; - case 11: return {_mm512_bslli_epi128(u.v512[0], 11)}; break; - case 12: return {_mm512_bslli_epi128(u.v512[0], 12)}; break; - case 13: return {_mm512_bslli_epi128(u.v512[0], 13)}; break; - case 14: return {_mm512_bslli_epi128(u.v512[0], 14)}; break; - case 15: return {_mm512_bslli_epi128(u.v512[0], 15)}; break; - case 16: return {_mm512_bslli_epi128(u.v512[0], 16)}; break; - case 17: return {_mm512_bslli_epi128(u.v512[0], 17)}; break; - case 18: return {_mm512_bslli_epi128(u.v512[0], 18)}; break; - case 19: return {_mm512_bslli_epi128(u.v512[0], 19)}; break; - case 20: return {_mm512_bslli_epi128(u.v512[0], 20)}; break; - case 21: return {_mm512_bslli_epi128(u.v512[0], 21)}; break; - case 22: return {_mm512_bslli_epi128(u.v512[0], 22)}; break; - case 23: return {_mm512_bslli_epi128(u.v512[0], 23)}; break; - case 24: return {_mm512_bslli_epi128(u.v512[0], 24)}; break; - case 25: return {_mm512_bslli_epi128(u.v512[0], 25)}; break; - case 26: return {_mm512_bslli_epi128(u.v512[0], 26)}; break; - case 27: return {_mm512_bslli_epi128(u.v512[0], 27)}; break; - case 28: return {_mm512_bslli_epi128(u.v512[0], 28)}; break; - case 29: return {_mm512_bslli_epi128(u.v512[0], 29)}; break; - case 30: return {_mm512_bslli_epi128(u.v512[0], 30)}; break; - case 31: return {_mm512_bslli_epi128(u.v512[0], 31)}; break; - case 32: return {_mm512_bslli_epi128(u.v512[0], 32)}; break; - case 33: return {_mm512_bslli_epi128(u.v512[0], 33)}; break; - case 34: return {_mm512_bslli_epi128(u.v512[0], 34)}; break; - case 35: return {_mm512_bslli_epi128(u.v512[0], 35)}; break; - case 36: return {_mm512_bslli_epi128(u.v512[0], 36)}; break; - case 37: return {_mm512_bslli_epi128(u.v512[0], 37)}; break; - case 38: return {_mm512_bslli_epi128(u.v512[0], 38)}; break; - case 39: return {_mm512_bslli_epi128(u.v512[0], 39)}; break; - case 40: return {_mm512_bslli_epi128(u.v512[0], 40)}; break; - case 41: return {_mm512_bslli_epi128(u.v512[0], 41)}; break; - case 42: return {_mm512_bslli_epi128(u.v512[0], 42)}; break; 
- case 43: return {_mm512_bslli_epi128(u.v512[0], 43)}; break; - case 44: return {_mm512_bslli_epi128(u.v512[0], 44)}; break; - case 45: return {_mm512_bslli_epi128(u.v512[0], 45)}; break; - case 46: return {_mm512_bslli_epi128(u.v512[0], 46)}; break; - case 47: return {_mm512_bslli_epi128(u.v512[0], 47)}; break; - case 48: return {_mm512_bslli_epi128(u.v512[0], 48)}; break; - case 49: return {_mm512_bslli_epi128(u.v512[0], 49)}; break; - case 50: return {_mm512_bslli_epi128(u.v512[0], 50)}; break; - case 51: return {_mm512_bslli_epi128(u.v512[0], 51)}; break; - case 52: return {_mm512_bslli_epi128(u.v512[0], 52)}; break; - case 53: return {_mm512_bslli_epi128(u.v512[0], 53)}; break; - case 54: return {_mm512_bslli_epi128(u.v512[0], 54)}; break; - case 55: return {_mm512_bslli_epi128(u.v512[0], 55)}; break; - case 56: return {_mm512_bslli_epi128(u.v512[0], 56)}; break; - case 57: return {_mm512_bslli_epi128(u.v512[0], 57)}; break; - case 58: return {_mm512_bslli_epi128(u.v512[0], 58)}; break; - case 59: return {_mm512_bslli_epi128(u.v512[0], 59)}; break; - case 60: return {_mm512_bslli_epi128(u.v512[0], 60)}; break; - case 61: return {_mm512_bslli_epi128(u.v512[0], 61)}; break; - case 62: return {_mm512_bslli_epi128(u.v512[0], 62)}; break; - case 63: return {_mm512_bslli_epi128(u.v512[0], 63)}; break; - case 64: return Zeroes(); - default: break; - } - return *this; -} -#endif - -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<64> SuperVector<64>::rshift128(uint8_t const N) -{ - return {_mm512_bsrli_epi128(u.v512[0], N)}; -} -#else -template<> -really_inline SuperVector<64> SuperVector<64>::rshift128(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm512_bsrli_epi128(u.v512[0], 1)}; break; - case 2: return {_mm512_bsrli_epi128(u.v512[0], 2)}; break; - case 3: return {_mm512_bsrli_epi128(u.v512[0], 3)}; break; - case 4: return {_mm512_bsrli_epi128(u.v512[0], 4)}; break; - case 5: return {_mm512_bsrli_epi128(u.v512[0], 5)}; break; - case 6: return {_mm512_bsrli_epi128(u.v512[0], 6)}; break; - case 7: return {_mm512_bsrli_epi128(u.v512[0], 7)}; break; - case 8: return {_mm512_bsrli_epi128(u.v512[0], 8)}; break; - case 9: return {_mm512_bsrli_epi128(u.v512[0], 9)}; break; - case 10: return {_mm512_bsrli_epi128(u.v512[0], 10)}; break; - case 11: return {_mm512_bsrli_epi128(u.v512[0], 11)}; break; - case 12: return {_mm512_bsrli_epi128(u.v512[0], 12)}; break; - case 13: return {_mm512_bsrli_epi128(u.v512[0], 13)}; break; - case 14: return {_mm512_bsrli_epi128(u.v512[0], 14)}; break; - case 15: return {_mm512_bsrli_epi128(u.v512[0], 15)}; break; - case 16: return {_mm512_bsrli_epi128(u.v512[0], 16)}; break; - case 17: return {_mm512_bsrli_epi128(u.v512[0], 17)}; break; - case 18: return {_mm512_bsrli_epi128(u.v512[0], 18)}; break; - case 19: return {_mm512_bsrli_epi128(u.v512[0], 19)}; break; - case 20: return {_mm512_bsrli_epi128(u.v512[0], 20)}; break; - case 21: return {_mm512_bsrli_epi128(u.v512[0], 21)}; break; - case 22: return {_mm512_bsrli_epi128(u.v512[0], 22)}; break; - case 23: return {_mm512_bsrli_epi128(u.v512[0], 23)}; break; - case 24: return {_mm512_bsrli_epi128(u.v512[0], 24)}; break; - case 25: return {_mm512_bsrli_epi128(u.v512[0], 25)}; break; - case 26: return {_mm512_bsrli_epi128(u.v512[0], 26)}; break; - case 27: return {_mm512_bsrli_epi128(u.v512[0], 27)}; break; - case 28: return {_mm512_bsrli_epi128(u.v512[0], 28)}; break; - case 29: return {_mm512_bsrli_epi128(u.v512[0], 29)}; break; - case 30: return 
{_mm512_bsrli_epi128(u.v512[0], 30)}; break; - case 31: return {_mm512_bsrli_epi128(u.v512[0], 31)}; break; - case 32: return {_mm512_bsrli_epi128(u.v512[0], 32)}; break; - case 33: return {_mm512_bsrli_epi128(u.v512[0], 33)}; break; - case 34: return {_mm512_bsrli_epi128(u.v512[0], 34)}; break; - case 35: return {_mm512_bsrli_epi128(u.v512[0], 35)}; break; - case 36: return {_mm512_bsrli_epi128(u.v512[0], 36)}; break; - case 37: return {_mm512_bsrli_epi128(u.v512[0], 37)}; break; - case 38: return {_mm512_bsrli_epi128(u.v512[0], 38)}; break; - case 39: return {_mm512_bsrli_epi128(u.v512[0], 39)}; break; - case 40: return {_mm512_bsrli_epi128(u.v512[0], 40)}; break; - case 41: return {_mm512_bsrli_epi128(u.v512[0], 41)}; break; - case 42: return {_mm512_bsrli_epi128(u.v512[0], 42)}; break; - case 43: return {_mm512_bsrli_epi128(u.v512[0], 43)}; break; - case 44: return {_mm512_bsrli_epi128(u.v512[0], 44)}; break; - case 45: return {_mm512_bsrli_epi128(u.v512[0], 45)}; break; - case 46: return {_mm512_bsrli_epi128(u.v512[0], 46)}; break; - case 47: return {_mm512_bsrli_epi128(u.v512[0], 47)}; break; - case 48: return {_mm512_bsrli_epi128(u.v512[0], 48)}; break; - case 49: return {_mm512_bsrli_epi128(u.v512[0], 49)}; break; - case 50: return {_mm512_bsrli_epi128(u.v512[0], 50)}; break; - case 51: return {_mm512_bsrli_epi128(u.v512[0], 51)}; break; - case 52: return {_mm512_bsrli_epi128(u.v512[0], 52)}; break; - case 53: return {_mm512_bsrli_epi128(u.v512[0], 53)}; break; - case 54: return {_mm512_bsrli_epi128(u.v512[0], 54)}; break; - case 55: return {_mm512_bsrli_epi128(u.v512[0], 55)}; break; - case 56: return {_mm512_bsrli_epi128(u.v512[0], 56)}; break; - case 57: return {_mm512_bsrli_epi128(u.v512[0], 57)}; break; - case 58: return {_mm512_bsrli_epi128(u.v512[0], 58)}; break; - case 59: return {_mm512_bsrli_epi128(u.v512[0], 59)}; break; - case 60: return {_mm512_bsrli_epi128(u.v512[0], 60)}; break; - case 61: return {_mm512_bsrli_epi128(u.v512[0], 61)}; break; - case 62: return {_mm512_bsrli_epi128(u.v512[0], 62)}; break; - case 63: return {_mm512_bsrli_epi128(u.v512[0], 63)}; break; - case 64: return Zeroes(); - default: break; - } - return *this; -} -#endif - #endif // HAVE_AVX512 #endif // SIMD_IMPL_HPP diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index 16a59046..342f8fd4 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -920,7 +920,7 @@ TEST(SuperVectorUtilsTest,LShift64_512c){ u64a vec[8] = {32, 64, 128, 256, 512, 512, 256, 1024}; auto SP = SuperVector<64>::loadu(vec); for(int s = 0; s<64; s++) { - auto SP_after_shift = SP.lshift64(s); + auto SP_after_shift = SP.vshl_64(s); for (int i=0; i<8; i++) { ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] << s); } @@ -931,7 +931,7 @@ TEST(SuperVectorUtilsTest,RShift64_512c){ u64a vec[8] = {32, 64, 128, 256, 512, 512, 256, 1024}; auto SP = SuperVector<64>::loadu(vec); for(int s = 0; s<64; s++) { - auto SP_after_shift = SP.rshift64(s); + auto SP_after_shift = SP.vshr_64(s); for (int i=0; i<8; i++) { ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] >> s); }
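
For reference, a minimal standalone sketch (not part of the patch; assumes GCC/Clang
and SSE2, and the helper name shl16_lanes is illustrative only) of the dispatch idiom
the patch applies throughout: when the shift count is a compile-time constant,
__builtin_constant_p lets the immediate-form intrinsic be used directly, otherwise a
variable-count fallback handles the general case. The unit-test renames above
(lshift64 -> vshl_64, rshift64 -> vshr_64) exercise the consolidated API built on the
same idea.

    #include <emmintrin.h>
    #include <stdint.h>

    // Shift each 16-bit lane of v left by n bits.
    static inline __m128i shl16_lanes(__m128i v, uint8_t n)
    {
    #if defined(__GNUC__) || defined(__clang__)
        if (__builtin_constant_p(n)) {
            return _mm_slli_epi16(v, n);               // immediate-count encoding
        }
    #endif
        return _mm_sll_epi16(v, _mm_cvtsi32_si128(n)); // variable count via xmm register
    }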