From acca824deaaa5af69e891a5725f2a55bc2083cac Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Fri, 11 Jun 2021 13:33:01 +0300
Subject: [PATCH] add missing ARM SuperVector methods, some tests still fail, WIP

---
 CMakeLists.txt                  |   4 +-
 src/util/simd/arch/arm/impl.cpp | 143 +++++++++++++++++++++++++++++++-
 src/util/simd/arch/x86/impl.cpp |   4 +-
 src/util/simd/types.hpp         |   5 +-
 4 files changed, 147 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b46e610..7645ee56 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -602,7 +602,7 @@ set (hs_exec_common_SRCS
         ${hs_exec_common_SRCS}
         src/util/arch/x86/cpuid_flags.c
     )
-else (ARCH_ARM32 OR ARCH_AARCH64)
+elseif (ARCH_ARM32 OR ARCH_AARCH64)
     set (hs_exec_common_SRCS
         ${hs_exec_common_SRCS}
         src/util/arch/arm/cpuid_flags.c
@@ -758,7 +758,7 @@ if (ARCH_IA32 OR ARCH_X86_64)
     set (hs_exec_SRCS
         ${hs_exec_SRCS}
         src/util/simd/arch/x86/impl.cpp)
-else (ARCH_ARM32 OR ARCH_AARCH64)
+elseif (ARCH_ARM32 OR ARCH_AARCH64)
     set (hs_exec_SRCS
         ${hs_exec_SRCS}
         src/util/simd/arch/arm/impl.cpp)
diff --git a/src/util/simd/arch/arm/impl.cpp b/src/util/simd/arch/arm/impl.cpp
index 2c150489..75796a4b 100644
--- a/src/util/simd/arch/arm/impl.cpp
+++ b/src/util/simd/arch/arm/impl.cpp
@@ -131,6 +131,8 @@ really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
     return {vdupq_n_u8(0)};
 }
 
+// Methods
+
 template <>
 really_inline void SuperVector<16>::operator=(SuperVector<16> const &o)
 {
@@ -143,6 +145,24 @@ really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const b
     return {vandq_s8(u.v128[0], b.u.v128[0])};
 }
 
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const b) const
+{
+    return {vorrq_s8(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const b) const
+{
+    return {vandq_s8(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const b) const
+{
+    return {vbicq_s8(b.u.v128[0], u.v128[0])}; // ~a & b, matching x86 _mm_andnot_si128
+}
+
 template <>
 really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const b) const
 {
@@ -171,7 +191,7 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su
     return eq(b).movemask();
 }
 
-#ifndef DEBUG
+#ifndef HS_OPTIMIZE
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
@@ -205,6 +225,38 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 }
 #endif
 
+#ifdef HS_OPTIMIZE
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
+    return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), N)}; // byte shift right, like x86 _mm_srli_si128
+}
+#else
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
+    // whole-vector shift right by N bytes; vshrq_n_s32 would shift each 32-bit
+    // lane independently, which is not what mask builders like loadu_maskz expect
+    switch(N) {
+    case 0: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 0)}; break;
+    case 1: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 1)}; break;
+    case 2: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 2)}; break;
+    case 3: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 3)}; break;
+    case 4: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 4)}; break;
+    case 5: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 5)}; break;
+    case 6: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 6)}; break;
+    case 7: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 7)}; break;
+    case 8: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 8)}; break;
+    case 9: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 9)}; break;
+    case 10: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 10)}; break;
+    case 11: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 11)}; break;
+    case 12: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 12)}; break;
+    case 13: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 13)}; break;
+    case 14: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 14)}; break;
+    case 15: return {(m128)vextq_s8((int8x16_t)u.v128[0], vdupq_n_s8(0), 15)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
@@ -217,10 +269,20 @@ really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 {
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
     ptr = assume_aligned(ptr, SuperVector::size);
-    return vld1q_s32((const int32_t *)ptr);
+    return {vld1q_s32((const int32_t *)ptr)};
 }
 
-#ifndef DEBUG
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
+{
+    uint8_t alignment = (uintptr_t)(ptr) & 15;
+    SuperVector<16> maskb = Ones() << alignment;
+    SuperVector<16> maske = Ones() >> (16 - len - alignment);
+    SuperVector<16> v = SuperVector<16>::loadu((const m128 *)ptr);
+    return {maskb.u.v128[0] & maske.u.v128[0] & v.u.v128[0]};
+}
+
+#ifndef HS_OPTIMIZE
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> r, int8_t offset)
 {
@@ -254,6 +316,81 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> l, int8_t
 }
 #endif
 
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
+{
+    /* On Intel, a control byte with bit 0x80 set zeroes its result lane;
+       otherwise the low four bits (b & 0xf) select the source lane. On NEON,
+       any index >= 16 yields zero, so btranslated converts the Intel control
+       bytes into equivalent NEON ones. */
+    int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0], vdupq_n_s8(0x8f));
+    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated)};
+}
+
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
+{
+    return {(m128)vshlq_n_s64(u.v128[0], l)};
+}
+#else
+template<>
+really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
+{
+    switch(l) {
+    case 0: return {vshlq_n_s64(u.v128[0], 0)}; break;
+    case 1: return {vshlq_n_s64(u.v128[0], 1)}; break;
+    case 2: return {vshlq_n_s64(u.v128[0], 2)}; break;
+    case 3: return {vshlq_n_s64(u.v128[0], 3)}; break;
+    case 4: return {vshlq_n_s64(u.v128[0], 4)}; break;
+    case 5: return {vshlq_n_s64(u.v128[0], 5)}; break;
+    case 6: return {vshlq_n_s64(u.v128[0], 6)}; break;
+    case 7: return {vshlq_n_s64(u.v128[0], 7)}; break;
+    case 8: return {vshlq_n_s64(u.v128[0], 8)}; break;
+    case 9: return {vshlq_n_s64(u.v128[0], 9)}; break;
+    case 10: return {vshlq_n_s64(u.v128[0], 10)}; break;
+    case 11: return {vshlq_n_s64(u.v128[0], 11)}; break;
+    case 12: return {vshlq_n_s64(u.v128[0], 12)}; break;
+    case 13: return {vshlq_n_s64(u.v128[0], 13)}; break;
+    case 14: return {vshlq_n_s64(u.v128[0], 14)}; break;
+    case 15: return {vshlq_n_s64(u.v128[0], 15)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
+
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
+{
+    return {(m128)vshrq_n_s64(u.v128[0], l)};
+}
+#else
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
+{
+    switch(l) {
+    case 0: return *this; break; // vshrq_n_s64 immediates must be in [1, 64]
+    case 1: return {vshrq_n_s64(u.v128[0], 1)}; break;
+    case 2: return {vshrq_n_s64(u.v128[0], 2)}; break;
+    case 3: return {vshrq_n_s64(u.v128[0], 3)}; break;
+    case 4: return {vshrq_n_s64(u.v128[0], 4)}; break;
+    case 5: return {vshrq_n_s64(u.v128[0], 5)}; break;
+    case 6: return {vshrq_n_s64(u.v128[0], 6)}; break;
+    case 7: return {vshrq_n_s64(u.v128[0], 7)}; break;
+    case 8: return {vshrq_n_s64(u.v128[0], 8)}; break;
+    case 9: return {vshrq_n_s64(u.v128[0], 9)}; break;
+    case 10: return {vshrq_n_s64(u.v128[0], 10)}; break;
+    case 11: return {vshrq_n_s64(u.v128[0], 11)}; break;
+    case 12: return {vshrq_n_s64(u.v128[0], 12)}; break;
+    case 13: return {vshrq_n_s64(u.v128[0], 13)}; break;
+    case 14: return {vshrq_n_s64(u.v128[0], 14)}; break;
+    case 15: return {vshrq_n_s64(u.v128[0], 15)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
 
 #endif // SIMD_IMPL_HPP
diff --git a/src/util/simd/arch/x86/impl.cpp b/src/util/simd/arch/x86/impl.cpp
index 476d28ac..d3132519 100644
--- a/src/util/simd/arch/x86/impl.cpp
+++ b/src/util/simd/arch/x86/impl.cpp
@@ -165,13 +165,13 @@ really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const b
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::mand(SuperVector<16> const b) const
+really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const b) const
 {
     return *this & b;
 }
 
 template <>
-really_inline SuperVector<16> SuperVector<16>::mandnot(SuperVector<16> const b) const
+really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const b) const
 {
     return {_mm_andnot_si128(u.v128[0], b.u.v128[0])};
 }
diff --git a/src/util/simd/types.hpp b/src/util/simd/types.hpp
index a9883458..4c948888 100644
--- a/src/util/simd/types.hpp
+++ b/src/util/simd/types.hpp
@@ -175,8 +175,9 @@ public:
 
     SuperVector operator&(SuperVector const b) const;
     SuperVector operator|(SuperVector const b) const;
-    SuperVector mand(SuperVector const b) const;
-    SuperVector mandnot(SuperVector const b) const;
+    SuperVector opand(SuperVector const b) const;
+    SuperVector opor(SuperVector const b) const;
+    SuperVector opandnot(SuperVector const b) const;
 
     SuperVector eq(SuperVector const b) const;
     SuperVector operator<<(uint8_t const N) const;
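
Reviewer note (illustrative, not part of the commit): the standalone sketch below isolates the PSHUFB-to-TBL translation used in SuperVector<16>::pshufb() above, so the 0x8f masking trick can be sanity-checked off-tree. The file name, build line, and test values are assumptions; it only needs an AArch64 toolchain with NEON intrinsics.

// pshufb_check.cpp -- illustrative only; build on AArch64, e.g.:
//   g++ -O2 pshufb_check.cpp -o pshufb_check
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Same translation as SuperVector<16>::pshufb: keep bit 7 and the low nibble
// of each control byte. Indices with bit 7 set become >= 16, which
// vqtbl1q_s8 maps to zero, exactly like PSHUFB's 0x80 rule.
static int8x16_t pshufb_emul(int8x16_t v, int8x16_t control)
{
    int8x16_t ctl = vandq_s8(control, vdupq_n_s8(0x8f));
    return vqtbl1q_s8(v, vreinterpretq_u8_s8(ctl));
}

int main(void)
{
    const int8_t src[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    // 0x80 must give a zero lane; 0x13 has low nibble 3, so it selects lane 3
    const int8_t ctl[16] = {15, 14, 13, 12, (int8_t)0x80, 0x13, 2, 1,
                            0, 0, 0, 0, (int8_t)0x80, (int8_t)0x80, 7, 7};
    int8_t out[16];
    vst1q_s8(out, pshufb_emul(vld1q_s8(src), vld1q_s8(ctl)));
    for (int i = 0; i < 16; i++) printf("%d ", out[i]); // expect: 15 14 13 12 0 3 2 1 0 0 0 0 0 0 7 7
    printf("\n");
    return 0;
}

The mask value matters: 0x8f preserves bit 7 and bits 0-3 while zeroing bits 4-6, so every in-range index stays within [0, 15] and every out-of-range control byte stays out of range for TBL.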