add missing ARM SuperVector methods, some tests still fail, WIP

This commit is contained in:
Konstantinos Margaritis 2021-06-11 13:33:01 +03:00
parent 5d9d958e74
commit acca824dea
4 changed files with 147 additions and 9 deletions

View File

@ -602,7 +602,7 @@ set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/x86/cpuid_flags.c
)
else (ARCH_ARM32 OR ARCH_AARCH64)
elseif (ARCH_ARM32 OR ARCH_AARCH64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/arm/cpuid_flags.c
@ -758,7 +758,7 @@ if (ARCH_IA32 OR ARCH_X86_64)
set (hs_exec_SRCS
${hs_exec_SRCS}
src/util/simd/arch/x86/impl.cpp)
else (ARCH_ARM32 OR ARCH_AARCH64)
elseif (ARCH_ARM32 OR ARCH_AARCH64)
set (hs_exec_SRCS
${hs_exec_SRCS}
src/util/simd/arch/arm/impl.cpp)

View File

@ -131,6 +131,8 @@ really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
return {vdupq_n_u8(0)};
}
// Methods
template <>
really_inline void SuperVector<16>::operator=(SuperVector<16> const &o)
{
@ -143,6 +145,24 @@ really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const b
return {vandq_s8(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const b) const
{
return {vandq_s8(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const b) const
{
return {vandq_s8(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const b) const
{
return {vandq_s8(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const b) const
{
@ -171,7 +191,7 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su
return eq(b).movemask();
}
#ifndef DEBUG
#ifndef HS_OPTIMIZE
template <>
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
{
@ -205,6 +225,38 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
}
#endif
#ifdef HS_OPTIMIZE
template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{
return {vshrq_n_s32(u.v128[0], N)};
}
#else
template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{
switch(N) {
case 0: return {vshrq_n_s32(u.v128[0], 0)}; break;
case 1: return {vshrq_n_s32(u.v128[0], 1)}; break;
case 2: return {vshrq_n_s32(u.v128[0], 2)}; break;
case 3: return {vshrq_n_s32(u.v128[0], 3)}; break;
case 4: return {vshrq_n_s32(u.v128[0], 4)}; break;
case 5: return {vshrq_n_s32(u.v128[0], 5)}; break;
case 6: return {vshrq_n_s32(u.v128[0], 6)}; break;
case 7: return {vshrq_n_s32(u.v128[0], 7)}; break;
case 8: return {vshrq_n_s32(u.v128[0], 8)}; break;
case 9: return {vshrq_n_s32(u.v128[0], 9)}; break;
case 10: return {vshrq_n_s32(u.v128[0], 10)}; break;
case 11: return {vshrq_n_s32(u.v128[0], 11)}; break;
case 12: return {vshrq_n_s32(u.v128[0], 12)}; break;
case 13: return {vshrq_n_s32(u.v128[0], 13)}; break;
case 14: return {vshrq_n_s32(u.v128[0], 14)}; break;
case 15: return {vshrq_n_s32(u.v128[0], 15)}; break;
default: break;
}
return *this;
}
#endif
template <>
really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
@ -217,10 +269,20 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
{
assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
ptr = assume_aligned(ptr, SuperVector::size);
return vld1q_s32((const int32_t *)ptr);
return {vld1q_s32((const int32_t *)ptr)};
}
#ifndef DEBUG
template <>
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
{
uint8_t alignment = (uintptr_t)(ptr) & 15;
SuperVector<16> maskb = Ones() << alignment;
SuperVector<16> maske = Ones() >> (16 -len - alignment);
SuperVector<16> v = SuperVector<16>::loadu((const m128 *)ptr);
return {maskb.u.v128[0] & maske.u.v128[0] & v.u.v128[0]};
}
#ifndef HS_OPTIMIZE
template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> r, int8_t offset)
{
@ -254,6 +316,81 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> l, int8_t
}
#endif
template<>
really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
{
/* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
In NEON, if >=16, then the result is zero, otherwise it is that lane.
btranslated is the version that is converted from Intel to NEON. */
int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0], vdupq_n_s8(0x8f));
return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated)};
}
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
{
return {(m128)vshlq_n_s64(u.v128[0], l)};
}
#else
template<>
really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
{
switch(l) {
case 0: return {vshlq_n_s64(u.v128[0], 0)}; break;
case 1: return {vshlq_n_s64(u.v128[0], 1)}; break;
case 2: return {vshlq_n_s64(u.v128[0], 2)}; break;
case 3: return {vshlq_n_s64(u.v128[0], 3)}; break;
case 4: return {vshlq_n_s64(u.v128[0], 4)}; break;
case 5: return {vshlq_n_s64(u.v128[0], 5)}; break;
case 6: return {vshlq_n_s64(u.v128[0], 6)}; break;
case 7: return {vshlq_n_s64(u.v128[0], 7)}; break;
case 8: return {vshlq_n_s64(u.v128[0], 8)}; break;
case 9: return {vshlq_n_s64(u.v128[0], 9)}; break;
case 10: return {vshlq_n_s64(u.v128[0], 10)}; break;
case 11: return {vshlq_n_s64(u.v128[0], 11)}; break;
case 12: return {vshlq_n_s64(u.v128[0], 12)}; break;
case 13: return {vshlq_n_s64(u.v128[0], 13)}; break;
case 14: return {vshlq_n_s64(u.v128[0], 14)}; break;
case 15: return {vshlq_n_s64(u.v128[0], 15)}; break;
default: break;
}
return *this;
}
#endif
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
{
return {(m128)vshrq_n_s64(u.v128[0], l)};
}
#else
template<>
really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
{
switch(l) {
case 0: return {vshrq_n_s64(u.v128[0], 0)}; break;
case 1: return {vshrq_n_s64(u.v128[0], 1)}; break;
case 2: return {vshrq_n_s64(u.v128[0], 2)}; break;
case 3: return {vshrq_n_s64(u.v128[0], 3)}; break;
case 4: return {vshrq_n_s64(u.v128[0], 4)}; break;
case 5: return {vshrq_n_s64(u.v128[0], 5)}; break;
case 6: return {vshrq_n_s64(u.v128[0], 6)}; break;
case 7: return {vshrq_n_s64(u.v128[0], 7)}; break;
case 8: return {vshrq_n_s64(u.v128[0], 8)}; break;
case 9: return {vshrq_n_s64(u.v128[0], 9)}; break;
case 10: return {vshrq_n_s64(u.v128[0], 10)}; break;
case 11: return {vshrq_n_s64(u.v128[0], 11)}; break;
case 12: return {vshrq_n_s64(u.v128[0], 12)}; break;
case 13: return {vshrq_n_s64(u.v128[0], 13)}; break;
case 14: return {vshrq_n_s64(u.v128[0], 14)}; break;
case 15: return {vshrq_n_s64(u.v128[0], 15)}; break;
default: break;
}
return *this;
}
#endif
#endif // SIMD_IMPL_HPP

View File

@ -165,13 +165,13 @@ really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const b
}
template <>
really_inline SuperVector<16> SuperVector<16>::mand(SuperVector<16> const b) const
really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const b) const
{
return *this & b;
}
template <>
really_inline SuperVector<16> SuperVector<16>::mandnot(SuperVector<16> const b) const
really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const b) const
{
return {_mm_andnot_si128(u.v128[0], b.u.v128[0])};
}

View File

@ -175,8 +175,9 @@ public:
SuperVector operator&(SuperVector const b) const;
SuperVector operator|(SuperVector const b) const;
SuperVector mand(SuperVector const b) const;
SuperVector mandnot(SuperVector const b) const;
SuperVector opand(SuperVector const b) const;
SuperVector opor(SuperVector const b) const;
SuperVector opandnot(SuperVector const b) const;
SuperVector eq(SuperVector const b) const;
SuperVector operator<<(uint8_t const N) const;