diff --git a/src/util/simd/arch/x86/impl.hpp b/src/util/simd/arch/x86/impl.cpp
similarity index 77%
rename from src/util/simd/arch/x86/impl.hpp
rename to src/util/simd/arch/x86/impl.cpp
index 90ad09e8..4e8acf94
--- a/src/util/simd/arch/x86/impl.hpp
+++ b/src/util/simd/arch/x86/impl.cpp
@@ -31,12 +31,18 @@
 #define SIMD_IMPL_HPP
 
 #include
+#include
+
+#include "ue2common.h"
+#include "util/arch.h"
+#include "util/unaligned.h"
+#include "util/simd/types.hpp"
 
 #if !defined(m128) && defined(HAVE_SSE2)
 typedef __m128i m128;
 #endif
 
-#if !defined(m128) && defined(HAVE_AVX2)
+#if !defined(m256) && defined(HAVE_AVX2)
 typedef __m256i m256;
 #endif
@@ -44,6 +50,17 @@ typedef __m256i m256;
 typedef __m512i m512;
 #endif
 
+#ifdef DEBUG
+static inline void print_m128_16x8(const char *label, m128 vector) {
+    uint8_t ALIGN_ATTR(16) data[16];
+    _mm_store_si128((m128 *)data, vector);
+    DEBUG_PRINTF("%s: ", label);
+    for (int i = 0; i < 16; i++)
+        printf("%02x ", data[i]);
+    printf("\n");
+}
+#endif
+
 // 128-bit SSE implementation
 
 template<>
@@ -114,6 +131,21 @@ really_inline SuperVector<16>::SuperVector(uint64_t const o)
     u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(o));
 }
 
+// Constants
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones(void)
+{
+    return {_mm_set1_epi8(0xFF)};
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
+{
+    return {_mm_set1_epi8(0)};
+}
+
+// Methods
+
 template <>
 really_inline void SuperVector<16>::operator=(SuperVector<16> const &o)
 {
@@ -126,6 +158,18 @@ really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const b
     return {_mm_and_si128(u.v128[0], b.u.v128[0])};
 }
 
+template <>
+really_inline SuperVector<16> SuperVector<16>::mand(SuperVector<16> const b) const
+{
+    return *this & b;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::mandnot(SuperVector<16> const b) const
+{
+    return {_mm_andnot_si128(u.v128[0], b.u.v128[0])};
+}
+
 template <>
 really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const b) const
 {
@@ -144,7 +188,7 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su
     return eq(b).movemask();
 }
 
-#ifndef DEBUG
+#ifdef HS_OPTIMIZE
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
@@ -177,6 +221,38 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 }
 #endif
 
+#ifdef HS_OPTIMIZE
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
+    return {_mm_srli_si128(u.v128[0], N)};
+}
+#else
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
+    switch(N) {
+    case 0: return {_mm_srli_si128(u.v128[0], 0)}; break;
+    case 1: return {_mm_srli_si128(u.v128[0], 1)}; break;
+    case 2: return {_mm_srli_si128(u.v128[0], 2)}; break;
+    case 3: return {_mm_srli_si128(u.v128[0], 3)}; break;
+    case 4: return {_mm_srli_si128(u.v128[0], 4)}; break;
+    case 5: return {_mm_srli_si128(u.v128[0], 5)}; break;
+    case 6: return {_mm_srli_si128(u.v128[0], 6)}; break;
+    case 7: return {_mm_srli_si128(u.v128[0], 7)}; break;
+    case 8: return {_mm_srli_si128(u.v128[0], 8)}; break;
+    case 9: return {_mm_srli_si128(u.v128[0], 9)}; break;
+    case 10: return {_mm_srli_si128(u.v128[0], 10)}; break;
+    case 11: return {_mm_srli_si128(u.v128[0], 11)}; break;
+    case 12: return {_mm_srli_si128(u.v128[0], 12)}; break;
+    case 13: return {_mm_srli_si128(u.v128[0], 13)}; break;
+    case 14: return {_mm_srli_si128(u.v128[0], 14)}; break;
+    case 15: return {_mm_srli_si128(u.v128[0], 15)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 {
@@ -192,7 +268,21 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
     return _mm_load_si128((const m128 *)ptr);
 }
 
-#ifndef DEBUG
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
+{
+    uint8_t alignment = (uintptr_t)(ptr) & 15;
+    SuperVector<16> maskb = Ones() << alignment;
+    SuperVector<16> maske = Ones() >> (16 - len - alignment);
+    SuperVector<16> v = _mm_loadu_si128((const m128 *)ptr);
+#ifdef DEBUG
+    DEBUG_PRINTF("alignment = %d\n", alignment);
+    print_m128_16x8("maskb", maskb.u.v128[0]);
+    print_m128_16x8("maske", maske.u.v128[0]);
+    print_m128_16x8("v", v.u.v128[0]);
+#endif
+    return {maskb.u.v128[0] & maske.u.v128[0] & v.u.v128[0]};
+}
+
+#ifdef HS_OPTIMIZE
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> l, int8_t offset)
 {
@@ -225,20 +315,77 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> l, int8_t
 }
 #endif
 
-
-// Constants
 template<>
-really_inline SuperVector<16> SuperVector<16>::Ones(void)
+really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
 {
-    return {_mm_set1_epi8(0xFF)};
+    return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])};
 }
 
-// Constants
+#ifdef HS_OPTIMIZE
 template<>
-really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
+really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
 {
-    return {_mm_set1_epi8(0)};
+    return {_mm_slli_epi64(u.v128[0], l)};
 }
+#else
+template<>
+really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
+{
+    switch(l) {
+    case 0: return {_mm_slli_epi64(u.v128[0], 0)}; break;
+    case 1: return {_mm_slli_epi64(u.v128[0], 1)}; break;
+    case 2: return {_mm_slli_epi64(u.v128[0], 2)}; break;
+    case 3: return {_mm_slli_epi64(u.v128[0], 3)}; break;
+    case 4: return {_mm_slli_epi64(u.v128[0], 4)}; break;
+    case 5: return {_mm_slli_epi64(u.v128[0], 5)}; break;
+    case 6: return {_mm_slli_epi64(u.v128[0], 6)}; break;
+    case 7: return {_mm_slli_epi64(u.v128[0], 7)}; break;
+    case 8: return {_mm_slli_epi64(u.v128[0], 8)}; break;
+    case 9: return {_mm_slli_epi64(u.v128[0], 9)}; break;
+    case 10: return {_mm_slli_epi64(u.v128[0], 10)}; break;
+    case 11: return {_mm_slli_epi64(u.v128[0], 11)}; break;
+    case 12: return {_mm_slli_epi64(u.v128[0], 12)}; break;
+    case 13: return {_mm_slli_epi64(u.v128[0], 13)}; break;
+    case 14: return {_mm_slli_epi64(u.v128[0], 14)}; break;
+    case 15: return {_mm_slli_epi64(u.v128[0], 15)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
+
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
+{
+    return {_mm_srli_epi64(u.v128[0], l)};
+}
+#else
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
+{
+    switch(l) {
+    case 0: return {_mm_srli_epi64(u.v128[0], 0)}; break;
+    case 1: return {_mm_srli_epi64(u.v128[0], 1)}; break;
+    case 2: return {_mm_srli_epi64(u.v128[0], 2)}; break;
+    case 3: return {_mm_srli_epi64(u.v128[0], 3)}; break;
+    case 4: return {_mm_srli_epi64(u.v128[0], 4)}; break;
+    case 5: return {_mm_srli_epi64(u.v128[0], 5)}; break;
+    case 6: return {_mm_srli_epi64(u.v128[0], 6)}; break;
+    case 7: return {_mm_srli_epi64(u.v128[0], 7)}; break;
+    case 8: return {_mm_srli_epi64(u.v128[0], 8)}; break;
+    case 9: return {_mm_srli_epi64(u.v128[0], 9)}; break;
+    case 10: return {_mm_srli_epi64(u.v128[0], 10)}; break;
+    case 11: return {_mm_srli_epi64(u.v128[0], 11)}; break;
+    case 12: return {_mm_srli_epi64(u.v128[0], 12)}; break;
+    case 13: return {_mm_srli_epi64(u.v128[0], 13)}; break;
+    case 14: return {_mm_srli_epi64(u.v128[0], 14)}; break;
+    case 15: return {_mm_srli_epi64(u.v128[0], 15)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
 
 // 256-bit AVX2 implementation
 #if defined(HAVE_AVX2)
@@ -386,24 +533,13 @@ really_inline SuperVector<32> SuperVector<32>::load(void const *ptr)
     ptr = assume_aligned(ptr, SuperVector::size);
     return {_mm256_load_si256((const m256 *)ptr)};
 }
-/*
-static void print1_m128_16x8(const char *label, __m128i vector) {
-    uint8_t __attribute__((aligned((16)))) data[16];
-    _mm_store_si128((__m128i*)data, vector);
-    printf("%s : ", label);
-    for(int i=0; i < 16; i++)
-        printf("%02x ", data[i]);
-    printf("\n");
-}
-static void print_m256_32x8(const char *label, __m256i vector) {
-    uint8_t __attribute__((aligned((32)))) data[32];
-    _mm256_store_si256((__m256i*)data, vector);
-    printf("%s : ", label);
-    for(int i=0; i < 32; i++)
-        printf("%02x ", data[i]);
-    printf("\n");
-}*/
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, UNUSED uint8_t const len)
+{
+    return {_mm256_loadu_si256((const m256 *)ptr)};
+}
 
 #ifndef DEBUG
 template<>
diff --git a/src/util/simd/arch/x86/types.hpp b/src/util/simd/arch/x86/types.hpp
index 1361d968..b6332781 100644
--- a/src/util/simd/arch/x86/types.hpp
+++ b/src/util/simd/arch/x86/types.hpp
@@ -31,7 +31,7 @@
 typedef __m128i m128;
 #endif
 
-#if !defined(m128) && defined(HAVE_AVX2)
+#if !defined(m256) && defined(HAVE_AVX2)
 typedef __m256i m256;
 #endif
diff --git a/src/util/simd/types.hpp b/src/util/simd/types.hpp
index 16b7e69a..7e18eb49 100644
--- a/src/util/simd/types.hpp
+++ b/src/util/simd/types.hpp
@@ -38,6 +38,43 @@
 #include "util/simd/arch/arm/types.hpp"
 #endif
 
+#if defined(HAVE_SIMD_512_BITS)
+using Z_TYPE = u64a;
+#define Z_BITS 64
+#define Z_SHIFT 63
+#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
+#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
+#elif defined(HAVE_SIMD_256_BITS)
+using Z_TYPE = u32;
+#define Z_BITS 32
+#define Z_SHIFT 31
+#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
+#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
+#elif defined(HAVE_SIMD_128_BITS)
+using Z_TYPE = u32;
+#define Z_BITS 32
+#define Z_SHIFT 0
+#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
+#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
+#endif
+
+// Define a common assume_aligned using an appropriate compiler built-in, if
+// it's available. Note that we need to handle C or C++ compilation.
+#ifdef __cplusplus
+# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
+#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
+# endif
+#else
+# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
+#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
+# endif
+#endif
+
+// Fallback to identity case.
+#ifndef assume_aligned
+#define assume_aligned(x, y) (x)
+#endif
+
 template <uint16_t SIZE>
 class SuperVector;
 
@@ -124,16 +161,37 @@ public:
     template <typename T>
     SuperVector(T const o);
 
+    static SuperVector set1u_16x8(uint8_t o) { return {o}; };
+    static SuperVector set1_16x8(int8_t o) { return {o}; };
+    static SuperVector set1u_8x16(uint16_t o) { return {o}; };
+    static SuperVector set1_8x16(int16_t o) { return {o}; };
+    static SuperVector set1u_4x32(uint32_t o) { return {o}; };
+    static SuperVector set1_4x32(int32_t o) { return {o}; };
+    static SuperVector set1u_2x64(uint64_t o) { return {o}; };
+    static SuperVector set1_2x64(int64_t o) { return {o}; };
+
     void operator=(SuperVector const &o);
+
     SuperVector operator&(SuperVector const b) const;
+
+    SuperVector mand(SuperVector const b) const;
+    SuperVector mandnot(SuperVector const b) const;
+
     SuperVector eq(SuperVector const b) const;
     SuperVector operator<<(uint8_t const N) const;
+    SuperVector operator>>(uint8_t const N) const;
     typename base_type::movemask_type movemask(void) const;
     typename base_type::movemask_type eqmask(SuperVector const b) const;
+
     static SuperVector loadu(void const *ptr);
     static SuperVector load(void const *ptr);
+    static SuperVector loadu_maskz(void const *ptr, uint8_t const len);
     SuperVector alignr(SuperVector l, int8_t offset);
+    SuperVector pshufb(SuperVector b);
+    SuperVector lshift64(uint8_t const l);
+    SuperVector rshift64(uint8_t const l);
+
     // Constants
     static SuperVector Ones();
     static SuperVector Zeroes();
@@ -144,11 +202,13 @@
 // class SuperVector<64>;
 // class SuperVector<128>;
 
+#if defined(HS_OPTIMIZE)
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
-#include "util/simd/arch/x86/impl.hpp"
+#include "util/simd/arch/x86/impl.cpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/simd/arch/arm/impl.hpp"
 #endif
+#endif
 
 #endif /* SIMD_TYPES_H */
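
Usage note (illustrative sketch, not part of the patch): loadu_maskz exists so that a caller scanning a buffer can pull in a final block shorter than a full vector without letting bytes past the valid length reach the match logic. The sketch below assumes a 16-byte-aligned base pointer and uses a hypothetical scan_block() handler; neither name comes from the patch.

    #include <cstddef>
    #include <cstdint>
    #include "util/simd/types.hpp"

    // Hypothetical per-block handler, stubbed out for the example.
    static void scan_block(SuperVector<16> const block) {
        (void)block;
    }

    static void scan_buffer(const uint8_t *buf, size_t len) {
        size_t i = 0;
        for (; i + 16 <= len; i += 16) {
            scan_block(SuperVector<16>::loadu(buf + i)); // full 16-byte blocks
        }
        if (i < len) {
            // Partial tail: the underlying _mm_loadu_si128 still performs a full
            // 16-byte read, but bytes beyond (len - i) come back as zero in the
            // returned vector (for an aligned buf + i), so stale data never
            // reaches scan_block().
            scan_block(SuperVector<16>::loadu_maskz(buf + i, len - i));
        }
    }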