mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
add more functions, move defines here, enable inlining of template specializations only when running optimized code
This commit is contained in:
parent
d8b5eb5d17
commit
6526df81e4
@ -31,12 +31,18 @@
|
||||
#define SIMD_IMPL_HPP
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/unaligned.h"
|
||||
#include "util/simd/types.hpp"
|
||||
|
||||
#if !defined(m128) && defined(HAVE_SSE2)
|
||||
typedef __m128i m128;
|
||||
#endif
|
||||
|
||||
#if !defined(m128) && defined(HAVE_AVX2)
|
||||
#if !defined(m256) && defined(HAVE_AVX2)
|
||||
typedef __m256i m256;
|
||||
#endif
|
||||
|
||||
@ -44,6 +50,17 @@ typedef __m256i m256;
|
||||
typedef __m512i m512;
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
static inline void print_m128_16x8(const char *label, m128 vector) {
|
||||
uint8_t ALIGN_ATTR(16) data[16];
|
||||
_mm_store_si128 ((m128 *)data, vector);
|
||||
DEBUG_PRINTF("%s: ", label);
|
||||
for(int i=0; i < 16; i++)
|
||||
printf("%02x ", data[i]);
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
// 128-bit SSE implementation
|
||||
|
||||
template<>
|
||||
@ -114,6 +131,21 @@ really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const o)
|
||||
u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(o));
|
||||
}
|
||||
|
||||
// Constants
|
||||
// Returns a vector with all 128 bits set (every byte == 0xFF).
template<>
really_inline SuperVector<16> SuperVector<16>::Ones(void)
{
    return {_mm_set1_epi8(0xFF)};
}
|
||||
|
||||
// Returns the all-zero vector.
template<>
really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
{
    return {_mm_set1_epi8(0)};
}
|
||||
|
||||
// Methods
|
||||
|
||||
template <>
|
||||
really_inline void SuperVector<16>::operator=(SuperVector<16> const &o)
|
||||
{
|
||||
@ -126,6 +158,18 @@ really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const b
|
||||
return {_mm_and_si128(u.v128[0], b.u.v128[0])};
|
||||
}
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::mand(SuperVector<16> const b) const
|
||||
{
|
||||
return *this & b;
|
||||
}
|
||||
|
||||
// Computes (~*this) & b. Note the intrinsic's operand order:
// _mm_andnot_si128(a, b) == (~a) & b, so *this is the negated operand.
template <>
really_inline SuperVector<16> SuperVector<16>::mandnot(SuperVector<16> const b) const
{
    return {_mm_andnot_si128(u.v128[0], b.u.v128[0])};
}
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const b) const
|
||||
{
|
||||
@ -144,7 +188,7 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su
|
||||
return eq(b).movemask();
|
||||
}
|
||||
|
||||
#ifndef DEBUG
|
||||
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||
{
|
||||
@ -177,6 +221,38 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
// Byte-wise right shift of the whole vector by N bytes.
// _mm_srli_si128 requires a compile-time immediate shift count; this variant
// is only built under HS_OPTIMIZE, where inlining/constant propagation is
// expected to make N a constant — NOTE(review): confirm all callers on this
// path pass a compile-time-constant N.
template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{
    return {_mm_srli_si128(u.v128[0], N)};
}
|
||||
#else
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
||||
{
|
||||
switch(N) {
|
||||
case 0: return {_mm_srli_si128(u.v128[0], 0)}; break;
|
||||
case 1: return {_mm_srli_si128(u.v128[0], 1)}; break;
|
||||
case 2: return {_mm_srli_si128(u.v128[0], 2)}; break;
|
||||
case 3: return {_mm_srli_si128(u.v128[0], 3)}; break;
|
||||
case 4: return {_mm_srli_si128(u.v128[0], 4)}; break;
|
||||
case 5: return {_mm_srli_si128(u.v128[0], 5)}; break;
|
||||
case 6: return {_mm_srli_si128(u.v128[0], 6)}; break;
|
||||
case 7: return {_mm_srli_si128(u.v128[0], 7)}; break;
|
||||
case 8: return {_mm_srli_si128(u.v128[0], 8)}; break;
|
||||
case 9: return {_mm_srli_si128(u.v128[0], 9)}; break;
|
||||
case 10: return {_mm_srli_si128(u.v128[0], 10)}; break;
|
||||
case 11: return {_mm_srli_si128(u.v128[0], 11)}; break;
|
||||
case 12: return {_mm_srli_si128(u.v128[0], 12)}; break;
|
||||
case 13: return {_mm_srli_si128(u.v128[0], 13)}; break;
|
||||
case 14: return {_mm_srli_si128(u.v128[0], 14)}; break;
|
||||
case 15: return {_mm_srli_si128(u.v128[0], 15)}; break;
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
|
||||
@ -192,7 +268,21 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
|
||||
return _mm_load_si128((const m128 *)ptr);
|
||||
}
|
||||
|
||||
#ifndef DEBUG
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
|
||||
{
|
||||
uint8_t alignment = (uintptr_t)(ptr) & 15;
|
||||
printf("alignment = %d\n", alignment);
|
||||
SuperVector<16> maskb = Ones() << alignment;
|
||||
SuperVector<16> maske = Ones() >> (16 -len - alignment);
|
||||
print_m128_16x8("maskb", maskb.u.v128[0]);
|
||||
print_m128_16x8("maske", maske.u.v128[0]);
|
||||
SuperVector<16> v = _mm_loadu_si128((const m128 *)ptr);
|
||||
print_m128_16x8("v", v.u.v128[0]);
|
||||
return {maskb.u.v128[0] & maske.u.v128[0] & v.u.v128[0]};
|
||||
}
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template<>
|
||||
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> l, int8_t offset)
|
||||
{
|
||||
@ -225,20 +315,77 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> l, int8_t
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// Constants
|
||||
template<>
|
||||
really_inline SuperVector<16> SuperVector<16>::Ones(void)
|
||||
really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
|
||||
{
|
||||
return {_mm_set1_epi8(0xFF)};
|
||||
return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])};
|
||||
}
|
||||
|
||||
// Shift each 64-bit lane left by l bits (optimized build).
// FIX: the guard read "#ifdef HS_HS_OPTIMIZE" — a macro that is never
// defined anywhere, so this fast path was dead code; the project-wide flag
// is HS_OPTIMIZE (as used by operator<< / operator>> above).
// Also dropped stale pre-image Zeroes() lines left over from the diff view.
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
{
    return {_mm_slli_epi64(u.v128[0], l)};
}
|
||||
#else
|
||||
template<>
|
||||
really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
|
||||
{
|
||||
switch(l) {
|
||||
case 0: return {_mm_slli_epi64(u.v128[0], 0)}; break;
|
||||
case 1: return {_mm_slli_epi64(u.v128[0], 1)}; break;
|
||||
case 2: return {_mm_slli_epi64(u.v128[0], 2)}; break;
|
||||
case 3: return {_mm_slli_epi64(u.v128[0], 3)}; break;
|
||||
case 4: return {_mm_slli_epi64(u.v128[0], 4)}; break;
|
||||
case 5: return {_mm_slli_epi64(u.v128[0], 5)}; break;
|
||||
case 6: return {_mm_slli_epi64(u.v128[0], 6)}; break;
|
||||
case 7: return {_mm_slli_epi64(u.v128[0], 7)}; break;
|
||||
case 8: return {_mm_slli_epi64(u.v128[0], 8)}; break;
|
||||
case 9: return {_mm_slli_epi64(u.v128[0], 9)}; break;
|
||||
case 10: return {_mm_slli_epi64(u.v128[0], 10)}; break;
|
||||
case 11: return {_mm_slli_epi64(u.v128[0], 11)}; break;
|
||||
case 12: return {_mm_slli_epi64(u.v128[0], 12)}; break;
|
||||
case 13: return {_mm_slli_epi64(u.v128[0], 13)}; break;
|
||||
case 14: return {_mm_slli_epi64(u.v128[0], 14)}; break;
|
||||
case 15: return {_mm_slli_epi64(u.v128[0], 15)}; break;
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Shift each 64-bit lane right (logical) by l bits (optimized build).
// FIX: same typo as lshift64 — "#ifdef HS_HS_OPTIMIZE" references a macro
// that is never defined, making this path dead; the correct flag is
// HS_OPTIMIZE.
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
{
    return {_mm_srli_epi64(u.v128[0], l)};
}
|
||||
#else
|
||||
template<>
|
||||
really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
|
||||
{
|
||||
switch(l) {
|
||||
case 0: return {_mm_srli_epi64(u.v128[0], 0)}; break;
|
||||
case 1: return {_mm_srli_epi64(u.v128[0], 1)}; break;
|
||||
case 2: return {_mm_srli_epi64(u.v128[0], 2)}; break;
|
||||
case 3: return {_mm_srli_epi64(u.v128[0], 3)}; break;
|
||||
case 4: return {_mm_srli_epi64(u.v128[0], 4)}; break;
|
||||
case 5: return {_mm_srli_epi64(u.v128[0], 5)}; break;
|
||||
case 6: return {_mm_srli_epi64(u.v128[0], 6)}; break;
|
||||
case 7: return {_mm_srli_epi64(u.v128[0], 7)}; break;
|
||||
case 8: return {_mm_srli_epi64(u.v128[0], 8)}; break;
|
||||
case 9: return {_mm_srli_epi64(u.v128[0], 9)}; break;
|
||||
case 10: return {_mm_srli_epi64(u.v128[0], 10)}; break;
|
||||
case 11: return {_mm_srli_epi64(u.v128[0], 11)}; break;
|
||||
case 12: return {_mm_srli_epi64(u.v128[0], 12)}; break;
|
||||
case 13: return {_mm_srli_epi64(u.v128[0], 13)}; break;
|
||||
case 14: return {_mm_srli_epi64(u.v128[0], 14)}; break;
|
||||
case 15: return {_mm_srli_epi64(u.v128[0], 15)}; break;
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
// 256-bit AVX2 implementation
|
||||
#if defined(HAVE_AVX2)
|
||||
@ -386,24 +533,13 @@ really_inline SuperVector<32> SuperVector<32>::load(void const *ptr)
|
||||
ptr = assume_aligned(ptr, SuperVector::size);
|
||||
return {_mm256_load_si256((const m256 *)ptr)};
|
||||
}
|
||||
/*
|
||||
static void print1_m128_16x8(const char *label, __m128i vector) {
|
||||
uint8_t __attribute__((aligned((16)))) data[16];
|
||||
_mm_store_si128((__m128i*)data, vector);
|
||||
printf("%s : ", label);
|
||||
for(int i=0; i < 16; i++)
|
||||
printf("%02x ", data[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void print_m256_32x8(const char *label, __m256i vector) {
|
||||
uint8_t __attribute__((aligned((32)))) data[32];
|
||||
_mm256_store_si256((__m256i*)data, vector);
|
||||
printf("%s : ", label);
|
||||
for(int i=0; i < 32; i++)
|
||||
printf("%02x ", data[i]);
|
||||
printf("\n");
|
||||
}*/
|
||||
// Unaligned 256-bit load.
// NOTE(review): `len` is accepted but currently unused — no masking is
// applied and the full 32 bytes are loaded unconditionally. Confirm whether
// masking (as in the 128-bit loadu_maskz) is still TODO here, and whether
// over-reading past ptr+len is safe for all callers.
template <>
really_inline SuperVector<32> SuperVector<32>::loadu_mask(void const *ptr, size_t const len)
{
    return {_mm256_loadu_si256((const m256 *)ptr)};
}
|
||||
|
||||
#ifndef DEBUG
|
||||
template<>
|
@ -31,7 +31,7 @@
|
||||
typedef __m128i m128;
|
||||
#endif
|
||||
|
||||
#if !defined(m128) && defined(HAVE_AVX2)
|
||||
#if !defined(m256) && defined(HAVE_AVX2)
|
||||
typedef __m256i m256;
|
||||
#endif
|
||||
|
||||
|
@ -38,6 +38,43 @@
|
||||
#include "util/simd/arch/arm/types.hpp"
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_SIMD_512_BITS)
|
||||
using Z_TYPE = u64a;
|
||||
#define Z_BITS 64
|
||||
#define Z_SHIFT 63
|
||||
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l)))
|
||||
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#elif defined(HAVE_SIMD_256_BITS)
|
||||
using Z_TYPE = u32;
|
||||
#define Z_BITS 32
|
||||
#define Z_SHIFT 31
|
||||
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#elif defined(HAVE_SIMD_128_BITS)
|
||||
using Z_TYPE = u32;
|
||||
#define Z_BITS 32
|
||||
#define Z_SHIFT 0
|
||||
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
|
||||
#endif
|
||||
|
||||
// Define a common assume_aligned using an appropriate compiler built-in, if
|
||||
// it's available. Note that we need to handle C or C++ compilation.
|
||||
#ifdef __cplusplus
|
||||
# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
|
||||
# define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
|
||||
# endif
|
||||
#else
|
||||
# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
|
||||
# define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Fallback to identity case.
|
||||
#ifndef assume_aligned
|
||||
#define assume_aligned(x, y) (x)
|
||||
#endif
|
||||
|
||||
template <uint16_t SIZE>
|
||||
class SuperVector;
|
||||
|
||||
@ -124,16 +161,37 @@ public:
|
||||
template<typename T>
|
||||
SuperVector(T const o);
|
||||
|
||||
// Broadcast factories: splat one scalar across all lanes of the vector,
// selecting the lane width/signedness via the overloaded scalar constructor.
static SuperVector set1u_16x8(uint8_t o) { return {o}; };  // 16 lanes of u8
static SuperVector set1_16x8(int8_t o) { return {o}; };    // 16 lanes of i8
static SuperVector set1u_8x16(uint16_t o) { return {o}; }; // 8 lanes of u16
static SuperVector set1_8x16(int16_t o) { return {o}; };   // 8 lanes of i16
static SuperVector set1u_4x32(uint32_t o) { return {o}; }; // 4 lanes of u32
static SuperVector set1_4x32(int32_t o) { return {o}; };   // 4 lanes of i32
static SuperVector set1u_2x64(uint64_t o) { return {o}; }; // 2 lanes of u64
static SuperVector set1_2x64(int64_t o) { return {o}; };   // 2 lanes of i64
|
||||
|
||||
void operator=(SuperVector const &o);
|
||||
|
||||
SuperVector operator&(SuperVector const b) const;
|
||||
|
||||
SuperVector mand(SuperVector const b) const;
|
||||
SuperVector mandnot(SuperVector const b) const;
|
||||
|
||||
SuperVector eq(SuperVector const b) const;
|
||||
SuperVector operator<<(uint8_t const N) const;
|
||||
SuperVector operator>>(uint8_t const N) const;
|
||||
typename base_type::movemask_type movemask(void) const;
|
||||
typename base_type::movemask_type eqmask(SuperVector const b) const;
|
||||
|
||||
static SuperVector loadu(void const *ptr);
|
||||
static SuperVector load(void const *ptr);
|
||||
static SuperVector loadu_maskz(void const *ptr, uint8_t const len);
|
||||
SuperVector alignr(SuperVector l, int8_t offset);
|
||||
|
||||
SuperVector pshufb(SuperVector b);
|
||||
SuperVector lshift64(uint8_t const l);
|
||||
SuperVector rshift64(uint8_t const l);
|
||||
|
||||
// Constants
|
||||
static SuperVector Ones();
|
||||
static SuperVector Zeroes();
|
||||
@ -144,11 +202,13 @@ public:
|
||||
// class SuperVector<64>;
|
||||
// class SuperVector<128>;
|
||||
|
||||
#if defined(HS_OPTIMIZE)
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
#include "util/simd/arch/x86/impl.hpp"
|
||||
#include "util/simd/arch/x86/impl.cpp"
|
||||
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
|
||||
#include "util/simd/arch/arm/impl.hpp"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* SIMD_TYPES_H */
|
||||
|
Loading…
x
Reference in New Issue
Block a user