mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
If we can shift by an immediate, do it. Otherwise, don't.
This commit is contained in:
parent
0275869b3e
commit
3e345c2567
@ -313,6 +313,7 @@ endif ()
|
|||||||
# testing a builtin takes a little more work
|
# testing a builtin takes a little more work
|
||||||
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
|
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
|
||||||
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
|
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
|
||||||
|
CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
|
||||||
|
|
||||||
if (NOT WIN32)
|
if (NOT WIN32)
|
||||||
set(C_FLAGS_TO_CHECK
|
set(C_FLAGS_TO_CHECK
|
||||||
|
@ -81,6 +81,9 @@
|
|||||||
/* Define to 1 if you have the `_aligned_malloc' function. */
|
/* Define to 1 if you have the `_aligned_malloc' function. */
|
||||||
#cmakedefine HAVE__ALIGNED_MALLOC
|
#cmakedefine HAVE__ALIGNED_MALLOC
|
||||||
|
|
||||||
|
/* Define if compiler has __builtin_constant_p */
|
||||||
|
#cmakedefine HAVE__BUILTIN_CONSTANT_P
|
||||||
|
|
||||||
/* Optimize, inline critical functions */
|
/* Optimize, inline critical functions */
|
||||||
#cmakedefine HS_OPTIMIZE
|
#cmakedefine HS_OPTIMIZE
|
||||||
|
|
||||||
|
@ -123,7 +123,17 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#define lshift64_m128(a, b) _mm_slli_epi64((a), (b))
|
static really_really_inline
|
||||||
|
m128 lshift64_m128(m128 a, unsigned b) {
|
||||||
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
|
if (__builtin_constant_p(b)) {
|
||||||
|
return _mm_slli_epi64(a, b);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
m128 x = _mm_cvtsi32_si128(b);
|
||||||
|
return _mm_sll_epi64(a, x);
|
||||||
|
}
|
||||||
|
|
||||||
#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
|
#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
|
||||||
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
|
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
|
||||||
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
|
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
|
||||||
@ -339,7 +349,18 @@ m128 set64x2(u64a hi, u64a lo) {
|
|||||||
****/
|
****/
|
||||||
|
|
||||||
#if defined(HAVE_AVX2)
|
#if defined(HAVE_AVX2)
|
||||||
#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
|
|
||||||
|
static really_really_inline
|
||||||
|
m256 lshift64_m256(m256 a, unsigned b) {
|
||||||
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
|
if (__builtin_constant_p(b)) {
|
||||||
|
return _mm256_slli_epi64(a, b);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
m128 x = _mm_cvtsi32_si128(b);
|
||||||
|
return _mm256_sll_epi64(a, x);
|
||||||
|
}
|
||||||
|
|
||||||
#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
|
#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
|
||||||
|
|
||||||
static really_inline
|
static really_inline
|
||||||
@ -357,7 +378,7 @@ m256 set2x128(m128 a) {
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static really_inline
|
static really_really_inline
|
||||||
m256 lshift64_m256(m256 a, int b) {
|
m256 lshift64_m256(m256 a, int b) {
|
||||||
m256 rv = a;
|
m256 rv = a;
|
||||||
rv.lo = lshift64_m128(rv.lo, b);
|
rv.lo = lshift64_m128(rv.lo, b);
|
||||||
@ -776,7 +797,6 @@ static really_inline m384 andnot384(m384 a, m384 b) {
|
|||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The shift amount is an immediate
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m384 lshift64_m384(m384 a, unsigned b) {
|
m384 lshift64_m384(m384 a, unsigned b) {
|
||||||
m384 rv;
|
m384 rv;
|
||||||
@ -1016,9 +1036,17 @@ m512 andnot512(m512 a, m512 b) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if defined(HAVE_AVX512)
|
#if defined(HAVE_AVX512)
|
||||||
#define lshift64_m512(a, b) _mm512_slli_epi64((a), b)
|
static really_really_inline
|
||||||
|
m512 lshift64_m512(m512 a, unsigned b) {
|
||||||
|
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||||
|
if (__builtin_constant_p(b)) {
|
||||||
|
return _mm512_slli_epi64(a, b);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
m128 x = _mm_cvtsi32_si128(b);
|
||||||
|
return _mm512_sll_epi64(a, x);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
// The shift amount is an immediate
|
|
||||||
static really_really_inline
|
static really_really_inline
|
||||||
m512 lshift64_m512(m512 a, unsigned b) {
|
m512 lshift64_m512(m512 a, unsigned b) {
|
||||||
m512 rv;
|
m512 rv;
|
||||||
|
@ -143,6 +143,10 @@ void simd_loadbytes(m128 *a, const void *ptr, unsigned i) { *a = loadbytes128(pt
|
|||||||
void simd_loadbytes(m256 *a, const void *ptr, unsigned i) { *a = loadbytes256(ptr, i); }
|
void simd_loadbytes(m256 *a, const void *ptr, unsigned i) { *a = loadbytes256(ptr, i); }
|
||||||
void simd_loadbytes(m384 *a, const void *ptr, unsigned i) { *a = loadbytes384(ptr, i); }
|
void simd_loadbytes(m384 *a, const void *ptr, unsigned i) { *a = loadbytes384(ptr, i); }
|
||||||
void simd_loadbytes(m512 *a, const void *ptr, unsigned i) { *a = loadbytes512(ptr, i); }
|
void simd_loadbytes(m512 *a, const void *ptr, unsigned i) { *a = loadbytes512(ptr, i); }
|
||||||
|
m128 simd_lshift64(const m128 &a, unsigned i) { return lshift64_m128(a, i); }
|
||||||
|
m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); }
|
||||||
|
m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); }
|
||||||
|
m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); }
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
class SimdUtilsTest : public testing::Test {
|
class SimdUtilsTest : public testing::Test {
|
||||||
@ -586,6 +590,56 @@ TYPED_TEST(SimdUtilsTest, loadbytes_storebytes) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TYPED_TEST(SimdUtilsTest, lshift64) {
|
||||||
|
TypeParam a;
|
||||||
|
memset(&a, 0x5a, sizeof(a));
|
||||||
|
|
||||||
|
static constexpr u64a exp_val = 0x5a5a5a5a5a5a5a5aULL;
|
||||||
|
|
||||||
|
union {
|
||||||
|
TypeParam simd;
|
||||||
|
u64a qword[sizeof(TypeParam) / 8];
|
||||||
|
} c;
|
||||||
|
cout << "non-const for size " << sizeof(a) << '\n';
|
||||||
|
for (unsigned s = 0; s < 64; s++) {
|
||||||
|
c.simd = simd_lshift64(a, s);
|
||||||
|
|
||||||
|
const u64a expected = exp_val << s;
|
||||||
|
for (size_t i = 0; i < sizeof(c) / 8; i++) {
|
||||||
|
EXPECT_EQ(expected, c.qword[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// test immediates
|
||||||
|
u64a expected;
|
||||||
|
|
||||||
|
cout << "imm for size " << sizeof(a) << '\n';
|
||||||
|
c.simd = simd_lshift64(a, 1);
|
||||||
|
expected = exp_val << 1;
|
||||||
|
for (size_t i = 0; i < sizeof(c) / 8; i++) {
|
||||||
|
EXPECT_EQ(expected, c.qword[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
c.simd = simd_lshift64(a, 2);
|
||||||
|
expected = exp_val << 2;
|
||||||
|
for (size_t i = 0; i < sizeof(c) / 8; i++) {
|
||||||
|
EXPECT_EQ(expected, c.qword[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
c.simd = simd_lshift64(a, 7);
|
||||||
|
expected = exp_val << 7;
|
||||||
|
for (size_t i = 0; i < sizeof(c) / 8; i++) {
|
||||||
|
EXPECT_EQ(expected, c.qword[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
c.simd = simd_lshift64(a, 31);
|
||||||
|
expected = exp_val << 31;
|
||||||
|
for (size_t i = 0; i < sizeof(c) / 8; i++) {
|
||||||
|
EXPECT_EQ(expected, c.qword[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST(SimdUtilsTest, alignment) {
|
TEST(SimdUtilsTest, alignment) {
|
||||||
ASSERT_EQ(16, alignof(m128));
|
ASSERT_EQ(16, alignof(m128));
|
||||||
ASSERT_EQ(32, alignof(m256));
|
ASSERT_EQ(32, alignof(m256));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user