From 3e345c256770c229cf3cba66d07c350497566d3c Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Tue, 16 May 2017 11:05:53 +1000 Subject: [PATCH] If we can shift by an immediate, do it. Otherwise, don't. --- CMakeLists.txt | 1 + cmake/config.h.in | 3 ++ src/util/simd_utils.h | 40 ++++++++++++++++++++++---- unit/internal/simd_utils.cpp | 54 ++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e0f06b2..cfb1325c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -313,6 +313,7 @@ endif () # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) +CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) if (NOT WIN32) set(C_FLAGS_TO_CHECK diff --git a/cmake/config.h.in b/cmake/config.h.in index 62029cb9..9c250b4c 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -81,6 +81,9 @@ /* Define to 1 if you have the `_aligned_malloc' function. */ #cmakedefine HAVE__ALIGNED_MALLOC +/* Define if compiler has __builtin_constant_p */ +#cmakedefine HAVE__BUILTIN_CONSTANT_P + /* Optimize, inline critical functions */ #cmakedefine HS_OPTIMIZE diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index b4c0f7c8..047cdbab 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -123,7 +123,17 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { #endif } -#define lshift64_m128(a, b) _mm_slli_epi64((a), (b)) +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + #define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) @@ -339,7 +349,18 @@ m128 set64x2(u64a hi, u64a lo) { ****/ #if defined(HAVE_AVX2) -#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b)) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + #define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) static really_inline @@ -357,7 +378,7 @@ m256 set2x128(m128 a) { #else -static really_inline +static really_really_inline m256 lshift64_m256(m256 a, int b) { m256 rv = a; rv.lo = lshift64_m128(rv.lo, b); @@ -776,7 +797,6 @@ static really_inline m384 andnot384(m384 a, m384 b) { return rv; } -// The shift amount is an immediate static really_really_inline m384 lshift64_m384(m384 a, unsigned b) { m384 rv; @@ -1016,9 +1036,17 @@ m512 andnot512(m512 a, m512 b) { } #if defined(HAVE_AVX512) -#define lshift64_m512(a, b) _mm512_slli_epi64((a), b) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} #else -// The shift amount is an immediate static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { m512 rv; diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 0d3926d6..d3e34f52 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -143,6 +143,10 @@ void simd_loadbytes(m128 *a, const void *ptr, unsigned i) { *a = loadbytes128(pt void simd_loadbytes(m256 *a, const void *ptr, unsigned i) { *a = loadbytes256(ptr, i); } void simd_loadbytes(m384 *a, const void *ptr, unsigned i) { *a = loadbytes384(ptr, i); } void simd_loadbytes(m512 *a, const void *ptr, unsigned i) { *a = loadbytes512(ptr, i); } +m128 simd_lshift64(const m128 &a, unsigned i) { return lshift64_m128(a, i); } +m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); } +m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); } +m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); } template class SimdUtilsTest : public testing::Test { @@ -586,6 +590,56 @@ TYPED_TEST(SimdUtilsTest, loadbytes_storebytes) { } } +TYPED_TEST(SimdUtilsTest, lshift64) { + TypeParam a; + memset(&a, 0x5a, sizeof(a)); + + static constexpr u64a exp_val = 0x5a5a5a5a5a5a5a5aULL; + + union { + TypeParam simd; + u64a qword[sizeof(TypeParam) / 8]; + } c; + cout << "non-const for size " << sizeof(a) << '\n'; + for (unsigned s = 0; s < 64; s++) { + c.simd = simd_lshift64(a, s); + + const u64a expected = exp_val << s; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + } + + // test immediates + u64a expected; + + cout << "imm for size " << sizeof(a) << '\n'; + c.simd = simd_lshift64(a, 1); + expected = exp_val << 1; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + + c.simd = simd_lshift64(a, 2); + expected = exp_val << 2; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + + c.simd = simd_lshift64(a, 7); + expected = exp_val << 7; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + + c.simd = simd_lshift64(a, 31); + expected = exp_val << 31; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } +} + + TEST(SimdUtilsTest, alignment) { ASSERT_EQ(16, alignof(m128)); ASSERT_EQ(32, alignof(m256));