If we can shift by an immediate, do it. Otherwise, don't.

This commit is contained in:
Matthew Barr 2017-05-16 11:05:53 +10:00
parent 0275869b3e
commit 3e345c2567
4 changed files with 92 additions and 6 deletions

View File

@ -313,6 +313,7 @@ endif ()
# testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
if (NOT WIN32)
set(C_FLAGS_TO_CHECK

View File

@ -81,6 +81,9 @@
/* Define to 1 if you have the `_aligned_malloc' function. */
#cmakedefine HAVE__ALIGNED_MALLOC
/* Define if compiler has __builtin_constant_p */
#cmakedefine HAVE__BUILTIN_CONSTANT_P
/* Optimize, inline critical functions */
#cmakedefine HS_OPTIMIZE

View File

@ -123,7 +123,17 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
#endif
}
#define lshift64_m128(a, b) _mm_slli_epi64((a), (b))
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return _mm_slli_epi64(a, b);
}
#endif
m128 x = _mm_cvtsi32_si128(b);
return _mm_sll_epi64(a, x);
}
#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
@ -339,7 +349,18 @@ m128 set64x2(u64a hi, u64a lo) {
****/
#if defined(HAVE_AVX2)
#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
static really_really_inline
m256 lshift64_m256(m256 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return _mm256_slli_epi64(a, b);
}
#endif
m128 x = _mm_cvtsi32_si128(b);
return _mm256_sll_epi64(a, x);
}
#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
static really_inline
@ -357,7 +378,7 @@ m256 set2x128(m128 a) {
#else
static really_inline
static really_really_inline
m256 lshift64_m256(m256 a, int b) {
m256 rv = a;
rv.lo = lshift64_m128(rv.lo, b);
@ -776,7 +797,6 @@ static really_inline m384 andnot384(m384 a, m384 b) {
return rv;
}
// The shift amount is an immediate
static really_really_inline
m384 lshift64_m384(m384 a, unsigned b) {
m384 rv;
@ -1016,9 +1036,17 @@ m512 andnot512(m512 a, m512 b) {
}
#if defined(HAVE_AVX512)
#define lshift64_m512(a, b) _mm512_slli_epi64((a), b)
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return _mm512_slli_epi64(a, b);
}
#endif
m128 x = _mm_cvtsi32_si128(b);
return _mm512_sll_epi64(a, x);
}
#else
// The shift amount is an immediate
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
m512 rv;

View File

@ -143,6 +143,10 @@ void simd_loadbytes(m128 *a, const void *ptr, unsigned i) { *a = loadbytes128(pt
void simd_loadbytes(m256 *a, const void *ptr, unsigned i) { *a = loadbytes256(ptr, i); }
void simd_loadbytes(m384 *a, const void *ptr, unsigned i) { *a = loadbytes384(ptr, i); }
void simd_loadbytes(m512 *a, const void *ptr, unsigned i) { *a = loadbytes512(ptr, i); }
m128 simd_lshift64(const m128 &a, unsigned i) { return lshift64_m128(a, i); }
m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); }
m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); }
m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); }
template<typename T>
class SimdUtilsTest : public testing::Test {
@ -586,6 +590,56 @@ TYPED_TEST(SimdUtilsTest, loadbytes_storebytes) {
}
}
TYPED_TEST(SimdUtilsTest, lshift64) {
TypeParam a;
memset(&a, 0x5a, sizeof(a));
static constexpr u64a exp_val = 0x5a5a5a5a5a5a5a5aULL;
union {
TypeParam simd;
u64a qword[sizeof(TypeParam) / 8];
} c;
cout << "non-const for size " << sizeof(a) << '\n';
for (unsigned s = 0; s < 64; s++) {
c.simd = simd_lshift64(a, s);
const u64a expected = exp_val << s;
for (size_t i = 0; i < sizeof(c) / 8; i++) {
EXPECT_EQ(expected, c.qword[i]);
}
}
// test immediates
u64a expected;
cout << "imm for size " << sizeof(a) << '\n';
c.simd = simd_lshift64(a, 1);
expected = exp_val << 1;
for (size_t i = 0; i < sizeof(c) / 8; i++) {
EXPECT_EQ(expected, c.qword[i]);
}
c.simd = simd_lshift64(a, 2);
expected = exp_val << 2;
for (size_t i = 0; i < sizeof(c) / 8; i++) {
EXPECT_EQ(expected, c.qword[i]);
}
c.simd = simd_lshift64(a, 7);
expected = exp_val << 7;
for (size_t i = 0; i < sizeof(c) / 8; i++) {
EXPECT_EQ(expected, c.qword[i]);
}
c.simd = simd_lshift64(a, 31);
expected = exp_val << 31;
for (size_t i = 0; i < sizeof(c) / 8; i++) {
EXPECT_EQ(expected, c.qword[i]);
}
}
TEST(SimdUtilsTest, alignment) {
ASSERT_EQ(16, alignof(m128));
ASSERT_EQ(32, alignof(m256));