From 290eabbca08e7e591ea53cfe3bf37bce5bc7f9fb Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Mon, 6 Dec 2021 18:22:58 +0000
Subject: [PATCH] fix compilation with clang and some incomplete/wrong
 implementations for arm this time

---
 src/util/arch/arm/simd_utils.h         | 238 ++++++++++++++++++++++++-
 src/util/supervector/arch/arm/impl.cpp |  64 ++++----
 2 files changed, 265 insertions(+), 37 deletions(-)

diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 4c68b485..96cd332c 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -122,24 +122,252 @@ m128 sub_2x64(m128 a, m128 b) {
     return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b);
 }
 
-static really_really_inline
+static really_inline
 m128 lshift_m128(m128 a, unsigned b) {
-    return (m128) vshlq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshlq_n_u32((uint32x4_t)a, b);
+    }
+#endif
+#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break;
+    switch (b) {
+    case 0: return a; break;
+    CASE_LSHIFT_m128(a, 1);
+    CASE_LSHIFT_m128(a, 2);
+    CASE_LSHIFT_m128(a, 3);
+    CASE_LSHIFT_m128(a, 4);
+    CASE_LSHIFT_m128(a, 5);
+    CASE_LSHIFT_m128(a, 6);
+    CASE_LSHIFT_m128(a, 7);
+    CASE_LSHIFT_m128(a, 8);
+    CASE_LSHIFT_m128(a, 9);
+    CASE_LSHIFT_m128(a, 10);
+    CASE_LSHIFT_m128(a, 11);
+    CASE_LSHIFT_m128(a, 12);
+    CASE_LSHIFT_m128(a, 13);
+    CASE_LSHIFT_m128(a, 14);
+    CASE_LSHIFT_m128(a, 15);
+    CASE_LSHIFT_m128(a, 16);
+    CASE_LSHIFT_m128(a, 17);
+    CASE_LSHIFT_m128(a, 18);
+    CASE_LSHIFT_m128(a, 19);
+    CASE_LSHIFT_m128(a, 20);
+    CASE_LSHIFT_m128(a, 21);
+    CASE_LSHIFT_m128(a, 22);
+    CASE_LSHIFT_m128(a, 23);
+    CASE_LSHIFT_m128(a, 24);
+    CASE_LSHIFT_m128(a, 25);
+    CASE_LSHIFT_m128(a, 26);
+    CASE_LSHIFT_m128(a, 27);
+    CASE_LSHIFT_m128(a, 28);
+    CASE_LSHIFT_m128(a, 29);
+    CASE_LSHIFT_m128(a, 30);
+    CASE_LSHIFT_m128(a, 31);
+    default: return zeroes128(); break;
+    }
+#undef CASE_LSHIFT_m128
 }
 
 static really_really_inline
 m128 rshift_m128(m128 a, unsigned b) {
-    return (m128) vshrq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshrq_n_u32((uint32x4_t)a, b);
+    }
+#endif
+#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break;
+    switch (b) {
+    case 0: return a; break;
+    CASE_RSHIFT_m128(a, 1);
+    CASE_RSHIFT_m128(a, 2);
+    CASE_RSHIFT_m128(a, 3);
+    CASE_RSHIFT_m128(a, 4);
+    CASE_RSHIFT_m128(a, 5);
+    CASE_RSHIFT_m128(a, 6);
+    CASE_RSHIFT_m128(a, 7);
+    CASE_RSHIFT_m128(a, 8);
+    CASE_RSHIFT_m128(a, 9);
+    CASE_RSHIFT_m128(a, 10);
+    CASE_RSHIFT_m128(a, 11);
+    CASE_RSHIFT_m128(a, 12);
+    CASE_RSHIFT_m128(a, 13);
+    CASE_RSHIFT_m128(a, 14);
+    CASE_RSHIFT_m128(a, 15);
+    CASE_RSHIFT_m128(a, 16);
+    CASE_RSHIFT_m128(a, 17);
+    CASE_RSHIFT_m128(a, 18);
+    CASE_RSHIFT_m128(a, 19);
+    CASE_RSHIFT_m128(a, 20);
+    CASE_RSHIFT_m128(a, 21);
+    CASE_RSHIFT_m128(a, 22);
+    CASE_RSHIFT_m128(a, 23);
+    CASE_RSHIFT_m128(a, 24);
+    CASE_RSHIFT_m128(a, 25);
+    CASE_RSHIFT_m128(a, 26);
+    CASE_RSHIFT_m128(a, 27);
+    CASE_RSHIFT_m128(a, 28);
+    CASE_RSHIFT_m128(a, 29);
+    CASE_RSHIFT_m128(a, 30);
+    CASE_RSHIFT_m128(a, 31);
+    default: return zeroes128(); break;
+    }
+#undef CASE_RSHIFT_m128
 }
 
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
-    return (m128) vshlq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshlq_n_u64((uint64x2_t)a, b);
+    }
+#endif
+#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break;
+    switch (b) {
+    case 0: return a; break;
+    CASE_LSHIFT64_m128(a, 1);
+    CASE_LSHIFT64_m128(a, 2);
+    CASE_LSHIFT64_m128(a, 3);
+    CASE_LSHIFT64_m128(a, 4);
+    CASE_LSHIFT64_m128(a, 5);
+    CASE_LSHIFT64_m128(a, 6);
+    CASE_LSHIFT64_m128(a, 7);
+    CASE_LSHIFT64_m128(a, 8);
+    CASE_LSHIFT64_m128(a, 9);
+    CASE_LSHIFT64_m128(a, 10);
+    CASE_LSHIFT64_m128(a, 11);
+    CASE_LSHIFT64_m128(a, 12);
+    CASE_LSHIFT64_m128(a, 13);
+    CASE_LSHIFT64_m128(a, 14);
+    CASE_LSHIFT64_m128(a, 15);
+    CASE_LSHIFT64_m128(a, 16);
+    CASE_LSHIFT64_m128(a, 17);
+    CASE_LSHIFT64_m128(a, 18);
+    CASE_LSHIFT64_m128(a, 19);
+    CASE_LSHIFT64_m128(a, 20);
+    CASE_LSHIFT64_m128(a, 21);
+    CASE_LSHIFT64_m128(a, 22);
+    CASE_LSHIFT64_m128(a, 23);
+    CASE_LSHIFT64_m128(a, 24);
+    CASE_LSHIFT64_m128(a, 25);
+    CASE_LSHIFT64_m128(a, 26);
+    CASE_LSHIFT64_m128(a, 27);
+    CASE_LSHIFT64_m128(a, 28);
+    CASE_LSHIFT64_m128(a, 29);
+    CASE_LSHIFT64_m128(a, 30);
+    CASE_LSHIFT64_m128(a, 31);
+    CASE_LSHIFT64_m128(a, 32);
+    CASE_LSHIFT64_m128(a, 33);
+    CASE_LSHIFT64_m128(a, 34);
+    CASE_LSHIFT64_m128(a, 35);
+    CASE_LSHIFT64_m128(a, 36);
+    CASE_LSHIFT64_m128(a, 37);
+    CASE_LSHIFT64_m128(a, 38);
+    CASE_LSHIFT64_m128(a, 39);
+    CASE_LSHIFT64_m128(a, 40);
+    CASE_LSHIFT64_m128(a, 41);
+    CASE_LSHIFT64_m128(a, 42);
+    CASE_LSHIFT64_m128(a, 43);
+    CASE_LSHIFT64_m128(a, 44);
+    CASE_LSHIFT64_m128(a, 45);
+    CASE_LSHIFT64_m128(a, 46);
+    CASE_LSHIFT64_m128(a, 47);
+    CASE_LSHIFT64_m128(a, 48);
+    CASE_LSHIFT64_m128(a, 49);
+    CASE_LSHIFT64_m128(a, 50);
+    CASE_LSHIFT64_m128(a, 51);
+    CASE_LSHIFT64_m128(a, 52);
+    CASE_LSHIFT64_m128(a, 53);
+    CASE_LSHIFT64_m128(a, 54);
+    CASE_LSHIFT64_m128(a, 55);
+    CASE_LSHIFT64_m128(a, 56);
+    CASE_LSHIFT64_m128(a, 57);
+    CASE_LSHIFT64_m128(a, 58);
+    CASE_LSHIFT64_m128(a, 59);
+    CASE_LSHIFT64_m128(a, 60);
+    CASE_LSHIFT64_m128(a, 61);
+    CASE_LSHIFT64_m128(a, 62);
+    CASE_LSHIFT64_m128(a, 63);
+    default: return zeroes128(); break;
+    }
+#undef CASE_LSHIFT64_m128
 }
 
 static really_really_inline
 m128 rshift64_m128(m128 a, unsigned b) {
-    return (m128) vshrq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshrq_n_u64((uint64x2_t)a, b);
+    }
+#endif
+#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break;
+    switch (b) {
+    case 0: return a; break;
+    CASE_RSHIFT64_m128(a, 1);
+    CASE_RSHIFT64_m128(a, 2);
+    CASE_RSHIFT64_m128(a, 3);
+    CASE_RSHIFT64_m128(a, 4);
+    CASE_RSHIFT64_m128(a, 5);
+    CASE_RSHIFT64_m128(a, 6);
+    CASE_RSHIFT64_m128(a, 7);
+    CASE_RSHIFT64_m128(a, 8);
+    CASE_RSHIFT64_m128(a, 9);
+    CASE_RSHIFT64_m128(a, 10);
+    CASE_RSHIFT64_m128(a, 11);
+    CASE_RSHIFT64_m128(a, 12);
+    CASE_RSHIFT64_m128(a, 13);
+    CASE_RSHIFT64_m128(a, 14);
+    CASE_RSHIFT64_m128(a, 15);
+    CASE_RSHIFT64_m128(a, 16);
+    CASE_RSHIFT64_m128(a, 17);
+    CASE_RSHIFT64_m128(a, 18);
+    CASE_RSHIFT64_m128(a, 19);
+    CASE_RSHIFT64_m128(a, 20);
+    CASE_RSHIFT64_m128(a, 21);
+    CASE_RSHIFT64_m128(a, 22);
+    CASE_RSHIFT64_m128(a, 23);
+    CASE_RSHIFT64_m128(a, 24);
+    CASE_RSHIFT64_m128(a, 25);
+    CASE_RSHIFT64_m128(a, 26);
+    CASE_RSHIFT64_m128(a, 27);
+    CASE_RSHIFT64_m128(a, 28);
+    CASE_RSHIFT64_m128(a, 29);
+    CASE_RSHIFT64_m128(a, 30);
+    CASE_RSHIFT64_m128(a, 31);
+    CASE_RSHIFT64_m128(a, 32);
+    CASE_RSHIFT64_m128(a, 33);
+    CASE_RSHIFT64_m128(a, 34);
+    CASE_RSHIFT64_m128(a, 35);
+    CASE_RSHIFT64_m128(a, 36);
+    CASE_RSHIFT64_m128(a, 37);
+    CASE_RSHIFT64_m128(a, 38);
+    CASE_RSHIFT64_m128(a, 39);
+    CASE_RSHIFT64_m128(a, 40);
+    CASE_RSHIFT64_m128(a, 41);
+    CASE_RSHIFT64_m128(a, 42);
+    CASE_RSHIFT64_m128(a, 43);
+    CASE_RSHIFT64_m128(a, 44);
+    CASE_RSHIFT64_m128(a, 45);
+    CASE_RSHIFT64_m128(a, 46);
+    CASE_RSHIFT64_m128(a, 47);
+    CASE_RSHIFT64_m128(a, 48);
+    CASE_RSHIFT64_m128(a, 49);
+    CASE_RSHIFT64_m128(a, 50);
+    CASE_RSHIFT64_m128(a, 51);
+    CASE_RSHIFT64_m128(a, 52);
+    CASE_RSHIFT64_m128(a, 53);
+    CASE_RSHIFT64_m128(a, 54);
+    CASE_RSHIFT64_m128(a, 55);
+    CASE_RSHIFT64_m128(a, 56);
+    CASE_RSHIFT64_m128(a, 57);
+    CASE_RSHIFT64_m128(a, 58);
+    CASE_RSHIFT64_m128(a, 59);
+    CASE_RSHIFT64_m128(a, 60);
+    CASE_RSHIFT64_m128(a, 61);
+    CASE_RSHIFT64_m128(a, 62);
+    CASE_RSHIFT64_m128(a, 63);
+    default: return zeroes128(); break;
+    }
+#undef CASE_RSHIFT64_m128
 }
 
 static really_inline m128 eq128(m128 a, m128 b) {
diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp
index 980f0b39..ff1149a9 100644
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -45,112 +45,112 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int8x16_t other) 
+really_inline SuperVector<16>::SuperVector(int8x16_t other)
 {
     u.s8x16[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint8x16_t other) 
+really_inline SuperVector<16>::SuperVector(uint8x16_t other)
 {
     u.u8x16[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int16x8_t other) 
+really_inline SuperVector<16>::SuperVector(int16x8_t other)
 {
     u.s16x8[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint16x8_t other) 
+really_inline SuperVector<16>::SuperVector(uint16x8_t other)
 {
     u.u16x8[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int32x4_t other) 
+really_inline SuperVector<16>::SuperVector(int32x4_t other)
 {
     u.s32x4[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint32x4_t other) 
+really_inline SuperVector<16>::SuperVector(uint32x4_t other)
 {
     u.u32x4[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int64x2_t other) 
+really_inline SuperVector<16>::SuperVector(int64x2_t other)
 {
     u.s64x2[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint64x2_t other) 
+really_inline SuperVector<16>::SuperVector(uint64x2_t other)
 {
     u.u64x2[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int8_t const other) 
+really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
     u.s8x16[0] = vdupq_n_s8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint8_t const other) 
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
     u.u8x16[0] = vdupq_n_u8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int16_t const other) 
+really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
     u.s16x8[0] = vdupq_n_s16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint16_t const other) 
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
     u.u16x8[0] = vdupq_n_u16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int32_t const other) 
+really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
     u.s32x4[0] = vdupq_n_s32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint32_t const other) 
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
     u.u32x4[0] = vdupq_n_u32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(int64_t const other) 
+really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
     u.s64x2[0] = vdupq_n_s64(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector(uint64_t const other) 
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
     u.u64x2[0] = vdupq_n_u64(other);
 }
@@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 8) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(u.u8x16[0], n)}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
     return result;
 }
 
@@ -386,7 +386,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
 {
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(u.u16x8[0], n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
     return result;
 }
@@ -394,9 +394,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(u.u32x4[0], n)}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
     return result;
 }
 
@@ -404,9 +404,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(u.u64x2[0], n)}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
     return result;
 }
 
@@ -416,7 +416,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
     return result;
 }
 
@@ -430,9 +430,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 8) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(u.u8x16[0], n)}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
     return result;
 }
 
@@ -442,7 +442,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(u.u16x8[0], n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
     return result;
 }
 
@@ -450,9 +450,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(u.u32x4[0], n)}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
     return result;
 }
 
@@ -460,9 +460,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(u.u64x2[0], n)}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
     return result;
 }
 
@@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(u.u8x16[0], vdupq_n_u8(0), n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
     return result;
 }