From 63e26a4b2880eda7b6ac7b49271d83ba3e6143c4 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 26 Jan 2021 00:44:38 +0200 Subject: [PATCH] add initial ppc64el support --- CMakeLists.txt | 10 +- cmake/arch.cmake | 14 + cmake/config.h.in | 6 + cmake/platform.cmake | 3 +- src/util/arch.h | 2 + src/util/arch/arm/bitutils.h | 51 ++-- src/util/arch/ppc64el/bitutils.h | 217 +++++++++++++++ src/util/arch/ppc64el/ppc64el.h | 42 +++ src/util/arch/ppc64el/simd_types.h | 37 +++ src/util/arch/ppc64el/simd_utils.h | 429 +++++++++++++++++++++++++++++ src/util/bitutils.h | 2 + src/util/intrinsics.h | 6 + src/util/simd_types.h | 2 + src/util/simd_utils.h | 2 + 14 files changed, 796 insertions(+), 27 deletions(-) create mode 100644 src/util/arch/ppc64el/bitutils.h create mode 100644 src/util/arch/ppc64el/ppc64el.h create mode 100644 src/util/arch/ppc64el/simd_types.h create mode 100644 src/util/arch/ppc64el/simd_utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index bbed8e2f..193180b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,7 +209,7 @@ else() message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") endif() set(TUNE_FLAG ${GNUCC_ARCH}) - else () + else() set(TUNE_FLAG native) endif() @@ -252,11 +252,11 @@ else() endif() if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}") + set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") endif() if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") + set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") endif() if(CMAKE_COMPILER_IS_GNUCC) @@ -298,6 +298,8 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions") +elseif (ARCH_PPC64EL) + CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H) endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) @@ -587,7 +589,7 @@ set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/x86/cpuid_flags.c ) -else (ARCH_ARM32 OR ARCH_AARCH64) +else (ARCH_ARM32 OR ARCH_AARCH64 OR ARCH_PPC64EL) set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/arm/cpuid_flags.c diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 691861d6..d29a252c 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -9,6 +9,9 @@ elseif (HAVE_C_INTRIN_H) elseif (HAVE_C_ARM_NEON_H) set (INTRIN_INC_H "arm_neon.h") set (FAT_RUNTIME OFF) +elseif (HAVE_C_PPC64EL_ALTIVEC_H) + set (INTRIN_INC_H "altivec.h") + set (FAT_RUNTIME OFF) else() message (FATAL_ERROR "No intrinsics header found") endif () @@ -96,6 +99,13 @@ int main() { int32x4_t a = vdupq_n_s32(1); (void)a; }" HAVE_NEON) +elseif (ARCH_PPC64EL) + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +int main() { + vector int a = vec_splat_s32(1); + (void)a; +}" HAVE_VSX) + else () message (FATAL_ERROR "Unsupported architecture") endif () @@ -129,6 +139,10 @@ else (NOT FAT_RUNTIME) if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON) message(FATAL_ERROR "NEON support required for ARM support") endif () + if (ARCH_PPC64EL AND NOT HAVE_VSX) + message(FATAL_ERROR "VSX support required for Power support") + endif () + endif () unset (CMAKE_REQUIRED_FLAGS) diff --git a/cmake/config.h.in b/cmake/config.h.in index 0de8cca2..36ec96b0 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -21,6 +21,9 @@ /*
"Define if building for AARCH64" */ #cmakedefine ARCH_AARCH64 +/* "Define if building for PPC64EL" */ +#cmakedefine ARCH_PPC64EL + /* internal build, switch on dump support. */ #cmakedefine DUMP_SUPPORT @@ -60,6 +63,9 @@ /* C compiler has arm_neon.h */ #cmakedefine HAVE_C_ARM_NEON_H +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 479b3680..3439cd8d 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -8,7 +8,8 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error no CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) -if (ARCH_X86_64 OR ARCH_AARCH64) +CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !defined(__LITTLE_ENDIAN__) && !defined(__VSX__)\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) +if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL) set(ARCH_64_BIT TRUE) else() set(ARCH_32_BIT TRUE) diff --git a/src/util/arch.h b/src/util/arch.h index 794f28f7..1e8d2fbd 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -39,6 +39,8 @@ #include "util/arch/x86/x86.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/arm.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/ppc64el.h" #endif #endif // UTIL_ARCH_X86_H_ diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index 498db568..859ac0bd 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -82,7 +82,11 @@ u32 findAndClearLSB_64_impl(u64a *v) { static really_inline u32 findAndClearMSB_32_impl(u32 *v) { - return findAndClearMSB_32_impl_c(v); + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); + assert(offset < 32); + return offset; } static really_inline @@ -103,19 +107,20 @@ u64a compress64_impl(u64a x, u64a m) { static really_inline m128 compress128_impl(m128 x, m128 m) { m128 one = set1_2x64(1); - m128 bb = one; - m128 res = zeroes128(); + m128 bitset = one; + m128 vres = zeroes128(); while (isnonzero128(m)) { - m128 mm = sub_2x64(zeroes128(), m); - m128 xm = and128(x, m); - xm = and128(xm, mm); - - m128 mask = not128(eq64_m128(xm, zeroes128())); - res = or128(res, and128(bb, mask)); - m = and128(m, sub_2x64(m, one)); - bb = lshift64_m128(bb, 1); + m128 mm = sub_2x64(zeroes128(), m); + m128 tv = and128(x, m); + tv = and128(tv, mm); + + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); } - return res; + return vres; } static really_inline @@ -131,18 +136,20 @@ u64a expand64_impl(u64a x, u64a m) { static really_inline m128 expand128_impl(m128 x, m128 m) { m128 one = set1_2x64(1); - m128 bb = one; - m128 res = zeroes128(); + m128 bitset = one; + m128 vres = zeroes128(); while (isnonzero128(m)) { - m128 xm = and128(x, bb); - m128 mm = sub_2x64(zeroes128(), m); - m128 mask = not128(eq64_m128(xm, zeroes128())); - mask = and128(mask, and128(m, mm)); - res = or128(res, mask); - m = and128(m, sub_2x64(m, one)); - bb = lshift64_m128(bb, 1); + m128 tv = and128(x, m); + + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = 
not128(eq64_m128(tv, zeroes128())); + mask = and128(mask, m); + mask = and128(mask, mm); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); } - return res; + return vres; } /* returns the first set bit after begin (if not ~0U). If no bit is set after diff --git a/src/util/arch/ppc64el/bitutils.h b/src/util/arch/ppc64el/bitutils.h new file mode 100644 index 00000000..b23c573e --- /dev/null +++ b/src/util/arch/ppc64el/bitutils.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_PPC64EL_H +#define BITUTILS_ARCH_PPC64EL_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +m128 compress128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 tv = and128(x, m); + tv = and128(tv, mm); + + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +static really_inline +m128 expand128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 tv = and128(x, bitset); + + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(mask, m); + mask = and128(mask, mm); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * the size of the datatype or larger.
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +static really_inline +u64a pdep64(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} + +/* andn just uses the generic implementation here; the compiler is free to + * pick the best scalar sequence on its own. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_PPC64EL_H diff --git a/src/util/arch/ppc64el/ppc64el.h b/src/util/arch/ppc64el/ppc64el.h new file mode 100644 index 00000000..59e7e25d --- /dev/null +++ b/src/util/arch/ppc64el/ppc64el.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_PPC64EL_H_ +#define UTIL_ARCH_PPC64EL_H_ + +#if defined(__VSX__) && defined(ARCH_PPC64EL) +#define HAVE_VSX +#define HAVE_SIMD_128_BITS +#endif + +#endif // UTIL_ARCH_PPC64EL_H_ + diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h new file mode 100644 index 00000000..27b5d75d --- /dev/null +++ b/src/util/arch/ppc64el/simd_types.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_TYPES_PPC64EL_H +#define SIMD_TYPES_PPC64EL_H + +#if !defined(m128) && defined(HAVE_VSX) +typedef __vector int32_t m128; +#endif + +#endif /* SIMD_TYPES_PPC64EL_H */ + diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h new file mode 100644 index 00000000..8b5767e6 --- /dev/null +++ b/src/util/arch/ppc64el/simd_utils.h @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_PPC64EL_SIMD_UTILS_H +#define ARCH_PPC64EL_SIMD_UTILS_H + +#include + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include <string.h> // for memcpy + +typedef __vector uint64_t uint64x2_t; +typedef __vector int64_t int64x2_t; +typedef __vector uint32_t uint32x4_t; +typedef __vector int32_t int32x4_t; +typedef __vector uint16_t uint16x8_t; +typedef __vector int16_t int16x8_t; +typedef __vector uint8_t uint8x16_t; +typedef __vector int8_t int8x16_t; + +static really_inline m128 ones128(void) { + return (m128) vec_splat_s8(-1); +} + +static really_inline m128 zeroes128(void) { + return (m128) vec_splat_s32(0); +} + +/** \brief Bitwise not for m128 */ +static really_inline m128 not128(m128 a) { + return (m128) vec_xor(a, ones128()); +} + +/** \brief Return 1 if a and b are different, otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return vec_any_ne(a, b); +} + +static really_inline int isnonzero128(m128 a) { + return diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + static const m128 movemask = { 1, 2, 4, 8 }; + m128 mask = (m128) vec_cmpeq(a, b); + mask = vec_and(vec_xor(mask, ones128()), movemask); + m128 sum = vec_sums(mask, zeroes128()); + sum = vec_sld(zeroes128(), sum, 4); + s32 ALIGN_ATTR(16) x; + vec_ste(sum, 0, &x); + return x; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences.
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const uint64x2_t movemask = { 1, 4 }; + uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); + mask = vec_and(vec_xor(mask, mask), movemask); + m128 sum = vec_sums((m128)mask, zeroes128()); + sum = vec_sld(zeroes128(), sum, 4); + s32 ALIGN_ATTR(16) x; + vec_ste(sum, 0, &x); + return x; +} + +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) vec_sub((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 lshift_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s64((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s64((int64x2_t)a, b); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 eq64_m128(m128 a, m128 b) { + return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); +} + + +static really_inline u32 movemask128(m128 a) { + static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + // Compute the mask from the input + uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); + return output; +} + +static really_inline m128 set1_16x8(u8 c) { + return (m128) vdupq_n_u8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return (m128) vdupq_n_u32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return (m128) vdupq_n_u64(c); +} + +static really_inline u32 movd(const m128 in) { + return vgetq_lane_u32((uint32x4_t) in, 0); +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64((uint64x2_t) in, 0); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vsetq_lane_u64(*p, zeroes128(), 0); +} + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { +#if defined(HS_OPTIMIZE) + return vgetq_lane_u32((uint32x4_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u32((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u32((uint32x4_t) in, 1); + break; + case 2: + return vgetq_lane_u32((uint32x4_t) in, 2); + break; + case 3: + return vgetq_lane_u32((uint32x4_t) in, 3); + break; + default: + return 0; + break; + } +#endif +} + +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +#if defined(HS_OPTIMIZE) + return vgetq_lane_u64((uint64x2_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u64((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u64((uint32x4_t) in, 1); + break; + default: + return 0; + break; + } +#endif +} + +static really_inline m128 low64from128(const m128 in) { + return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 high64from128(const m128 in) { + return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 add128(m128 a, m128 b) { + return 
(m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return (m128) (m128) vandq_s8( vmvnq_s8(a), b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + vst1q_s32((int32_t *)ptr, a); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + + +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + +static really_really_inline +m128 palignr_imm(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); + CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + case 16: return r; break; + default: + return zeroes128(); + break; + } +} + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HS_OPTIMIZE) + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + return palignr_imm(r, l, offset); +#endif +} +#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return palignr(zeroes128(), a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return palignr(a, zeroes128(), 16 - b); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; + const uint8x16_t outside_mask = set1_16x8(0xf0); + + m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); + return vqtbl1q_s8(in, shift_mask); +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
+static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); + + return isnonzero128(and128(mask, val)); +} + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + /* On Intel, if bit 0x80 of an index byte is set, the result byte is zero; + otherwise the result is the byte selected by the low nibble (index & 0xf). + In NEON, an index >= 16 gives zero, otherwise the byte at that index. + btranslated converts the Intel semantics to the NEON ones. */ + int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); + return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; + return (m128) vld1q_u32((uint32_t *) data); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; + return (m128) vld1q_u64((uint64_t *) data); +} + +#endif // ARCH_PPC64EL_SIMD_UTILS_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 68494507..ffc8f45d 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -49,6 +49,8 @@ #include "util/arch/x86/bitutils.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/bitutils.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/bitutils.h" #endif static really_inline diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index 3e2afc22..99e367dd 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -49,6 +49,10 @@ # define USE_ARM_NEON_H #endif +#if defined(HAVE_C_PPC64EL_ALTIVEC_H) +# define USE_PPC64EL_ALTIVEC_H +#endif + #ifdef __cplusplus # if defined(HAVE_CXX_INTRIN_H) # define USE_INTRIN_H @@ -65,6 +69,8 @@ #include <intrin.h> #elif defined(USE_ARM_NEON_H) #include <arm_neon.h> +#elif defined(USE_PPC64EL_ALTIVEC_H) +#include <altivec.h> #else #error no intrinsics file #endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 5777374b..0deff7e5 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -38,6 +38,8 @@ #include "util/arch/x86/simd_types.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/simd_types.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/simd_types.h" #endif #if !defined(m128) && !defined(HAVE_SIMD_128_BITS) diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 0724c94e..2913c4fe 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -65,6 +65,8 @@ extern const char vbs_mask_data[]; #include "util/arch/x86/simd_utils.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/simd_utils.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/simd_utils.h" #endif #include "util/arch/common/simd_utils.h"
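
Note on the compress128/expand128 loops touched above (both the arm versions and the new ppc64el copies): they implement bit compress/expand per 64-bit lane by peeling the lowest set bit off the mask each iteration (m &= m - 1). The standalone scalar sketch below models the same loops for a single 64-bit value; it is illustrative only, is not part of the patch, and the _ref names are made up here.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Gather the bits of x selected by m into the low end of the result,
 * lowest mask bit first -- the same order the vector loop emits them. */
static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0, bb = 1;
    for (; m; m &= m - 1, bb <<= 1) {
        uint64_t lowest = m & -m;   /* lowest set bit of the mask */
        if (x & lowest) {
            res |= bb;              /* emit it at the next result position */
        }
    }
    return res;
}

/* Inverse operation: scatter the low bits of x out to the positions of the
 * set bits of m. */
static uint64_t expand64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0, bb = 1;
    for (; m; m &= m - 1, bb <<= 1) {
        uint64_t lowest = m & -m;
        if (x & bb) {
            res |= lowest;
        }
    }
    return res;
}

int main(void) {
    uint64_t x = 0xabcd, m = 0xf0f0;
    uint64_t c = compress64_ref(x, m);
    /* expand undoes compress within the mask */
    assert(expand64_ref(c, m) == (x & m));
    printf("compress(%#llx, %#llx) = %#llx\n",
           (unsigned long long)x, (unsigned long long)m,
           (unsigned long long)c);
    return 0;
}

The expand128 variants rely on exactly this inverse relationship, which is why they must test the current bit of x (x & bitset) and emit the lowest set bit of m, rather than testing x & m.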