diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05e6a5c7..85006e36 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -226,13 +226,21 @@ endif ()
         set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
         set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
     endif()
-
-    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-        set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-    endif()
-
-    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-        set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+
+    if (ARCH_IA32 OR ARCH_ARM32 OR ARCH_X86_64)
+        if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+            set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+        endif()
+        if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+            set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+        endif()
+    elseif(ARCH_AARCH64)
+        if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+            set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
+        endif()
+        if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+            set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
+        endif()
     endif()
 
     if(CMAKE_COMPILER_IS_GNUCC)
@@ -279,6 +287,8 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64)
     endif()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions")
+elseif (ARCH_PPC64EL)
+    CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H)
 endif()
 
 CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
@@ -522,7 +532,7 @@ set (hs_exec_common_SRCS
     ${hs_exec_common_SRCS}
     src/util/arch/x86/cpuid_flags.c
     )
-elseif (ARCH_ARM32 OR ARCH_AARCH64)
+elseif (ARCH_ARM32 OR ARCH_AARCH64 OR ARCH_PPC64EL)
 set (hs_exec_common_SRCS
     ${hs_exec_common_SRCS}
     src/util/arch/arm/cpuid_flags.c
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 073f26c5..2100799f 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -9,6 +9,9 @@ elseif (HAVE_C_INTRIN_H)
 elseif (HAVE_C_ARM_NEON_H)
     set (INTRIN_INC_H "arm_neon.h")
     set (FAT_RUNTIME OFF)
+elseif (HAVE_C_PPC64EL_ALTIVEC_H)
+    set (INTRIN_INC_H "altivec.h")
+    set (FAT_RUNTIME OFF)
 else()
     message (FATAL_ERROR "No intrinsics header found")
 endif ()
@@ -136,7 +139,20 @@ int main(){
     (void)_mm512_permutexvar_epi8(idx, a);
 }" HAVE_AVX512VBMI)
 
-elseif (!ARCH_ARM32 AND !ARCH_AARCH64)
+
+elseif (ARCH_ARM32 OR ARCH_AARCH64)
+    CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+int main() {
+    int32x4_t a = vdupq_n_s32(1);
+    (void)a;
+}" HAVE_NEON)
+elseif (ARCH_PPC64EL)
+    CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+int main() {
+    vector int a = vec_splat_s32(1);
+    (void)a;
+}" HAVE_VSX)
+else ()
     message (FATAL_ERROR "Unsupported architecture")
 endif ()
@@ -169,6 +185,10 @@ else (NOT FAT_RUNTIME)
         if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
             message(FATAL_ERROR "NEON support required for ARM support")
         endif ()
+        if (ARCH_PPC64EL AND NOT HAVE_VSX)
+            message(FATAL_ERROR "VSX support required for Power support")
+        endif ()
+
     endif ()
 
 unset (PREV_FLAGS)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 0afd6998..dbd72445 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -21,6 +21,9 @@
 /* "Define if building for AARCH64" */
 #cmakedefine ARCH_AARCH64
 
+/* "Define if building for PPC64EL" */
+#cmakedefine ARCH_PPC64EL
+
 /* "Define if cross compiling for AARCH64" */
 #cmakedefine CROSS_COMPILE_AARCH64
 
@@ -75,6 +78,9 @@
 /* C compiler has arm_sve.h */
 #cmakedefine HAVE_C_ARM_SVE_H
 
+/* C compiler has altivec.h */
+#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H
+
 /* Define to 1 if you have the declaration of `pthread_setaffinity_np',
    and to 0 if you don't. */
 #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
diff --git a/cmake/platform.cmake b/cmake/platform.cmake
index 295775df..2cdc3a6e 100644
--- a/cmake/platform.cmake
+++ b/cmake/platform.cmake
@@ -7,15 +7,13 @@ if (CROSS_COMPILE_AARCH64)
 else()
     # really only interested in the preprocessor here
     CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)
-
     CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
-
     CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
     CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
-
-    if (ARCH_X86_64 OR ARCH_AARCH64)
+    CHECK_C_SOURCE_COMPILES("#if !(defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
+    if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL)
         set(ARCH_64_BIT TRUE)
     else()
         set(ARCH_32_BIT TRUE)
     endif()
-endif()
\ No newline at end of file
+endif()
diff --git a/src/util/arch.h b/src/util/arch.h
index 794f28f7..1e8d2fbd 100644
--- a/src/util/arch.h
+++ b/src/util/arch.h
@@ -39,6 +39,8 @@
 #include "util/arch/x86/x86.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/arm.h"
+#elif defined(ARCH_PPC64EL)
+#include "util/arch/ppc64el/ppc64el.h"
 #endif
 
 #endif // UTIL_ARCH_X86_H_
diff --git a/src/util/arch/ppc64el/bitutils.h b/src/util/arch/ppc64el/bitutils.h
new file mode 100644
index 00000000..b23c573e
--- /dev/null
+++ b/src/util/arch/ppc64el/bitutils.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_PPC64EL_H +#define BITUTILS_ARCH_PPC64EL_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +m128 compress128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 tv = and128(x, m); + tv = and128(tv, mm); + + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +static really_inline +m128 expand128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 tv = and128(x, m); + + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + mask = and128(mask, mm); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +static really_inline +u64a pdep64(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_ARM_H diff --git a/src/util/arch/ppc64el/ppc64el.h b/src/util/arch/ppc64el/ppc64el.h new file mode 100644 index 00000000..59e7e25d --- /dev/null +++ b/src/util/arch/ppc64el/ppc64el.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_PPC64EL_H_ +#define UTIL_ARCH_PPC64EL_H_ + +#if defined(__VSX__) && defined(ARCH_PPC64EL) +#define HAVE_VSX +#define HAVE_SIMD_128_BITS +#endif + +#endif // UTIL_ARCH_ARM_H_ + diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h new file mode 100644 index 00000000..27b5d75d --- /dev/null +++ b/src/util/arch/ppc64el/simd_types.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_TYPES_ARM_H +#define SIMD_TYPES_ARM_H + +#if !defined(m128) && defined(HAVE_VSX) +typedef __vector int32_t m128; +#endif + +#endif /* SIMD_TYPES_ARM_H */ + diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h new file mode 100644 index 00000000..8b5767e6 --- /dev/null +++ b/src/util/arch/ppc64el/simd_utils.h @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief SIMD types and primitive operations.
+ */
+
+#ifndef ARCH_PPC64EL_SIMD_UTILS_H
+#define ARCH_PPC64EL_SIMD_UTILS_H
+
+#include
+
+#include "ue2common.h"
+#include "util/simd_types.h"
+#include "util/unaligned.h"
+#include "util/intrinsics.h"
+
+#include <string.h> // for memcpy
+
+typedef __vector uint64_t uint64x2_t;
+typedef __vector int64_t int64x2_t;
+typedef __vector uint32_t uint32x4_t;
+typedef __vector int32_t int32x4_t;
+typedef __vector uint16_t uint16x8_t;
+typedef __vector int16_t int16x8_t;
+typedef __vector uint8_t uint8x16_t;
+typedef __vector int8_t int8x16_t;
+
+static really_inline m128 ones128(void) {
+    return (m128) vec_splat_s8(-1);
+}
+
+static really_inline m128 zeroes128(void) {
+    return (m128) vec_splat_s32(0);
+}
+
+/** \brief Bitwise not for m128 */
+static really_inline m128 not128(m128 a) {
+    return (m128) vec_xor(a, ones128());
+}
+
+/** \brief Return 1 if a and b are different otherwise 0 */
+static really_inline int diff128(m128 a, m128 b) {
+    return vec_any_ne(a, b);
+}
+
+static really_inline int isnonzero128(m128 a) {
+    return diff128(a, zeroes128());
+}
+
+/**
+ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
+ * mask indicating which 32-bit words contain differences.
+ */
+static really_inline u32 diffrich128(m128 a, m128 b) {
+    static const m128 movemask = { 1, 2, 4, 8 };
+    m128 mask = (m128) vec_cmpeq(a, b);
+    mask = vec_and(not128(mask), movemask);
+    m128 sum = vec_sums(mask, zeroes128());
+    sum = vec_sld(zeroes128(), sum, 4);
+    s32 ALIGN_ATTR(16) x;
+    vec_ste(sum, 0, &x);
+    return x;
+}
+
+/**
+ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
+ * returns a 4-bit mask indicating which 64-bit words contain differences.
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const uint64x2_t movemask = { 1, 4 }; + uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); + mask = vec_and(vec_xor(mask, mask), movemask); + m128 sum = vec_sums((m128)mask, zeroes128()); + sum = vec_sld(zeroes128(), sum, 4); + s32 ALIGN_ATTR(16) x; + vec_ste(sum, 0, &x); + return x; +} + +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) vec_sub((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 lshift_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s64((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s64((int64x2_t)a, b); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 eq64_m128(m128 a, m128 b) { + return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); +} + + +static really_inline u32 movemask128(m128 a) { + static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + // Compute the mask from the input + uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); + return output; +} + +static really_inline m128 set1_16x8(u8 c) { + return (m128) vdupq_n_u8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return (m128) vdupq_n_u32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return (m128) vdupq_n_u64(c); +} + +static really_inline u32 movd(const m128 in) { + return vgetq_lane_u32((uint32x4_t) in, 0); +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64((uint64x2_t) in, 0); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vsetq_lane_u64(*p, zeroes128(), 0); +} + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { +#if defined(HS_OPTIMIZE) + return vgetq_lane_u32((uint32x4_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u32((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u32((uint32x4_t) in, 1); + break; + case 2: + return vgetq_lane_u32((uint32x4_t) in, 2); + break; + case 3: + return vgetq_lane_u32((uint32x4_t) in, 3); + break; + default: + return 0; + break; + } +#endif +} + +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +#if defined(HS_OPTIMIZE) + return vgetq_lane_u64((uint64x2_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u64((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u64((uint32x4_t) in, 1); + break; + default: + return 0; + break; + } +#endif +} + +static really_inline m128 low64from128(const m128 in) { + return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 high64from128(const m128 in) { + return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 add128(m128 a, m128 b) { + return 
(m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return (m128) (m128) vandq_s8( vmvnq_s8(a), b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + vst1q_s32((int32_t *)ptr, a); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + + +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + +static really_really_inline +m128 palignr_imm(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); + CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + case 16: return r; break; + default: + return zeroes128(); + break; + } +} + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HS_OPTIMIZE) + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + return palignr_imm(r, l, offset); +#endif +} +#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return palignr(zeroes128(), a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return palignr(a, zeroes128(), 16 - b); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; + const uint8x16_t outside_mask = set1_16x8(0xf0); + + m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); + return vqtbl1q_s8(in, shift_mask); +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
+static really_inline
+void setbit128(m128 *ptr, unsigned int n) {
+    *ptr = or128(mask1bit128(n), *ptr);
+}
+
+// switches off bit N in the given vector.
+static really_inline
+void clearbit128(m128 *ptr, unsigned int n) {
+    *ptr = andnot128(mask1bit128(n), *ptr);
+}
+
+// tests bit N in the given vector.
+static really_inline
+char testbit128(m128 val, unsigned int n) {
+    const m128 mask = mask1bit128(n);
+
+    return isnonzero128(and128(mask, val));
+}
+
+static really_inline
+m128 pshufb_m128(m128 a, m128 b) {
+    /* On Intel, if bit 0x80 of a lane of b is set, the result lane is zero;
+     * otherwise the low four bits of the lane select a byte of a.
+     * On NEON, an index >= 16 yields zero; otherwise that byte is selected.
+     * btranslated is b converted from the Intel to the NEON convention. */
+    int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f));
+    return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated);
+}
+
+static really_inline
+m128 max_u8_m128(m128 a, m128 b) {
+    return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b);
+}
+
+static really_inline
+m128 min_u8_m128(m128 a, m128 b) {
+    return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b);
+}
+
+static really_inline
+m128 sadd_u8_m128(m128 a, m128 b) {
+    return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b);
+}
+
+static really_inline
+m128 sub_u8_m128(m128 a, m128 b) {
+    return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b);
+}
+
+static really_inline
+m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
+    uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 };
+    return (m128) vld1q_u32((uint32_t *) data);
+}
+
+static really_inline
+m128 set2x64(u64a hi, u64a lo) {
+    uint64_t ALIGN_ATTR(16) data[2] = { lo, hi };
+    return (m128) vld1q_u64((uint64_t *) data);
+}
+
+#endif // ARCH_PPC64EL_SIMD_UTILS_H
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index 68494507..ffc8f45d 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -49,6 +49,8 @@
 #include "util/arch/x86/bitutils.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/bitutils.h"
+#elif defined(ARCH_PPC64EL)
+#include "util/arch/ppc64el/bitutils.h"
 #endif
 
 static really_inline
diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h
index 099c8f91..08eb6ba6 100644
--- a/src/util/intrinsics.h
+++ b/src/util/intrinsics.h
@@ -49,6 +49,10 @@
 # define USE_ARM_NEON_H
 #endif
 
+#if defined(HAVE_C_PPC64EL_ALTIVEC_H)
+# define USE_PPC64EL_ALTIVEC_H
+#endif
+
 #ifdef __cplusplus
 # if defined(HAVE_CXX_INTRIN_H)
 #  define USE_INTRIN_H
@@ -68,6 +72,8 @@
 # if defined(HAVE_SVE)
 #  include <arm_sve.h>
 # endif
+#elif defined(USE_PPC64EL_ALTIVEC_H)
+#include <altivec.h>
 #else
 #error no intrinsics file
 #endif
diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index 5777374b..0deff7e5 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -38,6 +38,8 @@
 #include "util/arch/x86/simd_types.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/simd_types.h"
+#elif defined(ARCH_PPC64EL)
+#include "util/arch/ppc64el/simd_types.h"
 #endif
 
 #if !defined(m128) && !defined(HAVE_SIMD_128_BITS)
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index 0724c94e..2913c4fe 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -65,6 +65,8 @@ extern const char vbs_mask_data[];
 #include "util/arch/x86/simd_utils.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/simd_utils.h"
+#elif defined(ARCH_PPC64EL)
+#include "util/arch/ppc64el/simd_utils.h"
 #endif
 
 #include "util/arch/common/simd_utils.h"
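Note on the new src/util/arch/ppc64el/simd_utils.h: most helpers are still expressed with NEON intrinsics (vdupq_n_u8, vceqq_s8, vld1q_s32, vqtbl1q_s8, ...), which do not exist on POWER, so the file as committed is a starting point rather than a working backend. Below is a minimal, illustrative sketch, not part of this change, of how a few of these primitives could be phrased with the GCC/Clang AltiVec/VSX builtins from altivec.h; the _vsx-suffixed names and the m128_vsx typedef are hypothetical placeholders.

/* Illustrative only: hypothetical VSX spellings for a few of the helpers
 * that the new ppc64el simd_utils.h still writes with NEON intrinsics. */
#include <altivec.h>
#include <stdint.h>

typedef __vector signed int m128_vsx;

static inline m128_vsx ones128_vsx(void) {
    return (m128_vsx)vec_splats((signed char)-1);   /* all bits set */
}

static inline m128_vsx not128_vsx(m128_vsx a) {
    return (m128_vsx)vec_nor(a, a);                 /* bitwise NOT */
}

static inline m128_vsx eq128_vsx(m128_vsx a, m128_vsx b) {
    /* byte-wise equality, cf. _mm_cmpeq_epi8 / vceqq_s8 */
    return (m128_vsx)vec_cmpeq((__vector signed char)a, (__vector signed char)b);
}

static inline m128_vsx set1_16x8_vsx(uint8_t c) {
    return (m128_vsx)vec_splats(c);                 /* splat byte, cf. vdupq_n_u8 */
}

static inline m128_vsx loadu128_vsx(const void *ptr) {
    return vec_xl(0, (const signed int *)ptr);      /* unaligned 128-bit load */
}

static inline void storeu128_vsx(void *ptr, m128_vsx a) {
    vec_xst(a, 0, (signed int *)ptr);               /* unaligned 128-bit store */
}

vec_xl/vec_xst require a VSX-enabled build (e.g. -mcpu=power8 or -mvsx); the aligned vec_ld/vec_st forms could back load128/store128 in the same manner.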