From 1f55d419eb1c54a9408908ea943b74c75bc54ffc Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 26 Jan 2021 00:44:38 +0200 Subject: [PATCH 01/37] add initial ppc64el support (cherry picked from commit 63e26a4b2880eda7b6ac7b49271d83ba3e6143c4) (cherry picked from commit c214ba253327114c16d0724f75c998ab00d44919) --- CMakeLists.txt | 26 +- cmake/arch.cmake | 22 +- cmake/config.h.in | 6 + cmake/platform.cmake | 8 +- src/util/arch.h | 2 + src/util/arch/ppc64el/bitutils.h | 217 +++++++++++++++ src/util/arch/ppc64el/ppc64el.h | 42 +++ src/util/arch/ppc64el/simd_types.h | 37 +++ src/util/arch/ppc64el/simd_utils.h | 429 +++++++++++++++++++++++++++++ src/util/bitutils.h | 2 + src/util/intrinsics.h | 6 + src/util/simd_types.h | 2 + src/util/simd_utils.h | 2 + 13 files changed, 787 insertions(+), 14 deletions(-) create mode 100644 src/util/arch/ppc64el/bitutils.h create mode 100644 src/util/arch/ppc64el/ppc64el.h create mode 100644 src/util/arch/ppc64el/simd_types.h create mode 100644 src/util/arch/ppc64el/simd_utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 05e6a5c7..85006e36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,13 +226,21 @@ endif () set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") endif() - - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() - - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + + if (ARCH_IA32 OR ARCH_ARM32 OR ARCH_X86_64) + if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) + set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + endif() + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + endif() + elseif(ARCH_AARCH64) + if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) + set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") + endif() + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + set(ARCH_CXX_FLAGS " -mtune=${TUNE_FLAG}") + endif() endif() if(CMAKE_COMPILER_IS_GNUCC) @@ -279,6 +287,8 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions") +elseif (ARCH_PPC64EL) + CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H) endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) @@ -522,7 +532,7 @@ set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/x86/cpuid_flags.c ) -elseif (ARCH_ARM32 OR ARCH_AARCH64) +elseif (ARCH_ARM32 OR ARCH_AARCH64 OR ARCH_PPC64EL) set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/arm/cpuid_flags.c diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 073f26c5..2100799f 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -9,6 +9,9 @@ elseif (HAVE_C_INTRIN_H) elseif (HAVE_C_ARM_NEON_H) set (INTRIN_INC_H "arm_neon.h") set (FAT_RUNTIME OFF) +elseif (HAVE_C_PPC64EL_ALTIVEC_H) + set (INTRIN_INC_H "altivec.h") + set (FAT_RUNTIME OFF) else() message (FATAL_ERROR "No intrinsics header found") endif () @@ -136,7 +139,20 @@ int main(){ (void)_mm512_permutexvar_epi8(idx, a); }" HAVE_AVX512VBMI) -elseif (!ARCH_ARM32 AND !ARCH_AARCH64) + +elseif (ARCH_ARM32 OR ARCH_AARCH64) + CHECK_C_SOURCE_COMPILES("#include 
<${INTRIN_INC_H}> +int main() { + int32x4_t a = vdupq_n_s32(1); + (void)a; +}" HAVE_NEON) +elseif (ARCH_PPC64EL) + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +int main() { + vector int a = vec_splat_s32(1); + (void)a; +}" HAVE_VSX) +else () message (FATAL_ERROR "Unsupported architecture") endif () @@ -169,6 +185,10 @@ else (NOT FAT_RUNTIME) if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON) message(FATAL_ERROR "NEON support required for ARM support") endif () + if (ARCH_PPPC64EL AND NOT HAVE_VSX) + message(FATAL_ERROR "VSX support required for Power support") + endif () + endif () unset (PREV_FLAGS) diff --git a/cmake/config.h.in b/cmake/config.h.in index 0afd6998..dbd72445 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -21,6 +21,9 @@ /* "Define if building for AARCH64" */ #cmakedefine ARCH_AARCH64 +/* "Define if building for PPC64EL" */ +#cmakedefine ARCH_PPC64EL + /* "Define if cross compiling for AARCH64" */ #cmakedefine CROSS_COMPILE_AARCH64 @@ -75,6 +78,9 @@ /* C compiler has arm_sve.h */ #cmakedefine HAVE_C_ARM_SVE_H +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 295775df..2cdc3a6e 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -7,15 +7,13 @@ if (CROSS_COMPILE_AARCH64) else() # really only interested in the preprocessor here CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64) - CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) - CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) - - if (ARCH_X86_64 OR ARCH_AARCH64) + CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !defined(__LITTLE_ENDIAN__) && !defined(__VSX__)\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) + if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL) set(ARCH_64_BIT TRUE) else() set(ARCH_32_BIT TRUE) endif() -endif() \ No newline at end of file +endif() diff --git a/src/util/arch.h b/src/util/arch.h index 794f28f7..1e8d2fbd 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -39,6 +39,8 @@ #include "util/arch/x86/x86.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/arm.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/ppc64el.h" #endif #endif // UTIL_ARCH_X86_H_ diff --git a/src/util/arch/ppc64el/bitutils.h b/src/util/arch/ppc64el/bitutils.h new file mode 100644 index 00000000..b23c573e --- /dev/null +++ b/src/util/arch/ppc64el/bitutils.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_PPC64EL_H +#define BITUTILS_ARCH_PPC64EL_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +m128 compress128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 tv = and128(x, m); + tv = and128(tv, mm); + + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +static really_inline +m128 expand128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 tv = and128(x, m); + + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + mask = and128(mask, mm); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = 
lshift64_m128(bitset, 1); + } + return vres; +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. */ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +static really_inline +u64a pdep64(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_ARM_H diff --git a/src/util/arch/ppc64el/ppc64el.h b/src/util/arch/ppc64el/ppc64el.h new file mode 100644 index 00000000..59e7e25d --- /dev/null +++ b/src/util/arch/ppc64el/ppc64el.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_PPC64EL_H_ +#define UTIL_ARCH_PPC64EL_H_ + +#if defined(__VSX__) && defined(ARCH_PPC64EL) +#define HAVE_VSX +#define HAVE_SIMD_128_BITS +#endif + +#endif // UTIL_ARCH_ARM_H_ + diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h new file mode 100644 index 00000000..27b5d75d --- /dev/null +++ b/src/util/arch/ppc64el/simd_types.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_TYPES_ARM_H +#define SIMD_TYPES_ARM_H + +#if !defined(m128) && defined(HAVE_VSX) +typedef __vector int32_t m128; +#endif + +#endif /* SIMD_TYPES_ARM_H */ + diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h new file mode 100644 index 00000000..8b5767e6 --- /dev/null +++ b/src/util/arch/ppc64el/simd_utils.h @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_PPC64EL_SIMD_UTILS_H +#define ARCH_PPC64EL_SIMD_UTILS_H + +#include + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +typedef __vector uint64_t uint64x2_t; +typedef __vector int64_t int64x2_t; +typedef __vector uint32_t uint32x4_t; +typedef __vector int32_t int32x4_t; +typedef __vector uint16_t uint16x8_t; +typedef __vector int16_t int16x8_t; +typedef __vector uint8_t uint8x16_t; +typedef __vector int8_t int8x16_t; + +static really_inline m128 ones128(void) { + return (m128) vec_splat_s8(0xFF); +} + +static really_inline m128 zeroes128(void) { + return (m128) vec_splat_s32(0); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return (m128) vec_xor(a, a); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return vec_any_ne(a, b); +} + +static really_inline int isnonzero128(m128 a) { + return diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + static const m128 movemask = { 1, 2, 4, 8 }; + m128 mask = (m128) vec_cmpeq(a, b); + mask = vec_and(vec_xor(mask, mask), movemask); + m128 sum = vec_sums(mask, zeroes128()); + sum = vec_sld(zeroes128(), sum, 4); + s32 ALIGN_ATTR(16) x; + vec_ste(sum, 0, &x); + return x; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const uint64x2_t movemask = { 1, 4 }; + uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); + mask = vec_and(vec_xor(mask, mask), movemask); + m128 sum = vec_sums((m128)mask, zeroes128()); + sum = vec_sld(zeroes128(), sum, 4); + s32 ALIGN_ATTR(16) x; + vec_ste(sum, 0, &x); + return x; +} + +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) vec_sub((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 lshift_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s64((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s64((int64x2_t)a, b); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 eq64_m128(m128 a, m128 b) { + return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); +} + + +static really_inline u32 movemask128(m128 a) { + static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + // Compute the mask from the input + uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); + return output; +} + +static really_inline m128 set1_16x8(u8 c) { + return (m128) vdupq_n_u8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return (m128) vdupq_n_u32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return (m128) vdupq_n_u64(c); +} + +static really_inline u32 movd(const m128 in) { + return vgetq_lane_u32((uint32x4_t) in, 0); +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64((uint64x2_t) in, 0); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vsetq_lane_u64(*p, zeroes128(), 0); +} + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { +#if defined(HS_OPTIMIZE) + return vgetq_lane_u32((uint32x4_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u32((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u32((uint32x4_t) in, 1); + break; + case 2: + return vgetq_lane_u32((uint32x4_t) in, 2); + break; + case 3: + return vgetq_lane_u32((uint32x4_t) in, 3); + break; + default: + return 0; + break; + } +#endif +} + +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +#if defined(HS_OPTIMIZE) + return vgetq_lane_u64((uint64x2_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u64((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u64((uint32x4_t) in, 1); + break; + default: + return 0; + break; + } +#endif +} + +static really_inline m128 low64from128(const m128 in) { + return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 high64from128(const m128 in) { + return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); +} + +static really_inline m128 add128(m128 a, m128 b) { + return 
(m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return (m128) (m128) vandq_s8( vmvnq_s8(a), b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + vst1q_s32((int32_t *)ptr, a); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + + +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + +static really_really_inline +m128 palignr_imm(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); + CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + case 16: return r; break; + default: + return zeroes128(); + break; + } +} + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HS_OPTIMIZE) + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + return palignr_imm(r, l, offset); +#endif +} +#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return palignr(zeroes128(), a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return palignr(a, zeroes128(), 16 - b); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; + const uint8x16_t outside_mask = set1_16x8(0xf0); + + m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); + return vqtbl1q_s8(in, shift_mask); +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
+static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); + + return isnonzero128(and128(mask, val)); +} + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to NEON. */ + int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); + return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; + return (m128) vld1q_u32((uint32_t *) data); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; + return (m128) vld1q_u64((uint64_t *) data); +} + +#endif // ARCH_ARM_SIMD_UTILS_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 68494507..ffc8f45d 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -49,6 +49,8 @@ #include "util/arch/x86/bitutils.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/bitutils.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/bitutils.h" #endif static really_inline diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index 099c8f91..08eb6ba6 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -49,6 +49,10 @@ # define USE_ARM_NEON_H #endif +#if defined(HAVE_C_PPC64EL_ALTIVEC_H) +# define USE_PPC64EL_ALTIVEC_H +#endif + #ifdef __cplusplus # if defined(HAVE_CXX_INTRIN_H) # define USE_INTRIN_H @@ -68,6 +72,8 @@ # if defined(HAVE_SVE) # include # endif +#elif defined(USE_PPC64EL_ALTIVEC_H) +#include #else #error no intrinsics file #endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 5777374b..0deff7e5 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -38,6 +38,8 @@ #include "util/arch/x86/simd_types.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/simd_types.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/simd_types.h" #endif #if !defined(m128) && !defined(HAVE_SIMD_128_BITS) diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 0724c94e..2913c4fe 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -65,6 +65,8 @@ extern const char vbs_mask_data[]; #include "util/arch/x86/simd_utils.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/simd_utils.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/simd_utils.h" #endif #include "util/arch/common/simd_utils.h" From f1d781ffee60c07fd58fede3ef6b2642ee93f64b Mon Sep 17 00:00:00 2001 From: 
Vectorcamp Date: Thu, 23 Sep 2021 09:28:37 -0400 Subject: [PATCH 02/37] test commit from VM and CMakelists add power support --- CMakeLists.txt | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85006e36..612214b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,15 +226,17 @@ endif () set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") endif() - - if (ARCH_IA32 OR ARCH_ARM32 OR ARCH_X86_64) - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() - elseif(ARCH_AARCH64) + + + if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) + set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + endif() + + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + endif() + + if(ARCH_AARCH64) if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") endif() From 079f3518d7e4e3a9aa937750c3e2ef01a6d4e6fe Mon Sep 17 00:00:00 2001 From: Vectorcamp Date: Thu, 23 Sep 2021 10:07:27 -0400 Subject: [PATCH 03/37] ppc64el arcitecture added in CMakelists file --- CMakeLists.txt | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 612214b9..51b8d6b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -146,7 +146,7 @@ endif () string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") endforeach () - if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE_AARCH64) + if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE_AARCH64 AND NOT ARCH_PPC64EL) message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") # If gcc doesn't recognise the host cpu, then mtune=native becomes # generic, which isn't very good in some cases. 
march=native looks at @@ -227,21 +227,23 @@ endif () set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") endif() - - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + + if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) + if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) + set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + endif() + + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + endif() endif() - - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() - - if(ARCH_AARCH64) + + if(ARCH_PPC64EL) if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") endif() if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS " -mtune=${TUNE_FLAG}") + set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") endif() endif() From 0078c28ee6c7e684a8a5bea9b2c59c13330e7bcf Mon Sep 17 00:00:00 2001 From: apostolos Date: Fri, 24 Sep 2021 13:01:14 +0300 Subject: [PATCH 04/37] implementations for powerpc64el architecture --- src/util/supervector/arch/ppc64el/impl.cpp | 429 ++++++++++++++++++++ src/util/supervector/arch/ppc64el/types.hpp | 37 ++ 2 files changed, 466 insertions(+) create mode 100644 src/util/supervector/arch/ppc64el/impl.cpp create mode 100644 src/util/supervector/arch/ppc64el/types.hpp diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp new file mode 100644 index 00000000..2ddd3658 --- /dev/null +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_IMPL_HPP +#define SIMD_IMPL_HPP + +#include +#include + +#include "ue2common.h" +#include "util/arch.h" +#include "util/unaligned.h" +#include "util/supervector/supervector.hpp" + +// 128-bit Powerpc64le implementation + +template<> +really_inline SuperVector<16>::SuperVector(SuperVector const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template<> +really_inline SuperVector<16>::SuperVector(typename base_type::type const v) +{ + u.v128[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8_t const other) +{ + //u.v128[0] = _mm_set1_epi8(other); + u.v128[0] = vdupq_n_u8(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8_t const other) +{ + //u.v128[0] = _mm_set1_epi8(static_cast(other)); + u.v128[0] = vdupq_n_u8(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16_t const other) +{ + //u.v128[0] = _mm_set1_epi16(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16_t const other) +{ + //u.v128[0] = _mm_set1_epi16(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32_t const other) +{ + //u.v128[0] = _mm_set1_epi32(other); + u.v128[0] = vdupq_n_u32(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32_t const other) +{ + //u.v128[0] = _mm_set1_epi32(static_cast(other)); + u.v128[0] = vdupq_n_u32(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64_t const other) +{ + //u.v128[0] = _mm_set1_epi64x(other); + u.v128[0] = vdupq_n_u64(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64_t const other) +{ + //u.v128[0] = _mm_set1_epi64x(static_cast(other)); + u.v128[0] = vdupq_n_u64(static_cast(other)); +} + +// Constants +template<> +really_inline SuperVector<16> SuperVector<16>::Ones(void) +{ + //return {_mm_set1_epi8(0xFF)}; + return {vec_splat_s8(0xFF)}; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Zeroes(void) +{ + //return {_mm_set1_epi8(0)}; + return {vec_splat_s8(0)}; +} + +// Methods + +template <> +really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const +{ + //return {_mm_and_si128(u.v128[0], b.u.v128[0])}; + return {vec_add(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const +{ + //return {_mm_or_si128(u.v128[0], b.u.v128[0])}; + return {vec_or(u.v128[0], b.u.v128[0]);} +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const +{ + //return {_mm_xor_si128(u.v128[0], b.u.v128[0])}; + return {vec_xor(u.v128[0], b.u.v128[0]);} +} + +template <> +really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const +{ + //return {_mm_andnot_si128(u.v128[0], b.u.v128[0])}; + return 0; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const +{ + //return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])}; + return {vec_cmpeq(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const +{ + //return _mm_movemask_epi8(u.v128[0]); + // Compute the mask from the input + uint64x2_t mask = 
vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)u.v128[0], 0)))); + uint64x2_t mask1 = (m128)vextq_s8(mask, Zeroes(), 7); + mask = vorrq_u8(mask, mask1); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); + return output; + return 0; +} + +template <> +really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const +{ + return eq(b).movemask(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const +{ + switch(N) { + case 1: return {vshrq_n_s32(u.v128[0], 1)}; break; + case 2: return {vshrq_n_s32(u.v128[0], 2)}; break; + case 3: return {vshrq_n_s32(u.v128[0], 3)}; break; + case 4: return {vshrq_n_s32(u.v128[0], 4)}; break; + case 5: return {vshrq_n_s32(u.v128[0], 5)}; break; + case 6: return {vshrq_n_s32(u.v128[0], 6)}; break; + case 7: return {vshrq_n_s32(u.v128[0], 7)}; break; + case 8: return {vshrq_n_s32(u.v128[0], 8)}; break; + case 9: return {vshrq_n_s32(u.v128[0], 9)}; break; + case 10: return {vshrq_n_s32(u.v128[0], 10)}; break; + case 11: return {vshrq_n_s32(u.v128[0], 11)}; break; + case 12: return {vshrq_n_s32(u.v128[0], 12)}; break; + case 13: return {vshrq_n_s32(u.v128[0], 13)}; break; + case 14: return {vshrq_n_s32(u.v128[0], 14)}; break; + case 15: return {vshrq_n_s32(u.v128[0], 15)}; break; + case 16: return Zeroes(); break; + default: break; + } + return *this; +} + +#ifdef HS_OPTIMIZE +template <> +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ + return {vshrq_n_s32(u.v128[0], N)}; +} +#else +template <> +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ + return rshift128_var(N); +} +#endif + +template <> +really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const +{ + switch(N) { + case 1: return {vshlq_n_s32(u.v128[0], 1)}; break; + case 2: return {vshlq_n_s32(u.v128[0], 2)}; break; + case 3: return {vshlq_n_s32(u.v128[0], 3)}; break; + case 4: return {vshlq_n_s32(u.v128[0], 4)}; break; + case 5: return {vshlq_n_s32(u.v128[0], 5)}; break; + case 6: return {vshlq_n_s32(u.v128[0], 6)}; break; + case 7: return {vshlq_n_s32(u.v128[0], 7)}; break; + case 8: return {vshlq_n_s32(u.v128[0], 8)}; break; + case 9: return {vshlq_n_s32(u.v128[0], 9)}; break; + case 10: return {vshlq_n_s32(u.v128[0], 10)}; break; + case 11: return {vshlq_n_s32(u.v128[0], 11)}; break; + case 12: return {vshlq_n_s32(u.v128[0], 12)}; break; + case 13: return {vshlq_n_s32(u.v128[0], 13)}; break; + case 14: return {vshlq_n_s32(u.v128[0], 14)}; break; + case 15: return {vshlq_n_s32(u.v128[0], 15)}; break; + case 16: return Zeroes(); break; + default: break; + } + return *this; +} + +#ifdef HS_OPTIMIZE +template <> +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +{ + return {vshlq_n_s32(u.v128[0], N)}; +} +#else +template <> +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +{ + return lshift128_var(N); +} +#endif + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) +{ + //return _mm_loadu_si128((const m128 *)ptr); + return vld1q_s32((const int32_t *)ptr) +} + +template <> +really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) +{ + //assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); + //ptr = assume_aligned(ptr, SuperVector::size); + //return _mm_load_si128((const m128 *)ptr); + assert(ISALIGNED_N(ptr, alignof(m128))); + return 
vld1q_s32((const int32_t *)ptr); + +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) +{ + SuperVector<16> mask = Ones().rshift128_var(16 -len); + mask.print8("mask"); + SuperVector<16> v = vld1q_s32((const int32_t *)ptr); + v.print8("v"); + return mask & v; +} + +#ifdef HS_OPTIMIZE +template<> +really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) +{ + return {vextq_s8(u.v128[0], other.u.v128[0], offset)}; +} +#else +template<> +really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) +{ + switch(offset) { + case 0: return other; break; + case 1: return {vextq_s8(u.v128[0], other.u.v128[0], 1)}; break; + case 2: return {vextq_s8(u.v128[0], other.u.v128[0], 2)}; break; + case 3: return {vextq_s8(u.v128[0], other.u.v128[0], 3)}; break; + case 4: return {vextq_s8(u.v128[0], other.u.v128[0], 4)}; break; + case 5: return {vextq_s8(u.v128[0], other.u.v128[0], 5)}; break; + case 6: return {vextq_s8(u.v128[0], other.u.v128[0], 6)}; break; + case 7: return {vextq_s8(u.v128[0], other.u.v128[0], 7)}; break; + case 8: return {vextq_s8(u.v128[0], other.u.v128[0], 8)}; break; + case 9: return {vextq_s8(u.v128[0], other.u.v128[0], 9)}; break; + case 10: return {vextq_s8(u.v128[0], other.u.v128[0], 10)}; break; + case 11: return {vextq_s8(u.v128[0], other.u.v128[0], 11)}; break; + case 12: return {vextq_s8(u.v128[0], other.u.v128[0], 12)}; break; + case 13: return {vextq_s8(u.v128[0], other.u.v128[0], 13)}; break; + case 14: return {vextq_s8(u.v128[0], other.u.v128[0], 14)}; break; + case 15: return {vextq_s8(u.v128[0], other.u.v128[0], 15)}; break; + default: break; + } + return *this; +} +#endif + +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + //return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; + int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0],vdupq_n_s8(0x8f)); + return (m128)vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) +{ + SuperVector<16> mask = Ones().rshift128_var(16 -len); + return mask & pshufb(b); +} + +#ifdef HS_OPTIMIZE +template<> +really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) +{ + return {vshlq_n_s64(u.v128[0], N)}; +} +#else +template<> +really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) +{ + switch(N) { + case 0: return *this; break; + case 1: return {vshlq_n_s64(u.v128[0], 1)}; break; + case 2: return {vshlq_n_s64(u.v128[0], 2)}; break; + case 3: return {vshlq_n_s64(u.v128[0], 3)}; break; + case 4: return {vshlq_n_s64(u.v128[0], 4)}; break; + case 5: return {vshlq_n_s64(u.v128[0], 5)}; break; + case 6: return {vshlq_n_s64(u.v128[0], 6)}; break; + case 7: return {vshlq_n_s64(u.v128[0], 7)}; break; + case 8: return {vshlq_n_s64(u.v128[0], 8)}; break; + case 9: return {vshlq_n_s64(u.v128[0], 9)}; break; + case 10: return {vshlq_n_s64(u.v128[0], 10)}; break; + case 11: return {vshlq_n_s64(u.v128[0], 11)}; break; + case 12: return {vshlq_n_s64(u.v128[0], 12)}; break; + case 13: return {vshlq_n_s64(u.v128[0], 13)}; break; + case 14: return {vshlq_n_s64(u.v128[0], 14)}; break; + case 15: return {vshlq_n_s64(u.v128[0], 15)}; break; + case 16: return Zeroes(); + default: break; + } + return *this; +} +#endif + +#ifdef HS_OPTIMIZE +template<> +really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) +{ + return 
{vshrq_n_s64(u.v128[0], N)}; +} +#else +template<> +really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) +{ + switch(N) { + case 0: return {vshrq_n_s64(u.v128[0], 0)}; break; + case 1: return {vshrq_n_s64(u.v128[0], 1)}; break; + case 2: return {vshrq_n_s64(u.v128[0], 2)}; break; + case 3: return {vshrq_n_s64(u.v128[0], 3)}; break; + case 4: return {vshrq_n_s64(u.v128[0], 4)}; break; + case 5: return {vshrq_n_s64(u.v128[0], 5)}; break; + case 6: return {vshrq_n_s64(u.v128[0], 6)}; break; + case 7: return {vshrq_n_s64(u.v128[0], 7)}; break; + case 8: return {vshrq_n_s64(u.v128[0], 8)}; break; + case 9: return {vshrq_n_s64(u.v128[0], 9)}; break; + case 10: return {vshrq_n_s64(u.v128[0], 10)}; break; + case 11: return {vshrq_n_s64(u.v128[0], 11)}; break; + case 12: return {vshrq_n_s64(u.v128[0], 12)}; break; + case 13: return {vshrq_n_s64(u.v128[0], 13)}; break; + case 14: return {vshrq_n_s64(u.v128[0], 14)}; break; + case 15: return {vshrq_n_s64(u.v128[0], 15)}; break; + case 16: return Zeroes(); + default: break; + } + return *this; +} +#endif + +template<> +really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N) +{ + return *this << N; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N) +{ + return *this >> N; +} diff --git a/src/util/supervector/arch/ppc64el/types.hpp b/src/util/supervector/arch/ppc64el/types.hpp new file mode 100644 index 00000000..75f14551 --- /dev/null +++ b/src/util/supervector/arch/ppc64el/types.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_TYPES_ARM_H +#define SIMD_TYPES_ARM_H + +#if !defined(m128) && defined(HAVE_VSX) +typedef __vector int32_t m128; +#endif + +#endif /* SIMD_TYPES_ARM_H */ \ No newline at end of file From 90d3db177619f141fe09a64d5daa25fa7815a947 Mon Sep 17 00:00:00 2001 From: apostolos Date: Mon, 27 Sep 2021 15:14:07 +0300 Subject: [PATCH 05/37] update powerpc simd util file functions --- src/util/arch/ppc64el/simd_types.h | 6 +- src/util/arch/ppc64el/simd_utils.h | 145 +++++++++++------ src/util/supervector/arch/ppc64el/impl.cpp | 171 +++++++++++--------- src/util/supervector/arch/ppc64el/types.hpp | 5 - 4 files changed, 193 insertions(+), 134 deletions(-) diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h index 27b5d75d..21dae5cb 100644 --- a/src/util/arch/ppc64el/simd_types.h +++ b/src/util/arch/ppc64el/simd_types.h @@ -26,12 +26,12 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef SIMD_TYPES_ARM_H -#define SIMD_TYPES_ARM_H +#ifndef ARCH_PPC64EL_SIMD_TYPES_H +#define ARCH_PPC64EL_SIMD_TYPES_H #if !defined(m128) && defined(HAVE_VSX) typedef __vector int32_t m128; #endif -#endif /* SIMD_TYPES_ARM_H */ +#endif /* ARCH_PPC64EL_SIMD_TYPES_H */ diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 8b5767e6..f8ff3b90 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -61,7 +61,9 @@ static really_inline m128 zeroes128(void) { /** \brief Bitwise not for m128*/ static really_inline m128 not128(m128 a) { - return (m128) vec_xor(a, a); + return (m128)vec_xor(a, ones128()); + // or + return (m128)vec_xor(a, a); } /** \brief Return 1 if a and b are different otherwise 0 */ @@ -70,7 +72,7 @@ static really_inline int diff128(m128 a, m128 b) { } static really_inline int isnonzero128(m128 a) { - return diff128(a, zeroes128()); + return !!diff128(a, zeroes128()); } /** @@ -115,74 +117,95 @@ m128 sub_2x64(m128 a, m128 b) { static really_really_inline m128 lshift_m128(m128 a, unsigned b) { - return (m128) vshlq_n_s32((int64x2_t)a, b); + //return (m128) vshlq_n_s32((int64x2_t)a, b); + return (m128) vec_sl((int64x2_t)a, b); + // or + // return (m128) vec_sll((int64x2_t)a, b); + // the above command executes Left shifts an entire vector by a given number of bits. } static really_really_inline m128 rshift_m128(m128 a, unsigned b) { - return (m128) vshrq_n_s32((int64x2_t)a, b); + //return (m128) vshrq_n_s32((int64x2_t)a, b); + return (m128) vec_srl((int64x2_t)a, b); + // or + // return (m128) vec_srl((int64x2_t)a, b); + // the above command executes Right shifts an entire vector by a given number of bits. 
} static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { - return (m128) vshlq_n_s64((int64x2_t)a, b); + return (m128) vec_sldw ((int64x2_t)a, b, 8); } static really_really_inline m128 rshift64_m128(m128 a, unsigned b) { - return (m128) vshrq_n_s64((int64x2_t)a, b); + //return (m128) vshrq_n_s64((int64x2_t)a, b); + #warning FIXME } static really_inline m128 eq128(m128 a, m128 b) { - return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vec_all_eq((uint64x2_t)a, (uint64x2_t)b); + //or + //return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); } static really_inline m128 eq64_m128(m128 a, m128 b) { - return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); + //return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); + #warning FIXME } static really_inline u32 movemask128(m128 a) { - static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + //static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; // Compute the mask from the input - uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); - uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); - mask = vorrq_u8(mask, mask1); + //uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + //uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); + //mask = vorrq_u8(mask, mask1); // Get the resulting bytes - uint16_t output; - vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); - return output; + //uint16_t output; + //vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); + //return output; + #warning FIXME } static really_inline m128 set1_16x8(u8 c) { - return (m128) vdupq_n_u8(c); + //return (m128) vdupq_n_u8(c); + return (m128) vec_splat_u8(c); } static really_inline m128 set1_4x32(u32 c) { - return (m128) vdupq_n_u32(c); + //return (m128) vdupq_n_u32(c); + return (m128) vec_splat_u32(c); } static really_inline m128 set1_2x64(u64a c) { - return (m128) vdupq_n_u64(c); + //return (m128) vdupq_n_u64(c); + return (m128) vec_splat_u64(c); } static really_inline u32 movd(const m128 in) { - return vgetq_lane_u32((uint32x4_t) in, 0); + //return vgetq_lane_u32((uint32x4_t) in, 0); + #warning FIXME } static really_inline u64a movq(const m128 in) { - return vgetq_lane_u64((uint64x2_t) in, 0); + //return vgetq_lane_u64((uint64x2_t) in, 0); + #warning FIXME } /* another form of movq */ static really_inline m128 load_m128_from_u64a(const u64a *p) { - return (m128) vsetq_lane_u64(*p, zeroes128(), 0); + //return (m128) vsetq_lane_u64(*p, zeroes128(), 0); + #warning FIXME } + static really_inline u32 extract32from128(const m128 in, unsigned imm) { +/* #if defined(HS_OPTIMIZE) return vgetq_lane_u32((uint32x4_t) in, imm); #else @@ -204,9 +227,12 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { break; } #endif +*/ +#warning FIXME } static really_inline u64a extract64from128(const m128 in, unsigned imm) { +/* #if defined(HS_OPTIMIZE) return vgetq_lane_u64((uint64x2_t) in, imm); #else @@ -222,56 +248,70 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) { break; } #endif +*/ +#warning FIXME } static really_inline m128 low64from128(const m128 in) { - return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); + //return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); + #warning FIXME } static really_inline m128 high64from128(const m128 in) { - return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); + //return vcombine_u64(vget_high_u64(in), 
vdup_n_u64(0)); + #warning FIXME } + static really_inline m128 add128(m128 a, m128 b) { - return (m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b); + return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b); } static really_inline m128 and128(m128 a, m128 b) { - return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vec_and((int8x16_t)a, (int8x16_t)b); } static really_inline m128 xor128(m128 a, m128 b) { - return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vec_xor((int8x16_t)a, (int8x16_t)b); } static really_inline m128 or128(m128 a, m128 b) { - return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vec_or((int8x16_t)a, (int8x16_t)b); } static really_inline m128 andnot128(m128 a, m128 b) { - return (m128) (m128) vandq_s8( vmvnq_s8(a), b); + m128 and_res = and128(a,b); + return (m128) not128(and_res); + // or + //return (m128) not128(and128(a,b)); } // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); - return (m128) vld1q_s32((const int32_t *)ptr); + //return (m128) vld1q_s32((const int32_t *)ptr); + //return *(int64x2_t *) (&ptr[0]); + #warning FIXME } // aligned store static really_inline void store128(void *ptr, m128 a) { - assert(ISALIGNED_N(ptr, alignof(m128))); - vst1q_s32((int32_t *)ptr, a); + //assert(ISALIGNED_N(ptr, alignof(m128))); + //vst1q_s32((int32_t *)ptr, a); + #warning FIXME } // unaligned load static really_inline m128 loadu128(const void *ptr) { - return (m128) vld1q_s32((const int32_t *)ptr); + //return (m128) vld1q_s32((const int32_t *)ptr); + //return *(uint64x2_t *) (&ptr[0]); + #warning FIXME } // unaligned store static really_inline void storeu128(void *ptr, m128 a) { - vst1q_s32((int32_t *)ptr, a); + //vst1q_s32((int32_t *)ptr, a); + #warning FIXME } // packed unaligned store of first N bytes @@ -321,32 +361,41 @@ m128 palignr_imm(m128 r, m128 l, int offset) { static really_really_inline m128 palignr(m128 r, m128 l, int offset) { +/* #if defined(HS_OPTIMIZE) return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); #else return palignr_imm(r, l, offset); #endif +*/ +#warning FIXME } + #undef CASE_ALIGN_VECTORS static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - return palignr(zeroes128(), a, b); + //return palignr(zeroes128(), a, b); + #warning FIXME } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - return palignr(a, zeroes128(), 16 - b); + //return palignr(a, zeroes128(), 16 - b); + #warning FIXME } static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { +/* assert(amount >= -16 && amount <= 16); static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; const uint8x16_t outside_mask = set1_16x8(0xf0); m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); return vqtbl1q_s8(in, shift_mask); +*/ +#warning FIXME } #ifdef __cplusplus @@ -381,7 +430,6 @@ void clearbit128(m128 *ptr, unsigned int n) { static really_inline char testbit128(m128 val, unsigned int n) { const m128 mask = mask1bit128(n); - return isnonzero128(and128(mask, val)); } @@ -390,40 +438,43 @@ m128 pshufb_m128(m128 a, m128 b) { /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. In NEON, if >=16, then the result is zero, otherwise it is that lane. btranslated is the version that is converted from Intel to NEON. 
*/ - int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); - return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); + //int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); + //return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); + #warning FIXME } static really_inline m128 max_u8_m128(m128 a, m128 b) { - return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b); + return (m128) vec_max((int8x16_t)a, (int8x16_t)b); } static really_inline m128 min_u8_m128(m128 a, m128 b) { - return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b); + return (m128) vec_min((int8x16_t)a, (int8x16_t)b); } static really_inline m128 sadd_u8_m128(m128 a, m128 b) { - return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b); + return (m128) vec_add((uint8x16_t)a, (uint8x16_t)b); } static really_inline m128 sub_u8_m128(m128 a, m128 b) { - return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b); + return (m128) vec_sub((uint8x16_t)a, (uint8x16_t)b); } static really_inline m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { - uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; - return (m128) vld1q_u32((uint32_t *) data); + //uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; + //return (m128) vld1q_u32((uint32_t *) data); + #warning FIXME } static really_inline m128 set2x64(u64a hi, u64a lo) { - uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; - return (m128) vld1q_u64((uint64_t *) data); + //uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; + //return (m128) vld1q_u64((uint64_t *) data); + #warning FIXME } -#endif // ARCH_ARM_SIMD_UTILS_H +#endif // ARCH_PPC64EL_SIMD_UTILS_H diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 2ddd3658..d58297fe 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -57,7 +57,7 @@ template<> really_inline SuperVector<16>::SuperVector(int8_t const other) { //u.v128[0] = _mm_set1_epi8(other); - u.v128[0] = vdupq_n_u8(other); + u.v128[0] = vec_splat_s8(other); } template<> @@ -65,7 +65,7 @@ template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { //u.v128[0] = _mm_set1_epi8(static_cast(other)); - u.v128[0] = vdupq_n_u8(static_cast(other)); + u.v128[0] = vec_splat_s8(static_cast(other)); } template<> @@ -73,6 +73,7 @@ template<> really_inline SuperVector<16>::SuperVector(int16_t const other) { //u.v128[0] = _mm_set1_epi16(other); + u.v128[0] = vec_splat_s16(other); } template<> @@ -80,6 +81,7 @@ template<> really_inline SuperVector<16>::SuperVector(uint16_t const other) { //u.v128[0] = _mm_set1_epi16(static_cast(other)); + u.v128[0] = vec_splat_s16(static_cast(other)); } template<> @@ -87,7 +89,7 @@ template<> really_inline SuperVector<16>::SuperVector(int32_t const other) { //u.v128[0] = _mm_set1_epi32(other); - u.v128[0] = vdupq_n_u32(other); + u.v128[0] = vec_splat_s32(other); } template<> @@ -95,7 +97,7 @@ template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { //u.v128[0] = _mm_set1_epi32(static_cast(other)); - u.v128[0] = vdupq_n_u32(static_cast(other)); + u.v128[0] = vec_splat_s32(static_cast(other)); } template<> @@ -103,7 +105,7 @@ template<> really_inline SuperVector<16>::SuperVector(int64_t const other) { //u.v128[0] = _mm_set1_epi64x(other); - u.v128[0] = vdupq_n_u64(other); + u.v128[0] = vec_splat_u64(other); } template<> @@ -111,7 +113,7 @@ template<> really_inline SuperVector<16>::SuperVector(uint64_t const other) { //u.v128[0] = _mm_set1_epi64x(static_cast(other)); - u.v128[0] = 
vdupq_n_u64(static_cast(other)); + u.v128[0] = vec_splat_u32(static_cast(other)); } // Constants @@ -141,7 +143,7 @@ template <> really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const { //return {_mm_and_si128(u.v128[0], b.u.v128[0])}; - return {vec_add(u.v128[0], b.u.v128[0])}; + return {vec_and(u.v128[0], b.u.v128[0])}; } template <> @@ -162,14 +164,14 @@ template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { //return {_mm_andnot_si128(u.v128[0], b.u.v128[0])}; - return 0; + #warning FIXME } template <> really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const { //return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])}; - return {vec_cmpeq(u.v128[0], b.u.v128[0])}; + return { vec_all_eq(u.v128[0], b.u.v128[0])}; } template <> @@ -177,15 +179,15 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( { //return _mm_movemask_epi8(u.v128[0]); // Compute the mask from the input - uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)u.v128[0], 0)))); - uint64x2_t mask1 = (m128)vextq_s8(mask, Zeroes(), 7); - mask = vorrq_u8(mask, mask1); + //uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)u.v128[0], 0)))); + //uint64x2_t mask1 = (m128)vextq_s8(mask, Zeroes(), 7); + //mask = vorrq_u8(mask, mask1); // Get the resulting bytes - uint16_t output; - vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); - return output; - return 0; + //uint16_t output; + //vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); + //return output; + #warning FIXME } template <> @@ -198,21 +200,21 @@ template <> really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const { switch(N) { - case 1: return {vshrq_n_s32(u.v128[0], 1)}; break; - case 2: return {vshrq_n_s32(u.v128[0], 2)}; break; - case 3: return {vshrq_n_s32(u.v128[0], 3)}; break; - case 4: return {vshrq_n_s32(u.v128[0], 4)}; break; - case 5: return {vshrq_n_s32(u.v128[0], 5)}; break; - case 6: return {vshrq_n_s32(u.v128[0], 6)}; break; - case 7: return {vshrq_n_s32(u.v128[0], 7)}; break; - case 8: return {vshrq_n_s32(u.v128[0], 8)}; break; - case 9: return {vshrq_n_s32(u.v128[0], 9)}; break; - case 10: return {vshrq_n_s32(u.v128[0], 10)}; break; - case 11: return {vshrq_n_s32(u.v128[0], 11)}; break; - case 12: return {vshrq_n_s32(u.v128[0], 12)}; break; - case 13: return {vshrq_n_s32(u.v128[0], 13)}; break; - case 14: return {vshrq_n_s32(u.v128[0], 14)}; break; - case 15: return {vshrq_n_s32(u.v128[0], 15)}; break; + case 1: return {vec_srl(u.v128[0], 1)}; break; + case 2: return {vec_srl(u.v128[0], 2)}; break; + case 3: return {vec_srl(u.v128[0], 3)}; break; + case 4: return {vec_srl(u.v128[0], 4)}; break; + case 5: return {vec_srl(u.v128[0], 5)}; break; + case 6: return {vec_srl(u.v128[0], 6)}; break; + case 7: return {vec_srl(u.v128[0], 7)}; break; + case 8: return {vec_srl(u.v128[0], 8)}; break; + case 9: return {vec_srl(u.v128[0], 9)}; break; + case 10: return {vec_srl(u.v128[0], 10)}; break; + case 11: return {vec_srl(u.v128[0], 11)}; break; + case 12: return {vec_srl(u.v128[0], 12)}; break; + case 13: return {vec_srl(u.v128[0], 13)}; break; + case 14: return {vec_srl(u.v128[0], 14)}; break; + case 15: return {vec_srl(u.v128[0], 15)}; break; case 16: return Zeroes(); break; default: break; } @@ -223,7 +225,7 @@ really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) co template <> really_inline SuperVector<16> 
SuperVector<16>::operator>>(uint8_t const N) const { - return {vshrq_n_s32(u.v128[0], N)}; + return {vec_srl(u.v128[0], N)}; } #else template <> @@ -237,21 +239,21 @@ template <> really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const { switch(N) { - case 1: return {vshlq_n_s32(u.v128[0], 1)}; break; - case 2: return {vshlq_n_s32(u.v128[0], 2)}; break; - case 3: return {vshlq_n_s32(u.v128[0], 3)}; break; - case 4: return {vshlq_n_s32(u.v128[0], 4)}; break; - case 5: return {vshlq_n_s32(u.v128[0], 5)}; break; - case 6: return {vshlq_n_s32(u.v128[0], 6)}; break; - case 7: return {vshlq_n_s32(u.v128[0], 7)}; break; - case 8: return {vshlq_n_s32(u.v128[0], 8)}; break; - case 9: return {vshlq_n_s32(u.v128[0], 9)}; break; - case 10: return {vshlq_n_s32(u.v128[0], 10)}; break; - case 11: return {vshlq_n_s32(u.v128[0], 11)}; break; - case 12: return {vshlq_n_s32(u.v128[0], 12)}; break; - case 13: return {vshlq_n_s32(u.v128[0], 13)}; break; - case 14: return {vshlq_n_s32(u.v128[0], 14)}; break; - case 15: return {vshlq_n_s32(u.v128[0], 15)}; break; + case 1: return {vec_sll(u.v128[0], 1)}; break; + case 2: return {vec_sll(u.v128[0], 2)}; break; + case 3: return {vec_sll(u.v128[0], 3)}; break; + case 4: return {vec_sll(u.v128[0], 4)}; break; + case 5: return {vec_sll(u.v128[0], 5)}; break; + case 6: return {vec_sll(u.v128[0], 6)}; break; + case 7: return {vec_sll(u.v128[0], 7)}; break; + case 8: return {vec_sll(u.v128[0], 8)}; break; + case 9: return {vec_sll(u.v128[0], 9)}; break; + case 10: return {vec_sll(u.v128[0], 10)}; break; + case 11: return {vec_sll(u.v128[0], 11)}; break; + case 12: return {vec_sll(u.v128[0], 12)}; break; + case 13: return {vec_sll(u.v128[0], 13)}; break; + case 14: return {vec_sll(u.v128[0], 14)}; break; + case 15: return {vec_sll(u.v128[0], 15)}; break; case 16: return Zeroes(); break; default: break; } @@ -262,7 +264,7 @@ really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) co template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - return {vshlq_n_s32(u.v128[0], N)}; + return {vec_sll(u.v128[0], N)}; } #else template <> @@ -276,7 +278,7 @@ template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { //return _mm_loadu_si128((const m128 *)ptr); - return vld1q_s32((const int32_t *)ptr) + #warning FIXME } template <> @@ -285,31 +287,34 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) //assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); //ptr = assume_aligned(ptr, SuperVector::size); //return _mm_load_si128((const m128 *)ptr); - assert(ISALIGNED_N(ptr, alignof(m128))); - return vld1q_s32((const int32_t *)ptr); - + //assert(ISALIGNED_N(ptr, alignof(m128))); + //return vld1q_s32((const int32_t *)ptr); + #warning FIXME } template <> really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { - SuperVector<16> mask = Ones().rshift128_var(16 -len); - mask.print8("mask"); - SuperVector<16> v = vld1q_s32((const int32_t *)ptr); - v.print8("v"); - return mask & v; + //SuperVector<16> mask = Ones().rshift128_var(16 -len); + //mask.print8("mask"); + //SuperVector<16> v = vld1q_s32((const int32_t *)ptr); + //v.print8("v"); + //return mask & v; + #warning FIXME } #ifdef HS_OPTIMIZE template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { - return {vextq_s8(u.v128[0], other.u.v128[0], offset)}; + //return {vextq_s8(u.v128[0], other.u.v128[0], offset)}; + #warning 
FIXME } #else template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) -{ +{ + /* switch(offset) { case 0: return other; break; case 1: return {vextq_s8(u.v128[0], other.u.v128[0], 1)}; break; @@ -330,6 +335,8 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in default: break; } return *this; + */ + #warning FIXME } #endif @@ -337,8 +344,9 @@ template<> really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) { //return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; - int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0],vdupq_n_s8(0x8f)); - return (m128)vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated); + //int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0],vdupq_n_s8(0x8f)); + //return (m128)vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated); + #warning FIXME } template<> @@ -352,7 +360,8 @@ really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, u template<> really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) { - return {vshlq_n_s64(u.v128[0], N)}; + //return {vshlq_n_s64(u.v128[0], N)}; + return {vec_sldw((int64x2_t)u.v128[0], N, 8)}; } #else template<> @@ -360,21 +369,21 @@ really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) { switch(N) { case 0: return *this; break; - case 1: return {vshlq_n_s64(u.v128[0], 1)}; break; - case 2: return {vshlq_n_s64(u.v128[0], 2)}; break; - case 3: return {vshlq_n_s64(u.v128[0], 3)}; break; - case 4: return {vshlq_n_s64(u.v128[0], 4)}; break; - case 5: return {vshlq_n_s64(u.v128[0], 5)}; break; - case 6: return {vshlq_n_s64(u.v128[0], 6)}; break; - case 7: return {vshlq_n_s64(u.v128[0], 7)}; break; - case 8: return {vshlq_n_s64(u.v128[0], 8)}; break; - case 9: return {vshlq_n_s64(u.v128[0], 9)}; break; - case 10: return {vshlq_n_s64(u.v128[0], 10)}; break; - case 11: return {vshlq_n_s64(u.v128[0], 11)}; break; - case 12: return {vshlq_n_s64(u.v128[0], 12)}; break; - case 13: return {vshlq_n_s64(u.v128[0], 13)}; break; - case 14: return {vshlq_n_s64(u.v128[0], 14)}; break; - case 15: return {vshlq_n_s64(u.v128[0], 15)}; break; + case 1: return {vec_sldw((int64x2_t)u.v128[0], 1, 8)}; break; + case 2: return {vec_sldw((int64x2_t)u.v128[0], 2, 8)}; break; + case 3: return {vec_sldw((int64x2_t)u.v128[0], 3, 8)}; break; + case 4: return {vec_sldw((int64x2_t)u.v128[0], 4, 8)}; break; + case 5: return {vec_sldw((int64x2_t)u.v128[0], 5, 8)}; break; + case 6: return {vec_sldw((int64x2_t)u.v128[0], 6, 8)}; break; + case 7: return {vec_sldw((int64x2_t)u.v128[0], 7, 8)}; break; + case 8: return {vec_sldw((int64x2_t)u.v128[0], 8, 8)}; break; + case 9: return {vec_sldw((int64x2_t)u.v128[0], 9, 8)}; break; + case 10: return {vec_sldw((int64x2_t)u.v128[0], 10, 8)}; break; + case 11: return {vec_sldw((int64x2_t)u.v128[0], 11, 8)}; break; + case 12: return {vec_sldw((int64x2_t)u.v128[0], 12, 8)}; break; + case 13: return {vec_sldw((int64x2_t)u.v128[0], 13, 8)}; break; + case 14: return {vec_sldw((int64x2_t)u.v128[0], 14, 8)}; break; + case 15: return {vec_sldw((int64x2_t)u.v128[0], 15, 8)}; break; case 16: return Zeroes(); default: break; } @@ -386,12 +395,14 @@ really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) template<> really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) { - return {vshrq_n_s64(u.v128[0], N)}; + //return {vshrq_n_s64(u.v128[0], N)}; + #warning FIXME } #else template<> really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t 
const N) -{ +{ + /* switch(N) { case 0: return {vshrq_n_s64(u.v128[0], 0)}; break; case 1: return {vshrq_n_s64(u.v128[0], 1)}; break; @@ -413,6 +424,8 @@ really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) default: break; } return *this; + */ + #warning FIXME } #endif diff --git a/src/util/supervector/arch/ppc64el/types.hpp b/src/util/supervector/arch/ppc64el/types.hpp index 75f14551..dbd863f4 100644 --- a/src/util/supervector/arch/ppc64el/types.hpp +++ b/src/util/supervector/arch/ppc64el/types.hpp @@ -27,11 +27,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef SIMD_TYPES_ARM_H -#define SIMD_TYPES_ARM_H - #if !defined(m128) && defined(HAVE_VSX) typedef __vector int32_t m128; #endif - -#endif /* SIMD_TYPES_ARM_H */ \ No newline at end of file From 2231f7c024402b781ae9eb45874a9c64e03ee6d1 Mon Sep 17 00:00:00 2001 From: Vectorcamp Date: Wed, 6 Oct 2021 06:23:46 -0400 Subject: [PATCH 06/37] compile fixes for vsc port --- CMakeLists.txt | 4 + src/fdr/teddy.c | 8 +- src/hs_valid_platform.c | 2 + src/util/arch/ppc64el/ppc64el.h | 1 + src/util/arch/ppc64el/simd_utils.h | 160 ++++++++++++--------- src/util/supervector/arch/ppc64el/impl.cpp | 156 +++++++++++++------- src/util/supervector/supervector.hpp | 4 + 7 files changed, 208 insertions(+), 127 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51b8d6b1..7d12e2f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -695,6 +695,10 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) set (hs_exec_SRCS ${hs_exec_SRCS} src/util/supervector/arch/arm/impl.cpp) +elseif (ARCH_PPC64EL) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/util/supervector/arch/ppc64el/impl.cpp) endif () endif() diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 3e46a0d6..65db3dff 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -893,10 +893,10 @@ do { \ #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ if (unlikely(diff128(var, ones128()))) { \ - u64a __attribute__((aligned(16))) vector[2]; \ - store128(vector, var); \ - u64a lo = vector[0]; \ - u64a hi = vector[1]; \ + u64a __attribute__((aligned(16))) vec[2]; \ + store128(vec, var); \ + u64a lo = vec[0]; \ + u64a hi = vec[1]; \ CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \ CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \ } \ diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 8323f343..809deee1 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -44,5 +44,7 @@ hs_error_t HS_CDECL hs_valid_platform(void) { } #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) return HS_SUCCESS; +#elif defined(ARCH_PPC64EL) + return HS_SUCCESS; #endif } diff --git a/src/util/arch/ppc64el/ppc64el.h b/src/util/arch/ppc64el/ppc64el.h index 59e7e25d..dbb38297 100644 --- a/src/util/arch/ppc64el/ppc64el.h +++ b/src/util/arch/ppc64el/ppc64el.h @@ -36,6 +36,7 @@ #if defined(__VSX__) && defined(ARCH_PPC64EL) #define HAVE_VSX #define HAVE_SIMD_128_BITS +#define VECTORSIZE 16 #endif #endif // UTIL_ARCH_ARM_H_ diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index f8ff3b90..3f8fdf73 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -52,7 +52,8 @@ typedef __vector uint8_t uint8x16_t; typedef __vector int8_t int8x16_t; static really_inline m128 ones128(void) { - return (m128) vec_splat_s8(0xFF); + // the value in function must be a signed literal in range -16 to 15 + return (m128) vec_splat_s8(-1); } static really_inline m128 zeroes128(void) { @@ -61,9 +62,8 @@ static really_inline m128 
zeroes128(void) { /** \brief Bitwise not for m128*/ static really_inline m128 not128(m128 a) { - return (m128)vec_xor(a, ones128()); - // or - return (m128)vec_xor(a, a); + //return (m128)vec_xor(a, a); + return (m128) vec_xor(a,ones128()); } /** \brief Return 1 if a and b are different otherwise 0 */ @@ -116,43 +116,40 @@ m128 sub_2x64(m128 a, m128 b) { } static really_really_inline -m128 lshift_m128(m128 a, unsigned b) { - //return (m128) vshlq_n_s32((int64x2_t)a, b); - return (m128) vec_sl((int64x2_t)a, b); - // or - // return (m128) vec_sll((int64x2_t)a, b); - // the above command executes Left shifts an entire vector by a given number of bits. +m128 lshift_m128(m128 a, unsigned UNUSED b) { + // #warning FIXME + // b must be 4 bit literal + return (m128) vec_sld(a, zeroes128(), 0); } static really_really_inline -m128 rshift_m128(m128 a, unsigned b) { - //return (m128) vshrq_n_s32((int64x2_t)a, b); - return (m128) vec_srl((int64x2_t)a, b); - // or - // return (m128) vec_srl((int64x2_t)a, b); - // the above command executes Right shifts an entire vector by a given number of bits. +m128 rshift_m128(m128 a, unsigned UNUSED b) { + // #warning FIXME + // b must be 4 bit literal + return (m128) vec_sld(zeroes128(), a, 0 - 0); } static really_really_inline -m128 lshift64_m128(m128 a, unsigned b) { - return (m128) vec_sldw ((int64x2_t)a, b, 8); +m128 lshift64_m128(m128 a, unsigned UNUSED b) { + // #warnint FIXME + // b must be 4 bit literal + return (m128) vec_sld(zeroes128(), a, 0); + } static really_really_inline -m128 rshift64_m128(m128 a, unsigned b) { - //return (m128) vshrq_n_s64((int64x2_t)a, b); - #warning FIXME +m128 rshift64_m128(m128 a, unsigned UNUSED b) { + // warnint FIXME + // b must be 4 bit literal + return (m128) vec_sld(zeroes128(), a, 0); } static really_inline m128 eq128(m128 a, m128 b) { - return (m128) vec_all_eq((uint64x2_t)a, (uint64x2_t)b); - //or - //return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); + return (m128) vec_cmpeq((uint8x16_t)a, (uint8x16_t)b); } static really_inline m128 eq64_m128(m128 a, m128 b) { - //return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); - #warning FIXME + return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); } @@ -168,39 +165,46 @@ static really_inline u32 movemask128(m128 a) { //uint16_t output; //vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); //return output; - #warning FIXME + // #warning FIXME + return !!diff128(a, zeroes128()); } -static really_inline m128 set1_16x8(u8 c) { - //return (m128) vdupq_n_u8(c); - return (m128) vec_splat_u8(c); +static really_inline m128 set1_16x8(u8 UNUSED c) { + // warning FIXME + // c must be 5 bit literal + // a solution is to use vec_splats + //return (m128) vec_splat_u8(0); + return (m128) vec_splats(c); } -static really_inline m128 set1_4x32(u32 c) { - //return (m128) vdupq_n_u32(c); - return (m128) vec_splat_u32(c); +static really_inline m128 set1_4x32(u32 UNUSED c) { + // warning FIXME + // c must be 5 bit literal + // a solution is to use vec_splats + // return (m128) vec_splat_u32(0); + return (m128) vec_splats(c); } static really_inline m128 set1_2x64(u64a c) { - //return (m128) vdupq_n_u64(c); - return (m128) vec_splat_u64(c); + return (m128) vec_splats(c); } static really_inline u32 movd(const m128 in) { //return vgetq_lane_u32((uint32x4_t) in, 0); - #warning FIXME + return !!diff128(in, zeroes128()); + // #warning FIXME } static really_inline u64a movq(const m128 in) { //return vgetq_lane_u64((uint64x2_t) in, 0); - #warning FIXME + return !!diff128(in, zeroes128()); + // 
#warning FIXME } /* another form of movq */ static really_inline m128 load_m128_from_u64a(const u64a *p) { - //return (m128) vsetq_lane_u64(*p, zeroes128(), 0); - #warning FIXME + return (m128) vec_ld(0,p); } @@ -228,7 +232,8 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { } #endif */ -#warning FIXME +// #warning FIXME +return vec_any_ne(in,lshift_m128(in,imm)); } static really_inline u64a extract64from128(const m128 in, unsigned imm) { @@ -249,17 +254,20 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) { } #endif */ -#warning FIXME +// #warning FIXME +return vec_any_ne(in,lshift_m128(in,imm)); } static really_inline m128 low64from128(const m128 in) { //return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); - #warning FIXME + // #warning FIXME + return in; } static really_inline m128 high64from128(const m128 in) { //return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); - #warning FIXME + // #warning FIXME + return in; } @@ -289,29 +297,28 @@ static really_inline m128 andnot128(m128 a, m128 b) { // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); - //return (m128) vld1q_s32((const int32_t *)ptr); - //return *(int64x2_t *) (&ptr[0]); - #warning FIXME + //return (m128) vec_ld(0, ptr); + // #warning FIXME + return zeroes128(); } // aligned store -static really_inline void store128(void *ptr, m128 a) { - //assert(ISALIGNED_N(ptr, alignof(m128))); - //vst1q_s32((int32_t *)ptr, a); - #warning FIXME +static really_inline void store128(void *ptr, m128 UNUSED a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + //vec_st(a, 0, ptr); + // warning FIXME } // unaligned load -static really_inline m128 loadu128(const void *ptr) { - //return (m128) vld1q_s32((const int32_t *)ptr); - //return *(uint64x2_t *) (&ptr[0]); - #warning FIXME +static really_inline m128 loadu128(const void UNUSED *ptr) { + //return (m128) vec_ld(0, ptr); + // #warning FIXME + return zeroes128(); } // unaligned store -static really_inline void storeu128(void *ptr, m128 a) { - //vst1q_s32((int32_t *)ptr, a); - #warning FIXME +static really_inline void storeu128(void UNUSED *ptr, m128 UNUSED a) { + // #warning FIXME } // packed unaligned store of first N bytes @@ -331,10 +338,11 @@ m128 loadbytes128(const void *ptr, unsigned int n) { } -#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; +//#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; static really_really_inline m128 palignr_imm(m128 r, m128 l, int offset) { + /* switch (offset) { case 0: return l; break; CASE_ALIGN_VECTORS(l, r, 1); @@ -357,6 +365,9 @@ m128 palignr_imm(m128 r, m128 l, int offset) { return zeroes128(); break; } + */ + // #warning FIXME + return (m128) vec_cmpeq(r,lshift_m128(l,offset)); } static really_really_inline @@ -368,21 +379,24 @@ m128 palignr(m128 r, m128 l, int offset) { return palignr_imm(r, l, offset); #endif */ -#warning FIXME +// #warning FIXME +return (m128) vec_cmpeq(r, lshift_m128(l,offset)); } #undef CASE_ALIGN_VECTORS static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - //return palignr(zeroes128(), a, b); - #warning FIXME + // #warning FIXME + // return vec_sro(a,b); + return rshift_m128(a,b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - //return palignr(a, zeroes128(), 16 - b); - #warning FIXME + //#warning FIXME + //return vec_slo(a,b); + 
return lshift_m128(a,b); } static really_inline @@ -395,7 +409,8 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); return vqtbl1q_s8(in, shift_mask); */ -#warning FIXME +// #warning FIXME +return lshift_m128(in,amount); } #ifdef __cplusplus @@ -440,7 +455,8 @@ m128 pshufb_m128(m128 a, m128 b) { btranslated is the version that is converted from Intel to NEON. */ //int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); //return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); - #warning FIXME + // #warning FIXME + return (m128) vec_max((int8x16_t)a, (int8x16_t)b); } static really_inline @@ -464,17 +480,19 @@ m128 sub_u8_m128(m128 a, m128 b) { } static really_inline -m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { +m128 set4x32(u32 UNUSED x3, u32 UNUSED x2, u32 UNUSED x1, u32 UNUSED x0) { //uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; - //return (m128) vld1q_u32((uint32_t *) data); - #warning FIXME + //return (m128) vec_splat_u32(data); + // #warning FIXME + return zeroes128(); } static really_inline -m128 set2x64(u64a hi, u64a lo) { +m128 set2x64(u64a UNUSED hi, u64a UNUSED lo) { //uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; - //return (m128) vld1q_u64((uint64_t *) data); - #warning FIXME + //return (m128) vec_splats(data); + // #warning FIXME + return zeroes128(); } #endif // ARCH_PPC64EL_SIMD_UTILS_H diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index d58297fe..f00b5b3d 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -37,6 +37,7 @@ #include "util/arch.h" #include "util/unaligned.h" #include "util/supervector/supervector.hpp" +#include // 128-bit Powerpc64le implementation @@ -57,7 +58,8 @@ template<> really_inline SuperVector<16>::SuperVector(int8_t const other) { //u.v128[0] = _mm_set1_epi8(other); - u.v128[0] = vec_splat_s8(other); + //u.v128[0] = vec_splat_s8(other); + std::cout< @@ -65,7 +67,8 @@ template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { //u.v128[0] = _mm_set1_epi8(static_cast(other)); - u.v128[0] = vec_splat_s8(static_cast(other)); + //u.v128[0] = vec_splat_s8(static_cast(other)); + std::cout< @@ -73,7 +76,8 @@ template<> really_inline SuperVector<16>::SuperVector(int16_t const other) { //u.v128[0] = _mm_set1_epi16(other); - u.v128[0] = vec_splat_s16(other); + //u.v128[0] = vec_splat_s16(other); + std::cout< @@ -81,7 +85,8 @@ template<> really_inline SuperVector<16>::SuperVector(uint16_t const other) { //u.v128[0] = _mm_set1_epi16(static_cast(other)); - u.v128[0] = vec_splat_s16(static_cast(other)); + //u.v128[0] = vec_splat_s16(static_cast(other)); + std::cout< @@ -89,7 +94,8 @@ template<> really_inline SuperVector<16>::SuperVector(int32_t const other) { //u.v128[0] = _mm_set1_epi32(other); - u.v128[0] = vec_splat_s32(other); + //u.v128[0] = vec_splat_s32(other); + std::cout< @@ -97,7 +103,8 @@ template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { //u.v128[0] = _mm_set1_epi32(static_cast(other)); - u.v128[0] = vec_splat_s32(static_cast(other)); + //u.v128[0] = vec_splat_s32(static_cast(other)); + std::cout< @@ -105,7 +112,8 @@ template<> really_inline SuperVector<16>::SuperVector(int64_t const other) { //u.v128[0] = _mm_set1_epi64x(other); - u.v128[0] = vec_splat_u64(other); + //u.v128[0] = vec_splat_u64(other); + std::cout< @@ -113,7 +121,8 @@ template<> really_inline SuperVector<16>::SuperVector(uint64_t const 
other) { //u.v128[0] = _mm_set1_epi64x(static_cast(other)); - u.v128[0] = vec_splat_u32(static_cast(other)); + //u.v128[0] = vec_splat_u32(static_cast(other)); + std::cout< really_inline SuperVector<16> SuperVector<16>::Ones(void) { //return {_mm_set1_epi8(0xFF)}; - return {vec_splat_s8(0xFF)}; + return {(m128) vec_splat_s8(1)}; } template<> really_inline SuperVector<16> SuperVector<16>::Zeroes(void) { //return {_mm_set1_epi8(0)}; - return {vec_splat_s8(0)}; +return {(m128) vec_splat_s8(0)}; } // Methods @@ -150,21 +159,22 @@ template <> really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const { //return {_mm_or_si128(u.v128[0], b.u.v128[0])}; - return {vec_or(u.v128[0], b.u.v128[0]);} + return {vec_or(u.v128[0], b.u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const { //return {_mm_xor_si128(u.v128[0], b.u.v128[0])}; - return {vec_xor(u.v128[0], b.u.v128[0]);} + return {vec_xor(u.v128[0], b.u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { //return {_mm_andnot_si128(u.v128[0], b.u.v128[0])}; - #warning FIXME + m128 and_res = vec_and(u.v128[0], b.u.v128[0]); + return vec_xor(and_res,and_res); } template <> @@ -187,7 +197,8 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( //uint16_t output; //vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); //return output; - #warning FIXME + //#warning FIXME + return 0; } template <> @@ -198,46 +209,55 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su template <> really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const -{ +{ + /* switch(N) { - case 1: return {vec_srl(u.v128[0], 1)}; break; - case 2: return {vec_srl(u.v128[0], 2)}; break; - case 3: return {vec_srl(u.v128[0], 3)}; break; - case 4: return {vec_srl(u.v128[0], 4)}; break; - case 5: return {vec_srl(u.v128[0], 5)}; break; - case 6: return {vec_srl(u.v128[0], 6)}; break; - case 7: return {vec_srl(u.v128[0], 7)}; break; - case 8: return {vec_srl(u.v128[0], 8)}; break; - case 9: return {vec_srl(u.v128[0], 9)}; break; - case 10: return {vec_srl(u.v128[0], 10)}; break; - case 11: return {vec_srl(u.v128[0], 11)}; break; - case 12: return {vec_srl(u.v128[0], 12)}; break; - case 13: return {vec_srl(u.v128[0], 13)}; break; - case 14: return {vec_srl(u.v128[0], 14)}; break; - case 15: return {vec_srl(u.v128[0], 15)}; break; + case 1: return {vec_srl(u.v128[0], Zeroes(), 1)}; break; + case 2: return {vec_srl(u.v128[0], Zeroes(), 2)}; break; + case 3: return {vec_srl(u.v128[0], Zeroes(),3)}; break; + case 4: return {vec_srl(u.v128[0], Zeroes(),4)}; break; + case 5: return {vec_srl(u.v128[0], Zeroes(),5)}; break; + case 6: return {vec_srl(u.v128[0], Zeroes(),6)}; break; + case 7: return {vec_srl(u.v128[0], Zeroes(),7)}; break; + case 8: return {vec_srl(u.v128[0], Zeroes(),8)}; break; + case 9: return {vec_srl(u.v128[0], Zeroes(),9)}; break; + case 10: return {vec_srl(u.v128[0], Zeroes(),10)}; break; + case 11: return {vec_srl(u.v128[0], Zeroes(),11)}; break; + case 12: return {vec_srl(u.v128[0], Zeroes(),12)}; break; + case 13: return {vec_srl(u.v128[0], Zeroes(),13)}; break; + case 14: return {vec_srl(u.v128[0], Zeroes(),14)}; break; + case 15: return {vec_srl(u.v128[0], Zeroes(),15)}; break; case 16: return Zeroes(); break; default: break; } return *this; + */ + std::cout< really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) 
const { - return {vec_srl(u.v128[0], N)}; + //return {vec_srl(u.v128[0], N)}; + std::cout< really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { - return rshift128_var(N); + //return rshift128_var(N); + std::cout< really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const -{ +{ + /* switch(N) { case 1: return {vec_sll(u.v128[0], 1)}; break; case 2: return {vec_sll(u.v128[0], 2)}; break; @@ -258,19 +278,26 @@ really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) co default: break; } return *this; + */ + std::cout< really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - return {vec_sll(u.v128[0], N)}; + //return {vec_sll(u.v128[0], N)}; + std::cout< really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - return lshift128_var(N); + //return lshift128_var(N); + std::cout< really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { //return _mm_loadu_si128((const m128 *)ptr); - #warning FIXME + //#warning FIXME + std::cout< @@ -289,7 +318,9 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) //return _mm_load_si128((const m128 *)ptr); //assert(ISALIGNED_N(ptr, alignof(m128))); //return vld1q_s32((const int32_t *)ptr); - #warning FIXME + //#warning FIXME + std::cout< @@ -300,7 +331,20 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint //SuperVector<16> v = vld1q_s32((const int32_t *)ptr); //v.print8("v"); //return mask & v; - #warning FIXME + //#warning FIXME + std::cout< +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + //return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; + //int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0],vdupq_n_s8(0x8f)); + //return (m128)vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated); + //#warning FIXM + return eq(b).movemask(); } #ifdef HS_OPTIMIZE @@ -308,7 +352,10 @@ template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { //return {vextq_s8(u.v128[0], other.u.v128[0], offset)}; - #warning FIXME + //#warning FIXME + std::cout< mask = Ones().rshift128_var(16 - 0); + return mask & pshufb(other); } #else template<> @@ -336,19 +383,13 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in } return *this; */ - #warning FIXME + //#warning FIXME + SuperVector<16> mask = Ones().rshift128_var(16 - 0); + std::cout< -really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) -{ - //return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; - //int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0],vdupq_n_s8(0x8f)); - //return (m128)vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated); - #warning FIXME -} - template<> really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) { @@ -361,12 +402,15 @@ template<> really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) { //return {vshlq_n_s64(u.v128[0], N)}; - return {vec_sldw((int64x2_t)u.v128[0], N, 8)}; + //return {vec_sldw((int64x2_t)u.v128[0], N, 8)}; + std::cout< really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) { + /* switch(N) { case 0: return *this; break; case 1: return {vec_sldw((int64x2_t)u.v128[0], 1, 8)}; break; @@ -388,6 +432,9 @@ really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) default: break; } return *this; + */ + std::cout< really_inline SuperVector<16> 
SuperVector<16>::rshift64(uint8_t const N) { //return {vshrq_n_s64(u.v128[0], N)}; - #warning FIXME + //#warning FIXME + std::cout< @@ -425,7 +474,9 @@ really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) } return *this; */ - #warning FIXME + //#warning FIXME + std::cout< SuperVector<16>::rshift128(uint8_t const N) { return *this >> N; } +#endif diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 76e167ce..4cd10144 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -38,6 +38,8 @@ #include "util/supervector/arch/x86/types.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/supervector/arch/arm/types.hpp" +#elif defined(ARCH_PPC64EL) +#include "util/supervector/arch/ppc64el/types.hpp" #endif #if defined(HAVE_SIMD_512_BITS) @@ -353,6 +355,8 @@ struct Unroller #include "util/supervector/arch/x86/impl.cpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/supervector/arch/arm/impl.cpp" +#elif defined(ARCH_PPC64EL) +#include "util/supervector/arch/ppc64el/impl.cpp" #endif #endif From 7888dd44180d7be46f6906f38cafd2a9ca0a002f Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Thu, 14 Oct 2021 10:33:10 +0000 Subject: [PATCH 07/37] WIP: Power VSX support almost completed --- src/util/arch/ppc64el/simd_utils.h | 266 +++++++-------- src/util/supervector/arch/ppc64el/impl.cpp | 366 +++++++-------------- unit/internal/simd_utils.cpp | 3 + 3 files changed, 254 insertions(+), 381 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 3f8fdf73..89f381d5 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,8 +52,8 @@ typedef __vector int16_t int16x8_t; typedef __vector uint8_t uint8x16_t; typedef __vector int8_t int8x16_t; + static really_inline m128 ones128(void) { - // the value in function must be a signed literal in range -16 to 15 return (m128) vec_splat_s8(-1); } @@ -80,14 +81,15 @@ static really_inline int isnonzero128(m128 a) { * mask indicating which 32-bit words contain differences. 
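 * As a worked example of the VSX sequence below: vec_cmpeq() sets a 32-bit lane to
 * all-ones where a and b match, not128() flips that so the differing lanes become
 * all-ones, the AND against the {1, 2, 4, 8} weights reduces each differing lane to
 * its bit value, and vec_sums() accumulates those weights into element 3. If only
 * lanes 0 and 2 differ, the returned mask is therefore 1 + 4 = 5.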
*/ static really_inline u32 diffrich128(m128 a, m128 b) { - static const m128 movemask = { 1, 2, 4, 8 }; - m128 mask = (m128) vec_cmpeq(a, b); - mask = vec_and(vec_xor(mask, mask), movemask); - m128 sum = vec_sums(mask, zeroes128()); - sum = vec_sld(zeroes128(), sum, 4); - s32 ALIGN_ATTR(16) x; - vec_ste(sum, 0, &x); - return x; + static const m128 movemask = { 1, 2, 4, 8 }; + m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b); + mask = vec_and(not128(mask), movemask); + m128 sum = vec_sums(mask, zeroes128()); + //sum = vec_sld(zeroes128(), sum, 4); + //s32 ALIGN_ATTR(16) x; + //vec_ste(sum, 0, &x); + //return x; // it could be ~(movemask_128(mask)) & 0x; + return sum[3]; } /** @@ -97,12 +99,13 @@ static really_inline u32 diffrich128(m128 a, m128 b) { static really_inline u32 diffrich64_128(m128 a, m128 b) { static const uint64x2_t movemask = { 1, 4 }; uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); - mask = vec_and(vec_xor(mask, mask), movemask); + mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask); m128 sum = vec_sums((m128)mask, zeroes128()); - sum = vec_sld(zeroes128(), sum, 4); - s32 ALIGN_ATTR(16) x; - vec_ste(sum, 0, &x); - return x; + //sum = vec_sld(zeroes128(), sum, 4); + //s32 ALIGN_ATTR(16) x; + //vec_ste(sum, 0, &x); + //return x; + return sum[3]; } static really_really_inline @@ -116,32 +119,59 @@ m128 sub_2x64(m128 a, m128 b) { } static really_really_inline -m128 lshift_m128(m128 a, unsigned UNUSED b) { - // #warning FIXME - // b must be 4 bit literal - return (m128) vec_sld(a, zeroes128(), 0); +m128 lshift_m128(m128 a, unsigned b) { + switch(b){ + case 1: return vec_sld(a, zeroes128(), 1); break; + case 2: return vec_sld(a, zeroes128(), 2); break; + case 3: return vec_sld(a, zeroes128(), 3); break; + case 4: return vec_sld(a, zeroes128(), 4); break; + case 5: return vec_sld(a, zeroes128(), 5); break; + case 6: return vec_sld(a, zeroes128(), 6); break; + case 7: return vec_sld(a, zeroes128(), 7); break; + case 8: return vec_sld(a, zeroes128(), 8); break; + case 9: return vec_sld(a, zeroes128(), 9); break; + case 10: return vec_sld(a, zeroes128(), 10); break; + case 11: return vec_sld(a, zeroes128(), 11); break; + case 12: return vec_sld(a, zeroes128(), 12); break; + case 13: return vec_sld(a, zeroes128(), 13); break; + case 14: return vec_sld(a, zeroes128(), 14); break; + case 15: return vec_sld(a, zeroes128(), 15); break; + } + return a; } static really_really_inline -m128 rshift_m128(m128 a, unsigned UNUSED b) { - // #warning FIXME - // b must be 4 bit literal - return (m128) vec_sld(zeroes128(), a, 0 - 0); +m128 rshift_m128(m128 a, unsigned b) { + switch(b){ + case 1: return vec_sld(zeroes128(), a, 15); break; + case 2: return vec_sld(zeroes128(), a, 14); break; + case 3: return vec_sld(zeroes128(), a, 13); break; + case 4: return vec_sld(zeroes128(), a, 12); break; + case 5: return vec_sld(zeroes128(), a, 11); break; + case 6: return vec_sld(zeroes128(), a, 10); break; + case 7: return vec_sld(zeroes128(), a, 9); break; + case 8: return vec_sld(zeroes128(), a, 8); break; + case 9: return vec_sld(zeroes128(), a, 7); break; + case 10: return vec_sld(zeroes128(), a, 6); break; + case 11: return vec_sld(zeroes128(), a, 5); break; + case 12: return vec_sld(zeroes128(), a, 4); break; + case 13: return vec_sld(zeroes128(), a, 3); break; + case 14: return vec_sld(zeroes128(), a, 2); break; + case 15: return vec_sld(zeroes128(), a, 1); break; + } + return a; } static really_really_inline -m128 lshift64_m128(m128 a, unsigned 
UNUSED b) { - // #warnint FIXME - // b must be 4 bit literal - return (m128) vec_sld(zeroes128(), a, 0); - +m128 lshift64_m128(m128 a, unsigned b) { + uint64x2_t shift_indices = vec_splats((uint64_t)b); + return (m128) vec_sl((int64x2_t)a, shift_indices); } static really_really_inline -m128 rshift64_m128(m128 a, unsigned UNUSED b) { - // warnint FIXME - // b must be 4 bit literal - return (m128) vec_sld(zeroes128(), a, 0); +m128 rshift64_m128(m128 a, unsigned b) { + uint64x2_t shift_indices = vec_splats((uint64_t)b); + return (m128) vec_sr((int64x2_t)a, shift_indices); } static really_inline m128 eq128(m128 a, m128 b) { @@ -149,39 +179,36 @@ static really_inline m128 eq128(m128 a, m128 b) { } static really_inline m128 eq64_m128(m128 a, m128 b) { - return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); + return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); } static really_inline u32 movemask128(m128 a) { - //static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7)); + uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); + uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); + uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - // Compute the mask from the input - //uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); - //uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); - //mask = vorrq_u8(mask, mask1); + uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14)); + uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); + uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - // Get the resulting bytes - //uint16_t output; - //vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); - //return output; - // #warning FIXME - return !!diff128(a, zeroes128()); + uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); + uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); + uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); + + uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); + uint64x2_t res_and4 = vec_and((uint64x2_t)ss4, vec_splats((uint64_t)0xff)); + uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); + + return s5[0]; } -static really_inline m128 set1_16x8(u8 UNUSED c) { - // warning FIXME - // c must be 5 bit literal - // a solution is to use vec_splats - //return (m128) vec_splat_u8(0); +static really_inline m128 set1_16x8(u8 c) { return (m128) vec_splats(c); } -static really_inline m128 set1_4x32(u32 UNUSED c) { - // warning FIXME - // c must be 5 bit literal - // a solution is to use vec_splats - // return (m128) vec_splat_u32(0); +static really_inline m128 set1_4x32(u32 c) { return (m128) vec_splats(c); } @@ -196,15 +223,15 @@ static really_inline u32 movd(const m128 in) { } static really_inline u64a movq(const m128 in) { - //return vgetq_lane_u64((uint64x2_t) in, 0); - return !!diff128(in, zeroes128()); - // #warning FIXME + u64a ALIGN_ATTR(16) a[2]; + vec_xst((uint64x2_t) in, 0, a); + return a[0]; } /* another form of movq */ static really_inline m128 load_m128_from_u64a(const u64a *p) { - return (m128) vec_ld(0,p); + return (m128) vec_ld(0, p); } @@ -236,8 +263,8 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { return vec_any_ne(in,lshift_m128(in,imm)); } -static really_inline u64a extract64from128(const m128 in, unsigned imm) { -/* +static really_inline u64a extract64from128(const m128 UNUSED in, unsigned UNUSED imm) { +/* is this 
#if defined(HS_OPTIMIZE) return vgetq_lane_u64((uint64x2_t) in, imm); #else @@ -253,21 +280,32 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) { break; } #endif -*/ -// #warning FIXME -return vec_any_ne(in,lshift_m128(in,imm)); +*/ + /* + u64a ALIGN_ATTR(16) a[2]; + vec_xst((uint64x2_t) in, 0, a); + switch(imm) { + case 0: return a[0]; break; + case 1: return a[1]; break; + default: return 0; break; + } + */ +return 0; + } static really_inline m128 low64from128(const m128 in) { - //return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); - // #warning FIXME - return in; + //u64a ALIGN_ATTR(16) a[2]; + //vec_xst((uint64x2_t) in, 0, a); + //return a[1]; + return vec_add(in, in); } static really_inline m128 high64from128(const m128 in) { - //return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); - // #warning FIXME - return in; + //u64a ALIGN_ATTR(16) a[2]; + //vec_xst((uint64x2_t) in, 0, a); + //return a[0]; + return vec_add(in, in); } @@ -288,37 +326,29 @@ static really_inline m128 or128(m128 a, m128 b) { } static really_inline m128 andnot128(m128 a, m128 b) { - m128 and_res = and128(a,b); - return (m128) not128(and_res); - // or - //return (m128) not128(and128(a,b)); + return (m128) and128(not128(a),b); } // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); - //return (m128) vec_ld(0, ptr); - // #warning FIXME - return zeroes128(); + return (m128) vec_xl(0, (const int32_t*)ptr); } // aligned store -static really_inline void store128(void *ptr, m128 UNUSED a) { +static really_inline void store128(void *ptr, m128 a) { assert(ISALIGNED_N(ptr, alignof(m128))); - //vec_st(a, 0, ptr); - // warning FIXME + vec_st(a, 0, (int32_t*)ptr); } // unaligned load -static really_inline m128 loadu128(const void UNUSED *ptr) { - //return (m128) vec_ld(0, ptr); - // #warning FIXME - return zeroes128(); +static really_inline m128 loadu128(const void *ptr) { + return (m128) vec_xl(0, (const int64_t*)ptr); } // unaligned store -static really_inline void storeu128(void UNUSED *ptr, m128 UNUSED a) { - // #warning FIXME +static really_inline void storeu128(void *ptr, m128 a) { + vec_st(a, 0, (int32_t*)ptr); } // packed unaligned store of first N bytes @@ -338,11 +368,10 @@ m128 loadbytes128(const void *ptr, unsigned int n) { } -//#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break; static really_really_inline m128 palignr_imm(m128 r, m128 l, int offset) { - /* switch (offset) { case 0: return l; break; CASE_ALIGN_VECTORS(l, r, 1); @@ -361,56 +390,39 @@ m128 palignr_imm(m128 r, m128 l, int offset) { CASE_ALIGN_VECTORS(l, r, 14); CASE_ALIGN_VECTORS(l, r, 15); case 16: return r; break; - default: - return zeroes128(); - break; - } - */ - // #warning FIXME - return (m128) vec_cmpeq(r,lshift_m128(l,offset)); + default: return zeroes128(); break; + } } static really_really_inline m128 palignr(m128 r, m128 l, int offset) { -/* #if defined(HS_OPTIMIZE) - return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); + return (m128)vec_sld((int8x16_t)l, (int8x16_t)r, offset); #else return palignr_imm(r, l, offset); #endif -*/ -// #warning FIXME -return (m128) vec_cmpeq(r, lshift_m128(l,offset)); } #undef CASE_ALIGN_VECTORS static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - // #warning FIXME - // return vec_sro(a,b); return 
rshift_m128(a,b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - //#warning FIXME - //return vec_slo(a,b); - return lshift_m128(a,b); + return lshift_m128(a,b); } static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { -/* assert(amount >= -16 && amount <= 16); - static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; - const uint8x16_t outside_mask = set1_16x8(0xf0); - - m128 shift_mask = palignr_imm(vbs_mask, outside_mask, 16 - amount); - return vqtbl1q_s8(in, shift_mask); -*/ -// #warning FIXME -return lshift_m128(in,amount); + if (amount < 0){ + return palignr_imm(zeroes128(), in, -amount); + } else{ + return palignr_imm(in, zeroes128(), 16 - amount); + } } #ifdef __cplusplus @@ -450,28 +462,22 @@ char testbit128(m128 val, unsigned int n) { static really_inline m128 pshufb_m128(m128 a, m128 b) { - /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. - In NEON, if >=16, then the result is zero, otherwise it is that lane. - btranslated is the version that is converted from Intel to NEON. */ - //int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); - //return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); - // #warning FIXME - return (m128) vec_max((int8x16_t)a, (int8x16_t)b); + return (m128) vec_permxor((int8x16_t)vec_splat_s8(0), (int8x16_t)a, (int8x16_t)b); } static really_inline m128 max_u8_m128(m128 a, m128 b) { - return (m128) vec_max((int8x16_t)a, (int8x16_t)b); + return (m128) vec_max((uint8x16_t)a, (uint8x16_t)b); } static really_inline m128 min_u8_m128(m128 a, m128 b) { - return (m128) vec_min((int8x16_t)a, (int8x16_t)b); + return (m128) vec_min((uint8x16_t)a, (uint8x16_t)b); } static really_inline m128 sadd_u8_m128(m128 a, m128 b) { - return (m128) vec_add((uint8x16_t)a, (uint8x16_t)b); + return (m128) vec_adds((uint8x16_t)a, (uint8x16_t)b); } static really_inline @@ -480,19 +486,15 @@ m128 sub_u8_m128(m128 a, m128 b) { } static really_inline -m128 set4x32(u32 UNUSED x3, u32 UNUSED x2, u32 UNUSED x1, u32 UNUSED x0) { - //uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; - //return (m128) vec_splat_u32(data); - // #warning FIXME - return zeroes128(); +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32x4_t v = { x0, x1, x2, x3 }; + return (m128) v; } static really_inline -m128 set2x64(u64a UNUSED hi, u64a UNUSED lo) { - //uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; - //return (m128) vec_splats(data); - // #warning FIXME - return zeroes128(); +m128 set2x64(u64a hi, u64a lo) { + uint64x2_t v = { lo, hi }; + return (m128) v; } #endif // ARCH_PPC64EL_SIMD_UTILS_H diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index f00b5b3d..b3562f75 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -39,8 +39,24 @@ #include "util/supervector/supervector.hpp" #include + +typedef __vector uint64_t uint64x2_t; +typedef __vector int64_t int64x2_t; +typedef __vector uint32_t uint32x4_t; +typedef __vector int32_t int32x4_t; +typedef __vector uint16_t uint16x8_t; +typedef __vector int16_t int16x8_t; +typedef __vector uint8_t uint8x16_t; +typedef __vector int8_t int8x16_t; + // 128-bit Powerpc64le implementation +union Tmp +{ + uint32_t u32; + uint16_t u16[2]; +}; + template<> really_inline SuperVector<16>::SuperVector(SuperVector const &other) { @@ -57,87 +73,69 @@ template<> template<> really_inline 
SuperVector<16>::SuperVector(int8_t const other) { - //u.v128[0] = _mm_set1_epi8(other); - //u.v128[0] = vec_splat_s8(other); - std::cout< template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { - //u.v128[0] = _mm_set1_epi8(static_cast(other)); - //u.v128[0] = vec_splat_s8(static_cast(other)); - std::cout<(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int16_t const other) { - //u.v128[0] = _mm_set1_epi16(other); - //u.v128[0] = vec_splat_s16(other); - std::cout< template<> really_inline SuperVector<16>::SuperVector(uint16_t const other) { - //u.v128[0] = _mm_set1_epi16(static_cast(other)); - //u.v128[0] = vec_splat_s16(static_cast(other)); - std::cout<(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int32_t const other) { - //u.v128[0] = _mm_set1_epi32(other); - //u.v128[0] = vec_splat_s32(other); - std::cout< template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { - //u.v128[0] = _mm_set1_epi32(static_cast(other)); - //u.v128[0] = vec_splat_s32(static_cast(other)); - std::cout<(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int64_t const other) { - //u.v128[0] = _mm_set1_epi64x(other); - //u.v128[0] = vec_splat_u64(other); - std::cout< template<> really_inline SuperVector<16>::SuperVector(uint64_t const other) { - //u.v128[0] = _mm_set1_epi64x(static_cast(other)); - //u.v128[0] = vec_splat_u32(static_cast(other)); - std::cout<(other)); } // Constants template<> really_inline SuperVector<16> SuperVector<16>::Ones(void) { - //return {_mm_set1_epi8(0xFF)}; - return {(m128) vec_splat_s8(1)}; + return {(m128) vec_splat_s8(-1)}; } template<> really_inline SuperVector<16> SuperVector<16>::Zeroes(void) { - //return {_mm_set1_epi8(0)}; -return {(m128) vec_splat_s8(0)}; + return {(m128) vec_splat_s8(0)}; } // Methods @@ -151,28 +149,24 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) template <> really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const { - //return {_mm_and_si128(u.v128[0], b.u.v128[0])}; return {vec_and(u.v128[0], b.u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const { - //return {_mm_or_si128(u.v128[0], b.u.v128[0])}; return {vec_or(u.v128[0], b.u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const { - //return {_mm_xor_si128(u.v128[0], b.u.v128[0])}; - return {vec_xor(u.v128[0], b.u.v128[0])}; + return {(m128) vec_xor(u.v128[0], b.u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { - //return {_mm_andnot_si128(u.v128[0], b.u.v128[0])}; m128 and_res = vec_and(u.v128[0], b.u.v128[0]); return vec_xor(and_res,and_res); } @@ -180,215 +174,156 @@ really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b template <> really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const { - //return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])}; - return { vec_all_eq(u.v128[0], b.u.v128[0])}; + return {(m128) vec_cmpeq((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])}; } template <> really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const -{ - //return _mm_movemask_epi8(u.v128[0]); - // Compute the mask from the input - //uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)u.v128[0], 0)))); - //uint64x2_t mask1 = 
(m128)vextq_s8(mask, Zeroes(), 7); - //mask = vorrq_u8(mask, mask1); +{ + uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); + uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); + uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); + uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - // Get the resulting bytes - //uint16_t output; - //vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0); - //return output; - //#warning FIXME - return 0; + uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14)); + uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); + uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); + + uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); + uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); + uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); + + uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9); + uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); + uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); + + return s5[0]; } template <> really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const { - return eq(b).movemask(); + return eq(b).movemask(); } template <> really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const -{ - /* +{ switch(N) { - case 1: return {vec_srl(u.v128[0], Zeroes(), 1)}; break; - case 2: return {vec_srl(u.v128[0], Zeroes(), 2)}; break; - case 3: return {vec_srl(u.v128[0], Zeroes(),3)}; break; - case 4: return {vec_srl(u.v128[0], Zeroes(),4)}; break; - case 5: return {vec_srl(u.v128[0], Zeroes(),5)}; break; - case 6: return {vec_srl(u.v128[0], Zeroes(),6)}; break; - case 7: return {vec_srl(u.v128[0], Zeroes(),7)}; break; - case 8: return {vec_srl(u.v128[0], Zeroes(),8)}; break; - case 9: return {vec_srl(u.v128[0], Zeroes(),9)}; break; - case 10: return {vec_srl(u.v128[0], Zeroes(),10)}; break; - case 11: return {vec_srl(u.v128[0], Zeroes(),11)}; break; - case 12: return {vec_srl(u.v128[0], Zeroes(),12)}; break; - case 13: return {vec_srl(u.v128[0], Zeroes(),13)}; break; - case 14: return {vec_srl(u.v128[0], Zeroes(),14)}; break; - case 15: return {vec_srl(u.v128[0], Zeroes(),15)}; break; + case 1: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 15)}; break; + case 2: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 14)}; break; + case 3: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 13)}; break; + case 4: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 12)}; break; + case 5: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 11)}; break; + case 6: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 10)}; break; + case 7: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 9)}; break; + case 8: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 8)}; break; + case 9: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 7)}; break; + case 10: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 6)}; break; + case 11: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 5)}; break; + case 12: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 4)}; break; + case 13: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 3)}; break; + 
case 14: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 2)}; break; + case 15: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 1)}; break; case 16: return Zeroes(); break; default: break; } return *this; - */ - std::cout< really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { - //return {vec_srl(u.v128[0], N)}; - std::cout< -really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const -{ - //return rshift128_var(N); - std::cout< really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const { - /* switch(N) { - case 1: return {vec_sll(u.v128[0], 1)}; break; - case 2: return {vec_sll(u.v128[0], 2)}; break; - case 3: return {vec_sll(u.v128[0], 3)}; break; - case 4: return {vec_sll(u.v128[0], 4)}; break; - case 5: return {vec_sll(u.v128[0], 5)}; break; - case 6: return {vec_sll(u.v128[0], 6)}; break; - case 7: return {vec_sll(u.v128[0], 7)}; break; - case 8: return {vec_sll(u.v128[0], 8)}; break; - case 9: return {vec_sll(u.v128[0], 9)}; break; - case 10: return {vec_sll(u.v128[0], 10)}; break; - case 11: return {vec_sll(u.v128[0], 11)}; break; - case 12: return {vec_sll(u.v128[0], 12)}; break; - case 13: return {vec_sll(u.v128[0], 13)}; break; - case 14: return {vec_sll(u.v128[0], 14)}; break; - case 15: return {vec_sll(u.v128[0], 15)}; break; + case 1: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 1)}; break; + case 2: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 2)}; break; + case 3: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 3)}; break; + case 4: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 4)}; break; + case 5: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 5)}; break; + case 6: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 6)}; break; + case 7: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 7)}; break; + case 8: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 8)}; break; + case 9: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 9)}; break; + case 10: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 10)}; break; + case 11: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 11)}; break; + case 12: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 12)}; break; + case 13: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 13)}; break; + case 14: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 14)}; break; + case 15: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 15)}; break; case 16: return Zeroes(); break; default: break; } return *this; - */ - std::cout< really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - //return {vec_sll(u.v128[0], N)}; - std::cout< -really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const -{ - //return lshift128_var(N); - std::cout< really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { - //return _mm_loadu_si128((const m128 *)ptr); - //#warning FIXME - std::cout< really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) { - //assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); - //ptr = assume_aligned(ptr, SuperVector::size); - //return 
_mm_load_si128((const m128 *)ptr); - //assert(ISALIGNED_N(ptr, alignof(m128))); - //return vld1q_s32((const int32_t *)ptr); - //#warning FIXME - std::cout< really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { - //SuperVector<16> mask = Ones().rshift128_var(16 -len); - //mask.print8("mask"); - //SuperVector<16> v = vld1q_s32((const int32_t *)ptr); - //v.print8("v"); - //return mask & v; - //#warning FIXME - std::cout< mask = Ones().rshift128_var(16 -len); + mask.print8("mask"); + SuperVector<16> v = loadu(ptr); + v.print8("v"); + return mask & v; } template<> really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) { - //return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; - //int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0],vdupq_n_s8(0x8f)); - //return (m128)vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated); - //#warning FIXM - return eq(b).movemask(); + return (m128) vec_permxor((int8x16_t)vec_splat_s8(0), (int8x16_t)u.v128[0], (int8x16_t) b.u.v128[0]); } -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) -{ - //return {vextq_s8(u.v128[0], other.u.v128[0], offset)}; - //#warning FIXME - std::cout< mask = Ones().rshift128_var(16 - 0); - return mask & pshufb(other); -} -#else template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { - /* + switch(offset) { case 0: return other; break; - case 1: return {vextq_s8(u.v128[0], other.u.v128[0], 1)}; break; - case 2: return {vextq_s8(u.v128[0], other.u.v128[0], 2)}; break; - case 3: return {vextq_s8(u.v128[0], other.u.v128[0], 3)}; break; - case 4: return {vextq_s8(u.v128[0], other.u.v128[0], 4)}; break; - case 5: return {vextq_s8(u.v128[0], other.u.v128[0], 5)}; break; - case 6: return {vextq_s8(u.v128[0], other.u.v128[0], 6)}; break; - case 7: return {vextq_s8(u.v128[0], other.u.v128[0], 7)}; break; - case 8: return {vextq_s8(u.v128[0], other.u.v128[0], 8)}; break; - case 9: return {vextq_s8(u.v128[0], other.u.v128[0], 9)}; break; - case 10: return {vextq_s8(u.v128[0], other.u.v128[0], 10)}; break; - case 11: return {vextq_s8(u.v128[0], other.u.v128[0], 11)}; break; - case 12: return {vextq_s8(u.v128[0], other.u.v128[0], 12)}; break; - case 13: return {vextq_s8(u.v128[0], other.u.v128[0], 13)}; break; - case 14: return {vextq_s8(u.v128[0], other.u.v128[0], 14)}; break; - case 15: return {vextq_s8(u.v128[0], other.u.v128[0], 15)}; break; + case 1: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 15)}; break; + case 2: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 14)}; break; + case 3: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 13)}; break; + case 4: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 12)}; break; + case 5: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 11)}; break; + case 6: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 10)}; break; + case 7: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 9)}; break; + case 8: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 8)}; break; + case 9: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 7)}; break; + case 10: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 6)}; break; + case 11: return {(m128) vec_sld((int8x16_t) u.v128[0], 
(int8x16_t) other.u.v128[0], 5)}; break; + case 12: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 4)}; break; + case 13: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 3)}; break; + case 14: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 2)}; break; + case 15: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 1)}; break; default: break; } return *this; - */ - //#warning FIXME - SuperVector<16> mask = Ones().rshift128_var(16 - 0); - std::cout< really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) @@ -397,88 +332,21 @@ really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, u return mask & pshufb(b); } -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) -{ - //return {vshlq_n_s64(u.v128[0], N)}; - //return {vec_sldw((int64x2_t)u.v128[0], N, 8)}; - std::cout< -really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) -{ - /* - switch(N) { - case 0: return *this; break; - case 1: return {vec_sldw((int64x2_t)u.v128[0], 1, 8)}; break; - case 2: return {vec_sldw((int64x2_t)u.v128[0], 2, 8)}; break; - case 3: return {vec_sldw((int64x2_t)u.v128[0], 3, 8)}; break; - case 4: return {vec_sldw((int64x2_t)u.v128[0], 4, 8)}; break; - case 5: return {vec_sldw((int64x2_t)u.v128[0], 5, 8)}; break; - case 6: return {vec_sldw((int64x2_t)u.v128[0], 6, 8)}; break; - case 7: return {vec_sldw((int64x2_t)u.v128[0], 7, 8)}; break; - case 8: return {vec_sldw((int64x2_t)u.v128[0], 8, 8)}; break; - case 9: return {vec_sldw((int64x2_t)u.v128[0], 9, 8)}; break; - case 10: return {vec_sldw((int64x2_t)u.v128[0], 10, 8)}; break; - case 11: return {vec_sldw((int64x2_t)u.v128[0], 11, 8)}; break; - case 12: return {vec_sldw((int64x2_t)u.v128[0], 12, 8)}; break; - case 13: return {vec_sldw((int64x2_t)u.v128[0], 13, 8)}; break; - case 14: return {vec_sldw((int64x2_t)u.v128[0], 14, 8)}; break; - case 15: return {vec_sldw((int64x2_t)u.v128[0], 15, 8)}; break; - case 16: return Zeroes(); - default: break; - } - return *this; - */ - std::cout< +really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) +{ + uint64x2_t shift_indices = vec_splats((uint64_t)N); + return (m128) vec_sl((int64x2_t)u.v128[0] , shift_indices); +} + + template<> really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) { - //return {vshrq_n_s64(u.v128[0], N)}; - //#warning FIXME - std::cout< -really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) -{ - /* - switch(N) { - case 0: return {vshrq_n_s64(u.v128[0], 0)}; break; - case 1: return {vshrq_n_s64(u.v128[0], 1)}; break; - case 2: return {vshrq_n_s64(u.v128[0], 2)}; break; - case 3: return {vshrq_n_s64(u.v128[0], 3)}; break; - case 4: return {vshrq_n_s64(u.v128[0], 4)}; break; - case 5: return {vshrq_n_s64(u.v128[0], 5)}; break; - case 6: return {vshrq_n_s64(u.v128[0], 6)}; break; - case 7: return {vshrq_n_s64(u.v128[0], 7)}; break; - case 8: return {vshrq_n_s64(u.v128[0], 8)}; break; - case 9: return {vshrq_n_s64(u.v128[0], 9)}; break; - case 10: return {vshrq_n_s64(u.v128[0], 10)}; break; - case 11: return {vshrq_n_s64(u.v128[0], 11)}; break; - case 12: return {vshrq_n_s64(u.v128[0], 12)}; break; - case 13: return {vshrq_n_s64(u.v128[0], 13)}; break; - case 14: return {vshrq_n_s64(u.v128[0], 14)}; break; - case 15: return {vshrq_n_s64(u.v128[0], 15)}; break; - case 16: return Zeroes(); - default: break; - } - return 
*this; - */ - //#warning FIXME - std::cout< really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 2a9accae..d66db7e2 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -668,6 +668,9 @@ TEST(SimdUtilsTest, movq) { #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) int64x2_t a = { 0x123456789abcdefLL, ~0LL }; simd = vreinterpretq_s64_s8(a); +#elif defined(ARCH_PPC64EL) + int64x2_t a = {0x123456789abcdefLL, ~0LL }; + simd = (m128) a; #endif #endif r = movq(simd); From 4d2acd59e262931608d5746c0f600457e1a751f7 Mon Sep 17 00:00:00 2001 From: apostolos Date: Thu, 14 Oct 2021 15:08:23 +0300 Subject: [PATCH 08/37] Supervector vsh* added --- src/util/supervector/arch/ppc64el/impl.cpp | 344 +++++++++++++++++++-- 1 file changed, 323 insertions(+), 21 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index b3562f75..478a195f 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -51,12 +51,6 @@ typedef __vector int8_t int8x16_t; // 128-bit Powerpc64le implementation -union Tmp -{ - uint32_t u32; - uint16_t u16[2]; -}; - template<> really_inline SuperVector<16>::SuperVector(SuperVector const &other) { @@ -164,17 +158,71 @@ really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const & return {(m128) vec_xor(u.v128[0], b.u.v128[0])}; } +template <> +really_inline SuperVector<16> SuperVector<16>::operator!() const +{ + return {(m128) vec_xor(u.v128[0], u.v128[0])}; +} + template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { - m128 and_res = vec_and(u.v128[0], b.u.v128[0]); - return vec_xor(and_res,and_res); + //m128 and_res = vec_and(u.v128[0], b.u.v128[0]); + //return vec_xor(and_res,and_res); + return vec_xor(vec_and(u.v128[0], b.u.v128[0]), vec_and(u.v128[0], b.u.v128[0])); } + +template <> +really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const +{ + return {(m128) vec_cmpeq((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const +{ + return !(*this == b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const UNUSED &b) const +{ + //return {vcgtq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const UNUSED &b) const +{ + //return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const UNUSED &b) const +{ + //return {vcltq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const UNUSED &b) const +{ + //return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; + // #warning FIXME + return Zeroes(); +} + + template <> really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const { - return {(m128) vec_cmpeq((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])}; + return (*this == b); + //return {(m128) vec_cmpeq((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])}; } template <> @@ -206,9 +254,264 @@ 
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su return eq(b).movemask(); } + template <> -really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const -{ +template +really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const +{ + //return {(m128)vshlq_n_s8(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const +{ + //return {(m128)vshlq_n_s16(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const +{ + //return {(m128)vshlq_n_s32(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const +{ + //return {(m128)vshlq_n_s64(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const +{ + //return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_imm() const +{ + //return vshl_128_imm(); + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const +{ + //return {(m128)vshrq_n_s8(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const +{ + //return {(m128)vshrq_n_s16(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const +{ + //return {(m128)vshrq_n_s32(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const +{ + //return {(m128)vshrq_n_s64(u.v128[0], N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const +{ + //return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; + // #warning FIXME + return Zeroes(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_imm() const +{ + return vshr_128_imm(); +} + +#if !defined(HS_OPTIMIZE) +template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; +#endif + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const UNUSED N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == 
n) result = {(m128)vshlq_n_s8(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s16(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const UNUSED N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s32(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const UNUSED N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s64(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const UNUSED N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const +{ + return vshl_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s8(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s16(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s32(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const +{ + //if (N == 0) return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s64(u.v128[0], n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const +{ + //if (N == 0) 
return *this; + //if (N == 16) return Zeroes(); + //SuperVector result; + //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), n)}; }); + //return result; + // #warning FIXME + return Zeroes(); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ switch(N) { case 1: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 15)}; break; case 2: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 14)}; break; @@ -232,14 +535,8 @@ really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) co } template <> -really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - return rshift128_var(N); -} - -template <> -really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const -{ switch(N) { case 1: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 1)}; break; case 2: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 2)}; break; @@ -262,12 +559,17 @@ really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) co return *this; } -template <> -really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) { - return lshift128_var(N); + return Ones().vshr_128(N); } +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) +{ + return Ones().vshl_128(N); +} template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) From d0a41252c8851c2bbe2d0759a8a9de3d4b281e0c Mon Sep 17 00:00:00 2001 From: apostolos Date: Thu, 14 Oct 2021 15:56:13 +0300 Subject: [PATCH 09/37] blockSigleMask implementations for ARCH_PPC64 added --- src/nfa/shufti_simd.hpp | 2 ++ src/nfa/truffle_simd.hpp | 2 ++ src/util/supervector/arch/ppc64el/impl.cpp | 37 +++++++++++++++------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp index e7f3f6c9..83ab428b 100644 --- a/src/nfa/shufti_simd.hpp +++ b/src/nfa/shufti_simd.hpp @@ -56,6 +56,8 @@ SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, #include "x86/shufti.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "arm/shufti.hpp" +#elif defined(ARCH_PPC64EL) +#include "ppc64el/shufti.hpp" #endif template diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index 8d61722b..b3a82266 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -49,6 +49,8 @@ const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, Supe #include "x86/truffle.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "arm/truffle.hpp" +#elif defined(ARCH_PPC64EL) +#include "ppc64el/truffle.hpp" #endif template diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 478a195f..89fe89c6 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -444,7 +444,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t 
const N) const +really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const UNUSED N) const { //if (N == 0) return *this; //if (N == 16) return Zeroes(); @@ -456,7 +456,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const UNUSED N) const { //if (N == 0) return *this; //if (N == 16) return Zeroes(); @@ -468,7 +468,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const UNUSED N) const { //if (N == 0) return *this; //if (N == 16) return Zeroes(); @@ -480,7 +480,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const UNUSED N) const { //if (N == 0) return *this; //if (N == 16) return Zeroes(); @@ -492,7 +492,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const { //if (N == 0) return *this; //if (N == 16) return Zeroes(); @@ -594,12 +594,6 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint return mask & v; } -template<> -really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) -{ - return (m128) vec_permxor((int8x16_t)vec_splat_s8(0), (int8x16_t)u.v128[0], (int8x16_t) b.u.v128[0]); -} - template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { @@ -626,6 +620,24 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in return *this; } +template<> +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + return (m128) vec_permxor((int8x16_t)vec_splat_s8(0), (int8x16_t)u.v128[0], (int8x16_t) b.u.v128[0]); +} + +template<> +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to NEON. 
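+       The same translation is reused for VSX below: b is masked with 0x8f,
+       which keeps the low-nibble lane index plus the 0x80 bit, before it is
+       passed to the plain pshufb() overload above.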
*/ + SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f); + return pshufb(btranslated); +} + template<> really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) @@ -635,6 +647,8 @@ really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, u } + +/* template<> really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) { @@ -661,4 +675,5 @@ really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N) { return *this >> N; } +*/ #endif From ba4472a61cff35659f29776e6999e13285a7a3a2 Mon Sep 17 00:00:00 2001 From: apostolos Date: Thu, 14 Oct 2021 16:01:21 +0300 Subject: [PATCH 10/37] trufle and shufle implementations for ARCH_PPC64EL --- src/nfa/ppc64el/shufti.hpp | 76 +++++++++++++++++++++++++++++++++++++ src/nfa/ppc64el/truffle.hpp | 62 ++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 src/nfa/ppc64el/shufti.hpp create mode 100644 src/nfa/ppc64el/truffle.hpp diff --git a/src/nfa/ppc64el/shufti.hpp b/src/nfa/ppc64el/shufti.hpp new file mode 100644 index 00000000..76461175 --- /dev/null +++ b/src/nfa/ppc64el/shufti.hpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. 
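+ *
+ * In outline, a byte c belongs to the class when the two nibble-table
+ * lookups agree. A scalar reference for what blockSingleMask() computes in
+ * each lane (not the exact VSX sequence used below):
+ *
+ *   match(c) = (mask_lo[c & 0xf] & mask_hi[(c >> 4) & 0xf]) != 0
+ *
+ * blockDoubleMask() combines two such table pairs, offsetting the second
+ * result by one byte, to accelerate two-byte (double-shufti) classes.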
+ * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars) { + const SuperVector low4bits = SuperVector::dup_u8(0xf); + + SuperVector c_lo = chars & low4bits; + SuperVector c_hi = chars.template vshr_8_imm<4>(); + c_lo = mask_lo.template pshufb(c_lo); + c_hi = mask_hi.template pshufb(c_hi); + + return (c_lo & c_hi) > (SuperVector::Zeroes()); +} + +template +static really_inline +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { + + const SuperVector low4bits = SuperVector::dup_u8(0xf); + SuperVector chars_lo = chars & low4bits; + chars_lo.print8("chars_lo"); + SuperVector chars_hi = chars.template vshr_64_imm<4>() & low4bits; + chars_hi.print8("chars_hi"); + SuperVector c1_lo = mask1_lo.template pshufb(chars_lo); + c1_lo.print8("c1_lo"); + SuperVector c1_hi = mask1_hi.template pshufb(chars_hi); + c1_hi.print8("c1_hi"); + SuperVector t1 = c1_lo | c1_hi; + t1.print8("t1"); + + SuperVector c2_lo = mask2_lo.template pshufb(chars_lo); + c2_lo.print8("c2_lo"); + SuperVector c2_hi = mask2_hi.template pshufb(chars_hi); + c2_hi.print8("c2_hi"); + SuperVector t2 = c2_lo | c2_hi; + t2.print8("t2"); + t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)"); + SuperVector t = t1 | (t2.template vshr_128_imm<1>()); + t.print8("t"); + + return !t.eq(SuperVector::Ones()); +} diff --git a/src/nfa/ppc64el/truffle.hpp b/src/nfa/ppc64el/truffle.hpp new file mode 100644 index 00000000..92333261 --- /dev/null +++ b/src/nfa/ppc64el/truffle.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. 
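+ *
+ * In outline, a scalar reference for what blockSingleMask() computes in
+ * each lane (not the exact VSX sequence used below): a byte c matches when
+ *
+ *   entry = (c & 0x80) ? shuf_mask_lo_highset[c & 0xf]
+ *                      : shuf_mask_lo_highclear[c & 0xf];
+ *   match = (entry >> ((c >> 4) & 0x7)) & 1;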
+ * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { + + chars.print8("chars"); + shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); + shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); + + SuperVector highconst = SuperVector::dup_u8(0x80); + highconst.print8("highconst"); + SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); + shuf_mask_hi.print8("shuf_mask_hi"); + + SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(chars); + shuf1.print8("shuf1"); + SuperVector t1 = chars ^ highconst; + t1.print8("t1"); + SuperVector shuf2 = shuf_mask_lo_highset.pshufb(t1); + shuf2.print8("shuf2"); + SuperVector t2 = highconst.opandnot(chars.template vshr_64_imm<4>()); + t2.print8("t2"); + SuperVector shuf3 = shuf_mask_hi.pshufb(t2); + shuf3.print8("shuf3"); + SuperVector res = (shuf1 | shuf2) & shuf3; + res.print8("(shuf1 | shuf2) & shuf3"); + + return !res.eq(SuperVector::Zeroes()); +} From b1f53f8e493d87551e9eb2a3fa70df7917dc7478 Mon Sep 17 00:00:00 2001 From: apostolos Date: Thu, 14 Oct 2021 16:26:59 +0300 Subject: [PATCH 11/37] match file for ARCH_PPC64EL added --- src/util/arch/ppc64el/match.hpp | 64 ++++++++++++++++++++++++++++++ src/util/arch/ppc64el/simd_utils.h | 26 ++++++++---- src/util/match.hpp | 2 + 3 files changed, 84 insertions(+), 8 deletions(-) create mode 100644 src/util/arch/ppc64el/match.hpp diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp new file mode 100644 index 00000000..3cb3d667 --- /dev/null +++ b/src/util/arch/ppc64el/match.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +template <> +really_really_inline +const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> v) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *lastMatch<16>(const u8 *buf, SuperVector<16> v) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z != 0xffff)) { + u32 pos = clz32(~z & 0xffff); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + + diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 89f381d5..e8f626cb 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -53,6 +53,24 @@ typedef __vector uint8_t uint8x16_t; typedef __vector int8_t int8x16_t; +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; + static really_inline m128 ones128(void) { return (m128) vec_splat_s8(-1); } @@ -425,14 +443,6 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { } } -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} -#endif - static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); diff --git a/src/util/match.hpp b/src/util/match.hpp index 9331d1f8..e3dd2e02 100644 --- a/src/util/match.hpp +++ b/src/util/match.hpp @@ -47,6 +47,8 @@ const u8 *lastMatch(const u8 *buf, SuperVector v); #include "util/arch/x86/match.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/match.hpp" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/match.hpp" #endif #endif // MATCH_HPP From e084c2d6e4828a672192e741fd8ac25a9d933754 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Fri, 15 Oct 2021 14:07:17 +0000 Subject: [PATCH 12/37] SuperVector vsh* implementations --- src/util/arch/ppc64el/simd_utils.h | 66 ++--- src/util/supervector/arch/ppc64el/impl.cpp | 296 ++++++++------------- 2 files changed, 137 insertions(+), 225 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index e8f626cb..f4b97ffb 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -235,15 +235,15 @@ static really_inline m128 set1_2x64(u64a c) { } static really_inline u32 movd(const m128 in) { - //return vgetq_lane_u32((uint32x4_t) in, 0); - return !!diff128(in, zeroes128()); - // #warning FIXME + u32 ALIGN_ATTR(16) a[4]; + vec_xst((uint32x4_t) in, 0, a); + return a[0]; } static really_inline u64a movq(const m128 in) { u64a ALIGN_ATTR(16) a[2]; vec_xst((uint64x2_t) in, 0, a); - return a[0]; + return a[0]; } 
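The movd()/movq() helpers above spill the whole vector to an aligned scratch
buffer with vec_xst and read back a single lane; extract32from128() and
extract64from128() below reuse the same pattern with a switch on the lane
index. A minimal standalone sketch of that idea, assuming GCC/clang's
altivec.h with the vec_xst and vec_extract built-ins (not part of the patch):

    #include <altivec.h>
    #include <stdint.h>

    typedef __vector uint32_t u32x4_example;

    /* Read lane i by storing all four 32-bit lanes to aligned scratch. */
    static inline uint32_t lane_u32_via_store(u32x4_example v, unsigned i) {
        uint32_t tmp[4] __attribute__((aligned(16)));
        vec_xst(v, 0, tmp);
        return tmp[i & 3];
    }

    /* The same read via vec_extract; on older compilers the index may
     * need to be a compile-time constant. */
    static inline uint32_t lane_u32_direct(u32x4_example v, unsigned i) {
        return vec_extract(v, i & 3);
    }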
/* another form of movq */ @@ -254,68 +254,41 @@ m128 load_m128_from_u64a(const u64a *p) { static really_inline u32 extract32from128(const m128 in, unsigned imm) { -/* -#if defined(HS_OPTIMIZE) - return vgetq_lane_u32((uint32x4_t) in, imm); -#else - switch (imm) { +u32 ALIGN_ATTR(16) a[4]; +vec_xst((uint32x4_t) in, 0, a); +switch (imm) { case 0: - return vgetq_lane_u32((uint32x4_t) in, 0); - break; + return a[0];break; case 1: - return vgetq_lane_u32((uint32x4_t) in, 1); - break; + return a[1];break; case 2: - return vgetq_lane_u32((uint32x4_t) in, 2); - break; + return a[2];break; case 3: - return vgetq_lane_u32((uint32x4_t) in, 3); - break; + return a[3];break; default: - return 0; - break; + return 0;break; } -#endif -*/ -// #warning FIXME -return vec_any_ne(in,lshift_m128(in,imm)); } -static really_inline u64a extract64from128(const m128 UNUSED in, unsigned UNUSED imm) { -/* is this -#if defined(HS_OPTIMIZE) - return vgetq_lane_u64((uint64x2_t) in, imm); -#else - switch (imm) { +static really_inline u64a extract64from128(const m128 in, unsigned UNUSED imm) { +u64a ALIGN_ATTR(16) a[2]; +vec_xst((uint64x2_t) in, 0, a); +switch (imm) { case 0: - return vgetq_lane_u64((uint32x4_t) in, 0); - break; + return a[0];break; case 1: - return vgetq_lane_u64((uint32x4_t) in, 1); - break; + return a[1];break; default: return 0; break; } -#endif -*/ - /* - u64a ALIGN_ATTR(16) a[2]; - vec_xst((uint64x2_t) in, 0, a); - switch(imm) { - case 0: return a[0]; break; - case 1: return a[1]; break; - default: return 0; break; - } - */ -return 0; - } static really_inline m128 low64from128(const m128 in) { //u64a ALIGN_ATTR(16) a[2]; //vec_xst((uint64x2_t) in, 0, a); //return a[1]; + // #warning FIXME return vec_add(in, in); } @@ -323,6 +296,7 @@ static really_inline m128 high64from128(const m128 in) { //u64a ALIGN_ATTR(16) a[2]; //vec_xst((uint64x2_t) in, 0, a); //return a[0]; + // #warning FIXME return vec_add(in, in); } diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 89fe89c6..8628c662 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -167,8 +167,6 @@ really_inline SuperVector<16> SuperVector<16>::operator!() const template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { - //m128 and_res = vec_and(u.v128[0], b.u.v128[0]); - //return vec_xor(and_res,and_res); return vec_xor(vec_and(u.v128[0], b.u.v128[0]), vec_and(u.v128[0], b.u.v128[0])); } @@ -186,35 +184,31 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const } template <> -really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const UNUSED &b) const -{ - //return {vcgtq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; - // #warning FIXME - return Zeroes(); +really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const +{ + int32x4_t v = {u.s32[0] > b.u.s32[0], u.s32[1] > b.u.s32[1], u.s32[2] > b.u.s32[2], u.s32[3] > b.u.s32[3]}; + return (m128) v; } template <> -really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const UNUSED &b) const +really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const { - //return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; - // #warning FIXME - return Zeroes(); + int32x4_t v = {u.s32[0] >= b.u.s32[0], u.s32[1] >= b.u.s32[1], u.s32[2] >= b.u.s32[2], u.s32[3] >= b.u.s32[3]}; + return (m128) v; } template <> -really_inline 
SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const UNUSED &b) const +really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const { - //return {vcltq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; - // #warning FIXME - return Zeroes(); + int32x4_t v = {u.s32[0] < b.u.s32[0], u.s32[1] < b.u.s32[1], u.s32[2] < b.u.s32[2], u.s32[3] < b.u.s32[3]}; + return (m128) v; } template <> -really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const UNUSED &b) const +really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const { - //return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; - // #warning FIXME - return Zeroes(); + int32x4_t v = {u.s32[0] <= b.u.s32[0], u.s32[1] <= b.u.s32[1], u.s32[2] <= b.u.s32[2], u.s32[3] <= b.u.s32[3]}; + return (m128) v; } @@ -222,7 +216,6 @@ template <> really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const { return (*this == b); - //return {(m128) vec_cmpeq((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])}; } template <> @@ -259,99 +252,88 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const { + return { (m128) vec_sl((int8x16_t)u.v128[0], vec_splats((uint8_t)N)) }; //return {(m128)vshlq_n_s8(u.v128[0], N)}; - // #warning FIXME - return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const { + return { (m128) vec_sl((int16x8_t)u.v128[0], vec_splats((uint16_t)N)) }; //return {(m128)vshlq_n_s16(u.v128[0], N)}; - // #warning FIXME - return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const { + return { (m128) vec_sl((int32x4_t)u.v128[0], vec_splats((uint32_t)N)) }; //return {(m128)vshlq_n_s32(u.v128[0], N)}; - // #warning FIXME - return Zeroes(); + } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { + return { (m128) vec_sl((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; //return {(m128)vshlq_n_s64(u.v128[0], N)}; - // #warning FIXME - return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const { + return { (m128) vec_sld((int8x16_t)u.v128[0], (int8x16_t)vec_splat_s8(0), N)}; //return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; - // #warning FIXME - return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_imm() const { - //return vshl_128_imm(); - // #warning FIXME - return Zeroes(); + return vshl_128_imm(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const { + return { (m128) vec_sr((int8x16_t)u.v128[0], vec_splats((uint8_t)N)) }; //return {(m128)vshrq_n_s8(u.v128[0], N)}; - // #warning FIXME - return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const { + return { (m128) vec_sr((int16x8_t)u.v128[0], vec_splats((uint16_t)N)) }; //return {(m128)vshrq_n_s16(u.v128[0], N)}; - // #warning FIXME - return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const { + return { (m128) vec_sr((int32x4_t)u.v128[0], vec_splats((uint32_t)N)) }; //return {(m128)vshrq_n_s32(u.v128[0], N)}; - // #warning FIXME - return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const -{ +{ + return { (m128) vec_sr((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; //return {(m128)vshrq_n_s64(u.v128[0], N)}; - // #warning FIXME - 
return Zeroes(); } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const -{ +{ + return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), (int8x16_t)u.v128[0], 16 - N) }; //return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; - // #warning FIXME - return Zeroes(); } template <> @@ -378,63 +360,56 @@ template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; #endif template <> -really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int8x16_t)u.v128[0], vec_splats((uint8_t)n))}; }); //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s8(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + return result; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int16x8_t)u.v128[0], vec_splats((uint16_t)n))}; }); //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s16(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + return result; } template <> -really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int32x4_t)u.v128[0], vec_splats((uint32_t)n))}; }); //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s32(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + return result; } template <> -really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s64(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int64x2_t)u.v128[0], vec_splats((uint64_t)n))}; }); + return result; } template <> -really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) 
return Zeroes(); - //SuperVector result; - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)u.v128[0], (int8x16_t)vec_splat_s8(0), n)}; }); + return result; } template <> @@ -444,63 +419,56 @@ really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int8x16_t)u.v128[0], vec_splats((uint8_t)n))}; }); //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s8(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + return result; } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int16x8_t)u.v128[0], vec_splats((uint16_t)n))}; }); //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s16(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + return result; } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int32x4_t)u.v128[0], vec_splats((uint32_t)n))}; }); //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s32(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + return result; } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s64(u.v128[0], n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N 
== n) result = {(m128) vec_sr((int64x2_t)u.v128[0], vec_splats((uint64_t)n))}; }); + return result; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const { - //if (N == 0) return *this; - //if (N == 16) return Zeroes(); - //SuperVector result; - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), n)}; }); - //return result; - // #warning FIXME - return Zeroes(); + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), (int8x16_t)u.v128[0], 16 - n)}; }); + return result; } template <> @@ -513,21 +481,21 @@ template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { switch(N) { - case 1: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 15)}; break; - case 2: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 14)}; break; - case 3: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 13)}; break; - case 4: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 12)}; break; - case 5: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 11)}; break; - case 6: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 10)}; break; - case 7: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 9)}; break; - case 8: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 8)}; break; - case 9: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 7)}; break; - case 10: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 6)}; break; - case 11: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 5)}; break; - case 12: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 4)}; break; - case 13: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 3)}; break; - case 14: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 2)}; break; - case 15: return {(m128) vec_sld((int16x8_t) vec_splat_s8(0), (int16x8_t) u.v128[0], 1)}; break; + case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 15)}; break; + case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 14)}; break; + case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 13)}; break; + case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 12)}; break; + case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 11)}; break; + case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 10)}; break; + case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 9)}; break; + case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 8)}; break; + case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 7)}; break; + case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 6)}; break; + case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 5)}; break; + case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), 
(int8x16_t) u.v128[0], 4)}; break; + case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 3)}; break; + case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 2)}; break; + case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 1)}; break; case 16: return Zeroes(); break; default: break; } @@ -538,21 +506,21 @@ template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { switch(N) { - case 1: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 1)}; break; - case 2: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 2)}; break; - case 3: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 3)}; break; - case 4: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 4)}; break; - case 5: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 5)}; break; - case 6: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 6)}; break; - case 7: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 7)}; break; - case 8: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 8)}; break; - case 9: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 9)}; break; - case 10: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 10)}; break; - case 11: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 11)}; break; - case 12: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 12)}; break; - case 13: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 13)}; break; - case 14: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 14)}; break; - case 15: return {(m128) vec_sld((int16x8_t) u.v128[0], (int16x8_t) vec_splat_s8(0), 15)}; break; + case 1: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 1)}; break; + case 2: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 2)}; break; + case 3: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 3)}; break; + case 4: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 4)}; break; + case 5: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 5)}; break; + case 6: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 6)}; break; + case 7: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 7)}; break; + case 8: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 8)}; break; + case 9: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 9)}; break; + case 10: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 10)}; break; + case 11: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 11)}; break; + case 12: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 12)}; break; + case 13: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 13)}; break; + case 14: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 14)}; break; + case 15: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 15)}; break; case 16: return Zeroes(); break; default: break; } @@ -587,7 +555,7 @@ really_inline SuperVector<16> 
SuperVector<16>::load(void const *ptr) template <> really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { - SuperVector<16> mask = Ones().rshift128_var(16 -len); + SuperVector<16> mask = Ones_vshr(16 -len); mask.print8("mask"); SuperVector<16> v = loadu(ptr); v.print8("v"); @@ -642,38 +610,8 @@ really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) template<> really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) { - SuperVector<16> mask = Ones().rshift128_var(16 -len); + SuperVector<16> mask = Ones_vshr(16 -len); return mask & pshufb(b); } - - -/* -template<> -really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) -{ - uint64x2_t shift_indices = vec_splats((uint64_t)N); - return (m128) vec_sl((int64x2_t)u.v128[0] , shift_indices); -} - - -template<> -really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) -{ - uint64x2_t shift_indices = vec_splats((uint64_t)N); - return (m128) vec_sr((int64x2_t)u.v128[0] , shift_indices); -} - -template<> -really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N) -{ - return *this << N; -} - -template<> -really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N) -{ - return *this >> N; -} -*/ #endif From 558313a2c2d35e7fc61b2aa856085ddc4eaffcee Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Mon, 18 Oct 2021 12:26:38 +0000 Subject: [PATCH 13/37] SuperVector operators fixes and simd_utils low/high64 functions implementations added --- src/util/arch/common/simd_utils.h | 16 ++++++------- src/util/arch/ppc64el/simd_utils.h | 14 +++-------- src/util/supervector/arch/ppc64el/impl.cpp | 27 +++++++++++++++------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 65e7b69a..5bf846f9 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -49,8 +49,8 @@ static inline void print_m128_16x8(const char *label, m128 vector) { uint8_t ALIGN_ATTR(16) data[16]; store128(data, vector); - DEBUG_PRINTF("%s: ", label); - for(int i=0; i < 16; i++) + DEBUG_PRINTF("%12s: ", label); + for(int i=15; i >=0; i--) printf("%02x ", data[i]); printf("\n"); } @@ -58,8 +58,8 @@ static inline void print_m128_16x8(const char *label, m128 vector) { static inline void print_m128_8x16(const char *label, m128 vector) { uint16_t ALIGN_ATTR(16) data[8]; store128(data, vector); - DEBUG_PRINTF("%s: ", label); - for(int i=0; i < 8; i++) + DEBUG_PRINTF("%12s: ", label); + for(int i=7; i >= 0; i--) printf("%04x ", data[i]); printf("\n"); } @@ -67,8 +67,8 @@ static inline void print_m128_8x16(const char *label, m128 vector) { static inline void print_m128_4x32(const char *label, m128 vector) { uint32_t ALIGN_ATTR(16) data[4]; store128(data, vector); - DEBUG_PRINTF("%s: ", label); - for(int i=0; i < 4; i++) + DEBUG_PRINTF("%12s: ", label); + for(int i=3; i >= 0; i--) printf("%08x ", data[i]); printf("\n"); } @@ -76,8 +76,8 @@ static inline void print_m128_4x32(const char *label, m128 vector) { static inline void print_m128_2x64(const char *label, m128 vector) { uint64_t ALIGN_ATTR(16) data[2]; store128(data, vector); - DEBUG_PRINTF("%s: ", label); - for(int i=0; i < 2; i++) + DEBUG_PRINTF("%12s: ", label); + for(int i=1; i >= 0; i--) printf("%016lx ", data[i]); printf("\n"); } diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index f4b97ffb..a54012aa 100644 --- 
a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -270,7 +270,7 @@ switch (imm) { } } -static really_inline u64a extract64from128(const m128 in, unsigned UNUSED imm) { +static really_inline u64a extract64from128(const m128 in, unsigned imm) { u64a ALIGN_ATTR(16) a[2]; vec_xst((uint64x2_t) in, 0, a); switch (imm) { @@ -285,19 +285,11 @@ switch (imm) { } static really_inline m128 low64from128(const m128 in) { - //u64a ALIGN_ATTR(16) a[2]; - //vec_xst((uint64x2_t) in, 0, a); - //return a[1]; - // #warning FIXME - return vec_add(in, in); + return (m128) vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(1)); } static really_inline m128 high64from128(const m128 in) { - //u64a ALIGN_ATTR(16) a[2]; - //vec_xst((uint64x2_t) in, 0, a); - //return a[0]; - // #warning FIXME - return vec_add(in, in); + return (m128) vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(0)); } diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 8628c662..93cc4d63 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -186,29 +186,25 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const template <> really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const { - int32x4_t v = {u.s32[0] > b.u.s32[0], u.s32[1] > b.u.s32[1], u.s32[2] > b.u.s32[2], u.s32[3] > b.u.s32[3]}; - return (m128) v; + return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const { - int32x4_t v = {u.s32[0] >= b.u.s32[0], u.s32[1] >= b.u.s32[1], u.s32[2] >= b.u.s32[2], u.s32[3] >= b.u.s32[3]}; - return (m128) v; + return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const { - int32x4_t v = {u.s32[0] < b.u.s32[0], u.s32[1] < b.u.s32[1], u.s32[2] < b.u.s32[2], u.s32[3] < b.u.s32[3]}; - return (m128) v; + return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const { - int32x4_t v = {u.s32[0] <= b.u.s32[0], u.s32[1] <= b.u.s32[1], u.s32[2] <= b.u.s32[2], u.s32[3] <= b.u.s32[3]}; - return (m128) v; + return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])}; } @@ -222,9 +218,21 @@ template <> really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const { uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); + //printf("s1:"); + //for(int i=15; i>=0; i--) {printf("%02x, ",s1[i]);} + //printf("\n"); uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); + //printf("ss:"); + //for(int i=7; i>=0; i--) {printf("%04x, ",ss[i]);} + //printf("\n"); uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); + //printf("res_and:"); + //for(int i=7; i>=0; i--) {printf("%04x, ",res_and[i]);} + //printf("\n"); uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); + //printf("s2:"); + //for(int i=7; i>=0; i--) {printf("%04x, ",s2[i]);} + //printf("\n"); uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14)); uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); @@ -238,6 +246,9 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); uint64x2_t s5 = 
vec_or((uint64x2_t)ss4, res_and4); + //printf("s5:"); + //for(int i=1; i>=0; i--) {printf("%016llx, ",s5[i]);} + //printf("\n"); return s5[0]; } From 2b1db733261e8cea12d248a32f10b6bafb546b33 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Thu, 21 Oct 2021 13:34:02 +0000 Subject: [PATCH 14/37] WIP: simd & bitutils files finctions fixes --- src/nfa/limex_shuffle.h | 4 +++ src/nfa/vermicelli_sse.h | 14 +++++++++- src/util/arch/ppc64el/bitutils.h | 26 +++++++----------- src/util/arch/ppc64el/simd_utils.h | 44 ++++++++++++++++++++++++++---- unit/internal/shuffle.cpp | 6 ++-- 5 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index 365d4729..b2aa9a0a 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -45,6 +45,10 @@ static really_inline u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { m128 shuffled = pshufb_m128(s, permute); + int8x16_t res = (int8x16_t) pshufb_m128(s, permute); + printf("shufled:"); + for(int i=15; i>=0; i--) {printf("%02x ", res[i]);} + printf("\n"); m128 compared = and128(shuffled, compare); u16 rv = ~movemask128(eq128(compared, shuffled)); return (u32)rv; diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 268e9e08..d985dd94 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -155,6 +155,18 @@ const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, assert((size_t)buf_end % 16 == 0); for (; buf + 15 < buf_end; buf_end -= 16) { m128 data = load128(buf_end - 16); + /* + { + printf("after_load128 data:"); + for (int i=3; i>=0; i--) {printf("%d, ",data[i]);} + printf("\n"); + } + { + m128 res_eq = eq128(chars, data); + printf("dd:"); + for (int i=3; i>=0; i--) { printf("%d, ", res_eq[i]); } + } + */ u32 z = movemask128(eq128(chars, data)); if (negate) { z = ~z & 0xffff; @@ -1281,4 +1293,4 @@ const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, } else { return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); } -} \ No newline at end of file +} diff --git a/src/util/arch/ppc64el/bitutils.h b/src/util/arch/ppc64el/bitutils.h index b23c573e..bcc88f3d 100644 --- a/src/util/arch/ppc64el/bitutils.h +++ b/src/util/arch/ppc64el/bitutils.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -134,22 +135,15 @@ u64a expand64_impl(u64a x, u64a m) { } static really_inline -m128 expand128_impl(m128 x, m128 m) { - m128 one = set1_2x64(1); - m128 bitset = one; - m128 vres = zeroes128(); - while (isnonzero128(m)) { - m128 tv = and128(x, m); - - m128 mm = sub_2x64(zeroes128(), m); - m128 mask = not128(eq64_m128(tv, zeroes128())); - mask = and128(bitset, mask); - mask = and128(mask, mm); - vres = or128(vres, mask); - m = and128(m, sub_2x64(m, one)); - bitset = lshift64_m128(bitset, 1); - } - return vres; +m128 expand128_impl(m128 xvec, m128 mvec) { + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + vec_xst((uint64x2_t)xvec, 0, x); + vec_xst((uint64x2_t)mvec, 0, m); + DEBUG_PRINTF("calling expand64_impl:\n"); + x[0] = expand64_impl(x[0], m[0]); + x[1] = expand64_impl(x[1], m[1]); + return load128(x); } /* returns the first set bit after begin (if not ~0U). 
If no bit is set after diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index a54012aa..d962163e 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -72,7 +72,7 @@ ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { }; static really_inline m128 ones128(void) { - return (m128) vec_splat_s8(-1); + return (m128) vec_splat_u8(-1); } static really_inline m128 zeroes128(void) { @@ -202,23 +202,43 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { static really_inline u32 movemask128(m128 a) { + //printf("input vector:"); + //for (int i=3; i>=0; i--) {printf("%04x, ", a[i]);} + //printf("\n"); uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7)); + //printf("s1:"); + //for (int i=15; i>=0; i--) {printf("%02x, ", s1[i]);} + //printf("\n"); uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); + //printf("s2:"); + //for (int i=7; i>=0; i--) {printf("%04x, ", s2[i]);} + //printf("\n"); uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14)); uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); + //printf("s3:"); + //for (int i=3; i>=0; i--) {printf("%08x, ", s3[i]);} + //printf("\n"); uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); + //printf("s4:"); + //for (int i=1; i>=0; i--) {printf("%016llx, ", s4[i]);} + //printf("\n"); uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)ss4, vec_splats((uint64_t)0xff)); + uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); + //printf("s5:"); + //for (int i=1; i>=0; i--) {printf("%016llx, ", s5[i]);} + //printf("\n"); + + //printf("%lld and %lld\n", s5[0],s5[1]); return s5[0]; } @@ -285,6 +305,10 @@ switch (imm) { } static really_inline m128 low64from128(const m128 in) { + //int64x2_t v = vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(1)); + //printf("v:"); + //for (int i=1; i>=0; i++) {printf("%016llx",v[i]);} + //printf("\n"); return (m128) vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(1)); } @@ -316,11 +340,11 @@ static really_inline m128 andnot128(m128 a, m128 b) { // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); - return (m128) vec_xl(0, (const int32_t*)ptr); + return (m128) vec_xl(0, (const int64_t*)ptr); } // aligned store -static really_inline void store128(void *ptr, m128 a) { +static really_inline void store128(void *ptr, m128 a) { assert(ISALIGNED_N(ptr, alignof(m128))); vec_st(a, 0, (int32_t*)ptr); } @@ -332,7 +356,7 @@ static really_inline m128 loadu128(const void *ptr) { // unaligned store static really_inline void storeu128(void *ptr, m128 a) { - vec_st(a, 0, (int32_t*)ptr); + vec_xst(a, 0, (int32_t*)ptr); } // packed unaligned store of first N bytes @@ -438,7 +462,15 @@ char testbit128(m128 val, unsigned int n) { static really_inline m128 pshufb_m128(m128 a, m128 b) { - return (m128) vec_permxor((int8x16_t)vec_splat_s8(0), (int8x16_t)a, (int8x16_t)b); + return (m128) vec_perm((uint8x16_t)a, (uint8x16_t)a, (uint8x16_t)b); + //return (m128) 
vec_perm((int8x16_t)vec_splat_s8(0), (int8x16_t)a, (uint8x16_t)b);; + //uint8x16_t btransparent = vec_and((uint8x16_t)b, (uint8x16_t)vec_splats(0x8f)); + //return (m128) vec_perm(a, a, btransparent); + //return (m128) vec_perm((int8x16_t)vec_splat_s8(0), (int8x16_t)b, (uint8x16_t)a); + + //return (m128) vec_perm((int8x16_t)a, (int8x16_t)b, (uint8x16_t)vec_splat_s8(0)); + //return (m128) vec_perm((int8x16_t)b, (int8x16_t)a, (uint8x16_t)vec_splat_s8(0)); + } static really_inline diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index d74509d6..129e63c9 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -183,11 +183,11 @@ void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) { TEST(Shuffle, PackedExtract128_1) { // Try all possible one-bit masks - for (unsigned int i = 0; i < 128; i++) { + for (unsigned int i = 0; i < 1; i++) { // shuffle a single 1 bit to the front m128 permute, compare; build_pshufb_masks_onebit(i, &permute, &compare); - EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); // we should get zero out of these cases EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); @@ -199,6 +199,7 @@ TEST(Shuffle, PackedExtract128_1) { } } +/* TEST(Shuffle, PackedExtract_templatized_128_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 128; i++) { @@ -217,6 +218,7 @@ TEST(Shuffle, PackedExtract_templatized_128_1) { } } } +*/ #if defined(HAVE_AVX2) From 7184ce9870c5fef0a084dcb687cfa5ca2755f74c Mon Sep 17 00:00:00 2001 From: apostolos Date: Fri, 22 Oct 2021 09:46:04 +0300 Subject: [PATCH 15/37] expand128 implementation was changed to be like arm's --- src/util/arch/ppc64el/bitutils.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/util/arch/ppc64el/bitutils.h b/src/util/arch/ppc64el/bitutils.h index bcc88f3d..fbe016f2 100644 --- a/src/util/arch/ppc64el/bitutils.h +++ b/src/util/arch/ppc64el/bitutils.h @@ -136,14 +136,20 @@ u64a expand64_impl(u64a x, u64a m) { static really_inline m128 expand128_impl(m128 xvec, m128 mvec) { - u64a ALIGN_ATTR(16) x[2]; - u64a ALIGN_ATTR(16) m[2]; - vec_xst((uint64x2_t)xvec, 0, x); - vec_xst((uint64x2_t)mvec, 0, m); - DEBUG_PRINTF("calling expand64_impl:\n"); - x[0] = expand64_impl(x[0], m[0]); - x[1] = expand64_impl(x[1], m[1]); - return load128(x); + m128 one = set1_2x64(1); + m128 bb = one; + m128 res = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 xm = and128(x, m); + xm = and128(xm, mm); + + m128 mask = not128(eq64_m128(xm, zeroes128())); + res = or128(res, and128(bb, mask)); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); + } + return res; } /* returns the first set bit after begin (if not ~0U). 
If no bit is set after From 5abda15c268d0129f02fcbb3f071243d8f31d419 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Fri, 22 Oct 2021 07:05:55 +0000 Subject: [PATCH 16/37] expand128 bugs fixed --- src/util/arch/ppc64el/bitutils.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/util/arch/ppc64el/bitutils.h b/src/util/arch/ppc64el/bitutils.h index fbe016f2..10c4869b 100644 --- a/src/util/arch/ppc64el/bitutils.h +++ b/src/util/arch/ppc64el/bitutils.h @@ -135,17 +135,16 @@ u64a expand64_impl(u64a x, u64a m) { } static really_inline -m128 expand128_impl(m128 xvec, m128 mvec) { +m128 expand128_impl(m128 x, m128 m) { m128 one = set1_2x64(1); m128 bb = one; m128 res = zeroes128(); while (isnonzero128(m)) { + m128 xm = and128(x, bb); m128 mm = sub_2x64(zeroes128(), m); - m128 xm = and128(x, m); - xm = and128(xm, mm); - m128 mask = not128(eq64_m128(xm, zeroes128())); - res = or128(res, and128(bb, mask)); + mask = and128(mask, and128(m,mm)); + res = or128(res, mask); m = and128(m, sub_2x64(m, one)); bb = lshift64_m128(bb, 1); } From b53b0a0fcd1a1cb38dcb57f870dda6b18a9b04d3 Mon Sep 17 00:00:00 2001 From: apostolos Date: Fri, 22 Oct 2021 11:17:43 +0300 Subject: [PATCH 17/37] test for movemask and shuffle cases added --- src/nfa/limex_shuffle.h | 8 +++---- unit/internal/simd_utils.cpp | 43 ++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index b2aa9a0a..413eece7 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -45,10 +45,10 @@ static really_inline u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { m128 shuffled = pshufb_m128(s, permute); - int8x16_t res = (int8x16_t) pshufb_m128(s, permute); - printf("shufled:"); - for(int i=15; i>=0; i--) {printf("%02x ", res[i]);} - printf("\n"); + //int8x16_t res = (int8x16_t) pshufb_m128(s, permute); + //printf("shufled:"); + //for(int i=15; i>=0; i--) {printf("%02x ", res[i]);} + //printf("\n"); m128 compared = and128(shuffled, compare); u16 rv = ~movemask128(eq128(compared, shuffled)); return (u32)rv; diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index d66db7e2..26743abe 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -819,4 +819,47 @@ TEST(SimdUtilsTest, sub_u8_m128) { EXPECT_TRUE(!diff128(result, loadu128(expec))); } +TEST(SimdUtilsTest, movemask_128) { + srand (time(NULL)); + u8 vec[16] = {0}; + u8 vec2[16] = {0}; + u16 r = rand() % 100 + 1; + for(int i=0; i<16; i++) { + if (r & (1 << i)) { + vec[i] = 0xff; + } + } + m128 v = loadu128(vec); + u16 mask = movemask128(v); + for(int i=0; i<16; i++) { + if (mask & (1 << i)) { + vec2[i] = 0xff; + } + } + for (int i=0; i<16; i++) { + ASSERT_EQ(vec[i],vec2[i]); + } +} + +TEST(SimdUtilsTest, pshufb_m128) { + srand (time(NULL)); + u8 vec[16]; + for (int i=0; i<16; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[16]; + for (int i=0; i<16; i++) { + vec2[i]=i; + } + m128 v1 = loadu128(vec); + m128 v2 = loadu128(vec2); + m128 vres = pshufb_m128(v1, v2); + u8 res[16]; + store128(res, vres); + for (int i=0; i<16; i++) { + ASSERT_EQ(vec[vec2[i]], res[i]); + } +} + + } // namespace From 24f149f239b5e30d59ae258f620897788ee866a2 Mon Sep 17 00:00:00 2001 From: apostolos Date: Fri, 22 Oct 2021 12:36:07 +0300 Subject: [PATCH 18/37] print functions keyword renamed --- src/util/arch/common/simd_utils.h | 20 ++++++++++---------- unit/internal/shuffle.cpp | 2 +- 2 files changed, 11 insertions(+), 11 
deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 5bf846f9..40a569f7 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -46,25 +46,25 @@ #endif // HAVE_SIMD_128_BITS #ifdef DEBUG -static inline void print_m128_16x8(const char *label, m128 vector) { +static inline void print_m128_16x8(const char *label, m128 vec) { uint8_t ALIGN_ATTR(16) data[16]; - store128(data, vector); + store128(data, vec); DEBUG_PRINTF("%12s: ", label); for(int i=15; i >=0; i--) printf("%02x ", data[i]); printf("\n"); } -static inline void print_m128_8x16(const char *label, m128 vector) { +static inline void print_m128_8x16(const char *label, m128 vec) { uint16_t ALIGN_ATTR(16) data[8]; - store128(data, vector); + store128(data, vec); DEBUG_PRINTF("%12s: ", label); for(int i=7; i >= 0; i--) printf("%04x ", data[i]); printf("\n"); } -static inline void print_m128_4x32(const char *label, m128 vector) { +static inline void print_m128_4x32(const char *label, m128 vec) { uint32_t ALIGN_ATTR(16) data[4]; store128(data, vector); DEBUG_PRINTF("%12s: ", label); @@ -73,7 +73,7 @@ static inline void print_m128_4x32(const char *label, m128 vector) { printf("\n"); } -static inline void print_m128_2x64(const char *label, m128 vector) { +static inline void print_m128_2x64(const char *label, m128 vec) { uint64_t ALIGN_ATTR(16) data[2]; store128(data, vector); DEBUG_PRINTF("%12s: ", label); @@ -82,10 +82,10 @@ static inline void print_m128_2x64(const char *label, m128 vector) { printf("\n"); } #else -#define print_m128_16x8(label, vector) ; -#define print_m128_8x16(label, vector) ; -#define print_m128_4x32(label, vector) ; -#define print_m128_2x64(label, vector) ; +#define print_m128_16x8(label, vec) ; +#define print_m128_8x16(label, vec) ; +#define print_m128_4x32(label, vec) ; +#define print_m128_2x64(label, vec) ; #endif /**** diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index 129e63c9..b7c1b4f5 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -187,7 +187,7 @@ TEST(Shuffle, PackedExtract128_1) { // shuffle a single 1 bit to the front m128 permute, compare; build_pshufb_masks_onebit(i, &permute, &compare); - EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); // we should get zero out of these cases EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); From 57301721f1af939c565eb02ec65960fc5f8b004c Mon Sep 17 00:00:00 2001 From: apostolos Date: Fri, 22 Oct 2021 12:38:16 +0300 Subject: [PATCH 19/37] print functions missing keywords replaced --- src/util/arch/common/simd_utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 40a569f7..17de949a 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -66,7 +66,7 @@ static inline void print_m128_8x16(const char *label, m128 vec) { static inline void print_m128_4x32(const char *label, m128 vec) { uint32_t ALIGN_ATTR(16) data[4]; - store128(data, vector); + store128(data, vec); DEBUG_PRINTF("%12s: ", label); for(int i=3; i >= 0; i--) printf("%08x ", data[i]); @@ -75,7 +75,7 @@ static inline void print_m128_4x32(const char *label, m128 vec) { static inline void print_m128_2x64(const char *label, m128 vec) { uint64_t ALIGN_ATTR(16) data[2]; - store128(data, vector); + store128(data, 
vec); DEBUG_PRINTF("%12s: ", label); for(int i=1; i >= 0; i--) printf("%016lx ", data[i]); From d43d6733b6a014b660362851161bba018b338fcb Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Fri, 22 Oct 2021 11:55:39 +0000 Subject: [PATCH 20/37] SuperVector shuffle implementation and test function optimized --- src/nfa/limex_shuffle.h | 5 +---- src/util/arch/ppc64el/simd_utils.h | 8 -------- src/util/supervector/arch/ppc64el/impl.cpp | 2 +- unit/internal/simd_utils.cpp | 6 +++--- unit/internal/supervector.cpp | 4 ++-- 5 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index 413eece7..a1728e6a 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -45,10 +45,7 @@ static really_inline u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { m128 shuffled = pshufb_m128(s, permute); - //int8x16_t res = (int8x16_t) pshufb_m128(s, permute); - //printf("shufled:"); - //for(int i=15; i>=0; i--) {printf("%02x ", res[i]);} - //printf("\n"); + print_m128_16x8("shufled", shuffled); m128 compared = and128(shuffled, compare); u16 rv = ~movemask128(eq128(compared, shuffled)); return (u32)rv; diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index d962163e..9e8c59bf 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -463,14 +463,6 @@ char testbit128(m128 val, unsigned int n) { static really_inline m128 pshufb_m128(m128 a, m128 b) { return (m128) vec_perm((uint8x16_t)a, (uint8x16_t)a, (uint8x16_t)b); - //return (m128) vec_perm((int8x16_t)vec_splat_s8(0), (int8x16_t)a, (uint8x16_t)b);; - //uint8x16_t btransparent = vec_and((uint8x16_t)b, (uint8x16_t)vec_splats(0x8f)); - //return (m128) vec_perm(a, a, btransparent); - //return (m128) vec_perm((int8x16_t)vec_splat_s8(0), (int8x16_t)b, (uint8x16_t)a); - - //return (m128) vec_perm((int8x16_t)a, (int8x16_t)b, (uint8x16_t)vec_splat_s8(0)); - //return (m128) vec_perm((int8x16_t)b, (int8x16_t)a, (uint8x16_t)vec_splat_s8(0)); - } static really_inline diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 93cc4d63..dc318c82 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -603,7 +603,7 @@ template<> template<> really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) { - return (m128) vec_permxor((int8x16_t)vec_splat_s8(0), (int8x16_t)u.v128[0], (int8x16_t) b.u.v128[0]); + return (m128) vec_perm((uint8x16_t)u.v128[0], (uint8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0]); } template<> diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 26743abe..2085c9df 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -849,15 +849,15 @@ TEST(SimdUtilsTest, pshufb_m128) { } u8 vec2[16]; for (int i=0; i<16; i++) { - vec2[i]=i; - } + vec2[i]=i + (rand() % 16 + 0); + } m128 v1 = loadu128(vec); m128 v2 = loadu128(vec2); m128 vres = pshufb_m128(v1, v2); u8 res[16]; store128(res, vres); for (int i=0; i<16; i++) { - ASSERT_EQ(vec[vec2[i]], res[i]); + ASSERT_EQ(vec[vec2[i] % 16 ], res[i]); } } diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index 342f8fd4..4be93aa8 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -280,13 +280,13 @@ TEST(SuperVectorUtilsTest,pshufb128c) { } u8 vec2[16]; for (int i=0; i<16; i++) { - vec2[i]=i; + vec2[i]=i + (rand() % 15 + 0); } auto SP1 = 
SuperVector<16>::loadu(vec); auto SP2 = SuperVector<16>::loadu(vec2); auto SResult = SP1.template pshufb(SP2); for (int i=0; i<16; i++) { - ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]); + ASSERT_EQ(vec[vec2[i] % 16 ],SResult.u.u8[i]); } } From 1eb3b19f63f05bad1cb5776bb5ca39b8f192bc23 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Sun, 24 Oct 2021 16:52:12 +0000 Subject: [PATCH 21/37] Shuffle simd and SuperVector implementetions as well as their test realy fixed --- src/nfa/limex_shuffle.h | 1 - src/util/arch/ppc64el/simd_utils.h | 4 +++- src/util/supervector/arch/ppc64el/impl.cpp | 4 +++- unit/internal/shuffle.cpp | 6 +++--- unit/internal/simd_utils.cpp | 17 ++++++++++++----- unit/internal/supervector.cpp | 6 +++++- 6 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index a1728e6a..365d4729 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -45,7 +45,6 @@ static really_inline u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { m128 shuffled = pshufb_m128(s, permute); - print_m128_16x8("shufled", shuffled); m128 compared = and128(shuffled, compare); u16 rv = ~movemask128(eq128(compared, shuffled)); return (u32)rv; diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 9e8c59bf..107ca110 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -462,7 +462,9 @@ char testbit128(m128 val, unsigned int n) { static really_inline m128 pshufb_m128(m128 a, m128 b) { - return (m128) vec_perm((uint8x16_t)a, (uint8x16_t)a, (uint8x16_t)b); + uint8x16_t mask =(uint8x16_t)vec_cmpge((uint8x16_t)b, (uint8x16_t)vec_splats((uint8_t)0x80)); + uint8x16_t res = vec_perm ((uint8x16_t)a, (uint8x16_t)a, (uint8x16_t)b); + return (m128) vec_sel((uint8x16_t)res, (uint8x16_t)zeroes128(), (uint8x16_t)mask); } static really_inline diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index dc318c82..0af136a5 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -603,7 +603,9 @@ template<> template<> really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) { - return (m128) vec_perm((uint8x16_t)u.v128[0], (uint8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0]); + uint8x16_t mask =(uint8x16_t)vec_cmpge((uint8x16_t)b.u.v128[0], (uint8x16_t)vec_splats((uint8_t)0x80)); + uint8x16_t res = vec_perm ((uint8x16_t)u.v128[0], (uint8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0]); + return (m128) vec_sel((uint8x16_t)res, (uint8x16_t)vec_splat_s8(0), (uint8x16_t)mask); } template<> diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index b7c1b4f5..038c6193 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -187,7 +187,7 @@ TEST(Shuffle, PackedExtract128_1) { // shuffle a single 1 bit to the front m128 permute, compare; build_pshufb_masks_onebit(i, &permute, &compare); - EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); // we should get zero out of these cases EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); @@ -199,7 +199,7 @@ TEST(Shuffle, PackedExtract128_1) { } } -/* + TEST(Shuffle, PackedExtract_templatized_128_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 128; i++) { @@ -218,7 +218,7 @@ TEST(Shuffle, PackedExtract_templatized_128_1) { } } } -*/ + 
#if defined(HAVE_AVX2) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 2085c9df..037230d0 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -849,15 +849,22 @@ TEST(SimdUtilsTest, pshufb_m128) { } u8 vec2[16]; for (int i=0; i<16; i++) { - vec2[i]=i + (rand() % 16 + 0); - } + vec2[i]=i + (rand() % 15 + 0); + } + m128 v1 = loadu128(vec); m128 v2 = loadu128(vec2); - m128 vres = pshufb_m128(v1, v2); + m128 vres = pshufb_m128(v1, v2); + u8 res[16]; - store128(res, vres); + storeu128(res, vres); + for (int i=0; i<16; i++) { - ASSERT_EQ(vec[vec2[i] % 16 ], res[i]); + if(vec2[i] & 0x80){ + ASSERT_EQ(res[i], 0); + }else{ + ASSERT_EQ(vec[vec2[i] % 16 ], res[i]); + } } } diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index 4be93aa8..9c5f8f3a 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -286,7 +286,11 @@ TEST(SuperVectorUtilsTest,pshufb128c) { auto SP2 = SuperVector<16>::loadu(vec2); auto SResult = SP1.template pshufb(SP2); for (int i=0; i<16; i++) { - ASSERT_EQ(vec[vec2[i] % 16 ],SResult.u.u8[i]); + if(vec2[i] & 0x80){ + ASSERT_EQ(SResult.u.u8[i], 0); + }else{ + ASSERT_EQ(vec[vec2[i] % 16 ],SResult.u.u8[i]); + } } } From bf54aae7793a4ec2eb4783f4aab8b0d1c2b308aa Mon Sep 17 00:00:00 2001 From: apostolos Date: Tue, 26 Oct 2021 11:48:33 +0300 Subject: [PATCH 22/37] Special case for Shuffle test added as well as comments for respectives implementations --- src/util/arch/ppc64el/simd_utils.h | 3 ++ src/util/supervector/arch/ppc64el/impl.cpp | 3 ++ unit/internal/simd_utils.cpp | 45 ++++++++++++++++++++-- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 107ca110..6e93651e 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -462,6 +462,9 @@ char testbit128(m128 val, unsigned int n) { static really_inline m128 pshufb_m128(m128 a, m128 b) { + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + below is the version that is converted from Intel to PPC. */ uint8x16_t mask =(uint8x16_t)vec_cmpge((uint8x16_t)b, (uint8x16_t)vec_splats((uint8_t)0x80)); uint8x16_t res = vec_perm ((uint8x16_t)a, (uint8x16_t)a, (uint8x16_t)b); return (m128) vec_sel((uint8x16_t)res, (uint8x16_t)zeroes128(), (uint8x16_t)mask); diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 0af136a5..ce975cec 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -603,6 +603,9 @@ template<> template<> really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) { + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + below is the version that is converted from Intel to PPC. 
*/ uint8x16_t mask =(uint8x16_t)vec_cmpge((uint8x16_t)b.u.v128[0], (uint8x16_t)vec_splats((uint8_t)0x80)); uint8x16_t res = vec_perm ((uint8x16_t)u.v128[0], (uint8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0]); return (m128) vec_sel((uint8x16_t)res, (uint8x16_t)vec_splat_s8(0), (uint8x16_t)mask); diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 037230d0..1fc6224b 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -845,26 +845,63 @@ TEST(SimdUtilsTest, pshufb_m128) { srand (time(NULL)); u8 vec[16]; for (int i=0; i<16; i++) { - vec[i] = rand() % 100 + 1; + vec[i] = rand() % 1000 + 1; } u8 vec2[16]; for (int i=0; i<16; i++) { - vec2[i]=i + (rand() % 15 + 0); + vec2[i]=i + (rand() % 100 + 0); } + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + Thus bellow we have to check thah case to NEON or PPC. */ + + /*Insure that vec2 has at least 1 or more 0x80*/ + u8 vec3[16] = {0}; + vec3[15] = 0x80; + + for (int i=0; i<15; i++) { + int l = rand() % 1000 + 0; + if (l % 16 ==0){ + vec3[i]= 0x80; + } else{ + vec3[i]= vec2[i]; + } + } + /* + printf("vec3: "); + for(int i=15; i>=0; i--) { printf("%02x, ", vec3[i]); } + printf("\n"); + */ + + /*Test Special Case*/ m128 v1 = loadu128(vec); - m128 v2 = loadu128(vec2); + m128 v2 = loadu128(vec3); m128 vres = pshufb_m128(v1, v2); u8 res[16]; storeu128(res, vres); + for (int i=0; i<16; i++) { + if(vec3[i] & 0x80){ + ASSERT_EQ(res[i], 0); + }else{ + ASSERT_EQ(vec[vec3[i] % 16 ], res[i]); + } + } + + /*Test Other Cases*/ + v1 = loadu128(vec); + v2 = loadu128(vec2); + vres = pshufb_m128(v1, v2); + storeu128(res, vres); + for (int i=0; i<16; i++) { if(vec2[i] & 0x80){ ASSERT_EQ(res[i], 0); }else{ ASSERT_EQ(vec[vec2[i] % 16 ], res[i]); - } + } } } From 3f17750a27f1ea12fc9d970504158161a7dd2cda Mon Sep 17 00:00:00 2001 From: apostolos Date: Tue, 26 Oct 2021 11:55:02 +0300 Subject: [PATCH 23/37] nits --- unit/internal/simd_utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 1fc6224b..1f16adcd 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -854,9 +854,9 @@ TEST(SimdUtilsTest, pshufb_m128) { /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. - Thus bellow we have to check thah case to NEON or PPC. */ + Thus bellow we have to check that case to NEON or PPC. 
*/ - /*Insure that vec2 has at least 1 or more 0x80*/ + /*Insure that vec3 has at least 1 or more 0x80 elements*/ u8 vec3[16] = {0}; vec3[15] = 0x80; From d9d39d48c5a36c65201d10d494a4707a74146c77 Mon Sep 17 00:00:00 2001 From: apostolos Date: Mon, 1 Nov 2021 10:05:25 +0200 Subject: [PATCH 24/37] prints commants and formating fixes --- src/nfa/ppc64el/truffle.hpp | 2 +- src/nfa/truffle_simd.hpp | 1 - src/util/arch/ppc64el/simd_utils.h | 37 ++++---------------- src/util/supervector/arch/ppc64el/impl.cpp | 39 +++------------------- unit/internal/shuffle.cpp | 2 +- unit/internal/simd_utils.cpp | 12 +++---- 6 files changed, 19 insertions(+), 74 deletions(-) diff --git a/src/nfa/ppc64el/truffle.hpp b/src/nfa/ppc64el/truffle.hpp index 92333261..7dc711f4 100644 --- a/src/nfa/ppc64el/truffle.hpp +++ b/src/nfa/ppc64el/truffle.hpp @@ -58,5 +58,5 @@ const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, Supe SuperVector res = (shuf1 | shuf2) & shuf3; res.print8("(shuf1 | shuf2) & shuf3"); - return !res.eq(SuperVector::Zeroes()); + return res.eq(SuperVector::Zeroes()); } diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index b3a82266..51b9ee68 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -57,7 +57,6 @@ template static really_inline const u8 *fwdBlock(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars, const u8 *buf) { SuperVector res = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - return firstMatch(buf, res); } diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 6e93651e..d27832d4 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -202,43 +202,24 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { static really_inline u32 movemask128(m128 a) { - //printf("input vector:"); - //for (int i=3; i>=0; i--) {printf("%04x, ", a[i]);} - //printf("\n"); uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7)); - //printf("s1:"); - //for (int i=15; i>=0; i--) {printf("%02x, ", s1[i]);} - //printf("\n"); + uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - //printf("s2:"); - //for (int i=7; i>=0; i--) {printf("%04x, ", s2[i]);} - //printf("\n"); - + uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14)); uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - //printf("s3:"); - //for (int i=3; i>=0; i--) {printf("%08x, ", s3[i]);} - //printf("\n"); - + uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); - //printf("s4:"); - //for (int i=1; i>=0; i--) {printf("%016llx, ", s4[i]);} - //printf("\n"); - + uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - //printf("s5:"); - //for (int i=1; i>=0; i--) {printf("%016llx, ", s5[i]);} - //printf("\n"); - - - //printf("%lld and %lld\n", s5[0],s5[1]); + return s5[0]; } @@ -305,10 +286,6 @@ switch (imm) { } static really_inline m128 low64from128(const m128 in) { - //int64x2_t v = vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(1)); - //printf("v:"); - //for 
(int i=1; i>=0; i++) {printf("%016llx",v[i]);} - //printf("\n"); return (m128) vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(1)); } @@ -340,7 +317,7 @@ static really_inline m128 andnot128(m128 a, m128 b) { // aligned load static really_inline m128 load128(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m128))); - return (m128) vec_xl(0, (const int64_t*)ptr); + return (m128) vec_xl(0, (const int32_t*)ptr); } // aligned store @@ -351,7 +328,7 @@ static really_inline void store128(void *ptr, m128 a) { // unaligned load static really_inline m128 loadu128(const void *ptr) { - return (m128) vec_xl(0, (const int64_t*)ptr); + return (m128) vec_xl(0, (const int32_t*)ptr); } // unaligned store diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index ce975cec..acdb89d4 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -218,22 +218,11 @@ template <> really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const { uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); - //printf("s1:"); - //for(int i=15; i>=0; i--) {printf("%02x, ",s1[i]);} - //printf("\n"); + uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); - //printf("ss:"); - //for(int i=7; i>=0; i--) {printf("%04x, ",ss[i]);} - //printf("\n"); uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); - //printf("res_and:"); - //for(int i=7; i>=0; i--) {printf("%04x, ",res_and[i]);} - //printf("\n"); uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - //printf("s2:"); - //for(int i=7; i>=0; i--) {printf("%04x, ",s2[i]);} - //printf("\n"); - + uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14)); uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); @@ -246,9 +235,6 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - //printf("s5:"); - //for(int i=1; i>=0; i--) {printf("%016llx, ",s5[i]);} - //printf("\n"); return s5[0]; } @@ -264,7 +250,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const { return { (m128) vec_sl((int8x16_t)u.v128[0], vec_splats((uint8_t)N)) }; - //return {(m128)vshlq_n_s8(u.v128[0], N)}; } template <> @@ -272,7 +257,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const { return { (m128) vec_sl((int16x8_t)u.v128[0], vec_splats((uint16_t)N)) }; - //return {(m128)vshlq_n_s16(u.v128[0], N)}; } template <> @@ -280,8 +264,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const { return { (m128) vec_sl((int32x4_t)u.v128[0], vec_splats((uint32_t)N)) }; - //return {(m128)vshlq_n_s32(u.v128[0], N)}; - } template <> @@ -289,7 +271,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { return { (m128) vec_sl((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; - //return {(m128)vshlq_n_s64(u.v128[0], N)}; } template <> @@ -297,7 +278,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const { return { (m128) vec_sld((int8x16_t)u.v128[0], (int8x16_t)vec_splat_s8(0), N)}; - //return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; } template <> @@ -312,7 +292,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const { return { (m128) 
vec_sr((int8x16_t)u.v128[0], vec_splats((uint8_t)N)) }; - //return {(m128)vshrq_n_s8(u.v128[0], N)}; } template <> @@ -320,7 +299,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const { return { (m128) vec_sr((int16x8_t)u.v128[0], vec_splats((uint16_t)N)) }; - //return {(m128)vshrq_n_s16(u.v128[0], N)}; } template <> @@ -328,7 +306,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const { return { (m128) vec_sr((int32x4_t)u.v128[0], vec_splats((uint32_t)N)) }; - //return {(m128)vshrq_n_s32(u.v128[0], N)}; } template <> @@ -336,7 +313,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const { return { (m128) vec_sr((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; - //return {(m128)vshrq_n_s64(u.v128[0], N)}; } template <> @@ -344,7 +320,6 @@ template really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const { return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), (int8x16_t)u.v128[0], 16 - N) }; - //return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; } template <> @@ -377,7 +352,6 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const if (N == 16) return Zeroes(); SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int8x16_t)u.v128[0], vec_splats((uint8_t)n))}; }); - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s8(u.v128[0], n)}; }); return result; } @@ -388,7 +362,6 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) if (N == 16) return Zeroes(); SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int16x8_t)u.v128[0], vec_splats((uint16_t)n))}; }); - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s16(u.v128[0], n)}; }); return result; } @@ -399,7 +372,6 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const if (N == 16) return Zeroes(); SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int32x4_t)u.v128[0], vec_splats((uint32_t)n))}; }); - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s32(u.v128[0], n)}; }); return result; } @@ -436,7 +408,6 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const if (N == 16) return Zeroes(); SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int8x16_t)u.v128[0], vec_splats((uint8_t)n))}; }); - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s8(u.v128[0], n)}; }); return result; } @@ -447,7 +418,6 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const if (N == 16) return Zeroes(); SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int16x8_t)u.v128[0], vec_splats((uint16_t)n))}; }); - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s16(u.v128[0], n)}; }); return result; } @@ -458,7 +428,6 @@ really_inline SuperVector<16> 
SuperVector<16>::vshr_32 (uint8_t const N) const if (N == 16) return Zeroes(); SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int32x4_t)u.v128[0], vec_splats((uint32_t)n))}; }); - //Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s32(u.v128[0], n)}; }); return result; } @@ -616,8 +585,8 @@ template<> really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) { /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. - In NEON, if >=16, then the result is zero, otherwise it is that lane. - btranslated is the version that is converted from Intel to NEON. */ + In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to PPC. */ SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f); return pshufb(btranslated); } diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index 038c6193..f1a03d5a 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -187,7 +187,7 @@ TEST(Shuffle, PackedExtract128_1) { // shuffle a single 1 bit to the front m128 permute, compare; build_pshufb_masks_onebit(i, &permute, &compare); - EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); // we should get zero out of these cases EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 1f16adcd..884f2d0a 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -852,11 +852,11 @@ TEST(SimdUtilsTest, pshufb_m128) { vec2[i]=i + (rand() % 100 + 0); } - /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. - In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. - Thus bellow we have to check that case to NEON or PPC. */ + // On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + // In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + // Thus bellow we have to check that case to NEON or PPC. 
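/*
 * A minimal sketch (illustration only, not applied by any patch in this
 * series) of the behaviour the comments above describe, assuming <altivec.h>
 * on a POWER8+ compiler: vec_perm() selects lanes from the 32-byte
 * concatenation of its inputs and does not zero a lane by itself, so the
 * Intel rule (index bit 0x80 set => output lane becomes zero) has to be
 * recreated with a compare-and-select, as pshufb_m128() and
 * SuperVector<16>::pshufb() do in these patches. The helper name below is
 * illustrative only.
 */
static inline vector unsigned char pshufb_sketch(vector unsigned char a,
                                                 vector unsigned char idx) {
    /* mark lanes whose index byte has bit 0x80 set */
    vector bool char zero_lane = vec_cmpge(idx, vec_splats((unsigned char)0x80));
    /* plain permute: always picks a source lane, never zeroes */
    vector unsigned char picked = vec_perm(a, a, idx);
    /* force the marked lanes to zero, matching the x86 semantics */
    return vec_sel(picked, vec_splats((unsigned char)0), zero_lane);
}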
- /*Insure that vec3 has at least 1 or more 0x80 elements*/ + //Insure that vec3 has at least 1 or more 0x80 elements u8 vec3[16] = {0}; vec3[15] = 0x80; @@ -874,7 +874,7 @@ TEST(SimdUtilsTest, pshufb_m128) { printf("\n"); */ - /*Test Special Case*/ + //Test Special Case m128 v1 = loadu128(vec); m128 v2 = loadu128(vec3); m128 vres = pshufb_m128(v1, v2); @@ -890,7 +890,7 @@ TEST(SimdUtilsTest, pshufb_m128) { } } - /*Test Other Cases*/ + //Test Other Cases v1 = loadu128(vec); v2 = loadu128(vec2); vres = pshufb_m128(v1, v2); From ba90cdeb5aba1ecc12b2f31d744969e6a9ca8030 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Fri, 5 Nov 2021 13:34:48 +0000 Subject: [PATCH 25/37] SuperVector constructors as well as andnot implementation fixed --- src/nfa/ppc64el/shufti.hpp | 4 ++-- src/util/arch/ppc64el/match.hpp | 16 ++++++++-------- src/util/supervector/arch/ppc64el/impl.cpp | 15 ++++++++------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/nfa/ppc64el/shufti.hpp b/src/nfa/ppc64el/shufti.hpp index 76461175..dedeb52d 100644 --- a/src/nfa/ppc64el/shufti.hpp +++ b/src/nfa/ppc64el/shufti.hpp @@ -43,7 +43,7 @@ const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask c_lo = mask_lo.template pshufb(c_lo); c_hi = mask_hi.template pshufb(c_hi); - return (c_lo & c_hi) > (SuperVector::Zeroes()); + return (c_lo & c_hi).eq(SuperVector::Zeroes()); } template @@ -72,5 +72,5 @@ SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector t = t1 | (t2.template vshr_128_imm<1>()); t.print8("t"); - return !t.eq(SuperVector::Ones()); + return t.eq(SuperVector::Ones()); } diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index 3cb3d667..3f24ce7f 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -30,10 +30,10 @@ template <> really_really_inline const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> v) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - if (unlikely(z != 0xffff)) { + if (unlikely(vec_any_ne(v.u.v128[0], SuperVector<16>::Ones().u.v128[0]))) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); u32 pos = ctz32(~z & 0xffff); DEBUG_PRINTF("~z %08x\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); @@ -47,10 +47,10 @@ const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> v) { template <> really_really_inline const u8 *lastMatch<16>(const u8 *buf, SuperVector<16> v) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - if (unlikely(z != 0xffff)) { + if (unlikely(vec_any_ne(v.u.v128[0], SuperVector<16>::Ones().u.v128[0]))) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); u32 pos = clz32(~z & 0xffff); DEBUG_PRINTF("~z %08x\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index acdb89d4..20a735b8 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -74,7 +74,7 @@ template<> template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> @@ -88,7 +88,7 @@ template<> template<> really_inline 
SuperVector<16>::SuperVector(uint16_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> @@ -102,7 +102,7 @@ template<> template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> @@ -116,7 +116,7 @@ template<> template<> really_inline SuperVector<16>::SuperVector(uint64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.v128[0] = (m128) vec_splats(static_cast(other)); } // Constants @@ -167,7 +167,8 @@ really_inline SuperVector<16> SuperVector<16>::operator!() const template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { - return vec_xor(vec_and(u.v128[0], b.u.v128[0]), vec_and(u.v128[0], b.u.v128[0])); + m128 not_res = vec_xor(u.v128[0], (m128)vec_splat_s8(-1)); + return {(m128) vec_and(not_res, (m128)b.u.v128[0]) }; } @@ -311,8 +312,8 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const -{ - return { (m128) vec_sr((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; +{ + return { (m128) vec_sr((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; } template <> From 82bea29f4e2581fa60788d396347e2b125eb0845 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Mon, 8 Nov 2021 14:22:58 +0000 Subject: [PATCH 26/37] simd_utils functions fixed --- src/util/arch/ppc64el/simd_utils.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index d27832d4..c47c4585 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -236,9 +236,7 @@ static really_inline m128 set1_2x64(u64a c) { } static really_inline u32 movd(const m128 in) { - u32 ALIGN_ATTR(16) a[4]; - vec_xst((uint32x4_t) in, 0, a); - return a[0]; + return (u32) vec_extract((uint32x4_t)in, 0); } static really_inline u64a movq(const m128 in) { @@ -250,7 +248,8 @@ static really_inline u64a movq(const m128 in) { /* another form of movq */ static really_inline m128 load_m128_from_u64a(const u64a *p) { - return (m128) vec_ld(0, p); + m128 vec =(m128) vec_splats(*p); + return rshift_m128(vec,8); } @@ -286,11 +285,11 @@ switch (imm) { } static really_inline m128 low64from128(const m128 in) { - return (m128) vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(1)); + return rshift_m128(in,8); } static really_inline m128 high64from128(const m128 in) { - return (m128) vec_perm((int64x2_t)in, (int64x2_t)vec_splats((uint64_t)0), (uint8x16_t)vec_splat_u8(0)); + return lshift_m128(in,8); } From 942deb7d802a81a37298420af4b8b46729d69a98 Mon Sep 17 00:00:00 2001 From: apostolos Date: Wed, 10 Nov 2021 09:01:28 +0200 Subject: [PATCH 27/37] test for load m128 from u64a function added --- unit/internal/simd_utils.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 884f2d0a..b1b9bfb1 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -819,6 +819,17 @@ TEST(SimdUtilsTest, sub_u8_m128) { EXPECT_TRUE(!diff128(result, loadu128(expec))); } +TEST(SimdUtilsTest, load_m128_from_u64a) { + srand (time(NULL)); + u64a tmp = rand(); + m128 res = load_m128_from_u64a(&tmp); + m128 cmp = set2x64(0LL, tmp); + 
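    // Editorial note (not part of the patch): set2x64(hi, lo) is assumed to
    // place 'lo' in the low 64-bit lane and 'hi' in the high lane, so cmp is
    // { tmp, 0 } and the check below asserts that load_m128_from_u64a()
    // zero-extends *p into the low half of the 128-bit register.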
//print_m128_16x8("res",res); + //print_m128_16x8("cmp",cmp); + EXPECT_TRUE(!diff128(res, cmp)); +} + + TEST(SimdUtilsTest, movemask_128) { srand (time(NULL)); u8 vec[16] = {0}; From 4114b8a480ea37ed058a17385b9fcd2c4f034421 Mon Sep 17 00:00:00 2001 From: apostolos Date: Wed, 10 Nov 2021 15:12:25 +0200 Subject: [PATCH 28/37] SuperVector opandnot test enriched --- unit/internal/supervector.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index 9c5f8f3a..deb3b169 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -155,10 +155,14 @@ TEST(SuperVectorUtilsTest,OPXOR128c){ TEST(SuperVectorUtilsTest,OPANDNOT128c){ auto SP1 = SuperVector<16>::Zeroes(); auto SP2 = SuperVector<16>::Ones(); + SP1 = SP1.opandnot(SP2); + for (int i=0; i<16; i++) { + ASSERT_EQ(SP1.u.u8[i],0xff); + } SP2 = SP2.opandnot(SP1); for (int i=0; i<16; i++) { - ASSERT_EQ(SP2.u.s8[i],0); - } + ASSERT_EQ(SP2.u.u8[i],0); + } } TEST(SuperVectorUtilsTest,Movemask128c){ From 54158a174651736cf9524aba09e3e06133652b4b Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Sat, 13 Nov 2021 19:36:46 +0000 Subject: [PATCH 29/37] vermicelli and match implementations for ppc64el added --- src/nfa/ppc64el/vermicelli.hpp | 126 ++++++++++++++++++++++++++++++++ src/nfa/vermicelli_simd.cpp | 2 + src/util/arch/ppc64el/match.hpp | 56 +++++++++++--- unit/internal/simd_utils.cpp | 1 - 4 files changed, 173 insertions(+), 12 deletions(-) create mode 100644 src/nfa/ppc64el/vermicelli.hpp diff --git a/src/nfa/ppc64el/vermicelli.hpp b/src/nfa/ppc64el/vermicelli.hpp new file mode 100644 index 00000000..eeaad6a1 --- /dev/null +++ b/src/nfa/ppc64el/vermicelli.hpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. 
+ */ + +template +static really_inline +const u8 *vermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, const u8 *buf, u16 const len) { + + data.print8("data"); + chars.print8("chars"); + casemask.print8("casemask"); + SuperVector mask = chars.eq(casemask & data); + mask.print8("mask"); + return last_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = mask1 & (mask2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) return buf - 1; + + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = (mask1 << 1)& mask2; + + DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | (SuperVector::Ones() >> (S-1)); + } + + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, + SuperVector const mask1, SuperVector const mask2, + u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) { + + SuperVector v1 = chars1.eq(data & mask1); + SuperVector v2 = chars2.eq(data & mask2); + SuperVector mask = v1 & (v2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (((buf[0] & m1) == c2) && ((buf[-1] & m2) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) return buf - 1; + + return first_non_zero_match(buf, mask, len); +} + + diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp index dbce6dc4..d790d137 100644 --- a/src/nfa/vermicelli_simd.cpp +++ b/src/nfa/vermicelli_simd.cpp @@ -75,6 +75,8 @@ const u8 
*vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector #include "x86/vermicelli.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "arm/vermicelli.hpp" +#elif defined(ARCH_PPC64EL) +#include "ppc64el/vermicelli.hpp" #endif template diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp index 3f24ce7f..a3f52e41 100644 --- a/src/util/arch/ppc64el/match.hpp +++ b/src/util/arch/ppc64el/match.hpp @@ -29,12 +29,12 @@ template <> really_really_inline -const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> v) { - if (unlikely(vec_any_ne(v.u.v128[0], SuperVector<16>::Ones().u.v128[0]))) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); - u32 pos = ctz32(~z & 0xffff); +const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z)) { + u32 pos = ctz32(z); DEBUG_PRINTF("~z %08x\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); assert(pos < 16); @@ -46,11 +46,45 @@ const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> v) { template <> really_really_inline -const u8 *lastMatch<16>(const u8 *buf, SuperVector<16> v) { - if (unlikely(vec_any_ne(v.u.v128[0], SuperVector<16>::Ones().u.v128[0]))) { - SuperVector<16>::movemask_type z = v.movemask(); - DEBUG_PRINTF("buf %p z %08x \n", buf, z); - DEBUG_PRINTF("z %08x\n", z); +const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z)) { + u32 pos = clz32(z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + + +template <> +really_really_inline +const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { + SuperVector<16>::movemask_type z = v.movemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + DEBUG_PRINTF("z %08x\n", z); + if (unlikely(z != 0xffff)) { u32 pos = clz32(~z & 0xffff); DEBUG_PRINTF("~z %08x\n", ~z); DEBUG_PRINTF("match @ pos %u\n", pos); diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 23640034..b1b9bfb1 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -671,7 +671,6 @@ TEST(SimdUtilsTest, movq) { #elif defined(ARCH_PPC64EL) int64x2_t a = {0x123456789abcdefLL, ~0LL }; simd = (m128) a; - simd = vreinterpretq_s32_s64(a); #endif #endif r = movq(simd); From 0287724413c61c9650956afd9221332de0aa7dea Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Tue, 16 Nov 2021 15:24:22 +0000 Subject: [PATCH 30/37] WIP:tracking last bugs in failing tests for release build --- src/util/supervector/arch/ppc64el/impl.cpp | 138 ++++++++++----------- src/util/supervector/supervector.hpp | 11 ++ 2 files changed, 80 insertions(+), 
69 deletions(-) diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 20a735b8..e054e02e 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -175,7 +175,7 @@ really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b template <> really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const { - return {(m128) vec_cmpeq((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])}; + return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; } template <> @@ -250,35 +250,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const { - return { (m128) vec_sl((int8x16_t)u.v128[0], vec_splats((uint8_t)N)) }; + return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const { - return { (m128) vec_sl((int16x8_t)u.v128[0], vec_splats((uint16_t)N)) }; + return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const { - return { (m128) vec_sl((int32x4_t)u.v128[0], vec_splats((uint32_t)N)) }; + return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { - return { (m128) vec_sl((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; + return { (m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const { - return { (m128) vec_sld((int8x16_t)u.v128[0], (int8x16_t)vec_splat_s8(0), N)}; + return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)}; } template <> @@ -292,35 +292,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const { - return { (m128) vec_sr((int8x16_t)u.v128[0], vec_splats((uint8_t)N)) }; + return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const { - return { (m128) vec_sr((int16x8_t)u.v128[0], vec_splats((uint16_t)N)) }; + return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const { - return { (m128) vec_sr((int32x4_t)u.v128[0], vec_splats((uint32_t)N)) }; + return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const { - return { (m128) vec_sr((int64x2_t)u.v128[0], vec_splats((uint64_t)N)) }; + return { (m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const { - return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), (int8x16_t)u.v128[0], 16 - N) }; + return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) }; } template <> @@ -352,7 +352,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int8x16_t)u.v128[0], vec_splats((uint8_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)n))}; }); return result; } @@ -362,7 
+362,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int16x8_t)u.v128[0], vec_splats((uint16_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)n))}; }); return result; } @@ -372,7 +372,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int32x4_t)u.v128[0], vec_splats((uint32_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)n))}; }); return result; } @@ -382,7 +382,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl((int64x2_t)u.v128[0], vec_splats((uint64_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)n))}; }); return result; } @@ -392,7 +392,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)u.v128[0], (int8x16_t)vec_splat_s8(0), n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); return result; } @@ -408,7 +408,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int8x16_t)u.v128[0], vec_splats((uint8_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)n))}; }); return result; } @@ -418,7 +418,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int16x8_t)u.v128[0], vec_splats((uint16_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)n))}; }); return result; } @@ -428,7 +428,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int32x4_t)u.v128[0], vec_splats((uint32_t)n))}; }); + 
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)n))}; }); return result; } @@ -438,7 +438,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr((int64x2_t)u.v128[0], vec_splats((uint64_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)n))}; }); return result; } @@ -448,7 +448,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), (int8x16_t)u.v128[0], 16 - n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), u.s8x16[0], 16 - n)}; }); return result; } @@ -462,21 +462,21 @@ template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { switch(N) { - case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 15)}; break; - case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 14)}; break; - case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 13)}; break; - case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 12)}; break; - case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 11)}; break; - case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 10)}; break; - case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 9)}; break; - case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 8)}; break; - case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 7)}; break; - case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 6)}; break; - case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 5)}; break; - case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 4)}; break; - case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 3)}; break; - case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 2)}; break; - case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), (int8x16_t) u.v128[0], 1)}; break; + case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 15)}; break; + case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 14)}; break; + case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 13)}; break; + case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 12)}; break; + case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 11)}; break; + case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 10)}; break; + case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 9)}; break; + case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 8)}; 
break; + case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 7)}; break; + case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break; + case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break; + case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break; + case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break; + case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break; + case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break; case 16: return Zeroes(); break; default: break; } @@ -487,21 +487,21 @@ template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { switch(N) { - case 1: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 1)}; break; - case 2: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 2)}; break; - case 3: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 3)}; break; - case 4: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 4)}; break; - case 5: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 5)}; break; - case 6: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 6)}; break; - case 7: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 7)}; break; - case 8: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 8)}; break; - case 9: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 9)}; break; - case 10: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 10)}; break; - case 11: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 11)}; break; - case 12: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 12)}; break; - case 13: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 13)}; break; - case 14: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 14)}; break; - case 15: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) vec_splat_s8(0), 15)}; break; + case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break; + case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break; + case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break; + case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break; + case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break; + case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break; + case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break; + case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break; + case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break; + case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break; + case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; break; + case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 12)}; break; + case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break; + case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break; + case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 
15)}; break; case 16: return Zeroes(); break; default: break; } @@ -549,21 +549,21 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in switch(offset) { case 0: return other; break; - case 1: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 15)}; break; - case 2: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 14)}; break; - case 3: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 13)}; break; - case 4: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 12)}; break; - case 5: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 11)}; break; - case 6: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 10)}; break; - case 7: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 9)}; break; - case 8: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 8)}; break; - case 9: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 7)}; break; - case 10: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 6)}; break; - case 11: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 5)}; break; - case 12: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 4)}; break; - case 13: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 3)}; break; - case 14: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 2)}; break; - case 15: return {(m128) vec_sld((int8x16_t) u.v128[0], (int8x16_t) other.u.v128[0], 1)}; break; + case 1: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 15)}; break; + case 2: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 14)}; break; + case 3: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 13)}; break; + case 4: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 12)}; break; + case 5: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 11)}; break; + case 6: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 10)}; break; + case 7: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 9)}; break; + case 8: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 8)}; break; + case 9: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 7)}; break; + case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break; + case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break; + case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break; + case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break; + case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break; + case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break; default: break; } return *this; @@ -576,9 +576,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. below is the version that is converted from Intel to PPC. 
*/ - uint8x16_t mask =(uint8x16_t)vec_cmpge((uint8x16_t)b.u.v128[0], (uint8x16_t)vec_splats((uint8_t)0x80)); - uint8x16_t res = vec_perm ((uint8x16_t)u.v128[0], (uint8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0]); - return (m128) vec_sel((uint8x16_t)res, (uint8x16_t)vec_splat_s8(0), (uint8x16_t)mask); + uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80)); + uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]); + return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask); } template<> diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index ed9d266a..737412f6 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -176,6 +176,17 @@ public: int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; #endif +#if defined(ARCH_PPC64EL) + __vector uint64_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; + __vector int64_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; + __vector uint32_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; + __vector int32_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; + __vector uint16_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; + __vector int16_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; + __vector uint8_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; + __vector int8_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; +#endif + uint64_t u64[SIZE / sizeof(uint64_t)]; int64_t s64[SIZE / sizeof(int64_t)]; uint32_t u32[SIZE / sizeof(uint32_t)]; From e13bfec734ac74642ac46cfcba486c66149e8424 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Wed, 24 Nov 2021 11:18:18 +0000 Subject: [PATCH 31/37] found and solved very hard to track bug of intrinsic function palignr, that manifested only in Release builds and not Debug builds in a particular number of tests --- src/util/arch/ppc64el/simd_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index c47c4585..a932682b 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -381,7 +381,7 @@ m128 palignr_imm(m128 r, m128 l, int offset) { static really_really_inline m128 palignr(m128 r, m128 l, int offset) { #if defined(HS_OPTIMIZE) - return (m128)vec_sld((int8x16_t)l, (int8x16_t)r, offset); + return palignr_imm(r, l, offset); #else return palignr_imm(r, l, offset); #endif From bfc8da11028a99da0966000795cf3132760f04d4 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Wed, 24 Nov 2021 12:11:21 +0000 Subject: [PATCH 32/37] Removed accidentaly included header file --- src/nfa/vermicelli_sse.h | 1296 -------------------------------------- 1 file changed, 1296 deletions(-) delete mode 100644 src/nfa/vermicelli_sse.h diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h deleted file mode 100644 index d985dd94..00000000 --- a/src/nfa/vermicelli_sse.h +++ /dev/null @@ -1,1296 +0,0 @@ -/* - * Copyright (c) 2015-2020, Intel Corporation - * Copyright (c) 2021, Arm Limited - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Vermicelli: Intel SSE implementation. - * - * (users should include vermicelli.h instead of this) - */ - -#if !defined(HAVE_AVX512) - -#define VERM_BOUNDARY 16 -#define VERM_TYPE m128 -#define VERM_SET_FN set1_16x8 - -static really_inline -const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf % 16 == 0); - for (; buf + 31 < buf_end; buf += 32) { - m128 data = load128(buf); - u32 z1 = movemask128(eq128(chars, data)); - m128 data2 = load128(buf + 16); - u32 z2 = movemask128(eq128(chars, data2)); - u32 z = z1 | (z2 << 16); - if (negate) { - z = ~z; - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - for (; buf + 15 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - return NULL; -} - -static really_inline -const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 31 < buf_end; buf += 32) { - m128 data = load128(buf); - u32 z1 = movemask128(eq128(chars, and128(casemask, data))); - m128 data2 = load128(buf + 16); - u32 z2 = movemask128(eq128(chars, and128(casemask, data2))); - u32 z = z1 | (z2 << 16); - if (negate) { - z = ~z; - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - - for (; buf + 15 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match 
pos %p\n", matchPos); - return matchPos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - return NULL; -} - -static really_inline -const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { - assert(z); - return buf_end - 16 + 31 - clz32(z); -} - -static really_inline -const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf_end % 16 == 0); - for (; buf + 15 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - /* - { - printf("after_load128 data:"); - for (int i=3; i>=0; i--) {printf("%d, ",data[i]);} - printf("\n"); - } - { - m128 res_eq = eq128(chars, data); - printf("dd:"); - for (int i=3; i>=0; i--) { printf("%d, ", res_eq[i]); } - } - */ - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf_end, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - return NULL; -} - -static really_inline -const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf_end % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 15 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - u32 z = movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf_end, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf + 16, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf + 16, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - return NULL; -} - -static really_inline -const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - if (buf[15] == c1 && buf[16] == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf += 16) { 
- m128 data = load128(buf); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - const u8 *matchPos = buf + ctz32(z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - return NULL; -} - - -static really_inline -const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - if (buf_end[-17] == c1 && buf_end[-16] == c2) { - z |= 1; - } - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf_end, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - return buf_end; -} - -static really_inline -const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - m128 casemask = set1_16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - if ((buf_end[-17] & CASE_CLEAR) == c1 - && (buf_end[-16] & CASE_CLEAR) == c2) { - z |= 1; - } - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf_end, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - } - return buf_end; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf + 16, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set1_16x8(CASE_CLEAR); - m128 data = loadu128(buf); 
- m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - const u8 *matchPos = lastMatchOffset(buf + 16, z); - DEBUG_PRINTF("match pos %p\n", matchPos); - return matchPos; - } - - return NULL; -} - -#else // HAVE_AVX512 - -#define VERM_BOUNDARY 64 -#define VERM_TYPE m512 -#define VERM_SET_FN set1_64x8 - -static really_inline -const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars, data); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -static really_inline -const u8 *vermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set1_64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars, v); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -static really_inline -const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf % 64 == 0); - for (; buf + 63 < buf_end; buf += 64) { - m512 data = load512(buf); - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - return NULL; -} - -static really_inline -const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf % 64 == 0); - m512 casemask = set1_64x8(CASE_CLEAR); - - for (; buf + 63 < buf_end; buf += 64) { - m512 data = load512(buf); - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) { - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) { - m512 casemask = set1_64x8(CASE_CLEAR); - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -static really_inline -const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); - - z &= mask; - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set1_64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars1, v) & 
(eq512mask(chars2, v) >> 1); - - z &= mask; - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 v1 = and512(data, mask1); - m512 v2 = and512(data, mask2); - - u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); - - z &= mask; - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - for (; buf + 64 < buf_end; buf += 64) { - m512 data = load512(buf); - u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); - if (buf[63] == c1 && buf[64] == c2) { - z |= (1ULL << 63); - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 64 == 0); - m512 casemask = set1_64x8(CASE_CLEAR); - - for (; buf + 64 < buf_end; buf += 64) { - m512 data = load512(buf); - m512 v = and512(casemask, data); - u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); - if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) { - z |= (1ULL << 63); - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedMasked(m512 chars1, m512 chars2, - m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1, - u8 m2, const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 64 == 0); - - for (; buf + 64 < buf_end; buf += 64) { - m512 data = load512(buf); - m512 v1 = and512(data, mask1); - m512 v2 = and512(data, mask2); - u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); - - if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) { - z |= (1ULL << 63); - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m512 casemask = set1_64x8(CASE_CLEAR); - m512 data = loadu512(buf); // unaligned - m512 v = and512(casemask, data); - u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2, - m512 mask1, m512 mask2, const u8 *buf) { - m512 data = loadu512(buf); // unaligned - m512 v1 = and512(data, mask1); - m512 v2 = and512(data, mask2); - u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); - - /* no fixup of the boundary required - the aligned run will 
pick it up */ - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *lastMatchOffset(const u8 *buf_end, u64a z) { - assert(z); - return buf_end - 64 + 63 - clz64(z); -} - -static really_inline -const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars, data); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set1_64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars, v); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf_end % 64 == 0); - for (; buf + 63 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return NULL; -} - -static really_inline -const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf_end % 64 == 0); - m512 casemask = set1_64x8(CASE_CLEAR); - - for (; buf + 63 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) { - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) { - m512 casemask = set1_64x8(CASE_CLEAR); - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); - - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set1_64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); - - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline 
-const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 64 == 0); - - for (; buf + 64 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); - if (buf_end[-65] == c1 && buf_end[-64] == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -static really_inline -const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 64 == 0); - m512 casemask = set1_64x8(CASE_CLEAR); - - for (; buf + 64 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - m512 v = and512(casemask, data); - u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); - if ((buf_end[-65] & CASE_CLEAR) == c1 - && (buf_end[-64] & CASE_CLEAR) == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { - m512 data = loadu512(buf); - u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); - - // no fixup of the boundary required - the aligned run will pick it up - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { - // due to laziness, nonalphas and nocase having interesting behaviour - m512 casemask = set1_64x8(CASE_CLEAR); - m512 data = loadu512(buf); - m512 v = and512(casemask, data); - u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); - // no fixup of the boundary required - the aligned run will pick it up - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - - return NULL; -} - -#endif // HAVE_AVX512 - -static really_inline -const u8 *vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? vermMiniNocase(chars, buf, buf_end, 0) - : vermMini(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - return buf_end; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf; - } -#endif - - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0) - : vermUnalign(chars, buf, 0); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0) - : vermSearchAligned(chars, buf, buf_end - 1, 0); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? 
vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0) - : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0); - return ptr ? ptr : buf_end; -} - -/* like vermicelliExec except returns the address of the first character which - * is not c */ -static really_inline -const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? vermMiniNocase(chars, buf, buf_end, 1) - : vermMini(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - return buf_end; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur != c) { - break; - } - } - return buf; - } -#endif - - size_t min = (size_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1) - : vermUnalign(chars, buf, 1); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1) - : vermSearchAligned(chars, buf, buf_end - 1, 1); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1) - : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1); - return ptr ? ptr : buf_end; -} - -// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if -// character not found. -static really_inline -const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rvermMiniNocase(chars, buf, buf_end, 0) - : rvermMini(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - return buf - 1; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (buf_end--; buf_end >= buf; buf_end--) { - char cur = (char)*buf_end; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf_end; - } -#endif - - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf backward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? rvermUnalignNocase(chars, - buf_end - VERM_BOUNDARY, - 0) - : rvermUnalign(chars, buf_end - VERM_BOUNDARY, - 0); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in. - const u8 *ptr = nocase ? 
rvermSearchAlignedNocase(chars, buf, buf_end, 0) - : rvermSearchAligned(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end, return buf - 1 if not found. - ptr = nocase ? rvermUnalignNocase(chars, buf, 0) - : rvermUnalign(chars, buf, 0); - return ptr ? ptr : buf - 1; -} - -/* like rvermicelliExec except returns the address of the last character which - * is not c */ -static really_inline -const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rvermMiniNocase(chars, buf, buf_end, 1) - : rvermMini(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - return buf - 1; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (buf_end--; buf_end >= buf; buf_end--) { - char cur = (char)*buf_end; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur != c) { - break; - } - } - return buf_end; - } -#endif - - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf backward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? rvermUnalignNocase(chars, - buf_end - VERM_BOUNDARY, - 1) - : rvermUnalign(chars, buf_end - VERM_BOUNDARY, - 1); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in. - const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1) - : rvermSearchAligned(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end, return buf - 1 if not found. - ptr = nocase ? rvermUnalignNocase(chars, buf, 1) - : rvermUnalign(chars, buf, 1); - return ptr ? ptr : buf - 1; -} - -static really_inline -const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ - VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ - -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? dvermMiniNocase(chars1, chars2, buf, buf_end) - : dvermMini(chars1, chars2, buf, buf_end); - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - u8 mask = nocase ? CASE_CLEAR : 0xff; - if ((buf_end[-1] & mask) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; - } -#endif - - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase - ? 
dvermPreconditionNocase(chars1, chars2, buf) - : dvermPrecondition(chars1, chars2, buf); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2, - buf, buf_end) - : dvermSearchAligned(chars1, chars2, c1, c2, buf, - buf_end); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? dvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY) - : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); - - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - u8 mask = nocase ? CASE_CLEAR : 0xff; - if ((buf_end[-1] & mask) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; -} - -/* returns highest offset of c2 (NOTE: not c1) */ -static really_inline -const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ - VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ - -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rdvermMiniNocase(chars1, chars2, buf, buf_end) - : rdvermMini(chars1, chars2, buf, buf_end); - - if (ptr) { - return ptr; - } - - // check for partial match at end ??? - return buf - 1; - } -#endif - - assert((buf_end - buf) >= VERM_BOUNDARY); - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // input not aligned, so we need to run one iteration with an unaligned - // load, then skip buf forward to the next aligned address. There's - // some small overlap here, but we don't mind scanning it twice if we - // can do it quickly, do we? - const u8 *ptr = nocase ? rdvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY) - : rdvermPrecondition(chars1, chars2, - buf_end - VERM_BOUNDARY); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in - if (nocase) { - return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); - } else { - return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); - } -} From 35e5369c708f429d1ab3492dba4ddd71b263fcdf Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Wed, 24 Nov 2021 15:03:49 +0000 Subject: [PATCH 33/37] *fix palignr implementation for VSX Release mode *add unit test for palignr *enable unit test building for Release mode --- src/util/arch/ppc64el/simd_utils.h | 1 + unit/CMakeLists.txt | 24 +++++++++++++++++------- unit/internal/simd_utils.cpp | 25 +++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index a932682b..137fc94f 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -381,6 +381,7 @@ m128 palignr_imm(m128 r, m128 l, int offset) { static really_really_inline m128 palignr(m128 r, m128 l, int offset) { #if defined(HS_OPTIMIZE) + // need a faster way to do this. 
return palignr_imm(r, l, offset); #else return palignr_imm(r, l, offset); diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 859f7ac0..932cd65e 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -63,7 +63,7 @@ target_link_libraries(unit-hyperscan hs expressionutil) endif() -if (NOT (RELEASE_BUILD OR FAT_RUNTIME)) +if (NOT FAT_RUNTIME ) set(unit_internal_SOURCES ${gtest_SOURCES} internal/bitfield.cpp @@ -72,8 +72,8 @@ set(unit_internal_SOURCES internal/compare.cpp internal/database.cpp internal/depth.cpp - internal/fdr.cpp - internal/fdr_flood.cpp + #internal/fdr.cpp + #internal/fdr_flood.cpp internal/fdr_loadval.cpp internal/flat_set.cpp internal/flat_map.cpp @@ -81,7 +81,7 @@ set(unit_internal_SOURCES internal/graph_undirected.cpp internal/insertion_ordered.cpp internal/lbr.cpp - internal/limex_nfa.cpp + #internal/limex_nfa.cpp internal/multi_bit.cpp internal/multi_bit_compress.cpp internal/nfagraph_common.h @@ -121,13 +121,22 @@ if (BUILD_AVX2) set(unit_internal_SOURCES ${unit_internal_SOURCES} internal/masked_move.cpp - ) + ) endif(BUILD_AVX2) +if (NOT RELEASE_BUILD) +set(unit_internal_SOURCES + ${unit_internal_SOURCES} + internal/fdr.cpp + internal/fdr_flood.cpp + internal/limex_nfa.cpp + ) +endif(NOT RELEASE_BUILD) + add_executable(unit-internal ${unit_internal_SOURCES}) set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}") target_link_libraries(unit-internal hs corpusomatic) -endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) +endif(NOT FAT_RUNTIME) if (BUILD_CHIMERA) # enable Chimera unit tests @@ -178,9 +187,10 @@ else() else () add_custom_target( unit + COMMAND bin/unit-internal COMMAND bin/unit-hyperscan WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - DEPENDS unit-hyperscan + DEPENDS unit-internal unit-hyperscan ) endif() endif() diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index b1b9bfb1..928abbfb 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -917,4 +917,29 @@ TEST(SimdUtilsTest, pshufb_m128) { } +/*Define ALIGNR128 macro*/ +#define TEST_ALIGNR128(v1, v2, buf, l) { \ + m128 v_aligned =palignr(v2,v1, l); \ + storeu128(res, v_aligned); \ + for (size_t i=0; i<16; i++) { \ + ASSERT_EQ(res[i], vec[i + l]); \ + } \ + } + +TEST(SimdUtilsTest, Alignr128){ + u8 vec[32]; + u8 res[16]; + for (int i=0; i<32; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + m128 v2 = loadu128(vec+16); + for (int j = 0; j<16; j++){ + TEST_ALIGNR128(v1, v2, vec, j); + } +} + + + + } // namespace From 725a8d8f1ab6e03e64ef01da84fc718a45132da0 Mon Sep 17 00:00:00 2001 From: Apostolos Tapsas Date: Wed, 24 Nov 2021 15:09:53 +0000 Subject: [PATCH 34/37] Removed duplicates --- unit/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 932cd65e..ffc39a5f 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -72,8 +72,6 @@ set(unit_internal_SOURCES internal/compare.cpp internal/database.cpp internal/depth.cpp - #internal/fdr.cpp - #internal/fdr_flood.cpp internal/fdr_loadval.cpp internal/flat_set.cpp internal/flat_map.cpp @@ -81,7 +79,6 @@ set(unit_internal_SOURCES internal/graph_undirected.cpp internal/insertion_ordered.cpp internal/lbr.cpp - #internal/limex_nfa.cpp internal/multi_bit.cpp internal/multi_bit_compress.cpp internal/nfagraph_common.h From cd95b1a38c6b49474abb51e0fc8e2b8669141228 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 25 Nov 2021 06:20:53 +0000 Subject: [PATCH 35/37] use __builtin_constant_p() instead for arm as well 
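
Background for this change, as a minimal sketch (GCC/Clang assumed; the helper names below are hypothetical stand-ins, not code from this patch): __builtin_constant_p() is resolved after inlining and constant propagation, so a forced-inline wrapper can take the immediate-operand path whenever the caller passes a literal, and fall back to a generic implementation for run-time values. The HS_OPTIMIZE-only specialisations replaced below always took the immediate path in optimised builds, which is what Release builds of the unit tests, which pass loop variables as offsets, tripped over.

    /* Illustrative sketch only. In the real code the constant branch calls an
     * intrinsic whose last argument must be a literal immediate (vextq_s8 on
     * ARM, _mm_alignr_epi8 on x86), and slow_any() stands in for the
     * switch-based palignr_imm()-style fallback. */
    static inline int fast_imm(int v, int n) { return v << n; }  /* immediate-only path */
    static inline int slow_any(int v, int n) { return v << n; }  /* works for any run-time n */

    static inline int shl_dispatch(int v, int n) {
    #if defined(__GNUC__) || defined(__clang__)
        if (__builtin_constant_p(n)) {
            return fast_imm(v, n);   /* taken only when n is a compile-time constant */
        }
    #endif
        return slow_any(v, n);       /* taken for run-time n, e.g. a loop variable */
    }
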
--- src/util/arch/arm/simd_utils.h | 9 ++--- src/util/supervector/arch/arm/impl.cpp | 46 ++++++++++---------------- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 630cac93..4c68b485 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -328,11 +328,12 @@ m128 palignr_imm(m128 r, m128 l, int offset) { static really_really_inline m128 palignr(m128 r, m128 l, int offset) { -#if defined(HS_OPTIMIZE) - return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); -#else - return palignr_imm(r, l, offset); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); + } #endif + return palignr_imm(r, l, offset); } #undef CASE_ALIGN_VECTORS diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index f804abeb..980f0b39 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -482,34 +482,27 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const return vshr_128(N); } -#ifdef HS_OPTIMIZE -template <> -really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const -{ - return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)}; -} -#else template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)}; + } +#endif return vshr_128(N); } -#endif -#ifdef HS_OPTIMIZE -template <> -really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const -{ - return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)}; -} -#else template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)}; + } +#endif return vshl_128(N); } -#endif - template<> really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) @@ -547,20 +540,18 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint return mask & v; } -#ifdef HS_OPTIMIZE template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { - if (offset == 16) { - return *this; - } else { - return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)}; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + if (offset == 16) { + return *this; + } else { + return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)}; + } } -} -#else -template<> -really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) -{ +#endif switch(offset) { case 0: return other; break; case 1: return {vextq_u8( other.u.u8x16[0], u.u8x16[0], 1)}; break; @@ -583,7 +574,6 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in } return *this; } -#endif template<> template<> From 00384c9e377286e6742b4ab606c79b6fd3dbf06a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 25 Nov 2021 06:21:07 +0000 Subject: [PATCH 36/37] nit --- unit/internal/simd_utils.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 928abbfb..900078bb 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -916,10 +916,9 @@ TEST(SimdUtilsTest, 
pshufb_m128) { } } - /*Define ALIGNR128 macro*/ #define TEST_ALIGNR128(v1, v2, buf, l) { \ - m128 v_aligned =palignr(v2,v1, l); \ + m128 v_aligned = palignr(v2,v1, l); \ storeu128(res, v_aligned); \ for (size_t i=0; i<16; i++) { \ ASSERT_EQ(res[i], vec[i + l]); \ From 7ceca78db4486c2d8a075be66520fa79a269bbfd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 25 Nov 2021 15:09:01 +0200 Subject: [PATCH 37/37] fix unit-internal release builds using __builtin_constant_p() as well --- src/util/supervector/arch/x86/impl.cpp | 101 ++++++++++++------------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 164c4e8b..b7686220 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -520,16 +520,18 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint return mask & v; } -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) -{ - return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], offset)}; -} -#else template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + if (offset == 16) { + return *this; + } else { + return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], offset)}; + } + } +#endif switch(offset) { case 0: return other; break; case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break; @@ -551,7 +553,6 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in } return *this; } -#endif template<> template<> @@ -1037,47 +1038,41 @@ really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const return vshr_256(N); } -#ifdef HS_OPTIMIZE template <> really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const { - // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx - if (N < 16) { - return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)}; - } else if (N == 16) { - return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; - } else { - return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)}; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx + if (N < 16) { + return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)}; + } else if (N == 16) { + return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; + } else { + return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)}; + } } -} -#else -template <> -really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const -{ +#endif return vshr_256(N); } -#endif -#ifdef HS_OPTIMIZE template <> really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const { - // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx - if (N < 16) { - return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; - } else if (N == 16) { - return 
{_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; - } else { - return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)}; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx + if (N < 16) { + return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; + } else if (N == 16) { + return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; + } else { + return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)}; + } } -} -#else -template <> -really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const -{ +#endif return vshl_256(N); } -#endif template<> really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N) @@ -1132,16 +1127,18 @@ really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint #endif } -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset) -{ - return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], offset)}; -} -#else template<> really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + if (offset == 16) { + return *this; + } else { + return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], offset)}; + } + } +#endif // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458 switch (offset){ case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break; @@ -1180,7 +1177,6 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, in } return *this; } -#endif template<> template<> @@ -1772,16 +1768,18 @@ really_inline SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b, u return {_mm512_maskz_shuffle_epi8(mask, u.v512[0], b.u.v512[0])}; } -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset) -{ - return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], offset)}; -} -#else template<> really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + if (offset == 16) { + return *this; + } else { + return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], offset)}; + } + } +#endif if(offset == 0) { return *this; } else if (offset < 32){ @@ -1802,7 +1800,6 @@ really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t return *this; } } -#endif #endif // HAVE_AVX512
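
The switch-based alignr fallbacks above and the Alignr128 unit test added earlier in this series follow the same byte-level contract: palignr(hi, lo, n) reads 16 bytes starting at offset n from the 32-byte concatenation lo:hi, which is why the test expects res[i] == vec[i + l]. A scalar reference makes this explicit (illustrative only; alignr_ref is a hypothetical helper, not part of these patches):

    /* Scalar reference for palignr(hi, lo, n). */
    #include <stdint.h>
    #include <string.h>

    static void alignr_ref(const uint8_t lo[16], const uint8_t hi[16],
                           unsigned n, uint8_t out[16]) {
        uint8_t concat[32];
        memcpy(concat, lo, 16);       /* low half first: the 'l' operand of palignr(r, l, n) */
        memcpy(concat + 16, hi, 16);  /* high half: the 'r' operand */
        memcpy(out, concat + n, 16);  /* valid for n in [0, 16] */
    }
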