From 2d89df44ae9cd83a58ec949733a4a7bd1456a193 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 17 Sep 2020 19:00:48 +0300 Subject: [PATCH 01/53] move x86 arch and SIMD types to x86 arch folder --- src/util/arch.h | 55 ++----------------- src/util/arch/x86/simd_types.h | 45 ++++++++++++++++ src/util/arch/x86/x86.h | 96 ++++++++++++++++++++++++++++++++++ src/util/simd_types.h | 16 +++--- src/util/simd_utils.h | 2 +- 5 files changed, 152 insertions(+), 62 deletions(-) create mode 100644 src/util/arch/x86/simd_types.h create mode 100644 src/util/arch/x86/x86.h diff --git a/src/util/arch.h b/src/util/arch.h index 985fec6a..57e39c07 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -33,58 +33,9 @@ #ifndef UTIL_ARCH_H_ #define UTIL_ARCH_H_ -#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) -#define HAVE_SSE2 +#if defined(__i386__) || defined(__x86_64__) +#include "util/arch/x86/x86.h" #endif -#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) -#define HAVE_SSE41 -#endif +#endif // UTIL_ARCH_X86_H_ -#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) -#define HAVE_SSE42 -#endif - -#if defined(__AVX__) -#define HAVE_AVX -#endif - -#if defined(__AVX2__) -#define HAVE_AVX2 -#endif - -#if defined(__AVX512BW__) -#define HAVE_AVX512 -#endif - -#if defined(__AVX512VBMI__) -#define HAVE_AVX512VBMI -#endif - -/* - * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros - */ -#if defined(__POPCNT__) || \ - (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ - (defined(_WIN32) && defined(__AVX__)) -#define HAVE_POPCOUNT_INSTR -#endif - -#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ - (defined(__INTEL_COMPILER) && defined(__AVX2__)) -#define HAVE_BMI -#endif - -#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ - (defined(__INTEL_COMPILER) && defined(__AVX2__)) -#define HAVE_BMI2 -#endif - -/* - * MSVC uses a different form of inline asm - */ -#if defined(_WIN32) && defined(_MSC_VER) -#define NO_ASM -#endif - -#endif // UTIL_ARCH_H_ diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h new file mode 100644 index 00000000..a582abd5 --- /dev/null +++ b/src/util/arch/x86/simd_types.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_TYPES_X86_H +#define SIMD_TYPES_X86_H + +#if !defined(m128) && defined(HAVE_SSE2) +typedef __m128i m128; +#endif + +#if !defined(m128) && defined(HAVE_AVX2) +typedef __m256i m256; +#endif + +#if !defined(m512) && defined(HAVE_AVX512) +typedef __m512i m512; +#endif + +#endif /* SIMD_TYPES_H */ + diff --git a/src/util/arch/x86/x86.h b/src/util/arch/x86/x86.h new file mode 100644 index 00000000..8126f14a --- /dev/null +++ b/src/util/arch/x86/x86.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_X86_H_ +#define UTIL_ARCH_X86_H_ + +#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) +#define HAVE_SSE2 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE41 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE42 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#define HAVE_SIMD_256_BITS +#endif + +#if defined(__AVX2__) +#define HAVE_AVX2 +#define HAVE_SIMD_256_BITS +#endif + +#if defined(__AVX512BW__) +#define HAVE_AVX512 +#define HAVE_SIMD_512_BITS +#endif + +#if defined(__AVX512VBMI__) +#define HAVE_AVX512VBMI +#endif + +/* + * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros + */ +#if defined(__POPCNT__) || \ + (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ + (defined(_WIN32) && defined(__AVX__)) +#define HAVE_POPCOUNT_INSTR +#endif + +#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI +#endif + +#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI2 +#endif + +/* + * MSVC uses a different form of inline asm + */ +#if defined(_WIN32) && defined(_MSC_VER) +#define NO_ASM +#endif + +#endif // UTIL_ARCH_X86_H_ diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 962cad6c..a58ede4d 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -34,22 +34,20 @@ #include "util/intrinsics.h" #include "ue2common.h" -#if defined(HAVE_SSE2) -typedef __m128i m128; -#else +#if defined(__i386__) || defined(__x86_64__) +#include "util/arch/x86/simd_types.h" +#endif + +#if !defined(m128) && !defined(HAVE_SIMD_128_BITS) typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; #endif -#if defined(HAVE_AVX2) -typedef __m256i m256; -#else +#if !defined(m256) && !defined(HAVE_SIMD_256_BITS) typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; #endif typedef struct {m128 lo; m128 mid; m128 hi;} m384; -#if defined(HAVE_AVX512) -typedef __m512i m512; -#else +#if !defined(m512) && !defined(HAVE_SIMD_512_BITS) typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; #endif diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 42223133..671a5bab 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -38,10 +38,10 @@ #endif #include "config.h" +#include "util/arch.h" #include "ue2common.h" #include "simd_types.h" #include "unaligned.h" -#include "util/arch.h" #include "util/intrinsics.h" #include // for memcpy From 6a407937197744252b1e90cc22245d8c9d8a80ae Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 17 Sep 2020 20:35:39 +0300 Subject: [PATCH 02/53] move cpuid stuff to util/arch/x86 --- CMakeLists.txt | 4 ++-- src/dispatcher.c | 4 +++- src/hs.cpp | 6 ++++-- src/hs_valid_platform.c | 6 ++++-- src/util/{ => arch/x86}/cpuid_flags.c | 0 src/util/{ => arch/x86}/cpuid_flags.h | 0 src/util/{ => arch/x86}/cpuid_inline.h | 0 src/util/target_info.cpp | 4 +++- 8 files changed, 16 insertions(+), 8 deletions(-) rename src/util/{ => arch/x86}/cpuid_flags.c (100%) rename src/util/{ => arch/x86}/cpuid_flags.h (100%) rename src/util/{ => arch/x86}/cpuid_inline.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 59c6e6e2..9cd6ad96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -564,8 
+564,8 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs") set (hs_exec_common_SRCS src/alloc.c src/scratch.c - src/util/cpuid_flags.c - src/util/cpuid_flags.h + src/util/arch/x86/cpuid_flags.c + src/util/arch/x86/cpuid_flags.h src/util/multibit.c ) diff --git a/src/dispatcher.c b/src/dispatcher.c index a786b806..76ed37a1 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -30,7 +30,9 @@ #include "hs_common.h" #include "hs_runtime.h" #include "ue2common.h" -#include "util/cpuid_inline.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_inline.h" +#endif #include "util/join.h" #if defined(DISABLE_AVX512_DISPATCH) diff --git a/src/hs.cpp b/src/hs.cpp index ab54105c..a0cb9bb3 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -44,8 +44,10 @@ #include "parser/prefilter.h" #include "parser/unsupported.h" #include "util/compile_error.h" -#include "util/cpuid_flags.h" -#include "util/cpuid_inline.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_flags.h" +#include "util/arch/x86/cpuid_inline.h" +#endif #include "util/depth.h" #include "util/popcount.h" #include "util/target_info.h" diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 59ad3f3a..7a022607 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -26,9 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "config.h" #include "hs_common.h" -#include "util/cpuid_flags.h" -#include "util/cpuid_inline.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_inline.h" +#endif HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { diff --git a/src/util/cpuid_flags.c b/src/util/arch/x86/cpuid_flags.c similarity index 100% rename from src/util/cpuid_flags.c rename to src/util/arch/x86/cpuid_flags.c diff --git a/src/util/cpuid_flags.h b/src/util/arch/x86/cpuid_flags.h similarity index 100% rename from src/util/cpuid_flags.h rename to src/util/arch/x86/cpuid_flags.h diff --git a/src/util/cpuid_inline.h b/src/util/arch/x86/cpuid_inline.h similarity index 100% rename from src/util/cpuid_inline.h rename to src/util/arch/x86/cpuid_inline.h diff --git a/src/util/target_info.cpp b/src/util/target_info.cpp index 3a41e020..6eab701d 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -29,7 +29,9 @@ #include "hs_compile.h" // for various hs_platform_info flags #include "target_info.h" -#include "util/cpuid_flags.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_flags.h" +#endif namespace ue2 { From ea721c908f9baa75c50320285f552ee995669191 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 18 Sep 2020 12:48:14 +0300 Subject: [PATCH 03/53] move crc32 SSE42 implementation to util/arch/x86 --- src/crc32.c | 49 +---------------------- src/util/arch/x86/crc32.h | 82 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 48 deletions(-) create mode 100644 src/util/arch/x86/crc32.h diff --git a/src/crc32.c b/src/crc32.c index 1dae47b4..19c7b7fa 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -30,7 +30,6 @@ #include "config.h" #include "ue2common.h" #include "util/arch.h" -#include "util/intrinsics.h" #if !defined(HAVE_SSE42) @@ -579,53 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf, } #else // HAVE_SSE42 - -#ifdef ARCH_64_BIT -#define CRC_WORD 8 -#define CRC_TYPE u64a -#define CRC_FUNC _mm_crc32_u64 -#else -#define CRC_WORD 4 -#define CRC_TYPE u32 -#define CRC_FUNC _mm_crc32_u32 -#endif - -/* - * Use the crc32 instruction from SSE4.2 to compute our checksum - same - * polynomial as the above 
function. - */ -static really_inline -u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, - const size_t length) { - u32 crc = running_crc; - - // Process byte-by-byte until p_buf is aligned - - const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); - size_t init_bytes = aligned_buf - p_buf; - size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD; - size_t end_bytes = length - init_bytes - running_length; - - while (p_buf < aligned_buf) { - crc = _mm_crc32_u8(crc, *p_buf++); - } - - // Main aligned loop, processes a word at a time. - - for (size_t li = 0; li < running_length/CRC_WORD; li++) { - CRC_TYPE block = *(const CRC_TYPE *)p_buf; - crc = CRC_FUNC(crc, block); - p_buf += CRC_WORD; - } - - // Remaining bytes - - for(size_t li = 0; li < end_bytes; li++) { - crc = _mm_crc32_u8(crc, *p_buf++); - } - - return crc; -} +#include "util/arch/x86/crc32.h" #endif #ifdef VERIFY_ASSERTION diff --git a/src/util/arch/x86/crc32.h b/src/util/arch/x86/crc32.h new file mode 100644 index 00000000..d5e7d424 --- /dev/null +++ b/src/util/arch/x86/crc32.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_ARCH_X86_CRC32_H_ +#define UTIL_ARCH_X86_CRC32_H_ + +#include "util/arch/x86/x86.h" +#include "util/intrinsics.h" + +#ifdef ARCH_64_BIT +#define CRC_WORD 8 +#define CRC_TYPE u64a +#define CRC_FUNC _mm_crc32_u64 +#else +#define CRC_WORD 4 +#define CRC_TYPE u32 +#define CRC_FUNC _mm_crc32_u32 +#endif + +/* + * Use the crc32 instruction from SSE4.2 to compute our checksum - same + * polynomial as the above function. 
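+ * (i.e. the table-driven crc32c_sb8_64_bit() that stays behind in crc32.c)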
+ */ +static really_inline +u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, + const size_t length) { + u32 crc = running_crc; + + // Process byte-by-byte until p_buf is aligned + + const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); + size_t init_bytes = aligned_buf - p_buf; + size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD; + size_t end_bytes = length - init_bytes - running_length; + + while (p_buf < aligned_buf) { + crc = _mm_crc32_u8(crc, *p_buf++); + } + + // Main aligned loop, processes a word at a time. + + for (size_t li = 0; li < running_length/CRC_WORD; li++) { + CRC_TYPE block = *(const CRC_TYPE *)p_buf; + crc = CRC_FUNC(crc, block); + p_buf += CRC_WORD; + } + + // Remaining bytes + + for(size_t li = 0; li < end_bytes; li++) { + crc = _mm_crc32_u8(crc, *p_buf++); + } + + return crc; +} + +#endif // UTIL_ARCH_X86_CRC32_H_ \ No newline at end of file From 956b001613ef301e9e5b2e2742c9bad3037ddaef Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 18 Sep 2020 12:51:39 +0300 Subject: [PATCH 04/53] move masked_move* AVX2 implementation to util/arch/x86 --- src/util/{ => arch/x86}/masked_move.c | 0 src/util/{ => arch/x86}/masked_move.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/util/{ => arch/x86}/masked_move.c (100%) rename src/util/{ => arch/x86}/masked_move.h (100%) diff --git a/src/util/masked_move.c b/src/util/arch/x86/masked_move.c similarity index 100% rename from src/util/masked_move.c rename to src/util/arch/x86/masked_move.c diff --git a/src/util/masked_move.h b/src/util/arch/x86/masked_move.h similarity index 100% rename from src/util/masked_move.h rename to src/util/arch/x86/masked_move.h From 8ed5f4ac757b7eca7baf5dc58c3552f2bdc792c2 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 18 Sep 2020 12:55:57 +0300 Subject: [PATCH 05/53] fix include paths for masked_move --- CMakeLists.txt | 4 ++-- src/hwlm/noodle_engine.c | 5 ++++- src/util/arch/x86/masked_move.h | 6 +++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cd6ad96..e5078848 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -694,7 +694,6 @@ set (hs_exec_SRCS src/util/exhaust.h src/util/fatbit.h src/util/join.h - src/util/masked_move.h src/util/multibit.h src/util/multibit.c src/util/multibit_compress.h @@ -716,7 +715,8 @@ set (hs_exec_SRCS set (hs_exec_avx2_SRCS src/fdr/teddy_avx2.c - src/util/masked_move.c + src/util/arch/x86/masked_move.c + src/util/arch/x86/masked_move.h ) diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index d4f6902a..da61dfe8 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -39,10 +39,13 @@ #include "util/compare.h" #include "util/intrinsics.h" #include "util/join.h" -#include "util/masked_move.h" #include "util/partial_store.h" #include "util/simd_utils.h" +#if defined(HAVE_AVX2) +#include "util/arch/x86/masked_move.h" +#endif + #include #include #include diff --git a/src/util/arch/x86/masked_move.h b/src/util/arch/x86/masked_move.h index 4c877ca9..c46ad144 100644 --- a/src/util/arch/x86/masked_move.h +++ b/src/util/arch/x86/masked_move.h @@ -29,12 +29,12 @@ #ifndef MASKED_MOVE_H #define MASKED_MOVE_H -#include "arch.h" +#include "x86.h" #if defined(HAVE_AVX2) -#include "unaligned.h" -#include "simd_utils.h" +#include "util/unaligned.h" +#include "util/simd_utils.h" #ifdef __cplusplus extern "C" { From aac1f0f1dc2bdbdf330198e84e972871371a5ab0 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: 
Tue, 22 Sep 2020 11:02:07 +0300 Subject: [PATCH 06/53] move x86 bitutils.h implementations to util/arch/x86/bitutils.h --- src/util/arch/common/bitutils.h | 353 +++++++++++++++++++++++++++++ src/util/arch/x86/bitutils.h | 304 +++++++++++++++++++++++++ src/util/bitutils.h | 384 +++----------------------------- 3 files changed, 688 insertions(+), 353 deletions(-) create mode 100644 src/util/arch/common/bitutils.h create mode 100644 src/util/arch/x86/bitutils.h diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h new file mode 100644 index 00000000..85d5dc49 --- /dev/null +++ b/src/util/arch/common/bitutils.h @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_COMMON_H +#define BITUTILS_ARCH_COMMON_H + +#include "util/popcount.h" + +static really_inline +u32 clz32_impl_c(u32 x) { + return (u32)__builtin_clz(x); +} + +static really_inline +u32 clz64_impl_c(u64a x) { + return (u32)__builtin_clzll(x); +} + +// CTZ (count trailing zero) implementations. 
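+// The generic versions in this header use the GCC/Clang builtins and form the
+// portable fallback layer: util/arch/x86/bitutils.h includes this file and
+// wraps each *_impl_c() in an *_impl() that substitutes a faster x86 sequence
+// only where one is available.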
+static really_inline +u32 ctz32_impl_c(u32 x) { + return (u32)__builtin_ctz(x); +} + +static really_inline +u32 ctz64_impl_c(u64a x) { + return (u32)__builtin_ctzll(x); +} + +static really_inline +u32 lg2_impl_c(u32 x) { + if (!x) { + return 0; + } + return 31 - clz32_impl_c(x); +} + +static really_inline +u64a lg2_64_impl_c(u64a x) { + if (!x) { + return 0; + } + return 63 - clz64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl_c(u32 *v) { + u32 val = *v; + u32 offset = ctz32_impl_c(val); + *v = val & (val - 1); + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearLSB_64_impl_c(u64a *v) { +#ifdef ARCH_64_BIT + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64_impl_c(val); + *v = val & (val - 1); +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (u32)(*v >> 32); + u32 offset; + if (v1) { + offset = findAndClearLSB_32_impl_c(&v1); + *v = (u64a)v1 | ((u64a)v2 << 32); + } else { + offset = findAndClearLSB_32_impl_c(&v2) + 32; + *v = (u64a)v2 << 32; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 findAndClearMSB_32_impl_c(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl_c(val); + *v = val & ~(1 << offset); + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl_c(u64a *v) { +#ifdef ARCH_64_BIT + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64_impl_c(val); + *v = val & ~(1ULL << offset); +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (*v >> 32); + u32 offset; + if (v2) { + offset = findAndClearMSB_32_impl_c(&v2) + 32; + *v = ((u64a)v2 << 32) | (u64a)v1; + } else { + offset = findAndClearMSB_32_impl_c(&v1); + *v = (u64a)v1; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 compress32_impl_c(u32 x, u32 m) { + + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u32 mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +} + +static really_inline +u64a compress64_impl_c(u64a x, u64a m) { + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u64a mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + mp ^= mp << 32; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +} + +static really_inline +u32 expand32_impl_c(u32 x, u32 m) { + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u32 m0, mk, mp, mv, t; + u32 array[5]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ 
(mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 4; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +} + +static really_inline +u64a expand64_impl_c(u64a x, u64a m) { + + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u64a m0, mk, mp, mv, t; + u64a array[6]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mp = mp ^ (mp << 32); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 5; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +} + + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. */ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl_c(bitfield); +} + +static really_inline +char bf64_set_impl_c(u64a *bitfield, u32 i) { + u64a mask = 1ULL << i; + char was_set = !!(*bitfield & mask); + *bitfield |= mask; + + return was_set; +} + +static really_inline +void bf64_unset_impl_c(u64a *bitfield, u32 i) { + *bitfield &= ~(1ULL << i); +} + +static really_inline +u32 rank_in_mask32_impl_c(u32 mask, u32 bit) { + mask &= (u32)(1U << bit) - 1; + return popcount32(mask); +} + +static really_inline +u32 rank_in_mask64_impl_c(u64a mask, u32 bit) { + mask &= (u64a)(1ULL << bit) - 1; + return popcount64(mask); +} + +static really_inline +u32 pext32_impl_c(u32 x, u32 mask) { + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_32_impl_c(&mask); + if (x & (1U << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +} + +static really_inline +u64a pext64_impl_c(u64a x, u64a mask) { + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_64_impl_c(&mask); + if (x & (1ULL << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +} + +#endif // BITUTILS_ARCH_COMMON_H diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h new file mode 100644 index 00000000..da7c747e --- /dev/null +++ b/src/util/arch/x86/bitutils.h @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_X86_H +#define BITUTILS_ARCH_X86_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { +#if defined(_WIN32) + unsigned long r; + _BitScanReverse(&r, x); + return 31 - r; +#else + return clz32_impl_c(x); +#endif +} + +static really_inline +u32 clz64_impl(u64a x) { +#if defined(_WIN64) + unsigned long r; + _BitScanReverse64(&r, x); + return 63 - r; +#elif defined(_WIN32) + unsigned long x1 = (u32)x; + unsigned long x2 = (u32)(x >> 32); + unsigned long r; + if (x2) { + _BitScanReverse(&r, x2); + return (u32)(31 - r); + } + _BitScanReverse(&r, (u32)x1); + return (u32)(63 - r); +#else + return clz64_impl_c(x); +#endif +} + +// CTZ (count trailing zero) implementations. 
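+// As with CLZ above, MSVC does not provide __builtin_ctz/__builtin_ctzll, so
+// the _WIN32/_WIN64 paths use the _BitScanForward* intrinsics; the non-Windows
+// paths simply defer to ctz32_impl_c()/ctz64_impl_c() from the common header.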
+static really_inline +u32 ctz32_impl(u32 x) { +#if defined(_WIN32) + unsigned long r; + _BitScanForward(&r, x); + return r; +#else + return ctz32_impl_c(x); +#endif +} + +static really_inline +u32 ctz64_impl(u64a x) { +#if defined(_WIN64) + unsigned long r; + _BitScanForward64(&r, x); + return r; +#elif defined(_WIN32) + unsigned long r; + if (_BitScanForward(&r, (u32)x)) { + return (u32)r; + } + _BitScanForward(&r, x >> 32); + return (u32)(r + 32); +#else + return ctz64_impl_c(x); +#endif +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { +#ifndef NO_ASM + u32 val = *v, offset; + __asm__ ("bsf %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; + + assert(offset < 32); + return offset; +#else + return findAndClearLSB_32_impl_c(v); +#endif + +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { +#ifdef ARCH_64_BIT +#if !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsfq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64(val); + *v = val & (val - 1); +#endif // ARCH_X86_64 + assert(offset < 64); + return (u32)offset; +#else + return findAndClearLSB_64_impl_c(v); +#endif +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { +#if !defined(NO_ASM) + u32 val = *v, offset; + __asm__ ("bsr %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); +#endif + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { +#ifdef ARCH_64_BIT +#if !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsrq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64_impl(val); + *v = val & ~(1ULL << offset); +#endif // ARCH_X86_64 + assert(offset < 64); + return (u32)offset; +#else + return findAndClearMSB_64_impl_c(v); +#endif +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u32(x, m); +#else + return compress32_impl_c(x, m); +#endif +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u64(x, m); +#else + return compress64_impl_c(x, m); +#endif +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u32(x, m); +#else + return expand32_impl_c(x, m); +#endif +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u64(x, m); +#else + return expand64_impl_c(x, m); +#endif +} + + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { +#if defined(HAVE_BMI2) + // Intel BMI2 can do this operation in one instruction. + return _pext_u32(x, mask); +#else + return pext32_impl_c(x, mask); +#endif +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + // Intel BMI2 can do this operation in one instruction. + return _pext_u64(x, mask); +#else + return pext64_impl_c(x, mask); +#endif +} + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) +static really_inline +u64a pdep64(u64a x, u64a mask) { + return _pdep_u64(x, mask); +} +#endif + +#endif // BITUTILS_ARCH_X86_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index c545ee18..651e5f93 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -33,6 +33,7 @@ #ifndef BITUTILS_H #define BITUTILS_H +#include "config.h" #include "ue2common.h" #include "popcount.h" #include "util/arch.h" @@ -43,351 +44,88 @@ #define DOUBLE_CASE_CLEAR 0xdfdf #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL + +#if defined(_WIN32) || defined(_WIN64) || defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/bitutils.h" +#endif + static really_inline u32 clz32(u32 x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN32) - unsigned long r; - _BitScanReverse(&r, x); - return 31 - r; -#else - return (u32)__builtin_clz(x); -#endif + + return clz32_impl(x); } static really_inline u32 clz64(u64a x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN64) - unsigned long r; - _BitScanReverse64(&r, x); - return 63 - r; -#elif defined(_WIN32) - unsigned long x1 = (u32)x; - unsigned long x2 = (u32)(x >> 32); - unsigned long r; - if (x2) { - _BitScanReverse(&r, x2); - return (u32)(31 - r); - } - _BitScanReverse(&r, (u32)x1); - return (u32)(63 - r); -#else - return (u32)__builtin_clzll(x); -#endif + + return clz64_impl(x); } // CTZ (count trailing zero) implementations. 
static really_inline u32 ctz32(u32 x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN32) - unsigned long r; - _BitScanForward(&r, x); - return r; -#else - return (u32)__builtin_ctz(x); -#endif + + return ctz32_impl(x); } static really_inline u32 ctz64(u64a x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN64) - unsigned long r; - _BitScanForward64(&r, x); - return r; -#elif defined(_WIN32) - unsigned long r; - if (_BitScanForward(&r, (u32)x)) { - return (u32)r; - } - _BitScanForward(&r, x >> 32); - return (u32)(r + 32); -#else - return (u32)__builtin_ctzll(x); -#endif + + return ctz64_impl(x); } static really_inline u32 lg2(u32 x) { - if (!x) { - return 0; - } - return 31 - clz32(x); + return lg2_impl(x); } static really_inline u64a lg2_64(u64a x) { - if (!x) { - return 0; - } - return 63 - clz64(x); + return lg2_64_impl(x); } static really_inline u32 findAndClearLSB_32(u32 *v) { - assert(*v != 0); // behaviour not defined in this case -#ifndef NO_ASM - u32 val = *v, offset; - __asm__ ("bsf %1, %0\n" - "btr %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - u32 val = *v; - u32 offset = ctz32(val); - *v = val & (val - 1); -#endif - - assert(offset < 32); - return offset; + return findAndClearLSB_32_impl(v); } static really_inline u32 findAndClearLSB_64(u64a *v) { - assert(*v != 0); // behaviour not defined in this case - -#ifdef ARCH_64_BIT -#if defined(ARCH_X86_64) && !defined(NO_ASM) - u64a val = *v, offset; - __asm__ ("bsfq %1, %0\n" - "btrq %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - // generic variant using gcc's builtin on 64-bit - u64a val = *v, offset; - offset = ctz64(val); - *v = val & (val - 1); -#endif // ARCH_X86_64 -#else - // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't - // inline calls to __builtin_ctzll - u32 v1 = (u32)*v; - u32 v2 = (u32)(*v >> 32); - u32 offset; - if (v1) { - offset = findAndClearLSB_32(&v1); - *v = (u64a)v1 | ((u64a)v2 << 32); - } else { - offset = findAndClearLSB_32(&v2) + 32; - *v = (u64a)v2 << 32; - } -#endif - - assert(offset < 64); - return (u32)offset; + return findAndClearLSB_64_impl(v); } static really_inline u32 findAndClearMSB_32(u32 *v) { - assert(*v != 0); // behaviour not defined in this case -#ifndef NO_ASM - u32 val = *v, offset; - __asm__ ("bsr %1, %0\n" - "btr %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - u32 val = *v; - u32 offset = 31 - clz32(val); - *v = val & ~(1 << offset); -#endif - assert(offset < 32); - return offset; + return findAndClearMSB_32_impl(v); } static really_inline u32 findAndClearMSB_64(u64a *v) { - assert(*v != 0); // behaviour not defined in this case - -#ifdef ARCH_64_BIT -#if defined(ARCH_X86_64) && !defined(NO_ASM) - u64a val = *v, offset; - __asm__ ("bsrq %1, %0\n" - "btrq %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - // generic variant using gcc's builtin on 64-bit - u64a val = *v, offset; - offset = 63 - clz64(val); - *v = val & ~(1ULL << offset); -#endif // ARCH_X86_64 -#else - // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't - // inline calls to __builtin_ctzll - u32 v1 = (u32)*v; - u32 v2 = (*v >> 32); - u32 offset; - if (v2) { - offset = findAndClearMSB_32(&v2) + 32; - *v = ((u64a)v2 << 32) | (u64a)v1; - } else { - offset = findAndClearMSB_32(&v1); - *v = (u64a)v1; - } -#endif - - assert(offset < 64); - return (u32)offset; + return findAndClearMSB_64_impl(v); } static really_inline u32 
compress32(u32 x, u32 m) { -#if defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pext_u32(x, m); -#else - - // Return zero quickly on trivial cases - if ((x & m) == 0) { - return 0; - } - - u32 mk, mp, mv, t; - - x &= m; // clear irrelevant bits - - mk = ~m << 1; // we will count 0's to right - for (u32 i = 0; i < 5; i++) { - mp = mk ^ (mk << 1); - mp ^= mp << 2; - mp ^= mp << 4; - mp ^= mp << 8; - mp ^= mp << 16; - - mv = mp & m; // bits to move - m = (m ^ mv) | (mv >> (1 << i)); // compress m - t = x & mv; - x = (x ^ t) | (t >> (1 << i)); // compress x - mk = mk & ~mp; - } - - return x; -#endif + return compress32_impl(x, m); } static really_inline u64a compress64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pext_u64(x, m); -#else - - // Return zero quickly on trivial cases - if ((x & m) == 0) { - return 0; - } - - u64a mk, mp, mv, t; - - x &= m; // clear irrelevant bits - - mk = ~m << 1; // we will count 0's to right - for (u32 i = 0; i < 6; i++) { - mp = mk ^ (mk << 1); - mp ^= mp << 2; - mp ^= mp << 4; - mp ^= mp << 8; - mp ^= mp << 16; - mp ^= mp << 32; - - mv = mp & m; // bits to move - m = (m ^ mv) | (mv >> (1 << i)); // compress m - t = x & mv; - x = (x ^ t) | (t >> (1 << i)); // compress x - mk = mk & ~mp; - } - - return x; -#endif + return compress64_impl(x, m); } static really_inline u32 expand32(u32 x, u32 m) { -#if defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pdep_u32(x, m); -#else - - // Return zero quickly on trivial cases - if (!x || !m) { - return 0; - } - - u32 m0, mk, mp, mv, t; - u32 array[5]; - - m0 = m; // save original mask - mk = ~m << 1; // we will count 0's to right - - for (int i = 0; i < 5; i++) { - mp = mk ^ (mk << 1); // parallel suffix - mp = mp ^ (mp << 2); - mp = mp ^ (mp << 4); - mp = mp ^ (mp << 8); - mp = mp ^ (mp << 16); - mv = mp & m; // bits to move - array[i] = mv; - m = (m ^ mv) | (mv >> (1 << i)); // compress m - mk = mk & ~mp; - } - - for (int i = 4; i >= 0; i--) { - mv = array[i]; - t = x << (1 << i); - x = (x & ~mv) | (t & mv); - } - - return x & m0; // clear out extraneous bits -#endif + return expand32_impl(x, m); } static really_inline u64a expand64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pdep_u64(x, m); -#else - - // Return zero quickly on trivial cases - if (!x || !m) { - return 0; - } - - u64a m0, mk, mp, mv, t; - u64a array[6]; - - m0 = m; // save original mask - mk = ~m << 1; // we will count 0's to right - - for (int i = 0; i < 6; i++) { - mp = mk ^ (mk << 1); // parallel suffix - mp = mp ^ (mp << 2); - mp = mp ^ (mp << 4); - mp = mp ^ (mp << 8); - mp = mp ^ (mp << 16); - mp = mp ^ (mp << 32); - mv = mp & m; // bits to move - array[i] = mv; - m = (m ^ mv) | (mv >> (1 << i)); // compress m - mk = mk & ~mp; - } - - for (int i = 5; i >= 0; i--) { - mv = array[i]; - t = x << (1 << i); - x = (x & ~mv) | (t & mv); - } - - return x & m0; // clear out extraneous bits -#endif + return expand64_impl(x, m); } @@ -396,97 +134,37 @@ u64a expand64(u64a x, u64a m) { */ static really_inline u32 bf64_iterate(u64a bitfield, u32 begin) { - if (begin != ~0U) { - /* switch off all bits at or below begin. Note: not legal to shift by - * by size of the datatype or larger. 
*/ - assert(begin <= 63); - bitfield &= ~((2ULL << begin) - 1); - } - - if (!bitfield) { - return ~0U; - } - - return ctz64(bitfield); + return bf64_iterate_impl(bitfield, begin); } static really_inline char bf64_set(u64a *bitfield, u32 i) { - assert(i < 64); - u64a mask = 1ULL << i; - char was_set = !!(*bitfield & mask); - *bitfield |= mask; - - return was_set; + return bf64_set_impl(bitfield, i); } static really_inline void bf64_unset(u64a *bitfield, u32 i) { - assert(i < 64); - *bitfield &= ~(1ULL << i); + return bf64_unset_impl(bitfield, i); } static really_inline u32 rank_in_mask32(u32 mask, u32 bit) { - assert(bit < sizeof(u32) * 8); - assert(mask & (u32)(1U << bit)); - mask &= (u32)(1U << bit) - 1; - return popcount32(mask); + return rank_in_mask32_impl(mask, bit); } static really_inline u32 rank_in_mask64(u64a mask, u32 bit) { - assert(bit < sizeof(u64a) * 8); - assert(mask & (u64a)(1ULL << bit)); - mask &= (u64a)(1ULL << bit) - 1; - return popcount64(mask); + return rank_in_mask64_impl(mask, bit); } static really_inline u32 pext32(u32 x, u32 mask) { -#if defined(HAVE_BMI2) - // Intel BMI2 can do this operation in one instruction. - return _pext_u32(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_32(&mask); - if (x & (1U << bit)) { - assert(num != 0); // more than 32 bits! - result |= num; - } - num <<= 1; - } - return result; -#endif + return pext32_impl(x, mask); } static really_inline u64a pext64(u64a x, u64a mask) { -#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) - // Intel BMI2 can do this operation in one instruction. - return _pext_u64(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_64(&mask); - if (x & (1ULL << bit)) { - assert(num != 0); // more than 32 bits! - result |= num; - } - num <<= 1; - } - return result; -#endif + return pext64_impl(x, mask); } -#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) -static really_inline -u64a pdep64(u64a x, u64a mask) { - return _pdep_u64(x, mask); -} -#endif - #endif // BITUTILS_H From 6581aae90e55520353c03edb716de80ecc03521a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 11:45:24 +0300 Subject: [PATCH 07/53] move x86 popcount.h implementations to util/arch/x86/popcount.h --- src/util/arch/common/popcount.h | 60 +++++++++++++++++++++++++++++ src/util/arch/x86/popcount.h | 67 +++++++++++++++++++++++++++++++++ src/util/popcount.h | 35 ++++------------- 3 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 src/util/arch/common/popcount.h create mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h new file mode 100644 index 00000000..0bd1e837 --- /dev/null +++ b/src/util/arch/common/popcount.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_COMMON_H +#define POPCOUNT_ARCH_COMMON_H + +static really_inline +u32 popcount32_impl_c(u32 x) { + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +} + +static really_inline +u32 popcount64_impl_c(u64a x) { +#if defined(ARCH_64_BIT) + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h new file mode 100644 index 00000000..86929ede --- /dev/null +++ b/src/util/arch/x86/popcount.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_X86_H +#define POPCOUNT_ARCH_X86_H + +#include "ue2common.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/popcount.h" + +static really_inline +u32 popcount32_impl(u32 x) { +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + return popcount32_impl_c(x); +#endif +} + +static really_inline +u32 popcount64_impl(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + return popcount64_impl_c(x); +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index eb08f6b1..932fc2cf 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,41 +33,22 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ +#include "config.h" #include "ue2common.h" #include "util/arch.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/popcount.h" +#endif + static really_inline u32 popcount32(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -#endif + return popcount32_impl(x); } static really_inline -u32 popcount64(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32(x >> 32) + popcount32(x); -#endif +u32 popcount64(u32 x) { + return popcount64_impl(x); } #endif /* UTIL_POPCOUNT_H_ */ From 9f3ad89ed63dc56f8fe84b88a5ed81a7c5c6b11b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 12:17:27 +0300 Subject: [PATCH 08/53] move andn helper function to bitutils.h --- src/fdr/fdr.c | 15 +-------------- src/util/arch/common/bitutils.h | 9 +++++++++ src/util/arch/x86/bitutils.h | 14 ++++++++++++++ src/util/bitutils.h | 8 ++++++++ 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d33756d3..b0f90b52 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -36,6 +36,7 @@ #include "teddy.h" #include "teddy_internal.h" #include "util/arch.h" +#include "util/bitutils.h" #include "util/simd_utils.h" #include "util/uniform_ops.h" @@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }; -/* compilers don't reliably synthesize the 32-bit ANDN instruction here, - * so we force its generation. - */ -static really_inline -u64a andn(const u32 a, const u8 *b) { - u64a r; -#if defined(HAVE_BMI) && !defined(NO_ASM) - __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); -#else - r = unaligned_load_u32(b) & ~a; -#endif - return r; -} - /* generates an initial state mask based on the last byte-ish of history rather * than being all accepting. If there is no history to consider, the state is * generated based on the minimum length of each bucket in order to prevent diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index 85d5dc49..f2706d70 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -34,6 +34,7 @@ #define BITUTILS_ARCH_COMMON_H #include "util/popcount.h" +#include "util/unaligned.h" static really_inline u32 clz32_impl_c(u32 x) { @@ -350,4 +351,12 @@ u64a pext64_impl_c(u64a x, u64a mask) { return result; } +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl_c(const u32 a, const u8 *b) { + return unaligned_load_u32(b) & ~a; +} + #endif // BITUTILS_ARCH_COMMON_H diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index da7c747e..ec4c95ad 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -301,4 +301,18 @@ u64a pdep64(u64a x, u64a mask) { } #endif +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { +#if defined(HAVE_BMI) && !defined(NO_ASM) + u64a r; + __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); + return r; +#else + return andn_impl_c(a, b); +#endif +} + #endif // BITUTILS_ARCH_X86_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 651e5f93..b9f312cb 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -167,4 +167,12 @@ u64a pext64(u64a x, u64a mask) { return pext64_impl(x, mask); } +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. 
+ */ +static really_inline +u64a andn(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + #endif // BITUTILS_H From e915d848640baba904ada9a576eed00361d2e06b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 13:10:52 +0300 Subject: [PATCH 09/53] no need to check for WIN32* --- src/util/bitutils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/bitutils.h b/src/util/bitutils.h index b9f312cb..7373a9c8 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -45,7 +45,7 @@ #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL -#if defined(_WIN32) || defined(_WIN64) || defined(ARCH_IA32) || defined(ARCH_X86_64) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/bitutils.h" #endif From e8e188acaf450a86ff6e7c3f611815bb67710732 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 13:12:07 +0300 Subject: [PATCH 10/53] move x86 implementations of simd_utils.h to util/arch/x86/ --- src/util/arch/x86/simd_utils.h | 1312 ++++++++++++++++++++++++++++++++ src/util/simd_utils.h | 1281 +------------------------------ 2 files changed, 1317 insertions(+), 1276 deletions(-) create mode 100644 src/util/arch/x86/simd_utils.h diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h new file mode 100644 index 00000000..6ec4042b --- /dev/null +++ b/src/util/arch/x86/simd_utils.h @@ -0,0 +1,1312 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. 
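+ * (x86 SSE/AVX/AVX-512 implementations of the m128/m256/m512 operations)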
+ */ + +#ifndef ARCH_X86_SIMD_UTILS_H +#define ARCH_X86_SIMD_UTILS_H + +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +static really_inline m128 ones128(void) { +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); +#else + /* trick from Intel's optimization guide to generate all-ones. + * ICC converts this to the single cmpeq instruction */ + return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +#endif +} + +static really_inline m128 zeroes128(void) { + return _mm_setzero_si128(); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return _mm_xor_si128(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + a = _mm_cmpeq_epi32(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { +#if defined(HAVE_SSE41) + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +#else + u32 d = diffrich128(a, b); + return (d | (d >> 1)) & 0x5; +#endif +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +static really_inline m128 set16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +#if defined(HAVE_AVX512) +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. 
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} +#endif + +static really_inline u64a movq(const m128 in) { +#if defined(ARCH_X86_64) + return _mm_cvtsi128_si64(in); +#else // 32-bit - this is horrific + u32 lo = movd(in); + u32 hi = movd(_mm_srli_epi64(in, 32)); + return (u64a)hi << 32 | lo; +#endif +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) + +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) +#endif + +#if !defined(HAVE_AVX2) +// TODO: this entire file needs restructuring - this carveout is awful +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) +#if defined(HAVE_SSE41) +#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +#else +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) +#endif + +#endif // !AVX2 + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + +#endif + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set64x2(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + +/**** + **** 256-bit Primitives + ****/ + +#if defined(HAVE_AVX2) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) + +static really_inline +m256 set32x8(u32 in) { + return _mm256_set1_epi8(in); +} + +#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) + +static really_inline +m256 set2x128(m128 a) { + return _mm256_broadcastsi128_si256(a); +} + +#else + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline +m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8) in); + rv.hi = rv.lo; + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline +m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} +#endif + +static really_inline m256 zeroes256(void) { +#if defined(HAVE_AVX2) + return _mm256_setzero_si256(); +#else + m256 rv = {zeroes128(), zeroes128()}; + return rv; +#endif +} + +static really_inline m256 ones256(void) { +#if 
defined(HAVE_AVX2) + m256 rv = _mm256_set1_epi8(0xFF); +#else + m256 rv = {ones128(), ones128()}; +#endif + return rv; +} + +#if defined(HAVE_AVX2) +static really_inline m256 and256(m256 a, m256 b) { + return _mm256_and_si256(a, b); +} +#else +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 or256(m256 a, m256 b) { + return _mm256_or_si256(a, b); +} +#else +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 xor256(m256 a, m256 b) { + return _mm256_xor_si256(a, b); +} +#else +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 not256(m256 a) { + return _mm256_xor_si256(a, ones256()); +} +#else +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 andnot256(m256 a, m256 b) { + return _mm256_andnot_si256(a, b); +} +#else +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} +#endif + +static really_inline int diff256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +#else + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +#endif +} + +static really_inline int isnonzero256(m256 a) { +#if defined(HAVE_AVX2) + return !!diff256(a, zeroes256()); +#else + return isnonzero128(or128(a.lo, a.hi)); +#endif +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + a = _mm256_cmpeq_epi32(a, b); + return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +#else + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); + return ~(_mm_movemask_epi8(packed)) & 0xff; +#endif +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + return _mm256_load_si256((const m256 *)ptr); +#else + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +#endif +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { +#if defined(HAVE_AVX2) + return set2x128(load128(ptr)); +#else + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +#endif +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + _mm256_store_si256((m256 *)ptr, a); +#else + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +#endif +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { +#if defined(HAVE_AVX2) + return _mm256_loadu_si256((const m256 *)ptr); +#else + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +#endif +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { +#if defined(HAVE_AVX2) + _mm256_storeu_si256((m256 *)ptr, a); +#else + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +#endif +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +#else + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +#endif +} + +#if !defined(HAVE_AVX2) +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +#else // AVX2 + +// switches on bit N in the given vector. 
+static really_inline +void setbit256(m256 *ptr, unsigned int n) { + *ptr = or256(mask1bit256(n), *ptr); +} + +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + *ptr = andnot256(mask1bit256(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, val); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return _mm256_extracti128_si256(x, 1); +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return _mm256_extracti128_si256(x, 0); +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} +#endif //AVX2 + +#if defined(HAVE_AVX512) +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +#endif + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.mid = _mm_cmpeq_epi32(a.mid, b.mid); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), + _mm_packs_epi32(a.hi, z)); + return ~(_mm_movemask_epi8(packed)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { +#if defined(HAVE_AVX512) + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +#else + m512 rv = {ones256(), ones256()}; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 set64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + +static really_inline +m512 set4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} +#endif + +static really_inline +m512 and512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_and_si512(a, b); +#else + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 or512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_or_si512(a, b); +#else + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 xor512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, b); +#else + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 not512(m512 a) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, ones512()); +#else + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +#endif +} + +static really_inline +m512 andnot512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_andnot_si512(a, b); +#else + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} +#else +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} +#endif + +#if defined(HAVE_AVX512) +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif + +static really_inline +int diff512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +#else + return diff256(a.lo, b.lo) || diff256(a.hi, 
b.hi); +#endif +} + +static really_inline +int isnonzero512(m512 a) { +#if defined(HAVE_AVX512) + return diff512(a, zeroes512()); +#elif defined(HAVE_AVX2) + m256 x = or256(a.lo, a.hi); + return !!diff256(x, zeroes256()); +#else + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +#endif +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +#elif defined(HAVE_AVX2) + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +#else + a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); + a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); + a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); + a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), + _mm_packs_epi32(a.hi.lo, a.hi.hi)); + return ~(_mm_movemask_epi8(packed)) & 0xffff; +#endif +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_load_si512(ptr); +#else + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +#endif +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); +#if defined(HAVE_AVX512) + return _mm512_store_si512(ptr, a); +#elif defined(HAVE_AVX2) + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +#else + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +#endif +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_loadu_si512(ptr); +#else + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} +#endif + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
+static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = or512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +#endif +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = andnot512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +#endif +} + +// tests bit N in the given vector. +static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); +#if !defined(HAVE_AVX2) + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); +#else + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +#endif +} + +#endif // ARCH_X86_SIMD_UTILS_H diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 671a5bab..019dc125 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -30,21 +30,11 @@ * \brief SIMD types and primitive operations. */ -#ifndef SIMD_UTILS -#define SIMD_UTILS - -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif +#ifndef SIMD_UTILS_H +#define SIMD_UTILS_H #include "config.h" #include "util/arch.h" -#include "ue2common.h" -#include "simd_types.h" -#include "unaligned.h" -#include "util/intrinsics.h" - -#include // for memcpy // Define a common assume_aligned using an appropriate compiler built-in, if // it's available. Note that we need to handle C or C++ compilation. @@ -71,1269 +61,8 @@ extern const char vbs_mask_data[]; } #endif -static really_inline m128 ones128(void) { -#if defined(__GNUC__) || defined(__INTEL_COMPILER) - /* gcc gets this right */ - return _mm_set1_epi8(0xFF); -#else - /* trick from Intel's optimization guide to generate all-ones. - * ICC converts this to the single cmpeq instruction */ - return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); -#endif -} - -static really_inline m128 zeroes128(void) { - return _mm_setzero_si128(); -} - -/** \brief Bitwise not for m128*/ -static really_inline m128 not128(m128 a) { - return _mm_xor_si128(a, ones128()); -} - -/** \brief Return 1 if a and b are different otherwise 0 */ -static really_inline int diff128(m128 a, m128 b) { - return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -} - -static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); -} - -/** - * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit - * mask indicating which 32-bit words contain differences. 
- */ -static really_inline u32 diffrich128(m128 a, m128 b) { - a = _mm_cmpeq_epi32(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -} - -/** - * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and - * returns a 4-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_128(m128 a, m128 b) { -#if defined(HAVE_SSE41) - a = _mm_cmpeq_epi64(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -#else - u32 d = diffrich128(a, b); - return (d | (d >> 1)) & 0x5; -#endif -} - -static really_really_inline -m128 lshift64_m128(m128 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm_sll_epi64(a, x); -} - -#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) - -static really_inline m128 set16x8(u8 c) { - return _mm_set1_epi8(c); -} - -static really_inline m128 set4x32(u32 c) { - return _mm_set1_epi32(c); -} - -static really_inline u32 movd(const m128 in) { - return _mm_cvtsi128_si32(in); -} - -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. - return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/simd_utils.h" #endif -static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) - return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif -} - -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return _mm_set_epi64x(0LL, *p); -} - -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) - -#if defined(HAVE_SSE41) -#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -#define extract64from128(a, imm) _mm_extract_epi64(a, imm) -#else -#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) -#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) -#endif - -#if !defined(HAVE_AVX2) -// TODO: this entire file needs restructuring - this carveout is awful -#define extractlow64from256(a) movq(a.lo) -#define extractlow32from256(a) movd(a.lo) -#if defined(HAVE_SSE41) -#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) -#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) -#else -#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) -#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? 
a.hi : a.lo, (imm % 2) * 8)) -#endif - -#endif // !AVX2 - -static really_inline m128 and128(m128 a, m128 b) { - return _mm_and_si128(a,b); -} - -static really_inline m128 xor128(m128 a, m128 b) { - return _mm_xor_si128(a,b); -} - -static really_inline m128 or128(m128 a, m128 b) { - return _mm_or_si128(a,b); -} - -static really_inline m128 andnot128(m128 a, m128 b) { - return _mm_andnot_si128(a, b); -} - -// aligned load -static really_inline m128 load128(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - return _mm_load_si128((const m128 *)ptr); -} - -// aligned store -static really_inline void store128(void *ptr, m128 a) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - *(m128 *)ptr = a; -} - -// unaligned load -static really_inline m128 loadu128(const void *ptr) { - return _mm_loadu_si128((const m128 *)ptr); -} - -// unaligned store -static really_inline void storeu128(void *ptr, m128 a) { - _mm_storeu_si128 ((m128 *)ptr, a); -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes128(void *ptr, m128 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m128 loadbytes128(const void *ptr, unsigned int n) { - m128 a = zeroes128(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} -#endif - -static really_inline -m128 mask1bit128(unsigned int n) { - assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit128(m128 *ptr, unsigned int n) { - *ptr = or128(mask1bit128(n), *ptr); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit128(m128 *ptr, unsigned int n) { - *ptr = andnot128(mask1bit128(n), *ptr); -} - -// tests bit N in the given vector. 
-static really_inline -char testbit128(m128 val, unsigned int n) { - const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else - return isnonzero128(and128(mask, val)); -#endif -} - -// offset must be an immediate -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) - -static really_inline -m128 pshufb_m128(m128 a, m128 b) { - m128 result; - result = _mm_shuffle_epi8(a, b); - return result; -} - -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -} - -static really_inline -m128 max_u8_m128(m128 a, m128 b) { - return _mm_max_epu8(a, b); -} - -static really_inline -m128 min_u8_m128(m128 a, m128 b) { - return _mm_min_epu8(a, b); -} - -static really_inline -m128 sadd_u8_m128(m128 a, m128 b) { - return _mm_adds_epu8(a, b); -} - -static really_inline -m128 sub_u8_m128(m128 a, m128 b) { - return _mm_sub_epi8(a, b); -} - -static really_inline -m128 set64x2(u64a hi, u64a lo) { - return _mm_set_epi64x(hi, lo); -} - -/**** - **** 256-bit Primitives - ****/ - -#if defined(HAVE_AVX2) - -static really_really_inline -m256 lshift64_m256(m256 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm256_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm256_sll_epi64(a, x); -} - -#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) - -static really_inline -m256 set32x8(u32 in) { - return _mm256_set1_epi8(in); -} - -#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) -#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) - -static really_inline -m256 set2x128(m128 a) { - return _mm256_broadcastsi128_si256(a); -} - -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} -static really_inline -m256 set32x8(u32 in) { - m256 rv; - rv.lo = set16x8((u8) in); - rv.hi = rv.lo; - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - -static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) - return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif -} - -static really_inline m256 ones256(void) { -#if 
defined(HAVE_AVX2) - m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif - return rv; -} - -#if defined(HAVE_AVX2) -static really_inline m256 and256(m256 a, m256 b) { - return _mm256_and_si256(a, b); -} -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 or256(m256 a, m256 b) { - return _mm256_or_si256(a, b); -} -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 xor256(m256 a, m256 b) { - return _mm256_xor_si256(a, b); -} -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 not256(m256 a) { - return _mm256_xor_si256(a, ones256()); -} -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 andnot256(m256 a, m256 b) { - return _mm256_andnot_si256(a, b); -} -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif - -static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif -} - -static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) - return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif -} - -/** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - a = _mm256_cmpeq_epi32(a, b); - return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif -} - -/** - * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and - * returns an 8-bit mask indicating which 64-bit words contain differences. 
- */ -static really_inline u32 diffrich64_256(m256 a, m256 b) { - u32 d = diffrich256(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m256 load256(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif -} - -// aligned load of 128-bit value to low and high part of 256-bit value -static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) - return set2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif -} - -static really_inline m256 loadu2x128(const void *ptr) { - return set2x128(loadu128(ptr)); -} - -// aligned store -static really_inline void store256(void *ptr, m256 a) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif -} - -// unaligned load -static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) - return _mm256_loadu_si256((const m256 *)ptr); -#else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; -#endif -} - -// unaligned store -static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) - _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes256(void *ptr, m256 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m256 loadbytes256(const void *ptr, unsigned int n) { - m256 a = zeroes256(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m256 mask1bit256(unsigned int n) { - assert(n < sizeof(m256) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu256(&simd_onebit_masks[mask_idx]); -} - -static really_inline -m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -#if defined(HAVE_AVX2) - return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set64x2(hi_1, hi_0); - rv.lo = set64x2(lo_1, lo_0); - return rv; -#endif -} - -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - -// switches on bit N in the given vector. 
-static really_inline -void setbit256(m256 *ptr, unsigned int n) { - *ptr = or256(mask1bit256(n), *ptr); -} - -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - *ptr = andnot256(mask1bit256(n), *ptr); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - const m256 mask = mask1bit256(n); - return !_mm256_testz_si256(mask, val); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return _mm256_extracti128_si256(x, 1); -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return _mm256_extracti128_si256(x, 0); -} - -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { -#if defined(_mm256_set_m128i) - return _mm256_set_m128i(hi, lo); -#else - return insert128to256(cast128to256(lo), hi, 1); -#endif -} -#endif //AVX2 - -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -#endif - -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline -m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.mid = _mm_cmpeq_epi32(a.mid, b.mid); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), - _mm_packs_epi32(a.hi, z)); - return ~(_mm_movemask_epi8(packed)) & 0xfff; -} - -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. 
-static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} - -/**** - **** 512-bit Primitives - ****/ - -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) - -static really_inline -m512 zeroes512(void) { -#if defined(HAVE_AVX512) - return _mm512_setzero_si512(); -#else - m512 rv = {zeroes256(), zeroes256()}; - return rv; -#endif -} - -static really_inline -m512 ones512(void) { -#if defined(HAVE_AVX512) - return _mm512_set1_epi8(0xFF); - //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 set64x8(u8 a) { - return _mm512_set1_epi8(a); -} - -static really_inline -m512 set8x64(u64a a) { - return _mm512_set1_epi64(a); -} - -static really_inline -m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, - u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { - return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, - lo_3, lo_2, lo_1, lo_0); -} - -static really_inline -m512 swap256in512(m512 a) { - m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); - return vpermq512(idx, a); -} - -static really_inline -m512 set4x128(m128 a) { - return _mm512_broadcast_i32x4(a); -} -#endif - -static really_inline -m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 not512(m512 a) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif -} - -static really_inline -m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm512_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm512_sll_epi64(a, x); -} -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif - -#if defined(HAVE_AVX512) -#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - -#if !defined(_MM_CMPINT_NE) -#define _MM_CMPINT_NE 0x4 -#endif - -static really_inline -int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, 
b.hi); -#endif -} - -static really_inline -int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) - return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif -} - -/** - * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline -u32 diffrich512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif -} - -/** - * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and - * returns a 16-bit mask indicating which 64-bit words contain differences. - */ -static really_inline -u32 diffrich64_512(m512 a, m512 b) { - //TODO: cmp_epi64? - u32 d = diffrich512(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline -m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif -} - -// aligned store -static really_inline -void store512(void *ptr, m512 a) { - assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) - return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif -} - -// unaligned load -static really_inline -m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { - return _mm512_maskz_loadu_epi8(k, ptr); -} - -static really_inline -m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { - return _mm512_mask_loadu_epi8(src, k, ptr); -} - -static really_inline -m512 set_mask_m512(__mmask64 k) { - return _mm512_movm_epi8(k); -} -#endif - -// packed unaligned store of first N bytes -static really_inline -void storebytes512(void *ptr, m512 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m512 loadbytes512(const void *ptr, unsigned int n) { - m512 a = zeroes512(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m512 mask1bit512(unsigned int n) { - assert(n < sizeof(m512) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu512(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. 
-static really_inline -void setbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif -} - -// switches off bit N in the given vector. -static really_inline -void clearbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif -} - -// tests bit N in the given vector. -static really_inline -char testbit512(m512 val, unsigned int n) { - assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - const m512 mask = mask1bit512(n); - return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif -} - -#endif +#endif // SIMD_UTILS_H From f7a6b8934cddbdfd77f1eb565b7ba08f9aa6a5f6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 23 Sep 2020 11:49:26 +0300 Subject: [PATCH 11/53] add some set*() functions, harmonize names, rename setAxB to set1_AxB when using mm_set1_* internally --- src/util/arch/x86/simd_utils.h | 73 +++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 6ec4042b..2d099f56 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -111,14 +111,18 @@ m128 lshift64_m128(m128 a, unsigned b) { #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) -static really_inline m128 set16x8(u8 c) { +static really_inline m128 set1_16x8(u8 c) { return _mm_set1_epi8(c); } -static really_inline m128 set4x32(u32 c) { +static really_inline m128 set1_4x32(u32 c) { return _mm_set1_epi32(c); } +static really_inline m128 set1_2x64(u64a c) { + return _mm_set1_epi64x(c); +} + static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } @@ -335,7 +339,12 @@ m128 sub_u8_m128(m128 a, m128 b) { } static really_inline -m128 set64x2(u64a hi, u64a lo) { +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + return _mm_set_epi32(x3, x2, x1, x0); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { return _mm_set_epi64x(hi, lo); } @@ -358,16 +367,15 @@ m256 lshift64_m256(m256 a, unsigned b) { #define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) -static really_inline -m256 set32x8(u32 in) { - return _mm256_set1_epi8(in); +static really_inline m256 set1_4x64(u64a c) { + return _mm256_set1_epi64x(c); } #define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) #define movemask256(a) ((u32)_mm256_movemask_epi8((a))) 
static really_inline -m256 set2x128(m128 a) { +m256 set1_2x128(m128 a) { return _mm256_broadcastsi128_si256(a); } @@ -388,13 +396,6 @@ m256 rshift64_m256(m256 a, int b) { rv.hi = rshift64_m128(rv.hi, b); return rv; } -static really_inline -m256 set32x8(u32 in) { - m256 rv; - rv.lo = set16x8((u8) in); - rv.hi = rv.lo; - return rv; -} static really_inline m256 eq256(m256 a, m256 b) { @@ -412,7 +413,7 @@ u32 movemask256(m256 a) { } static really_inline -m256 set2x128(m128 a) { +m256 set1_2x128(m128 a) { m256 rv = {a, a}; return rv; } @@ -557,7 +558,7 @@ static really_inline m256 load256(const void *ptr) { // aligned load of 128-bit value to low and high part of 256-bit value static really_inline m256 load2x128(const void *ptr) { #if defined(HAVE_AVX2) - return set2x128(load128(ptr)); + return set1_2x128(load128(ptr)); #else assert(ISALIGNED_N(ptr, alignof(m128))); m256 rv; @@ -567,7 +568,7 @@ static really_inline m256 load2x128(const void *ptr) { } static really_inline m256 loadu2x128(const void *ptr) { - return set2x128(loadu128(ptr)); + return set1_2x128(loadu128(ptr)); } // aligned store @@ -626,13 +627,37 @@ m256 mask1bit256(unsigned int n) { } static really_inline -m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +m256 set1_32x8(u32 in) { +#if defined(HAVE_AVX2) + return _mm256_set1_epi8(in); +#else + m256 rv; + rv.hi = set1_16x8(in); + rv.lo = set1_16x8(in); + return rv; +#endif +} + +static really_inline +m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); +#else + m256 rv; + rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); + rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); + return rv; +#endif +} + +static really_inline +m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { #if defined(HAVE_AVX2) return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); #else m256 rv; - rv.hi = set64x2(hi_1, hi_0); - rv.lo = set64x2(lo_1, lo_0); + rv.hi = set2x64(hi_1, hi_0); + rv.lo = set2x64(lo_1, lo_0); return rv; #endif } @@ -964,17 +989,17 @@ m512 ones512(void) { #if defined(HAVE_AVX512) static really_inline -m512 set64x8(u8 a) { +m512 set1_64x8(u8 a) { return _mm512_set1_epi8(a); } static really_inline -m512 set8x64(u64a a) { +m512 set1_8x64(u64a a) { return _mm512_set1_epi64(a); } static really_inline -m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, +m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); @@ -987,7 +1012,7 @@ m512 swap256in512(m512 a) { } static really_inline -m512 set4x128(m128 a) { +m512 set1_4x128(m128 a) { return _mm512_broadcast_i32x4(a); } #endif From 53334672495387c4575ca88834d5f5ee2ae726f6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 23 Sep 2020 11:51:21 +0300 Subject: [PATCH 12/53] fix names, use own intrinsic instead of explicit _mm* ones --- src/fdr/teddy.c | 64 +++++++++++++++--------------- src/fdr/teddy_avx2.c | 12 +++--- src/hwlm/noodle_engine_avx2.c | 4 +- src/hwlm/noodle_engine_sse.c | 4 +- src/nfa/mcclellan_common_impl.h | 2 +- src/nfa/mcsheng.c | 8 ++-- src/nfa/sheng_impl.h | 2 +- src/nfa/sheng_impl4.h | 2 +- src/nfa/shufti.c | 30 +++++++------- src/nfa/truffle.c | 16 ++++---- src/nfa/vermicelli_sse.h | 20 +++++----- src/rose/counting_miracle.h | 4 +- src/rose/program_runtime.c | 20 +++++----- src/rose/validate_shufti.h | 16 ++++---- src/util/state_compress.c | 70 
++++++++++++++++----------------- 15 files changed, 137 insertions(+), 137 deletions(-) diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 960e2a41..97cff0b4 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -311,26 +311,26 @@ const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); #define PREPARE_MASKS_1 \ - dup_mask[0] = set4x128(maskBase[0]); \ - dup_mask[1] = set4x128(maskBase[1]); + dup_mask[0] = set1_4x128(maskBase[0]); \ + dup_mask[1] = set1_4x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set4x128(maskBase[2]); \ - dup_mask[3] = set4x128(maskBase[3]); + dup_mask[2] = set1_4x128(maskBase[2]); \ + dup_mask[3] = set1_4x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set4x128(maskBase[4]); \ - dup_mask[5] = set4x128(maskBase[5]); + dup_mask[4] = set1_4x128(maskBase[4]); \ + dup_mask[5] = set1_4x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set4x128(maskBase[6]); \ - dup_mask[7] = set4x128(maskBase[7]); + dup_mask[6] = set1_4x128(maskBase[6]); \ + dup_mask[7] = set1_4x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m512 lo_mask = set64x8(0xf); \ + m512 lo_mask = set1_64x8(0xf); \ m512 dup_mask[n * 2]; \ m512 sl_msk[n - 1]; \ PREPARE_MASKS_##n \ @@ -570,26 +570,26 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, &c_0, &c_16, &c_32, &c_48) #define PREPARE_MASKS_1 \ - dup_mask[0] = set4x128(maskBase[0]); \ - dup_mask[1] = set4x128(maskBase[1]); + dup_mask[0] = set1_4x128(maskBase[0]); \ + dup_mask[1] = set1_4x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set4x128(maskBase[2]); \ - dup_mask[3] = set4x128(maskBase[3]); + dup_mask[2] = set1_4x128(maskBase[2]); \ + dup_mask[3] = set1_4x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set4x128(maskBase[4]); \ - dup_mask[5] = set4x128(maskBase[5]); + dup_mask[4] = set1_4x128(maskBase[4]); \ + dup_mask[5] = set1_4x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set4x128(maskBase[6]); \ - dup_mask[7] = set4x128(maskBase[7]); + dup_mask[6] = set1_4x128(maskBase[6]); \ + dup_mask[7] = set1_4x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m512 lo_mask = set64x8(0xf); \ + m512 lo_mask = set1_64x8(0xf); \ m512 dup_mask[n * 2]; \ PREPARE_MASKS_##n @@ -713,7 +713,7 @@ do { \ #define PREP_SHUF_MASK \ PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ *c_128 = *(ptr + 15); \ - m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ + m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ *c_0 = *(ptr + 31) #define SHIFT_OR_M1 \ @@ -805,26 +805,26 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask, prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) #define PREPARE_MASKS_1 \ - dup_mask[0] = set2x128(maskBase[0]); \ - dup_mask[1] = set2x128(maskBase[1]); + dup_mask[0] = set1_2x128(maskBase[0]); \ + dup_mask[1] = set1_2x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set2x128(maskBase[2]); \ - dup_mask[3] = set2x128(maskBase[3]); + dup_mask[2] = set1_2x128(maskBase[2]); \ + dup_mask[3] = set1_2x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set2x128(maskBase[4]); \ - dup_mask[5] = set2x128(maskBase[5]); + dup_mask[4] = set1_2x128(maskBase[4]); \ + dup_mask[5] = set1_2x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set2x128(maskBase[6]); \ - 
dup_mask[7] = set2x128(maskBase[7]); + dup_mask[6] = set1_2x128(maskBase[6]); \ + dup_mask[7] = set1_2x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m256 lo_mask = set32x8(0xf); \ + m256 lo_mask = set1_32x8(0xf); \ m256 dup_mask[n * 2]; \ PREPARE_MASKS_##n @@ -925,7 +925,7 @@ do { \ static really_inline m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); return or128(pshufb_m128(maskBase[0 * 2], lo), @@ -934,7 +934,7 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { static really_inline m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m1(maskBase, val); @@ -949,7 +949,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { static really_inline m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m2(maskBase, old_1, val); @@ -964,7 +964,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, static really_inline m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 *old_3, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 20ea938c..df54fc62 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -501,15 +501,15 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset, const u8 *buf_history, size_t len_history, const u32 nMasks) { m128 p_mask128; - m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, + m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, buf_history, len_history, nMasks)); - *p_mask = set2x128(p_mask128); + *p_mask = set1_2x128(p_mask128); return ret; } static really_inline m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); return or256(pshufb_m256(maskBase[0 * 2], lo), @@ -518,7 +518,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { static really_inline m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m1(maskBase, val); @@ -533,7 +533,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { static really_inline m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); @@ -548,7 +548,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, static really_inline m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 *old_3, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = 
set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c index 5edc646a..49fe168f 100644 --- a/src/hwlm/noodle_engine_avx2.c +++ b/src/hwlm/noodle_engine_avx2.c @@ -30,11 +30,11 @@ static really_inline m256 getMask(u8 c, bool noCase) { u8 k = caseClear8(c, noCase); - return set32x8(k); + return set1_32x8(k); } static really_inline m256 getCaseMask(void) { - return set32x8(0xdf); + return set1_32x8(0xdf); } static really_inline diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 7cd53d7c..5d47768d 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -30,11 +30,11 @@ static really_inline m128 getMask(u8 c, bool noCase) { u8 k = caseClear8(c, noCase); - return set16x8(k); + return set1_16x8(k); } static really_inline m128 getCaseMask(void) { - return set16x8(0xdf); + return set1_16x8(0xdf); } static really_inline diff --git a/src/nfa/mcclellan_common_impl.h b/src/nfa/mcclellan_common_impl.h index 7b0e7f48..6ec1b1f1 100644 --- a/src/nfa/mcclellan_common_impl.h +++ b/src/nfa/mcclellan_common_impl.h @@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, if (len) { m128 ss_char = load128(sherman_state); - m128 cur_char = set16x8(cprime); + m128 cur_char = set1_16x8(cprime); u32 z = movemask128(eq128(ss_char, cur_char)); diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 4619ff6f..dd00617e 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, if (len) { m128 ss_char = load128(sherman_state); - m128 cur_char = set16x8(cprime); + m128 cur_char = set1_16x8(cprime); u32 z = movemask128(eq128(ss_char, cur_char)); @@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, assert(s_in); /* should not already be dead */ assert(soft_c_end <= hard_c_end); DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); - m128 s = set16x8(s_in - 1); + m128 s = set1_16x8(s_in - 1); const u8 *c = *c_inout; const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; if (!do_accel) { @@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, #if defined(HAVE_BMI2) && defined(ARCH_64_BIT) u32 sheng_limit_x4 = sheng_limit * 0x01010101; - m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); - m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); + m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4); + m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit); DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, m->sheng_accel_limit, sheng_stop_limit); #endif diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index 9552fe15..aa416194 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, } DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); - m128 cur_state = set16x8(*state); + m128 cur_state = set1_16x8(*state); const m128 *masks = s->shuffle_masks; while (likely(cur_buf != end)) { diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index 74032201..c51bcdea 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } - 
m128 cur_state = set16x8(*state); + m128 cur_state = set1_16x8(*state); const m128 *masks = s->shuffle_masks; while (likely(end - cur_buf >= 4)) { diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 09ffc0cf..e76dcca8 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; @@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); const u8 *rv; assert(buf_end - buf >= 16); @@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, const u8 *buf, const u8 *buf_end) { const m128 ones = ones128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; @@ -455,15 +455,15 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, buf, buf_end); } - const m256 low4bits = set32x8(0xf); + const m256 low4bits = set1_32x8(0xf); if (buf_end - buf <= 32) { return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); } const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set2x128(mask_lo); - const m256 wide_mask_hi = set2x128(mask_hi); + const m256 wide_mask_lo = set1_2x128(mask_lo); + const m256 wide_mask_hi = set1_2x128(mask_hi); const u8 *rv; size_t min = (size_t)buf % 32; @@ -579,15 +579,15 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, buf, buf_end); } - const m256 low4bits = set32x8(0xf); + const m256 low4bits = set1_32x8(0xf); if (buf_end - buf <= 32) { return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); } const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set2x128(mask_lo); - const m256 wide_mask_hi = set2x128(mask_hi); + const m256 wide_mask_lo = set1_2x128(mask_lo); + const m256 wide_mask_hi = set1_2x128(mask_hi); const u8 *rv; assert(buf_end - buf >= 32); @@ -676,7 +676,7 @@ static really_inline const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, const u8 *buf, const u8 *buf_end) { DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - const m256 low4bits = set32x8(0xf); + const m256 low4bits = set1_32x8(0xf); // run shufti over two overlapping 16-byte unaligned reads const m256 mask1 = combine2x128(mask1_hi, mask1_lo); const m256 mask2 = combine2x128(mask2_hi, mask2_lo); @@ -708,11 +708,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, } const m256 ones = ones256(); - const m256 low4bits = set32x8(0xf); - const m256 wide_mask1_lo = set2x128(mask1_lo); - const m256 wide_mask1_hi = set2x128(mask1_hi); - const m256 wide_mask2_lo = set2x128(mask2_lo); - const m256 wide_mask2_hi = set2x128(mask2_hi); + const m256 low4bits = set1_32x8(0xf); + const m256 wide_mask1_lo = set1_2x128(mask1_lo); + const m256 wide_mask1_hi = set1_2x128(mask1_hi); + const m256 wide_mask2_lo = set1_2x128(mask2_lo); + const m256 wide_mask2_hi = set1_2x128(mask2_hi); const u8 *rv; size_t min = (size_t)buf % 32; diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index be6b312c..37af13ad 100644 --- a/src/nfa/truffle.c +++ b/src/nfa/truffle.c @@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { static really_inline u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { - m128 
highconst = _mm_set1_epi8(0x80); - m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); + m128 highconst = set1_16x8(0x80); + m128 shuf_mask_hi = set1_2x64(0x8040201008040201); // and now do the real work m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); @@ -260,8 +260,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { static really_inline u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { - m256 highconst = _mm256_set1_epi8(0x80); - m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); + m256 highconst = set1_32x8(0x80); + m256 shuf_mask_hi = set1_4x64(0x8040201008040201); // and now do the real work m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v); @@ -315,8 +315,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { DEBUG_PRINTF("len %zu\n", buf_end - buf); - const m256 wide_clear = set2x128(shuf_mask_lo_highclear); - const m256 wide_set = set2x128(shuf_mask_lo_highset); + const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear); + const m256 wide_set = set1_2x128(shuf_mask_lo_highset); assert(buf && buf_end); assert(buf < buf_end); @@ -382,8 +382,8 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear, const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { - const m256 wide_clear = set2x128(shuf_mask_lo_highclear); - const m256 wide_set = set2x128(shuf_mask_lo_highset); + const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear); + const m256 wide_set = set1_2x128(shuf_mask_lo_highset); assert(buf && buf_end); assert(buf < buf_end); const u8 *rv; diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 3307486c..dc56a5f1 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -36,7 +36,7 @@ #define VERM_BOUNDARY 16 #define VERM_TYPE m128 -#define VERM_SET_FN set16x8 +#define VERM_SET_FN set1_16x8 static really_inline const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, @@ -74,7 +74,7 @@ static really_inline const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, const u8 *buf_end, char negate) { assert((size_t)buf % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 31 < buf_end; buf += 32) { m128 data = load128(buf); @@ -122,7 +122,7 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { // returns NULL if not found static really_inline const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); // unaligned u32 z = movemask128(eq128(chars, and128(casemask, data))); if (negate) { @@ -157,7 +157,7 @@ static really_inline const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, const u8 *buf, const u8 *buf_end) { assert((size_t)buf % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 16 < buf_end; buf += 16) { m128 data = load128(buf); @@ -219,7 +219,7 @@ const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { static really_inline const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); // unaligned m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars1, v), @@ -277,7 +277,7 @@ static 
really_inline const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, const u8 *buf_end, char negate) { assert((size_t)buf_end % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 15 < buf_end; buf_end -= 16) { m128 data = load128(buf_end - 16); @@ -309,7 +309,7 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { // returns NULL if not found static really_inline const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); // unaligned u32 z = movemask128(eq128(chars, and128(casemask, data))); if (negate) { @@ -344,7 +344,7 @@ static really_inline const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, const u8 *buf, const u8 *buf_end) { assert((size_t)buf_end % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 16 < buf_end; buf_end -= 16) { m128 data = load128(buf_end - 16); @@ -381,7 +381,7 @@ const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { static really_inline const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars2, v), @@ -398,7 +398,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { #define VERM_BOUNDARY 64 #define VERM_TYPE m512 -#define VERM_SET_FN set64x8 +#define VERM_SET_FN set1_64x8 static really_inline const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index 976208b7..6210fca5 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -47,7 +47,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, u32 count = *count_inout; - m128 chars = set16x8(c); + m128 chars = set1_16x8(c); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); @@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, u32 count = *count_inout; const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 0f2d1083..d01e30e8 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -938,7 +938,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, return 1; } - m256 data_m256 = set2x128(data); + m256 data_m256 = set1_2x128(data); m256 hi_mask_m256 = loadu256(hi_mask); m256 lo_mask_m256 = loadu256(lo_mask); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); @@ -974,8 +974,8 @@ int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask, m128 hi_mask_m128 = loadu128(hi_mask); m128 lo_mask_m128 = loadu128(lo_mask); - m256 hi_mask_m256 = set2x128(hi_mask_m128); - m256 lo_mask_m256 = set2x128(lo_mask_m128); + m256 hi_mask_m256 = set1_2x128(hi_mask_m128); + m256 lo_mask_m256 = set1_2x128(lo_mask_m128); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, bucket_select_mask_m256, @@ -1287,7 +1287,7 @@ int 
roseCheckMultipathShufti16x8(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x2(valid_hi, valid_lo); + expand_valid = set2x64(valid_hi, valid_lo); valid_path_mask = ~movemask128(pshufb_m128(expand_valid, data_select_mask)); } @@ -1332,7 +1332,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_double = set2x128(data_m128); + m256 data_double = set1_2x128(data_m128); m256 data_select_mask = loadu256(ri->data_select_mask); u32 valid_path_mask = 0; @@ -1346,7 +1346,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); valid_path_mask = ~movemask256(pshufb_m256(expand_valid, data_select_mask)); @@ -1393,7 +1393,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_double = set2x128(data_m128); + m256 data_double = set1_2x128(data_m128); m256 data_select_mask = loadu256(ri->data_select_mask); u32 valid_path_mask = 0; @@ -1407,7 +1407,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); valid_path_mask = ~movemask256(pshufb_m256(expand_valid, data_select_mask)); @@ -1460,7 +1460,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_m256 = set2x128(data_m128); + m256 data_m256 = set1_2x128(data_m128); m256 data_select_mask_1 = loadu256(ri->data_select_mask); m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32); @@ -1475,7 +1475,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); u32 valid_path_1 = movemask256(pshufb_m256(expand_valid, data_select_mask_1)); diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h index 1dc855d9..3b91f091 100644 --- a/src/rose/validate_shufti.h +++ b/src/rose/validate_shufti.h @@ -47,7 +47,7 @@ static really_inline int validateShuftiMask16x16(const m256 data, const m256 hi_mask, const m256 lo_mask, const m256 and_mask, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_hi = pshufb_m256(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); @@ -78,7 +78,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask, const m128 and_mask, const u32 neg_mask, const u32 valid_data_mask) { m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); - m256 low4bits = set32x8(0xf); + m256 
low4bits = set1_32x8(0xf); m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits)); m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 nresult = eq128(and128(t, and_mask), zeroes128()); @@ -101,7 +101,7 @@ static really_inline int validateShuftiMask32x8(const m256 data, const m256 hi_mask, const m256 lo_mask, const m256 and_mask, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_hi = pshufb_m256(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); @@ -133,7 +133,7 @@ int validateShuftiMask32x16(const m256 data, const m256 bucket_mask_hi, const m256 bucket_mask_lo, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); @@ -201,7 +201,7 @@ int validateMultipathShuftiMask16x8(const m128 data, const u32 neg_mask, const u32 valid_path_mask) { m256 data_256 = combine2x128(rshift64_m128(data, 4), data); - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits)); m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 result = and128(t, bucket_select_mask); @@ -220,7 +220,7 @@ int validateMultipathShuftiMask32x8(const m256 data, const u32 hi_bits, const u32 lo_bits, const u32 neg_mask, const u32 valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo = pshufb_m256(lo_mask, data_lo); @@ -244,7 +244,7 @@ int validateMultipathShuftiMask32x16(const m256 data, const u32 hi_bits, const u32 lo_bits, const u32 neg_mask, const u32 valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); @@ -271,7 +271,7 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2, const u64a hi_bits, const u64a lo_bits, const u64a neg_mask, const u64a valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits)); m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits)); m256 c_hi_1 = pshufb_m256(hi_mask, diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 7238849e..e6cf205c 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), expand32(v[2], m[2]), expand32(v[3], m[3]) }; - return _mm_set_epi32(x[3], x[2], x[1], x[0]); + return set32x4(x[3], x[2], x[1], x[0]); } #endif @@ -158,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { // First, decompose our vectors into 64-bit chunks. 
- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; + u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; u64a v[2]; @@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - return _mm_set_epi64x(x[1], x[0]); + return set2x64(x[1], x[0]); } #endif @@ -264,11 +264,11 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[6], m[6]), expand32(v[7], m[7]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; + m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .hi = set32x4(x[7], x[6], x[5], x[4]) }; #else - m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); + m256 xvec = set32x8(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); #endif return xvec; } @@ -291,10 +291,10 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .hi = _mm_set_epi64x(x[3], x[2]) }; + m256 xvec = { .lo = set2x64(x[1], x[0]), + .hi = set2x64(x[3], x[2]) }; #else - m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); + m256 xvec = set4x64(x[3], x[2], x[1], x[0]); #endif return xvec; } @@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[10], m[10]), expand32(v[11], m[11]) }; - m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), - .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; + m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .mid = set32x4(x[7], x[6], x[5], x[4]), + .hi = set32x4(x[11], x[10], x[9], x[8]) }; return xvec; } #endif @@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]) }; - m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .mid = _mm_set_epi64x(x[3], x[2]), - .hi = _mm_set_epi64x(x[5], x[4]) }; + m384 xvec = { .lo = set2x64(x[1], x[0]), + .mid = set2x64(x[3], x[2]), + .hi = set2x64(x[5], x[4]) }; return xvec; } #endif @@ -548,20 +548,20 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { m512 xvec; #if defined(HAVE_AVX512) - xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8], - x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); + xvec = set32x16(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8], + x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); - xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8]); + xvec.lo = set32x8(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); + xvec.hi = set32x8(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8]); #else - xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); - xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); - xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); - xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); + xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); + xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); + xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); + xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); #endif return xvec; } @@ -588,16 +588,16 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { 
expand64(v[6], m[6]), expand64(v[7], m[7]) }; #if defined(HAVE_AVX512) - m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4], + m512 xvec = set64x8(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), - .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; + m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]), + .hi = set4x64(x[7], x[6], x[5], x[4])}; #else - m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), - _mm_set_epi64x(x[3], x[2]) }, - .hi = { _mm_set_epi64x(x[5], x[4]), - _mm_set_epi64x(x[7], x[6]) } }; + m512 xvec = { .lo = { set2x64(x[1], x[0]), + set2x64(x[3], x[2]) }, + .hi = { set2x64(x[5], x[4]), + set2x64(x[7], x[6]) } }; #endif return xvec; } From 04fbf2468140cc4d7ccabc62a2bdc4503a3d31c5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 23 Sep 2020 21:38:12 +0300 Subject: [PATCH 13/53] Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h" This reverts commit 6581aae90e55520353c03edb716de80ecc03521a. --- src/util/arch/common/popcount.h | 60 ----------------------------- src/util/arch/x86/popcount.h | 67 --------------------------------- src/util/popcount.h | 35 +++++++++++++---- 3 files changed, 27 insertions(+), 135 deletions(-) delete mode 100644 src/util/arch/common/popcount.h delete mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h deleted file mode 100644 index 0bd1e837..00000000 --- a/src/util/arch/common/popcount.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_COMMON_H -#define POPCOUNT_ARCH_COMMON_H - -static really_inline -u32 popcount32_impl_c(u32 x) { - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. 
- x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -} - -static really_inline -u32 popcount64_impl_c(u64a x) { -#if defined(ARCH_64_BIT) - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -#else - // Synthesise from two 32-bit cases. - return popcount32_impl(x >> 32) + popcount32_impl(x); -#endif -} - -#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h deleted file mode 100644 index 86929ede..00000000 --- a/src/util/arch/x86/popcount.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_X86_H -#define POPCOUNT_ARCH_X86_H - -#include "ue2common.h" -#include "util/arch.h" -#include "util/intrinsics.h" - -#include "util/arch/common/popcount.h" - -static really_inline -u32 popcount32_impl(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - return popcount32_impl_c(x); -#endif -} - -static really_inline -u32 popcount64_impl(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - return popcount64_impl_c(x); -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32_impl(x >> 32) + popcount32_impl(x); -#endif -} - -#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index 932fc2cf..eb08f6b1 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,22 +33,41 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ -#include "config.h" #include "ue2common.h" #include "util/arch.h" -#if defined(ARCH_IA32) || defined(ARCH_X86_64) -#include "util/arch/x86/popcount.h" -#endif - static really_inline u32 popcount32(u32 x) { - return popcount32_impl(x); +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +#endif } static really_inline -u32 popcount64(u32 x) { - return popcount64_impl(x); +u32 popcount64(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32(x >> 32) + popcount32(x); +#endif } #endif /* UTIL_POPCOUNT_H_ */ From f0e70bc0ad13d585d44115dd4e6c1f42ce5e446b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 24 Sep 2020 11:52:59 +0300 Subject: [PATCH 14/53] Revert "Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h"" This reverts commit 04fbf2468140cc4d7ccabc62a2bdc4503a3d31c5. --- src/util/arch/common/popcount.h | 60 +++++++++++++++++++++++++++++ src/util/arch/x86/popcount.h | 67 +++++++++++++++++++++++++++++++++ src/util/popcount.h | 35 ++++------------- 3 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 src/util/arch/common/popcount.h create mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h new file mode 100644 index 00000000..0bd1e837 --- /dev/null +++ b/src/util/arch/common/popcount.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_COMMON_H +#define POPCOUNT_ARCH_COMMON_H + +static really_inline +u32 popcount32_impl_c(u32 x) { + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +} + +static really_inline +u32 popcount64_impl_c(u64a x) { +#if defined(ARCH_64_BIT) + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h new file mode 100644 index 00000000..86929ede --- /dev/null +++ b/src/util/arch/x86/popcount.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_X86_H +#define POPCOUNT_ARCH_X86_H + +#include "ue2common.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/popcount.h" + +static really_inline +u32 popcount32_impl(u32 x) { +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + return popcount32_impl_c(x); +#endif +} + +static really_inline +u32 popcount64_impl(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + return popcount64_impl_c(x); +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index eb08f6b1..932fc2cf 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,41 +33,22 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ +#include "config.h" #include "ue2common.h" #include "util/arch.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/popcount.h" +#endif + static really_inline u32 popcount32(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -#endif + return popcount32_impl(x); } static really_inline -u32 popcount64(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32(x >> 32) + popcount32(x); -#endif +u32 popcount64(u32 x) { + return popcount64_impl(x); } #endif /* UTIL_POPCOUNT_H_ */ From b1170bcc2e54b428ed0fa63802c0aced62b4b8c7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Oct 2020 08:09:18 +0300 Subject: [PATCH 15/53] add arm checks in platform.cmake --- cmake/platform.cmake | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 593c544b..8c82da2b 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,9 +1,15 @@ # determine the target arch # really only interested in the preprocessor here -CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64) -CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -set(ARCH_X86_64 ${ARCH_64_BIT}) -set(ARCH_IA32 ${ARCH_32_BIT}) +CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_ARM64) +CHECK_C_SOURCE_COMPILES("#if !(defined(__arm__) && !defined(__aarch64__))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) + +if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_ARM64)) + set(ARCH_64_BIT TRUE) +else() + set(ARCH_32_BIT TRUE) +endif() From 5952c64066dc147b3a73024c572f416ba2d125cd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Oct 2020 12:44:23 +0300 Subject: [PATCH 16/53] add necessary modifications to CMake system to enable building on ARM, add arm_neon.h intrinsic header to intrinsics.h --- CMakeLists.txt | 14 ++++++++----- cmake/arch.cmake | 46 +++++++++++++++++++++++++++++-------------- cmake/config.h.in | 9 +++++++++ cmake/platform.cmake | 4 ++-- src/util/intrinsics.h | 6 ++++++ 5 files changed, 57 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5078848..f4d1cc9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,7 +175,7 @@ else() string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") endforeach () - if (CMAKE_COMPILER_IS_GNUCC) + if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC) message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") # If gcc doesn't recognise the host cpu, then mtune=native becomes # generic, which isn't very good in some cases. 
march=native looks at @@ -281,10 +281,14 @@ else() endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) -CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) -CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) -CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) -CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) +if (ARCH_IA32 OR ARCH_X86_64) + CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) + CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) + CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) + CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) +elseif (ARCH_ARM32 OR ARCH_AARCH64) + CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H) +endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index cced49c6..e3cc9f44 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -6,7 +6,10 @@ if (HAVE_C_X86INTRIN_H) set (INTRIN_INC_H "x86intrin.h") elseif (HAVE_C_INTRIN_H) set (INTRIN_INC_H "intrin.h") -else () +elseif (HAVE_C_ARM_NEON_H) + set (INTRIN_INC_H "arm_neon.h") + set (FAT_RUNTIME OFF) +else() message (FATAL_ERROR "No intrinsics header found") endif () @@ -29,15 +32,16 @@ else (NOT FAT_RUNTIME) set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}") endif () -# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +if (ARCH_IA32 OR ARCH_X86_64) + # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { __m128i a = _mm_set1_epi8(1); (void)_mm_shuffle_epi8(a, a); }" HAVE_SSSE3) -# now look for AVX2 -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> + # now look for AVX2 + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> #if !defined(__AVX2__) #error no avx2 #endif @@ -47,8 +51,8 @@ int main(){ (void)_mm256_xor_si256(z, z); }" HAVE_AVX2) -# and now for AVX512 -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> + # and now for AVX512 + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> #if !defined(__AVX512BW__) #error no avx512bw #endif @@ -58,8 +62,8 @@ int main(){ (void)_mm512_abs_epi8(z); }" HAVE_AVX512) -# and now for AVX512VBMI -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> + # and now for AVX512VBMI + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> #if !defined(__AVX512VBMI__) #error no avx512vbmi #endif @@ -70,26 +74,38 @@ int main(){ (void)_mm512_permutexvar_epi8(idx, a); }" HAVE_AVX512VBMI) +elseif (ARCH_ARM32 OR ARCH_AARCH64) + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +int main() { + int32x4_t a = vdupq_n_s32(1); +}" HAVE_NEON) +else () + message (FATAL_ERROR "Unsupported architecture") +endif () + if (FAT_RUNTIME) - if (NOT HAVE_SSSE3) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) message(FATAL_ERROR "SSSE3 support required to build fat runtime") endif () - if (NOT HAVE_AVX2) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2) message(FATAL_ERROR "AVX2 support required to build fat runtime") endif () - if (BUILD_AVX512 AND NOT HAVE_AVX512) + if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512 AND NOT HAVE_AVX512) message(FATAL_ERROR "AVX512 support requested but not supported") endif () else (NOT FAT_RUNTIME) - if (NOT HAVE_AVX2) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2) message(STATUS "Building without AVX2 support") endif () - if (NOT HAVE_AVX512) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512) message(STATUS "Building without AVX512 support") endif () - 
if (NOT HAVE_SSSE3) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") endif () + if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON) + message(FATAL_ERROR "NEON support required for ARM support") + endif () endif () unset (CMAKE_REQUIRED_FLAGS) diff --git a/cmake/config.h.in b/cmake/config.h.in index 203f0afd..2d2c78ce 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -15,6 +15,12 @@ /* "Define if building for EM64T" */ #cmakedefine ARCH_X86_64 +/* "Define if building for ARM32" */ +#cmakedefine ARCH_ARM32 + +/* "Define if building for AARCH64" */ +#cmakedefine ARCH_AARCH64 + /* internal build, switch on dump support. */ #cmakedefine DUMP_SUPPORT @@ -45,6 +51,9 @@ /* C compiler has intrin.h */ #cmakedefine HAVE_C_INTRIN_H +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_ARM_NEON_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 8c82da2b..4591bf93 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -5,10 +5,10 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error n CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_ARM64) +CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !(defined(__arm__) && !defined(__aarch64__))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) -if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_ARM64)) +if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_AARCH64)) set(ARCH_64_BIT TRUE) else() set(ARCH_32_BIT TRUE) diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index edc4f6ef..3e2afc22 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -45,6 +45,10 @@ # endif #endif +#if defined(HAVE_C_ARM_NEON_H) +# define USE_ARM_NEON_H +#endif + #ifdef __cplusplus # if defined(HAVE_CXX_INTRIN_H) # define USE_INTRIN_H @@ -59,6 +63,8 @@ #include #elif defined(USE_INTRIN_H) #include +#elif defined(USE_ARM_NEON_H) +#include #else #error no intrinsics file #endif From e91082d477a659bfc6f100f2a7ffd029553d2f3e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Oct 2020 13:45:52 +0300 Subject: [PATCH 17/53] use right intrinsic --- src/util/state_compress.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index e6cf205c..87eccce7 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), expand32(v[2], m[2]), expand32(v[3], m[3]) }; - return set32x4(x[3], x[2], x[1], x[0]); + return set4x32(x[3], x[2], x[1], x[0]); } #endif @@ -264,10 +264,10 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[6], m[6]), expand32(v[7], m[7]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), - .hi = set32x4(x[7], x[6], x[5], x[4]) }; + m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]), + .hi = set4x32(x[7], x[6], x[5], x[4]) }; #else - m256 xvec = set32x8(x[7], x[6], x[5], x[4], + m256 xvec = 
set8x32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); #endif return xvec; @@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[10], m[10]), expand32(v[11], m[11]) }; - m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), - .mid = set32x4(x[7], x[6], x[5], x[4]), - .hi = set32x4(x[11], x[10], x[9], x[8]) }; + m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]), + .mid = set4x32(x[7], x[6], x[5], x[4]), + .hi = set4x32(x[11], x[10], x[9], x[8]) }; return xvec; } #endif @@ -553,15 +553,15 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - xvec.lo = set32x8(x[7], x[6], x[5], x[4], + xvec.lo = set8x32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); - xvec.hi = set32x8(x[15], x[14], x[13], x[12], + xvec.hi = set8x32(x[15], x[14], x[13], x[12], x[11], x[10], x[9], x[8]); #else - xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); - xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); - xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); - xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); + xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]); + xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]); + xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]); + xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]); #endif return xvec; } From 9a0494259efbce2654da3b0b9f4978749383a715 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Oct 2020 14:26:41 +0300 Subject: [PATCH 18/53] minor fix --- src/util/arch/x86/simd_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h index a582abd5..d74493b4 100644 --- a/src/util/arch/x86/simd_types.h +++ b/src/util/arch/x86/simd_types.h @@ -41,5 +41,5 @@ typedef __m256i m256; typedef __m512i m512; #endif -#endif /* SIMD_TYPES_H */ +#endif /* SIMD_TYPES_X86_H */ From 4c924cc920ad4dce46e30a6e6fb40d0b59817787 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Oct 2020 14:28:12 +0300 Subject: [PATCH 19/53] add arm architecture basic defines --- src/util/arch.h | 6 ++++- src/util/arch/arm/arm.h | 42 ++++++++++++++++++++++++++++++++++ src/util/arch/arm/simd_types.h | 37 ++++++++++++++++++++++++++++++ src/util/simd_types.h | 4 +++- 4 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 src/util/arch/arm/arm.h create mode 100644 src/util/arch/arm/simd_types.h diff --git a/src/util/arch.h b/src/util/arch.h index 57e39c07..794f28f7 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -33,8 +33,12 @@ #ifndef UTIL_ARCH_H_ #define UTIL_ARCH_H_ -#if defined(__i386__) || defined(__x86_64__) +#include "config.h" + +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/x86.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/arm.h" #endif #endif // UTIL_ARCH_X86_H_ diff --git a/src/util/arch/arm/arm.h b/src/util/arch/arm/arm.h new file mode 100644 index 00000000..326e8f56 --- /dev/null +++ b/src/util/arch/arm/arm.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_ARM_H_ +#define UTIL_ARCH_ARM_H_ + +#if defined(__ARM_NEON) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64)) +#define HAVE_NEON +#define HAVE_SIMD_128_BITS +#endif + +#endif // UTIL_ARCH_ARM_H_ + diff --git a/src/util/arch/arm/simd_types.h b/src/util/arch/arm/simd_types.h new file mode 100644 index 00000000..cc4c50e4 --- /dev/null +++ b/src/util/arch/arm/simd_types.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_TYPES_ARM_H +#define SIMD_TYPES_ARM_H + +#if !defined(m128) && defined(HAVE_NEON) +typedef int32x4_t m128; +#endif + +#endif /* SIMD_TYPES_ARM_H */ + diff --git a/src/util/simd_types.h b/src/util/simd_types.h index a58ede4d..5777374b 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -34,8 +34,10 @@ #include "util/intrinsics.h" #include "ue2common.h" -#if defined(__i386__) || defined(__x86_64__) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/simd_types.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/simd_types.h" #endif #if !defined(m128) && !defined(HAVE_SIMD_128_BITS) From 5d773dd9db21e2f753ce386bfcf53e69c5113abe Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Oct 2020 14:28:45 +0300 Subject: [PATCH 20/53] use C implementation of popcount for arm --- src/util/arch/common/popcount.h | 4 ++-- src/util/popcount.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h index 0bd1e837..ef5776e8 100644 --- a/src/util/arch/common/popcount.h +++ b/src/util/arch/common/popcount.h @@ -53,8 +53,8 @@ u32 popcount64_impl_c(u64a x) { return (x * 0x0101010101010101) >> 56; #else // Synthesise from two 32-bit cases. - return popcount32_impl(x >> 32) + popcount32_impl(x); + return popcount32_impl_c(x >> 32) + popcount32_impl_c(x); #endif } -#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file +#endif // POPCOUNT_ARCH_COMMON_H diff --git a/src/util/popcount.h b/src/util/popcount.h index 932fc2cf..5fd6dc33 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -39,6 +39,10 @@ #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/popcount.h" +#else +#include "util/arch/common/popcount.h" +#define popcount32_impl(x) popcount32_impl_c(x) +#define popcount64_impl(x) popcount64_impl_c(x) #endif static really_inline From d2cf1a7882d5f162fff756086bd2178a58c42cbc Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 8 Oct 2020 20:48:20 +0300 Subject: [PATCH 21/53] move cpuid_flags.h header to common --- CMakeLists.txt | 2 +- src/hs.cpp | 3 ++- src/util/arch/{x86 => common}/cpuid_flags.h | 2 +- src/util/target_info.cpp | 5 +++-- 4 files changed, 7 insertions(+), 5 deletions(-) rename src/util/arch/{x86 => common}/cpuid_flags.h (95%) diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d1cc9f..c1db4dfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -569,7 +569,7 @@ set (hs_exec_common_SRCS src/alloc.c src/scratch.c src/util/arch/x86/cpuid_flags.c - src/util/arch/x86/cpuid_flags.h + src/util/arch/common/cpuid_flags.h src/util/multibit.c ) diff --git a/src/hs.cpp b/src/hs.cpp index a0cb9bb3..7898cf46 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -44,9 +44,10 @@ #include "parser/prefilter.h" #include "parser/unsupported.h" #include "util/compile_error.h" +#include "util/arch/common/cpuid_flags.h" #if defined(ARCH_X86_64) -#include "util/arch/x86/cpuid_flags.h" #include "util/arch/x86/cpuid_inline.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #endif #include "util/depth.h" #include "util/popcount.h" diff --git a/src/util/arch/x86/cpuid_flags.h b/src/util/arch/common/cpuid_flags.h similarity index 95% rename from src/util/arch/x86/cpuid_flags.h rename to src/util/arch/common/cpuid_flags.h index 527c6d52..68e427dd 100644 --- a/src/util/arch/x86/cpuid_flags.h +++ b/src/util/arch/common/cpuid_flags.h @@ -31,7 +31,7 @@ #include "ue2common.h" -#if !defined(_WIN32) && 
!defined(CPUID_H_) +#if (defined(ARCH_IA32) || defined(ARCH_X86_64)) && !defined(_WIN32) && !defined(CPUID_H_) #include /* system header doesn't have a header guard */ #define CPUID_H_ diff --git a/src/util/target_info.cpp b/src/util/target_info.cpp index 6eab701d..5253755b 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -29,8 +29,9 @@ #include "hs_compile.h" // for various hs_platform_info flags #include "target_info.h" -#if defined(ARCH_X86_64) -#include "util/arch/x86/cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #endif namespace ue2 { From 1c2c73becfa9ee26f2c468445d10e3ae638b0243 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 8 Oct 2020 20:50:18 +0300 Subject: [PATCH 22/53] add C implementation of pdep64() --- src/util/arch/common/bitutils.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index f2706d70..e86b8d44 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -351,6 +351,36 @@ u64a pext64_impl_c(u64a x, u64a mask) { return result; } +static really_inline +u64a pdep64_impl_c(u64a x, u64a _m) { + /* Taken from: + * https://gcc.gnu.org/legacy-ml/gcc-patches/2017-06/msg01408.html + */ + + u64a result = 0x0UL; + const u64a mask = 0x8000000000000000UL; + u64a m = _m; + u64a c, t; + u64a p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl (_m); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) + { + c = __builtin_clzl (m); + t = x << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + /* compilers don't reliably synthesize the 32-bit ANDN instruction here, * so we force its generation. */ From a9212174eee2ffabc261b3323719c0a06640f83e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 8 Oct 2020 20:50:55 +0300 Subject: [PATCH 23/53] add arm bitutils.h header --- src/util/arch/arm/bitutils.h | 179 +++++++++++++++++++++++++++++++++++ src/util/bitutils.h | 2 + 2 files changed, 181 insertions(+) create mode 100644 src/util/arch/arm/bitutils.h diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h new file mode 100644 index 00000000..514ddc5c --- /dev/null +++ b/src/util/arch/arm/bitutils.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_ARM_H +#define BITUTILS_ARCH_ARM_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +static really_inline +u64a pdep64(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_ARM_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 7373a9c8..556ba818 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -47,6 +47,8 @@ #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/bitutils.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/bitutils.h" #endif static really_inline From 31ac6718dd26f9b7b6e1319a7f55ae7be507a508 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 13 Oct 2020 09:19:56 +0300 Subject: [PATCH 24/53] add ARM version of simd_utils.h --- src/util/arch/arm/simd_utils.h | 288 +++++++++++++++++++++++++++++++++ src/util/simd_utils.h | 2 + 2 files changed, 290 insertions(+) create mode 100644 src/util/arch/arm/simd_utils.h diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h new file mode 100644 index 00000000..606892fb --- /dev/null +++ b/src/util/arch/arm/simd_utils.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_ARM_SIMD_UTILS_H +#define ARCH_ARM_SIMD_UTILS_H + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +static really_inline m128 ones128(void) { + return (m128) vdupq_n_s32(0xFF); +} + +static really_inline m128 zeroes128(void) { + return (m128) vdupq_n_s32(0); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return (m128) veorq_s32(a, a); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + m128 t = (m128)vceqq_s8((int8x16_t)a, (int8x16_t)b); + return (16 != vaddvq_u8((uint8x16_t)t)); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + return vaddvq_u32(vandq_u32(vceqq_s32((int32x4_t)a, (int32x4_t)b), movemask)); +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. 
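+ *
+ * (Implementation note: the NEON variant below compares the 64-bit lanes with
+ * vceqq_s64, masks the per-lane results with the weights { 1, 2 } and folds
+ * them to a scalar with vaddvq_u64, collapsing the lane-wise compare results
+ * into an integer mask.)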
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const uint64x2_t movemask = { 1, 2 }; + return vaddvq_u64(vandq_u64(vceqq_s64((int64x2_t)a, (int64x2_t)b), movemask)); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s64((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s64((int64x2_t)a, b); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); +} + +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +static really_inline m128 set1_16x8(u8 c) { + return (m128) vdupq_n_u8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return (m128) vdupq_n_u32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return (m128) vdupq_n_u64(c); +} + +static really_inline u32 movd(const m128 in) { + return vgetq_lane_u32((uint32x4_t) in, 0); +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64((uint64x2_t) in, 0); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vdupq_n_u64(*p); +} + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s8((int8x16_t)a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s8((int8x16_t)a, b); +} + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { + return vgetq_lane_u32((uint32x4_t) in, imm); +} + +static really_inline u32 extract64from128(const m128 in, unsigned imm) { + return vgetq_lane_u64((uint64x2_t) in, imm); +} + +static really_inline m128 and128(m128 a, m128 b) { + return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return (m128) vbicq_u32((uint32x4_t)a, (uint32x4_t)b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + vst1q_s32((int32_t *)ptr, a); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
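+// A minimal usage sketch for the single-bit helpers below (the values are
+// hypothetical):
+//     m128 v = zeroes128();
+//     setbit128(&v, 5);
+//     assert(testbit128(v, 5));
+//     clearbit128(&v, 5);
+//     assert(!isnonzero128(v));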
+static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return (m128) vmaxq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return (m128) vminq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32_t __attribute__((aligned(16))) data[4] = { x3, x2, x1, x0 }; + return (m128) vld1q_u32((uint32_t *) data); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + uint64_t __attribute__((aligned(16))) data[2] = { hi, lo }; + return (m128) vld1q_u64((uint64_t *) data); +} + +#endif // ARCH_ARM_SIMD_UTILS_H diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 019dc125..49200288 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -63,6 +63,8 @@ extern const char vbs_mask_data[]; #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/simd_utils.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/simd_utils.h" #endif #endif // SIMD_UTILS_H From 5b425bd5a6752d239ebe5957dc90bb22bfc37e2e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 15 Oct 2020 16:25:29 +0300 Subject: [PATCH 25/53] add arm simple cpuid_flags --- CMakeLists.txt | 13 ++++++++++- src/hs_valid_platform.c | 5 +++++ src/util/arch/arm/cpuid_flags.c | 40 +++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 src/util/arch/arm/cpuid_flags.c diff --git a/CMakeLists.txt b/CMakeLists.txt index c1db4dfa..566a7dcd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -568,11 +568,22 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs") set (hs_exec_common_SRCS src/alloc.c src/scratch.c - src/util/arch/x86/cpuid_flags.c src/util/arch/common/cpuid_flags.h src/util/multibit.c ) +if (ARCH_IA32 OR ARCH_X86_64) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/x86/cpuid_flags.c + ) +elif (ARCH_ARM32 OR ARCH_AARCH64) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/arm/cpuid_flags.c + ) +endif () + set (hs_exec_SRCS ${hs_HEADERS} src/hs_version.h diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 7a022607..b187090b 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -28,6 +28,7 @@ #include "config.h" #include 
"hs_common.h" +#include "ue2common.h" #if defined(ARCH_X86_64) #include "util/arch/x86/cpuid_inline.h" #endif @@ -35,9 +36,13 @@ HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { /* Hyperscan requires SSSE3, anything else is a bonus */ +#if defined(ARCH_IA32) || defined(ARCH_X86_64) if (check_ssse3()) { return HS_SUCCESS; } else { return HS_ARCH_ERROR; } +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) + return HS_SUCCESS; +#endif } diff --git a/src/util/arch/arm/cpuid_flags.c b/src/util/arch/arm/cpuid_flags.c new file mode 100644 index 00000000..8dbab473 --- /dev/null +++ b/src/util/arch/arm/cpuid_flags.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpuid_flags.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "util/arch.h" + +u64a cpuid_flags(void) { + return cap; +} + +u32 cpuid_tune(void) { + return HS_TUNE_FAMILY_GENERIC; +} From c5a7f4b846edd8c6811fccae54a7df7ceabf52cf Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 15 Oct 2020 16:26:49 +0300 Subject: [PATCH 26/53] add ARM simd_utils vectorized functions for 128-bit vectors --- src/util/arch/arm/simd_utils.h | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 606892fb..74f447fb 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -95,7 +95,18 @@ static really_inline m128 eq128(m128 a, m128 b) { return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); } -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) +static really_inline u32 movemask128(m128 a) { + static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + // Compute the mask from the input + uint64x2_t mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0); + vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8); + return output; +} static really_inline m128 set1_16x8(u8 c) { return (m128) vdupq_n_u8(c); @@ -229,21 +240,22 @@ void clearbit128(m128 *ptr, unsigned int n) { static really_inline char testbit128(m128 val, unsigned int n) { const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else + return isnonzero128(and128(mask, val)); -#endif } -// offset must be an immediate -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) +static really_inline +m128 palignr(m128 r, m128 l, int offset) { + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +} static really_inline m128 pshufb_m128(m128 a, m128 b) { - m128 result; - result = _mm_shuffle_epi8(a, b); - return result; + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to NEON. */ + int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); + return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); } static really_inline From 45bfed9b9d22e172b82659d07d63e0a2802b2fa4 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 15 Oct 2020 16:30:18 +0300 Subject: [PATCH 27/53] add scalar versions of the vectorized functions for architectures that don't support 256-bit/512-bit SIMD vectors such as ARM --- src/util/arch/common/simd_utils.h | 753 ++++++++++++++++++++++++++++++ src/util/simd_utils.h | 2 + 2 files changed, 755 insertions(+) create mode 100644 src/util/arch/common/simd_utils.h diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h new file mode 100644 index 00000000..e682e2d5 --- /dev/null +++ b/src/util/arch/common/simd_utils.h @@ -0,0 +1,753 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_COMMON_SIMD_UTILS_H +#define ARCH_COMMON_SIMD_UTILS_H + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +#if !defined(HAVE_SIMD_128_BITS) +#error "You need at least a 128-bit capable SIMD engine!" +#endif // HAVE_SIMD_128_BITS + +/**** + **** 256-bit Primitives + ****/ + +#if !defined(HAVE_SIMD_256_BITS) + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline m256 set1_4x64(u64a c) { + m128 a128 = set1_2x64(c); + m256 rv = {a128, a128}; + return rv; +} + +static really_inline +m256 set1_2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} + +static really_inline m256 zeroes256(void) { + m256 rv = {zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m256 ones256(void) { + m256 rv = {ones128(), ones128()}; + return rv; +} + +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} + +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} + +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_inline int diff256(m256 a, m256 b) { + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +} + +static 
really_inline int isnonzero256(m256 a) { + return isnonzero128(or128(a.lo, a.hi)); +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { + return set1_2x128(load128(ptr)); +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set1_2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set1_32x8(u32 in) { + m256 rv; + rv.hi = set1_16x8(in); + rv.lo = set1_16x8(in); + return rv; +} + +static really_inline +m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { + m256 rv; + rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); + rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); + return rv; +} + +static really_inline +m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { + m256 rv; + rv.hi = set2x64(hi_1, hi_0); + rv.lo = set2x64(lo_1, lo_0); + return rv; +} + +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. 
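+// (In this fallback an m256 is a struct of two m128 halves, so bit N is read
+// from .lo when N < 128 and from .hi at offset N - 128 otherwise; for example
+// testbit256(v, 200) reduces to testbit128(v.hi, 72).)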
+static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) + +#endif // HAVE_SIMD_256_BITS + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). 
Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
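+// (An m384 is three m128 lanes (lo, mid and hi), so bit N is routed to lane
+// N / 128 and then tested with testbit128 at offset N % 128.)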
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + + +/**** + **** 512-bit Primitives + ****/ + +#if !defined(HAVE_SIMD_512_BITS) +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) + +static really_inline +m512 zeroes512(void) { + m512 rv = {zeroes256(), zeroes256()}; + return rv; +} + +static really_inline +m512 ones512(void) { + m512 rv = {ones256(), ones256()}; + return rv; +} + +static really_inline +m512 set1_64x8(u8 a) { + m256 a256 = set1_32x8(a); + m512 rv = {a256, a256}; + return rv; +} + +static really_inline +m512 set1_8x64(u64a a) { + m256 a256 = set1_4x64(a); + m512 rv = {a256, a256}; + return rv; +} + +static really_inline +m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + m512 rv; + rv.lo = set4x64(lo_3, lo_2, lo_1, lo_0); + rv.hi = set4x64(hi_3, hi_2, hi_1, hi_0); + return rv; +} +/* +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +}*/ + +static really_inline +m512 set1_4x128(m128 a) { + m256 a256 = set1_2x128(a); + m512 rv = {a256, a256}; + return rv; +} + + +static really_inline +m512 and512(m512 a, m512 b) { + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 or512(m512 a, m512 b) { + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 xor512(m512 a, m512 b) { + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 not512(m512 a) { + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +} + +static really_inline +m512 andnot512(m512 a, m512 b) { + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +} + +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} + +#if defined(HAVE_AVX512) +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +static really_inline +int diff512(m512 a, m512 b) { + return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +} + +static really_inline +int isnonzero512(m512 a) { + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? 
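+    // (diffrich512() reports one bit per differing 32-bit word; OR-ing each
+    // adjacent pair of bits and masking with 0x55555555 folds that into one
+    // bit per 64-bit word.)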
+ u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +} + +/*static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { +}*/ + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +} + +#endif // HAVE_SIMD_512_BITS + +#endif // ARCH_COMMON_SIMD_UTILS_H diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 49200288..0724c94e 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -67,4 +67,6 @@ extern const char vbs_mask_data[]; #include "util/arch/arm/simd_utils.h" #endif +#include "util/arch/common/simd_utils.h" + #endif // SIMD_UTILS_H From e7e1308d7f709e6e6665db9ef042b7e335714198 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 12:29:45 +0300 Subject: [PATCH 28/53] fix compilation paths for cpuid_flags for x86 --- CMakeLists.txt | 2 +- src/util/arch/x86/cpuid_flags.c | 2 +- src/util/arch/x86/cpuid_inline.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 566a7dcd..4077d396 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -577,7 +577,7 @@ set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/x86/cpuid_flags.c ) -elif (ARCH_ARM32 OR ARCH_AARCH64) +else (ARCH_ARM32 OR ARCH_AARCH64) set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/arm/cpuid_flags.c diff --git a/src/util/arch/x86/cpuid_flags.c b/src/util/arch/x86/cpuid_flags.c index 0b529c0b..81c7e456 100644 --- a/src/util/arch/x86/cpuid_flags.c +++ b/src/util/arch/x86/cpuid_flags.c @@ -26,7 +26,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" #include "cpuid_inline.h" #include "ue2common.h" #include "hs_compile.h" // for HS_MODE_ flags diff --git a/src/util/arch/x86/cpuid_inline.h b/src/util/arch/x86/cpuid_inline.h index b6768cc2..97f19aed 100644 --- a/src/util/arch/x86/cpuid_inline.h +++ b/src/util/arch/x86/cpuid_inline.h @@ -30,7 +30,7 @@ #define CPUID_INLINE_H_ #include "ue2common.h" -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" #if !defined(_WIN32) && !defined(CPUID_H_) #include From 83977db7abfd871f3fb2a37ee8534f46aa4cd994 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 12:30:34 +0300 Subject: [PATCH 29/53] split arch-agnostic simd_utils.h functions into the common file --- src/util/arch/common/simd_utils.h | 48 +-- src/util/arch/x86/simd_utils.h | 644 ++---------------------------- 2 files changed, 48 insertions(+), 644 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index e682e2d5..56d9dbaf 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -147,10 +147,12 @@ static really_inline int isnonzero256(m256 a) { } /** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * "Rich" version of diff256(). Takes two vectors a and b and returns a 8-bit * mask indicating which 32-bit words contain differences. 
*/ -static really_inline u32 diffrich256(m256 a, m256 b) { +static really_inline +u32 diffrich256(m256 a, m256 b) { + return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 8); } /** @@ -311,26 +313,6 @@ m256 pshufb_m256(m256 a, m256 b) { return rv; } -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) - #endif // HAVE_SIMD_256_BITS /**** @@ -402,13 +384,6 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { -} - /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and * returns a 12-bit mask indicating which 64-bit words contain differences. @@ -507,9 +482,6 @@ char testbit384(m384 val, unsigned int n) { ****/ #if !defined(HAVE_SIMD_512_BITS) -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) static really_inline m512 zeroes512(void) { @@ -608,12 +580,6 @@ m512 lshift64_m512(m512 a, unsigned b) { return rv; } -#if defined(HAVE_AVX512) -#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - static really_inline int diff512(m512 a, m512 b) { return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); @@ -621,9 +587,9 @@ int diff512(m512 a, m512 b) { static really_inline int isnonzero512(m512 a) { - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); + m256 x = or256(a.lo, a.lo); + m256 y = or256(a.hi, a.hi); + return isnonzero256(or256(x, y)); } /** diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 2d099f56..4a1a691e 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -127,22 +127,8 @@ static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. 
- return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} -#endif - static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif } /* another form of movq */ @@ -281,36 +267,6 @@ m128 pshufb_m128(m128 a, m128 b) { return result; } -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); @@ -352,7 +308,12 @@ m128 set2x64(u64a hi, u64a lo) { **** 256-bit Primitives ****/ -#if defined(HAVE_AVX2) +#if defined(HAVE_SIMD_256_BITS) && defined(HAVE_AVX2) + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { + return _mm256_shuffle_epi8(a, b); +} static really_really_inline m256 lshift64_m256(m256 a, unsigned b) { @@ -379,143 +340,41 @@ m256 set1_2x128(m128 a) { return _mm256_broadcastsi128_si256(a); } -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set1_2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif } static really_inline m256 ones256(void) { -#if defined(HAVE_AVX2) m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif return rv; } -#if defined(HAVE_AVX2) static really_inline m256 and256(m256 a, m256 b) { return _mm256_and_si256(a, b); } -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 or256(m256 a, m256 b) { return _mm256_or_si256(a, b); } -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 xor256(m256 a, m256 b) { return _mm256_xor_si256(a, b); } -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 not256(m256 a) { return 
_mm256_xor_si256(a, ones256()); } -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 andnot256(m256 a, m256 b) { return _mm256_andnot_si256(a, b); } -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif } static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif } /** @@ -523,16 +382,8 @@ static really_inline int isnonzero256(m256 a) { * mask indicating which 32-bit words contain differences. */ static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) a = _mm256_cmpeq_epi32(a, b); return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif } /** @@ -547,24 +398,12 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) { // aligned load static really_inline m256 load256(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif } // aligned load of 128-bit value to low and high part of 256-bit value static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) return set1_2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif } static really_inline m256 loadu2x128(const void *ptr) { @@ -574,32 +413,17 @@ static really_inline m256 loadu2x128(const void *ptr) { // aligned store static really_inline void store256(void *ptr, m256 a) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif } // unaligned load static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) return _mm256_loadu_si256((const m256 *)ptr); -#else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; -#endif } // unaligned store static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif } // packed unaligned store of first N bytes @@ -628,101 +452,19 @@ m256 mask1bit256(unsigned int n) { static really_inline m256 set1_32x8(u32 in) { -#if defined(HAVE_AVX2) return _mm256_set1_epi8(in); -#else - m256 rv; - rv.hi = set1_16x8(in); - rv.lo = set1_16x8(in); - return rv; -#endif } static really_inline m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { -#if defined(HAVE_AVX2) return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); -#else - m256 rv; - rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); - rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); - return rv; -#endif } static really_inline m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { 
-#if defined(HAVE_AVX2) return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set2x64(hi_1, hi_0); - rv.lo = set2x64(lo_1, lo_0); - return rv; -#endif } -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - // switches on bit N in the given vector. static really_inline void setbit256(m256 *ptr, unsigned int n) { @@ -775,88 +517,12 @@ m256 combine2x128(m128 hi, m128 lo) { } #endif //AVX2 -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -#endif - -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline -m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - +#if defined(HAVE_SIMD_128_BITS) /** * "Rich" version of diff384(). 
Takes two vectors a and b and returns a 12-bit * mask indicating which 32-bit words contain differences. */ + static really_inline u32 diffrich384(m384 a, m384 b) { m128 z = zeroes128(); a.lo = _mm_cmpeq_epi32(a.lo, b.lo); @@ -867,102 +533,42 @@ static really_inline u32 diffrich384(m384 a, m384 b) { return ~(_mm_movemask_epi8(packed)) & 0xfff; } -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. -static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} +#endif // HAVE_SIMD_128_BITS /**** **** 512-bit Primitives ****/ +#if defined(HAVE_SIMD_512_BITS) + +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) + +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. 
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} + +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + #define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) #define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) @@ -978,16 +584,10 @@ m512 zeroes512(void) { static really_inline m512 ones512(void) { -#if defined(HAVE_AVX512) return _mm512_set1_epi8(0xFF); //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif } -#if defined(HAVE_AVX512) static really_inline m512 set1_64x8(u8 a) { return _mm512_set1_epi8(a); @@ -1015,69 +615,32 @@ static really_inline m512 set1_4x128(m128 a) { return _mm512_broadcast_i32x4(a); } -#endif static really_inline m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 not512(m512 a) { -#if defined(HAVE_AVX512) return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif } static really_inline m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif } -#if defined(HAVE_AVX512) static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { #if defined(HAVE__BUILTIN_CONSTANT_P) @@ -1088,21 +651,10 @@ m512 lshift64_m512(m512 a, unsigned b) { m128 x = _mm_cvtsi32_si128(b); return _mm512_sll_epi64(a, x); } -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif -#if defined(HAVE_AVX512) #define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) #define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) #define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif #if !defined(_MM_CMPINT_NE) #define _MM_CMPINT_NE 0x4 @@ -1110,25 +662,12 @@ m512 lshift64_m512(m512 a, unsigned b) { static really_inline int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); -#endif } static really_inline int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif } /** @@ -1137,19 +676,7 @@ int isnonzero512(m512 a) { */ static really_inline u32 diffrich512(m512 
a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif } /** @@ -1166,43 +693,22 @@ u32 diffrich64_512(m512 a, m512 b) { // aligned load static really_inline m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif } // aligned store static really_inline void store512(void *ptr, m512 a) { assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif } // unaligned load static really_inline m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif } -#if defined(HAVE_AVX512) static really_inline m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { return _mm512_maskz_loadu_epi8(k, ptr); @@ -1217,7 +723,6 @@ static really_inline m512 set_mask_m512(__mmask64 k) { return _mm512_movm_epi8(k); } -#endif // packed unaligned store of first N bytes static really_inline @@ -1247,91 +752,24 @@ m512 mask1bit512(unsigned int n) { static really_inline void setbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif } // switches off bit N in the given vector. static really_inline void clearbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif } // tests bit N in the given vector. 
static really_inline char testbit512(m512 val, unsigned int n) { assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) const m512 mask = mask1bit512(n); return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif } +#endif // HAVE_SIMD_512_BITS + #endif // ARCH_X86_SIMD_UTILS_H From 4bce012570ee4606528bf67561c0a49c0c3389e3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 12:32:44 +0300 Subject: [PATCH 30/53] Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h" This reverts commit 6581aae90e55520353c03edb716de80ecc03521a. --- src/util/arch/common/popcount.h | 60 ----------------------------- src/util/arch/x86/popcount.h | 67 --------------------------------- src/util/popcount.h | 39 +++++++++++++------ 3 files changed, 27 insertions(+), 139 deletions(-) delete mode 100644 src/util/arch/common/popcount.h delete mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h deleted file mode 100644 index ef5776e8..00000000 --- a/src/util/arch/common/popcount.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_COMMON_H -#define POPCOUNT_ARCH_COMMON_H - -static really_inline -u32 popcount32_impl_c(u32 x) { - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. 
- x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -} - -static really_inline -u32 popcount64_impl_c(u64a x) { -#if defined(ARCH_64_BIT) - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -#else - // Synthesise from two 32-bit cases. - return popcount32_impl_c(x >> 32) + popcount32_impl_c(x); -#endif -} - -#endif // POPCOUNT_ARCH_COMMON_H diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h deleted file mode 100644 index 86929ede..00000000 --- a/src/util/arch/x86/popcount.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_X86_H -#define POPCOUNT_ARCH_X86_H - -#include "ue2common.h" -#include "util/arch.h" -#include "util/intrinsics.h" - -#include "util/arch/common/popcount.h" - -static really_inline -u32 popcount32_impl(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - return popcount32_impl_c(x); -#endif -} - -static really_inline -u32 popcount64_impl(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - return popcount64_impl_c(x); -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32_impl(x >> 32) + popcount32_impl(x); -#endif -} - -#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index 5fd6dc33..eb08f6b1 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,26 +33,41 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ -#include "config.h" #include "ue2common.h" #include "util/arch.h" -#if defined(ARCH_IA32) || defined(ARCH_X86_64) -#include "util/arch/x86/popcount.h" -#else -#include "util/arch/common/popcount.h" -#define popcount32_impl(x) popcount32_impl_c(x) -#define popcount64_impl(x) popcount64_impl_c(x) -#endif - static really_inline u32 popcount32(u32 x) { - return popcount32_impl(x); +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +#endif } static really_inline -u32 popcount64(u32 x) { - return popcount64_impl(x); +u32 popcount64(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32(x >> 32) + popcount32(x); +#endif } #endif /* UTIL_POPCOUNT_H_ */ From c4db63665ad98115948f6c327f6f9952ecb49dd2 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 13:02:40 +0300 Subject: [PATCH 31/53] scalar implementations of diffrich256 and diffrich384 --- src/util/arch/arm/cpuid_flags.c | 4 ++-- src/util/arch/common/simd_utils.h | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/util/arch/arm/cpuid_flags.c b/src/util/arch/arm/cpuid_flags.c index 8dbab473..1ba1a497 100644 --- a/src/util/arch/arm/cpuid_flags.c +++ b/src/util/arch/arm/cpuid_flags.c @@ -26,13 +26,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" #include "ue2common.h" #include "hs_compile.h" // for HS_MODE_ flags #include "util/arch.h" u64a cpuid_flags(void) { - return cap; + return 0; } u32 cpuid_tune(void) { diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 56d9dbaf..25cd03cc 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -152,7 +152,7 @@ static really_inline int isnonzero256(m256 a) { */ static really_inline u32 diffrich256(m256 a, m256 b) { - return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 8); + return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 4); } /** @@ -384,6 +384,15 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. 
+ */ +static really_inline +u32 diffrich384(m384 a, m384 b) { + return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8); +} + /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and * returns a 12-bit mask indicating which 64-bit words contain differences. From 149ea938c4412611f555c0c88af02666d7ccea23 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 13:09:08 +0300 Subject: [PATCH 32/53] don't redefine function on x86 --- src/util/arch/common/simd_utils.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 25cd03cc..c16023ac 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -384,6 +384,7 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } +#if defined(HAVE_SIMD_128_BITS) && !defined(ARCH_IA32) && !defined(ARCH_X86_64) /** * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit * mask indicating which 32-bit words contain differences. @@ -392,6 +393,7 @@ static really_inline u32 diffrich384(m384 a, m384 b) { return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8); } +#endif /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and From 0bef151437dcabce2b5541d7746c59286ce1a6d3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:38:05 +0200 Subject: [PATCH 33/53] don't use SSE directly in the tests --- unit/internal/simd_utils.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 623c2c99..5c0e0b40 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -658,34 +658,41 @@ TEST(SimdUtilsTest, movq) { char cmp[sizeof(m128)]; memset(cmp, 0x80, sizeof(m128)); - simd = set16x8(0x80); + simd = set1_16x8(0x80); r = movq(simd); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r))); +#if defined(HAVE_SIMD_128_BITS) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) + int64x2_t a = { ~0LL, 0x123456789abcdefLL }; + simd = vreinterpretq_s64_s8(a); +#endif +#endif r = movq(simd); ASSERT_EQ(r, 0x123456789abcdef); } -TEST(SimdUtilsTest, set16x8) { +TEST(SimdUtilsTest, set1_16x8) { char cmp[sizeof(m128)]; for (unsigned i = 0; i < 256; i++) { - m128 simd = set16x8(i); + m128 simd = set1_16x8(i); memset(cmp, i, sizeof(simd)); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } } -TEST(SimdUtilsTest, set4x32) { +TEST(SimdUtilsTest, set1_4x32) { u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; - m128 simd = set4x32(cmp[0]); + m128 simd = set1_4x32(cmp[0]); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } -#if defined(HAVE_AVX2) +#if defined(HAVE_SIMD_256_BITS) TEST(SimdUtilsTest, set32x8) { char cmp[sizeof(m256)]; From 548242981d46ff30798b7cd567dc9bab0c296f77 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:38:41 +0200 Subject: [PATCH 34/53] fix ARM implementations --- src/util/arch/arm/simd_utils.h | 59 ++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 74f447fb..bfcb9bfe 100644 --- a/src/util/arch/arm/simd_utils.h +++ 
b/src/util/arch/arm/simd_utils.h @@ -33,6 +33,8 @@ #ifndef ARCH_ARM_SIMD_UTILS_H #define ARCH_ARM_SIMD_UTILS_H +#include + #include "ue2common.h" #include "util/simd_types.h" #include "util/unaligned.h" @@ -41,7 +43,7 @@ #include // for memcpy static really_inline m128 ones128(void) { - return (m128) vdupq_n_s32(0xFF); + return (m128) vdupq_n_s8(0xFF); } static really_inline m128 zeroes128(void) { @@ -50,13 +52,13 @@ static really_inline m128 zeroes128(void) { /** \brief Bitwise not for m128*/ static really_inline m128 not128(m128 a) { - return (m128) veorq_s32(a, a); + return (m128) vmvnq_s32(a); } /** \brief Return 1 if a and b are different otherwise 0 */ static really_inline int diff128(m128 a, m128 b) { - m128 t = (m128)vceqq_s8((int8x16_t)a, (int8x16_t)b); - return (16 != vaddvq_u8((uint8x16_t)t)); + int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b)); + return (-16 != res); } static really_inline int isnonzero128(m128 a) { @@ -69,7 +71,7 @@ static really_inline int isnonzero128(m128 a) { */ static really_inline u32 diffrich128(m128 a, m128 b) { static const uint32x4_t movemask = { 1, 2, 4, 8 }; - return vaddvq_u32(vandq_u32(vceqq_s32((int32x4_t)a, (int32x4_t)b), movemask)); + return vaddvq_u32(vandq_u32(vmvnq_s32(vceqq_s32((int32x4_t)a, (int32x4_t)b)), movemask)); } /** @@ -77,8 +79,8 @@ static really_inline u32 diffrich128(m128 a, m128 b) { * returns a 4-bit mask indicating which 64-bit words contain differences. */ static really_inline u32 diffrich64_128(m128 a, m128 b) { - static const uint64x2_t movemask = { 1, 2 }; - return vaddvq_u64(vandq_u64(vceqq_s64((int64x2_t)a, (int64x2_t)b), movemask)); + static const uint64x2_t movemask = { 1, 4 }; + return vaddvq_u64(vandq_u64(vmvnq_s32(vceqq_s64((int64x2_t)a, (int64x2_t)b)), movemask)); } static really_really_inline @@ -125,7 +127,7 @@ static really_inline u32 movd(const m128 in) { } static really_inline u64a movq(const m128 in) { - return vgetq_lane_u64((uint64x2_t) in, 0); + return vgetq_lane_u64((uint64x2_t) in, 1); } /* another form of movq */ @@ -134,16 +136,6 @@ m128 load_m128_from_u64a(const u64a *p) { return (m128) vdupq_n_u64(*p); } -static really_really_inline -m128 rshiftbyte_m128(m128 a, unsigned b) { - return (m128) vshrq_n_s8((int8x16_t)a, b); -} - -static really_really_inline -m128 lshiftbyte_m128(m128 a, unsigned b) { - return (m128) vshlq_n_s8((int8x16_t)a, b); -} - static really_inline u32 extract32from128(const m128 in, unsigned imm) { return vgetq_lane_u32((uint32x4_t) in, imm); } @@ -165,7 +157,7 @@ static really_inline m128 or128(m128 a, m128 b) { } static really_inline m128 andnot128(m128 a, m128 b) { - return (m128) vbicq_u32((uint32x4_t)a, (uint32x4_t)b); + return (m128) (m128) vandq_s8( vmvnq_s8(a), b); } // aligned load @@ -208,6 +200,24 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return vqtbl1q_s8(in, shift_mask); +} + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return variable_byte_shift_m128(a, -b);; +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return variable_byte_shift_m128(a, b);; +} + + #ifdef __cplusplus extern "C" { #endif @@ -258,21 +268,14 @@ m128 pshufb_m128(m128 a, m128 b) { return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); } -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= 
-16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -} - static really_inline m128 max_u8_m128(m128 a, m128 b) { - return (m128) vmaxq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b); } static really_inline m128 min_u8_m128(m128 a, m128 b) { - return (m128) vminq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b); } static really_inline From 547f79b920771614d27e790e1e68221a8ab5c69f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:49:50 +0200 Subject: [PATCH 35/53] small optimization in storecompress*() --- src/util/state_compress.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 87eccce7..fa07eb2b 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -108,10 +108,10 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { // First, decompose our vectors into 64-bit chunks. - u64a x[2]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[2]; - memcpy(m, &mvec, sizeof(mvec)); + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(x, xvec); + store128(m, mvec); // Count the number of bits of compressed state we're writing out per // chunk. @@ -215,10 +215,10 @@ void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) { static really_really_inline void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) { // First, decompose our vectors into 64-bit chunks. - u64a x[4]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[4]; - memcpy(m, &mvec, sizeof(mvec)); + u64a ALIGN_ATTR(32) x[4]; + u64a ALIGN_ATTR(32) m[4]; + store256(x, xvec); + store256(m, mvec); // Count the number of bits of compressed state we're writing out per // chunk. 
From 592b1905afdf175e124c5a1bd1282df718e559c6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:50:24 +0200 Subject: [PATCH 36/53] needed for ARM vector type conversions --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4077d396..55954384 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,6 +288,8 @@ if (ARCH_IA32 OR ARCH_X86_64) CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) elseif (ARCH_ARM32 OR ARCH_AARCH64) CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions") endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) From 18296eee4715f8c03ddb3935441c0ea11d08b450 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 17:31:20 +0200 Subject: [PATCH 37/53] fix 32-bit/64-bit detection --- cmake/platform.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 4591bf93..479b3680 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -5,10 +5,10 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error n CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) -CHECK_C_SOURCE_COMPILES("#if !(defined(__arm__) && !defined(__aarch64__))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) +CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) +CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) -if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_AARCH64)) +if (ARCH_X86_64 OR ARCH_AARCH64) set(ARCH_64_BIT TRUE) else() set(ARCH_32_BIT TRUE) From 7b8cf9754638e963d20f0e1ee32b97a9de596d0c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:18:53 +0200 Subject: [PATCH 38/53] add extra instructions (currently arm-only), fix order of elements in set4x32/set2x64 --- src/util/arch/arm/simd_utils.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index bfcb9bfe..7c5d11d5 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -83,6 +83,26 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { return vaddvq_u64(vandq_u64(vmvnq_s32(vceqq_s64((int64x2_t)a, (int64x2_t)b)), movemask)); } +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) vaddq_u64((int64x2_t)a, (int64x2_t)b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) vsubq_u64((int64x2_t)a, (int64x2_t)b); +} + +static really_really_inline +m128 lshift_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s32((int64x2_t)a, b); +} + static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { return (m128) vshlq_n_s64((int64x2_t)a, b); @@ -97,6 +117,10 @@ static really_inline m128 eq128(m128 a, m128 b) { return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); } +static really_inline m128 
eq64_m128(m128 a, m128 b) { + return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); +} + static really_inline u32 movemask128(m128 a) { static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; @@ -290,13 +314,13 @@ m128 sub_u8_m128(m128 a, m128 b) { static really_inline m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { - uint32_t __attribute__((aligned(16))) data[4] = { x3, x2, x1, x0 }; + uint32_t __attribute__((aligned(16))) data[4] = { x0, x1, x2, x3 }; return (m128) vld1q_u32((uint32_t *) data); } static really_inline m128 set2x64(u64a hi, u64a lo) { - uint64_t __attribute__((aligned(16))) data[2] = { hi, lo }; + uint64_t __attribute__((aligned(16))) data[2] = { lo, hi }; return (m128) vld1q_u64((uint64_t *) data); } From 33904180d87390b7f67d0c429bc8ac6255b6d97e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:20:06 +0200 Subject: [PATCH 39/53] add compress128 function and implementation --- src/util/arch/arm/bitutils.h | 102 ++++++++++++++++++++++++++++++++ src/util/arch/common/bitutils.h | 34 +++++++++-- src/util/arch/x86/bitutils.h | 5 ++ src/util/bitutils.h | 5 ++ 4 files changed, 142 insertions(+), 4 deletions(-) diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index 514ddc5c..0b579dc9 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -104,6 +104,108 @@ u64a compress64_impl(u64a x, u64a m) { return compress64_impl_c(x, m); } +static really_inline +m128 compress128_impl(m128 x, m128 m) { + +/* x = and128(x, m); // clear irrelevant bits + + // Return zero quickly on trivial cases + if (diff128(x, zeroes128()) == 0) { + return zeroes128(); + }*/ + + + u64a ALIGN_ATTR(16) xv[2]; + u64a ALIGN_ATTR(16) mv[2]; + u64a ALIGN_ATTR(16) res[2]; + u64a ALIGN_ATTR(16) t[2]; + u64a ALIGN_ATTR(16) bbv[2]; + store128(xv, x); + store128(mv, m); + res[0] = 0; + res[1] = 0; + printf("x[%d] = %0llx\n", 0, xv[0]); + printf("x[%d] = %0llx\n", 1, xv[1]); + + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + for (u64a bb = 1; mv[0] | mv[1]; bb <<= 1) { + printf("bb = %lld\n", bb); + store128(bbv, bitset); + printf("bb[%d] = %0lld\n", 0, bbv[0]); + printf("bb[%d] = %0lld\n", 1, bbv[1]); + printf("m[%d] = %0llx\n", 0, mv[0]); + printf("m[%d] = %0llx\n", 1, mv[1]); + printf("scalar: -m[%d] = %0llx\n", 0, -mv[0]); + printf("scalar: -m[%d] = %0llx\n", 1, -mv[1]); + m128 mm = sub_2x64(zeroes128(), m); + store128(t, mm); + printf("vector: -m[0] = %0llx\n", t[0]); + printf("vector: -m[1] = %0llx\n", t[1]); + m128 tv = and128(x, m); + store128(t, tv); + printf("vector: x[0] & m[0] = %0llx\n", t[0]); + printf("vector: x[1] & m[1] = %0llx\n", t[1]); + tv = and128(tv, mm); + store128(t, tv); + printf("vector: x[0] & m[0] & -m[0] = %0llx\n", t[0]); + printf("vector: x[1] & m[1] & -m[1] = %0llx\n", t[1]); + t[0] = xv[0] & mv[0]; + t[1] = xv[1] & mv[1]; + printf("scalar: x[0] & m[0] = %0llx\n", t[0]); + printf("scalar: x[1] & m[1] = %0llx\n", t[1]); + t[0] = xv[0] & mv[0] & -mv[0]; + t[1] = xv[1] & mv[1] & -mv[1]; + printf("scalar: x[0] & m[0] & -m[0] = %0llx\n", t[0]); + printf("scalar: x[1] & m[1] & -m[1] = %0llx\n", t[1]); + + if ( t[0] ) { + printf("x & m & -m != 0\n"); + res[0] |= bb; + printf("x[%d] = %0llx\n", 0, xv[0]); + } + if ( t[1] ) { + printf("x & m & -m != 0\n"); + res[1] |= bb; + printf("x[%d] = %0llx\n", 1, xv[1]); + } + + m128 mask = not128(eq64_m128(tv, zeroes128())); + store128(t, mask); + printf("mask: x[0] & m[0] & -m[0] != 0 : %0llx\n", t[0]); + 
printf("mask: x[1] & m[1] & -m[1] != 0 : %0llx\n", t[1]); + + mask = vandq_s64(bitset, mask); + store128(t, mask); + printf("mask: mask[0] & bitset[1] != 0 : %0llx\n", t[0]); + printf("mask: mask[1] & bitset[1] != 0 : %0llx\n", t[1]); + + vres = or128(vres, mask); + store128(t, vres); + printf("res: res[0] != 0 : %0llx\n", t[0]); + printf("res: res[1] != 0 : %0llx\n", t[1]); + if (t[0] != res[0]) { + printf("mismatch: t[0] != res[0]: %0llx != %0llx\n", t[0], res[0]); + } + if (t[1] != res[1]) { + printf("mismatch: t[1] != res[1]: %0llx != %0llx\n", t[1], res[1]); + } + + mv[0] &= mv[0] - 1; + mv[1] &= mv[1] - 1; + m = and128(m, sub_2x64(m, set1_2x64(1))); + printf("x[%d] = %0llx\n", 0, xv[0]); + printf("x[%d] = %0llx\n", 1, xv[1]); + bitset = lshift64_m128(bitset, 1); + } + store128(res, vres); + printf("final x[%d] = %0llx\n", 0, res[0]); + printf("final x[%d] = %0llx\n", 1, res[1]); +// x = load128(res); + return vres; +} + static really_inline u32 expand32_impl(u32 x, u32 m) { return expand32_impl_c(x, m); diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index e86b8d44..88e71bba 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -35,6 +35,7 @@ #include "util/popcount.h" #include "util/unaligned.h" +#include "util/simd_utils.h" static really_inline u32 clz32_impl_c(u32 x) { @@ -177,7 +178,13 @@ u32 compress32_impl_c(u32 x, u32 m) { static really_inline u64a compress64_impl_c(u64a x, u64a m) { - // Return zero quickly on trivial cases + u64a res = 0; + for (u64a bb = 1; m != 0; bb += bb) { + if (x & m & -m) { res |= bb; } + m &= (m - 1); + } + return res; +/* // Return zero quickly on trivial cases if ((x & m) == 0) { return 0; } @@ -202,7 +209,20 @@ u64a compress64_impl_c(u64a x, u64a m) { mk = mk & ~mp; } - return x; + return x;*/ +} + +static really_inline +m128 compress128_impl_c(m128 xvec, m128 mvec) { + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(x, xvec); + store128(m, mvec); + + compress64_impl_c(x[0], m[0]); + compress64_impl_c(x[1], m[1]); + + return xvec; } static really_inline @@ -242,7 +262,13 @@ u32 expand32_impl_c(u32 x, u32 m) { static really_inline u64a expand64_impl_c(u64a x, u64a m) { - // Return zero quickly on trivial cases + u64a res = 0; + for (u64a bb = 1; m != 0; bb += bb) { + if (x & bb) { res |= m & (-m); } + m &= (m - 1); + } + return res; +/* // Return zero quickly on trivial cases if (!x || !m) { return 0; } @@ -272,7 +298,7 @@ u64a expand64_impl_c(u64a x, u64a m) { x = (x & ~mv) | (t & mv); } - return x & m0; // clear out extraneous bits + return x & m0; // clear out extraneous bits*/ } diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index ec4c95ad..a0769a5e 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -214,6 +214,11 @@ u64a compress64_impl(u64a x, u64a m) { #endif } +static really_inline +u64a compress128_impl(m128 x, m128 m) { + compress128_impl_c(x, m); +} + static really_inline u32 expand32_impl(u32 x, u32 m) { #if defined(HAVE_BMI2) diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 556ba818..21d35388 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -120,6 +120,11 @@ u64a compress64(u64a x, u64a m) { return compress64_impl(x, m); } +static really_inline +m128 compress128(m128 x, m128 m) { + return compress128_impl(x, m); +} + static really_inline u32 expand32(u32 x, u32 m) { return expand32_impl(x, m); From 501f60e930f57f14010ca776677f4588e1f3362c Mon Sep 17 00:00:00 2001 From: 
Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:20:37 +0200 Subject: [PATCH 40/53] add some debug info --- src/util/state_compress.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index fa07eb2b..586e47f4 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -107,21 +107,29 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { + printf("storecompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. +/* u64a x[2]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[2]; + memcpy(m, &mvec, sizeof(mvec));*/ u64a ALIGN_ATTR(16) x[2]; u64a ALIGN_ATTR(16) m[2]; - store128(x, xvec); store128(m, mvec); + store128(x, xvec); // Count the number of bits of compressed state we're writing out per // chunk. - u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + //m128 vbits = load128(bits); // Compress each 64-bit chunk individually. - u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; + //u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; + xvec = compress128(xvec, mvec); + store128(x, xvec); // Write packed data out. - pack_bits_64(ptr, v, bits, 2); + pack_bits_64(ptr, x, bits, 2); } #endif @@ -157,15 +165,33 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + printf("loadcompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. - u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; + u64a ALIGN_ATTR(16) m[2]; + store128(m, mvec); + printf("m[0] = %0llx\n", m[0]); + printf("m[1] = %0llx\n", m[1]); + +// m[0] = movq(mvec); +// m[1] = movq(rshiftbyte_m128(mvec, 8)); + //store128(m, mvec); +// printf("m[0] = %0llx\n", m[0]); +// printf("m[1] = %0llx\n", m[1]); u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; - u64a v[2]; + u64a ALIGN_ATTR(16) v[2]; + + printf("bits[0] = %0x\n", bits[0]); + printf("bits[1] = %0x\n", bits[1]); unpack_bits_64(v, (const u8 *)ptr, bits, 2); + printf("v[0] = %0llx\n", v[0]); + printf("v[1] = %0llx\n", v[1]); u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; + printf("x[0] = %0llx\n", x[0]); + printf("x[1] = %0llx\n", x[1]); + return set2x64(x[1], x[0]); } From 62fed20ad051848c39d735900b978ffe261a51d3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:21:16 +0200 Subject: [PATCH 41/53] add some debug and minor optimizations in unit test --- unit/internal/state_compress.cpp | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/unit/internal/state_compress.cpp b/unit/internal/state_compress.cpp index 56be8aae..00423702 100644 --- a/unit/internal/state_compress.cpp +++ b/unit/internal/state_compress.cpp @@ -98,8 +98,8 @@ TEST(state_compress, m128_1) { char buf[sizeof(m128)] = { 0 }; for (u32 i = 0; i < 16; i++) { - char mask_raw[16] = { 0 }; - char val_raw[16] = { 0 }; + char ALIGN_ATTR(16) mask_raw[16] = { 0 }; + char ALIGN_ATTR(16) val_raw[16] = { 0 }; memset(val_raw, (i << 4) + 3, 16); @@ -109,17 +109,32 @@ TEST(state_compress, m128_1) { mask_raw[15 - i] = 0xff; val_raw[15 - i] = i; - m128 val; - m128 mask; - - memcpy(&val, val_raw, sizeof(val)); - memcpy(&mask, mask_raw, sizeof(mask)); 
+ m128 val = load128(val_raw); + m128 mask = load128(mask_raw); storecompressed128(&buf, &val, &mask, 0); m128 val_out; loadcompressed128(&val_out, &buf, &mask, 0); + int8_t ALIGN_ATTR(16) data[16]; + store128(data, val); + printf("val: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, mask); + printf("mask: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, and128(val, mask)); + printf("and128(val, mask): "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, val_out); + printf("val_out: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + EXPECT_TRUE(!diff128(and128(val, mask), val_out)); mask_raw[i] = 0x0f; From c4f1372814235f3eead54bdcc639dc6a2028a501 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 20:33:17 +0200 Subject: [PATCH 42/53] remove debug from functions --- src/util/arch/arm/bitutils.h | 84 +----------------------------------- src/util/arch/x86/bitutils.h | 1 - src/util/state_compress.c | 22 ---------- 3 files changed, 1 insertion(+), 106 deletions(-) diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index 0b579dc9..1d1e0167 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -107,102 +107,20 @@ u64a compress64_impl(u64a x, u64a m) { static really_inline m128 compress128_impl(m128 x, m128 m) { -/* x = and128(x, m); // clear irrelevant bits - - // Return zero quickly on trivial cases - if (diff128(x, zeroes128()) == 0) { - return zeroes128(); - }*/ - - - u64a ALIGN_ATTR(16) xv[2]; - u64a ALIGN_ATTR(16) mv[2]; - u64a ALIGN_ATTR(16) res[2]; - u64a ALIGN_ATTR(16) t[2]; - u64a ALIGN_ATTR(16) bbv[2]; - store128(xv, x); - store128(mv, m); - res[0] = 0; - res[1] = 0; - printf("x[%d] = %0llx\n", 0, xv[0]); - printf("x[%d] = %0llx\n", 1, xv[1]); - m128 one = set1_2x64(1); m128 bitset = one; m128 vres = zeroes128(); - for (u64a bb = 1; mv[0] | mv[1]; bb <<= 1) { - printf("bb = %lld\n", bb); - store128(bbv, bitset); - printf("bb[%d] = %0lld\n", 0, bbv[0]); - printf("bb[%d] = %0lld\n", 1, bbv[1]); - printf("m[%d] = %0llx\n", 0, mv[0]); - printf("m[%d] = %0llx\n", 1, mv[1]); - printf("scalar: -m[%d] = %0llx\n", 0, -mv[0]); - printf("scalar: -m[%d] = %0llx\n", 1, -mv[1]); + while (isnonzero128(m)) { m128 mm = sub_2x64(zeroes128(), m); - store128(t, mm); - printf("vector: -m[0] = %0llx\n", t[0]); - printf("vector: -m[1] = %0llx\n", t[1]); m128 tv = and128(x, m); - store128(t, tv); - printf("vector: x[0] & m[0] = %0llx\n", t[0]); - printf("vector: x[1] & m[1] = %0llx\n", t[1]); tv = and128(tv, mm); - store128(t, tv); - printf("vector: x[0] & m[0] & -m[0] = %0llx\n", t[0]); - printf("vector: x[1] & m[1] & -m[1] = %0llx\n", t[1]); - t[0] = xv[0] & mv[0]; - t[1] = xv[1] & mv[1]; - printf("scalar: x[0] & m[0] = %0llx\n", t[0]); - printf("scalar: x[1] & m[1] = %0llx\n", t[1]); - t[0] = xv[0] & mv[0] & -mv[0]; - t[1] = xv[1] & mv[1] & -mv[1]; - printf("scalar: x[0] & m[0] & -m[0] = %0llx\n", t[0]); - printf("scalar: x[1] & m[1] & -m[1] = %0llx\n", t[1]); - - if ( t[0] ) { - printf("x & m & -m != 0\n"); - res[0] |= bb; - printf("x[%d] = %0llx\n", 0, xv[0]); - } - if ( t[1] ) { - printf("x & m & -m != 0\n"); - res[1] |= bb; - printf("x[%d] = %0llx\n", 1, xv[1]); - } m128 mask = not128(eq64_m128(tv, zeroes128())); - store128(t, mask); - printf("mask: x[0] & m[0] & -m[0] != 0 : %0llx\n", t[0]); - printf("mask: x[1] & m[1] & -m[1] != 0 : %0llx\n", t[1]); - mask = vandq_s64(bitset, 
mask); - store128(t, mask); - printf("mask: mask[0] & bitset[1] != 0 : %0llx\n", t[0]); - printf("mask: mask[1] & bitset[1] != 0 : %0llx\n", t[1]); - vres = or128(vres, mask); - store128(t, vres); - printf("res: res[0] != 0 : %0llx\n", t[0]); - printf("res: res[1] != 0 : %0llx\n", t[1]); - if (t[0] != res[0]) { - printf("mismatch: t[0] != res[0]: %0llx != %0llx\n", t[0], res[0]); - } - if (t[1] != res[1]) { - printf("mismatch: t[1] != res[1]: %0llx != %0llx\n", t[1], res[1]); - } - - mv[0] &= mv[0] - 1; - mv[1] &= mv[1] - 1; m = and128(m, sub_2x64(m, set1_2x64(1))); - printf("x[%d] = %0llx\n", 0, xv[0]); - printf("x[%d] = %0llx\n", 1, xv[1]); bitset = lshift64_m128(bitset, 1); } - store128(res, vres); - printf("final x[%d] = %0llx\n", 0, res[0]); - printf("final x[%d] = %0llx\n", 1, res[1]); -// x = load128(res); return vres; } diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index a0769a5e..424ad957 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -239,7 +239,6 @@ u64a expand64_impl(u64a x, u64a m) { #endif } - /* returns the first set bit after begin (if not ~0U). If no bit is set after * begin returns ~0U */ diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 586e47f4..360ec39e 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -109,10 +109,6 @@ static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { printf("storecompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. -/* u64a x[2]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[2]; - memcpy(m, &mvec, sizeof(mvec));*/ u64a ALIGN_ATTR(16) x[2]; u64a ALIGN_ATTR(16) m[2]; store128(m, mvec); @@ -121,10 +117,8 @@ void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { // Count the number of bits of compressed state we're writing out per // chunk. u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) }; - //m128 vbits = load128(bits); // Compress each 64-bit chunk individually. - //u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; xvec = compress128(xvec, mvec); store128(x, xvec); @@ -169,29 +163,13 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { // First, decompose our vectors into 64-bit chunks. 
u64a ALIGN_ATTR(16) m[2]; store128(m, mvec); - printf("m[0] = %0llx\n", m[0]); - printf("m[1] = %0llx\n", m[1]); - -// m[0] = movq(mvec); -// m[1] = movq(rshiftbyte_m128(mvec, 8)); - //store128(m, mvec); -// printf("m[0] = %0llx\n", m[0]); -// printf("m[1] = %0llx\n", m[1]); u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; u64a ALIGN_ATTR(16) v[2]; - printf("bits[0] = %0x\n", bits[0]); - printf("bits[1] = %0x\n", bits[1]); - unpack_bits_64(v, (const u8 *)ptr, bits, 2); - printf("v[0] = %0llx\n", v[0]); - printf("v[1] = %0llx\n", v[1]); u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - printf("x[0] = %0llx\n", x[0]); - printf("x[1] = %0llx\n", x[1]); - return set2x64(x[1], x[0]); } From 606c53a05f1d6d36d6088cafccd384c94d7fa4d5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:55:03 +0200 Subject: [PATCH 43/53] fix compiler flag testcase --- cmake/arch.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e3cc9f44..cb73ff49 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -78,6 +78,7 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { int32x4_t a = vdupq_n_s32(1); + (void)a; }" HAVE_NEON) else () message (FATAL_ERROR "Unsupported architecture") From 1c26f044a73491baa078b186ddc4cb2c4c8c7222 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:56:40 +0200 Subject: [PATCH 44/53] when building in debug mode, vgetq_lane_*() and vextq_*() need immediate operands, and we have to use switch()'ed versions --- src/util/arch/arm/simd_utils.h | 63 +++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 7c5d11d5..232ca76f 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -161,11 +161,45 @@ m128 load_m128_from_u64a(const u64a *p) { } static really_inline u32 extract32from128(const m128 in, unsigned imm) { +#if !defined(DEBUG) return vgetq_lane_u32((uint32x4_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u32((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u32((uint32x4_t) in, 1); + break; + case 2: + return vgetq_lane_u32((uint32x4_t) in, 2); + break; + case 3: + return vgetq_lane_u32((uint32x4_t) in, 3); + break; + default: + return 0; + break; + } +#endif } -static really_inline u32 extract64from128(const m128 in, unsigned imm) { +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +#if !defined(DEBUG) return vgetq_lane_u64((uint64x2_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u64((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u64((uint32x4_t) in, 1); + break; + default: + return 0; + break; + } +#endif } static really_inline m128 and128(m128 a, m128 b) { @@ -278,10 +312,37 @@ char testbit128(m128 val, unsigned int n) { return isnonzero128(and128(mask, val)); } +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + static really_inline m128 palignr(m128 r, m128 l, int offset) { +#if !defined(DEBUG) return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + switch (offset) { + CASE_ALIGN_VECTORS(l, r, 0); + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); 
+ CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + default: + return zeroes128(); + break; + } +#endif } +#undef CASE_ALIGN_VECTORS static really_inline m128 pshufb_m128(m128 a, m128 b) { From d76365240bd56ce981887e991f075839b5549aaf Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:57:16 +0200 Subject: [PATCH 45/53] helper functions to print a m128 vector in debug mode --- src/util/arch/common/simd_utils.h | 38 +++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index c16023ac..39cb91f0 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -44,6 +44,44 @@ #error "You need at least a 128-bit capable SIMD engine!" #endif // HAVE_SIMD_128_BITS +#ifdef DEBUG +static inline void print_m128_16x8(char *label, m128 vector) { + uint8_t __attribute__((aligned(16))) data[16]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 16; i++) + printf("%02x ", data[i]); + printf("\n"); +} + +static inline void print_m128_8x16(char *label, m128 vector) { + uint16_t __attribute__((aligned(16))) data[8]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 8; i++) + printf("%04x ", data[i]); + printf("\n"); +} + +static inline void print_m128_4x32(char *label, m128 vector) { + uint32_t __attribute__((aligned(16))) data[4]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 4; i++) + printf("%08x ", data[i]); + printf("\n"); +} + +static inline void print_m128_2x64(char *label, m128 vector) { + uint64_t __attribute__((aligned(16))) data[2]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 2; i++) + printf("%016lx ", data[i]); + printf("\n"); +} +#endif + /**** **** 256-bit Primitives ****/ From 17ab42d8910d1c419f1c10ef1b3884c0d5a547c5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:59:42 +0200 Subject: [PATCH 46/53] small optimization that was for some reason failing in ARM, should be faster anyway --- src/fdr/teddy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 97cff0b4..16947c61 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -901,8 +901,10 @@ do { \ #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ if (unlikely(diff128(var, ones128()))) { \ - u64a lo = movq(var); \ - u64a hi = movq(rshiftbyte_m128(var, 8)); \ + u64a __attribute__((aligned(16))) vector[2]; \ + store128(vector, var); \ + u64a lo = vector[0]; \ + u64a hi = vector[1]; \ CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \ CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \ } \ From 259c2572c15a10d5316dc51d8a3cf4e22ebfe793 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:27:05 +0200 Subject: [PATCH 47/53] define debug vector print functions to NULL in non-debug mode --- src/util/arch/common/simd_utils.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 39cb91f0..0c67ee94 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -80,6 +80,11 @@ static inline void print_m128_2x64(char *label, m128 vector) { 
printf("%016lx ", data[i]); printf("\n"); } +#else +#define print_m128_16x8(label, vector) NULL +#define print_m128_8x16(label, vector) NULL +#define print_m128_4x32(label, vector) NULL +#define print_m128_2x64(label, vector) NULL #endif /**** From 38477b08bc286ad1eec77fabd981d4545257590f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:27:38 +0200 Subject: [PATCH 48/53] fix movq and load_m128_from_u64a and resp. test for NEON --- src/util/arch/arm/simd_utils.h | 4 ++-- unit/internal/simd_utils.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 232ca76f..c918eced 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -151,13 +151,13 @@ static really_inline u32 movd(const m128 in) { } static really_inline u64a movq(const m128 in) { - return vgetq_lane_u64((uint64x2_t) in, 1); + return vgetq_lane_u64((uint64x2_t) in, 0); } /* another form of movq */ static really_inline m128 load_m128_from_u64a(const u64a *p) { - return (m128) vdupq_n_u64(*p); + return (m128) vsetq_lane_u64(*p, zeroes128(), 0); } static really_inline u32 extract32from128(const m128 in, unsigned imm) { diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 5c0e0b40..bc1426b1 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -667,7 +667,7 @@ TEST(SimdUtilsTest, movq) { #if defined(ARCH_IA32) || defined(ARCH_X86_64) simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) - int64x2_t a = { ~0LL, 0x123456789abcdefLL }; + int64x2_t a = { 0x123456789abcdefLL, ~0LL }; simd = vreinterpretq_s64_s8(a); #endif #endif From c38722a68b07436a14f9daa8ba8b50548ff3c9f0 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:27:58 +0200 Subject: [PATCH 49/53] add ARM platform --- src/database.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/database.h b/src/database.h index 5715ed67..7789b9ab 100644 --- a/src/database.h +++ b/src/database.h @@ -51,6 +51,7 @@ extern "C" // CPU type is the low 6 bits (we can't need more than 64, surely!) 
#define HS_PLATFORM_INTEL 1 +#define HS_PLATFORM_ARM 2 #define HS_PLATFORM_CPU_MASK 0x3F #define HS_PLATFORM_NOAVX2 (4<<13) From 39945b7775ebbe4d6bed86c475260db9bd87eb25 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:30:50 +0200 Subject: [PATCH 50/53] clear zones array --- src/fdr/fdr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index b0f90b52..1a3b7003 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -726,6 +726,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, assert(ISALIGNED_CL(confBase)); struct zone zones[ZONE_MAX]; assert(fdr->domain > 8 && fdr->domain < 16); + memset(zones, 0, sizeof(zones)); size_t numZone = prepareZones(a->buf, a->len, a->buf_history + a->len_history, From 773dc6fa69ff1ab28317a99966a057ad7006c6ad Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 7 Dec 2020 23:12:26 +0200 Subject: [PATCH 51/53] optimize *shiftbyte_m128() functions to use palign instead of variable_byte_shift_m128() --- src/util/arch/arm/simd_utils.h | 78 ++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index c918eced..f7b92e70 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -161,7 +161,7 @@ m128 load_m128_from_u64a(const u64a *p) { } static really_inline u32 extract32from128(const m128 in, unsigned imm) { -#if !defined(DEBUG) +#if defined(HS_OPTIMIZE) return vgetq_lane_u32((uint32x4_t) in, imm); #else switch (imm) { @@ -185,7 +185,7 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { } static really_inline u64a extract64from128(const m128 in, unsigned imm) { -#if !defined(DEBUG) +#if defined(HS_OPTIMIZE) return vgetq_lane_u64((uint64x2_t) in, imm); #else switch (imm) { @@ -265,14 +265,52 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { return vqtbl1q_s8(in, shift_mask); } +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + +static really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HS_OPTIMIZE) + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + switch (offset) { + CASE_ALIGN_VECTORS(l, r, 0); + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); + CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + default: + return zeroes128(); + break; + } +#endif +} +#undef CASE_ALIGN_VECTORS + static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - return variable_byte_shift_m128(a, -b);; + if (b) + return palignr(zeroes128(), a, b); + else + return a; } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - return variable_byte_shift_m128(a, b);; + if (b) + return palignr(a, zeroes128(), 16 - b); + else + return a; } @@ -312,38 +350,6 @@ char testbit128(m128 val, unsigned int n) { return isnonzero128(and128(mask, val)); } -#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; - -static really_inline -m128 palignr(m128 r, m128 l, int offset) { -#if !defined(DEBUG) - 
return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); -#else - switch (offset) { - CASE_ALIGN_VECTORS(l, r, 0); - CASE_ALIGN_VECTORS(l, r, 1); - CASE_ALIGN_VECTORS(l, r, 2); - CASE_ALIGN_VECTORS(l, r, 3); - CASE_ALIGN_VECTORS(l, r, 4); - CASE_ALIGN_VECTORS(l, r, 5); - CASE_ALIGN_VECTORS(l, r, 6); - CASE_ALIGN_VECTORS(l, r, 7); - CASE_ALIGN_VECTORS(l, r, 8); - CASE_ALIGN_VECTORS(l, r, 9); - CASE_ALIGN_VECTORS(l, r, 10); - CASE_ALIGN_VECTORS(l, r, 11); - CASE_ALIGN_VECTORS(l, r, 12); - CASE_ALIGN_VECTORS(l, r, 13); - CASE_ALIGN_VECTORS(l, r, 14); - CASE_ALIGN_VECTORS(l, r, 15); - default: - return zeroes128(); - break; - } -#endif -} -#undef CASE_ALIGN_VECTORS - static really_inline m128 pshufb_m128(m128 a, m128 b) { /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. From e088c6ae2b87b771552d7c7b2e1ca1db2062beb1 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 7 Dec 2020 23:12:41 +0200 Subject: [PATCH 52/53] remove forgotten printf --- src/util/state_compress.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 360ec39e..5c26f043 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -107,7 +107,6 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { - printf("storecompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. u64a ALIGN_ATTR(16) x[2]; u64a ALIGN_ATTR(16) m[2]; @@ -159,7 +158,6 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { - printf("loadcompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. u64a ALIGN_ATTR(16) m[2]; store128(m, mvec); From 61b963a7179b4cd5f5774a45918c1b2db7805510 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 8 Dec 2020 11:42:30 +0200 Subject: [PATCH 53/53] fix x86 compilation --- src/util/arch/x86/bitutils.h | 4 ++-- src/util/arch/x86/simd_utils.h | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index 424ad957..33fff7c2 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -215,8 +215,8 @@ u64a compress64_impl(u64a x, u64a m) { } static really_inline -u64a compress128_impl(m128 x, m128 m) { - compress128_impl_c(x, m); +m128 compress128_impl(m128 x, m128 m) { + return compress128_impl_c(x, m); } static really_inline diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 4a1a691e..9555bf6c 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -33,10 +33,7 @@ #ifndef ARCH_X86_SIMD_UTILS_H #define ARCH_X86_SIMD_UTILS_H -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif - +#include "x86.h" #include "ue2common.h" #include "util/simd_types.h" #include "util/unaligned.h"
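
Note on the compress128 added in patches 39 and 42: both the NEON loop and the scalar fallback gather the bits of x selected by m into the low bits of each 64-bit lane, lowest mask bit first (the same effect as x86 PEXT applied per lane). Below is a minimal plain-C sketch of that loop written against two explicit 64-bit lanes; the helper names (compress64_ref, u128_lanes, compress128_ref) are illustrative only, not identifiers from the tree.

#include <stdint.h>
#include <stdio.h>

/* Gather the bits of x selected by m into the low bits of the result,
 * lowest mask bit first (the same effect as x86 PEXT). */
static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0;
    for (uint64_t bb = 1; m != 0; bb <<= 1) {
        if (x & m & -m) {   /* m & -m isolates the lowest set bit of m */
            res |= bb;
        }
        m &= m - 1;         /* clear the lowest set bit of m */
    }
    return res;
}

/* 128-bit variant done lane by lane; the NEON compress128_impl computes the
 * same thing on both lanes at once with sub_2x64/and128/eq64_m128. */
typedef struct { uint64_t lo, hi; } u128_lanes;

static u128_lanes compress128_ref(u128_lanes x, u128_lanes m) {
    u128_lanes r = { compress64_ref(x.lo, m.lo), compress64_ref(x.hi, m.hi) };
    return r;
}

int main(void) {
    u128_lanes x = { 0x00000000deadbeefULL, 0xcafebabe00000000ULL };
    u128_lanes m = { 0x00000000ffff0000ULL, 0xff000000ff000000ULL };
    u128_lanes c = compress128_ref(x, m);
    printf("lo %016llx hi %016llx\n",
           (unsigned long long)c.lo, (unsigned long long)c.hi);
    return 0;
}

The m & -m / m &= m - 1 pair walks the mask one set bit at a time, which is what the vector version mirrors with sub_2x64(zeroes128(), m) and and128(m, sub_2x64(m, set1_2x64(1))).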
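
Note on patches 44 and 51: vgetq_lane_*() and vextq_*() require their lane/offset argument to be a compile-time constant, so in debug (-O0) builds, where the caller's argument is not constant-folded, the code dispatches through a switch so that every intrinsic call site sees a literal immediate. A standalone sketch of that pattern for a byte-wise right shift follows; rshift_bytes and SHIFT_CASE are names invented here, assuming the same vext-based semantics as rshiftbyte_m128.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Every vextq_u8 call site must see a literal immediate, so a run-time
 * shift amount is dispatched through a switch. */
#define SHIFT_CASE(v, n) case (n): return vextq_u8((v), vdupq_n_u8(0), (n))

static uint8x16_t rshift_bytes(uint8x16_t v, unsigned n) {
    switch (n) {
    case 0: return v;
    SHIFT_CASE(v, 1);  SHIFT_CASE(v, 2);  SHIFT_CASE(v, 3);  SHIFT_CASE(v, 4);
    SHIFT_CASE(v, 5);  SHIFT_CASE(v, 6);  SHIFT_CASE(v, 7);  SHIFT_CASE(v, 8);
    SHIFT_CASE(v, 9);  SHIFT_CASE(v, 10); SHIFT_CASE(v, 11); SHIFT_CASE(v, 12);
    SHIFT_CASE(v, 13); SHIFT_CASE(v, 14); SHIFT_CASE(v, 15);
    default: return vdupq_n_u8(0);   /* shifts >= 16 bytes clear the vector */
    }
}

int main(void) {
    uint8_t in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = (uint8_t)i;
    vst1q_u8(out, rshift_bytes(vld1q_u8(in), 3));
    for (int i = 0; i < 16; i++) printf("%02x ", out[i]);
    printf("\n");
    return 0;
}

With an optimizing build and a constant shift amount, the switch should collapse back to a single ext instruction, so the debug-only dispatch is expected to cost nothing where the immediate form is usable.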
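
Note on the set2x64/movq fixes (patches 38 and 48): on little-endian AArch64, lane 0 of a uint64x2_t holds the low 64 bits of the 128-bit value, so an SSE-style movq maps to vgetq_lane_u64(v, 0) and set2x64(hi, lo) has to store {lo, hi}. A quick standalone check of that lane-ordering assumption (not code from the tree):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* lanes[0] is loaded into lane 0, i.e. the low qword of the vector */
    uint64_t lanes[2] = { 0x0123456789abcdefULL, 0xffffffffffffffffULL };
    uint64x2_t v = vld1q_u64(lanes);
    printf("movq -> %016llx (expect 0123456789abcdef)\n",
           (unsigned long long)vgetq_lane_u64(v, 0));
    return 0;
}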