From 82011831387b5eac995f9945ee7a1125509476e8 Mon Sep 17 00:00:00 2001
From: Matthew Barr
Date: Wed, 29 Mar 2017 16:39:16 +1100
Subject: [PATCH] Check compiler architecture flags in one place

---
 src/crc32.c                   |  9 +++--
 src/database.h                |  5 ++-
 src/fdr/fdr.c                 |  5 ++-
 src/fdr/teddy.h               |  5 ++-
 src/fdr/teddy_avx2.c          |  5 ++-
 src/hwlm/noodle_engine.c      |  5 ++-
 src/nfa/limex_accel.c         |  7 ++--
 src/nfa/limex_shuffle.h       |  5 ++-
 src/nfa/mcsheng.c             |  7 ++--
 src/nfa/multishufti.c         |  5 ++-
 src/nfa/multitruffle.c        |  5 ++-
 src/nfa/multivermicelli.c     |  5 ++-
 src/nfa/shufti.c              |  5 ++-
 src/nfa/shufti_common.h       |  5 ++-
 src/nfa/truffle.c             |  5 ++-
 src/nfa/truffle_common.h      |  5 ++-
 src/util/arch.h               | 75 +++++++++++++++++++++++++++++++++++
 src/util/bitutils.h           | 20 ++++------
 src/util/cpuid_flags.c        |  5 ++-
 src/util/masked_move.c        |  3 +-
 src/util/masked_move.h        |  6 ++-
 src/util/math.h               |  4 +-
 src/util/popcount.h           | 10 +----
 src/util/simd_types.h         |  7 ++--
 src/util/simd_utils.h         | 58 ++++++++++++++------------
 src/util/state_compress.c     | 11 ++---
 unit/internal/bitutils.cpp    |  5 ++-
 unit/internal/database.cpp    |  5 ++-
 unit/internal/masked_move.cpp |  5 ++-
 unit/internal/shuffle.cpp     |  5 ++-
 unit/internal/simd_utils.cpp  |  5 ++-
 31 files changed, 203 insertions(+), 109 deletions(-)
 create mode 100644 src/util/arch.h

diff --git a/src/crc32.c b/src/crc32.c
index b85acc7f..9a9b6f26 100644
--- a/src/crc32.c
+++ b/src/crc32.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,6 +29,7 @@
 #include "crc32.h"
 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"
 
 #if defined(HAVE_C_X86INTRIN_H)
 #include <x86intrin.h>
@@ -36,7 +37,7 @@
 #include <intrin.h>
 #endif
 
-#ifndef __SSE4_2__
+#if !defined(HAVE_SSE42)
 
 /***
 *** What follows is derived from Intel's Slicing-by-8 CRC32 impl, which is BSD
@@ -582,7 +583,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
     return crc;
 }
 
-#else // __SSE4_2__
+#else // HAVE_SSE42
 
 #ifdef ARCH_64_BIT
 #define CRC_WORD 8
@@ -638,7 +639,7 @@ u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
 
 // Externally visible function
 u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) {
-#ifdef __SSE4_2__
+#if defined(HAVE_SSE42)
     u32 crc = crc32c_sse42(inCrc32, (const unsigned char *)buf, bufLen);
 #else
     u32 crc = crc32c_sb8_64_bit(inCrc32, (const unsigned char *)buf, bufLen);
diff --git a/src/database.h b/src/database.h
index 399513fc..9b24abd4 100644
--- a/src/database.h
+++ b/src/database.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -41,6 +41,7 @@ extern "C"
 #include "hs_compile.h" // for HS_MODE_ flags
 #include "hs_version.h"
 #include "ue2common.h"
+#include "util/arch.h"
 
 #define HS_DB_VERSION HS_VERSION_32BIT
 #define HS_DB_MAGIC (0xdbdbdbdbU)
@@ -59,7 +60,7 @@ typedef u64a platform_t;
 
 static UNUSED
 const platform_t hs_current_platform = {
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     HS_PLATFORM_NOAVX2 |
 #endif
     0,
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index d5d40c38..74e6c577 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -34,6 +34,7 @@
 #include "flood_runtime.h"
 #include "teddy.h"
 #include "teddy_internal.h"
+#include "util/arch.h"
 #include "util/simd_utils.h"
#include "util/uniform_ops.h" @@ -123,7 +124,7 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { static really_inline u64a andn(const u32 a, const u8 *b) { u64a r; -#if defined(__BMI__) +#if defined(HAVE_BMI) __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); #else r = unaligned_load_u32(b) & ~a; @@ -783,7 +784,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, return HWLM_SUCCESS; } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) #define ONLY_AVX2(func) func #else #define ONLY_AVX2(func) NULL diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h index 78cba847..35756c53 100644 --- a/src/fdr/teddy.h +++ b/src/fdr/teddy.h @@ -34,6 +34,7 @@ #define TEDDY_H_ #include "hwlm/hwlm.h" // for hwlm_group_t +#include "util/arch.h" struct FDR; // forward declaration from fdr_internal.h struct FDR_Runtime_Args; @@ -70,7 +71,7 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control); -#if defined(__AVX2__) +#if defined(HAVE_AVX2) hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, @@ -104,6 +105,6 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control); -#endif /* __AVX2__ */ +#endif /* HAVE_AVX2 */ #endif /* TEDDY_H_ */ diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 22b74408..ebc1362d 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -35,9 +35,10 @@ #include "teddy.h" #include "teddy_internal.h" #include "teddy_runtime_common.h" +#include "util/arch.h" #include "util/simd_utils.h" -#if defined(__AVX2__) +#if defined(HAVE_AVX2) #ifdef ARCH_64_BIT #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ @@ -687,4 +688,4 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, return HWLM_SUCCESS; } -#endif // __AVX2__ +#endif // HAVE_AVX2 diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index 1d1ab4e6..a30a59a5 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "noodle_engine.h" #include "noodle_internal.h" #include "ue2common.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/compare.h" #include "util/masked_move.h" @@ -109,7 +110,7 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen, return HWLM_SUCCESS; } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) #define CHUNKSIZE 32 #define MASK_TYPE m256 #include "noodle_engine_avx2.c" diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index c74c7079..a96dea43 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,6 +44,7 @@ #include "multivermicelli.h" #include "ue2common.h" #include "vermicelli.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/simd_utils.h" @@ -118,7 +119,7 @@ size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex, DEBUG_PRINTF("using PSHUFB for 256-bit shuffle\n"); m256 accelPerm = 
limex->accelPermute; m256 accelComp = limex->accelCompare; -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); assert((idx1 & idx2) == 0); // should be no shared bits @@ -153,7 +154,7 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex, DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n"); m512 accelPerm = limex->accelPermute; m512 accelComp = limex->accelCompare; -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index 5ca8fce0..5d9b3ef8 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #define LIMEX_SHUFFLE_H #include "ue2common.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/simd_utils.h" @@ -49,7 +50,7 @@ u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { return (u32)rv; } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) static really_inline u32 packedExtract256(m256 s, const m256 permute, const m256 compare) { // vpshufb doesn't cross lanes, so this is a bit of a cheat diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 98db3f0a..322cde0a 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "nfa_api.h" #include "nfa_api_queue.h" #include "nfa_internal.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/compare.h" #include "util/simd_utils.h" @@ -168,7 +169,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, * extract a single copy of the state from the u32 for checking. 
      */
     u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101;
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
     u32 sheng_limit_x4 = sheng_limit * 0x01010101;
     m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
     m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
@@ -189,7 +190,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
     u8 s_gpr;
 
     while (c < c_end) {
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
         /* This version uses pext for efficiently bitbashing out scaled
         * versions of the bytes to process from a u64a */
diff --git a/src/nfa/multishufti.c b/src/nfa/multishufti.c
index cb85b718..80a2bcd0 100644
--- a/src/nfa/multishufti.c
+++ b/src/nfa/multishufti.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,12 +34,13 @@
 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"
 
 #include "multishufti.h"
 
 #include "multiaccel_common.h"
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
diff --git a/src/nfa/multitruffle.c b/src/nfa/multitruffle.c
index 381bda93..c333414c 100644
--- a/src/nfa/multitruffle.c
+++ b/src/nfa/multitruffle.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,6 +28,7 @@
 
 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"
 
 #include "multitruffle.h"
 #include "util/bitutils.h"
@@ -35,7 +36,7 @@
 
 #include "multiaccel_common.h"
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
diff --git a/src/nfa/multivermicelli.c b/src/nfa/multivermicelli.c
index ab6d2cf2..fe6cbdb5 100644
--- a/src/nfa/multivermicelli.c
+++ b/src/nfa/multivermicelli.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,12 +28,13 @@
 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"
 
 #include "multivermicelli.h"
 
 #include "multiaccel_common.h"
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c
index d68b1b04..f7b4403e 100644
--- a/src/nfa/shufti.c
+++ b/src/nfa/shufti.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,6 +34,7 @@
 #include "shufti.h"
 #include "ue2common.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
@@ -55,7 +56,7 @@ const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
     return buf_end;
 }
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 /* Normal SSSE3 shufti */
 static really_inline
diff --git a/src/nfa/shufti_common.h b/src/nfa/shufti_common.h
index e63ad27a..7048a8b1 100644
--- a/src/nfa/shufti_common.h
+++ b/src/nfa/shufti_common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,6 +31,7 @@
 
 #include "ue2common.h"
 
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
@@ -86,7 +87,7 @@ void dumpMsk##_t##AsChars(m##_t msk) { \
 
 #endif
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 #ifdef DEBUG
 DUMP_MSK(128)
diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c
index 1eff269a..6d82f8e1 100644
--- a/src/nfa/truffle.c
+++ b/src/nfa/truffle.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -33,12 +33,13 @@
 
 #include "ue2common.h"
 #include "truffle.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 
 #include "truffle_common.h"
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 static really_inline
 const u8 *lastMatch(const u8 *buf, u32 z) {
diff --git a/src/nfa/truffle_common.h b/src/nfa/truffle_common.h
index 7368e550..dc9c726c 100644
--- a/src/nfa/truffle_common.h
+++ b/src/nfa/truffle_common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,13 +29,14 @@
 #ifndef TRUFFLE_COMMON_H_
 #define TRUFFLE_COMMON_H_
 
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 
 /*
  * Common stuff for all versions of truffle (single, multi and multidouble)
  */
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
diff --git a/src/util/arch.h b/src/util/arch.h
new file mode 100644
index 00000000..8584ee65
--- /dev/null
+++ b/src/util/arch.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Per-platform architecture definitions
+ */
+
+#ifndef UTIL_ARCH_H_
+#define UTIL_ARCH_H_
+
+#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
+#define HAVE_SSE2
+#endif
+
+#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__))
+#define HAVE_SSE41
+#endif
+
+#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__))
+#define HAVE_SSE42
+#endif
+
+#if defined(__AVX__)
+#define HAVE_AVX
+#endif
+
+#if defined(__AVX2__)
+#define HAVE_AVX2
+#endif
+
+/*
+ * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros
+ */
+#if defined(__POPCNT__) || \
+    (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \
+    (defined(_WIN32) && defined(__AVX__))
+#define HAVE_POPCOUNT_INSTR
+#endif
+
+#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \
+    (defined(__INTEL_COMPILER) && defined(__AVX2__))
+#define HAVE_BMI
+#endif
+
+#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
+    (defined(__INTEL_COMPILER) && defined(__AVX2__))
+#define HAVE_BMI2
+#endif
+
+#endif // UTIL_ARCH_H_
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index f9e8d151..66a07571 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -35,6 +35,7 @@
 
 #include "ue2common.h"
 #include "popcount.h"
+#include "util/arch.h"
 
 #ifdef __cplusplus
 # if defined(HAVE_CXX_X86INTRIN_H)
@@ -269,7 +270,7 @@ u32 findAndClearMSB_64(u64a *v) {
 
 static really_inline
 u32 compress32(u32 x, u32 m) {
-#if defined(__BMI2__)
+#if defined(HAVE_BMI2)
     // BMI2 has a single instruction for this operation.
     return _pext_u32(x, m);
 #else
@@ -304,7 +305,7 @@ u32 compress32(u32 x, u32 m) {
 
 static really_inline
 u64a compress64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(__BMI2__)
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
     // BMI2 has a single instruction for this operation.
     return _pext_u64(x, m);
 #else
@@ -340,7 +341,7 @@ u64a compress64(u64a x, u64a m) {
 
 static really_inline
 u32 expand32(u32 x, u32 m) {
-#if defined(__BMI2__)
+#if defined(HAVE_BMI2)
     // BMI2 has a single instruction for this operation.
     return _pdep_u32(x, m);
 #else
@@ -380,7 +381,7 @@ u32 expand32(u32 x, u32 m) {
 
 static really_inline
 u64a expand64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(__BMI2__)
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
     // BMI2 has a single instruction for this operation.
     return _pdep_u64(x, m);
 #else
@@ -471,14 +472,9 @@ u32 rank_in_mask64(u64a mask, u32 bit) {
     return popcount64(mask);
 }
 
-#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
-    (defined(__INTEL_COMPILER) && defined(__AVX2__))
-#define HAVE_PEXT
-#endif
-
 static really_inline
 u32 pext32(u32 x, u32 mask) {
-#if defined(HAVE_PEXT)
+#if defined(HAVE_BMI2)
     // Intel BMI2 can do this operation in one instruction.
     return _pext_u32(x, mask);
 #else
@@ -498,7 +494,7 @@ u32 pext32(u32 x, u32 mask) {
 
 static really_inline
 u64a pext64(u64a x, u64a mask) {
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
     // Intel BMI2 can do this operation in one instruction.
     return _pext_u64(x, mask);
 #else
@@ -516,7 +512,7 @@ u64a pext64(u64a x, u64a mask) {
 #endif
 }
 
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
 static really_inline
 u64a pdep64(u64a x, u64a mask) {
     return _pdep_u64(x, mask);
diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c
index dba147ee..8ac0e63c 100644
--- a/src/util/cpuid_flags.c
+++ b/src/util/cpuid_flags.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@
 #include "ue2common.h"
 #include "hs_compile.h" // for HS_MODE_ flags
 #include "hs_internal.h"
+#include "util/arch.h"
 
 #ifndef _WIN32
 #include <cpuid.h>
@@ -131,7 +132,7 @@ u64a cpuid_flags(void) {
         cap |= HS_CPU_FEATURES_AVX2;
     }
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     cap &= ~HS_CPU_FEATURES_AVX2;
 #endif
diff --git a/src/util/masked_move.c b/src/util/masked_move.c
index ec788db7..001cd49f 100644
--- a/src/util/masked_move.c
+++ b/src/util/masked_move.c
@@ -29,8 +29,9 @@
 
 #include "ue2common.h"
 #include "masked_move.h"
+#include "util/arch.h"
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 /* masks for masked moves */
 
 /* magic mask for maskload (vmmaskmovq) - described in UE-2424 */
diff --git a/src/util/masked_move.h b/src/util/masked_move.h
index 09276e80..b51ff632 100644
--- a/src/util/masked_move.h
+++ b/src/util/masked_move.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,7 +29,9 @@
 #ifndef MASKED_MOVE_H
 #define MASKED_MOVE_H
 
-#if defined(__AVX2__)
+#include "arch.h"
+
+#if defined(HAVE_AVX2)
 
 #include "unaligned.h"
 #include "simd_utils.h"
diff --git a/src/util/math.h b/src/util/math.h
index 80ad4927..3fd69dba 100644
--- a/src/util/math.h
+++ b/src/util/math.h
@@ -29,6 +29,8 @@
 #ifndef UTIL_MATH_H_
 #define UTIL_MATH_H_
 
+#include "arch.h"
+
 #include <math.h>
 
 #ifdef __cplusplus
@@ -59,7 +61,7 @@
 static really_inline
 double our_pow(double x, double y) {
-#if defined(__AVX__)
+#if defined(HAVE_AVX)
     /*
      * Clear the upper half of AVX registers before calling into the math lib.
      * On some versions of glibc this can save thousands of AVX-to-SSE
diff --git a/src/util/popcount.h b/src/util/popcount.h
index 15361380..eb08f6b1 100644
--- a/src/util/popcount.h
+++ b/src/util/popcount.h
@@ -34,15 +34,7 @@
 #define UTIL_POPCOUNT_H_
 
 #include "ue2common.h"
-
-// We have a native popcount where the compiler has defined __POPCNT__.
-#if defined(__POPCNT__)
-#define HAVE_POPCOUNT_INSTR
-#elif defined(_WIN32) && defined(__AVX__) // TODO: fix win preproc
-#define HAVE_POPCOUNT_INSTR
-#elif defined(__INTEL_COMPILER) && defined(__SSE4_2__)
-#define HAVE_POPCOUNT_INSTR
-#endif
+#include "util/arch.h"
 
 static really_inline
 u32 popcount32(u32 x) {
diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index d6e5d6a3..ec86d4a9 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@
 #define SIMD_TYPES_H
 
 #include "config.h"
+#include "util/arch.h"
 #include "ue2common.h"
 
 // more recent headers are bestest, but only if we can use them
@@ -61,13 +62,13 @@
 #error no intrinsics!
 #endif
 
-#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
+#if defined(HAVE_SSE2)
 typedef __m128i m128;
 #else
 typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
 #endif
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 typedef __m256i m256;
 #else
 typedef ALIGN_AVX_DIRECTIVE struct {m128 lo; m128 hi;} m256;
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index 484b47c0..ccbcabbb 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -38,6 +38,8 @@
 #endif
 
 #include "config.h"
+#include "util/arch.h"
+
 #include <string.h> // for memcpy
 
 // more recent headers are bestest, but only if we can use them
@@ -141,7 +143,7 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
  * returns a 4-bit mask indicating which 64-bit words contain differences.
  */
 static really_inline u32 diffrich64_128(m128 a, m128 b) {
-#if defined(__SSE_41__)
+#if defined(HAVE_SSE41)
     a = _mm_cmpeq_epi64(a, b);
     return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
 #else
@@ -186,11 +188,11 @@ m128 load_m128_from_u64a(const u64a *p) {
 #define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
 #define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 // TODO: this entire file needs restructuring - this carveout is awful
 #define extractlow64from256(a) movq(a.lo)
 #define extractlow32from256(a) movd(a.lo)
-#if defined(__SSE4_1__)
+#if defined(HAVE_SSE41)
 #define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
 #define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2)
 #else
@@ -288,7 +290,7 @@ void clearbit128(m128 *ptr, unsigned int n) {
 static really_inline
 char testbit128(m128 val, unsigned int n) {
     const m128 mask = mask1bit128(n);
-#if defined(__SSE4_1__)
+#if defined(HAVE_SSE41)
     return !_mm_testz_si128(mask, val);
 #else
     return isnonzero128(and128(mask, val));
@@ -307,7 +309,7 @@ m128 pshufb(m128 a, m128 b) {
 
 static really_inline
 m256 vpshufb(m256 a, m256 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return _mm256_shuffle_epi8(a, b);
 #else
     m256 rv;
@@ -348,7 +350,7 @@ m128 sub_u8_m128(m128 a, m128 b) {
 **** 256-bit Primitives
 ****/
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 
 #define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
 #define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
@@ -413,7 +415,7 @@ m256 set2x128(m128 a) {
 #endif
 
 static really_inline m256 zeroes256(void) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return _mm256_setzero_si256();
 #else
     m256 rv = {zeroes128(), zeroes128()};
@@ -422,7 +424,7 @@ static really_inline m256 zeroes256(void) {
 }
 
 static really_inline m256 ones256(void) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     m256 rv = _mm256_set1_epi8(0xFF);
 #else
     m256 rv = {ones128(), ones128()};
@@ -430,7 +432,7 @@ static really_inline m256 ones256(void) {
     return rv;
 }
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 and256(m256 a, m256 b) {
     return _mm256_and_si256(a, b);
 }
@@ -443,7 +445,7 @@ static really_inline m256 and256(m256 a, m256 b) {
 }
 #endif
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 or256(m256 a, m256 b) {
     return _mm256_or_si256(a, b);
 }
@@ -456,7 +458,7 @@ static really_inline m256 or256(m256 a, m256 b) {
 }
 #endif
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 xor256(m256 a, m256 b) {
     return _mm256_xor_si256(a, b);
 }
@@ -469,7 +471,7 @@ static really_inline m256 xor256(m256 a, m256 b) {
 }
 #endif
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 not256(m256 a) {
     return _mm256_xor_si256(a, ones256());
 }
@@ -482,7 +484,7 @@ static really_inline m256 not256(m256 a) {
 }
 #endif
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 andnot256(m256 a, m256 b) {
     return _mm256_andnot_si256(a, b);
 }
@@ -496,7 +498,7 @@ static really_inline m256 andnot256(m256 a, m256 b) {
 #endif
 
 static really_inline int diff256(m256 a, m256 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
 #else
     return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
@@ -504,7 +506,7 @@ static really_inline int diff256(m256 a, m256 b) {
 }
 
 static really_inline int isnonzero256(m256 a) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return !!diff256(a, zeroes256());
 #else
     return isnonzero128(or128(a.lo, a.hi));
@@ -516,7 +518,7 @@ static really_inline int isnonzero256(m256 a) {
  * mask indicating which 32-bit words contain differences.
  */
 static really_inline u32 diffrich256(m256 a, m256 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     a = _mm256_cmpeq_epi32(a, b);
     return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
 #else
@@ -540,7 +542,7 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) {
 // aligned load
 static really_inline m256 load256(const void *ptr) {
     assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return _mm256_load_si256((const m256 *)ptr);
 #else
     m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
@@ -550,7 +552,7 @@ static really_inline m256 load256(const void *ptr) {
 
 // aligned load  of 128-bit value to low and high part of 256-bit value
 static really_inline m256 load2x128(const void *ptr) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return set2x128(load128(ptr));
 #else
     assert(ISALIGNED_N(ptr, alignof(m128)));
@@ -567,7 +569,7 @@ static really_inline m256 loadu2x128(const void *ptr) {
 // aligned store
 static really_inline void store256(void *ptr, m256 a) {
     assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     _mm256_store_si256((m256 *)ptr, a);
 #else
     ptr = assume_aligned(ptr, 16);
@@ -577,7 +579,7 @@ static really_inline void store256(void *ptr, m256 a) {
 
 // unaligned load
 static really_inline m256 loadu256(const void *ptr) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return _mm256_loadu_si256((const m256 *)ptr);
 #else
     m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
@@ -587,7 +589,7 @@ static really_inline m256 loadu256(const void *ptr) {
 
 // unaligned store
 static really_inline void storeu256(void *ptr, m256 a) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     _mm256_storeu_si256((m256 *)ptr, a);
 #else
     storeu128(ptr, a.lo);
@@ -619,7 +621,7 @@ m256 mask1bit256(unsigned int n) {
     return loadu256(&simd_onebit_masks[mask_idx]);
 }
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 // switches on bit N in the given vector.
 static really_inline
 void setbit256(m256 *ptr, unsigned int n) {
@@ -971,7 +973,7 @@ static really_inline int diff512(m512 a, m512 b) {
 }
 
 static really_inline int isnonzero512(m512 a) {
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     m128 x = or128(a.lo.lo, a.lo.hi);
     m128 y = or128(a.hi.lo, a.hi.hi);
     return isnonzero128(or128(x, y));
@@ -986,7 +988,7 @@ static really_inline int isnonzero512(m512 a) {
  * mask indicating which 32-bit words contain differences.
  */
 static really_inline u32 diffrich512(m512 a, m512 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
 #else
     a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
@@ -1018,7 +1020,7 @@ static really_inline m512 load512(const void *ptr) {
 // aligned store
 static really_inline void store512(void *ptr, m512 a) {
     assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     m512 *x = (m512 *)ptr;
     store256(&x->lo, a.lo);
     store256(&x->hi, a.hi);
@@ -1054,7 +1056,7 @@ m512 loadbytes512(const void *ptr, unsigned int n) {
 static really_inline
 void setbit512(m512 *ptr, unsigned int n) {
     assert(n < sizeof(*ptr) * 8);
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     m128 *sub;
     if (n < 128) {
         sub = &ptr->lo.lo;
@@ -1082,7 +1084,7 @@ void setbit512(m512 *ptr, unsigned int n) {
 static really_inline
 void clearbit512(m512 *ptr, unsigned int n) {
     assert(n < sizeof(*ptr) * 8);
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     m128 *sub;
     if (n < 128) {
         sub = &ptr->lo.lo;
@@ -1110,7 +1112,7 @@ void clearbit512(m512 *ptr, unsigned int n) {
 static really_inline
 char testbit512(m512 val, unsigned int n) {
     assert(n < sizeof(val) * 8);
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     m128 sub;
     if (n < 128) {
         sub = val.lo.lo;
diff --git a/src/util/state_compress.c b/src/util/state_compress.c
index 2a821dad..87e62429 100644
--- a/src/util/state_compress.c
+++ b/src/util/state_compress.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,6 +31,7 @@
  */
 #include "config.h"
 #include "ue2common.h"
+#include "arch.h"
 #include "bitutils.h"
 #include "unaligned.h"
 #include "pack_bits.h"
@@ -262,7 +263,7 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
                   expand32(v[4], m[4]), expand32(v[5], m[5]),
                   expand32(v[6], m[6]), expand32(v[7], m[7]) };
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
                   .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
 #else
@@ -289,7 +290,7 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
     u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                   expand64(v[2], m[2]), expand64(v[3], m[3]) };
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
                   .hi = _mm_set_epi64x(x[3], x[2]) };
 #else
@@ -546,7 +547,7 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
                   expand32(v[14], m[14]), expand32(v[15], m[15]) };
 
     m512 xvec;
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
     xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
     xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
@@ -581,7 +582,7 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
                   expand64(v[4], m[4]), expand64(v[5], m[5]),
                   expand64(v[6], m[6]), expand64(v[7], m[7]) };
 
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
     m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
                           _mm_set_epi64x(x[3], x[2]) },
                   .hi = { _mm_set_epi64x(x[5], x[4]),
diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp
index 31aaf17f..7241c0b8 100644
--- a/unit/internal/bitutils.cpp
+++ b/unit/internal/bitutils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,6 +29,7 @@
 #include "config.h"
 
 #include "gtest/gtest.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/popcount.h"
 
@@ -437,7 +438,7 @@ TEST(BitUtils, rank_in_mask64) {
     ASSERT_EQ(31, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 63));
 }
 
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
 TEST(BitUtils, pdep64) {
     u64a data = 0xF123456789ABCDEF;
     ASSERT_EQ(0xfULL, pdep64(data, 0xf));
diff --git a/unit/internal/database.cpp b/unit/internal/database.cpp
index cb3e76b5..fa34ead2 100644
--- a/unit/internal/database.cpp
+++ b/unit/internal/database.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -33,6 +33,7 @@
 #include "crc32.h"
 #include "database.h"
 #include "ue2common.h"
+#include "util/arch.h"
 #include "util/target_info.h"
 
 #include "gtest/gtest.h"
@@ -47,7 +48,7 @@ TEST(DB, flagsToPlatform) {
 
     p.cpu_features = 0;
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
     p.cpu_features |= HS_CPU_FEATURES_AVX2;
 #endif
diff --git a/unit/internal/masked_move.cpp b/unit/internal/masked_move.cpp
index 6a2d742d..7bd78c50 100644
--- a/unit/internal/masked_move.cpp
+++ b/unit/internal/masked_move.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,11 +31,12 @@
 #include <string.h>
 
 #include "gtest/gtest.h"
+#include "util/arch.h"
 #include "util/masked_move.h"
 
 namespace {
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 
 bool try_mask_len(const u8 *buf, u8 *target, size_t len) {
     memset(target, 0, 32);
diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp
index a4632c36..fcf337f2 100644
--- a/unit/internal/shuffle.cpp
+++ b/unit/internal/shuffle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@
 
 #include "gtest/gtest.h"
 
+#include "util/arch.h"
 #include "util/simd_utils.h"
 #include "nfa/limex_shuffle.h"
 
@@ -194,7 +195,7 @@ TEST(Shuffle, PackedExtract128_1) {
     }
 }
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 TEST(Shuffle, PackedExtract256_1) {
     // Try all possible one-bit masks
     for (unsigned int i = 0; i < 256; i++) {
diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index 7b34d92e..31b72648 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,7 @@
 #include "gtest/gtest.h"
 
 #include "util/alloc.h"
+#include "util/arch.h"
 #include "util/make_unique.h"
 #include "util/simd_utils.h"
 
@@ -620,7 +621,7 @@ TEST(SimdUtilsTest, set4x32) {
     ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
}
 
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 TEST(SimdUtilsTest, set32x8) {
     char cmp[sizeof(m256)];
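
Note on the crc32.c hunks above: selection between the hardware and table-driven CRC32-C implementations now keys off HAVE_SSE42 from util/arch.h rather than testing __SSE4_2__ directly. As a minimal standalone sketch of that dispatch pattern (a hypothetical helper, not the tree's code; it assumes an x86-64 target where the eight-byte _mm_crc32_u64 form exists, and its fallback is a slow bitwise loop rather than the real slicing-by-8 tables):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>
    #include "util/arch.h"
    #if defined(HAVE_SSE42)
    #include <nmmintrin.h>
    #endif

    static uint32_t crc32c_sketch(uint32_t crc, const unsigned char *buf,
                                  size_t len) {
    #if defined(HAVE_SSE42)
        /* Hardware CRC32-C: fold in eight bytes per instruction. */
        while (len >= 8) {
            uint64_t chunk;
            memcpy(&chunk, buf, 8); /* unaligned-safe load */
            crc = (uint32_t)_mm_crc32_u64(crc, chunk);
            buf += 8;
            len -= 8;
        }
        while (len--) {
            crc = _mm_crc32_u8(crc, *buf++);
        }
    #else
        /* Bit-at-a-time fallback, reflected polynomial 0x82F63B78. */
        while (len--) {
            crc ^= *buf++;
            for (int i = 0; i < 8; i++) {
                crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78U : 0);
            }
        }
    #endif
        return crc;
    }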
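
Note on the bitutils.h hunks: the file-local HAVE_PEXT probe is retired in favour of the shared HAVE_BMI2 macro. For readers unfamiliar with PEXT, here is a sketch of its semantics with a portable fallback (an illustrative bit loop only; the tree's compress32() fallback uses a faster parallel-suffix method):

    #include <stdint.h>
    #include "util/arch.h"
    #if defined(HAVE_BMI2)
    #include <immintrin.h>
    #endif

    /* Gather the bits of x selected by mask into the low bits of the result. */
    static uint32_t pext32_sketch(uint32_t x, uint32_t mask) {
    #if defined(HAVE_BMI2)
        return _pext_u32(x, mask);     /* single PEXT instruction */
    #else
        uint32_t out = 0;
        for (uint32_t bit = 1; mask != 0; bit <<= 1) {
            if (x & (mask & -mask)) {  /* test the lowest set bit of mask */
                out |= bit;
            }
            mask &= mask - 1;          /* clear the lowest set bit */
        }
        return out;
    #endif
    }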
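
Similarly, popcount.h loses its local HAVE_POPCOUNT_INSTR probe, which util/arch.h now supplies. A sketch of the selection this enables (hypothetical function; the real popcount32() lives in util/popcount.h and may differ in its choice of intrinsic):

    #include <stdint.h>
    #include "util/arch.h"
    #if defined(HAVE_POPCOUNT_INSTR)
    #include <nmmintrin.h>
    #endif

    static uint32_t popcount32_sketch(uint32_t x) {
    #if defined(HAVE_POPCOUNT_INSTR)
        return (uint32_t)_mm_popcnt_u32(x);  /* POPCNT instruction */
    #else
        /* SWAR fallback: count bits in parallel, then sum the bytes. */
        x -= (x >> 1) & 0x55555555;
        x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
        return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
    #endif
    }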
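
Finally, why one shared header matters: the same HAVE_* test must agree in every translation unit that touches m256, because without AVX2 the type is a struct of two m128 halves (see the simd_types.h hunk) and every operation is emulated per half. A sketch of that one-name, two-layout pattern, mirroring the and256() pair in simd_utils.h (hypothetical helper name):

    #include "util/arch.h"
    #include "util/simd_types.h"
    #include <immintrin.h>

    /* If two translation units disagreed on the macro, they would disagree
     * on sizeof(m256) and on the ABI of every function that passes one. */
    static inline m256 and256_sketch(m256 a, m256 b) {
    #if defined(HAVE_AVX2)
        return _mm256_and_si256(a, b);      /* native 256-bit AND */
    #else
        m256 rv;
        rv.lo = _mm_and_si128(a.lo, b.lo);  /* emulate on two 128-bit halves */
        rv.hi = _mm_and_si128(a.hi, b.hi);
        return rv;
    #endif
    }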