Check compiler architecture flags in one place

2025-06-28 16:41:01 +03:00 · 2017-03-29 16:39:16 +11:00 · 2017-03-29 16:39:16 +11:00 · 8201183138
commit 8201183138
parent 5fe524fbb3
31 changed files with 203 additions and 109 deletions
--- a/src/crc32.c
+++ b/src/crc32.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -29,6 +29,7 @@
 #include "crc32.h"
 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"

 #if defined(HAVE_C_X86INTRIN_H)
 #include <x86intrin.h>
@ -36,7 +37,7 @@
 #include <intrin.h>
 #endif

-#ifndef __SSE4_2__
+#if !defined(HAVE_SSE42)

 /***
 *** What follows is derived from Intel's Slicing-by-8 CRC32 impl, which is BSD
@ -582,7 +583,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
    return crc;
 }

-#else // __SSE4_2__
+#else // HAVE_SSE42

 #ifdef ARCH_64_BIT
 #define CRC_WORD 8
@ -638,7 +639,7 @@ u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,

 // Externally visible function
 u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) {
-#ifdef __SSE4_2__
+#if defined(HAVE_SSE42)
    u32 crc = crc32c_sse42(inCrc32, (const unsigned char *)buf, bufLen);
 #else
    u32 crc = crc32c_sb8_64_bit(inCrc32, (const unsigned char *)buf, bufLen);
--- a/src/database.h
+++ b/src/database.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -41,6 +41,7 @@ extern "C"
 #include "hs_compile.h" // for HS_MODE_ flags
 #include "hs_version.h"
 #include "ue2common.h"
+#include "util/arch.h"

 #define HS_DB_VERSION HS_VERSION_32BIT
 #define HS_DB_MAGIC   (0xdbdbdbdbU)
@ -59,7 +60,7 @@ typedef u64a platform_t;

 static UNUSED
 const platform_t hs_current_platform = {
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    HS_PLATFORM_NOAVX2 |
 #endif
    0,
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@ -34,6 +34,7 @@
 #include "flood_runtime.h"
 #include "teddy.h"
 #include "teddy_internal.h"
+#include "util/arch.h"
 #include "util/simd_utils.h"
 #include "util/uniform_ops.h"

@ -123,7 +124,7 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
 static really_inline
 u64a andn(const u32 a, const u8 *b) {
    u64a r;
-#if defined(__BMI__)
+#if defined(HAVE_BMI)
    __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
 #else
    r = unaligned_load_u32(b) & ~a;
@ -783,7 +784,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
    return HWLM_SUCCESS;
 }

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 #define ONLY_AVX2(func) func
 #else
 #define ONLY_AVX2(func) NULL
--- a/src/fdr/teddy.h
+++ b/src/fdr/teddy.h
@ -34,6 +34,7 @@
 #define TEDDY_H_

 #include "hwlm/hwlm.h" // for hwlm_group_t
+#include "util/arch.h"

 struct FDR; // forward declaration from fdr_internal.h
 struct FDR_Runtime_Args;
@ -70,7 +71,7 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control);

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)

 hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
                                           const struct FDR_Runtime_Args *a,
@ -104,6 +105,6 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
                                               const struct FDR_Runtime_Args *a,
                                               hwlm_group_t control);

-#endif /* __AVX2__ */
+#endif /* HAVE_AVX2 */

 #endif /* TEDDY_H_ */
--- a/src/fdr/teddy_avx2.c
+++ b/src/fdr/teddy_avx2.c
@ -35,9 +35,10 @@
 #include "teddy.h"
 #include "teddy_internal.h"
 #include "teddy_runtime_common.h"
+#include "util/arch.h"
 #include "util/simd_utils.h"

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)

 #ifdef ARCH_64_BIT
 #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
@ -687,4 +688,4 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
    return HWLM_SUCCESS;
 }

-#endif // __AVX2__
+#endif // HAVE_AVX2
--- a/src/hwlm/noodle_engine.c
+++ b/src/hwlm/noodle_engine.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -33,6 +33,7 @@
 #include "noodle_engine.h"
 #include "noodle_internal.h"
 #include "ue2common.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/compare.h"
 #include "util/masked_move.h"
@ -109,7 +110,7 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen,
    return HWLM_SUCCESS;
 }

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 #define CHUNKSIZE 32
 #define MASK_TYPE m256
 #include "noodle_engine_avx2.c"
--- a/src/nfa/limex_accel.c
+++ b/src/nfa/limex_accel.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -44,6 +44,7 @@
 #include "multivermicelli.h"
 #include "ue2common.h"
 #include "vermicelli.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"

@ -118,7 +119,7 @@ size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex,
    DEBUG_PRINTF("using PSHUFB for 256-bit shuffle\n");
    m256 accelPerm = limex->accelPermute;
    m256 accelComp = limex->accelCompare;
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo);
    u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi);
    assert((idx1 & idx2) == 0); // should be no shared bits
@ -153,7 +154,7 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex,
    DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n");
    m512 accelPerm = limex->accelPermute;
    m512 accelComp = limex->accelCompare;
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo);
    u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi);
    u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo);
--- a/src/nfa/limex_shuffle.h
+++ b/src/nfa/limex_shuffle.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -38,6 +38,7 @@
 #define LIMEX_SHUFFLE_H

 #include "ue2common.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"

@ -49,7 +50,7 @@ u32 packedExtract128(m128 s, const m128 permute, const m128 compare) {
    return (u32)rv;
 }

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline
 u32 packedExtract256(m256 s, const m256 permute, const m256 compare) {
    // vpshufb doesn't cross lanes, so this is a bit of a cheat
--- a/src/nfa/mcsheng.c
+++ b/src/nfa/mcsheng.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -33,6 +33,7 @@
 #include "nfa_api.h"
 #include "nfa_api_queue.h"
 #include "nfa_internal.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/compare.h"
 #include "util/simd_utils.h"
@ -168,7 +169,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
     * extract a single copy of the state from the u32 for checking. */
    u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101;

-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
    u32 sheng_limit_x4 = sheng_limit * 0x01010101;
    m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
    m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
@ -189,7 +190,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,

    u8 s_gpr;
    while (c < c_end) {
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
        /* This version uses pext for efficently bitbashing out scaled
         * versions of the bytes to process from a u64a */

--- a/src/nfa/multishufti.c
+++ b/src/nfa/multishufti.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -34,12 +34,13 @@

 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"

 #include "multishufti.h"

 #include "multiaccel_common.h"

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)

 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
--- a/src/nfa/multitruffle.c
+++ b/src/nfa/multitruffle.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -28,6 +28,7 @@

 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"

 #include "multitruffle.h"
 #include "util/bitutils.h"
@ -35,7 +36,7 @@

 #include "multiaccel_common.h"

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)

 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
--- a/src/nfa/multivermicelli.c
+++ b/src/nfa/multivermicelli.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -28,12 +28,13 @@

 #include "config.h"
 #include "ue2common.h"
+#include "util/arch.h"

 #include "multivermicelli.h"

 #include "multiaccel_common.h"

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)

 #define MATCH_ALGO long_
 #include "multiaccel_long.h"
--- a/src/nfa/shufti.c
+++ b/src/nfa/shufti.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -34,6 +34,7 @@

 #include "shufti.h"
 #include "ue2common.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
@ -55,7 +56,7 @@ const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
    return buf_end;
 }

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 /* Normal SSSE3 shufti */

 static really_inline
--- a/src/nfa/shufti_common.h
+++ b/src/nfa/shufti_common.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -31,6 +31,7 @@

 #include "ue2common.h"

+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
@ -86,7 +87,7 @@ void dumpMsk##_t##AsChars(m##_t msk) {              \

 #endif

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)

 #ifdef DEBUG
 DUMP_MSK(128)
--- a/src/nfa/truffle.c
+++ b/src/nfa/truffle.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -33,12 +33,13 @@

 #include "ue2common.h"
 #include "truffle.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"

 #include "truffle_common.h"

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)

 static really_inline
 const u8 *lastMatch(const u8 *buf, u32 z) {
--- a/src/nfa/truffle_common.h
+++ b/src/nfa/truffle_common.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -29,13 +29,14 @@
 #ifndef TRUFFLE_COMMON_H_
 #define TRUFFLE_COMMON_H_

+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"

 /*
 * Common stuff for all versions of truffle (single, multi and multidouble)
 */
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)

 static really_inline
 const u8 *firstMatch(const u8 *buf, u32 z) {
--- a/src/util/arch.h
+++ b/src/util/arch.h
@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Per-platform architecture definitions
+ */
+
+#ifndef UTIL_ARCH_H_
+#define UTIL_ARCH_H_
+
+#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
+#define HAVE_SSE2
+#endif
+
+#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__))
+#define HAVE_SSE41
+#endif
+
+#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__))
+#define HAVE_SSE42
+#endif
+
+#if defined(__AVX__)
+#define HAVE_AVX
+#endif
+
+#if defined(__AVX2__)
+#define HAVE_AVX2
+#endif
+
+/*
+ * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros
+ */
+#if defined(__POPCNT__) ||                                                     \
+    (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) ||                      \
+    (defined(_WIN32) && defined(__AVX__))
+#define HAVE_POPCOUNT_INSTR
+#endif
+
+#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) ||              \
+    (defined(__INTEL_COMPILER) && defined(__AVX2__))
+#define HAVE_BMI
+#endif
+
+#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) ||             \
+    (defined(__INTEL_COMPILER) && defined(__AVX2__))
+#define HAVE_BMI2
+#endif
+
+#endif // UTIL_ARCH_H_
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@ -35,6 +35,7 @@

 #include "ue2common.h"
 #include "popcount.h"
+#include "util/arch.h"

 #ifdef __cplusplus
 # if defined(HAVE_CXX_X86INTRIN_H)
@ -269,7 +270,7 @@ u32 findAndClearMSB_64(u64a *v) {

 static really_inline
 u32 compress32(u32 x, u32 m) {
-#if defined(__BMI2__)
+#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u32(x, m);
 #else
@ -304,7 +305,7 @@ u32 compress32(u32 x, u32 m) {

 static really_inline
 u64a compress64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(__BMI2__)
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u64(x, m);
 #else
@ -340,7 +341,7 @@ u64a compress64(u64a x, u64a m) {

 static really_inline
 u32 expand32(u32 x, u32 m) {
-#if defined(__BMI2__)
+#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u32(x, m);
 #else
@ -380,7 +381,7 @@ u32 expand32(u32 x, u32 m) {

 static really_inline
 u64a expand64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(__BMI2__)
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u64(x, m);
 #else
@ -471,14 +472,9 @@ u32 rank_in_mask64(u64a mask, u32 bit) {
    return popcount64(mask);
 }

-#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) ||             \
-    (defined(__INTEL_COMPILER) && defined(__AVX2__))
-#define HAVE_PEXT
-#endif
-
 static really_inline
 u32 pext32(u32 x, u32 mask) {
-#if defined(HAVE_PEXT)
+#if defined(HAVE_BMI2)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u32(x, mask);
 #else
@ -498,7 +494,7 @@ u32 pext32(u32 x, u32 mask) {

 static really_inline
 u64a pext64(u64a x, u64a mask) {
-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u64(x, mask);
 #else
@ -516,7 +512,7 @@ u64a pext64(u64a x, u64a mask) {
 #endif
 }

-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
 static really_inline
 u64a pdep64(u64a x, u64a mask) {
    return _pdep_u64(x, mask);
--- a/src/util/cpuid_flags.c
+++ b/src/util/cpuid_flags.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,6 +30,7 @@
 #include "ue2common.h"
 #include "hs_compile.h" // for HS_MODE_ flags
 #include "hs_internal.h"
+#include "util/arch.h"

 #ifndef _WIN32
 #include <cpuid.h>
@ -131,7 +132,7 @@ u64a cpuid_flags(void) {
        cap |= HS_CPU_FEATURES_AVX2;
    }

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    cap &= ~HS_CPU_FEATURES_AVX2;
 #endif

--- a/src/util/masked_move.c
+++ b/src/util/masked_move.c
@ -29,8 +29,9 @@

 #include "ue2common.h"
 #include "masked_move.h"
+#include "util/arch.h"

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 /* masks for masked moves */

 /* magic mask for maskload (vmmaskmovq) - described in UE-2424 */
--- a/src/util/masked_move.h
+++ b/src/util/masked_move.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -29,7 +29,9 @@
 #ifndef MASKED_MOVE_H
 #define MASKED_MOVE_H

-#if defined(__AVX2__)
+#include "arch.h"
+
+#if defined(HAVE_AVX2)

 #include "unaligned.h"
 #include "simd_utils.h"
--- a/src/util/math.h
+++ b/src/util/math.h
@ -29,6 +29,8 @@
 #ifndef UTIL_MATH_H_
 #define UTIL_MATH_H_

+#include "arch.h"
+
 #include <math.h>

 #ifdef __cplusplus
@ -59,7 +61,7 @@

 static really_inline
 double our_pow(double x, double y) {
-#if defined(__AVX__)
+#if defined(HAVE_AVX)
    /*
     * Clear the upper half of AVX registers before calling into the math lib.
     * On some versions of glibc this can save thousands of AVX-to-SSE
--- a/src/util/popcount.h
+++ b/src/util/popcount.h
@ -34,15 +34,7 @@
 #define UTIL_POPCOUNT_H_

 #include "ue2common.h"
-
-// We have a native popcount where the compiler has defined __POPCNT__.
-#if defined(__POPCNT__)
-#define HAVE_POPCOUNT_INSTR
-#elif defined(_WIN32) && defined(__AVX__) // TODO: fix win preproc
-#define HAVE_POPCOUNT_INSTR
-#elif defined(__INTEL_COMPILER) && defined(__SSE4_2__)
-#define HAVE_POPCOUNT_INSTR
-#endif
+#include "util/arch.h"

 static really_inline
 u32 popcount32(u32 x) {
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,6 +30,7 @@
 #define SIMD_TYPES_H

 #include "config.h"
+#include "util/arch.h"
 #include "ue2common.h"

 // more recent headers are bestest, but only if we can use them
@ -61,13 +62,13 @@
 #error no intrinsics!
 #endif

-#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
+#if defined(HAVE_SSE2)
 typedef __m128i m128;
 #else
 typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
 #endif

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 typedef __m256i m256;
 #else
 typedef ALIGN_AVX_DIRECTIVE struct {m128 lo; m128 hi;} m256;
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@ -38,6 +38,8 @@
 #endif

 #include "config.h"
+#include "util/arch.h"
+
 #include <string.h> // for memcpy

 // more recent headers are bestest, but only if we can use them
@ -141,7 +143,7 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
 * returns a 4-bit mask indicating which 64-bit words contain differences.
 */
 static really_inline u32 diffrich64_128(m128 a, m128 b) {
-#if defined(__SSE_41__)
+#if defined(HAVE_SSE41)
    a = _mm_cmpeq_epi64(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
 #else
@ -186,11 +188,11 @@ m128 load_m128_from_u64a(const u64a *p) {
 #define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
 #define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 // TODO: this entire file needs restructuring - this carveout is awful
 #define extractlow64from256(a) movq(a.lo)
 #define extractlow32from256(a) movd(a.lo)
-#if defined(__SSE4_1__)
+#if defined(HAVE_SSE41)
 #define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
 #define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2)
 #else
@ -288,7 +290,7 @@ void clearbit128(m128 *ptr, unsigned int n) {
 static really_inline
 char testbit128(m128 val, unsigned int n) {
    const m128 mask = mask1bit128(n);
-#if defined(__SSE4_1__)
+#if defined(HAVE_SSE41)
    return !_mm_testz_si128(mask, val);
 #else
    return isnonzero128(and128(mask, val));
@ -307,7 +309,7 @@ m128 pshufb(m128 a, m128 b) {

 static really_inline
 m256 vpshufb(m256 a, m256 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return _mm256_shuffle_epi8(a, b);
 #else
    m256 rv;
@ -348,7 +350,7 @@ m128 sub_u8_m128(m128 a, m128 b) {
 **** 256-bit Primitives
 ****/

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 #define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
 #define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))

@ -413,7 +415,7 @@ m256 set2x128(m128 a) {
 #endif

 static really_inline m256 zeroes256(void) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return _mm256_setzero_si256();
 #else
    m256 rv = {zeroes128(), zeroes128()};
@ -422,7 +424,7 @@ static really_inline m256 zeroes256(void) {
 }

 static really_inline m256 ones256(void) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    m256 rv = _mm256_set1_epi8(0xFF);
 #else
    m256 rv = {ones128(), ones128()};
@ -430,7 +432,7 @@ static really_inline m256 ones256(void) {
    return rv;
 }

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 and256(m256 a, m256 b) {
    return _mm256_and_si256(a, b);
 }
@ -443,7 +445,7 @@ static really_inline m256 and256(m256 a, m256 b) {
 }
 #endif

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 or256(m256 a, m256 b) {
    return _mm256_or_si256(a, b);
 }
@ -456,7 +458,7 @@ static really_inline m256 or256(m256 a, m256 b) {
 }
 #endif

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 xor256(m256 a, m256 b) {
    return _mm256_xor_si256(a, b);
 }
@ -469,7 +471,7 @@ static really_inline m256 xor256(m256 a, m256 b) {
 }
 #endif

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 not256(m256 a) {
    return _mm256_xor_si256(a, ones256());
 }
@ -482,7 +484,7 @@ static really_inline m256 not256(m256 a) {
 }
 #endif

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 static really_inline m256 andnot256(m256 a, m256 b) {
    return _mm256_andnot_si256(a, b);
 }
@ -496,7 +498,7 @@ static really_inline m256 andnot256(m256 a, m256 b) {
 #endif

 static really_inline int diff256(m256 a, m256 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
 #else
    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
@ -504,7 +506,7 @@ static really_inline int diff256(m256 a, m256 b) {
 }

 static really_inline int isnonzero256(m256 a) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return !!diff256(a, zeroes256());
 #else
    return isnonzero128(or128(a.lo, a.hi));
@ -516,7 +518,7 @@ static really_inline int isnonzero256(m256 a) {
 * mask indicating which 32-bit words contain differences.
 */
 static really_inline u32 diffrich256(m256 a, m256 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    a = _mm256_cmpeq_epi32(a, b);
    return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
 #else
@ -540,7 +542,7 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) {
 // aligned load
 static really_inline m256 load256(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return _mm256_load_si256((const m256 *)ptr);
 #else
    m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
@ -550,7 +552,7 @@ static really_inline m256 load256(const void *ptr) {

 // aligned load  of 128-bit value to low and high part of 256-bit value
 static really_inline m256 load2x128(const void *ptr) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return set2x128(load128(ptr));
 #else
    assert(ISALIGNED_N(ptr, alignof(m128)));
@ -567,7 +569,7 @@ static really_inline m256 loadu2x128(const void *ptr) {
 // aligned store
 static really_inline void store256(void *ptr, m256 a) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    _mm256_store_si256((m256 *)ptr, a);
 #else
    ptr = assume_aligned(ptr, 16);
@ -577,7 +579,7 @@ static really_inline void store256(void *ptr, m256 a) {

 // unaligned load
 static really_inline m256 loadu256(const void *ptr) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return _mm256_loadu_si256((const m256 *)ptr);
 #else
    m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
@ -587,7 +589,7 @@ static really_inline m256 loadu256(const void *ptr) {

 // unaligned store
 static really_inline void storeu256(void *ptr, m256 a) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    _mm256_storeu_si256((m256 *)ptr, a);
 #else
    storeu128(ptr, a.lo);
@ -619,7 +621,7 @@ m256 mask1bit256(unsigned int n) {
    return loadu256(&simd_onebit_masks[mask_idx]);
 }

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 // switches on bit N in the given vector.
 static really_inline
 void setbit256(m256 *ptr, unsigned int n) {
@ -971,7 +973,7 @@ static really_inline int diff512(m512 a, m512 b) {
 }

 static really_inline int isnonzero512(m512 a) {
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    m128 x = or128(a.lo.lo, a.lo.hi);
    m128 y = or128(a.hi.lo, a.hi.hi);
    return isnonzero128(or128(x, y));
@ -986,7 +988,7 @@ static really_inline int isnonzero512(m512 a) {
 * mask indicating which 32-bit words contain differences.
 */
 static really_inline u32 diffrich512(m512 a, m512 b) {
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
 #else
    a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
@ -1018,7 +1020,7 @@ static really_inline m512 load512(const void *ptr) {
 // aligned store
 static really_inline void store512(void *ptr, m512 a) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    m512 *x = (m512 *)ptr;
    store256(&x->lo, a.lo);
    store256(&x->hi, a.hi);
@ -1054,7 +1056,7 @@ m512 loadbytes512(const void *ptr, unsigned int n) {
 static really_inline
 void setbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
@ -1082,7 +1084,7 @@ void setbit512(m512 *ptr, unsigned int n) {
 static really_inline
 void clearbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
@ -1110,7 +1112,7 @@ void clearbit512(m512 *ptr, unsigned int n) {
 static really_inline
 char testbit512(m512 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    m128 sub;
    if (n < 128) {
        sub = val.lo.lo;
--- a/src/util/state_compress.c
+++ b/src/util/state_compress.c
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -31,6 +31,7 @@
 */
 #include "config.h"
 #include "ue2common.h"
+#include "arch.h"
 #include "bitutils.h"
 #include "unaligned.h"
 #include "pack_bits.h"
@ -262,7 +263,7 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
                 expand32(v[4], m[4]), expand32(v[5], m[5]),
                 expand32(v[6], m[6]), expand32(v[7], m[7]) };

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
                  .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
 #else
@ -289,7 +290,7 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
    u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
                  expand64(v[2], m[2]), expand64(v[3], m[3]) };

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
                  .hi = _mm_set_epi64x(x[3], x[2]) };
 #else
@ -546,7 +547,7 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
                  expand32(v[14], m[14]), expand32(v[15], m[15]) };

    m512 xvec;
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
    xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
    xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
@ -581,7 +582,7 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
                  expand64(v[4], m[4]), expand64(v[5], m[5]),
                  expand64(v[6], m[6]), expand64(v[7], m[7]) };

-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
    m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
                          _mm_set_epi64x(x[3], x[2]) },
                  .hi = { _mm_set_epi64x(x[5], x[4]),
--- a/unit/internal/bitutils.cpp
+++ b/unit/internal/bitutils.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -29,6 +29,7 @@
 #include "config.h"

 #include "gtest/gtest.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/popcount.h"

@ -437,7 +438,7 @@ TEST(BitUtils, rank_in_mask64) {
    ASSERT_EQ(31, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 63));
 }

-#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
 TEST(BitUtils, pdep64) {
    u64a data = 0xF123456789ABCDEF;
    ASSERT_EQ(0xfULL, pdep64(data, 0xf));
--- a/unit/internal/database.cpp
+++ b/unit/internal/database.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -33,6 +33,7 @@
 #include "crc32.h"
 #include "database.h"
 #include "ue2common.h"
+#include "util/arch.h"
 #include "util/target_info.h"

 #include "gtest/gtest.h"
@ -47,7 +48,7 @@ TEST(DB, flagsToPlatform) {

    p.cpu_features = 0;

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
    p.cpu_features |= HS_CPU_FEATURES_AVX2;
 #endif

--- a/unit/internal/masked_move.cpp
+++ b/unit/internal/masked_move.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -31,11 +31,12 @@
 #include <cstring>

 #include "gtest/gtest.h"
+#include "util/arch.h"
 #include "util/masked_move.h"

 namespace {

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)

 bool try_mask_len(const u8 *buf, u8 *target, size_t len) {
    memset(target, 0, 32);
--- a/unit/internal/shuffle.cpp
+++ b/unit/internal/shuffle.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,6 +30,7 @@

 #include "gtest/gtest.h"

+#include "util/arch.h"
 #include "util/simd_utils.h"
 #include "nfa/limex_shuffle.h"

@ -194,7 +195,7 @@ TEST(Shuffle, PackedExtract128_1) {
    }
 }

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 TEST(Shuffle, PackedExtract256_1) {
    // Try all possible one-bit masks
    for (unsigned int i = 0; i < 256; i++) {
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,6 +30,7 @@

 #include "gtest/gtest.h"
 #include "util/alloc.h"
+#include "util/arch.h"
 #include "util/make_unique.h"
 #include "util/simd_utils.h"

@ -620,7 +621,7 @@ TEST(SimdUtilsTest, set4x32) {
    ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
 }

-#if defined(__AVX2__)
+#if defined(HAVE_AVX2)
 TEST(SimdUtilsTest, set32x8) {
    char cmp[sizeof(m256)];