mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-09-29 19:24:25 +03:00
Check compiler architecture flags in one place
This commit is contained in:
75
src/util/arch.h
Normal file
75
src/util/arch.h
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Per-platform architecture definitions
|
||||
*/
|
||||
|
||||
#ifndef UTIL_ARCH_H_
|
||||
#define UTIL_ARCH_H_
|
||||
|
||||
#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
|
||||
#define HAVE_SSE2
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__))
|
||||
#define HAVE_SSE41
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__))
|
||||
#define HAVE_SSE42
|
||||
#endif
|
||||
|
||||
#if defined(__AVX__)
|
||||
#define HAVE_AVX
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define HAVE_AVX2
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros
|
||||
*/
|
||||
#if defined(__POPCNT__) || \
|
||||
(defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \
|
||||
(defined(_WIN32) && defined(__AVX__))
|
||||
#define HAVE_POPCOUNT_INSTR
|
||||
#endif
|
||||
|
||||
#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \
|
||||
(defined(__INTEL_COMPILER) && defined(__AVX2__))
|
||||
#define HAVE_BMI
|
||||
#endif
|
||||
|
||||
#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
|
||||
(defined(__INTEL_COMPILER) && defined(__AVX2__))
|
||||
#define HAVE_BMI2
|
||||
#endif
|
||||
|
||||
#endif // UTIL_ARCH_H_
|
@@ -35,6 +35,7 @@
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "popcount.h"
|
||||
#include "util/arch.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
# if defined(HAVE_CXX_X86INTRIN_H)
|
||||
@@ -269,7 +270,7 @@ u32 findAndClearMSB_64(u64a *v) {
|
||||
|
||||
static really_inline
|
||||
u32 compress32(u32 x, u32 m) {
|
||||
#if defined(__BMI2__)
|
||||
#if defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pext_u32(x, m);
|
||||
#else
|
||||
@@ -304,7 +305,7 @@ u32 compress32(u32 x, u32 m) {
|
||||
|
||||
static really_inline
|
||||
u64a compress64(u64a x, u64a m) {
|
||||
#if defined(ARCH_X86_64) && defined(__BMI2__)
|
||||
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pext_u64(x, m);
|
||||
#else
|
||||
@@ -340,7 +341,7 @@ u64a compress64(u64a x, u64a m) {
|
||||
|
||||
static really_inline
|
||||
u32 expand32(u32 x, u32 m) {
|
||||
#if defined(__BMI2__)
|
||||
#if defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pdep_u32(x, m);
|
||||
#else
|
||||
@@ -380,7 +381,7 @@ u32 expand32(u32 x, u32 m) {
|
||||
|
||||
static really_inline
|
||||
u64a expand64(u64a x, u64a m) {
|
||||
#if defined(ARCH_X86_64) && defined(__BMI2__)
|
||||
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pdep_u64(x, m);
|
||||
#else
|
||||
@@ -471,14 +472,9 @@ u32 rank_in_mask64(u64a mask, u32 bit) {
|
||||
return popcount64(mask);
|
||||
}
|
||||
|
||||
#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
|
||||
(defined(__INTEL_COMPILER) && defined(__AVX2__))
|
||||
#define HAVE_PEXT
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
u32 pext32(u32 x, u32 mask) {
|
||||
#if defined(HAVE_PEXT)
|
||||
#if defined(HAVE_BMI2)
|
||||
// Intel BMI2 can do this operation in one instruction.
|
||||
return _pext_u32(x, mask);
|
||||
#else
|
||||
@@ -498,7 +494,7 @@ u32 pext32(u32 x, u32 mask) {
|
||||
|
||||
static really_inline
|
||||
u64a pext64(u64a x, u64a mask) {
|
||||
#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
|
||||
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
|
||||
// Intel BMI2 can do this operation in one instruction.
|
||||
return _pext_u64(x, mask);
|
||||
#else
|
||||
@@ -516,7 +512,7 @@ u64a pext64(u64a x, u64a mask) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(HAVE_PEXT) && defined(ARCH_64_BIT)
|
||||
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
|
||||
static really_inline
|
||||
u64a pdep64(u64a x, u64a mask) {
|
||||
return _pdep_u64(x, mask);
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "ue2common.h"
|
||||
#include "hs_compile.h" // for HS_MODE_ flags
|
||||
#include "hs_internal.h"
|
||||
#include "util/arch.h"
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <cpuid.h>
|
||||
@@ -131,7 +132,7 @@ u64a cpuid_flags(void) {
|
||||
cap |= HS_CPU_FEATURES_AVX2;
|
||||
}
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
cap &= ~HS_CPU_FEATURES_AVX2;
|
||||
#endif
|
||||
|
||||
|
@@ -29,8 +29,9 @@
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "masked_move.h"
|
||||
#include "util/arch.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
/* masks for masked moves */
|
||||
|
||||
/* magic mask for maskload (vmmaskmovq) - described in UE-2424 */
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -29,7 +29,9 @@
|
||||
#ifndef MASKED_MOVE_H
|
||||
#define MASKED_MOVE_H
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#include "arch.h"
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
|
||||
#include "unaligned.h"
|
||||
#include "simd_utils.h"
|
||||
|
@@ -29,6 +29,8 @@
|
||||
#ifndef UTIL_MATH_H_
|
||||
#define UTIL_MATH_H_
|
||||
|
||||
#include "arch.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -59,7 +61,7 @@
|
||||
|
||||
static really_inline
|
||||
double our_pow(double x, double y) {
|
||||
#if defined(__AVX__)
|
||||
#if defined(HAVE_AVX)
|
||||
/*
|
||||
* Clear the upper half of AVX registers before calling into the math lib.
|
||||
* On some versions of glibc this can save thousands of AVX-to-SSE
|
||||
|
@@ -34,15 +34,7 @@
|
||||
#define UTIL_POPCOUNT_H_
|
||||
|
||||
#include "ue2common.h"
|
||||
|
||||
// We have a native popcount where the compiler has defined __POPCNT__.
|
||||
#if defined(__POPCNT__)
|
||||
#define HAVE_POPCOUNT_INSTR
|
||||
#elif defined(_WIN32) && defined(__AVX__) // TODO: fix win preproc
|
||||
#define HAVE_POPCOUNT_INSTR
|
||||
#elif defined(__INTEL_COMPILER) && defined(__SSE4_2__)
|
||||
#define HAVE_POPCOUNT_INSTR
|
||||
#endif
|
||||
#include "util/arch.h"
|
||||
|
||||
static really_inline
|
||||
u32 popcount32(u32 x) {
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -30,6 +30,7 @@
|
||||
#define SIMD_TYPES_H
|
||||
|
||||
#include "config.h"
|
||||
#include "util/arch.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
// more recent headers are bestest, but only if we can use them
|
||||
@@ -61,13 +62,13 @@
|
||||
#error no intrinsics!
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
|
||||
#if defined(HAVE_SSE2)
|
||||
typedef __m128i m128;
|
||||
#else
|
||||
typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
typedef __m256i m256;
|
||||
#else
|
||||
typedef ALIGN_AVX_DIRECTIVE struct {m128 lo; m128 hi;} m256;
|
||||
|
@@ -38,6 +38,8 @@
|
||||
#endif
|
||||
|
||||
#include "config.h"
|
||||
#include "util/arch.h"
|
||||
|
||||
#include <string.h> // for memcpy
|
||||
|
||||
// more recent headers are bestest, but only if we can use them
|
||||
@@ -141,7 +143,7 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
|
||||
* returns a 4-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich64_128(m128 a, m128 b) {
|
||||
#if defined(__SSE_41__)
|
||||
#if defined(HAVE_SSE41)
|
||||
a = _mm_cmpeq_epi64(a, b);
|
||||
return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
|
||||
#else
|
||||
@@ -186,11 +188,11 @@ m128 load_m128_from_u64a(const u64a *p) {
|
||||
#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
|
||||
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
// TODO: this entire file needs restructuring - this carveout is awful
|
||||
#define extractlow64from256(a) movq(a.lo)
|
||||
#define extractlow32from256(a) movd(a.lo)
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(HAVE_SSE41)
|
||||
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
|
||||
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2)
|
||||
#else
|
||||
@@ -288,7 +290,7 @@ void clearbit128(m128 *ptr, unsigned int n) {
|
||||
static really_inline
|
||||
char testbit128(m128 val, unsigned int n) {
|
||||
const m128 mask = mask1bit128(n);
|
||||
#if defined(__SSE4_1__)
|
||||
#if defined(HAVE_SSE41)
|
||||
return !_mm_testz_si128(mask, val);
|
||||
#else
|
||||
return isnonzero128(and128(mask, val));
|
||||
@@ -307,7 +309,7 @@ m128 pshufb(m128 a, m128 b) {
|
||||
|
||||
static really_inline
|
||||
m256 vpshufb(m256 a, m256 b) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return _mm256_shuffle_epi8(a, b);
|
||||
#else
|
||||
m256 rv;
|
||||
@@ -348,7 +350,7 @@ m128 sub_u8_m128(m128 a, m128 b) {
|
||||
**** 256-bit Primitives
|
||||
****/
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
|
||||
#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
|
||||
|
||||
@@ -413,7 +415,7 @@ m256 set2x128(m128 a) {
|
||||
#endif
|
||||
|
||||
static really_inline m256 zeroes256(void) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return _mm256_setzero_si256();
|
||||
#else
|
||||
m256 rv = {zeroes128(), zeroes128()};
|
||||
@@ -422,7 +424,7 @@ static really_inline m256 zeroes256(void) {
|
||||
}
|
||||
|
||||
static really_inline m256 ones256(void) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
m256 rv = _mm256_set1_epi8(0xFF);
|
||||
#else
|
||||
m256 rv = {ones128(), ones128()};
|
||||
@@ -430,7 +432,7 @@ static really_inline m256 ones256(void) {
|
||||
return rv;
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
static really_inline m256 and256(m256 a, m256 b) {
|
||||
return _mm256_and_si256(a, b);
|
||||
}
|
||||
@@ -443,7 +445,7 @@ static really_inline m256 and256(m256 a, m256 b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
static really_inline m256 or256(m256 a, m256 b) {
|
||||
return _mm256_or_si256(a, b);
|
||||
}
|
||||
@@ -456,7 +458,7 @@ static really_inline m256 or256(m256 a, m256 b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
static really_inline m256 xor256(m256 a, m256 b) {
|
||||
return _mm256_xor_si256(a, b);
|
||||
}
|
||||
@@ -469,7 +471,7 @@ static really_inline m256 xor256(m256 a, m256 b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
static really_inline m256 not256(m256 a) {
|
||||
return _mm256_xor_si256(a, ones256());
|
||||
}
|
||||
@@ -482,7 +484,7 @@ static really_inline m256 not256(m256 a) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
static really_inline m256 andnot256(m256 a, m256 b) {
|
||||
return _mm256_andnot_si256(a, b);
|
||||
}
|
||||
@@ -496,7 +498,7 @@ static really_inline m256 andnot256(m256 a, m256 b) {
|
||||
#endif
|
||||
|
||||
static really_inline int diff256(m256 a, m256 b) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
|
||||
#else
|
||||
return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
|
||||
@@ -504,7 +506,7 @@ static really_inline int diff256(m256 a, m256 b) {
|
||||
}
|
||||
|
||||
static really_inline int isnonzero256(m256 a) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return !!diff256(a, zeroes256());
|
||||
#else
|
||||
return isnonzero128(or128(a.lo, a.hi));
|
||||
@@ -516,7 +518,7 @@ static really_inline int isnonzero256(m256 a) {
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich256(m256 a, m256 b) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
a = _mm256_cmpeq_epi32(a, b);
|
||||
return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
|
||||
#else
|
||||
@@ -540,7 +542,7 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) {
|
||||
// aligned load
|
||||
static really_inline m256 load256(const void *ptr) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return _mm256_load_si256((const m256 *)ptr);
|
||||
#else
|
||||
m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
|
||||
@@ -550,7 +552,7 @@ static really_inline m256 load256(const void *ptr) {
|
||||
|
||||
// aligned load of 128-bit value to low and high part of 256-bit value
|
||||
static really_inline m256 load2x128(const void *ptr) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return set2x128(load128(ptr));
|
||||
#else
|
||||
assert(ISALIGNED_N(ptr, alignof(m128)));
|
||||
@@ -567,7 +569,7 @@ static really_inline m256 loadu2x128(const void *ptr) {
|
||||
// aligned store
|
||||
static really_inline void store256(void *ptr, m256 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
_mm256_store_si256((m256 *)ptr, a);
|
||||
#else
|
||||
ptr = assume_aligned(ptr, 16);
|
||||
@@ -577,7 +579,7 @@ static really_inline void store256(void *ptr, m256 a) {
|
||||
|
||||
// unaligned load
|
||||
static really_inline m256 loadu256(const void *ptr) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return _mm256_loadu_si256((const m256 *)ptr);
|
||||
#else
|
||||
m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
|
||||
@@ -587,7 +589,7 @@ static really_inline m256 loadu256(const void *ptr) {
|
||||
|
||||
// unaligned store
|
||||
static really_inline void storeu256(void *ptr, m256 a) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
_mm256_storeu_si256((m256 *)ptr, a);
|
||||
#else
|
||||
storeu128(ptr, a.lo);
|
||||
@@ -619,7 +621,7 @@ m256 mask1bit256(unsigned int n) {
|
||||
return loadu256(&simd_onebit_masks[mask_idx]);
|
||||
}
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit256(m256 *ptr, unsigned int n) {
|
||||
@@ -971,7 +973,7 @@ static really_inline int diff512(m512 a, m512 b) {
|
||||
}
|
||||
|
||||
static really_inline int isnonzero512(m512 a) {
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
m128 x = or128(a.lo.lo, a.lo.hi);
|
||||
m128 y = or128(a.hi.lo, a.hi.hi);
|
||||
return isnonzero128(or128(x, y));
|
||||
@@ -986,7 +988,7 @@ static really_inline int isnonzero512(m512 a) {
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich512(m512 a, m512 b) {
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
|
||||
#else
|
||||
a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
|
||||
@@ -1018,7 +1020,7 @@ static really_inline m512 load512(const void *ptr) {
|
||||
// aligned store
|
||||
static really_inline void store512(void *ptr, m512 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
#if defined(__AVX2__)
|
||||
#if defined(HAVE_AVX2)
|
||||
m512 *x = (m512 *)ptr;
|
||||
store256(&x->lo, a.lo);
|
||||
store256(&x->hi, a.hi);
|
||||
@@ -1054,7 +1056,7 @@ m512 loadbytes512(const void *ptr, unsigned int n) {
|
||||
static really_inline
|
||||
void setbit512(m512 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
m128 *sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo.lo;
|
||||
@@ -1082,7 +1084,7 @@ void setbit512(m512 *ptr, unsigned int n) {
|
||||
static really_inline
|
||||
void clearbit512(m512 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
m128 *sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo.lo;
|
||||
@@ -1110,7 +1112,7 @@ void clearbit512(m512 *ptr, unsigned int n) {
|
||||
static really_inline
|
||||
char testbit512(m512 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
m128 sub;
|
||||
if (n < 128) {
|
||||
sub = val.lo.lo;
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015, Intel Corporation
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@@ -31,6 +31,7 @@
|
||||
*/
|
||||
#include "config.h"
|
||||
#include "ue2common.h"
|
||||
#include "arch.h"
|
||||
#include "bitutils.h"
|
||||
#include "unaligned.h"
|
||||
#include "pack_bits.h"
|
||||
@@ -262,7 +263,7 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
|
||||
expand32(v[4], m[4]), expand32(v[5], m[5]),
|
||||
expand32(v[6], m[6]), expand32(v[7], m[7]) };
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
|
||||
.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
|
||||
#else
|
||||
@@ -289,7 +290,7 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
|
||||
u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]),
|
||||
expand64(v[2], m[2]), expand64(v[3], m[3]) };
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
|
||||
.hi = _mm_set_epi64x(x[3], x[2]) };
|
||||
#else
|
||||
@@ -546,7 +547,7 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
|
||||
expand32(v[14], m[14]), expand32(v[15], m[15]) };
|
||||
|
||||
m512 xvec;
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
|
||||
xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
|
||||
xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
|
||||
@@ -581,7 +582,7 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
|
||||
expand64(v[4], m[4]), expand64(v[5], m[5]),
|
||||
expand64(v[6], m[6]), expand64(v[7], m[7]) };
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(HAVE_AVX2)
|
||||
m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
|
||||
_mm_set_epi64x(x[3], x[2]) },
|
||||
.hi = { _mm_set_epi64x(x[5], x[4]),
|
||||
|
Reference in New Issue
Block a user