Merge pull request #2 from VectorCamp/develop

Develop
Konstantinos Margaritis 2020-12-21 20:50:27 +02:00 committed by GitHub
commit 124455a4a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
51 changed files with 3594 additions and 1952 deletions

View File

@ -175,7 +175,7 @@ else()
string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
endforeach ()
if (CMAKE_COMPILER_IS_GNUCC)
if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC)
message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
# If gcc doesn't recognise the host cpu, then mtune=native becomes
# generic, which isn't very good in some cases. march=native looks at
@ -281,10 +281,16 @@ else()
endif()
CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
if (ARCH_IA32 OR ARCH_X86_64)
CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
elseif (ARCH_ARM32 OR ARCH_AARCH64)
CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions")
endif()
CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
@ -564,11 +570,22 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs")
set (hs_exec_common_SRCS
src/alloc.c
src/scratch.c
src/util/cpuid_flags.c
src/util/cpuid_flags.h
src/util/arch/common/cpuid_flags.h
src/util/multibit.c
)
if (ARCH_IA32 OR ARCH_X86_64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/x86/cpuid_flags.c
)
else (ARCH_ARM32 OR ARCH_AARCH64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/arm/cpuid_flags.c
)
endif ()
set (hs_exec_SRCS
${hs_HEADERS}
src/hs_version.h
@ -694,7 +711,6 @@ set (hs_exec_SRCS
src/util/exhaust.h
src/util/fatbit.h
src/util/join.h
src/util/masked_move.h
src/util/multibit.h
src/util/multibit.c
src/util/multibit_compress.h
@ -716,7 +732,8 @@ set (hs_exec_SRCS
set (hs_exec_avx2_SRCS
src/fdr/teddy_avx2.c
src/util/masked_move.c
src/util/arch/x86/masked_move.c
src/util/arch/x86/masked_move.h
)

View File

@ -6,7 +6,10 @@ if (HAVE_C_X86INTRIN_H)
set (INTRIN_INC_H "x86intrin.h")
elseif (HAVE_C_INTRIN_H)
set (INTRIN_INC_H "intrin.h")
else ()
elseif (HAVE_C_ARM_NEON_H)
set (INTRIN_INC_H "arm_neon.h")
set (FAT_RUNTIME OFF)
else()
message (FATAL_ERROR "No intrinsics header found")
endif ()
@ -29,15 +32,16 @@ else (NOT FAT_RUNTIME)
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}")
endif ()
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
if (ARCH_IA32 OR ARCH_X86_64)
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
__m128i a = _mm_set1_epi8(1);
(void)_mm_shuffle_epi8(a, a);
}" HAVE_SSSE3)
# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX2__)
#error no avx2
#endif
@ -47,8 +51,8 @@ int main(){
(void)_mm256_xor_si256(z, z);
}" HAVE_AVX2)
# and now for AVX512
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# and now for AVX512
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX512BW__)
#error no avx512bw
#endif
@ -58,8 +62,8 @@ int main(){
(void)_mm512_abs_epi8(z);
}" HAVE_AVX512)
# and now for AVX512VBMI
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# and now for AVX512VBMI
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX512VBMI__)
#error no avx512vbmi
#endif
@ -70,26 +74,39 @@ int main(){
(void)_mm512_permutexvar_epi8(idx, a);
}" HAVE_AVX512VBMI)
elseif (ARCH_ARM32 OR ARCH_AARCH64)
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
int32x4_t a = vdupq_n_s32(1);
(void)a;
}" HAVE_NEON)
else ()
message (FATAL_ERROR "Unsupported architecture")
endif ()
if (FAT_RUNTIME)
if (NOT HAVE_SSSE3)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
message(FATAL_ERROR "SSSE3 support required to build fat runtime")
endif ()
if (NOT HAVE_AVX2)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
message(FATAL_ERROR "AVX2 support required to build fat runtime")
endif ()
if (BUILD_AVX512 AND NOT HAVE_AVX512)
if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512 AND NOT HAVE_AVX512)
message(FATAL_ERROR "AVX512 support requested but not supported")
endif ()
else (NOT FAT_RUNTIME)
if (NOT HAVE_AVX2)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
message(STATUS "Building without AVX2 support")
endif ()
if (NOT HAVE_AVX512)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512)
message(STATUS "Building without AVX512 support")
endif ()
if (NOT HAVE_SSSE3)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
endif ()
if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
message(FATAL_ERROR "NEON support required for ARM support")
endif ()
endif ()
unset (CMAKE_REQUIRED_FLAGS)

View File

@ -15,6 +15,12 @@
/* "Define if building for EM64T" */
#cmakedefine ARCH_X86_64
/* "Define if building for ARM32" */
#cmakedefine ARCH_ARM32
/* "Define if building for AARCH64" */
#cmakedefine ARCH_AARCH64
/* internal build, switch on dump support. */
#cmakedefine DUMP_SUPPORT
@ -45,6 +51,9 @@
/* C compiler has intrin.h */
#cmakedefine HAVE_C_INTRIN_H
/* C compiler has arm_neon.h */
#cmakedefine HAVE_C_ARM_NEON_H
/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP

View File

@ -1,9 +1,15 @@
# determine the target arch
# really only interested in the preprocessor here
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)
CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
set(ARCH_X86_64 ${ARCH_64_BIT})
set(ARCH_IA32 ${ARCH_32_BIT})
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
if (ARCH_X86_64 OR ARCH_AARCH64)
set(ARCH_64_BIT TRUE)
else()
set(ARCH_32_BIT TRUE)
endif()
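The detection above relies purely on compiler-predefined macros. A minimal standalone probe (illustrative only, not part of the build) shows which branch a given toolchain would take:

#include <stdio.h>

int main(void) {
#if defined(__x86_64__) || defined(_M_X64)
    puts("ARCH_X86_64");          /* 64-bit x86 */
#elif defined(__i386__) || defined(_M_IX86)
    puts("ARCH_IA32");            /* 32-bit x86 */
#elif defined(__ARM_ARCH_ISA_A64)
    puts("ARCH_AARCH64");         /* 64-bit ARM */
#elif defined(__ARM_ARCH_ISA_ARM)
    puts("ARCH_ARM32");           /* 32-bit ARM */
#else
    puts("unknown architecture");
#endif
    return 0;
}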

View File

@ -30,7 +30,6 @@
#include "config.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/intrinsics.h"
#if !defined(HAVE_SSE42)
@ -579,53 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
}
#else // HAVE_SSE42
#ifdef ARCH_64_BIT
#define CRC_WORD 8
#define CRC_TYPE u64a
#define CRC_FUNC _mm_crc32_u64
#else
#define CRC_WORD 4
#define CRC_TYPE u32
#define CRC_FUNC _mm_crc32_u32
#endif
/*
* Use the crc32 instruction from SSE4.2 to compute our checksum - same
* polynomial as the above function.
*/
static really_inline
u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
const size_t length) {
u32 crc = running_crc;
// Process byte-by-byte until p_buf is aligned
const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
size_t init_bytes = aligned_buf - p_buf;
size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
size_t end_bytes = length - init_bytes - running_length;
while (p_buf < aligned_buf) {
crc = _mm_crc32_u8(crc, *p_buf++);
}
// Main aligned loop, processes a word at a time.
for (size_t li = 0; li < running_length/CRC_WORD; li++) {
CRC_TYPE block = *(const CRC_TYPE *)p_buf;
crc = CRC_FUNC(crc, block);
p_buf += CRC_WORD;
}
// Remaining bytes
for(size_t li = 0; li < end_bytes; li++) {
crc = _mm_crc32_u8(crc, *p_buf++);
}
return crc;
}
#include "util/arch/x86/crc32.h"
#endif
#ifdef VERIFY_ASSERTION

View File

@ -51,6 +51,7 @@ extern "C"
// CPU type is the low 6 bits (we can't need more than 64, surely!)
#define HS_PLATFORM_INTEL 1
#define HS_PLATFORM_ARM 2
#define HS_PLATFORM_CPU_MASK 0x3F
#define HS_PLATFORM_NOAVX2 (4<<13)
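A small decode sketch (the helper names and the `bits` parameter are hypothetical): the low six bits select the CPU family, while the higher bits carry feature flags such as HS_PLATFORM_NOAVX2.

static inline int platform_is_arm(unsigned long long bits) {
    return (bits & HS_PLATFORM_CPU_MASK) == HS_PLATFORM_ARM;
}

static inline int platform_lacks_avx2(unsigned long long bits) {
    return (bits & HS_PLATFORM_NOAVX2) != 0;
}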

View File

@ -30,7 +30,9 @@
#include "hs_common.h"
#include "hs_runtime.h"
#include "ue2common.h"
#include "util/cpuid_inline.h"
#if defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#endif
#include "util/join.h"
#if defined(DISABLE_AVX512_DISPATCH)

View File

@ -36,6 +36,7 @@
#include "teddy.h"
#include "teddy_internal.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"
@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn(const u32 a, const u8 *b) {
u64a r;
#if defined(HAVE_BMI) && !defined(NO_ASM)
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
#else
r = unaligned_load_u32(b) & ~a;
#endif
return r;
}
/* generates an initial state mask based on the last byte-ish of history rather
* than being all accepting. If there is no history to consider, the state is
* generated based on the minimum length of each bucket in order to prevent
@ -739,6 +726,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
assert(ISALIGNED_CL(confBase));
struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16);
memset(zones, 0, sizeof(zones));
size_t numZone = prepareZones(a->buf, a->len,
a->buf_history + a->len_history,

View File

@ -311,26 +311,26 @@ const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
#define PREPARE_MASKS_1 \
dup_mask[0] = set4x128(maskBase[0]); \
dup_mask[1] = set4x128(maskBase[1]);
dup_mask[0] = set1_4x128(maskBase[0]); \
dup_mask[1] = set1_4x128(maskBase[1]);
#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
dup_mask[2] = set4x128(maskBase[2]); \
dup_mask[3] = set4x128(maskBase[3]);
dup_mask[2] = set1_4x128(maskBase[2]); \
dup_mask[3] = set1_4x128(maskBase[3]);
#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
dup_mask[4] = set4x128(maskBase[4]); \
dup_mask[5] = set4x128(maskBase[5]);
dup_mask[4] = set1_4x128(maskBase[4]); \
dup_mask[5] = set1_4x128(maskBase[5]);
#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
dup_mask[6] = set4x128(maskBase[6]); \
dup_mask[7] = set4x128(maskBase[7]);
dup_mask[6] = set1_4x128(maskBase[6]); \
dup_mask[7] = set1_4x128(maskBase[7]);
#define PREPARE_MASKS(n) \
m512 lo_mask = set64x8(0xf); \
m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \
m512 sl_msk[n - 1]; \
PREPARE_MASKS_##n \
@ -570,26 +570,26 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
&c_0, &c_16, &c_32, &c_48)
#define PREPARE_MASKS_1 \
dup_mask[0] = set4x128(maskBase[0]); \
dup_mask[1] = set4x128(maskBase[1]);
dup_mask[0] = set1_4x128(maskBase[0]); \
dup_mask[1] = set1_4x128(maskBase[1]);
#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
dup_mask[2] = set4x128(maskBase[2]); \
dup_mask[3] = set4x128(maskBase[3]);
dup_mask[2] = set1_4x128(maskBase[2]); \
dup_mask[3] = set1_4x128(maskBase[3]);
#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
dup_mask[4] = set4x128(maskBase[4]); \
dup_mask[5] = set4x128(maskBase[5]);
dup_mask[4] = set1_4x128(maskBase[4]); \
dup_mask[5] = set1_4x128(maskBase[5]);
#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
dup_mask[6] = set4x128(maskBase[6]); \
dup_mask[7] = set4x128(maskBase[7]);
dup_mask[6] = set1_4x128(maskBase[6]); \
dup_mask[7] = set1_4x128(maskBase[7]);
#define PREPARE_MASKS(n) \
m512 lo_mask = set64x8(0xf); \
m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \
PREPARE_MASKS_##n
@ -713,7 +713,7 @@ do { \
#define PREP_SHUF_MASK \
PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \
*c_128 = *(ptr + 15); \
m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
*c_0 = *(ptr + 31)
#define SHIFT_OR_M1 \
@ -805,26 +805,26 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
#define PREPARE_MASKS_1 \
dup_mask[0] = set2x128(maskBase[0]); \
dup_mask[1] = set2x128(maskBase[1]);
dup_mask[0] = set1_2x128(maskBase[0]); \
dup_mask[1] = set1_2x128(maskBase[1]);
#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
dup_mask[2] = set2x128(maskBase[2]); \
dup_mask[3] = set2x128(maskBase[3]);
dup_mask[2] = set1_2x128(maskBase[2]); \
dup_mask[3] = set1_2x128(maskBase[3]);
#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
dup_mask[4] = set2x128(maskBase[4]); \
dup_mask[5] = set2x128(maskBase[5]);
dup_mask[4] = set1_2x128(maskBase[4]); \
dup_mask[5] = set1_2x128(maskBase[5]);
#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
dup_mask[6] = set2x128(maskBase[6]); \
dup_mask[7] = set2x128(maskBase[7]);
dup_mask[6] = set1_2x128(maskBase[6]); \
dup_mask[7] = set1_2x128(maskBase[7]);
#define PREPARE_MASKS(n) \
m256 lo_mask = set32x8(0xf); \
m256 lo_mask = set1_32x8(0xf); \
m256 dup_mask[n * 2]; \
PREPARE_MASKS_##n
@ -901,8 +901,10 @@ do { \
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(diff128(var, ones128()))) { \
u64a lo = movq(var); \
u64a hi = movq(rshiftbyte_m128(var, 8)); \
u64a __attribute__((aligned(16))) vector[2]; \
store128(vector, var); \
u64a lo = vector[0]; \
u64a hi = vector[1]; \
CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \
CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \
} \
@ -925,7 +927,7 @@ do { \
static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
return or128(pshufb_m128(maskBase[0 * 2], lo),
@ -934,7 +936,7 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, val);
@ -949,7 +951,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
@ -964,7 +966,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 *old_3, m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);

View File

@ -501,15 +501,15 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128);
*p_mask = set1_2x128(p_mask128);
return ret;
}
static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
return or256(pshufb_m256(maskBase[0 * 2], lo),
@ -518,7 +518,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
@ -533,7 +533,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
@ -548,7 +548,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 *old_3, m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);

View File

@ -44,8 +44,11 @@
#include "parser/prefilter.h"
#include "parser/unsupported.h"
#include "util/compile_error.h"
#include "util/cpuid_flags.h"
#include "util/cpuid_inline.h"
#include "util/arch/common/cpuid_flags.h"
#if defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#endif
#include "util/depth.h"
#include "util/popcount.h"
#include "util/target_info.h"

View File

@ -26,16 +26,23 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "hs_common.h"
#include "util/cpuid_flags.h"
#include "util/cpuid_inline.h"
#include "ue2common.h"
#if defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#endif
HS_PUBLIC_API
hs_error_t HS_CDECL hs_valid_platform(void) {
/* Hyperscan requires SSSE3, anything else is a bonus */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
if (check_ssse3()) {
return HS_SUCCESS;
} else {
return HS_ARCH_ERROR;
}
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
return HS_SUCCESS;
#endif
}
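A minimal caller-side sketch (assuming the public hs.h entry points) checks the platform before compiling any patterns; x86 builds report whether SSSE3 is present, while ARM builds always succeed as shown above:

#include <stdio.h>
#include <hs.h>

int main(void) {
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "this CPU lacks the SIMD support required by this build\n");
        return 1;
    }
    printf("platform OK\n");
    return 0;
}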

View File

@ -39,10 +39,13 @@
#include "util/compare.h"
#include "util/intrinsics.h"
#include "util/join.h"
#include "util/masked_move.h"
#include "util/partial_store.h"
#include "util/simd_utils.h"
#if defined(HAVE_AVX2)
#include "util/arch/x86/masked_move.h"
#endif
#include <ctype.h>
#include <stdbool.h>
#include <string.h>

View File

@ -30,11 +30,11 @@
static really_inline m256 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set32x8(k);
return set1_32x8(k);
}
static really_inline m256 getCaseMask(void) {
return set32x8(0xdf);
return set1_32x8(0xdf);
}
static really_inline

View File

@ -30,11 +30,11 @@
static really_inline m128 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set16x8(k);
return set1_16x8(k);
}
static really_inline m128 getCaseMask(void) {
return set16x8(0xdf);
return set1_16x8(0xdf);
}
static really_inline

View File

@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
if (len) {
m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime);
m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char));

View File

@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,
if (len) {
m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime);
m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char));
@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
assert(s_in); /* should not already be dead */
assert(soft_c_end <= hard_c_end);
DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
m128 s = set16x8(s_in - 1);
m128 s = set1_16x8(s_in - 1);
const u8 *c = *c_inout;
const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
if (!do_accel) {
@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
u32 sheng_limit_x4 = sheng_limit * 0x01010101;
m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4);
m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit);
DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit,
m->sheng_accel_limit, sheng_stop_limit);
#endif

View File

@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
}
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
m128 cur_state = set16x8(*state);
m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;
while (likely(cur_buf != end)) {

View File

@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
return MO_CONTINUE_MATCHING;
}
m128 cur_state = set16x8(*state);
m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;
while (likely(end - cur_buf >= 4)) {

View File

@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
}
const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
}
const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
assert(buf_end - buf >= 16);
@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
m128 mask2_lo, m128 mask2_hi,
const u8 *buf, const u8 *buf_end) {
const m128 ones = ones128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
@ -455,15 +455,15 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end);
}
const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) {
return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}
const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;
@ -579,15 +579,15 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end);
}
const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) {
return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}
const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;
assert(buf_end - buf >= 32);
@ -676,7 +676,7 @@ static really_inline
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);
// run shufti over two overlapping 16-byte unaligned reads
const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
@ -708,11 +708,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
}
const m256 ones = ones256();
const m256 low4bits = set32x8(0xf);
const m256 wide_mask1_lo = set2x128(mask1_lo);
const m256 wide_mask1_hi = set2x128(mask1_hi);
const m256 wide_mask2_lo = set2x128(mask2_lo);
const m256 wide_mask2_hi = set2x128(mask2_hi);
const m256 low4bits = set1_32x8(0xf);
const m256 wide_mask1_lo = set1_2x128(mask1_lo);
const m256 wide_mask1_hi = set1_2x128(mask1_hi);
const m256 wide_mask2_lo = set1_2x128(mask2_lo);
const m256 wide_mask2_hi = set1_2x128(mask2_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;

View File

@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
m128 highconst = _mm_set1_epi8(0x80);
m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
m128 highconst = set1_16x8(0x80);
m128 shuf_mask_hi = set1_2x64(0x8040201008040201);
// and now do the real work
m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
@ -260,8 +260,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
m256 highconst = _mm256_set1_epi8(0x80);
m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
m256 highconst = set1_32x8(0x80);
m256 shuf_mask_hi = set1_4x64(0x8040201008040201);
// and now do the real work
m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
@ -315,8 +315,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("len %zu\n", buf_end - buf);
const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset);
const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end);
assert(buf < buf_end);
@ -382,8 +382,8 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset);
const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end);
assert(buf < buf_end);
const u8 *rv;

View File

@ -36,7 +36,7 @@
#define VERM_BOUNDARY 16
#define VERM_TYPE m128
#define VERM_SET_FN set16x8
#define VERM_SET_FN set1_16x8
static really_inline
const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
@ -74,7 +74,7 @@ static really_inline
const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 31 < buf_end; buf += 32) {
m128 data = load128(buf);
@ -122,7 +122,7 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found
static really_inline
const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
@ -157,7 +157,7 @@ static really_inline
const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
@ -219,7 +219,7 @@ const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline
const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
@ -277,7 +277,7 @@ static really_inline
const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 15 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
@ -309,7 +309,7 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found
static really_inline
const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
@ -344,7 +344,7 @@ static really_inline
const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
@ -381,7 +381,7 @@ const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline
const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
@ -398,7 +398,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
#define VERM_BOUNDARY 64
#define VERM_TYPE m512
#define VERM_SET_FN set64x8
#define VERM_SET_FN set1_64x8
static really_inline
const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {

View File

@ -47,7 +47,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end,
u32 count = *count_inout;
m128 chars = set16x8(c);
m128 chars = set1_16x8(c);
for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16);
@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
u32 count = *count_inout;
const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16);

View File

@ -938,7 +938,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask,
return 1;
}
m256 data_m256 = set2x128(data);
m256 data_m256 = set1_2x128(data);
m256 hi_mask_m256 = loadu256(hi_mask);
m256 lo_mask_m256 = loadu256(lo_mask);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
@ -974,8 +974,8 @@ int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask,
m128 hi_mask_m128 = loadu128(hi_mask);
m128 lo_mask_m128 = loadu128(lo_mask);
m256 hi_mask_m256 = set2x128(hi_mask_m128);
m256 lo_mask_m256 = set2x128(lo_mask_m128);
m256 hi_mask_m256 = set1_2x128(hi_mask_m128);
m256 lo_mask_m256 = set1_2x128(lo_mask_m128);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256,
bucket_select_mask_m256,
@ -1287,7 +1287,7 @@ int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x2(valid_hi, valid_lo);
expand_valid = set2x64(valid_hi, valid_lo);
valid_path_mask = ~movemask128(pshufb_m128(expand_valid,
data_select_mask));
}
@ -1332,7 +1332,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_double = set2x128(data_m128);
m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask);
u32 valid_path_mask = 0;
@ -1346,7 +1346,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask));
@ -1393,7 +1393,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_double = set2x128(data_m128);
m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask);
u32 valid_path_mask = 0;
@ -1407,7 +1407,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask));
@ -1460,7 +1460,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_m256 = set2x128(data_m128);
m256 data_m256 = set1_2x128(data_m128);
m256 data_select_mask_1 = loadu256(ri->data_select_mask);
m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32);
@ -1475,7 +1475,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
u32 valid_path_1 = movemask256(pshufb_m256(expand_valid,
data_select_mask_1));

View File

@ -47,7 +47,7 @@ static really_inline
int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4));
@ -78,7 +78,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
const m128 and_mask, const u32 neg_mask,
const u32 valid_data_mask) {
m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 nresult = eq128(and128(t, and_mask), zeroes128());
@ -101,7 +101,7 @@ static really_inline
int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4));
@ -133,7 +133,7 @@ int validateShuftiMask32x16(const m256 data,
const m256 bucket_mask_hi,
const m256 bucket_mask_lo, const u32 neg_mask,
const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@ -201,7 +201,7 @@ int validateMultipathShuftiMask16x8(const m128 data,
const u32 neg_mask,
const u32 valid_path_mask) {
m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 result = and128(t, bucket_select_mask);
@ -220,7 +220,7 @@ int validateMultipathShuftiMask32x8(const m256 data,
const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask,
const u32 valid_path_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo = pshufb_m256(lo_mask, data_lo);
@ -244,7 +244,7 @@ int validateMultipathShuftiMask32x16(const m256 data,
const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask,
const u32 valid_path_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@ -271,7 +271,7 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
const u64a hi_bits, const u64a lo_bits,
const u64a neg_mask,
const u64a valid_path_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
m256 c_hi_1 = pshufb_m256(hi_mask,

View File

@ -33,58 +33,13 @@
#ifndef UTIL_ARCH_H_
#define UTIL_ARCH_H_
#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
#define HAVE_SSE2
#include "config.h"
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/x86.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/arm.h"
#endif
#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE41
#endif
#endif // UTIL_ARCH_X86_H_
#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE42
#endif
#if defined(__AVX__)
#define HAVE_AVX
#endif
#if defined(__AVX2__)
#define HAVE_AVX2
#endif
#if defined(__AVX512BW__)
#define HAVE_AVX512
#endif
#if defined(__AVX512VBMI__)
#define HAVE_AVX512VBMI
#endif
/*
* ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros
*/
#if defined(__POPCNT__) || \
(defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \
(defined(_WIN32) && defined(__AVX__))
#define HAVE_POPCOUNT_INSTR
#endif
#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \
(defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI
#endif
#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
(defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI2
#endif
/*
* MSVC uses a different form of inline asm
*/
#if defined(_WIN32) && defined(_MSC_VER)
#define NO_ASM
#endif
#endif // UTIL_ARCH_H_

src/util/arch/arm/arm.h (new file, 42 lines)
View File

@ -0,0 +1,42 @@
/*
* Copyright (c) 2017-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Per-platform architecture definitions
*/
#ifndef UTIL_ARCH_ARM_H_
#define UTIL_ARCH_ARM_H_
#if defined(__ARM_NEON) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
#define HAVE_NEON
#define HAVE_SIMD_128_BITS
#endif
#endif // UTIL_ARCH_ARM_H_

View File

@ -0,0 +1,199 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Bit-twiddling primitives (ctz, compress etc)
*/
#ifndef BITUTILS_ARCH_ARM_H
#define BITUTILS_ARCH_ARM_H
#include "ue2common.h"
#include "util/popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"
#include "util/arch/common/bitutils.h"
static really_inline
u32 clz32_impl(u32 x) {
return clz32_impl_c(x);
}
static really_inline
u32 clz64_impl(u64a x) {
return clz64_impl_c(x);
}
static really_inline
u32 ctz32_impl(u32 x) {
return ctz32_impl_c(x);
}
static really_inline
u32 ctz64_impl(u64a x) {
return ctz64_impl_c(x);
}
static really_inline
u32 lg2_impl(u32 x) {
return lg2_impl_c(x);
}
static really_inline
u64a lg2_64_impl(u64a x) {
return lg2_64_impl_c(x);
}
static really_inline
u32 findAndClearLSB_32_impl(u32 *v) {
return findAndClearLSB_32_impl_c(v);
}
static really_inline
u32 findAndClearLSB_64_impl(u64a *v) {
return findAndClearLSB_64_impl_c(v);
}
static really_inline
u32 findAndClearMSB_32_impl(u32 *v) {
u32 val = *v;
u32 offset = 31 - clz32_impl(val);
*v = val & ~(1 << offset);
assert(offset < 32);
return offset;
}
static really_inline
u32 findAndClearMSB_64_impl(u64a *v) {
return findAndClearMSB_64_impl_c(v);
}
static really_inline
u32 compress32_impl(u32 x, u32 m) {
return compress32_impl_c(x, m);
}
static really_inline
u64a compress64_impl(u64a x, u64a m) {
return compress64_impl_c(x, m);
}
static really_inline
m128 compress128_impl(m128 x, m128 m) {
m128 one = set1_2x64(1);
m128 bitset = one;
m128 vres = zeroes128();
while (isnonzero128(m)) {
m128 mm = sub_2x64(zeroes128(), m);
m128 tv = and128(x, m);
tv = and128(tv, mm);
m128 mask = not128(eq64_m128(tv, zeroes128()));
mask = vandq_s64(bitset, mask);
vres = or128(vres, mask);
m = and128(m, sub_2x64(m, set1_2x64(1)));
bitset = lshift64_m128(bitset, 1);
}
return vres;
}
static really_inline
u32 expand32_impl(u32 x, u32 m) {
return expand32_impl_c(x, m);
}
static really_inline
u64a expand64_impl(u64a x, u64a m) {
return expand64_impl_c(x, m);
}
/* returns the first set bit after begin (if not ~0U). If no bit is set after
* begin returns ~0U
*/
static really_inline
u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
if (begin != ~0U) {
/* switch off all bits at or below begin. Note: not legal to shift by
* the size of the datatype or larger. */
assert(begin <= 63);
bitfield &= ~((2ULL << begin) - 1);
}
if (!bitfield) {
return ~0U;
}
return ctz64_impl(bitfield);
}
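/* Illustrative usage sketch (the helper below is hypothetical): visit every set
 * bit by seeding the scan with ~0U and feeding each returned index back in. */
static really_inline
u32 bf64_count_example(u64a bitfield) {
    u32 n = 0;
    for (u32 i = bf64_iterate_impl(bitfield, ~0U); i != ~0U;
         i = bf64_iterate_impl(bitfield, i)) {
        n++; /* bit i of bitfield is set */
    }
    return n;
}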
static really_inline
char bf64_set_impl(u64a *bitfield, u32 i) {
return bf64_set_impl_c(bitfield, i);
}
static really_inline
void bf64_unset_impl(u64a *bitfield, u32 i) {
return bf64_unset_impl_c(bitfield, i);
}
static really_inline
u32 rank_in_mask32_impl(u32 mask, u32 bit) {
return rank_in_mask32_impl_c(mask, bit);
}
static really_inline
u32 rank_in_mask64_impl(u64a mask, u32 bit) {
return rank_in_mask64_impl_c(mask, bit);
}
static really_inline
u32 pext32_impl(u32 x, u32 mask) {
return pext32_impl_c(x, mask);
}
static really_inline
u64a pext64_impl(u64a x, u64a mask) {
return pext64_impl_c(x, mask);
}
static really_inline
u64a pdep64(u64a x, u64a mask) {
return pdep64_impl_c(x, mask);
}
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn_impl(const u32 a, const u8 *b) {
return andn_impl_c(a, b);
}
#endif // BITUTILS_ARCH_ARM_H

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "util/arch/common/cpuid_flags.h"
#include "ue2common.h"
#include "hs_compile.h" // for HS_MODE_ flags
#include "util/arch.h"
u64a cpuid_flags(void) {
return 0;
}
u32 cpuid_tune(void) {
return HS_TUNE_FAMILY_GENERIC;
}

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIMD_TYPES_ARM_H
#define SIMD_TYPES_ARM_H
#if !defined(m128) && defined(HAVE_NEON)
typedef int32x4_t m128;
#endif
#endif /* SIMD_TYPES_ARM_H */

View File

@ -0,0 +1,394 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief SIMD types and primitive operations.
*/
#ifndef ARCH_ARM_SIMD_UTILS_H
#define ARCH_ARM_SIMD_UTILS_H
#include <stdio.h>
#include "ue2common.h"
#include "util/simd_types.h"
#include "util/unaligned.h"
#include "util/intrinsics.h"
#include <string.h> // for memcpy
static really_inline m128 ones128(void) {
return (m128) vdupq_n_s8(0xFF);
}
static really_inline m128 zeroes128(void) {
return (m128) vdupq_n_s32(0);
}
/** \brief Bitwise not for m128*/
static really_inline m128 not128(m128 a) {
return (m128) vmvnq_s32(a);
}
/** \brief Return 1 if a and b are different otherwise 0 */
static really_inline int diff128(m128 a, m128 b) {
int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b));
return (-16 != res);
}
static really_inline int isnonzero128(m128 a) {
return !!diff128(a, zeroes128());
}
/**
* "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline u32 diffrich128(m128 a, m128 b) {
static const uint32x4_t movemask = { 1, 2, 4, 8 };
return vaddvq_u32(vandq_u32(vmvnq_s32(vceqq_s32((int32x4_t)a, (int32x4_t)b)), movemask));
}
/**
* "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
* returns a 4-bit mask indicating which 64-bit words contain differences.
*/
static really_inline u32 diffrich64_128(m128 a, m128 b) {
static const uint64x2_t movemask = { 1, 4 };
return vaddvq_u64(vandq_u64(vmvnq_s32(vceqq_s64((int64x2_t)a, (int64x2_t)b)), movemask));
}
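/* Illustrative sketch (hypothetical helper): diffrich128() returns a 4-bit lane
 * mask, so individual 32-bit words can be tested for change directly. */
static really_inline
int lane32_differs_example(m128 a, m128 b, u32 lane) {
    assert(lane < 4);
    return (diffrich128(a, b) >> lane) & 1;
}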
static really_really_inline
m128 add_2x64(m128 a, m128 b) {
return (m128) vaddq_u64((int64x2_t)a, (int64x2_t)b);
}
static really_really_inline
m128 sub_2x64(m128 a, m128 b) {
return (m128) vsubq_u64((int64x2_t)a, (int64x2_t)b);
}
static really_really_inline
m128 lshift_m128(m128 a, unsigned b) {
return (m128) vshlq_n_s32((int64x2_t)a, b);
}
static really_really_inline
m128 rshift_m128(m128 a, unsigned b) {
return (m128) vshrq_n_s32((int64x2_t)a, b);
}
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
return (m128) vshlq_n_s64((int64x2_t)a, b);
}
static really_really_inline
m128 rshift64_m128(m128 a, unsigned b) {
return (m128) vshrq_n_s64((int64x2_t)a, b);
}
static really_inline m128 eq128(m128 a, m128 b) {
return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b);
}
static really_inline m128 eq64_m128(m128 a, m128 b) {
return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b);
}
static really_inline u32 movemask128(m128 a) {
static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
// Compute the mask from the input
uint64x2_t mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
// Get the resulting bytes
uint16_t output;
vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0);
vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8);
return output;
}
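/* Illustrative usage sketch (hypothetical helper): the emulated movemask above
 * is typically paired with eq128(), so bit i of the result is set iff byte i of
 * data equals byte i of chars. */
static really_inline
u32 match_mask_example(m128 data, m128 chars) {
    return movemask128(eq128(chars, data));
}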
static really_inline m128 set1_16x8(u8 c) {
return (m128) vdupq_n_u8(c);
}
static really_inline m128 set1_4x32(u32 c) {
return (m128) vdupq_n_u32(c);
}
static really_inline m128 set1_2x64(u64a c) {
return (m128) vdupq_n_u64(c);
}
static really_inline u32 movd(const m128 in) {
return vgetq_lane_u32((uint32x4_t) in, 0);
}
static really_inline u64a movq(const m128 in) {
return vgetq_lane_u64((uint64x2_t) in, 0);
}
/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
return (m128) vsetq_lane_u64(*p, zeroes128(), 0);
}
static really_inline u32 extract32from128(const m128 in, unsigned imm) {
#if defined(HS_OPTIMIZE)
return vgetq_lane_u32((uint32x4_t) in, imm);
#else
switch (imm) {
case 0:
return vgetq_lane_u32((uint32x4_t) in, 0);
break;
case 1:
return vgetq_lane_u32((uint32x4_t) in, 1);
break;
case 2:
return vgetq_lane_u32((uint32x4_t) in, 2);
break;
case 3:
return vgetq_lane_u32((uint32x4_t) in, 3);
break;
default:
return 0;
break;
}
#endif
}
static really_inline u64a extract64from128(const m128 in, unsigned imm) {
#if defined(HS_OPTIMIZE)
return vgetq_lane_u64((uint64x2_t) in, imm);
#else
switch (imm) {
case 0:
return vgetq_lane_u64((uint32x4_t) in, 0);
break;
case 1:
return vgetq_lane_u64((uint32x4_t) in, 1);
break;
default:
return 0;
break;
}
#endif
}
static really_inline m128 and128(m128 a, m128 b) {
return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b);
}
static really_inline m128 xor128(m128 a, m128 b) {
return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b);
}
static really_inline m128 or128(m128 a, m128 b) {
return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b);
}
static really_inline m128 andnot128(m128 a, m128 b) {
return (m128) vandq_s8(vmvnq_s8(a), b);
}
// aligned load
static really_inline m128 load128(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m128)));
ptr = assume_aligned(ptr, 16);
return (m128) vld1q_s32((const int32_t *)ptr);
}
// aligned store
static really_inline void store128(void *ptr, m128 a) {
assert(ISALIGNED_N(ptr, alignof(m128)));
ptr = assume_aligned(ptr, 16);
vst1q_s32((int32_t *)ptr, a);
}
// unaligned load
static really_inline m128 loadu128(const void *ptr) {
return (m128) vld1q_s32((const int32_t *)ptr);
}
// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
vst1q_s32((int32_t *)ptr, a);
}
// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
m128 a = zeroes128();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
assert(amount >= -16 && amount <= 16);
m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
return vqtbl1q_s8(in, shift_mask);
}
#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break;
static really_inline
m128 palignr(m128 r, m128 l, int offset) {
#if defined(HS_OPTIMIZE)
return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
#else
switch (offset) {
CASE_ALIGN_VECTORS(l, r, 0);
CASE_ALIGN_VECTORS(l, r, 1);
CASE_ALIGN_VECTORS(l, r, 2);
CASE_ALIGN_VECTORS(l, r, 3);
CASE_ALIGN_VECTORS(l, r, 4);
CASE_ALIGN_VECTORS(l, r, 5);
CASE_ALIGN_VECTORS(l, r, 6);
CASE_ALIGN_VECTORS(l, r, 7);
CASE_ALIGN_VECTORS(l, r, 8);
CASE_ALIGN_VECTORS(l, r, 9);
CASE_ALIGN_VECTORS(l, r, 10);
CASE_ALIGN_VECTORS(l, r, 11);
CASE_ALIGN_VECTORS(l, r, 12);
CASE_ALIGN_VECTORS(l, r, 13);
CASE_ALIGN_VECTORS(l, r, 14);
CASE_ALIGN_VECTORS(l, r, 15);
default:
return zeroes128();
break;
}
#endif
}
#undef CASE_ALIGN_VECTORS
static really_really_inline
m128 rshiftbyte_m128(m128 a, unsigned b) {
if (b)
return palignr(zeroes128(), a, b);
else
return a;
}
static really_really_inline
m128 lshiftbyte_m128(m128 a, unsigned b) {
if (b)
return palignr(a, zeroes128(), 16 - b);
else
return a;
}
#ifdef __cplusplus
extern "C" {
#endif
extern const u8 simd_onebit_masks[];
#ifdef __cplusplus
}
#endif
static really_inline
m128 mask1bit128(unsigned int n) {
assert(n < sizeof(m128) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu128(&simd_onebit_masks[mask_idx]);
}
// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
*ptr = or128(mask1bit128(n), *ptr);
}
// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
*ptr = andnot128(mask1bit128(n), *ptr);
}
// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
const m128 mask = mask1bit128(n);
return isnonzero128(and128(mask, val));
}
static really_inline
m128 pshufb_m128(m128 a, m128 b) {
/* On Intel, if bit 0x80 of an index byte is set, the corresponding result
lane is zero; otherwise the lane selected is given by the low nibble (&0xf).
On NEON, an index >= 16 yields zero, otherwise that lane is selected.
btranslated is the mask converted from the Intel convention to NEON. */
int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f));
return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated);
}
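/*
 * Illustrative usage sketch (helper for exposition only, not used elsewhere):
 * demonstrates the Intel semantics preserved by the translation above - an
 * index byte with 0x80 set yields a zero lane, any other index selects by
 * its low nibble. Uses only loadu128() and pshufb_m128() from this file.
 */
static really_inline
m128 pshufb_m128_example(void) {
u8 bytes[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
u8 idx[16]   = { 15, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
m128 a = loadu128(bytes);
m128 b = loadu128(idx);
// result: lane 0 == 15, lane 1 == 0 (0x80 set), lanes 2..15 == byte 0 == 0
return pshufb_m128(a, b);
}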
static really_inline
m128 max_u8_m128(m128 a, m128 b) {
return (m128) vmaxq_u8((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 min_u8_m128(m128 a, m128 b) {
return (m128) vminq_u8((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
uint32_t __attribute__((aligned(16))) data[4] = { x0, x1, x2, x3 };
return (m128) vld1q_u32((uint32_t *) data);
}
static really_inline
m128 set2x64(u64a hi, u64a lo) {
uint64_t __attribute__((aligned(16))) data[2] = { lo, hi };
return (m128) vld1q_u64((uint64_t *) data);
}
#endif // ARCH_ARM_SIMD_UTILS_H

View File

@ -0,0 +1,418 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Bit-twiddling primitives (ctz, compress etc)
*/
#ifndef BITUTILS_ARCH_COMMON_H
#define BITUTILS_ARCH_COMMON_H
#include "util/popcount.h"
#include "util/unaligned.h"
#include "util/simd_utils.h"
static really_inline
u32 clz32_impl_c(u32 x) {
return (u32)__builtin_clz(x);
}
static really_inline
u32 clz64_impl_c(u64a x) {
return (u32)__builtin_clzll(x);
}
// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32_impl_c(u32 x) {
return (u32)__builtin_ctz(x);
}
static really_inline
u32 ctz64_impl_c(u64a x) {
return (u32)__builtin_ctzll(x);
}
static really_inline
u32 lg2_impl_c(u32 x) {
if (!x) {
return 0;
}
return 31 - clz32_impl_c(x);
}
static really_inline
u64a lg2_64_impl_c(u64a x) {
if (!x) {
return 0;
}
return 63 - clz64_impl_c(x);
}
static really_inline
u32 findAndClearLSB_32_impl_c(u32 *v) {
u32 val = *v;
u32 offset = ctz32_impl_c(val);
*v = val & (val - 1);
assert(offset < 32);
return offset;
}
static really_inline
u32 findAndClearLSB_64_impl_c(u64a *v) {
#ifdef ARCH_64_BIT
// generic variant using gcc's builtin on 64-bit
u64a val = *v, offset;
offset = ctz64_impl_c(val);
*v = val & (val - 1);
#else
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
// inline calls to __builtin_ctzll
u32 v1 = (u32)*v;
u32 v2 = (u32)(*v >> 32);
u32 offset;
if (v1) {
offset = findAndClearLSB_32_impl_c(&v1);
*v = (u64a)v1 | ((u64a)v2 << 32);
} else {
offset = findAndClearLSB_32_impl_c(&v2) + 32;
*v = (u64a)v2 << 32;
}
#endif
assert(offset < 64);
return (u32)offset;
}
static really_inline
u32 findAndClearMSB_32_impl_c(u32 *v) {
u32 val = *v;
u32 offset = 31 - clz32_impl_c(val);
*v = val & ~(1 << offset);
assert(offset < 32);
return offset;
}
static really_inline
u32 findAndClearMSB_64_impl_c(u64a *v) {
#ifdef ARCH_64_BIT
// generic variant using gcc's builtin on 64-bit
u64a val = *v, offset;
offset = 63 - clz64_impl_c(val);
*v = val & ~(1ULL << offset);
#else
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
// inline calls to __builtin_clzll
u32 v1 = (u32)*v;
u32 v2 = (u32)(*v >> 32);
u32 offset;
if (v2) {
offset = findAndClearMSB_32_impl_c(&v2) + 32;
*v = ((u64a)v2 << 32) | (u64a)v1;
} else {
offset = findAndClearMSB_32_impl_c(&v1);
*v = (u64a)v1;
}
#endif
assert(offset < 64);
return (u32)offset;
}
static really_inline
u32 compress32_impl_c(u32 x, u32 m) {
// Return zero quickly on trivial cases
if ((x & m) == 0) {
return 0;
}
u32 mk, mp, mv, t;
x &= m; // clear irrelevant bits
mk = ~m << 1; // we will count 0's to right
for (u32 i = 0; i < 5; i++) {
mp = mk ^ (mk << 1);
mp ^= mp << 2;
mp ^= mp << 4;
mp ^= mp << 8;
mp ^= mp << 16;
mv = mp & m; // bits to move
m = (m ^ mv) | (mv >> (1 << i)); // compress m
t = x & mv;
x = (x ^ t) | (t >> (1 << i)); // compress x
mk = mk & ~mp;
}
return x;
}
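/*
 * Worked example (illustrative): this routine behaves like the BMI2 PEXT
 * instruction, gathering the bits of x selected by m towards the least
 * significant end while preserving their order, e.g.
 * compress32_impl_c(0xb2, 0x66) == 0x5, since the bits of x at the set
 * positions of m (bits 1, 2, 5, 6) are 1, 0, 1, 0 respectively.
 */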
static really_inline
u64a compress64_impl_c(u64a x, u64a m) {
u64a res = 0;
for (u64a bb = 1; m != 0; bb += bb) {
if (x & m & -m) { res |= bb; }
m &= (m - 1);
}
return res;
/* // Return zero quickly on trivial cases
if ((x & m) == 0) {
return 0;
}
u64a mk, mp, mv, t;
x &= m; // clear irrelevant bits
mk = ~m << 1; // we will count 0's to right
for (u32 i = 0; i < 6; i++) {
mp = mk ^ (mk << 1);
mp ^= mp << 2;
mp ^= mp << 4;
mp ^= mp << 8;
mp ^= mp << 16;
mp ^= mp << 32;
mv = mp & m; // bits to move
m = (m ^ mv) | (mv >> (1 << i)); // compress m
t = x & mv;
x = (x ^ t) | (t >> (1 << i)); // compress x
mk = mk & ~mp;
}
return x;*/
}
static really_inline
m128 compress128_impl_c(m128 xvec, m128 mvec) {
u64a ALIGN_ATTR(16) x[2];
u64a ALIGN_ATTR(16) m[2];
store128(x, xvec);
store128(m, mvec);
x[0] = compress64_impl_c(x[0], m[0]);
x[1] = compress64_impl_c(x[1], m[1]);
return load128(x);
}
static really_inline
u32 expand32_impl_c(u32 x, u32 m) {
// Return zero quickly on trivial cases
if (!x || !m) {
return 0;
}
u32 m0, mk, mp, mv, t;
u32 array[5];
m0 = m; // save original mask
mk = ~m << 1; // we will count 0's to right
for (int i = 0; i < 5; i++) {
mp = mk ^ (mk << 1); // parallel suffix
mp = mp ^ (mp << 2);
mp = mp ^ (mp << 4);
mp = mp ^ (mp << 8);
mp = mp ^ (mp << 16);
mv = mp & m; // bits to move
array[i] = mv;
m = (m ^ mv) | (mv >> (1 << i)); // compress m
mk = mk & ~mp;
}
for (int i = 4; i >= 0; i--) {
mv = array[i];
t = x << (1 << i);
x = (x & ~mv) | (t & mv);
}
return x & m0; // clear out extraneous bits
}
static really_inline
u64a expand64_impl_c(u64a x, u64a m) {
u64a res = 0;
for (u64a bb = 1; m != 0; bb += bb) {
if (x & bb) { res |= m & (-m); }
m &= (m - 1);
}
return res;
/* // Return zero quickly on trivial cases
if (!x || !m) {
return 0;
}
u64a m0, mk, mp, mv, t;
u64a array[6];
m0 = m; // save original mask
mk = ~m << 1; // we will count 0's to right
for (int i = 0; i < 6; i++) {
mp = mk ^ (mk << 1); // parallel suffix
mp = mp ^ (mp << 2);
mp = mp ^ (mp << 4);
mp = mp ^ (mp << 8);
mp = mp ^ (mp << 16);
mp = mp ^ (mp << 32);
mv = mp & m; // bits to move
array[i] = mv;
m = (m ^ mv) | (mv >> (1 << i)); // compress m
mk = mk & ~mp;
}
for (int i = 5; i >= 0; i--) {
mv = array[i];
t = x << (1 << i);
x = (x & ~mv) | (t & mv);
}
return x & m0; // clear out extraneous bits*/
}
/* returns the first set bit after begin (if not ~0U). If no bit is set after
* begin returns ~0U
*/
static really_inline
u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) {
if (begin != ~0U) {
/* switch off all bits at or below begin. Note: not legal to shift by
* the size of the datatype or larger. */
assert(begin <= 63);
bitfield &= ~((2ULL << begin) - 1);
}
if (!bitfield) {
return ~0U;
}
return ctz64_impl_c(bitfield);
}
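/*
 * Usage sketch (illustrative helper, not used elsewhere): walks the set bits
 * of a bitfield from lowest to highest with bf64_iterate_impl_c(); ~0U starts
 * the iteration and is also the end-of-iteration sentinel.
 */
static really_inline
u32 bf64_count_set_bits_example(u64a bits) {
u32 count = 0;
for (u32 i = bf64_iterate_impl_c(bits, ~0U); i != ~0U;
     i = bf64_iterate_impl_c(bits, i)) {
count++; // bit i of 'bits' is set
}
return count;
}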
static really_inline
char bf64_set_impl_c(u64a *bitfield, u32 i) {
u64a mask = 1ULL << i;
char was_set = !!(*bitfield & mask);
*bitfield |= mask;
return was_set;
}
static really_inline
void bf64_unset_impl_c(u64a *bitfield, u32 i) {
*bitfield &= ~(1ULL << i);
}
static really_inline
u32 rank_in_mask32_impl_c(u32 mask, u32 bit) {
mask &= (u32)(1U << bit) - 1;
return popcount32(mask);
}
static really_inline
u32 rank_in_mask64_impl_c(u64a mask, u32 bit) {
mask &= (u64a)(1ULL << bit) - 1;
return popcount64(mask);
}
static really_inline
u32 pext32_impl_c(u32 x, u32 mask) {
u32 result = 0, num = 1;
while (mask != 0) {
u32 bit = findAndClearLSB_32_impl_c(&mask);
if (x & (1U << bit)) {
assert(num != 0); // more than 32 bits!
result |= num;
}
num <<= 1;
}
return result;
}
static really_inline
u64a pext64_impl_c(u64a x, u64a mask) {
u64a result = 0, num = 1;
while (mask != 0) {
u32 bit = findAndClearLSB_64_impl_c(&mask);
if (x & (1ULL << bit)) {
assert(num != 0); // more than 64 bits!
result |= num;
}
num <<= 1;
}
return result;
}
static really_inline
u64a pdep64_impl_c(u64a x, u64a _m) {
/* Taken from:
* https://gcc.gnu.org/legacy-ml/gcc-patches/2017-06/msg01408.html
*/
u64a result = 0x0UL;
const u64a mask = 0x8000000000000000UL;
u64a m = _m;
u64a c, t;
u64a p;
/* The pop-count of the mask gives the number of bits from the
source to process. This is also needed to shift bits from the
source into the correct position for the result. */
p = 64 - __builtin_popcountl (_m);
/* The loop is for the number of '1' bits in the mask and clearing
each mask bit as it is processed. */
while (m != 0)
{
c = __builtin_clzl (m);
t = x << (p - c);
m ^= (mask >> c);
result |= (t & (mask >> c));
p++;
}
return (result);
}
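/*
 * Worked example (illustrative): this mirrors the BMI2 PDEP instruction,
 * scattering the low-order bits of x into the set positions of the mask,
 * e.g. pdep64_impl_c(0x5, 0x66) == 0x22, because the low bits of x
 * (1, 0, 1, 0) land in mask bits 1, 2, 5 and 6 respectively.
 */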
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn_impl_c(const u32 a, const u8 *b) {
return unaligned_load_u32(b) & ~a;
}
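/*
 * Illustrative example: andn_impl_c() computes (load32(b) & ~a); with
 * a = 0x0000ffff and b pointing at the little-endian bytes of 0x12345678,
 * the result is 0x12340000.
 */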
#endif // BITUTILS_ARCH_COMMON_H

View File

@ -31,7 +31,7 @@
#include "ue2common.h"
#if !defined(_WIN32) && !defined(CPUID_H_)
#if (defined(ARCH_IA32) || defined(ARCH_X86_64)) && !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>
/* system header doesn't have a header guard */
#define CPUID_H_

View File

@ -0,0 +1,773 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief SIMD types and primitive operations.
*/
#ifndef ARCH_COMMON_SIMD_UTILS_H
#define ARCH_COMMON_SIMD_UTILS_H
#include "ue2common.h"
#include "util/simd_types.h"
#include "util/unaligned.h"
#include "util/intrinsics.h"
#include <string.h> // for memcpy
#if !defined(HAVE_SIMD_128_BITS)
#error "You need at least a 128-bit capable SIMD engine!"
#endif // HAVE_SIMD_128_BITS
#ifdef DEBUG
static inline void print_m128_16x8(char *label, m128 vector) {
uint8_t __attribute__((aligned(16))) data[16];
store128(data, vector);
DEBUG_PRINTF("%s: ", label);
for(int i=0; i < 16; i++)
printf("%02x ", data[i]);
printf("\n");
}
static inline void print_m128_8x16(char *label, m128 vector) {
uint16_t __attribute__((aligned(16))) data[8];
store128(data, vector);
DEBUG_PRINTF("%s: ", label);
for(int i=0; i < 8; i++)
printf("%04x ", data[i]);
printf("\n");
}
static inline void print_m128_4x32(char *label, m128 vector) {
uint32_t __attribute__((aligned(16))) data[4];
store128(data, vector);
DEBUG_PRINTF("%s: ", label);
for(int i=0; i < 4; i++)
printf("%08x ", data[i]);
printf("\n");
}
static inline void print_m128_2x64(char *label, m128 vector) {
uint64_t __attribute__((aligned(16))) data[2];
store128(data, vector);
DEBUG_PRINTF("%s: ", label);
for(int i=0; i < 2; i++)
printf("%016lx ", data[i]);
printf("\n");
}
#else
#define print_m128_16x8(label, vector) NULL
#define print_m128_8x16(label, vector) NULL
#define print_m128_4x32(label, vector) NULL
#define print_m128_2x64(label, vector) NULL
#endif
/****
**** 256-bit Primitives
****/
#if !defined(HAVE_SIMD_256_BITS)
static really_really_inline
m256 lshift64_m256(m256 a, int b) {
m256 rv = a;
rv.lo = lshift64_m128(rv.lo, b);
rv.hi = lshift64_m128(rv.hi, b);
return rv;
}
static really_inline
m256 rshift64_m256(m256 a, int b) {
m256 rv = a;
rv.lo = rshift64_m128(rv.lo, b);
rv.hi = rshift64_m128(rv.hi, b);
return rv;
}
static really_inline
m256 eq256(m256 a, m256 b) {
m256 rv;
rv.lo = eq128(a.lo, b.lo);
rv.hi = eq128(a.hi, b.hi);
return rv;
}
static really_inline
u32 movemask256(m256 a) {
u32 lo_mask = movemask128(a.lo);
u32 hi_mask = movemask128(a.hi);
return lo_mask | (hi_mask << 16);
}
static really_inline m256 set1_4x64(u64a c) {
m128 a128 = set1_2x64(c);
m256 rv = {a128, a128};
return rv;
}
static really_inline
m256 set1_2x128(m128 a) {
m256 rv = {a, a};
return rv;
}
static really_inline m256 zeroes256(void) {
m256 rv = {zeroes128(), zeroes128()};
return rv;
}
static really_inline m256 ones256(void) {
m256 rv = {ones128(), ones128()};
return rv;
}
static really_inline m256 and256(m256 a, m256 b) {
m256 rv;
rv.lo = and128(a.lo, b.lo);
rv.hi = and128(a.hi, b.hi);
return rv;
}
static really_inline m256 or256(m256 a, m256 b) {
m256 rv;
rv.lo = or128(a.lo, b.lo);
rv.hi = or128(a.hi, b.hi);
return rv;
}
static really_inline m256 xor256(m256 a, m256 b) {
m256 rv;
rv.lo = xor128(a.lo, b.lo);
rv.hi = xor128(a.hi, b.hi);
return rv;
}
static really_inline m256 not256(m256 a) {
m256 rv;
rv.lo = not128(a.lo);
rv.hi = not128(a.hi);
return rv;
}
static really_inline m256 andnot256(m256 a, m256 b) {
m256 rv;
rv.lo = andnot128(a.lo, b.lo);
rv.hi = andnot128(a.hi, b.hi);
return rv;
}
static really_inline int diff256(m256 a, m256 b) {
return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
}
static really_inline int isnonzero256(m256 a) {
return isnonzero128(or128(a.lo, a.hi));
}
/**
* "Rich" version of diff256(). Takes two vectors a and b and returns a 8-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline
u32 diffrich256(m256 a, m256 b) {
return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 4);
}
/**
* "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
* returns an 8-bit mask indicating which 64-bit words contain differences.
*/
static really_inline u32 diffrich64_256(m256 a, m256 b) {
u32 d = diffrich256(a, b);
return (d | (d >> 1)) & 0x55555555;
}
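/*
 * Illustrative note: diffrich256() sets bit k when 32-bit word k differs,
 * i.e. two adjacent bits per 64-bit word; OR-ing each bit with its neighbour
 * and keeping the even positions collapses that to one bit per 64-bit word,
 * e.g. d = 0b1100 -> (d | d >> 1) & 0x55555555 = 0b0100.
 */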
// aligned load
static really_inline m256 load256(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m256)));
m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
return rv;
}
// aligned load of 128-bit value to low and high part of 256-bit value
static really_inline m256 load2x128(const void *ptr) {
return set1_2x128(load128(ptr));
}
static really_inline m256 loadu2x128(const void *ptr) {
return set1_2x128(loadu128(ptr));
}
// aligned store
static really_inline void store256(void *ptr, m256 a) {
assert(ISALIGNED_N(ptr, alignof(m256)));
ptr = assume_aligned(ptr, 16);
*(m256 *)ptr = a;
}
// unaligned load
static really_inline m256 loadu256(const void *ptr) {
m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
return rv;
}
// unaligned store
static really_inline void storeu256(void *ptr, m256 a) {
storeu128(ptr, a.lo);
storeu128((char *)ptr + 16, a.hi);
}
// packed unaligned store of first N bytes
static really_inline
void storebytes256(void *ptr, m256 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m256 loadbytes256(const void *ptr, unsigned int n) {
m256 a = zeroes256();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
static really_inline
m256 mask1bit256(unsigned int n) {
assert(n < sizeof(m256) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu256(&simd_onebit_masks[mask_idx]);
}
static really_inline
m256 set1_32x8(u32 in) {
m256 rv;
rv.hi = set1_16x8(in);
rv.lo = set1_16x8(in);
return rv;
}
static really_inline
m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) {
m256 rv;
rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0);
rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0);
return rv;
}
static really_inline
m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
m256 rv;
rv.hi = set2x64(hi_1, hi_0);
rv.lo = set2x64(lo_1, lo_0);
return rv;
}
// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
m128 *sub;
if (n < 128) {
sub = &ptr->lo;
} else {
sub = &ptr->hi;
n -= 128;
}
setbit128(sub, n);
}
// switches off bit N in the given vector.
static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
m128 *sub;
if (n < 128) {
sub = &ptr->lo;
} else {
sub = &ptr->hi;
n -= 128;
}
clearbit128(sub, n);
}
// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
assert(n < sizeof(val) * 8);
m128 sub;
if (n < 128) {
sub = val.lo;
} else {
sub = val.hi;
n -= 128;
}
return testbit128(sub, n);
}
static really_really_inline
m128 movdq_hi(m256 x) {
return x.hi;
}
static really_really_inline
m128 movdq_lo(m256 x) {
return x.lo;
}
static really_inline
m256 combine2x128(m128 hi, m128 lo) {
m256 rv = {lo, hi};
return rv;
}
static really_inline
m256 pshufb_m256(m256 a, m256 b) {
m256 rv;
rv.lo = pshufb_m128(a.lo, b.lo);
rv.hi = pshufb_m128(a.hi, b.hi);
return rv;
}
#endif // HAVE_SIMD_256_BITS
/****
**** 384-bit Primitives
****/
static really_inline m384 and384(m384 a, m384 b) {
m384 rv;
rv.lo = and128(a.lo, b.lo);
rv.mid = and128(a.mid, b.mid);
rv.hi = and128(a.hi, b.hi);
return rv;
}
static really_inline m384 or384(m384 a, m384 b) {
m384 rv;
rv.lo = or128(a.lo, b.lo);
rv.mid = or128(a.mid, b.mid);
rv.hi = or128(a.hi, b.hi);
return rv;
}
static really_inline m384 xor384(m384 a, m384 b) {
m384 rv;
rv.lo = xor128(a.lo, b.lo);
rv.mid = xor128(a.mid, b.mid);
rv.hi = xor128(a.hi, b.hi);
return rv;
}
static really_inline m384 not384(m384 a) {
m384 rv;
rv.lo = not128(a.lo);
rv.mid = not128(a.mid);
rv.hi = not128(a.hi);
return rv;
}
static really_inline m384 andnot384(m384 a, m384 b) {
m384 rv;
rv.lo = andnot128(a.lo, b.lo);
rv.mid = andnot128(a.mid, b.mid);
rv.hi = andnot128(a.hi, b.hi);
return rv;
}
static really_really_inline
m384 lshift64_m384(m384 a, unsigned b) {
m384 rv;
rv.lo = lshift64_m128(a.lo, b);
rv.mid = lshift64_m128(a.mid, b);
rv.hi = lshift64_m128(a.hi, b);
return rv;
}
static really_inline m384 zeroes384(void) {
m384 rv = {zeroes128(), zeroes128(), zeroes128()};
return rv;
}
static really_inline m384 ones384(void) {
m384 rv = {ones128(), ones128(), ones128()};
return rv;
}
static really_inline int diff384(m384 a, m384 b) {
return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
}
static really_inline int isnonzero384(m384 a) {
return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
}
#if defined(HAVE_SIMD_128_BITS) && !defined(ARCH_IA32) && !defined(ARCH_X86_64)
/**
* "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline
u32 diffrich384(m384 a, m384 b) {
return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8);
}
#endif
/**
* "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
* returns a 12-bit mask indicating which 64-bit words contain differences.
*/
static really_inline u32 diffrich64_384(m384 a, m384 b) {
u32 d = diffrich384(a, b);
return (d | (d >> 1)) & 0x55555555;
}
// aligned load
static really_inline m384 load384(const void *ptr) {
assert(ISALIGNED_16(ptr));
m384 rv = { load128(ptr), load128((const char *)ptr + 16),
load128((const char *)ptr + 32) };
return rv;
}
// aligned store
static really_inline void store384(void *ptr, m384 a) {
assert(ISALIGNED_16(ptr));
ptr = assume_aligned(ptr, 16);
*(m384 *)ptr = a;
}
// unaligned load
static really_inline m384 loadu384(const void *ptr) {
m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
loadu128((const char *)ptr + 32)};
return rv;
}
// packed unaligned store of first N bytes
static really_inline
void storebytes384(void *ptr, m384 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m384 loadbytes384(const void *ptr, unsigned int n) {
m384 a = zeroes384();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
// switches on bit N in the given vector.
static really_inline
void setbit384(m384 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
m128 *sub;
if (n < 128) {
sub = &ptr->lo;
} else if (n < 256) {
sub = &ptr->mid;
} else {
sub = &ptr->hi;
}
setbit128(sub, n % 128);
}
// switches off bit N in the given vector.
static really_inline
void clearbit384(m384 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
m128 *sub;
if (n < 128) {
sub = &ptr->lo;
} else if (n < 256) {
sub = &ptr->mid;
} else {
sub = &ptr->hi;
}
clearbit128(sub, n % 128);
}
// tests bit N in the given vector.
static really_inline
char testbit384(m384 val, unsigned int n) {
assert(n < sizeof(val) * 8);
m128 sub;
if (n < 128) {
sub = val.lo;
} else if (n < 256) {
sub = val.mid;
} else {
sub = val.hi;
}
return testbit128(sub, n % 128);
}
/****
**** 512-bit Primitives
****/
#if !defined(HAVE_SIMD_512_BITS)
static really_inline
m512 zeroes512(void) {
m512 rv = {zeroes256(), zeroes256()};
return rv;
}
static really_inline
m512 ones512(void) {
m512 rv = {ones256(), ones256()};
return rv;
}
static really_inline
m512 set1_64x8(u8 a) {
m256 a256 = set1_32x8(a);
m512 rv = {a256, a256};
return rv;
}
static really_inline
m512 set1_8x64(u64a a) {
m256 a256 = set1_4x64(a);
m512 rv = {a256, a256};
return rv;
}
static really_inline
m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
m512 rv;
rv.lo = set4x64(lo_3, lo_2, lo_1, lo_0);
rv.hi = set4x64(hi_3, hi_2, hi_1, hi_0);
return rv;
}
/*
static really_inline
m512 swap256in512(m512 a) {
m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
return vpermq512(idx, a);
}*/
static really_inline
m512 set1_4x128(m128 a) {
m256 a256 = set1_2x128(a);
m512 rv = {a256, a256};
return rv;
}
static really_inline
m512 and512(m512 a, m512 b) {
m512 rv;
rv.lo = and256(a.lo, b.lo);
rv.hi = and256(a.hi, b.hi);
return rv;
}
static really_inline
m512 or512(m512 a, m512 b) {
m512 rv;
rv.lo = or256(a.lo, b.lo);
rv.hi = or256(a.hi, b.hi);
return rv;
}
static really_inline
m512 xor512(m512 a, m512 b) {
m512 rv;
rv.lo = xor256(a.lo, b.lo);
rv.hi = xor256(a.hi, b.hi);
return rv;
}
static really_inline
m512 not512(m512 a) {
m512 rv;
rv.lo = not256(a.lo);
rv.hi = not256(a.hi);
return rv;
}
static really_inline
m512 andnot512(m512 a, m512 b) {
m512 rv;
rv.lo = andnot256(a.lo, b.lo);
rv.hi = andnot256(a.hi, b.hi);
return rv;
}
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
m512 rv;
rv.lo = lshift64_m256(a.lo, b);
rv.hi = lshift64_m256(a.hi, b);
return rv;
}
static really_inline
int diff512(m512 a, m512 b) {
return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
}
static really_inline
int isnonzero512(m512 a) {
return isnonzero256(or256(a.lo, a.hi));
}
/**
* "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline
u32 diffrich512(m512 a, m512 b) {
return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
}
/**
* "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
* returns a 16-bit mask indicating which 64-bit words contain differences.
*/
static really_inline
u32 diffrich64_512(m512 a, m512 b) {
//TODO: cmp_epi64?
u32 d = diffrich512(a, b);
return (d | (d >> 1)) & 0x55555555;
}
// aligned load
static really_inline
m512 load512(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m256)));
m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
return rv;
}
// aligned store
static really_inline
void store512(void *ptr, m512 a) {
assert(ISALIGNED_N(ptr, alignof(m512)));
m512 *x = (m512 *)ptr;
store256(&x->lo, a.lo);
store256(&x->hi, a.hi);
}
// unaligned load
static really_inline
m512 loadu512(const void *ptr) {
m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
return rv;
}
/*static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
}
static really_inline
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
}
static really_inline
m512 set_mask_m512(__mmask64 k) {
}*/
// packed unaligned store of first N bytes
static really_inline
void storebytes512(void *ptr, m512 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m512 loadbytes512(const void *ptr, unsigned int n) {
m512 a = zeroes512();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
static really_inline
m512 mask1bit512(unsigned int n) {
assert(n < sizeof(m512) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu512(&simd_onebit_masks[mask_idx]);
}
// switches on bit N in the given vector.
static really_inline
void setbit512(m512 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
m256 *sub;
if (n < 256) {
sub = &ptr->lo;
} else {
sub = &ptr->hi;
n -= 256;
}
setbit256(sub, n);
}
// switches off bit N in the given vector.
static really_inline
void clearbit512(m512 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
m256 *sub;
if (n < 256) {
sub = &ptr->lo;
} else {
sub = &ptr->hi;
n -= 256;
}
clearbit256(sub, n);
}
// tests bit N in the given vector.
static really_inline
char testbit512(m512 val, unsigned int n) {
assert(n < sizeof(val) * 8);
m256 sub;
if (n < 256) {
sub = val.lo;
} else {
sub = val.hi;
n -= 256;
}
return testbit256(sub, n);
}
#endif // HAVE_SIMD_512_BITS
#endif // ARCH_COMMON_SIMD_UTILS_H

View File

@ -0,0 +1,322 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Bit-twiddling primitives (ctz, compress etc)
*/
#ifndef BITUTILS_ARCH_X86_H
#define BITUTILS_ARCH_X86_H
#include "ue2common.h"
#include "util/popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"
#include "util/arch/common/bitutils.h"
static really_inline
u32 clz32_impl(u32 x) {
#if defined(_WIN32)
unsigned long r;
_BitScanReverse(&r, x);
return 31 - r;
#else
return clz32_impl_c(x);
#endif
}
static really_inline
u32 clz64_impl(u64a x) {
#if defined(_WIN64)
unsigned long r;
_BitScanReverse64(&r, x);
return 63 - r;
#elif defined(_WIN32)
unsigned long x1 = (u32)x;
unsigned long x2 = (u32)(x >> 32);
unsigned long r;
if (x2) {
_BitScanReverse(&r, x2);
return (u32)(31 - r);
}
_BitScanReverse(&r, (u32)x1);
return (u32)(63 - r);
#else
return clz64_impl_c(x);
#endif
}
// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32_impl(u32 x) {
#if defined(_WIN32)
unsigned long r;
_BitScanForward(&r, x);
return r;
#else
return ctz32_impl_c(x);
#endif
}
static really_inline
u32 ctz64_impl(u64a x) {
#if defined(_WIN64)
unsigned long r;
_BitScanForward64(&r, x);
return r;
#elif defined(_WIN32)
unsigned long r;
if (_BitScanForward(&r, (u32)x)) {
return (u32)r;
}
_BitScanForward(&r, (u32)(x >> 32));
return (u32)(r + 32);
#else
return ctz64_impl_c(x);
#endif
}
static really_inline
u32 lg2_impl(u32 x) {
return lg2_impl_c(x);
}
static really_inline
u64a lg2_64_impl(u64a x) {
return lg2_64_impl_c(x);
}
static really_inline
u32 findAndClearLSB_32_impl(u32 *v) {
#ifndef NO_ASM
u32 val = *v, offset;
__asm__ ("bsf %1, %0\n"
"btr %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
assert(offset < 32);
return offset;
#else
return findAndClearLSB_32_impl_c(v);
#endif
}
static really_inline
u32 findAndClearLSB_64_impl(u64a *v) {
#ifdef ARCH_64_BIT
#if !defined(NO_ASM)
u64a val = *v, offset;
__asm__ ("bsfq %1, %0\n"
"btrq %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
#else
// generic variant using gcc's builtin on 64-bit
u64a val = *v, offset;
offset = ctz64_impl(val);
*v = val & (val - 1);
#endif // NO_ASM
assert(offset < 64);
return (u32)offset;
#else
return findAndClearLSB_64_impl_c(v);
#endif
}
static really_inline
u32 findAndClearMSB_32_impl(u32 *v) {
#if !defined(NO_ASM)
u32 val = *v, offset;
__asm__ ("bsr %1, %0\n"
"btr %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
#else
u32 val = *v;
u32 offset = 31 - clz32_impl(val);
*v = val & ~(1 << offset);
#endif
assert(offset < 32);
return offset;
}
static really_inline
u32 findAndClearMSB_64_impl(u64a *v) {
#ifdef ARCH_64_BIT
#if !defined(NO_ASM)
u64a val = *v, offset;
__asm__ ("bsrq %1, %0\n"
"btrq %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
#else
// generic variant using gcc's builtin on 64-bit
u64a val = *v, offset;
offset = 63 - clz64_impl(val);
*v = val & ~(1ULL << offset);
#endif // NO_ASM
assert(offset < 64);
return (u32)offset;
#else
return findAndClearMSB_64_impl_c(v);
#endif
}
static really_inline
u32 compress32_impl(u32 x, u32 m) {
#if defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pext_u32(x, m);
#else
return compress32_impl_c(x, m);
#endif
}
static really_inline
u64a compress64_impl(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pext_u64(x, m);
#else
return compress64_impl_c(x, m);
#endif
}
static really_inline
m128 compress128_impl(m128 x, m128 m) {
return compress128_impl_c(x, m);
}
static really_inline
u32 expand32_impl(u32 x, u32 m) {
#if defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pdep_u32(x, m);
#else
return expand32_impl_c(x, m);
#endif
}
static really_inline
u64a expand64_impl(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pdep_u64(x, m);
#else
return expand64_impl_c(x, m);
#endif
}
/* returns the first set bit after begin (if not ~0U). If no bit is set after
* begin returns ~0U
*/
static really_inline
u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
if (begin != ~0U) {
/* switch off all bits at or below begin. Note: not legal to shift by
* the size of the datatype or larger. */
assert(begin <= 63);
bitfield &= ~((2ULL << begin) - 1);
}
if (!bitfield) {
return ~0U;
}
return ctz64_impl(bitfield);
}
static really_inline
char bf64_set_impl(u64a *bitfield, u32 i) {
return bf64_set_impl_c(bitfield, i);
}
static really_inline
void bf64_unset_impl(u64a *bitfield, u32 i) {
return bf64_unset_impl_c(bitfield, i);
}
static really_inline
u32 rank_in_mask32_impl(u32 mask, u32 bit) {
return rank_in_mask32_impl_c(mask, bit);
}
static really_inline
u32 rank_in_mask64_impl(u64a mask, u32 bit) {
return rank_in_mask64_impl_c(mask, bit);
}
static really_inline
u32 pext32_impl(u32 x, u32 mask) {
#if defined(HAVE_BMI2)
// Intel BMI2 can do this operation in one instruction.
return _pext_u32(x, mask);
#else
return pext32_impl_c(x, mask);
#endif
}
static really_inline
u64a pext64_impl(u64a x, u64a mask) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
// Intel BMI2 can do this operation in one instruction.
return _pext_u64(x, mask);
#else
return pext64_impl_c(x, mask);
#endif
}
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
static really_inline
u64a pdep64(u64a x, u64a mask) {
return _pdep_u64(x, mask);
}
#endif
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn_impl(const u32 a, const u8 *b) {
#if defined(HAVE_BMI) && !defined(NO_ASM)
u64a r;
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
return r;
#else
return andn_impl_c(a, b);
#endif
}
#endif // BITUTILS_ARCH_X86_H

View File

@ -26,7 +26,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpuid_flags.h"
#include "util/arch/common/cpuid_flags.h"
#include "cpuid_inline.h"
#include "ue2common.h"
#include "hs_compile.h" // for HS_MODE_ flags

View File

@ -30,7 +30,7 @@
#define CPUID_INLINE_H_
#include "ue2common.h"
#include "cpuid_flags.h"
#include "util/arch/common/cpuid_flags.h"
#if !defined(_WIN32) && !defined(CPUID_H_)
#include <cpuid.h>

82
src/util/arch/x86/crc32.h Normal file
View File

@ -0,0 +1,82 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UTIL_ARCH_X86_CRC32_H_
#define UTIL_ARCH_X86_CRC32_H_
#include "util/arch/x86/x86.h"
#include "util/intrinsics.h"
#ifdef ARCH_64_BIT
#define CRC_WORD 8
#define CRC_TYPE u64a
#define CRC_FUNC _mm_crc32_u64
#else
#define CRC_WORD 4
#define CRC_TYPE u32
#define CRC_FUNC _mm_crc32_u32
#endif
/*
* Use the crc32 instruction from SSE4.2 to compute our checksum - same
* polynomial as the generic software implementation.
*/
static really_inline
u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
const size_t length) {
u32 crc = running_crc;
// Process byte-by-byte until p_buf is aligned
const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
size_t init_bytes = aligned_buf - p_buf;
size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
size_t end_bytes = length - init_bytes - running_length;
while (p_buf < aligned_buf) {
crc = _mm_crc32_u8(crc, *p_buf++);
}
// Main aligned loop, processes a word at a time.
for (size_t li = 0; li < running_length/CRC_WORD; li++) {
CRC_TYPE block = *(const CRC_TYPE *)p_buf;
crc = CRC_FUNC(crc, block);
p_buf += CRC_WORD;
}
// Remaining bytes
for(size_t li = 0; li < end_bytes; li++) {
crc = _mm_crc32_u8(crc, *p_buf++);
}
return crc;
}
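/*
 * Usage sketch (illustrative helper, not used elsewhere; assumes SSE4.2
 * support has been verified at runtime and that seeding/finalisation is
 * handled by the caller, as in the common CRC32-C convention of an all-ones
 * seed and a final inversion).
 */
static really_inline
u32 crc32c_sse42_example(const unsigned char *buf, size_t len) {
u32 crc = 0xffffffffU;            // all-ones seed (assumed caller-side convention)
crc = crc32c_sse42(crc, buf, len);
return ~crc;                      // final inversion (assumed caller-side convention)
}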
#endif // UTIL_ARCH_X86_CRC32_H_

View File

@ -29,12 +29,12 @@
#ifndef MASKED_MOVE_H
#define MASKED_MOVE_H
#include "arch.h"
#include "x86.h"
#if defined(HAVE_AVX2)
#include "unaligned.h"
#include "simd_utils.h"
#include "util/unaligned.h"
#include "util/simd_utils.h"
#ifdef __cplusplus
extern "C" {

View File

@ -0,0 +1,45 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIMD_TYPES_X86_H
#define SIMD_TYPES_X86_H
#if !defined(m128) && defined(HAVE_SSE2)
typedef __m128i m128;
#endif
#if !defined(m256) && defined(HAVE_AVX2)
typedef __m256i m256;
#endif
#if !defined(m512) && defined(HAVE_AVX512)
typedef __m512i m512;
#endif
#endif /* SIMD_TYPES_X86_H */

View File

@ -0,0 +1,772 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief SIMD types and primitive operations.
*/
#ifndef ARCH_X86_SIMD_UTILS_H
#define ARCH_X86_SIMD_UTILS_H
#include "x86.h"
#include "ue2common.h"
#include "util/simd_types.h"
#include "util/unaligned.h"
#include "util/intrinsics.h"
#include <string.h> // for memcpy
static really_inline m128 ones128(void) {
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
/* gcc gets this right */
return _mm_set1_epi8(0xFF);
#else
/* trick from Intel's optimization guide to generate all-ones.
* ICC converts this to the single cmpeq instruction */
return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
#endif
}
static really_inline m128 zeroes128(void) {
return _mm_setzero_si128();
}
/** \brief Bitwise not for m128*/
static really_inline m128 not128(m128 a) {
return _mm_xor_si128(a, ones128());
}
/** \brief Return 1 if a and b are different otherwise 0 */
static really_inline int diff128(m128 a, m128 b) {
return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
}
static really_inline int isnonzero128(m128 a) {
return !!diff128(a, zeroes128());
}
/**
* "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline u32 diffrich128(m128 a, m128 b) {
a = _mm_cmpeq_epi32(a, b);
return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
}
/**
* "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
* returns a 4-bit mask indicating which 64-bit words contain differences.
*/
static really_inline u32 diffrich64_128(m128 a, m128 b) {
#if defined(HAVE_SSE41)
a = _mm_cmpeq_epi64(a, b);
return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
#else
u32 d = diffrich128(a, b);
return (d | (d >> 1)) & 0x5;
#endif
}
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return _mm_slli_epi64(a, b);
}
#endif
m128 x = _mm_cvtsi32_si128(b);
return _mm_sll_epi64(a, x);
}
#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
static really_inline m128 set1_16x8(u8 c) {
return _mm_set1_epi8(c);
}
static really_inline m128 set1_4x32(u32 c) {
return _mm_set1_epi32(c);
}
static really_inline m128 set1_2x64(u64a c) {
return _mm_set1_epi64x(c);
}
static really_inline u32 movd(const m128 in) {
return _mm_cvtsi128_si32(in);
}
static really_inline u64a movq(const m128 in) {
return _mm_cvtsi128_si64(in);
}
/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
return _mm_set_epi64x(0LL, *p);
}
#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
#if defined(HAVE_SSE41)
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
#else
#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
#endif
#if !defined(HAVE_AVX2)
// TODO: this entire file needs restructuring - this carveout is awful
#define extractlow64from256(a) movq(a.lo)
#define extractlow32from256(a) movd(a.lo)
#if defined(HAVE_SSE41)
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
#else
#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
#endif
#endif // !AVX2
static really_inline m128 and128(m128 a, m128 b) {
return _mm_and_si128(a,b);
}
static really_inline m128 xor128(m128 a, m128 b) {
return _mm_xor_si128(a,b);
}
static really_inline m128 or128(m128 a, m128 b) {
return _mm_or_si128(a,b);
}
static really_inline m128 andnot128(m128 a, m128 b) {
return _mm_andnot_si128(a, b);
}
// aligned load
static really_inline m128 load128(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m128)));
ptr = assume_aligned(ptr, 16);
return _mm_load_si128((const m128 *)ptr);
}
// aligned store
static really_inline void store128(void *ptr, m128 a) {
assert(ISALIGNED_N(ptr, alignof(m128)));
ptr = assume_aligned(ptr, 16);
*(m128 *)ptr = a;
}
// unaligned load
static really_inline m128 loadu128(const void *ptr) {
return _mm_loadu_si128((const m128 *)ptr);
}
// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
_mm_storeu_si128 ((m128 *)ptr, a);
}
// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
m128 a = zeroes128();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
#ifdef __cplusplus
extern "C" {
#endif
extern const u8 simd_onebit_masks[];
#ifdef __cplusplus
}
#endif
static really_inline
m128 mask1bit128(unsigned int n) {
assert(n < sizeof(m128) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu128(&simd_onebit_masks[mask_idx]);
}
// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
*ptr = or128(mask1bit128(n), *ptr);
}
// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
*ptr = andnot128(mask1bit128(n), *ptr);
}
// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
const m128 mask = mask1bit128(n);
#if defined(HAVE_SSE41)
return !_mm_testz_si128(mask, val);
#else
return isnonzero128(and128(mask, val));
#endif
}
// offset must be an immediate
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
static really_inline
m128 pshufb_m128(m128 a, m128 b) {
m128 result;
result = _mm_shuffle_epi8(a, b);
return result;
}
static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
assert(amount >= -16 && amount <= 16);
m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
return pshufb_m128(in, shift_mask);
}
static really_inline
m128 max_u8_m128(m128 a, m128 b) {
return _mm_max_epu8(a, b);
}
static really_inline
m128 min_u8_m128(m128 a, m128 b) {
return _mm_min_epu8(a, b);
}
static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
return _mm_adds_epu8(a, b);
}
static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
return _mm_sub_epi8(a, b);
}
static really_inline
m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
return _mm_set_epi32(x3, x2, x1, x0);
}
static really_inline
m128 set2x64(u64a hi, u64a lo) {
return _mm_set_epi64x(hi, lo);
}
/****
**** 256-bit Primitives
****/
#if defined(HAVE_SIMD_256_BITS) && defined(HAVE_AVX2)
static really_inline
m256 pshufb_m256(m256 a, m256 b) {
return _mm256_shuffle_epi8(a, b);
}
static really_really_inline
m256 lshift64_m256(m256 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return _mm256_slli_epi64(a, b);
}
#endif
m128 x = _mm_cvtsi32_si128(b);
return _mm256_sll_epi64(a, x);
}
#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
static really_inline m256 set1_4x64(u64a c) {
return _mm256_set1_epi64x(c);
}
#define eq256(a, b) _mm256_cmpeq_epi8((a), (b))
#define movemask256(a) ((u32)_mm256_movemask_epi8((a)))
static really_inline
m256 set1_2x128(m128 a) {
return _mm256_broadcastsi128_si256(a);
}
static really_inline m256 zeroes256(void) {
return _mm256_setzero_si256();
}
static really_inline m256 ones256(void) {
m256 rv = _mm256_set1_epi8(0xFF);
return rv;
}
static really_inline m256 and256(m256 a, m256 b) {
return _mm256_and_si256(a, b);
}
static really_inline m256 or256(m256 a, m256 b) {
return _mm256_or_si256(a, b);
}
static really_inline m256 xor256(m256 a, m256 b) {
return _mm256_xor_si256(a, b);
}
static really_inline m256 not256(m256 a) {
return _mm256_xor_si256(a, ones256());
}
static really_inline m256 andnot256(m256 a, m256 b) {
return _mm256_andnot_si256(a, b);
}
static really_inline int diff256(m256 a, m256 b) {
return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
}
static really_inline int isnonzero256(m256 a) {
return !!diff256(a, zeroes256());
}
/**
* "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline u32 diffrich256(m256 a, m256 b) {
a = _mm256_cmpeq_epi32(a, b);
return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
}
/**
* "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
* returns an 8-bit mask indicating which 64-bit words contain differences.
*/
static really_inline u32 diffrich64_256(m256 a, m256 b) {
u32 d = diffrich256(a, b);
return (d | (d >> 1)) & 0x55555555;
}
// aligned load
static really_inline m256 load256(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m256)));
return _mm256_load_si256((const m256 *)ptr);
}
// aligned load of 128-bit value to low and high part of 256-bit value
static really_inline m256 load2x128(const void *ptr) {
return set1_2x128(load128(ptr));
}
static really_inline m256 loadu2x128(const void *ptr) {
return set1_2x128(loadu128(ptr));
}
// aligned store
static really_inline void store256(void *ptr, m256 a) {
assert(ISALIGNED_N(ptr, alignof(m256)));
_mm256_store_si256((m256 *)ptr, a);
}
// unaligned load
static really_inline m256 loadu256(const void *ptr) {
return _mm256_loadu_si256((const m256 *)ptr);
}
// unaligned store
static really_inline void storeu256(void *ptr, m256 a) {
_mm256_storeu_si256((m256 *)ptr, a);
}
// packed unaligned store of first N bytes
static really_inline
void storebytes256(void *ptr, m256 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m256 loadbytes256(const void *ptr, unsigned int n) {
m256 a = zeroes256();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
static really_inline
m256 mask1bit256(unsigned int n) {
assert(n < sizeof(m256) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu256(&simd_onebit_masks[mask_idx]);
}
static really_inline
m256 set1_32x8(u32 in) {
return _mm256_set1_epi8(in);
}
static really_inline
m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) {
return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0);
}
static really_inline
m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
}
// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
*ptr = or256(mask1bit256(n), *ptr);
}
static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
*ptr = andnot256(mask1bit256(n), *ptr);
}
// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
const m256 mask = mask1bit256(n);
return !_mm256_testz_si256(mask, val);
}
static really_really_inline
m128 movdq_hi(m256 x) {
return _mm256_extracti128_si256(x, 1);
}
static really_really_inline
m128 movdq_lo(m256 x) {
return _mm256_extracti128_si256(x, 0);
}
#define cast256to128(a) _mm256_castsi256_si128(a)
#define cast128to256(a) _mm256_castsi128_si256(a)
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
#define extractlow32from256(a) movd(cast256to128(a))
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)
static really_inline
m256 combine2x128(m128 hi, m128 lo) {
#if defined(_mm256_set_m128i)
return _mm256_set_m128i(hi, lo);
#else
return insert128to256(cast128to256(lo), hi, 1);
#endif
}
#endif //AVX2
#if defined(HAVE_SIMD_128_BITS)
/**
* "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline u32 diffrich384(m384 a, m384 b) {
m128 z = zeroes128();
a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
_mm_packs_epi32(a.hi, z));
return ~(_mm_movemask_epi8(packed)) & 0xfff;
}
#endif // HAVE_SIMD_128_BITS
/****
**** 512-bit Primitives
****/
#if defined(HAVE_SIMD_512_BITS)
#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
#define set2x256(a) _mm512_broadcast_i64x4(a)
#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
static really_inline u32 movd512(const m512 in) {
// NOTE: gcc does not seem to support _mm512_cvtsi512_si32(in),
// so we use a 2-step conversion to work around it.
return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
}
static really_inline
m512 pshufb_m512(m512 a, m512 b) {
return _mm512_shuffle_epi8(a, b);
}
static really_inline
m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
return _mm512_maskz_shuffle_epi8(k, a, b);
}
#if defined(HAVE_AVX512VBMI)
#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a)
#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a)
#endif
#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))
static really_inline
m512 zeroes512(void) {
#if defined(HAVE_AVX512)
return _mm512_setzero_si512();
#else
m512 rv = {zeroes256(), zeroes256()};
return rv;
#endif
}
static really_inline
m512 ones512(void) {
return _mm512_set1_epi8(0xFF);
//return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
}
static really_inline
m512 set1_64x8(u8 a) {
return _mm512_set1_epi8(a);
}
static really_inline
m512 set1_8x64(u64a a) {
return _mm512_set1_epi64(a);
}
static really_inline
m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
lo_3, lo_2, lo_1, lo_0);
}
static really_inline
m512 swap256in512(m512 a) {
m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
return vpermq512(idx, a);
}
static really_inline
m512 set1_4x128(m128 a) {
return _mm512_broadcast_i32x4(a);
}
static really_inline
m512 and512(m512 a, m512 b) {
return _mm512_and_si512(a, b);
}
static really_inline
m512 or512(m512 a, m512 b) {
return _mm512_or_si512(a, b);
}
static really_inline
m512 xor512(m512 a, m512 b) {
return _mm512_xor_si512(a, b);
}
static really_inline
m512 not512(m512 a) {
return _mm512_xor_si512(a, ones512());
}
static really_inline
m512 andnot512(m512 a, m512 b) {
return _mm512_andnot_si512(a, b);
}
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return _mm512_slli_epi64(a, b);
}
#endif
m128 x = _mm_cvtsi32_si128(b);
return _mm512_sll_epi64(a, x);
}
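The __builtin_constant_p test above simply lets a compile-time-constant shift
count take the immediate (slli) form, while a runtime count falls through to
the register (sll) form. A hedged illustration of the two paths:

m512 a = zeroes512();
m512 c = lshift64_m512(a, 4);    /* literal count: the slli branch can be taken */
unsigned n = 4;                  /* hypothetical value only known at runtime    */
m512 d = lshift64_m512(a, n);    /* variable count: goes via _mm512_sll_epi64   */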
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
#if !defined(_MM_CMPINT_NE)
#define _MM_CMPINT_NE 0x4
#endif
static really_inline
int diff512(m512 a, m512 b) {
return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
}
static really_inline
int isnonzero512(m512 a) {
return diff512(a, zeroes512());
}
/**
* "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline
u32 diffrich512(m512 a, m512 b) {
return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
}
/**
* "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
* returns a 16-bit mask indicating which 64-bit words contain differences.
*/
static really_inline
u32 diffrich64_512(m512 a, m512 b) {
//TODO: cmp_epi64?
u32 d = diffrich512(a, b);
return (d | (d >> 1)) & 0x55555555;
}
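Since each 64-bit lane spans two adjacent bits of the diffrich512() result, the
OR-and-mask step above collapses every pair into a single flag at the even bit
position. A small worked example with illustrative values:

u32 d = 0x000C;                       /* 32-bit words 2 and 3 differ           */
u32 r = (d | (d >> 1)) & 0x55555555;  /* r == 0x0004: 64-bit word 1 is flagged */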
// aligned load
static really_inline
m512 load512(const void *ptr) {
return _mm512_load_si512(ptr);
}
// aligned store
static really_inline
void store512(void *ptr, m512 a) {
assert(ISALIGNED_N(ptr, alignof(m512)));
return _mm512_store_si512(ptr, a);
}
// unaligned load
static really_inline
m512 loadu512(const void *ptr) {
return _mm512_loadu_si512(ptr);
}
static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
return _mm512_maskz_loadu_epi8(k, ptr);
}
static really_inline
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
return _mm512_mask_loadu_epi8(src, k, ptr);
}
static really_inline
m512 set_mask_m512(__mmask64 k) {
return _mm512_movm_epi8(k);
}
// packed unaligned store of first N bytes
static really_inline
void storebytes512(void *ptr, m512 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m512 loadbytes512(const void *ptr, unsigned int n) {
m512 a = zeroes512();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
static really_inline
m512 mask1bit512(unsigned int n) {
assert(n < sizeof(m512) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu512(&simd_onebit_masks[mask_idx]);
}
// switches on bit N in the given vector.
static really_inline
void setbit512(m512 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
*ptr = or512(mask1bit512(n), *ptr);
}
// switches off bit N in the given vector.
static really_inline
void clearbit512(m512 *ptr, unsigned int n) {
assert(n < sizeof(*ptr) * 8);
*ptr = andnot512(mask1bit512(n), *ptr);
}
// tests bit N in the given vector.
static really_inline
char testbit512(m512 val, unsigned int n) {
assert(n < sizeof(val) * 8);
const m512 mask = mask1bit512(n);
return !!_mm512_test_epi8_mask(mask, val);
}
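For orientation, a short usage sketch of the single-bit helpers above (nothing
here is taken from the patch itself, only the functions it defines):

m512 v = zeroes512();
setbit512(&v, 100);             /* switch bit 100 on                */
assert(testbit512(v, 100));     /* it should now read back as set   */
clearbit512(&v, 100);           /* switch it off again              */
assert(!isnonzero512(v));       /* the vector is all-zero once more */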
#endif // HAVE_SIMD_512_BITS
#endif // ARCH_X86_SIMD_UTILS_H

96
src/util/arch/x86/x86.h Normal file
View File

@ -0,0 +1,96 @@
/*
* Copyright (c) 2017-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Per-platform architecture definitions
*/
#ifndef UTIL_ARCH_X86_H_
#define UTIL_ARCH_X86_H_
#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
#define HAVE_SSE2
#define HAVE_SIMD_128_BITS
#endif
#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE41
#define HAVE_SIMD_128_BITS
#endif
#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE42
#define HAVE_SIMD_128_BITS
#endif
#if defined(__AVX__)
#define HAVE_AVX
#define HAVE_SIMD_256_BITS
#endif
#if defined(__AVX2__)
#define HAVE_AVX2
#define HAVE_SIMD_256_BITS
#endif
#if defined(__AVX512BW__)
#define HAVE_AVX512
#define HAVE_SIMD_512_BITS
#endif
#if defined(__AVX512VBMI__)
#define HAVE_AVX512VBMI
#endif
/*
* ICC and MSVC don't break out POPCNT or BMI/BMI2 as separate pre-defined macros
*/
#if defined(__POPCNT__) || \
(defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \
(defined(_WIN32) && defined(__AVX__))
#define HAVE_POPCOUNT_INSTR
#endif
#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \
(defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI
#endif
#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
(defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI2
#endif
/*
* MSVC uses a different form of inline asm
*/
#if defined(_WIN32) && defined(_MSC_VER)
#define NO_ASM
#endif
#endif // UTIL_ARCH_X86_H_
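As a rough sketch of how these feature macros are meant to be consumed (an
assumption for illustration, not code from this change), callers can key off
the widest HAVE_SIMD_*_BITS level that ends up defined:

#if defined(HAVE_SIMD_512_BITS)
#define EXAMPLE_VECTOR_BYTES 64    /* hypothetical consumer macro */
#elif defined(HAVE_SIMD_256_BITS)
#define EXAMPLE_VECTOR_BYTES 32
#elif defined(HAVE_SIMD_128_BITS)
#define EXAMPLE_VECTOR_BYTES 16
#endif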

View File

@ -33,6 +33,7 @@
#ifndef BITUTILS_H
#define BITUTILS_H
#include "config.h"
#include "ue2common.h"
#include "popcount.h"
#include "util/arch.h"
@ -43,351 +44,95 @@
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/bitutils.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/bitutils.h"
#endif
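Each wrapper below now forwards to a per-architecture _impl routine supplied by
the header selected above. As a sketch of the kind of definition one of those
headers could provide (hypothetical, not quoted from either arch header):

static really_inline
u32 clz32_impl(u32 x) {
    /* assumes a GCC/Clang-style builtin is available on this target */
    return (u32)__builtin_clz(x);
}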
static really_inline
u32 clz32(u32 x) {
assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
unsigned long r;
_BitScanReverse(&r, x);
return 31 - r;
#else
return (u32)__builtin_clz(x);
#endif
return clz32_impl(x);
}
static really_inline
u32 clz64(u64a x) {
assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
unsigned long r;
_BitScanReverse64(&r, x);
return 63 - r;
#elif defined(_WIN32)
unsigned long x1 = (u32)x;
unsigned long x2 = (u32)(x >> 32);
unsigned long r;
if (x2) {
_BitScanReverse(&r, x2);
return (u32)(31 - r);
}
_BitScanReverse(&r, (u32)x1);
return (u32)(63 - r);
#else
return (u32)__builtin_clzll(x);
#endif
return clz64_impl(x);
}
// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32(u32 x) {
assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
unsigned long r;
_BitScanForward(&r, x);
return r;
#else
return (u32)__builtin_ctz(x);
#endif
return ctz32_impl(x);
}
static really_inline
u32 ctz64(u64a x) {
assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
unsigned long r;
_BitScanForward64(&r, x);
return r;
#elif defined(_WIN32)
unsigned long r;
if (_BitScanForward(&r, (u32)x)) {
return (u32)r;
}
_BitScanForward(&r, x >> 32);
return (u32)(r + 32);
#else
return (u32)__builtin_ctzll(x);
#endif
return ctz64_impl(x);
}
static really_inline
u32 lg2(u32 x) {
if (!x) {
return 0;
}
return 31 - clz32(x);
return lg2_impl(x);
}
static really_inline
u64a lg2_64(u64a x) {
if (!x) {
return 0;
}
return 63 - clz64(x);
return lg2_64_impl(x);
}
static really_inline
u32 findAndClearLSB_32(u32 *v) {
assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
u32 val = *v, offset;
__asm__ ("bsf %1, %0\n"
"btr %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
#else
u32 val = *v;
u32 offset = ctz32(val);
*v = val & (val - 1);
#endif
assert(offset < 32);
return offset;
return findAndClearLSB_32_impl(v);
}
static really_inline
u32 findAndClearLSB_64(u64a *v) {
assert(*v != 0); // behaviour not defined in this case
#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
u64a val = *v, offset;
__asm__ ("bsfq %1, %0\n"
"btrq %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
#else
// generic variant using gcc's builtin on 64-bit
u64a val = *v, offset;
offset = ctz64(val);
*v = val & (val - 1);
#endif // ARCH_X86_64
#else
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
// inline calls to __builtin_ctzll
u32 v1 = (u32)*v;
u32 v2 = (u32)(*v >> 32);
u32 offset;
if (v1) {
offset = findAndClearLSB_32(&v1);
*v = (u64a)v1 | ((u64a)v2 << 32);
} else {
offset = findAndClearLSB_32(&v2) + 32;
*v = (u64a)v2 << 32;
}
#endif
assert(offset < 64);
return (u32)offset;
return findAndClearLSB_64_impl(v);
}
static really_inline
u32 findAndClearMSB_32(u32 *v) {
assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
u32 val = *v, offset;
__asm__ ("bsr %1, %0\n"
"btr %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
#else
u32 val = *v;
u32 offset = 31 - clz32(val);
*v = val & ~(1 << offset);
#endif
assert(offset < 32);
return offset;
return findAndClearMSB_32_impl(v);
}
static really_inline
u32 findAndClearMSB_64(u64a *v) {
assert(*v != 0); // behaviour not defined in this case
#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
u64a val = *v, offset;
__asm__ ("bsrq %1, %0\n"
"btrq %0, %1\n"
: "=r" (offset), "=r" (val)
: "1" (val));
*v = val;
#else
// generic variant using gcc's builtin on 64-bit
u64a val = *v, offset;
offset = 63 - clz64(val);
*v = val & ~(1ULL << offset);
#endif // ARCH_X86_64
#else
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
// inline calls to __builtin_ctzll
u32 v1 = (u32)*v;
u32 v2 = (*v >> 32);
u32 offset;
if (v2) {
offset = findAndClearMSB_32(&v2) + 32;
*v = ((u64a)v2 << 32) | (u64a)v1;
} else {
offset = findAndClearMSB_32(&v1);
*v = (u64a)v1;
}
#endif
assert(offset < 64);
return (u32)offset;
return findAndClearMSB_64_impl(v);
}
static really_inline
u32 compress32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pext_u32(x, m);
#else
// Return zero quickly on trivial cases
if ((x & m) == 0) {
return 0;
}
u32 mk, mp, mv, t;
x &= m; // clear irrelevant bits
mk = ~m << 1; // we will count 0's to right
for (u32 i = 0; i < 5; i++) {
mp = mk ^ (mk << 1);
mp ^= mp << 2;
mp ^= mp << 4;
mp ^= mp << 8;
mp ^= mp << 16;
mv = mp & m; // bits to move
m = (m ^ mv) | (mv >> (1 << i)); // compress m
t = x & mv;
x = (x ^ t) | (t >> (1 << i)); // compress x
mk = mk & ~mp;
}
return x;
#endif
return compress32_impl(x, m);
}
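Whichever _impl gets selected, compress32() keeps PEXT semantics: the bits of x
picked out by m are packed into the low-order bits of the result. A tiny worked
example with illustrative values:

u32 x = 0xA5A5A5A5;
u32 m = 0x0000FF00;              /* select bits 8..15 */
u32 r = compress32(x, m);        /* r == 0x000000A5   */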
static really_inline
u64a compress64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pext_u64(x, m);
#else
return compress64_impl(x, m);
}
// Return zero quickly on trivial cases
if ((x & m) == 0) {
return 0;
}
u64a mk, mp, mv, t;
x &= m; // clear irrelevant bits
mk = ~m << 1; // we will count 0's to right
for (u32 i = 0; i < 6; i++) {
mp = mk ^ (mk << 1);
mp ^= mp << 2;
mp ^= mp << 4;
mp ^= mp << 8;
mp ^= mp << 16;
mp ^= mp << 32;
mv = mp & m; // bits to move
m = (m ^ mv) | (mv >> (1 << i)); // compress m
t = x & mv;
x = (x ^ t) | (t >> (1 << i)); // compress x
mk = mk & ~mp;
}
return x;
#endif
static really_inline
m128 compress128(m128 x, m128 m) {
return compress128_impl(x, m);
}
static really_inline
u32 expand32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pdep_u32(x, m);
#else
// Return zero quickly on trivial cases
if (!x || !m) {
return 0;
}
u32 m0, mk, mp, mv, t;
u32 array[5];
m0 = m; // save original mask
mk = ~m << 1; // we will count 0's to right
for (int i = 0; i < 5; i++) {
mp = mk ^ (mk << 1); // parallel suffix
mp = mp ^ (mp << 2);
mp = mp ^ (mp << 4);
mp = mp ^ (mp << 8);
mp = mp ^ (mp << 16);
mv = mp & m; // bits to move
array[i] = mv;
m = (m ^ mv) | (mv >> (1 << i)); // compress m
mk = mk & ~mp;
}
for (int i = 4; i >= 0; i--) {
mv = array[i];
t = x << (1 << i);
x = (x & ~mv) | (t & mv);
}
return x & m0; // clear out extraneous bits
#endif
return expand32_impl(x, m);
}
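expand32() is the inverse scatter (PDEP semantics): the low-order bits of x are
deposited into the positions selected by m. Continuing the example above:

u32 r2 = expand32(0x000000A5, 0x0000FF00);   /* r2 == 0x0000A500 */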
static really_inline
u64a expand64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
// BMI2 has a single instruction for this operation.
return _pdep_u64(x, m);
#else
// Return zero quickly on trivial cases
if (!x || !m) {
return 0;
}
u64a m0, mk, mp, mv, t;
u64a array[6];
m0 = m; // save original mask
mk = ~m << 1; // we will count 0's to right
for (int i = 0; i < 6; i++) {
mp = mk ^ (mk << 1); // parallel suffix
mp = mp ^ (mp << 2);
mp = mp ^ (mp << 4);
mp = mp ^ (mp << 8);
mp = mp ^ (mp << 16);
mp = mp ^ (mp << 32);
mv = mp & m; // bits to move
array[i] = mv;
m = (m ^ mv) | (mv >> (1 << i)); // compress m
mk = mk & ~mp;
}
for (int i = 5; i >= 0; i--) {
mv = array[i];
t = x << (1 << i);
x = (x & ~mv) | (t & mv);
}
return x & m0; // clear out extraneous bits
#endif
return expand64_impl(x, m);
}
@ -396,97 +141,45 @@ u64a expand64(u64a x, u64a m) {
*/
static really_inline
u32 bf64_iterate(u64a bitfield, u32 begin) {
if (begin != ~0U) {
/* switch off all bits at or below begin. Note: not legal to shift by
* by size of the datatype or larger. */
assert(begin <= 63);
bitfield &= ~((2ULL << begin) - 1);
}
if (!bitfield) {
return ~0U;
}
return ctz64(bitfield);
return bf64_iterate_impl(bitfield, begin);
}
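The iteration contract (begin == ~0U starts the scan; ~0U signals exhaustion)
lends itself to a simple loop. A hedged usage sketch:

u64a bits = 0x8001;   /* hypothetical bitfield: bits 0 and 15 set */
for (u32 i = bf64_iterate(bits, ~0U); i != ~0U; i = bf64_iterate(bits, i)) {
    /* visits i == 0, then i == 15 */
}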
static really_inline
char bf64_set(u64a *bitfield, u32 i) {
assert(i < 64);
u64a mask = 1ULL << i;
char was_set = !!(*bitfield & mask);
*bitfield |= mask;
return was_set;
return bf64_set_impl(bitfield, i);
}
static really_inline
void bf64_unset(u64a *bitfield, u32 i) {
assert(i < 64);
*bitfield &= ~(1ULL << i);
return bf64_unset_impl(bitfield, i);
}
static really_inline
u32 rank_in_mask32(u32 mask, u32 bit) {
assert(bit < sizeof(u32) * 8);
assert(mask & (u32)(1U << bit));
mask &= (u32)(1U << bit) - 1;
return popcount32(mask);
return rank_in_mask32_impl(mask, bit);
}
static really_inline
u32 rank_in_mask64(u64a mask, u32 bit) {
assert(bit < sizeof(u64a) * 8);
assert(mask & (u64a)(1ULL << bit));
mask &= (u64a)(1ULL << bit) - 1;
return popcount64(mask);
return rank_in_mask64_impl(mask, bit);
}
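Both rank helpers return how many set bits of mask lie strictly below bit, i.e.
the position of bit among the set bits. For instance, with illustrative values:

u32 r = rank_in_mask32(0x16, 4);   /* mask 0b10110: bits 1, 2 and 4 set; r == 2 */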
static really_inline
u32 pext32(u32 x, u32 mask) {
#if defined(HAVE_BMI2)
// Intel BMI2 can do this operation in one instruction.
return _pext_u32(x, mask);
#else
u32 result = 0, num = 1;
while (mask != 0) {
u32 bit = findAndClearLSB_32(&mask);
if (x & (1U << bit)) {
assert(num != 0); // more than 32 bits!
result |= num;
}
num <<= 1;
}
return result;
#endif
return pext32_impl(x, mask);
}
static really_inline
u64a pext64(u64a x, u64a mask) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
// Intel BMI2 can do this operation in one instruction.
return _pext_u64(x, mask);
#else
u32 result = 0, num = 1;
while (mask != 0) {
u32 bit = findAndClearLSB_64(&mask);
if (x & (1ULL << bit)) {
assert(num != 0); // more than 32 bits!
result |= num;
}
num <<= 1;
}
return result;
#endif
return pext64_impl(x, mask);
}
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a pdep64(u64a x, u64a mask) {
return _pdep_u64(x, mask);
u64a andn(const u32 a, const u8 *b) {
return andn_impl_c(a, b);
}
#endif
#endif // BITUTILS_H

View File

@ -45,6 +45,10 @@
# endif
#endif
#if defined(HAVE_C_ARM_NEON_H)
# define USE_ARM_NEON_H
#endif
#ifdef __cplusplus
# if defined(HAVE_CXX_INTRIN_H)
# define USE_INTRIN_H
@ -59,6 +63,8 @@
#include <x86intrin.h>
#elif defined(USE_INTRIN_H)
#include <intrin.h>
#elif defined(USE_ARM_NEON_H)
#include <arm_neon.h>
#else
#error no intrinsics file
#endif

View File

@ -34,22 +34,22 @@
#include "util/intrinsics.h"
#include "ue2common.h"
#if defined(HAVE_SSE2)
typedef __m128i m128;
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/simd_types.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/simd_types.h"
#endif
#if !defined(m128) && !defined(HAVE_SIMD_128_BITS)
typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
#endif
#if defined(HAVE_AVX2)
typedef __m256i m256;
#else
#if !defined(m256) && !defined(HAVE_SIMD_256_BITS)
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
#endif
typedef struct {m128 lo; m128 mid; m128 hi;} m384;
#if defined(HAVE_AVX512)
typedef __m512i m512;
#else
#if !defined(m512) && !defined(HAVE_SIMD_512_BITS)
typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
#endif

File diff suppressed because it is too large

View File

@ -108,20 +108,21 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) {
static really_inline
void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) {
// First, decompose our vectors into 64-bit chunks.
u64a x[2];
memcpy(x, &xvec, sizeof(xvec));
u64a m[2];
memcpy(m, &mvec, sizeof(mvec));
u64a ALIGN_ATTR(16) x[2];
u64a ALIGN_ATTR(16) m[2];
store128(m, mvec);
store128(x, xvec);
// Count the number of bits of compressed state we're writing out per
// chunk.
u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) };
// Compress each 64-bit chunk individually.
u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) };
xvec = compress128(xvec, mvec);
store128(x, xvec);
// Write packed data out.
pack_bits_64(ptr, v, bits, 2);
pack_bits_64(ptr, x, bits, 2);
}
#endif
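The state_compress unit test further down checks exactly the property these
store/load pairs are meant to preserve; restated as a compact sketch (val and
mask are assumed to be already-populated m128 values):

char buf[sizeof(m128)] = { 0 };
storecompressed128(&buf, &val, &mask, 0);
m128 val_out;
loadcompressed128(&val_out, &buf, &mask, 0);
assert(!diff128(and128(val, mask), val_out));   /* only the masked bits survive */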
@ -150,7 +151,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
expand32(v[2], m[2]), expand32(v[3], m[3]) };
return _mm_set_epi32(x[3], x[2], x[1], x[0]);
return set4x32(x[3], x[2], x[1], x[0]);
}
#endif
@ -158,16 +159,17 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
static really_inline
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
// First, decompose our vectors into 64-bit chunks.
u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) };
u64a ALIGN_ATTR(16) m[2];
store128(m, mvec);
u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
u64a v[2];
u64a ALIGN_ATTR(16) v[2];
unpack_bits_64(v, (const u8 *)ptr, bits, 2);
u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };
return _mm_set_epi64x(x[1], x[0]);
return set2x64(x[1], x[0]);
}
#endif
@ -215,10 +217,10 @@ void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) {
static really_really_inline
void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) {
// First, decompose our vectors into 64-bit chunks.
u64a x[4];
memcpy(x, &xvec, sizeof(xvec));
u64a m[4];
memcpy(m, &mvec, sizeof(mvec));
u64a ALIGN_ATTR(32) x[4];
u64a ALIGN_ATTR(32) m[4];
store256(x, xvec);
store256(m, mvec);
// Count the number of bits of compressed state we're writing out per
// chunk.
@ -264,11 +266,11 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
expand32(v[6], m[6]), expand32(v[7], m[7]) };
#if !defined(HAVE_AVX2)
m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
.hi = set4x32(x[7], x[6], x[5], x[4]) };
#else
m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
m256 xvec = set8x32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
#endif
return xvec;
}
@ -291,10 +293,10 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
expand64(v[2], m[2]), expand64(v[3], m[3]) };
#if !defined(HAVE_AVX2)
m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
.hi = _mm_set_epi64x(x[3], x[2]) };
m256 xvec = { .lo = set2x64(x[1], x[0]),
.hi = set2x64(x[3], x[2]) };
#else
m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
m256 xvec = set4x64(x[3], x[2], x[1], x[0]);
#endif
return xvec;
}
@ -402,9 +404,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
expand32(v[8], m[8]), expand32(v[9], m[9]),
expand32(v[10], m[10]), expand32(v[11], m[11]) };
m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
.mid = _mm_set_epi32(x[7], x[6], x[5], x[4]),
.hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) };
m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
.mid = set4x32(x[7], x[6], x[5], x[4]),
.hi = set4x32(x[11], x[10], x[9], x[8]) };
return xvec;
}
#endif
@ -427,9 +429,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
expand64(v[2], m[2]), expand64(v[3], m[3]),
expand64(v[4], m[4]), expand64(v[5], m[5]) };
m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
.mid = _mm_set_epi64x(x[3], x[2]),
.hi = _mm_set_epi64x(x[5], x[4]) };
m384 xvec = { .lo = set2x64(x[1], x[0]),
.mid = set2x64(x[3], x[2]),
.hi = set2x64(x[5], x[4]) };
return xvec;
}
#endif
@ -548,20 +550,20 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
m512 xvec;
#if defined(HAVE_AVX512)
xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8],
x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
xvec = set32x16(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8],
x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8]);
xvec.lo = set8x32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
xvec.hi = set8x32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8]);
#else
xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]);
xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]);
xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]);
xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]);
#endif
return xvec;
}
@ -588,16 +590,16 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
expand64(v[6], m[6]), expand64(v[7], m[7]) };
#if defined(HAVE_AVX512)
m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
m512 xvec = set64x8(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
.hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]),
.hi = set4x64(x[7], x[6], x[5], x[4])};
#else
m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
_mm_set_epi64x(x[3], x[2]) },
.hi = { _mm_set_epi64x(x[5], x[4]),
_mm_set_epi64x(x[7], x[6]) } };
m512 xvec = { .lo = { set2x64(x[1], x[0]),
set2x64(x[3], x[2]) },
.hi = { set2x64(x[5], x[4]),
set2x64(x[7], x[6]) } };
#endif
return xvec;
}

View File

@ -29,7 +29,10 @@
#include "hs_compile.h" // for various hs_platform_info flags
#include "target_info.h"
#include "util/cpuid_flags.h"
#include "util/arch/common/cpuid_flags.h"
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#endif
namespace ue2 {

View File

@ -658,34 +658,41 @@ TEST(SimdUtilsTest, movq) {
char cmp[sizeof(m128)];
memset(cmp, 0x80, sizeof(m128));
simd = set16x8(0x80);
simd = set1_16x8(0x80);
r = movq(simd);
ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r)));
#if defined(HAVE_SIMD_128_BITS)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
int64x2_t a = { 0x123456789abcdefLL, ~0LL };
simd = vreinterpretq_s64_s8(a);
#endif
#endif
r = movq(simd);
ASSERT_EQ(r, 0x123456789abcdef);
}
TEST(SimdUtilsTest, set16x8) {
TEST(SimdUtilsTest, set1_16x8) {
char cmp[sizeof(m128)];
for (unsigned i = 0; i < 256; i++) {
m128 simd = set16x8(i);
m128 simd = set1_16x8(i);
memset(cmp, i, sizeof(simd));
ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
}
}
TEST(SimdUtilsTest, set4x32) {
TEST(SimdUtilsTest, set1_4x32) {
u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 };
m128 simd = set4x32(cmp[0]);
m128 simd = set1_4x32(cmp[0]);
ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
}
#if defined(HAVE_AVX2)
#if defined(HAVE_SIMD_256_BITS)
TEST(SimdUtilsTest, set32x8) {
char cmp[sizeof(m256)];

View File

@ -98,8 +98,8 @@ TEST(state_compress, m128_1) {
char buf[sizeof(m128)] = { 0 };
for (u32 i = 0; i < 16; i++) {
char mask_raw[16] = { 0 };
char val_raw[16] = { 0 };
char ALIGN_ATTR(16) mask_raw[16] = { 0 };
char ALIGN_ATTR(16) val_raw[16] = { 0 };
memset(val_raw, (i << 4) + 3, 16);
@ -109,17 +109,32 @@ TEST(state_compress, m128_1) {
mask_raw[15 - i] = 0xff;
val_raw[15 - i] = i;
m128 val;
m128 mask;
memcpy(&val, val_raw, sizeof(val));
memcpy(&mask, mask_raw, sizeof(mask));
m128 val = load128(val_raw);
m128 mask = load128(mask_raw);
storecompressed128(&buf, &val, &mask, 0);
m128 val_out;
loadcompressed128(&val_out, &buf, &mask, 0);
int8_t ALIGN_ATTR(16) data[16];
store128(data, val);
printf("val: ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");
store128(data, mask);
printf("mask: ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");
store128(data, and128(val, mask));
printf("and128(val, mask): ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");
store128(data, val_out);
printf("val_out: ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");
EXPECT_TRUE(!diff128(and128(val, mask), val_out));
mask_raw[i] = 0x0f;