Mirror of https://github.com/VectorCamp/vectorscan.git (synced 2025-06-28 16:41:01 +03:00)
Commit 124455a4a8
@@ -175,7 +175,7 @@ else()
string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
endforeach ()

if (CMAKE_COMPILER_IS_GNUCC)
if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC)
message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
# If gcc doesn't recognise the host cpu, then mtune=native becomes
# generic, which isn't very good in some cases. march=native looks at
@@ -281,10 +281,16 @@ else()
endif()

CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
if (ARCH_IA32 OR ARCH_X86_64)
CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
elseif (ARCH_ARM32 OR ARCH_AARCH64)
CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions")
endif()

CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
@@ -564,11 +570,22 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs")
set (hs_exec_common_SRCS
src/alloc.c
src/scratch.c
src/util/cpuid_flags.c
src/util/cpuid_flags.h
src/util/arch/common/cpuid_flags.h
src/util/multibit.c
)

if (ARCH_IA32 OR ARCH_X86_64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/x86/cpuid_flags.c
)
else (ARCH_ARM32 OR ARCH_AARCH64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/arm/cpuid_flags.c
)
endif ()

set (hs_exec_SRCS
${hs_HEADERS}
src/hs_version.h
@@ -694,7 +711,6 @@ set (hs_exec_SRCS
src/util/exhaust.h
src/util/fatbit.h
src/util/join.h
src/util/masked_move.h
src/util/multibit.h
src/util/multibit.c
src/util/multibit_compress.h
@@ -716,7 +732,8 @@ set (hs_exec_SRCS

set (hs_exec_avx2_SRCS
src/fdr/teddy_avx2.c
src/util/masked_move.c
src/util/arch/x86/masked_move.c
src/util/arch/x86/masked_move.h
)
@@ -6,7 +6,10 @@ if (HAVE_C_X86INTRIN_H)
set (INTRIN_INC_H "x86intrin.h")
elseif (HAVE_C_INTRIN_H)
set (INTRIN_INC_H "intrin.h")
else ()
elseif (HAVE_C_ARM_NEON_H)
set (INTRIN_INC_H "arm_neon.h")
set (FAT_RUNTIME OFF)
else()
message (FATAL_ERROR "No intrinsics header found")
endif ()

@@ -29,15 +32,16 @@ else (NOT FAT_RUNTIME)
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}")
endif ()

# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
if (ARCH_IA32 OR ARCH_X86_64)
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
__m128i a = _mm_set1_epi8(1);
(void)_mm_shuffle_epi8(a, a);
}" HAVE_SSSE3)

# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX2__)
#error no avx2
#endif
@@ -47,8 +51,8 @@ int main(){
(void)_mm256_xor_si256(z, z);
}" HAVE_AVX2)

# and now for AVX512
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# and now for AVX512
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX512BW__)
#error no avx512bw
#endif
@@ -58,8 +62,8 @@ int main(){
(void)_mm512_abs_epi8(z);
}" HAVE_AVX512)

# and now for AVX512VBMI
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# and now for AVX512VBMI
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX512VBMI__)
#error no avx512vbmi
#endif
@@ -70,26 +74,39 @@ int main(){
(void)_mm512_permutexvar_epi8(idx, a);
}" HAVE_AVX512VBMI)

elseif (ARCH_ARM32 OR ARCH_AARCH64)
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
int32x4_t a = vdupq_n_s32(1);
(void)a;
}" HAVE_NEON)
else ()
message (FATAL_ERROR "Unsupported architecture")
endif ()

if (FAT_RUNTIME)
if (NOT HAVE_SSSE3)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
message(FATAL_ERROR "SSSE3 support required to build fat runtime")
endif ()
if (NOT HAVE_AVX2)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
message(FATAL_ERROR "AVX2 support required to build fat runtime")
endif ()
if (BUILD_AVX512 AND NOT HAVE_AVX512)
if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512 AND NOT HAVE_AVX512)
message(FATAL_ERROR "AVX512 support requested but not supported")
endif ()
else (NOT FAT_RUNTIME)
if (NOT HAVE_AVX2)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
message(STATUS "Building without AVX2 support")
endif ()
if (NOT HAVE_AVX512)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512)
message(STATUS "Building without AVX512 support")
endif ()
if (NOT HAVE_SSSE3)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
endif ()
if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
message(FATAL_ERROR "NEON support required for ARM support")
endif ()
endif ()

unset (CMAKE_REQUIRED_FLAGS)
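The checks above compile tiny probe programs: on x86 they key off the compiler's predefined macros (__AVX2__, __AVX512BW__, __AVX512VBMI__), and on ARM they simply try a NEON intrinsic. A minimal standalone sketch of the same idea, handy for checking a toolchain by hand; this is not part of the commit, and the macro set is the commonly documented set of compiler predefines (the __SSSE3__ test is an assumption, since the build probes SSSE3 by calling an intrinsic instead):

/* Sketch only: report which SIMD level the current compiler flags enable. */
#include <stdio.h>

int main(void) {
#if defined(__AVX512VBMI__)
    puts("AVX512VBMI");
#elif defined(__AVX512BW__)
    puts("AVX512BW");
#elif defined(__AVX2__)
    puts("AVX2");
#elif defined(__SSSE3__)
    puts("SSSE3");
#elif defined(__ARM_NEON)
    puts("NEON");
#else
    puts("no SIMD extension detected");
#endif
    return 0;
}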
@@ -15,6 +15,12 @@
/* "Define if building for EM64T" */
#cmakedefine ARCH_X86_64

/* "Define if building for ARM32" */
#cmakedefine ARCH_ARM32

/* "Define if building for AARCH64" */
#cmakedefine ARCH_AARCH64

/* internal build, switch on dump support. */
#cmakedefine DUMP_SUPPORT

@@ -45,6 +51,9 @@
/* C compiler has intrin.h */
#cmakedefine HAVE_C_INTRIN_H

/* C compiler has arm_neon.h */
#cmakedefine HAVE_C_ARM_NEON_H

/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
@@ -1,9 +1,15 @@
# determine the target arch

# really only interested in the preprocessor here
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)

CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)

set(ARCH_X86_64 ${ARCH_64_BIT})
set(ARCH_IA32 ${ARCH_32_BIT})
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)

if (ARCH_X86_64 OR ARCH_AARCH64)
set(ARCH_64_BIT TRUE)
else()
set(ARCH_32_BIT TRUE)
endif()
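These probes classify the target purely from architecture-predefined macros. The same classification, written as a single translation unit for illustration (the macros are exactly the ones used above; the printed ARCH_* names mirror the CMake result variables):

/* Sketch only: the target classification performed by the probes above. */
#include <stdio.h>

int main(void) {
#if defined(__x86_64__) || defined(_M_X64)
    puts("ARCH_X86_64");
#elif defined(__i386__) || defined(_M_IX86)
    puts("ARCH_IA32");
#elif defined(__ARM_ARCH_ISA_A64)
    puts("ARCH_AARCH64");
#elif defined(__ARM_ARCH_ISA_ARM)
    puts("ARCH_ARM32");
#else
    puts("unknown target");
#endif
    return 0;
}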
src/crc32.c | 49
@@ -30,7 +30,6 @@
#include "config.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#if !defined(HAVE_SSE42)

@@ -579,53 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
}

#else // HAVE_SSE42

#ifdef ARCH_64_BIT
#define CRC_WORD 8
#define CRC_TYPE u64a
#define CRC_FUNC _mm_crc32_u64
#else
#define CRC_WORD 4
#define CRC_TYPE u32
#define CRC_FUNC _mm_crc32_u32
#endif

/*
* Use the crc32 instruction from SSE4.2 to compute our checksum - same
* polynomial as the above function.
*/
static really_inline
u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
const size_t length) {
u32 crc = running_crc;

// Process byte-by-byte until p_buf is aligned

const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
size_t init_bytes = aligned_buf - p_buf;
size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
size_t end_bytes = length - init_bytes - running_length;

while (p_buf < aligned_buf) {
crc = _mm_crc32_u8(crc, *p_buf++);
}

// Main aligned loop, processes a word at a time.

for (size_t li = 0; li < running_length/CRC_WORD; li++) {
CRC_TYPE block = *(const CRC_TYPE *)p_buf;
crc = CRC_FUNC(crc, block);
p_buf += CRC_WORD;
}

// Remaining bytes

for(size_t li = 0; li < end_bytes; li++) {
crc = _mm_crc32_u8(crc, *p_buf++);
}

return crc;
}
#include "util/arch/x86/crc32.h"
#endif

#ifdef VERIFY_ASSERTION
@@ -51,6 +51,7 @@ extern "C"
// CPU type is the low 6 bits (we can't need more than 64, surely!)

#define HS_PLATFORM_INTEL 1
#define HS_PLATFORM_ARM 2
#define HS_PLATFORM_CPU_MASK 0x3F

#define HS_PLATFORM_NOAVX2 (4<<13)
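For reference, the platform word packs a CPU-type tag in its low six bits (masked by HS_PLATFORM_CPU_MASK) alongside feature bits such as HS_PLATFORM_NOAVX2. A hedged sketch of how such a word could be decoded; the helper names and the plat parameter are illustrative, not part of the library API:

/* Illustrative only: decode a platform word using the macros above. */
#define HS_PLATFORM_INTEL    1
#define HS_PLATFORM_ARM      2
#define HS_PLATFORM_CPU_MASK 0x3F
#define HS_PLATFORM_NOAVX2   (4 << 13)

static int is_arm_platform(unsigned long long plat) {
    return (plat & HS_PLATFORM_CPU_MASK) == HS_PLATFORM_ARM;
}

static int lacks_avx2(unsigned long long plat) {
    return (plat & HS_PLATFORM_NOAVX2) != 0;
}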
@@ -30,7 +30,9 @@
#include "hs_common.h"
#include "hs_runtime.h"
#include "ue2common.h"
#include "util/cpuid_inline.h"
#if defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#endif
#include "util/join.h"

#if defined(DISABLE_AVX512_DISPATCH)
@@ -36,6 +36,7 @@
#include "teddy.h"
#include "teddy_internal.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"

@@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};

/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn(const u32 a, const u8 *b) {
u64a r;
#if defined(HAVE_BMI) && !defined(NO_ASM)
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
#else
r = unaligned_load_u32(b) & ~a;
#endif
return r;
}

/* generates an initial state mask based on the last byte-ish of history rather
* than being all accepting. If there is no history to consider, the state is
* generated based on the minimum length of each bucket in order to prevent
@@ -739,6 +726,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
assert(ISALIGNED_CL(confBase));
struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16);
memset(zones, 0, sizeof(zones));

size_t numZone = prepareZones(a->buf, a->len,
a->buf_history + a->len_history,
@@ -311,26 +311,26 @@ const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);

#define PREPARE_MASKS_1 \
dup_mask[0] = set4x128(maskBase[0]); \
dup_mask[1] = set4x128(maskBase[1]);
dup_mask[0] = set1_4x128(maskBase[0]); \
dup_mask[1] = set1_4x128(maskBase[1]);

#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
dup_mask[2] = set4x128(maskBase[2]); \
dup_mask[3] = set4x128(maskBase[3]);
dup_mask[2] = set1_4x128(maskBase[2]); \
dup_mask[3] = set1_4x128(maskBase[3]);

#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
dup_mask[4] = set4x128(maskBase[4]); \
dup_mask[5] = set4x128(maskBase[5]);
dup_mask[4] = set1_4x128(maskBase[4]); \
dup_mask[5] = set1_4x128(maskBase[5]);

#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
dup_mask[6] = set4x128(maskBase[6]); \
dup_mask[7] = set4x128(maskBase[7]);
dup_mask[6] = set1_4x128(maskBase[6]); \
dup_mask[7] = set1_4x128(maskBase[7]);

#define PREPARE_MASKS(n) \
m512 lo_mask = set64x8(0xf); \
m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \
m512 sl_msk[n - 1]; \
PREPARE_MASKS_##n \
@@ -570,26 +570,26 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
&c_0, &c_16, &c_32, &c_48)

#define PREPARE_MASKS_1 \
dup_mask[0] = set4x128(maskBase[0]); \
dup_mask[1] = set4x128(maskBase[1]);
dup_mask[0] = set1_4x128(maskBase[0]); \
dup_mask[1] = set1_4x128(maskBase[1]);

#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
dup_mask[2] = set4x128(maskBase[2]); \
dup_mask[3] = set4x128(maskBase[3]);
dup_mask[2] = set1_4x128(maskBase[2]); \
dup_mask[3] = set1_4x128(maskBase[3]);

#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
dup_mask[4] = set4x128(maskBase[4]); \
dup_mask[5] = set4x128(maskBase[5]);
dup_mask[4] = set1_4x128(maskBase[4]); \
dup_mask[5] = set1_4x128(maskBase[5]);

#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
dup_mask[6] = set4x128(maskBase[6]); \
dup_mask[7] = set4x128(maskBase[7]);
dup_mask[6] = set1_4x128(maskBase[6]); \
dup_mask[7] = set1_4x128(maskBase[7]);

#define PREPARE_MASKS(n) \
m512 lo_mask = set64x8(0xf); \
m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \
PREPARE_MASKS_##n

@@ -713,7 +713,7 @@ do { \
#define PREP_SHUF_MASK \
PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \
*c_128 = *(ptr + 15); \
m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
*c_0 = *(ptr + 31)

#define SHIFT_OR_M1 \
@@ -805,26 +805,26 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)

#define PREPARE_MASKS_1 \
dup_mask[0] = set2x128(maskBase[0]); \
dup_mask[1] = set2x128(maskBase[1]);
dup_mask[0] = set1_2x128(maskBase[0]); \
dup_mask[1] = set1_2x128(maskBase[1]);

#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
dup_mask[2] = set2x128(maskBase[2]); \
dup_mask[3] = set2x128(maskBase[3]);
dup_mask[2] = set1_2x128(maskBase[2]); \
dup_mask[3] = set1_2x128(maskBase[3]);

#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
dup_mask[4] = set2x128(maskBase[4]); \
dup_mask[5] = set2x128(maskBase[5]);
dup_mask[4] = set1_2x128(maskBase[4]); \
dup_mask[5] = set1_2x128(maskBase[5]);

#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
dup_mask[6] = set2x128(maskBase[6]); \
dup_mask[7] = set2x128(maskBase[7]);
dup_mask[6] = set1_2x128(maskBase[6]); \
dup_mask[7] = set1_2x128(maskBase[7]);

#define PREPARE_MASKS(n) \
m256 lo_mask = set32x8(0xf); \
m256 lo_mask = set1_32x8(0xf); \
m256 dup_mask[n * 2]; \
PREPARE_MASKS_##n

@@ -901,8 +901,10 @@ do { \
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
if (unlikely(diff128(var, ones128()))) { \
u64a lo = movq(var); \
u64a hi = movq(rshiftbyte_m128(var, 8)); \
u64a __attribute__((aligned(16))) vector[2]; \
store128(vector, var); \
u64a lo = vector[0]; \
u64a hi = vector[1]; \
CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \
CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \
} \
@@ -925,7 +927,7 @@ do { \

static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
return or128(pshufb_m128(maskBase[0 * 2], lo),
@@ -934,7 +936,7 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {

static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, val);
@@ -949,7 +951,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
@@ -964,7 +966,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 *old_3, m128 val) {
m128 mask = set16x8(0xf);
m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
@@ -501,15 +501,15 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128);
*p_mask = set1_2x128(p_mask128);
return ret;
}

static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
return or256(pshufb_m256(maskBase[0 * 2], lo),
@@ -518,7 +518,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {

static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
@@ -533,7 +533,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
@@ -548,7 +548,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 *old_3, m256 val) {
m256 mask = set32x8(0xf);
m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
@@ -44,8 +44,11 @@
#include "parser/prefilter.h"
#include "parser/unsupported.h"
#include "util/compile_error.h"
#include "util/cpuid_flags.h"
#include "util/cpuid_inline.h"
#include "util/arch/common/cpuid_flags.h"
#if defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#endif
#include "util/depth.h"
#include "util/popcount.h"
#include "util/target_info.h"
@@ -26,16 +26,23 @@
* POSSIBILITY OF SUCH DAMAGE.
*/

#include "config.h"
#include "hs_common.h"
#include "util/cpuid_flags.h"
#include "util/cpuid_inline.h"
#include "ue2common.h"
#if defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#endif

HS_PUBLIC_API
hs_error_t HS_CDECL hs_valid_platform(void) {
/* Hyperscan requires SSSE3, anything else is a bonus */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
if (check_ssse3()) {
return HS_SUCCESS;
} else {
return HS_ARCH_ERROR;
}
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
return HS_SUCCESS;
#endif
}
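With this change, hs_valid_platform() still probes SSSE3 at runtime on x86, but on ARM builds it returns HS_SUCCESS unconditionally, since NEON support is decided at compile time. A small usage sketch of the public API; hs_valid_platform(), HS_SUCCESS and the installed hs/ include directory are part of the library, while the surrounding program is illustrative:

/* Usage sketch: guard scanner start-up on the runtime platform check. */
#include <stdio.h>
#include <hs/hs.h>

int main(void) {
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "this CPU lacks the required SIMD support\n");
        return 1;
    }
    /* safe to compile databases and scan from here on */
    return 0;
}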
@@ -39,10 +39,13 @@
#include "util/compare.h"
#include "util/intrinsics.h"
#include "util/join.h"
#include "util/masked_move.h"
#include "util/partial_store.h"
#include "util/simd_utils.h"

#if defined(HAVE_AVX2)
#include "util/arch/x86/masked_move.h"
#endif

#include <ctype.h>
#include <stdbool.h>
#include <string.h>
@@ -30,11 +30,11 @@

static really_inline m256 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set32x8(k);
return set1_32x8(k);
}

static really_inline m256 getCaseMask(void) {
return set32x8(0xdf);
return set1_32x8(0xdf);
}

static really_inline

@@ -30,11 +30,11 @@

static really_inline m128 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return set16x8(k);
return set1_16x8(k);
}

static really_inline m128 getCaseMask(void) {
return set16x8(0xdf);
return set1_16x8(0xdf);
}

static really_inline
@@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,

if (len) {
m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime);
m128 cur_char = set1_16x8(cprime);

u32 z = movemask128(eq128(ss_char, cur_char));

@@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table,

if (len) {
m128 ss_char = load128(sherman_state);
m128 cur_char = set16x8(cprime);
m128 cur_char = set1_16x8(cprime);

u32 z = movemask128(eq128(ss_char, cur_char));

@@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,
assert(s_in); /* should not already be dead */
assert(soft_c_end <= hard_c_end);
DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
m128 s = set16x8(s_in - 1);
m128 s = set1_16x8(s_in - 1);
const u8 *c = *c_inout;
const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
if (!do_accel) {
@@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end,

#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
u32 sheng_limit_x4 = sheng_limit * 0x01010101;
m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4);
m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit);
DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit,
m->sheng_accel_limit, sheng_stop_limit);
#endif
@@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
}
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));

m128 cur_state = set16x8(*state);
m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;

while (likely(cur_buf != end)) {

@@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s,
return MO_CONTINUE_MATCHING;
}

m128 cur_state = set16x8(*state);
m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;

while (likely(end - cur_buf >= 4)) {
@@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
}

const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;

size_t min = (size_t)buf % 16;
@@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
}

const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;

assert(buf_end - buf >= 16);
@@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
m128 mask2_lo, m128 mask2_hi,
const u8 *buf, const u8 *buf_end) {
const m128 ones = ones128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);
const u8 *rv;

size_t min = (size_t)buf % 16;
@@ -455,15 +455,15 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end);
}

const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);

if (buf_end - buf <= 32) {
return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}

const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;

size_t min = (size_t)buf % 32;
@@ -579,15 +579,15 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
buf, buf_end);
}

const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);

if (buf_end - buf <= 32) {
return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}

const m256 zeroes = zeroes256();
const m256 wide_mask_lo = set2x128(mask_lo);
const m256 wide_mask_hi = set2x128(mask_hi);
const m256 wide_mask_lo = set1_2x128(mask_lo);
const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;

assert(buf_end - buf >= 32);
@@ -676,7 +676,7 @@ static really_inline
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
const m256 low4bits = set32x8(0xf);
const m256 low4bits = set1_32x8(0xf);
// run shufti over two overlapping 16-byte unaligned reads
const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
@@ -708,11 +708,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
}

const m256 ones = ones256();
const m256 low4bits = set32x8(0xf);
const m256 wide_mask1_lo = set2x128(mask1_lo);
const m256 wide_mask1_hi = set2x128(mask1_hi);
const m256 wide_mask2_lo = set2x128(mask2_lo);
const m256 wide_mask2_hi = set2x128(mask2_hi);
const m256 low4bits = set1_32x8(0xf);
const m256 wide_mask1_lo = set1_2x128(mask1_lo);
const m256 wide_mask1_hi = set1_2x128(mask1_hi);
const m256 wide_mask2_lo = set1_2x128(mask2_lo);
const m256 wide_mask2_hi = set1_2x128(mask2_hi);
const u8 *rv;

size_t min = (size_t)buf % 32;
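Throughout these hunks, raw x86 intrinsics such as _mm_set1_epi8 and the old setNxM splat helpers are replaced with architecture-neutral set1_NxM wrappers (lane count times lane width). The exact definitions live in the per-architecture simd_utils headers, which are only partially shown here, so the following is a hedged sketch of how such a wrapper might look on each target; the NEON variant assumes m128 is int32x4_t, as declared later in this commit:

/* Sketch only: plausible per-architecture definitions of set1_16x8(). */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include <emmintrin.h>
typedef __m128i m128;
static inline m128 set1_16x8(unsigned char c) {
    return _mm_set1_epi8((char)c);               /* broadcast byte into 16 lanes */
}
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include <arm_neon.h>
typedef int32x4_t m128;
static inline m128 set1_16x8(unsigned char c) {
    return vreinterpretq_s32_s8(vdupq_n_s8((signed char)c));
}
#endif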
@@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {

m128 highconst = _mm_set1_epi8(0x80);
m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
m128 highconst = set1_16x8(0x80);
m128 shuf_mask_hi = set1_2x64(0x8040201008040201);

// and now do the real work
m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
@@ -260,8 +260,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {

m256 highconst = _mm256_set1_epi8(0x80);
m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
m256 highconst = set1_32x8(0x80);
m256 shuf_mask_hi = set1_4x64(0x8040201008040201);

// and now do the real work
m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
@@ -315,8 +315,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("len %zu\n", buf_end - buf);
const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset);
const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set1_2x128(shuf_mask_lo_highset);

assert(buf && buf_end);
assert(buf < buf_end);
@@ -382,8 +382,8 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear,
const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
const m256 wide_set = set2x128(shuf_mask_lo_highset);
const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end);
assert(buf < buf_end);
const u8 *rv;
@@ -36,7 +36,7 @@

#define VERM_BOUNDARY 16
#define VERM_TYPE m128
#define VERM_SET_FN set16x8
#define VERM_SET_FN set1_16x8

static really_inline
const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
@@ -74,7 +74,7 @@ static really_inline
const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);

for (; buf + 31 < buf_end; buf += 32) {
m128 data = load128(buf);
@@ -122,7 +122,7 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found
static really_inline
const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
@@ -157,7 +157,7 @@ static really_inline
const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);

for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
@@ -219,7 +219,7 @@ const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline
const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
@@ -277,7 +277,7 @@ static really_inline
const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);

for (; buf + 15 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
@@ -309,7 +309,7 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
// returns NULL if not found
static really_inline
const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
@@ -344,7 +344,7 @@ static really_inline
const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf_end % 16 == 0);
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);

for (; buf + 16 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
@@ -381,7 +381,7 @@ const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
static really_inline
const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
m128 casemask = set16x8(CASE_CLEAR);
m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
@@ -398,7 +398,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {

#define VERM_BOUNDARY 64
#define VERM_TYPE m512
#define VERM_SET_FN set64x8
#define VERM_SET_FN set1_64x8

static really_inline
const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {
@@ -47,7 +47,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end,

u32 count = *count_inout;

m128 chars = set16x8(c);
m128 chars = set1_16x8(c);

for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16);
@@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
u32 count = *count_inout;

const m128 zeroes = zeroes128();
const m128 low4bits = _mm_set1_epi8(0xf);
const m128 low4bits = set1_16x8(0xf);

for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16);
@@ -938,7 +938,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask,
return 1;
}

m256 data_m256 = set2x128(data);
m256 data_m256 = set1_2x128(data);
m256 hi_mask_m256 = loadu256(hi_mask);
m256 lo_mask_m256 = loadu256(lo_mask);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
@@ -974,8 +974,8 @@ int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask,

m128 hi_mask_m128 = loadu128(hi_mask);
m128 lo_mask_m128 = loadu128(lo_mask);
m256 hi_mask_m256 = set2x128(hi_mask_m128);
m256 lo_mask_m256 = set2x128(lo_mask_m128);
m256 hi_mask_m256 = set1_2x128(hi_mask_m128);
m256 lo_mask_m256 = set1_2x128(lo_mask_m128);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256,
bucket_select_mask_m256,
@@ -1287,7 +1287,7 @@ int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x2(valid_hi, valid_lo);
expand_valid = set2x64(valid_hi, valid_lo);
valid_path_mask = ~movemask128(pshufb_m128(expand_valid,
data_select_mask));
}
@@ -1332,7 +1332,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,

u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_double = set2x128(data_m128);
m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask);

u32 valid_path_mask = 0;
@@ -1346,7 +1346,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask));
@@ -1393,7 +1393,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,

u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_double = set2x128(data_m128);
m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask);

u32 valid_path_mask = 0;
@@ -1407,7 +1407,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask));
@@ -1460,7 +1460,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,

u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
m256 data_m256 = set2x128(data_m128);
m256 data_m256 = set1_2x128(data_m128);
m256 data_select_mask_1 = loadu256(ri->data_select_mask);
m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32);

@@ -1475,7 +1475,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
u32 valid_path_1 = movemask256(pshufb_m256(expand_valid,
data_select_mask_1));
@@ -47,7 +47,7 @@ static really_inline
int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4));
@@ -78,7 +78,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
const m128 and_mask, const u32 neg_mask,
const u32 valid_data_mask) {
m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 nresult = eq128(and128(t, and_mask), zeroes128());
@@ -101,7 +101,7 @@ static really_inline
int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4));
@@ -133,7 +133,7 @@ int validateShuftiMask32x16(const m256 data,
const m256 bucket_mask_hi,
const m256 bucket_mask_lo, const u32 neg_mask,
const u32 valid_data_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@@ -201,7 +201,7 @@ int validateMultipathShuftiMask16x8(const m128 data,
const u32 neg_mask,
const u32 valid_path_mask) {
m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 result = and128(t, bucket_select_mask);
@@ -220,7 +220,7 @@ int validateMultipathShuftiMask32x8(const m256 data,
const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask,
const u32 valid_path_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo = pshufb_m256(lo_mask, data_lo);
@@ -244,7 +244,7 @@ int validateMultipathShuftiMask32x16(const m256 data,
const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask,
const u32 valid_path_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
@@ -271,7 +271,7 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
const u64a hi_bits, const u64a lo_bits,
const u64a neg_mask,
const u64a valid_path_mask) {
m256 low4bits = set32x8(0xf);
m256 low4bits = set1_32x8(0xf);
m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
m256 c_hi_1 = pshufb_m256(hi_mask,
@@ -33,58 +33,13 @@
#ifndef UTIL_ARCH_H_
#define UTIL_ARCH_H_

#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
#define HAVE_SSE2
#include "config.h"

#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/x86.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/arm.h"
#endif

#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE41
#endif
#endif // UTIL_ARCH_X86_H_

#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE42
#endif

#if defined(__AVX__)
#define HAVE_AVX
#endif

#if defined(__AVX2__)
#define HAVE_AVX2
#endif

#if defined(__AVX512BW__)
#define HAVE_AVX512
#endif

#if defined(__AVX512VBMI__)
#define HAVE_AVX512VBMI
#endif

/*
* ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros
*/
#if defined(__POPCNT__) || \
(defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \
(defined(_WIN32) && defined(__AVX__))
#define HAVE_POPCOUNT_INSTR
#endif

#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \
(defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI
#endif

#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
(defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI2
#endif

/*
* MSVC uses a different form of inline asm
*/
#if defined(_WIN32) && defined(_MSC_VER)
#define NO_ASM
#endif

#endif // UTIL_ARCH_H_
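After this hunk, util/arch.h is reduced to a thin dispatcher: it includes config.h and then pulls in a per-architecture header. The x86 feature tests shown as removed presumably move into util/arch/x86/x86.h, which is not included in this excerpt, so the following consolidated view of the new header is a sketch under that assumption, using only macros visible in this diff:

/* Sketch: the shape of util/arch.h after this change (consolidated). */
#ifndef UTIL_ARCH_H_
#define UTIL_ARCH_H_

#include "config.h"  /* provides ARCH_IA32 / ARCH_X86_64 / ARCH_ARM32 / ARCH_AARCH64 */

#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/x86.h"   /* assumed to carry HAVE_SSE2 ... HAVE_BMI2, NO_ASM */
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/arm.h"   /* defines HAVE_NEON, HAVE_SIMD_128_BITS */
#endif

#endif /* UTIL_ARCH_H_ */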
src/util/arch/arm/arm.h | 42 (new file)
@@ -0,0 +1,42 @@
/*
* Copyright (c) 2017-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

/** \file
* \brief Per-platform architecture definitions
*/

#ifndef UTIL_ARCH_ARM_H_
#define UTIL_ARCH_ARM_H_

#if defined(__ARM_NEON) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
#define HAVE_NEON
#define HAVE_SIMD_128_BITS
#endif

#endif // UTIL_ARCH_ARM_H_
src/util/arch/arm/bitutils.h | 199 (new file)
@@ -0,0 +1,199 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* [BSD 3-clause license text, identical to src/util/arch/arm/arm.h above]
*/

/** \file
* \brief Bit-twiddling primitives (ctz, compress etc)
*/

#ifndef BITUTILS_ARCH_ARM_H
#define BITUTILS_ARCH_ARM_H

#include "ue2common.h"
#include "util/popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#include "util/arch/common/bitutils.h"

static really_inline
u32 clz32_impl(u32 x) {
return clz32_impl_c(x);
}

static really_inline
u32 clz64_impl(u64a x) {
return clz64_impl_c(x);
}

static really_inline
u32 ctz32_impl(u32 x) {
return ctz32_impl_c(x);
}

static really_inline
u32 ctz64_impl(u64a x) {
return ctz64_impl_c(x);
}

static really_inline
u32 lg2_impl(u32 x) {
return lg2_impl_c(x);
}

static really_inline
u64a lg2_64_impl(u64a x) {
return lg2_64_impl_c(x);
}

static really_inline
u32 findAndClearLSB_32_impl(u32 *v) {
return findAndClearLSB_32_impl_c(v);
}

static really_inline
u32 findAndClearLSB_64_impl(u64a *v) {
return findAndClearLSB_64_impl_c(v);
}

static really_inline
u32 findAndClearMSB_32_impl(u32 *v) {
u32 val = *v;
u32 offset = 31 - clz32_impl(val);
*v = val & ~(1 << offset);
assert(offset < 32);
return offset;
}

static really_inline
u32 findAndClearMSB_64_impl(u64a *v) {
return findAndClearMSB_64_impl_c(v);
}

static really_inline
u32 compress32_impl(u32 x, u32 m) {
return compress32_impl_c(x, m);
}

static really_inline
u64a compress64_impl(u64a x, u64a m) {
return compress64_impl_c(x, m);
}

static really_inline
m128 compress128_impl(m128 x, m128 m) {

m128 one = set1_2x64(1);
m128 bitset = one;
m128 vres = zeroes128();
while (isnonzero128(m)) {
m128 mm = sub_2x64(zeroes128(), m);
m128 tv = and128(x, m);
tv = and128(tv, mm);

m128 mask = not128(eq64_m128(tv, zeroes128()));
mask = vandq_s64(bitset, mask);
vres = or128(vres, mask);
m = and128(m, sub_2x64(m, set1_2x64(1)));
bitset = lshift64_m128(bitset, 1);
}
return vres;
}

static really_inline
u32 expand32_impl(u32 x, u32 m) {
return expand32_impl_c(x, m);
}

static really_inline
u64a expand64_impl(u64a x, u64a m) {
return expand64_impl_c(x, m);
}

/* returns the first set bit after begin (if not ~0U). If no bit is set after
* begin returns ~0U
*/
static really_inline
u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
if (begin != ~0U) {
/* switch off all bits at or below begin. Note: not legal to shift by
* by size of the datatype or larger. */
assert(begin <= 63);
bitfield &= ~((2ULL << begin) - 1);
}

if (!bitfield) {
return ~0U;
}

return ctz64_impl(bitfield);
}

static really_inline
char bf64_set_impl(u64a *bitfield, u32 i) {
return bf64_set_impl_c(bitfield, i);
}

static really_inline
void bf64_unset_impl(u64a *bitfield, u32 i) {
return bf64_unset_impl_c(bitfield, i);
}

static really_inline
u32 rank_in_mask32_impl(u32 mask, u32 bit) {
return rank_in_mask32_impl_c(mask, bit);
}

static really_inline
u32 rank_in_mask64_impl(u64a mask, u32 bit) {
return rank_in_mask64_impl_c(mask, bit);
}

static really_inline
u32 pext32_impl(u32 x, u32 mask) {
return pext32_impl_c(x, mask);
}

static really_inline
u64a pext64_impl(u64a x, u64a mask) {
return pext64_impl_c(x, mask);
}

static really_inline
u64a pdep64(u64a x, u64a mask) {
return pdep64_impl_c(x, mask);
}

/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn_impl(const u32 a, const u8 *b) {
return andn_impl_c(a, b);
}

#endif // BITUTILS_ARCH_ARM_H
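The ARM bitutils header mostly delegates to the generic C fallbacks (the *_impl_c functions); only compress128_impl gets a vectorised loop over the two 64-bit lanes. For readers unfamiliar with the operation: bit compression gathers the bits of x selected by mask m into the low-order bits of the result (what x86's PEXT does in hardware). A minimal scalar sketch of that semantics, written here for illustration rather than taken from the Hyperscan sources; it walks the set bits of the mask exactly the way the vector loop above does:

#include <stdint.h>

/* Illustrative scalar bit-compress: collect the bits of x selected by m,
 * packed contiguously from bit 0 of the result. */
static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0;
    uint64_t bit = 1;                 /* next output bit position */
    for (; m; m &= m - 1) {           /* iterate over the set bits of the mask */
        if (x & m & -m) {             /* m & -m isolates the lowest set bit */
            res |= bit;
        }
        bit <<= 1;
    }
    return res;
}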
src/util/arch/arm/cpuid_flags.c | 40 (new file)
@@ -0,0 +1,40 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* [BSD 3-clause license text, identical to src/util/arch/arm/arm.h above]
*/

#include "util/arch/common/cpuid_flags.h"
#include "ue2common.h"
#include "hs_compile.h" // for HS_MODE_ flags
#include "util/arch.h"

u64a cpuid_flags(void) {
return 0;
}

u32 cpuid_tune(void) {
return HS_TUNE_FAMILY_GENERIC;
}
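On ARM the CPU-feature and tuning queries are stubs: no runtime feature bits are reported and the tune family is always generic, because NEON availability is settled at compile time (see arm.h above). If runtime detection were ever wanted on Linux, it would typically go through the hardware-capability auxiliary vector rather than anything CPUID-like; the sketch below is purely illustrative and is not what this commit does:

/* Illustrative only (Linux-specific); the commit itself returns 0 here. */
#include <sys/auxv.h>

#if defined(__aarch64__)
/* Advanced SIMD is mandatory on AArch64, so there is nothing to probe. */
static int have_neon(void) { return 1; }
#elif defined(__arm__)
#include <asm/hwcap.h>
static int have_neon(void) {
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
}
#endif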
src/util/arch/arm/simd_types.h | 37 (new file)
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* [BSD 3-clause license text, identical to src/util/arch/arm/arm.h above]
*/

#ifndef SIMD_TYPES_ARM_H
#define SIMD_TYPES_ARM_H

#if !defined(m128) && defined(HAVE_NEON)
typedef int32x4_t m128;
#endif

#endif /* SIMD_TYPES_ARM_H */
394  src/util/arch/arm/simd_utils.h  Normal file
@ -0,0 +1,394 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief SIMD types and primitive operations.
|
||||
*/
|
||||
|
||||
#ifndef ARCH_ARM_SIMD_UTILS_H
|
||||
#define ARCH_ARM_SIMD_UTILS_H
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/simd_types.h"
|
||||
#include "util/unaligned.h"
|
||||
#include "util/intrinsics.h"
|
||||
|
||||
#include <string.h> // for memcpy
|
||||
|
||||
static really_inline m128 ones128(void) {
|
||||
return (m128) vdupq_n_s8(0xFF);
|
||||
}
|
||||
|
||||
static really_inline m128 zeroes128(void) {
|
||||
return (m128) vdupq_n_s32(0);
|
||||
}
|
||||
|
||||
/** \brief Bitwise not for m128*/
|
||||
static really_inline m128 not128(m128 a) {
|
||||
return (m128) vmvnq_s32(a);
|
||||
}
|
||||
|
||||
/** \brief Return 1 if a and b are different otherwise 0 */
|
||||
static really_inline int diff128(m128 a, m128 b) {
|
||||
int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b));
|
||||
return (-16 != res);
|
||||
}
|
||||
|
||||
static really_inline int isnonzero128(m128 a) {
|
||||
return !!diff128(a, zeroes128());
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich128(m128 a, m128 b) {
|
||||
static const uint32x4_t movemask = { 1, 2, 4, 8 };
|
||||
return vaddvq_u32(vandq_u32(vmvnq_s32(vceqq_s32((int32x4_t)a, (int32x4_t)b)), movemask));
|
||||
}
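/* Editor's sketch (not in the original commit): the movemask constant maps
 * lane i to bit i, so if, say, only the third 32-bit lane of a and b differs,
 * diffrich128(a, b) returns 0b0100 (0x4). */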
|
||||
|
||||
/**
|
||||
* "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
|
||||
* returns a 4-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich64_128(m128 a, m128 b) {
|
||||
static const uint64x2_t movemask = { 1, 4 };
|
||||
return vaddvq_u64(vandq_u64(vmvnq_s32(vceqq_s64((int64x2_t)a, (int64x2_t)b)), movemask));
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 add_2x64(m128 a, m128 b) {
|
||||
return (m128) vaddq_u64((int64x2_t)a, (int64x2_t)b);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 sub_2x64(m128 a, m128 b) {
|
||||
return (m128) vsubq_u64((int64x2_t)a, (int64x2_t)b);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 lshift_m128(m128 a, unsigned b) {
|
||||
return (m128) vshlq_n_s32((int32x4_t)a, b);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 rshift_m128(m128 a, unsigned b) {
|
||||
return (m128) vshrq_n_u32((uint32x4_t)a, b);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 lshift64_m128(m128 a, unsigned b) {
|
||||
return (m128) vshlq_n_s64((int64x2_t)a, b);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 rshift64_m128(m128 a, unsigned b) {
|
||||
return (m128) vshrq_n_u64((uint64x2_t)a, b);
|
||||
}
|
||||
|
||||
static really_inline m128 eq128(m128 a, m128 b) {
|
||||
return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline m128 eq64_m128(m128 a, m128 b) {
|
||||
return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b);
|
||||
}
|
||||
|
||||
static really_inline u32 movemask128(m128 a) {
|
||||
static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
|
||||
|
||||
// Compute the mask from the input
|
||||
uint64x2_t mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
|
||||
|
||||
// Get the resulting bytes
|
||||
uint16_t output;
|
||||
vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0);
|
||||
vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8);
|
||||
return output;
|
||||
}
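/* Editor's sketch (not in the original commit): assuming, as is typical for
 * movemask inputs, that every byte of a is either 0x00 or 0xFF (e.g. a compare
 * result), the AND with `powers` keeps a per-byte power of two for each 0xFF
 * byte, and the three widening pairwise adds collapse each 8-byte half into
 * one byte of the result. Example: if only byte 0 and byte 9 are 0xFF, the low
 * half sums to 0x01 and the high half to 0x02, so movemask128() returns
 * 0x0201, matching SSE2 _mm_movemask_epi8. */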
|
||||
|
||||
static really_inline m128 set1_16x8(u8 c) {
|
||||
return (m128) vdupq_n_u8(c);
|
||||
}
|
||||
|
||||
static really_inline m128 set1_4x32(u32 c) {
|
||||
return (m128) vdupq_n_u32(c);
|
||||
}
|
||||
|
||||
static really_inline m128 set1_2x64(u64a c) {
|
||||
return (m128) vdupq_n_u64(c);
|
||||
}
|
||||
|
||||
static really_inline u32 movd(const m128 in) {
|
||||
return vgetq_lane_u32((uint32x4_t) in, 0);
|
||||
}
|
||||
|
||||
static really_inline u64a movq(const m128 in) {
|
||||
return vgetq_lane_u64((uint64x2_t) in, 0);
|
||||
}
|
||||
|
||||
/* another form of movq */
|
||||
static really_inline
|
||||
m128 load_m128_from_u64a(const u64a *p) {
|
||||
return (m128) vsetq_lane_u64(*p, zeroes128(), 0);
|
||||
}
|
||||
|
||||
static really_inline u32 extract32from128(const m128 in, unsigned imm) {
|
||||
#if defined(HS_OPTIMIZE)
|
||||
return vgetq_lane_u32((uint32x4_t) in, imm);
|
||||
#else
|
||||
switch (imm) {
|
||||
case 0:
|
||||
return vgetq_lane_u32((uint32x4_t) in, 0);
|
||||
break;
|
||||
case 1:
|
||||
return vgetq_lane_u32((uint32x4_t) in, 1);
|
||||
break;
|
||||
case 2:
|
||||
return vgetq_lane_u32((uint32x4_t) in, 2);
|
||||
break;
|
||||
case 3:
|
||||
return vgetq_lane_u32((uint32x4_t) in, 3);
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline u64a extract64from128(const m128 in, unsigned imm) {
|
||||
#if defined(HS_OPTIMIZE)
|
||||
return vgetq_lane_u64((uint64x2_t) in, imm);
|
||||
#else
|
||||
switch (imm) {
|
||||
case 0:
|
||||
return vgetq_lane_u64((uint64x2_t) in, 0);
|
||||
break;
|
||||
case 1:
|
||||
return vgetq_lane_u64((uint64x2_t) in, 1);
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline m128 and128(m128 a, m128 b) {
|
||||
return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline m128 xor128(m128 a, m128 b) {
|
||||
return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline m128 or128(m128 a, m128 b) {
|
||||
return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline m128 andnot128(m128 a, m128 b) {
|
||||
return (m128) vandq_s8(vmvnq_s8((int8x16_t)a), (int8x16_t)b);
|
||||
}
|
||||
|
||||
// aligned load
|
||||
static really_inline m128 load128(const void *ptr) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m128)));
|
||||
ptr = assume_aligned(ptr, 16);
|
||||
return (m128) vld1q_s32((const int32_t *)ptr);
|
||||
}
|
||||
|
||||
// aligned store
|
||||
static really_inline void store128(void *ptr, m128 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m128)));
|
||||
ptr = assume_aligned(ptr, 16);
|
||||
vst1q_s32((int32_t *)ptr, a);
|
||||
}
|
||||
|
||||
// unaligned load
|
||||
static really_inline m128 loadu128(const void *ptr) {
|
||||
return (m128) vld1q_s32((const int32_t *)ptr);
|
||||
}
|
||||
|
||||
// unaligned store
|
||||
static really_inline void storeu128(void *ptr, m128 a) {
|
||||
vst1q_s32((int32_t *)ptr, a);
|
||||
}
|
||||
|
||||
// packed unaligned store of first N bytes
|
||||
static really_inline
|
||||
void storebytes128(void *ptr, m128 a, unsigned int n) {
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(ptr, &a, n);
|
||||
}
|
||||
|
||||
// packed unaligned load of first N bytes, pad with zero
|
||||
static really_inline
|
||||
m128 loadbytes128(const void *ptr, unsigned int n) {
|
||||
m128 a = zeroes128();
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(&a, ptr, n);
|
||||
return a;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
||||
assert(amount >= -16 && amount <= 16);
|
||||
m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
|
||||
return vqtbl1q_s8(in, shift_mask);
|
||||
}
|
||||
|
||||
#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break;
|
||||
|
||||
static really_inline
|
||||
m128 palignr(m128 r, m128 l, int offset) {
|
||||
#if defined(HS_OPTIMIZE)
|
||||
return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
|
||||
#else
|
||||
switch (offset) {
|
||||
CASE_ALIGN_VECTORS(l, r, 0);
|
||||
CASE_ALIGN_VECTORS(l, r, 1);
|
||||
CASE_ALIGN_VECTORS(l, r, 2);
|
||||
CASE_ALIGN_VECTORS(l, r, 3);
|
||||
CASE_ALIGN_VECTORS(l, r, 4);
|
||||
CASE_ALIGN_VECTORS(l, r, 5);
|
||||
CASE_ALIGN_VECTORS(l, r, 6);
|
||||
CASE_ALIGN_VECTORS(l, r, 7);
|
||||
CASE_ALIGN_VECTORS(l, r, 8);
|
||||
CASE_ALIGN_VECTORS(l, r, 9);
|
||||
CASE_ALIGN_VECTORS(l, r, 10);
|
||||
CASE_ALIGN_VECTORS(l, r, 11);
|
||||
CASE_ALIGN_VECTORS(l, r, 12);
|
||||
CASE_ALIGN_VECTORS(l, r, 13);
|
||||
CASE_ALIGN_VECTORS(l, r, 14);
|
||||
CASE_ALIGN_VECTORS(l, r, 15);
|
||||
default:
|
||||
return zeroes128();
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#undef CASE_ALIGN_VECTORS
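/* Editor's note (illustrative, not part of the original commit): palignr(r, l,
 * offset) mirrors SSSE3 _mm_alignr_epi8: it views l:r as a 32-byte value (l in
 * the low 16 bytes) and returns the 16 bytes starting at byte `offset`. For
 * example, offset 1 drops the lowest byte of l and pulls the lowest byte of r
 * into the top position. */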
|
||||
|
||||
static really_really_inline
|
||||
m128 rshiftbyte_m128(m128 a, unsigned b) {
|
||||
if (b)
|
||||
return palignr(zeroes128(), a, b);
|
||||
else
|
||||
return a;
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 lshiftbyte_m128(m128 a, unsigned b) {
|
||||
if (b)
|
||||
return palignr(a, zeroes128(), 16 - b);
|
||||
else
|
||||
return a;
|
||||
}
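/* Editor's note (not in the original commit): these match the SSE2 whole-
 * vector byte shifts, e.g. rshiftbyte_m128(x, 1) behaves like
 * _mm_srli_si128(x, 1): every byte moves down one position and the top byte is
 * zero-filled. Since vextq_s8 requires an immediate, the HS_OPTIMIZE path of
 * palignr() (and therefore b here) needs a compile-time constant. */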
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
extern const u8 simd_onebit_masks[];
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
m128 mask1bit128(unsigned int n) {
|
||||
assert(n < sizeof(m128) * 8);
|
||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
||||
mask_idx -= n / 8;
|
||||
return loadu128(&simd_onebit_masks[mask_idx]);
|
||||
}
|
||||
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit128(m128 *ptr, unsigned int n) {
|
||||
*ptr = or128(mask1bit128(n), *ptr);
|
||||
}
|
||||
|
||||
// switches off bit N in the given vector.
|
||||
static really_inline
|
||||
void clearbit128(m128 *ptr, unsigned int n) {
|
||||
*ptr = andnot128(mask1bit128(n), *ptr);
|
||||
}
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit128(m128 val, unsigned int n) {
|
||||
const m128 mask = mask1bit128(n);
|
||||
|
||||
return isnonzero128(and128(mask, val));
|
||||
}
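/* Editor's sketch (not part of the original commit): mask1bit128(n) loads a
 * vector with only bit n set from simd_onebit_masks, and the helpers above
 * just OR/ANDNOT/AND with it. Typical use:
 *
 *     m128 v = zeroes128();
 *     setbit128(&v, 70);
 *     assert(testbit128(v, 70));
 *     clearbit128(&v, 70);
 *     assert(!testbit128(v, 70));
 */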
|
||||
|
||||
static really_inline
|
||||
m128 pshufb_m128(m128 a, m128 b) {
|
||||
/* On Intel, if bit 0x80 of a control byte in b is set, the result byte is
   zero; otherwise the low four bits select the lane of a.
   On NEON, an index >= 16 yields zero; otherwise that lane is selected.
   btranslated converts the Intel convention to the NEON one. */
|
||||
int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f));
|
||||
return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated);
|
||||
}
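/* Editor's sketch (not part of the original commit): with the 0x8f masking in
 * place, pshufb_m128 can be used exactly like SSSE3 _mm_shuffle_epi8, e.g. to
 * reverse the byte order of a vector:
 *
 *     m128 rev_idx  = set4x32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
 *     m128 reversed = pshufb_m128(x, rev_idx);
 */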
|
||||
|
||||
static really_inline
|
||||
m128 max_u8_m128(m128 a, m128 b) {
|
||||
return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 min_u8_m128(m128 a, m128 b) {
|
||||
return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 sadd_u8_m128(m128 a, m128 b) {
|
||||
return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 sub_u8_m128(m128 a, m128 b) {
|
||||
return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
|
||||
uint32_t __attribute__((aligned(16))) data[4] = { x0, x1, x2, x3 };
|
||||
return (m128) vld1q_u32((uint32_t *) data);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 set2x64(u64a hi, u64a lo) {
|
||||
uint64_t __attribute__((aligned(16))) data[2] = { lo, hi };
|
||||
return (m128) vld1q_u64((uint64_t *) data);
|
||||
}
|
||||
|
||||
#endif // ARCH_ARM_SIMD_UTILS_H
|
418  src/util/arch/common/bitutils.h  Normal file
@ -0,0 +1,418 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Bit-twiddling primitives (ctz, compress etc)
|
||||
*/
|
||||
|
||||
#ifndef BITUTILS_ARCH_COMMON_H
|
||||
#define BITUTILS_ARCH_COMMON_H
|
||||
|
||||
#include "util/popcount.h"
|
||||
#include "util/unaligned.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
static really_inline
|
||||
u32 clz32_impl_c(u32 x) {
|
||||
return (u32)__builtin_clz(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 clz64_impl_c(u64a x) {
|
||||
return (u32)__builtin_clzll(x);
|
||||
}
|
||||
|
||||
// CTZ (count trailing zero) implementations.
|
||||
static really_inline
|
||||
u32 ctz32_impl_c(u32 x) {
|
||||
return (u32)__builtin_ctz(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 ctz64_impl_c(u64a x) {
|
||||
return (u32)__builtin_ctzll(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 lg2_impl_c(u32 x) {
|
||||
if (!x) {
|
||||
return 0;
|
||||
}
|
||||
return 31 - clz32_impl_c(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a lg2_64_impl_c(u64a x) {
|
||||
if (!x) {
|
||||
return 0;
|
||||
}
|
||||
return 63 - clz64_impl_c(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearLSB_32_impl_c(u32 *v) {
|
||||
u32 val = *v;
|
||||
u32 offset = ctz32_impl_c(val);
|
||||
*v = val & (val - 1);
|
||||
|
||||
assert(offset < 32);
|
||||
return offset;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearLSB_64_impl_c(u64a *v) {
|
||||
#ifdef ARCH_64_BIT
|
||||
// generic variant using gcc's builtin on 64-bit
|
||||
u64a val = *v, offset;
|
||||
offset = ctz64_impl_c(val);
|
||||
*v = val & (val - 1);
|
||||
#else
|
||||
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
|
||||
// inline calls to __builtin_ctzll
|
||||
u32 v1 = (u32)*v;
|
||||
u32 v2 = (u32)(*v >> 32);
|
||||
u32 offset;
|
||||
if (v1) {
|
||||
offset = findAndClearLSB_32_impl_c(&v1);
|
||||
*v = (u64a)v1 | ((u64a)v2 << 32);
|
||||
} else {
|
||||
offset = findAndClearLSB_32_impl_c(&v2) + 32;
|
||||
*v = (u64a)v2 << 32;
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(offset < 64);
|
||||
return (u32)offset;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearMSB_32_impl_c(u32 *v) {
|
||||
u32 val = *v;
|
||||
u32 offset = 31 - clz32_impl_c(val);
|
||||
*v = val & ~(1 << offset);
|
||||
|
||||
assert(offset < 32);
|
||||
return offset;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearMSB_64_impl_c(u64a *v) {
|
||||
#ifdef ARCH_64_BIT
|
||||
// generic variant using gcc's builtin on 64-bit
|
||||
u64a val = *v, offset;
|
||||
offset = 63 - clz64_impl_c(val);
|
||||
*v = val & ~(1ULL << offset);
|
||||
#else
|
||||
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
|
||||
// inline calls to __builtin_ctzll
|
||||
u32 v1 = (u32)*v;
|
||||
u32 v2 = (*v >> 32);
|
||||
u32 offset;
|
||||
if (v2) {
|
||||
offset = findAndClearMSB_32_impl_c(&v2) + 32;
|
||||
*v = ((u64a)v2 << 32) | (u64a)v1;
|
||||
} else {
|
||||
offset = findAndClearMSB_32_impl_c(&v1);
|
||||
*v = (u64a)v1;
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(offset < 64);
|
||||
return (u32)offset;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 compress32_impl_c(u32 x, u32 m) {
|
||||
|
||||
// Return zero quickly on trivial cases
|
||||
if ((x & m) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u32 mk, mp, mv, t;
|
||||
|
||||
x &= m; // clear irrelevant bits
|
||||
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
for (u32 i = 0; i < 5; i++) {
|
||||
mp = mk ^ (mk << 1);
|
||||
mp ^= mp << 2;
|
||||
mp ^= mp << 4;
|
||||
mp ^= mp << 8;
|
||||
mp ^= mp << 16;
|
||||
|
||||
mv = mp & m; // bits to move
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
t = x & mv;
|
||||
x = (x ^ t) | (t >> (1 << i)); // compress x
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
return x;
|
||||
}
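/* Editor's note (not part of the original commit): compress32_impl_c is the
 * portable equivalent of BMI2 PEXT: the bits of x selected by m are packed
 * into the low end of the result. For example,
 * compress32_impl_c(0xF0, 0x30) == 0x3 (bits 4 and 5 of x, both set). */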
|
||||
|
||||
static really_inline
|
||||
u64a compress64_impl_c(u64a x, u64a m) {
|
||||
u64a res = 0;
|
||||
for (u64a bb = 1; m != 0; bb += bb) {
|
||||
if (x & m & -m) { res |= bb; }
|
||||
m &= (m - 1);
|
||||
}
|
||||
return res;
|
||||
/* // Return zero quickly on trivial cases
|
||||
if ((x & m) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u64a mk, mp, mv, t;
|
||||
|
||||
x &= m; // clear irrelevant bits
|
||||
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
for (u32 i = 0; i < 6; i++) {
|
||||
mp = mk ^ (mk << 1);
|
||||
mp ^= mp << 2;
|
||||
mp ^= mp << 4;
|
||||
mp ^= mp << 8;
|
||||
mp ^= mp << 16;
|
||||
mp ^= mp << 32;
|
||||
|
||||
mv = mp & m; // bits to move
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
t = x & mv;
|
||||
x = (x ^ t) | (t >> (1 << i)); // compress x
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
return x;*/
|
||||
}
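/* Editor's note (not part of the original commit): unlike the 32-bit version,
 * the 64-bit fallback above uses a simple bit-at-a-time loop (one iteration
 * per set bit of m); the commented-out block is the parallel-suffix variant
 * kept for reference. */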
|
||||
|
||||
static really_inline
|
||||
m128 compress128_impl_c(m128 xvec, m128 mvec) {
|
||||
u64a ALIGN_ATTR(16) x[2];
|
||||
u64a ALIGN_ATTR(16) m[2];
|
||||
store128(x, xvec);
|
||||
store128(m, mvec);
|
||||
|
||||
x[0] = compress64_impl_c(x[0], m[0]);
x[1] = compress64_impl_c(x[1], m[1]);

return load128(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 expand32_impl_c(u32 x, u32 m) {
|
||||
// Return zero quickly on trivial cases
|
||||
if (!x || !m) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u32 m0, mk, mp, mv, t;
|
||||
u32 array[5];
|
||||
|
||||
m0 = m; // save original mask
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
mp = mk ^ (mk << 1); // parallel suffix
|
||||
mp = mp ^ (mp << 2);
|
||||
mp = mp ^ (mp << 4);
|
||||
mp = mp ^ (mp << 8);
|
||||
mp = mp ^ (mp << 16);
|
||||
mv = mp & m; // bits to move
|
||||
array[i] = mv;
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
for (int i = 4; i >= 0; i--) {
|
||||
mv = array[i];
|
||||
t = x << (1 << i);
|
||||
x = (x & ~mv) | (t & mv);
|
||||
}
|
||||
|
||||
return x & m0; // clear out extraneous bits
|
||||
}
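/* Editor's note (not part of the original commit): expand32_impl_c is the
 * portable equivalent of BMI2 PDEP: the low bits of x are deposited into the
 * set-bit positions of m. For example, expand32_impl_c(0x3, 0x30) == 0x30. */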
|
||||
|
||||
static really_inline
|
||||
u64a expand64_impl_c(u64a x, u64a m) {
|
||||
|
||||
u64a res = 0;
|
||||
for (u64a bb = 1; m != 0; bb += bb) {
|
||||
if (x & bb) { res |= m & (-m); }
|
||||
m &= (m - 1);
|
||||
}
|
||||
return res;
|
||||
/* // Return zero quickly on trivial cases
|
||||
if (!x || !m) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u64a m0, mk, mp, mv, t;
|
||||
u64a array[6];
|
||||
|
||||
m0 = m; // save original mask
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
|
||||
for (int i = 0; i < 6; i++) {
|
||||
mp = mk ^ (mk << 1); // parallel suffix
|
||||
mp = mp ^ (mp << 2);
|
||||
mp = mp ^ (mp << 4);
|
||||
mp = mp ^ (mp << 8);
|
||||
mp = mp ^ (mp << 16);
|
||||
mp = mp ^ (mp << 32);
|
||||
mv = mp & m; // bits to move
|
||||
array[i] = mv;
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
for (int i = 5; i >= 0; i--) {
|
||||
mv = array[i];
|
||||
t = x << (1 << i);
|
||||
x = (x & ~mv) | (t & mv);
|
||||
}
|
||||
|
||||
return x & m0; // clear out extraneous bits*/
|
||||
}
|
||||
|
||||
|
||||
/* returns the first set bit after begin (if not ~0U). If no bit is set after
|
||||
* begin returns ~0U
|
||||
*/
|
||||
static really_inline
|
||||
u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) {
|
||||
if (begin != ~0U) {
|
||||
/* switch off all bits at or below begin. Note: not legal to shift by
|
||||
* by size of the datatype or larger. */
|
||||
assert(begin <= 63);
|
||||
bitfield &= ~((2ULL << begin) - 1);
|
||||
}
|
||||
|
||||
if (!bitfield) {
|
||||
return ~0U;
|
||||
}
|
||||
|
||||
return ctz64_impl_c(bitfield);
|
||||
}
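/* Editor's note (not part of the original commit): example walk over a
 * bitfield with bits 2 and 4 set (0x14):
 *   bf64_iterate_impl_c(0x14, ~0U) == 2
 *   bf64_iterate_impl_c(0x14, 2)   == 4
 *   bf64_iterate_impl_c(0x14, 4)   == ~0U   (no further bits)
 */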
|
||||
|
||||
static really_inline
|
||||
char bf64_set_impl_c(u64a *bitfield, u32 i) {
|
||||
u64a mask = 1ULL << i;
|
||||
char was_set = !!(*bitfield & mask);
|
||||
*bitfield |= mask;
|
||||
|
||||
return was_set;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void bf64_unset_impl_c(u64a *bitfield, u32 i) {
|
||||
*bitfield &= ~(1ULL << i);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 rank_in_mask32_impl_c(u32 mask, u32 bit) {
|
||||
mask &= (u32)(1U << bit) - 1;
|
||||
return popcount32(mask);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 rank_in_mask64_impl_c(u64a mask, u32 bit) {
|
||||
mask &= (u64a)(1ULL << bit) - 1;
|
||||
return popcount64(mask);
|
||||
}
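/* Editor's note (not part of the original commit): rank_in_mask returns how
 * many set bits of mask lie strictly below `bit`, e.g.
 * rank_in_mask32_impl_c(0x2D, 3) == 2 (bits 0 and 2 are set below bit 3). */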
|
||||
|
||||
static really_inline
|
||||
u32 pext32_impl_c(u32 x, u32 mask) {
|
||||
|
||||
u32 result = 0, num = 1;
|
||||
while (mask != 0) {
|
||||
u32 bit = findAndClearLSB_32_impl_c(&mask);
|
||||
if (x & (1U << bit)) {
|
||||
assert(num != 0); // more than 32 bits!
|
||||
result |= num;
|
||||
}
|
||||
num <<= 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a pext64_impl_c(u64a x, u64a mask) {
|
||||
|
||||
u32 result = 0, num = 1;
|
||||
while (mask != 0) {
|
||||
u32 bit = findAndClearLSB_64_impl_c(&mask);
|
||||
if (x & (1ULL << bit)) {
|
||||
assert(num != 0); // more than 32 bits!
|
||||
result |= num;
|
||||
}
|
||||
num <<= 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a pdep64_impl_c(u64a x, u64a _m) {
|
||||
/* Taken from:
|
||||
* https://gcc.gnu.org/legacy-ml/gcc-patches/2017-06/msg01408.html
|
||||
*/
|
||||
|
||||
u64a result = 0x0UL;
|
||||
const u64a mask = 0x8000000000000000UL;
|
||||
u64a m = _m;
|
||||
u64a c, t;
|
||||
u64a p;
|
||||
|
||||
/* The pop-count of the mask gives the number of the bits from
|
||||
source to process. This is also needed to shift bits from the
|
||||
source into the correct position for the result. */
|
||||
p = 64 - __builtin_popcountl (_m);
|
||||
|
||||
/* The loop is for the number of '1' bits in the mask and clearing
|
||||
each mask bit as it is processed. */
|
||||
while (m != 0)
|
||||
{
|
||||
c = __builtin_clzl (m);
|
||||
t = x << (p - c);
|
||||
m ^= (mask >> c);
|
||||
result |= (t & (mask >> c));
|
||||
p++;
|
||||
}
|
||||
return (result);
|
||||
}
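/* Editor's note (not part of the original commit): the intended semantics
 * match BMI2 PDEP, e.g. pdep64_impl_c(0x5, 0x0F00) == 0x0500: bits 0 and 2 of
 * x are deposited into bits 8 and 10, the lowest set bits of the mask. */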
|
||||
|
||||
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
|
||||
* so we force its generation.
|
||||
*/
|
||||
static really_inline
|
||||
u64a andn_impl_c(const u32 a, const u8 *b) {
|
||||
return unaligned_load_u32(b) & ~a;
|
||||
}
|
||||
|
||||
#endif // BITUTILS_ARCH_COMMON_H
|
@ -31,7 +31,7 @@
|
||||
|
||||
#include "ue2common.h"
|
||||
|
||||
#if !defined(_WIN32) && !defined(CPUID_H_)
|
||||
#if (defined(ARCH_IA32) || defined(ARCH_X86_64)) && !defined(_WIN32) && !defined(CPUID_H_)
|
||||
#include <cpuid.h>
|
||||
/* system header doesn't have a header guard */
|
||||
#define CPUID_H_
|
773  src/util/arch/common/simd_utils.h  Normal file
@ -0,0 +1,773 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief SIMD types and primitive operations.
|
||||
*/
|
||||
|
||||
#ifndef ARCH_COMMON_SIMD_UTILS_H
|
||||
#define ARCH_COMMON_SIMD_UTILS_H
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/simd_types.h"
|
||||
#include "util/unaligned.h"
|
||||
#include "util/intrinsics.h"
|
||||
|
||||
#include <string.h> // for memcpy
|
||||
|
||||
#if !defined(HAVE_SIMD_128_BITS)
|
||||
#error "You need at least a 128-bit capable SIMD engine!"
|
||||
#endif // HAVE_SIMD_128_BITS
|
||||
|
||||
#ifdef DEBUG
|
||||
static inline void print_m128_16x8(char *label, m128 vector) {
|
||||
uint8_t __attribute__((aligned(16))) data[16];
|
||||
store128(data, vector);
|
||||
DEBUG_PRINTF("%s: ", label);
|
||||
for(int i=0; i < 16; i++)
|
||||
printf("%02x ", data[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static inline void print_m128_8x16(char *label, m128 vector) {
|
||||
uint16_t __attribute__((aligned(16))) data[8];
|
||||
store128(data, vector);
|
||||
DEBUG_PRINTF("%s: ", label);
|
||||
for(int i=0; i < 8; i++)
|
||||
printf("%04x ", data[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static inline void print_m128_4x32(char *label, m128 vector) {
|
||||
uint32_t __attribute__((aligned(16))) data[4];
|
||||
store128(data, vector);
|
||||
DEBUG_PRINTF("%s: ", label);
|
||||
for(int i=0; i < 4; i++)
|
||||
printf("%08x ", data[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static inline void print_m128_2x64(char *label, m128 vector) {
|
||||
uint64_t __attribute__((aligned(16))) data[2];
|
||||
store128(data, vector);
|
||||
DEBUG_PRINTF("%s: ", label);
|
||||
for(int i=0; i < 2; i++)
|
||||
printf("%016lx ", data[i]);
|
||||
printf("\n");
|
||||
}
|
||||
#else
|
||||
#define print_m128_16x8(label, vector) NULL
|
||||
#define print_m128_8x16(label, vector) NULL
|
||||
#define print_m128_4x32(label, vector) NULL
|
||||
#define print_m128_2x64(label, vector) NULL
|
||||
#endif
|
||||
|
||||
/****
|
||||
**** 256-bit Primitives
|
||||
****/
|
||||
|
||||
#if !defined(HAVE_SIMD_256_BITS)
|
||||
|
||||
static really_really_inline
|
||||
m256 lshift64_m256(m256 a, int b) {
|
||||
m256 rv = a;
|
||||
rv.lo = lshift64_m128(rv.lo, b);
|
||||
rv.hi = lshift64_m128(rv.hi, b);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 rshift64_m256(m256 a, int b) {
|
||||
m256 rv = a;
|
||||
rv.lo = rshift64_m128(rv.lo, b);
|
||||
rv.hi = rshift64_m128(rv.hi, b);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 eq256(m256 a, m256 b) {
|
||||
m256 rv;
|
||||
rv.lo = eq128(a.lo, b.lo);
|
||||
rv.hi = eq128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 movemask256(m256 a) {
|
||||
u32 lo_mask = movemask128(a.lo);
|
||||
u32 hi_mask = movemask128(a.hi);
|
||||
return lo_mask | (hi_mask << 16);
|
||||
}
|
||||
|
||||
static really_inline m256 set1_4x64(u64a c) {
|
||||
m128 a128 = set1_2x64(c);
|
||||
m256 rv = {a128, a128};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 set1_2x128(m128 a) {
|
||||
m256 rv = {a, a};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 zeroes256(void) {
|
||||
m256 rv = {zeroes128(), zeroes128()};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 ones256(void) {
|
||||
m256 rv = {ones128(), ones128()};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 and256(m256 a, m256 b) {
|
||||
m256 rv;
|
||||
rv.lo = and128(a.lo, b.lo);
|
||||
rv.hi = and128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 or256(m256 a, m256 b) {
|
||||
m256 rv;
|
||||
rv.lo = or128(a.lo, b.lo);
|
||||
rv.hi = or128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 xor256(m256 a, m256 b) {
|
||||
m256 rv;
|
||||
rv.lo = xor128(a.lo, b.lo);
|
||||
rv.hi = xor128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 not256(m256 a) {
|
||||
m256 rv;
|
||||
rv.lo = not128(a.lo);
|
||||
rv.hi = not128(a.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 andnot256(m256 a, m256 b) {
|
||||
m256 rv;
|
||||
rv.lo = andnot128(a.lo, b.lo);
|
||||
rv.hi = andnot128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline int diff256(m256 a, m256 b) {
|
||||
return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
|
||||
}
|
||||
|
||||
static really_inline int isnonzero256(m256 a) {
|
||||
return isnonzero128(or128(a.lo, a.hi));
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff256(). Takes two vectors a and b and returns a 8-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline
|
||||
u32 diffrich256(m256 a, m256 b) {
|
||||
return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
|
||||
* returns an 8-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich64_256(m256 a, m256 b) {
|
||||
u32 d = diffrich256(a, b);
|
||||
return (d | (d >> 1)) & 0x55555555;
|
||||
}
|
||||
|
||||
// aligned load
|
||||
static really_inline m256 load256(const void *ptr) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
|
||||
return rv;
|
||||
}
|
||||
|
||||
// aligned load of 128-bit value to low and high part of 256-bit value
|
||||
static really_inline m256 load2x128(const void *ptr) {
|
||||
return set1_2x128(load128(ptr));
|
||||
}
|
||||
|
||||
static really_inline m256 loadu2x128(const void *ptr) {
|
||||
return set1_2x128(loadu128(ptr));
|
||||
}
|
||||
|
||||
// aligned store
|
||||
static really_inline void store256(void *ptr, m256 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
ptr = assume_aligned(ptr, 16);
|
||||
*(m256 *)ptr = a;
|
||||
}
|
||||
|
||||
// unaligned load
|
||||
static really_inline m256 loadu256(const void *ptr) {
|
||||
m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
|
||||
return rv;
|
||||
}
|
||||
|
||||
// unaligned store
|
||||
static really_inline void storeu256(void *ptr, m256 a) {
|
||||
storeu128(ptr, a.lo);
|
||||
storeu128((char *)ptr + 16, a.hi);
|
||||
}
|
||||
|
||||
// packed unaligned store of first N bytes
|
||||
static really_inline
|
||||
void storebytes256(void *ptr, m256 a, unsigned int n) {
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(ptr, &a, n);
|
||||
}
|
||||
|
||||
// packed unaligned load of first N bytes, pad with zero
|
||||
static really_inline
|
||||
m256 loadbytes256(const void *ptr, unsigned int n) {
|
||||
m256 a = zeroes256();
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(&a, ptr, n);
|
||||
return a;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 mask1bit256(unsigned int n) {
|
||||
assert(n < sizeof(m256) * 8);
|
||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
||||
mask_idx -= n / 8;
|
||||
return loadu256(&simd_onebit_masks[mask_idx]);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 set1_32x8(u32 in) {
|
||||
m256 rv;
|
||||
rv.hi = set1_16x8(in);
|
||||
rv.lo = set1_16x8(in);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) {
|
||||
m256 rv;
|
||||
rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0);
|
||||
rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
|
||||
m256 rv;
|
||||
rv.hi = set2x64(hi_1, hi_0);
|
||||
rv.lo = set2x64(lo_1, lo_0);
|
||||
return rv;
|
||||
}
|
||||
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit256(m256 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
m128 *sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
n -= 128;
|
||||
}
|
||||
setbit128(sub, n);
|
||||
}
|
||||
|
||||
// switches off bit N in the given vector.
|
||||
static really_inline
|
||||
void clearbit256(m256 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
m128 *sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
n -= 128;
|
||||
}
|
||||
clearbit128(sub, n);
|
||||
}
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit256(m256 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
m128 sub;
|
||||
if (n < 128) {
|
||||
sub = val.lo;
|
||||
} else {
|
||||
sub = val.hi;
|
||||
n -= 128;
|
||||
}
|
||||
return testbit128(sub, n);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 movdq_hi(m256 x) {
|
||||
return x.hi;
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 movdq_lo(m256 x) {
|
||||
return x.lo;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 combine2x128(m128 hi, m128 lo) {
|
||||
m256 rv = {lo, hi};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 pshufb_m256(m256 a, m256 b) {
|
||||
m256 rv;
|
||||
rv.lo = pshufb_m128(a.lo, b.lo);
|
||||
rv.hi = pshufb_m128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
#endif // HAVE_SIMD_256_BITS
|
||||
|
||||
/****
|
||||
**** 384-bit Primitives
|
||||
****/
|
||||
|
||||
static really_inline m384 and384(m384 a, m384 b) {
|
||||
m384 rv;
|
||||
rv.lo = and128(a.lo, b.lo);
|
||||
rv.mid = and128(a.mid, b.mid);
|
||||
rv.hi = and128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m384 or384(m384 a, m384 b) {
|
||||
m384 rv;
|
||||
rv.lo = or128(a.lo, b.lo);
|
||||
rv.mid = or128(a.mid, b.mid);
|
||||
rv.hi = or128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m384 xor384(m384 a, m384 b) {
|
||||
m384 rv;
|
||||
rv.lo = xor128(a.lo, b.lo);
|
||||
rv.mid = xor128(a.mid, b.mid);
|
||||
rv.hi = xor128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
static really_inline m384 not384(m384 a) {
|
||||
m384 rv;
|
||||
rv.lo = not128(a.lo);
|
||||
rv.mid = not128(a.mid);
|
||||
rv.hi = not128(a.hi);
|
||||
return rv;
|
||||
}
|
||||
static really_inline m384 andnot384(m384 a, m384 b) {
|
||||
m384 rv;
|
||||
rv.lo = andnot128(a.lo, b.lo);
|
||||
rv.mid = andnot128(a.mid, b.mid);
|
||||
rv.hi = andnot128(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m384 lshift64_m384(m384 a, unsigned b) {
|
||||
m384 rv;
|
||||
rv.lo = lshift64_m128(a.lo, b);
|
||||
rv.mid = lshift64_m128(a.mid, b);
|
||||
rv.hi = lshift64_m128(a.hi, b);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m384 zeroes384(void) {
|
||||
m384 rv = {zeroes128(), zeroes128(), zeroes128()};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m384 ones384(void) {
|
||||
m384 rv = {ones128(), ones128(), ones128()};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline int diff384(m384 a, m384 b) {
|
||||
return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
|
||||
}
|
||||
|
||||
static really_inline int isnonzero384(m384 a) {
|
||||
return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
|
||||
}
|
||||
|
||||
#if defined(HAVE_SIMD_128_BITS) && !defined(ARCH_IA32) && !defined(ARCH_X86_64)
|
||||
/**
|
||||
* "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline
|
||||
u32 diffrich384(m384 a, m384 b) {
|
||||
return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
|
||||
* returns a 12-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich64_384(m384 a, m384 b) {
|
||||
u32 d = diffrich384(a, b);
|
||||
return (d | (d >> 1)) & 0x55555555;
|
||||
}
|
||||
|
||||
// aligned load
|
||||
static really_inline m384 load384(const void *ptr) {
|
||||
assert(ISALIGNED_16(ptr));
|
||||
m384 rv = { load128(ptr), load128((const char *)ptr + 16),
|
||||
load128((const char *)ptr + 32) };
|
||||
return rv;
|
||||
}
|
||||
|
||||
// aligned store
|
||||
static really_inline void store384(void *ptr, m384 a) {
|
||||
assert(ISALIGNED_16(ptr));
|
||||
ptr = assume_aligned(ptr, 16);
|
||||
*(m384 *)ptr = a;
|
||||
}
|
||||
|
||||
// unaligned load
|
||||
static really_inline m384 loadu384(const void *ptr) {
|
||||
m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
|
||||
loadu128((const char *)ptr + 32)};
|
||||
return rv;
|
||||
}
|
||||
|
||||
// packed unaligned store of first N bytes
|
||||
static really_inline
|
||||
void storebytes384(void *ptr, m384 a, unsigned int n) {
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(ptr, &a, n);
|
||||
}
|
||||
|
||||
// packed unaligned load of first N bytes, pad with zero
|
||||
static really_inline
|
||||
m384 loadbytes384(const void *ptr, unsigned int n) {
|
||||
m384 a = zeroes384();
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(&a, ptr, n);
|
||||
return a;
|
||||
}
|
||||
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit384(m384 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
m128 *sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo;
|
||||
} else if (n < 256) {
|
||||
sub = &ptr->mid;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
}
|
||||
setbit128(sub, n % 128);
|
||||
}
|
||||
|
||||
// switches off bit N in the given vector.
|
||||
static really_inline
|
||||
void clearbit384(m384 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
m128 *sub;
|
||||
if (n < 128) {
|
||||
sub = &ptr->lo;
|
||||
} else if (n < 256) {
|
||||
sub = &ptr->mid;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
}
|
||||
clearbit128(sub, n % 128);
|
||||
}
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit384(m384 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
m128 sub;
|
||||
if (n < 128) {
|
||||
sub = val.lo;
|
||||
} else if (n < 256) {
|
||||
sub = val.mid;
|
||||
} else {
|
||||
sub = val.hi;
|
||||
}
|
||||
return testbit128(sub, n % 128);
|
||||
}
|
||||
|
||||
|
||||
/****
|
||||
**** 512-bit Primitives
|
||||
****/
|
||||
|
||||
#if !defined(HAVE_SIMD_512_BITS)
|
||||
|
||||
static really_inline
|
||||
m512 zeroes512(void) {
|
||||
m512 rv = {zeroes256(), zeroes256()};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 ones512(void) {
|
||||
m512 rv = {ones256(), ones256()};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set1_64x8(u8 a) {
|
||||
m256 a256 = set1_32x8(a);
|
||||
m512 rv = {a256, a256};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set1_8x64(u64a a) {
|
||||
m256 a256 = set1_4x64(a);
|
||||
m512 rv = {a256, a256};
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
|
||||
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
|
||||
m512 rv;
|
||||
rv.lo = set4x64(lo_3, lo_2, lo_1, lo_0);
|
||||
rv.hi = set4x64(hi_3, hi_2, hi_1, hi_0);
|
||||
return rv;
|
||||
}
|
||||
/*
|
||||
static really_inline
|
||||
m512 swap256in512(m512 a) {
|
||||
m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
|
||||
return vpermq512(idx, a);
|
||||
}*/
|
||||
|
||||
static really_inline
|
||||
m512 set1_4x128(m128 a) {
|
||||
m256 a256 = set1_2x128(a);
|
||||
m512 rv = {a256, a256};
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
||||
static really_inline
|
||||
m512 and512(m512 a, m512 b) {
|
||||
m512 rv;
|
||||
rv.lo = and256(a.lo, b.lo);
|
||||
rv.hi = and256(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 or512(m512 a, m512 b) {
|
||||
m512 rv;
|
||||
rv.lo = or256(a.lo, b.lo);
|
||||
rv.hi = or256(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 xor512(m512 a, m512 b) {
|
||||
m512 rv;
|
||||
rv.lo = xor256(a.lo, b.lo);
|
||||
rv.hi = xor256(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 not512(m512 a) {
|
||||
m512 rv;
|
||||
rv.lo = not256(a.lo);
|
||||
rv.hi = not256(a.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 andnot512(m512 a, m512 b) {
|
||||
m512 rv;
|
||||
rv.lo = andnot256(a.lo, b.lo);
|
||||
rv.hi = andnot256(a.hi, b.hi);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m512 lshift64_m512(m512 a, unsigned b) {
|
||||
m512 rv;
|
||||
rv.lo = lshift64_m256(a.lo, b);
|
||||
rv.hi = lshift64_m256(a.hi, b);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
int diff512(m512 a, m512 b) {
|
||||
return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
int isnonzero512(m512 a) {
|
||||
return isnonzero256(or256(a.lo, a.hi));
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline
|
||||
u32 diffrich512(m512 a, m512 b) {
|
||||
return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
|
||||
* returns a 16-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline
|
||||
u32 diffrich64_512(m512 a, m512 b) {
|
||||
//TODO: cmp_epi64?
|
||||
u32 d = diffrich512(a, b);
|
||||
return (d | (d >> 1)) & 0x55555555;
|
||||
}
|
||||
|
||||
// aligned load
|
||||
static really_inline
|
||||
m512 load512(const void *ptr) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
|
||||
return rv;
|
||||
}
|
||||
|
||||
// aligned store
|
||||
static really_inline
|
||||
void store512(void *ptr, m512 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m512)));
|
||||
m512 *x = (m512 *)ptr;
|
||||
store256(&x->lo, a.lo);
|
||||
store256(&x->hi, a.hi);
|
||||
}
|
||||
|
||||
// unaligned load
|
||||
static really_inline
|
||||
m512 loadu512(const void *ptr) {
|
||||
m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
|
||||
return rv;
|
||||
}
|
||||
|
||||
/*static really_inline
|
||||
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set_mask_m512(__mmask64 k) {
|
||||
}*/
|
||||
|
||||
// packed unaligned store of first N bytes
|
||||
static really_inline
|
||||
void storebytes512(void *ptr, m512 a, unsigned int n) {
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(ptr, &a, n);
|
||||
}
|
||||
|
||||
// packed unaligned load of first N bytes, pad with zero
|
||||
static really_inline
|
||||
m512 loadbytes512(const void *ptr, unsigned int n) {
|
||||
m512 a = zeroes512();
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(&a, ptr, n);
|
||||
return a;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 mask1bit512(unsigned int n) {
|
||||
assert(n < sizeof(m512) * 8);
|
||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
||||
mask_idx -= n / 8;
|
||||
return loadu512(&simd_onebit_masks[mask_idx]);
|
||||
}
|
||||
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit512(m512 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
m256 *sub;
|
||||
if (n < 256) {
|
||||
sub = &ptr->lo;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
n -= 256;
|
||||
}
|
||||
setbit256(sub, n);
|
||||
}
|
||||
|
||||
// switches off bit N in the given vector.
|
||||
static really_inline
|
||||
void clearbit512(m512 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
m256 *sub;
|
||||
if (n < 256) {
|
||||
sub = &ptr->lo;
|
||||
} else {
|
||||
sub = &ptr->hi;
|
||||
n -= 256;
|
||||
}
|
||||
clearbit256(sub, n);
|
||||
}
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit512(m512 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
m256 sub;
|
||||
if (n < 256) {
|
||||
sub = val.lo;
|
||||
} else {
|
||||
sub = val.hi;
|
||||
n -= 256;
|
||||
}
|
||||
return testbit256(sub, n);
|
||||
}
|
||||
|
||||
#endif // HAVE_SIMD_512_BITS
|
||||
|
||||
#endif // ARCH_COMMON_SIMD_UTILS_H
|
322  src/util/arch/x86/bitutils.h  Normal file
@ -0,0 +1,322 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Bit-twiddling primitives (ctz, compress etc)
|
||||
*/
|
||||
|
||||
#ifndef BITUTILS_ARCH_X86_H
|
||||
#define BITUTILS_ARCH_X86_H
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "util/popcount.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/intrinsics.h"
|
||||
|
||||
#include "util/arch/common/bitutils.h"
|
||||
|
||||
static really_inline
|
||||
u32 clz32_impl(u32 x) {
|
||||
#if defined(_WIN32)
|
||||
unsigned long r;
|
||||
_BitScanReverse(&r, x);
|
||||
return 31 - r;
|
||||
#else
|
||||
return clz32_impl_c(x);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 clz64_impl(u64a x) {
|
||||
#if defined(_WIN64)
|
||||
unsigned long r;
|
||||
_BitScanReverse64(&r, x);
|
||||
return 63 - r;
|
||||
#elif defined(_WIN32)
|
||||
unsigned long x1 = (u32)x;
|
||||
unsigned long x2 = (u32)(x >> 32);
|
||||
unsigned long r;
|
||||
if (x2) {
|
||||
_BitScanReverse(&r, x2);
|
||||
return (u32)(31 - r);
|
||||
}
|
||||
_BitScanReverse(&r, (u32)x1);
|
||||
return (u32)(63 - r);
|
||||
#else
|
||||
return clz64_impl_c(x);
|
||||
#endif
|
||||
}
|
||||
|
||||
// CTZ (count trailing zero) implementations.
|
||||
static really_inline
|
||||
u32 ctz32_impl(u32 x) {
|
||||
#if defined(_WIN32)
|
||||
unsigned long r;
|
||||
_BitScanForward(&r, x);
|
||||
return r;
|
||||
#else
|
||||
return ctz32_impl_c(x);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 ctz64_impl(u64a x) {
|
||||
#if defined(_WIN64)
|
||||
unsigned long r;
|
||||
_BitScanForward64(&r, x);
|
||||
return r;
|
||||
#elif defined(_WIN32)
|
||||
unsigned long r;
|
||||
if (_BitScanForward(&r, (u32)x)) {
|
||||
return (u32)r;
|
||||
}
|
||||
_BitScanForward(&r, x >> 32);
|
||||
return (u32)(r + 32);
|
||||
#else
|
||||
return ctz64_impl_c(x);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 lg2_impl(u32 x) {
|
||||
return lg2_impl_c(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a lg2_64_impl(u64a x) {
|
||||
return lg2_64_impl_c(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearLSB_32_impl(u32 *v) {
|
||||
#ifndef NO_ASM
|
||||
u32 val = *v, offset;
|
||||
__asm__ ("bsf %1, %0\n"
|
||||
"btr %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
|
||||
assert(offset < 32);
|
||||
return offset;
|
||||
#else
|
||||
return findAndClearLSB_32_impl_c(v);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearLSB_64_impl(u64a *v) {
|
||||
#ifdef ARCH_64_BIT
|
||||
#if !defined(NO_ASM)
|
||||
u64a val = *v, offset;
|
||||
__asm__ ("bsfq %1, %0\n"
|
||||
"btrq %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
#else
|
||||
// generic variant using gcc's builtin on 64-bit
|
||||
u64a val = *v, offset;
|
||||
offset = ctz64_impl(val);
|
||||
*v = val & (val - 1);
|
||||
#endif // NO_ASM
|
||||
assert(offset < 64);
|
||||
return (u32)offset;
|
||||
#else
|
||||
return findAndClearLSB_64_impl_c(v);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearMSB_32_impl(u32 *v) {
|
||||
#if !defined(NO_ASM)
|
||||
u32 val = *v, offset;
|
||||
__asm__ ("bsr %1, %0\n"
|
||||
"btr %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
#else
|
||||
u32 val = *v;
|
||||
u32 offset = 31 - clz32_impl(val);
|
||||
*v = val & ~(1 << offset);
|
||||
#endif
|
||||
assert(offset < 32);
|
||||
return offset;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearMSB_64_impl(u64a *v) {
|
||||
#ifdef ARCH_64_BIT
|
||||
#if !defined(NO_ASM)
|
||||
u64a val = *v, offset;
|
||||
__asm__ ("bsrq %1, %0\n"
|
||||
"btrq %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
#else
|
||||
// generic variant using gcc's builtin on 64-bit
|
||||
u64a val = *v, offset;
|
||||
offset = 63 - clz64_impl(val);
|
||||
*v = val & ~(1ULL << offset);
|
||||
#endif // NO_ASM
|
||||
assert(offset < 64);
|
||||
return (u32)offset;
|
||||
#else
|
||||
return findAndClearMSB_64_impl_c(v);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 compress32_impl(u32 x, u32 m) {
|
||||
#if defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pext_u32(x, m);
|
||||
#else
|
||||
return compress32_impl_c(x, m);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a compress64_impl(u64a x, u64a m) {
|
||||
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pext_u64(x, m);
|
||||
#else
|
||||
return compress64_impl_c(x, m);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 compress128_impl(m128 x, m128 m) {
|
||||
return compress128_impl_c(x, m);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 expand32_impl(u32 x, u32 m) {
|
||||
#if defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pdep_u32(x, m);
|
||||
#else
|
||||
return expand32_impl_c(x, m);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a expand64_impl(u64a x, u64a m) {
|
||||
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pdep_u64(x, m);
|
||||
#else
|
||||
return expand64_impl_c(x, m);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* returns the first set bit after begin (if not ~0U). If no bit is set after
|
||||
* begin returns ~0U
|
||||
*/
|
||||
static really_inline
|
||||
u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
|
||||
if (begin != ~0U) {
|
||||
/* switch off all bits at or below begin. Note: not legal to shift by
|
||||
* by size of the datatype or larger. */
|
||||
assert(begin <= 63);
|
||||
bitfield &= ~((2ULL << begin) - 1);
|
||||
}
|
||||
|
||||
if (!bitfield) {
|
||||
return ~0U;
|
||||
}
|
||||
|
||||
return ctz64_impl(bitfield);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
char bf64_set_impl(u64a *bitfield, u32 i) {
|
||||
return bf64_set_impl_c(bitfield, i);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void bf64_unset_impl(u64a *bitfield, u32 i) {
|
||||
return bf64_unset_impl_c(bitfield, i);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 rank_in_mask32_impl(u32 mask, u32 bit) {
|
||||
return rank_in_mask32_impl_c(mask, bit);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 rank_in_mask64_impl(u64a mask, u32 bit) {
|
||||
return rank_in_mask64_impl_c(mask, bit);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 pext32_impl(u32 x, u32 mask) {
|
||||
#if defined(HAVE_BMI2)
|
||||
// Intel BMI2 can do this operation in one instruction.
|
||||
return _pext_u32(x, mask);
|
||||
#else
|
||||
return pext32_impl_c(x, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a pext64_impl(u64a x, u64a mask) {
|
||||
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
|
||||
// Intel BMI2 can do this operation in one instruction.
|
||||
return _pext_u64(x, mask);
|
||||
#else
|
||||
return pext64_impl_c(x, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
|
||||
static really_inline
|
||||
u64a pdep64(u64a x, u64a mask) {
|
||||
return _pdep_u64(x, mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
|
||||
* so we force its generation.
|
||||
*/
|
||||
static really_inline
|
||||
u64a andn_impl(const u32 a, const u8 *b) {
|
||||
#if defined(HAVE_BMI) && !defined(NO_ASM)
|
||||
u64a r;
|
||||
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
|
||||
return r;
|
||||
#else
|
||||
return andn_impl_c(a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // BITUTILS_ARCH_X86_H
|
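For reference, the findAndClearLSB/MSB wrappers above all reduce to a bit-scan followed by clearing the reported bit. A minimal standalone sketch of that idiom, assuming a GCC/Clang toolchain for the __builtin_ctz/__builtin_clz intrinsics (illustration only, not code from this patch):

/* Sketch: portable equivalents of findAndClearLSB_32_impl / findAndClearMSB_32_impl. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned find_and_clear_lsb32(uint32_t *v) {
    assert(*v);                                         /* undefined for zero input */
    unsigned offset = (unsigned)__builtin_ctz(*v);      /* index of lowest set bit */
    *v &= *v - 1;                                       /* clear that bit */
    return offset;
}

static unsigned find_and_clear_msb32(uint32_t *v) {
    assert(*v);
    unsigned offset = 31 - (unsigned)__builtin_clz(*v); /* index of highest set bit */
    *v &= ~(UINT32_C(1) << offset);
    return offset;
}

int main(void) {
    uint32_t a = 0x50, b = 0x50;                        /* bits 4 and 6 set */
    printf("lsb %u, msb %u\n", find_and_clear_lsb32(&a), find_and_clear_msb32(&b));
    return 0;                                           /* prints "lsb 4, msb 6" */
}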
@ -26,7 +26,7 @@
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "cpuid_flags.h"
|
||||
#include "util/arch/common/cpuid_flags.h"
|
||||
#include "cpuid_inline.h"
|
||||
#include "ue2common.h"
|
||||
#include "hs_compile.h" // for HS_MODE_ flags
|
@ -30,7 +30,7 @@
|
||||
#define CPUID_INLINE_H_
|
||||
|
||||
#include "ue2common.h"
|
||||
#include "cpuid_flags.h"
|
||||
#include "util/arch/common/cpuid_flags.h"
|
||||
|
||||
#if !defined(_WIN32) && !defined(CPUID_H_)
|
||||
#include <cpuid.h>
|
82
src/util/arch/x86/crc32.h
Normal file
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef UTIL_ARCH_X86_CRC32_H_
#define UTIL_ARCH_X86_CRC32_H_

#include "util/arch/x86/x86.h"
#include "util/intrinsics.h"

#ifdef ARCH_64_BIT
#define CRC_WORD 8
#define CRC_TYPE u64a
#define CRC_FUNC _mm_crc32_u64
#else
#define CRC_WORD 4
#define CRC_TYPE u32
#define CRC_FUNC _mm_crc32_u32
#endif

/*
 * Use the crc32 instruction from SSE4.2 to compute our checksum - same
 * polynomial as the generic table-driven implementation.
 */
static really_inline
u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
                 const size_t length) {
    u32 crc = running_crc;

    // Process byte-by-byte until p_buf is aligned

    const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
    size_t init_bytes = aligned_buf - p_buf;
    size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
    size_t end_bytes = length - init_bytes - running_length;

    while (p_buf < aligned_buf) {
        crc = _mm_crc32_u8(crc, *p_buf++);
    }

    // Main aligned loop, processes a word at a time.

    for (size_t li = 0; li < running_length/CRC_WORD; li++) {
        CRC_TYPE block = *(const CRC_TYPE *)p_buf;
        crc = CRC_FUNC(crc, block);
        p_buf += CRC_WORD;
    }

    // Remaining bytes

    for (size_t li = 0; li < end_bytes; li++) {
        crc = _mm_crc32_u8(crc, *p_buf++);
    }

    return crc;
}

#endif // UTIL_ARCH_X86_CRC32_H_
|
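A usage sketch for the CRC32-C instruction used above (illustration only, not code from this patch): the word-at-a-time prologue/epilogue split is omitted and only the byte intrinsic is exercised; assumes an SSE4.2-capable build, e.g. compiled with -msse4.2.

/* Sketch: CRC32-C over a buffer using the SSE4.2 byte intrinsic. */
#include <nmmintrin.h>
#include <stddef.h>
#include <stdio.h>

static unsigned crc32c_bytes(unsigned crc, const unsigned char *buf, size_t len) {
    for (size_t i = 0; i < len; i++) {
        crc = _mm_crc32_u8(crc, buf[i]);   /* one byte folded per instruction */
    }
    return crc;
}

int main(void) {
    const unsigned char msg[] = "123456789";
    /* Seed/finalise conventions are up to the caller; this just exercises the
     * instruction the same way as the byte loops above. */
    unsigned crc = crc32c_bytes(0xffffffffu, msg, sizeof(msg) - 1) ^ 0xffffffffu;
    printf("crc32c: 0x%08x\n", crc);
    return 0;
}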
@ -29,12 +29,12 @@
|
||||
#ifndef MASKED_MOVE_H
|
||||
#define MASKED_MOVE_H
|
||||
|
||||
#include "arch.h"
|
||||
#include "x86.h"
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
|
||||
#include "unaligned.h"
|
||||
#include "simd_utils.h"
|
||||
#include "util/unaligned.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
45
src/util/arch/x86/simd_types.h
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef SIMD_TYPES_X86_H
#define SIMD_TYPES_X86_H

#if !defined(m128) && defined(HAVE_SSE2)
typedef __m128i m128;
#endif

#if !defined(m256) && defined(HAVE_AVX2)
typedef __m256i m256;
#endif

#if !defined(m512) && defined(HAVE_AVX512)
typedef __m512i m512;
#endif

#endif /* SIMD_TYPES_X86_H */
|
||||
|
772
src/util/arch/x86/simd_utils.h
Normal file
@ -0,0 +1,772 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief SIMD types and primitive operations.
|
||||
*/
|
||||
|
||||
#ifndef ARCH_X86_SIMD_UTILS_H
|
||||
#define ARCH_X86_SIMD_UTILS_H
|
||||
|
||||
#include "x86.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/simd_types.h"
|
||||
#include "util/unaligned.h"
|
||||
#include "util/intrinsics.h"
|
||||
|
||||
#include <string.h> // for memcpy
|
||||
|
||||
static really_inline m128 ones128(void) {
|
||||
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
|
||||
/* gcc gets this right */
|
||||
return _mm_set1_epi8(0xFF);
|
||||
#else
|
||||
/* trick from Intel's optimization guide to generate all-ones.
|
||||
* ICC converts this to the single cmpeq instruction */
|
||||
return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline m128 zeroes128(void) {
|
||||
return _mm_setzero_si128();
|
||||
}
|
||||
|
||||
/** \brief Bitwise not for m128*/
|
||||
static really_inline m128 not128(m128 a) {
|
||||
return _mm_xor_si128(a, ones128());
|
||||
}
|
||||
|
||||
/** \brief Return 1 if a and b are different otherwise 0 */
|
||||
static really_inline int diff128(m128 a, m128 b) {
|
||||
return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
|
||||
}
|
||||
|
||||
static really_inline int isnonzero128(m128 a) {
|
||||
return !!diff128(a, zeroes128());
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich128(m128 a, m128 b) {
|
||||
a = _mm_cmpeq_epi32(a, b);
|
||||
return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
|
||||
* returns a 4-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich64_128(m128 a, m128 b) {
|
||||
#if defined(HAVE_SSE41)
|
||||
a = _mm_cmpeq_epi64(a, b);
|
||||
return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
|
||||
#else
|
||||
u32 d = diffrich128(a, b);
|
||||
return (d | (d >> 1)) & 0x5;
|
||||
#endif
|
||||
}
|
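The diffrich mask described in the comments above can be checked in isolation with raw SSE2 intrinsics (illustration only, not code from this patch):

/* Sketch: compute the 4-bit "which 32-bit lanes differ" mask like diffrich128(). */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set_epi32(4, 3, 2, 1);
    __m128i b = _mm_set_epi32(4, 9, 2, 9);   /* lanes 0 and 2 differ */
    __m128i eq = _mm_cmpeq_epi32(a, b);
    unsigned mask = ~(unsigned)_mm_movemask_ps(_mm_castsi128_ps(eq)) & 0xf;
    printf("diffrich mask: 0x%x\n", mask);   /* prints 0x5 */
    return 0;
}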
||||
|
||||
static really_really_inline
|
||||
m128 lshift64_m128(m128 a, unsigned b) {
|
||||
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||
if (__builtin_constant_p(b)) {
|
||||
return _mm_slli_epi64(a, b);
|
||||
}
|
||||
#endif
|
||||
m128 x = _mm_cvtsi32_si128(b);
|
||||
return _mm_sll_epi64(a, x);
|
||||
}
|
||||
|
||||
#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
|
||||
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
|
||||
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
|
||||
|
||||
static really_inline m128 set1_16x8(u8 c) {
|
||||
return _mm_set1_epi8(c);
|
||||
}
|
||||
|
||||
static really_inline m128 set1_4x32(u32 c) {
|
||||
return _mm_set1_epi32(c);
|
||||
}
|
||||
|
||||
static really_inline m128 set1_2x64(u64a c) {
|
||||
return _mm_set1_epi64x(c);
|
||||
}
|
||||
|
||||
static really_inline u32 movd(const m128 in) {
|
||||
return _mm_cvtsi128_si32(in);
|
||||
}
|
||||
|
||||
static really_inline u64a movq(const m128 in) {
|
||||
return _mm_cvtsi128_si64(in);
|
||||
}
|
||||
|
||||
/* another form of movq */
|
||||
static really_inline
|
||||
m128 load_m128_from_u64a(const u64a *p) {
|
||||
return _mm_set_epi64x(0LL, *p);
|
||||
}
|
||||
|
||||
#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
|
||||
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
|
||||
|
||||
#if defined(HAVE_SSE41)
|
||||
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
|
||||
#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
|
||||
#else
|
||||
#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
|
||||
#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
|
||||
#endif
|
||||
|
||||
#if !defined(HAVE_AVX2)
|
||||
// TODO: this entire file needs restructuring - this carveout is awful
|
||||
#define extractlow64from256(a) movq(a.lo)
|
||||
#define extractlow32from256(a) movd(a.lo)
|
||||
#if defined(HAVE_SSE41)
|
||||
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
|
||||
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
|
||||
#else
|
||||
#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
|
||||
#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
|
||||
#endif
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
static really_inline m128 and128(m128 a, m128 b) {
|
||||
return _mm_and_si128(a,b);
|
||||
}
|
||||
|
||||
static really_inline m128 xor128(m128 a, m128 b) {
|
||||
return _mm_xor_si128(a,b);
|
||||
}
|
||||
|
||||
static really_inline m128 or128(m128 a, m128 b) {
|
||||
return _mm_or_si128(a,b);
|
||||
}
|
||||
|
||||
static really_inline m128 andnot128(m128 a, m128 b) {
|
||||
return _mm_andnot_si128(a, b);
|
||||
}
|
||||
|
||||
// aligned load
|
||||
static really_inline m128 load128(const void *ptr) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m128)));
|
||||
ptr = assume_aligned(ptr, 16);
|
||||
return _mm_load_si128((const m128 *)ptr);
|
||||
}
|
||||
|
||||
// aligned store
|
||||
static really_inline void store128(void *ptr, m128 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m128)));
|
||||
ptr = assume_aligned(ptr, 16);
|
||||
*(m128 *)ptr = a;
|
||||
}
|
||||
|
||||
// unaligned load
|
||||
static really_inline m128 loadu128(const void *ptr) {
|
||||
return _mm_loadu_si128((const m128 *)ptr);
|
||||
}
|
||||
|
||||
// unaligned store
|
||||
static really_inline void storeu128(void *ptr, m128 a) {
|
||||
_mm_storeu_si128 ((m128 *)ptr, a);
|
||||
}
|
||||
|
||||
// packed unaligned store of first N bytes
|
||||
static really_inline
|
||||
void storebytes128(void *ptr, m128 a, unsigned int n) {
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(ptr, &a, n);
|
||||
}
|
||||
|
||||
// packed unaligned load of first N bytes, pad with zero
|
||||
static really_inline
|
||||
m128 loadbytes128(const void *ptr, unsigned int n) {
|
||||
m128 a = zeroes128();
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(&a, ptr, n);
|
||||
return a;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
extern const u8 simd_onebit_masks[];
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
m128 mask1bit128(unsigned int n) {
|
||||
assert(n < sizeof(m128) * 8);
|
||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
||||
mask_idx -= n / 8;
|
||||
return loadu128(&simd_onebit_masks[mask_idx]);
|
||||
}
|
||||
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit128(m128 *ptr, unsigned int n) {
|
||||
*ptr = or128(mask1bit128(n), *ptr);
|
||||
}
|
||||
|
||||
// switches off bit N in the given vector.
|
||||
static really_inline
|
||||
void clearbit128(m128 *ptr, unsigned int n) {
|
||||
*ptr = andnot128(mask1bit128(n), *ptr);
|
||||
}
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit128(m128 val, unsigned int n) {
|
||||
const m128 mask = mask1bit128(n);
|
||||
#if defined(HAVE_SSE41)
|
||||
return !_mm_testz_si128(mask, val);
|
||||
#else
|
||||
return isnonzero128(and128(mask, val));
|
||||
#endif
|
||||
}
|
||||
|
||||
// offset must be an immediate
|
||||
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
|
||||
|
||||
static really_inline
|
||||
m128 pshufb_m128(m128 a, m128 b) {
|
||||
m128 result;
|
||||
result = _mm_shuffle_epi8(a, b);
|
||||
return result;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 variable_byte_shift_m128(m128 in, s32 amount) {
|
||||
assert(amount >= -16 && amount <= 16);
|
||||
m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
|
||||
return pshufb_m128(in, shift_mask);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 max_u8_m128(m128 a, m128 b) {
|
||||
return _mm_max_epu8(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 min_u8_m128(m128 a, m128 b) {
|
||||
return _mm_min_epu8(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 sadd_u8_m128(m128 a, m128 b) {
|
||||
return _mm_adds_epu8(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 sub_u8_m128(m128 a, m128 b) {
|
||||
return _mm_sub_epi8(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
|
||||
return _mm_set_epi32(x3, x2, x1, x0);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m128 set2x64(u64a hi, u64a lo) {
|
||||
return _mm_set_epi64x(hi, lo);
|
||||
}
|
||||
|
||||
/****
|
||||
**** 256-bit Primitives
|
||||
****/
|
||||
|
||||
#if defined(HAVE_SIMD_256_BITS) && defined(HAVE_AVX2)
|
||||
|
||||
static really_inline
|
||||
m256 pshufb_m256(m256 a, m256 b) {
|
||||
return _mm256_shuffle_epi8(a, b);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m256 lshift64_m256(m256 a, unsigned b) {
|
||||
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||
if (__builtin_constant_p(b)) {
|
||||
return _mm256_slli_epi64(a, b);
|
||||
}
|
||||
#endif
|
||||
m128 x = _mm_cvtsi32_si128(b);
|
||||
return _mm256_sll_epi64(a, x);
|
||||
}
|
||||
|
||||
#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
|
||||
|
||||
static really_inline m256 set1_4x64(u64a c) {
|
||||
return _mm256_set1_epi64x(c);
|
||||
}
|
||||
|
||||
#define eq256(a, b) _mm256_cmpeq_epi8((a), (b))
|
||||
#define movemask256(a) ((u32)_mm256_movemask_epi8((a)))
|
||||
|
||||
static really_inline
|
||||
m256 set1_2x128(m128 a) {
|
||||
return _mm256_broadcastsi128_si256(a);
|
||||
}
|
||||
|
||||
static really_inline m256 zeroes256(void) {
|
||||
return _mm256_setzero_si256();
|
||||
}
|
||||
|
||||
static really_inline m256 ones256(void) {
|
||||
m256 rv = _mm256_set1_epi8(0xFF);
|
||||
return rv;
|
||||
}
|
||||
|
||||
static really_inline m256 and256(m256 a, m256 b) {
|
||||
return _mm256_and_si256(a, b);
|
||||
}
|
||||
|
||||
static really_inline m256 or256(m256 a, m256 b) {
|
||||
return _mm256_or_si256(a, b);
|
||||
}
|
||||
|
||||
static really_inline m256 xor256(m256 a, m256 b) {
|
||||
return _mm256_xor_si256(a, b);
|
||||
}
|
||||
|
||||
static really_inline m256 not256(m256 a) {
|
||||
return _mm256_xor_si256(a, ones256());
|
||||
}
|
||||
|
||||
static really_inline m256 andnot256(m256 a, m256 b) {
|
||||
return _mm256_andnot_si256(a, b);
|
||||
}
|
||||
|
||||
static really_inline int diff256(m256 a, m256 b) {
|
||||
return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
|
||||
}
|
||||
|
||||
static really_inline int isnonzero256(m256 a) {
|
||||
return !!diff256(a, zeroes256());
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich256(m256 a, m256 b) {
|
||||
a = _mm256_cmpeq_epi32(a, b);
|
||||
return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
|
||||
* returns an 8-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline u32 diffrich64_256(m256 a, m256 b) {
|
||||
u32 d = diffrich256(a, b);
|
||||
return (d | (d >> 1)) & 0x55555555;
|
||||
}
|
||||
|
||||
// aligned load
|
||||
static really_inline m256 load256(const void *ptr) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
return _mm256_load_si256((const m256 *)ptr);
|
||||
}
|
||||
|
||||
// aligned load of 128-bit value to low and high part of 256-bit value
|
||||
static really_inline m256 load2x128(const void *ptr) {
|
||||
return set1_2x128(load128(ptr));
|
||||
}
|
||||
|
||||
static really_inline m256 loadu2x128(const void *ptr) {
|
||||
return set1_2x128(loadu128(ptr));
|
||||
}
|
||||
|
||||
// aligned store
|
||||
static really_inline void store256(void *ptr, m256 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m256)));
|
||||
_mm256_store_si256((m256 *)ptr, a);
|
||||
}
|
||||
|
||||
// unaligned load
|
||||
static really_inline m256 loadu256(const void *ptr) {
|
||||
return _mm256_loadu_si256((const m256 *)ptr);
|
||||
}
|
||||
|
||||
// unaligned store
|
||||
static really_inline void storeu256(void *ptr, m256 a) {
|
||||
_mm256_storeu_si256((m256 *)ptr, a);
|
||||
}
|
||||
|
||||
// packed unaligned store of first N bytes
|
||||
static really_inline
|
||||
void storebytes256(void *ptr, m256 a, unsigned int n) {
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(ptr, &a, n);
|
||||
}
|
||||
|
||||
// packed unaligned load of first N bytes, pad with zero
|
||||
static really_inline
|
||||
m256 loadbytes256(const void *ptr, unsigned int n) {
|
||||
m256 a = zeroes256();
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(&a, ptr, n);
|
||||
return a;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 mask1bit256(unsigned int n) {
|
||||
assert(n < sizeof(m256) * 8);
|
||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
||||
mask_idx -= n / 8;
|
||||
return loadu256(&simd_onebit_masks[mask_idx]);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 set1_32x8(u32 in) {
|
||||
return _mm256_set1_epi8(in);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) {
|
||||
return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
|
||||
return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
|
||||
}
|
||||
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit256(m256 *ptr, unsigned int n) {
|
||||
*ptr = or256(mask1bit256(n), *ptr);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void clearbit256(m256 *ptr, unsigned int n) {
|
||||
*ptr = andnot256(mask1bit256(n), *ptr);
|
||||
}
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit256(m256 val, unsigned int n) {
|
||||
const m256 mask = mask1bit256(n);
|
||||
return !_mm256_testz_si256(mask, val);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 movdq_hi(m256 x) {
|
||||
return _mm256_extracti128_si256(x, 1);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m128 movdq_lo(m256 x) {
|
||||
return _mm256_extracti128_si256(x, 0);
|
||||
}
|
||||
|
||||
#define cast256to128(a) _mm256_castsi256_si128(a)
|
||||
#define cast128to256(a) _mm256_castsi128_si256(a)
|
||||
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
|
||||
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
|
||||
#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
|
||||
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
|
||||
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
|
||||
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
|
||||
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
|
||||
#define extractlow32from256(a) movd(cast256to128(a))
|
||||
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
|
||||
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
|
||||
#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)
|
||||
|
||||
static really_inline
|
||||
m256 combine2x128(m128 hi, m128 lo) {
|
||||
#if defined(_mm256_set_m128i)
|
||||
return _mm256_set_m128i(hi, lo);
|
||||
#else
|
||||
return insert128to256(cast128to256(lo), hi, 1);
|
||||
#endif
|
||||
}
|
||||
#endif //AVX2
|
||||
|
||||
#if defined(HAVE_SIMD_128_BITS)
|
||||
/**
|
||||
* "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
|
||||
static really_inline u32 diffrich384(m384 a, m384 b) {
|
||||
m128 z = zeroes128();
|
||||
a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
|
||||
a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
|
||||
a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
|
||||
m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
|
||||
_mm_packs_epi32(a.hi, z));
|
||||
return ~(_mm_movemask_epi8(packed)) & 0xfff;
|
||||
}
|
||||
|
||||
#endif // HAVE_SIMD_128_BITS
|
||||
|
||||
/****
|
||||
**** 512-bit Primitives
|
||||
****/
|
||||
|
||||
#if defined(HAVE_SIMD_512_BITS)
|
||||
|
||||
#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
|
||||
#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
|
||||
#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
|
||||
#define set2x256(a) _mm512_broadcast_i64x4(a)
|
||||
#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
|
||||
#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
|
||||
|
||||
static really_inline u32 movd512(const m512 in) {
|
||||
// NOTE: gcc does not appear to support _mm512_cvtsi512_si32(in),
// so we use a 2-step conversion to work around it.
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 pshufb_m512(m512 a, m512 b) {
|
||||
return _mm512_shuffle_epi8(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
|
||||
return _mm512_maskz_shuffle_epi8(k, a, b);
|
||||
}
|
||||
|
||||
#if defined(HAVE_AVX512VBMI)
|
||||
#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a)
|
||||
#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a)
|
||||
#endif
|
||||
|
||||
#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
|
||||
#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))
|
||||
|
||||
static really_inline
|
||||
m512 zeroes512(void) {
|
||||
#if defined(HAVE_AVX512)
|
||||
return _mm512_setzero_si512();
|
||||
#else
|
||||
m512 rv = {zeroes256(), zeroes256()};
|
||||
return rv;
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 ones512(void) {
|
||||
return _mm512_set1_epi8(0xFF);
|
||||
//return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set1_64x8(u8 a) {
|
||||
return _mm512_set1_epi8(a);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set1_8x64(u64a a) {
|
||||
return _mm512_set1_epi64(a);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
|
||||
u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
|
||||
return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
|
||||
lo_3, lo_2, lo_1, lo_0);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 swap256in512(m512 a) {
|
||||
m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
|
||||
return vpermq512(idx, a);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set1_4x128(m128 a) {
|
||||
return _mm512_broadcast_i32x4(a);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 and512(m512 a, m512 b) {
|
||||
return _mm512_and_si512(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 or512(m512 a, m512 b) {
|
||||
return _mm512_or_si512(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 xor512(m512 a, m512 b) {
|
||||
return _mm512_xor_si512(a, b);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 not512(m512 a) {
|
||||
return _mm512_xor_si512(a, ones512());
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 andnot512(m512 a, m512 b) {
|
||||
return _mm512_andnot_si512(a, b);
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
m512 lshift64_m512(m512 a, unsigned b) {
|
||||
#if defined(HAVE__BUILTIN_CONSTANT_P)
|
||||
if (__builtin_constant_p(b)) {
|
||||
return _mm512_slli_epi64(a, b);
|
||||
}
|
||||
#endif
|
||||
m128 x = _mm_cvtsi32_si128(b);
|
||||
return _mm512_sll_epi64(a, x);
|
||||
}
|
||||
|
||||
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
|
||||
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
|
||||
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
|
||||
|
||||
#if !defined(_MM_CMPINT_NE)
|
||||
#define _MM_CMPINT_NE 0x4
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
int diff512(m512 a, m512 b) {
|
||||
return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
int isnonzero512(m512 a) {
|
||||
return diff512(a, zeroes512());
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
|
||||
* mask indicating which 32-bit words contain differences.
|
||||
*/
|
||||
static really_inline
|
||||
u32 diffrich512(m512 a, m512 b) {
|
||||
return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
|
||||
}
|
||||
|
||||
/**
|
||||
* "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
|
||||
* returns a 16-bit mask indicating which 64-bit words contain differences.
|
||||
*/
|
||||
static really_inline
|
||||
u32 diffrich64_512(m512 a, m512 b) {
|
||||
//TODO: cmp_epi64?
|
||||
u32 d = diffrich512(a, b);
|
||||
return (d | (d >> 1)) & 0x55555555;
|
||||
}
|
||||
|
||||
// aligned load
|
||||
static really_inline
|
||||
m512 load512(const void *ptr) {
|
||||
return _mm512_load_si512(ptr);
|
||||
}
|
||||
|
||||
// aligned store
|
||||
static really_inline
|
||||
void store512(void *ptr, m512 a) {
|
||||
assert(ISALIGNED_N(ptr, alignof(m512)));
|
||||
return _mm512_store_si512(ptr, a);
|
||||
}
|
||||
|
||||
// unaligned load
|
||||
static really_inline
|
||||
m512 loadu512(const void *ptr) {
|
||||
return _mm512_loadu_si512(ptr);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
|
||||
return _mm512_maskz_loadu_epi8(k, ptr);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
|
||||
return _mm512_mask_loadu_epi8(src, k, ptr);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 set_mask_m512(__mmask64 k) {
|
||||
return _mm512_movm_epi8(k);
|
||||
}
|
||||
|
||||
// packed unaligned store of first N bytes
|
||||
static really_inline
|
||||
void storebytes512(void *ptr, m512 a, unsigned int n) {
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(ptr, &a, n);
|
||||
}
|
||||
|
||||
// packed unaligned load of first N bytes, pad with zero
|
||||
static really_inline
|
||||
m512 loadbytes512(const void *ptr, unsigned int n) {
|
||||
m512 a = zeroes512();
|
||||
assert(n <= sizeof(a));
|
||||
memcpy(&a, ptr, n);
|
||||
return a;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 mask1bit512(unsigned int n) {
|
||||
assert(n < sizeof(m512) * 8);
|
||||
u32 mask_idx = ((n % 8) * 64) + 95;
|
||||
mask_idx -= n / 8;
|
||||
return loadu512(&simd_onebit_masks[mask_idx]);
|
||||
}
|
||||
|
||||
// switches on bit N in the given vector.
|
||||
static really_inline
|
||||
void setbit512(m512 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
*ptr = or512(mask1bit512(n), *ptr);
|
||||
}
|
||||
|
||||
// switches off bit N in the given vector.
|
||||
static really_inline
|
||||
void clearbit512(m512 *ptr, unsigned int n) {
|
||||
assert(n < sizeof(*ptr) * 8);
|
||||
*ptr = andnot512(mask1bit512(n), *ptr);
|
||||
}
|
||||
|
||||
// tests bit N in the given vector.
|
||||
static really_inline
|
||||
char testbit512(m512 val, unsigned int n) {
|
||||
assert(n < sizeof(val) * 8);
|
||||
const m512 mask = mask1bit512(n);
|
||||
return !!_mm512_test_epi8_mask(mask, val);
|
||||
}
|
||||
|
||||
#endif // HAVE_SIMD_512_BITS
|
||||
|
||||
#endif // ARCH_X86_SIMD_UTILS_H
|
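The eq128()/movemask128() pair defined above is the usual building block for "find a byte within a 16-byte block". A standalone sketch of that pattern with raw SSE2 intrinsics and GCC/Clang builtins (illustration only, not code from this patch):

/* Sketch: locate the first occurrence of a byte within an aligned 16-byte block. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __attribute__((aligned(16))) const char block[16] = "abcdefghijklmnop";
    __m128i data   = _mm_load_si128((const __m128i *)block);     /* aligned load */
    __m128i needle = _mm_set1_epi8('k');
    unsigned mask  = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(data, needle));
    if (mask) {
        printf("first 'k' at offset %d\n", __builtin_ctz(mask)); /* offset 10 */
    }
    return 0;
}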
96
src/util/arch/x86/x86.h
Normal file
@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Per-platform architecture definitions
|
||||
*/
|
||||
|
||||
#ifndef UTIL_ARCH_X86_H_
#define UTIL_ARCH_X86_H_

#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2)
#define HAVE_SSE2
#define HAVE_SIMD_128_BITS
#endif

#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE41
#define HAVE_SIMD_128_BITS
#endif

#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__))
#define HAVE_SSE42
#define HAVE_SIMD_128_BITS
#endif

#if defined(__AVX__)
#define HAVE_AVX
#define HAVE_SIMD_256_BITS
#endif

#if defined(__AVX2__)
#define HAVE_AVX2
#define HAVE_SIMD_256_BITS
#endif

#if defined(__AVX512BW__)
#define HAVE_AVX512
#define HAVE_SIMD_512_BITS
#endif

#if defined(__AVX512VBMI__)
#define HAVE_AVX512VBMI
#endif

/*
 * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros
 */
#if defined(__POPCNT__) || \
    (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \
    (defined(_WIN32) && defined(__AVX__))
#define HAVE_POPCOUNT_INSTR
#endif

#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \
    (defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI
#endif

#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \
    (defined(__INTEL_COMPILER) && defined(__AVX2__))
#define HAVE_BMI2
#endif

/*
 * MSVC uses a different form of inline asm
 */
#if defined(_WIN32) && defined(_MSC_VER)
#define NO_ASM
#endif

#endif // UTIL_ARCH_X86_H_
|
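The HAVE_* macros above exist so that call sites can select an instruction-specific path at compile time. A self-contained sketch of that dispatch pattern (illustration only; the feature test is spelled out inline rather than pulling in the header, and __builtin_popcountll assumes GCC/Clang):

/* Sketch: compile-time dispatch in the style of the HAVE_* feature macros. */
#include <stdio.h>

#if defined(__POPCNT__)
#define HAVE_POPCOUNT_INSTR
#endif

static int count_bits64(unsigned long long v) {
#if defined(HAVE_POPCOUNT_INSTR)
    return __builtin_popcountll(v);   /* lowers to a single POPCNT */
#else
    int c = 0;
    while (v) {                       /* portable fallback: clear one bit per loop */
        v &= v - 1;
        c++;
    }
    return c;
#endif
}

int main(void) {
    printf("%d\n", count_bits64(0xf0f0ULL));   /* prints 8 */
    return 0;
}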
@ -33,6 +33,7 @@
|
||||
#ifndef BITUTILS_H
|
||||
#define BITUTILS_H
|
||||
|
||||
#include "config.h"
|
||||
#include "ue2common.h"
|
||||
#include "popcount.h"
|
||||
#include "util/arch.h"
|
||||
@ -43,351 +44,95 @@
|
||||
#define DOUBLE_CASE_CLEAR 0xdfdf
|
||||
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
|
||||
|
||||
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
#include "util/arch/x86/bitutils.h"
|
||||
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
|
||||
#include "util/arch/arm/bitutils.h"
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
u32 clz32(u32 x) {
|
||||
assert(x); // behaviour not defined for x == 0
|
||||
#if defined(_WIN32)
|
||||
unsigned long r;
|
||||
_BitScanReverse(&r, x);
|
||||
return 31 - r;
|
||||
#else
|
||||
return (u32)__builtin_clz(x);
|
||||
#endif
|
||||
|
||||
return clz32_impl(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 clz64(u64a x) {
|
||||
assert(x); // behaviour not defined for x == 0
|
||||
#if defined(_WIN64)
|
||||
unsigned long r;
|
||||
_BitScanReverse64(&r, x);
|
||||
return 63 - r;
|
||||
#elif defined(_WIN32)
|
||||
unsigned long x1 = (u32)x;
|
||||
unsigned long x2 = (u32)(x >> 32);
|
||||
unsigned long r;
|
||||
if (x2) {
|
||||
_BitScanReverse(&r, x2);
|
||||
return (u32)(31 - r);
|
||||
}
|
||||
_BitScanReverse(&r, (u32)x1);
|
||||
return (u32)(63 - r);
|
||||
#else
|
||||
return (u32)__builtin_clzll(x);
|
||||
#endif
|
||||
|
||||
return clz64_impl(x);
|
||||
}
|
||||
|
||||
// CTZ (count trailing zero) implementations.
|
||||
static really_inline
|
||||
u32 ctz32(u32 x) {
|
||||
assert(x); // behaviour not defined for x == 0
|
||||
#if defined(_WIN32)
|
||||
unsigned long r;
|
||||
_BitScanForward(&r, x);
|
||||
return r;
|
||||
#else
|
||||
return (u32)__builtin_ctz(x);
|
||||
#endif
|
||||
|
||||
return ctz32_impl(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 ctz64(u64a x) {
|
||||
assert(x); // behaviour not defined for x == 0
|
||||
#if defined(_WIN64)
|
||||
unsigned long r;
|
||||
_BitScanForward64(&r, x);
|
||||
return r;
|
||||
#elif defined(_WIN32)
|
||||
unsigned long r;
|
||||
if (_BitScanForward(&r, (u32)x)) {
|
||||
return (u32)r;
|
||||
}
|
||||
_BitScanForward(&r, x >> 32);
|
||||
return (u32)(r + 32);
|
||||
#else
|
||||
return (u32)__builtin_ctzll(x);
|
||||
#endif
|
||||
|
||||
return ctz64_impl(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 lg2(u32 x) {
|
||||
if (!x) {
|
||||
return 0;
|
||||
}
|
||||
return 31 - clz32(x);
|
||||
return lg2_impl(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a lg2_64(u64a x) {
|
||||
if (!x) {
|
||||
return 0;
|
||||
}
|
||||
return 63 - clz64(x);
|
||||
return lg2_64_impl(x);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearLSB_32(u32 *v) {
|
||||
assert(*v != 0); // behaviour not defined in this case
|
||||
#ifndef NO_ASM
|
||||
u32 val = *v, offset;
|
||||
__asm__ ("bsf %1, %0\n"
|
||||
"btr %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
#else
|
||||
u32 val = *v;
|
||||
u32 offset = ctz32(val);
|
||||
*v = val & (val - 1);
|
||||
#endif
|
||||
|
||||
assert(offset < 32);
|
||||
return offset;
|
||||
return findAndClearLSB_32_impl(v);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearLSB_64(u64a *v) {
|
||||
assert(*v != 0); // behaviour not defined in this case
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
#if defined(ARCH_X86_64) && !defined(NO_ASM)
|
||||
u64a val = *v, offset;
|
||||
__asm__ ("bsfq %1, %0\n"
|
||||
"btrq %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
#else
|
||||
// generic variant using gcc's builtin on 64-bit
|
||||
u64a val = *v, offset;
|
||||
offset = ctz64(val);
|
||||
*v = val & (val - 1);
|
||||
#endif // ARCH_X86_64
|
||||
#else
|
||||
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
|
||||
// inline calls to __builtin_ctzll
|
||||
u32 v1 = (u32)*v;
|
||||
u32 v2 = (u32)(*v >> 32);
|
||||
u32 offset;
|
||||
if (v1) {
|
||||
offset = findAndClearLSB_32(&v1);
|
||||
*v = (u64a)v1 | ((u64a)v2 << 32);
|
||||
} else {
|
||||
offset = findAndClearLSB_32(&v2) + 32;
|
||||
*v = (u64a)v2 << 32;
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(offset < 64);
|
||||
return (u32)offset;
|
||||
return findAndClearLSB_64_impl(v);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearMSB_32(u32 *v) {
|
||||
assert(*v != 0); // behaviour not defined in this case
|
||||
#ifndef NO_ASM
|
||||
u32 val = *v, offset;
|
||||
__asm__ ("bsr %1, %0\n"
|
||||
"btr %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
#else
|
||||
u32 val = *v;
|
||||
u32 offset = 31 - clz32(val);
|
||||
*v = val & ~(1 << offset);
|
||||
#endif
|
||||
assert(offset < 32);
|
||||
return offset;
|
||||
return findAndClearMSB_32_impl(v);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 findAndClearMSB_64(u64a *v) {
|
||||
assert(*v != 0); // behaviour not defined in this case
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
#if defined(ARCH_X86_64) && !defined(NO_ASM)
|
||||
u64a val = *v, offset;
|
||||
__asm__ ("bsrq %1, %0\n"
|
||||
"btrq %0, %1\n"
|
||||
: "=r" (offset), "=r" (val)
|
||||
: "1" (val));
|
||||
*v = val;
|
||||
#else
|
||||
// generic variant using gcc's builtin on 64-bit
|
||||
u64a val = *v, offset;
|
||||
offset = 63 - clz64(val);
|
||||
*v = val & ~(1ULL << offset);
|
||||
#endif // ARCH_X86_64
|
||||
#else
|
||||
// fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
|
||||
// inline calls to __builtin_ctzll
|
||||
u32 v1 = (u32)*v;
|
||||
u32 v2 = (*v >> 32);
|
||||
u32 offset;
|
||||
if (v2) {
|
||||
offset = findAndClearMSB_32(&v2) + 32;
|
||||
*v = ((u64a)v2 << 32) | (u64a)v1;
|
||||
} else {
|
||||
offset = findAndClearMSB_32(&v1);
|
||||
*v = (u64a)v1;
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(offset < 64);
|
||||
return (u32)offset;
|
||||
return findAndClearMSB_64_impl(v);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 compress32(u32 x, u32 m) {
|
||||
#if defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pext_u32(x, m);
|
||||
#else
|
||||
|
||||
// Return zero quickly on trivial cases
|
||||
if ((x & m) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u32 mk, mp, mv, t;
|
||||
|
||||
x &= m; // clear irrelevant bits
|
||||
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
for (u32 i = 0; i < 5; i++) {
|
||||
mp = mk ^ (mk << 1);
|
||||
mp ^= mp << 2;
|
||||
mp ^= mp << 4;
|
||||
mp ^= mp << 8;
|
||||
mp ^= mp << 16;
|
||||
|
||||
mv = mp & m; // bits to move
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
t = x & mv;
|
||||
x = (x ^ t) | (t >> (1 << i)); // compress x
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
return x;
|
||||
#endif
|
||||
return compress32_impl(x, m);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a compress64(u64a x, u64a m) {
|
||||
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pext_u64(x, m);
|
||||
#else
|
||||
return compress64_impl(x, m);
|
||||
}
|
||||
|
||||
// Return zero quickly on trivial cases
|
||||
if ((x & m) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u64a mk, mp, mv, t;
|
||||
|
||||
x &= m; // clear irrelevant bits
|
||||
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
for (u32 i = 0; i < 6; i++) {
|
||||
mp = mk ^ (mk << 1);
|
||||
mp ^= mp << 2;
|
||||
mp ^= mp << 4;
|
||||
mp ^= mp << 8;
|
||||
mp ^= mp << 16;
|
||||
mp ^= mp << 32;
|
||||
|
||||
mv = mp & m; // bits to move
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
t = x & mv;
|
||||
x = (x ^ t) | (t >> (1 << i)); // compress x
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
return x;
|
||||
#endif
|
||||
static really_inline
|
||||
m128 compress128(m128 x, m128 m) {
|
||||
return compress128_impl(x, m);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 expand32(u32 x, u32 m) {
|
||||
#if defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pdep_u32(x, m);
|
||||
#else
|
||||
|
||||
// Return zero quickly on trivial cases
|
||||
if (!x || !m) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u32 m0, mk, mp, mv, t;
|
||||
u32 array[5];
|
||||
|
||||
m0 = m; // save original mask
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
mp = mk ^ (mk << 1); // parallel suffix
|
||||
mp = mp ^ (mp << 2);
|
||||
mp = mp ^ (mp << 4);
|
||||
mp = mp ^ (mp << 8);
|
||||
mp = mp ^ (mp << 16);
|
||||
mv = mp & m; // bits to move
|
||||
array[i] = mv;
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
for (int i = 4; i >= 0; i--) {
|
||||
mv = array[i];
|
||||
t = x << (1 << i);
|
||||
x = (x & ~mv) | (t & mv);
|
||||
}
|
||||
|
||||
return x & m0; // clear out extraneous bits
|
||||
#endif
|
||||
return expand32_impl(x, m);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a expand64(u64a x, u64a m) {
|
||||
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
|
||||
// BMI2 has a single instruction for this operation.
|
||||
return _pdep_u64(x, m);
|
||||
#else
|
||||
|
||||
// Return zero quickly on trivial cases
|
||||
if (!x || !m) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
u64a m0, mk, mp, mv, t;
|
||||
u64a array[6];
|
||||
|
||||
m0 = m; // save original mask
|
||||
mk = ~m << 1; // we will count 0's to right
|
||||
|
||||
for (int i = 0; i < 6; i++) {
|
||||
mp = mk ^ (mk << 1); // parallel suffix
|
||||
mp = mp ^ (mp << 2);
|
||||
mp = mp ^ (mp << 4);
|
||||
mp = mp ^ (mp << 8);
|
||||
mp = mp ^ (mp << 16);
|
||||
mp = mp ^ (mp << 32);
|
||||
mv = mp & m; // bits to move
|
||||
array[i] = mv;
|
||||
m = (m ^ mv) | (mv >> (1 << i)); // compress m
|
||||
mk = mk & ~mp;
|
||||
}
|
||||
|
||||
for (int i = 5; i >= 0; i--) {
|
||||
mv = array[i];
|
||||
t = x << (1 << i);
|
||||
x = (x & ~mv) | (t & mv);
|
||||
}
|
||||
|
||||
return x & m0; // clear out extraneous bits
|
||||
#endif
|
||||
return expand64_impl(x, m);
|
||||
}
|
||||
|
||||
|
||||
@ -396,97 +141,45 @@ u64a expand64(u64a x, u64a m) {
|
||||
*/
|
||||
static really_inline
|
||||
u32 bf64_iterate(u64a bitfield, u32 begin) {
|
||||
if (begin != ~0U) {
|
||||
/* switch off all bits at or below begin. Note: not legal to shift by
|
||||
* by size of the datatype or larger. */
|
||||
assert(begin <= 63);
|
||||
bitfield &= ~((2ULL << begin) - 1);
|
||||
}
|
||||
|
||||
if (!bitfield) {
|
||||
return ~0U;
|
||||
}
|
||||
|
||||
return ctz64(bitfield);
|
||||
return bf64_iterate_impl(bitfield, begin);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
char bf64_set(u64a *bitfield, u32 i) {
|
||||
assert(i < 64);
|
||||
u64a mask = 1ULL << i;
|
||||
char was_set = !!(*bitfield & mask);
|
||||
*bitfield |= mask;
|
||||
|
||||
return was_set;
|
||||
return bf64_set_impl(bitfield, i);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void bf64_unset(u64a *bitfield, u32 i) {
|
||||
assert(i < 64);
|
||||
*bitfield &= ~(1ULL << i);
|
||||
return bf64_unset_impl(bitfield, i);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 rank_in_mask32(u32 mask, u32 bit) {
|
||||
assert(bit < sizeof(u32) * 8);
|
||||
assert(mask & (u32)(1U << bit));
|
||||
mask &= (u32)(1U << bit) - 1;
|
||||
return popcount32(mask);
|
||||
return rank_in_mask32_impl(mask, bit);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 rank_in_mask64(u64a mask, u32 bit) {
|
||||
assert(bit < sizeof(u64a) * 8);
|
||||
assert(mask & (u64a)(1ULL << bit));
|
||||
mask &= (u64a)(1ULL << bit) - 1;
|
||||
return popcount64(mask);
|
||||
return rank_in_mask64_impl(mask, bit);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u32 pext32(u32 x, u32 mask) {
|
||||
#if defined(HAVE_BMI2)
|
||||
// Intel BMI2 can do this operation in one instruction.
|
||||
return _pext_u32(x, mask);
|
||||
#else
|
||||
|
||||
u32 result = 0, num = 1;
|
||||
while (mask != 0) {
|
||||
u32 bit = findAndClearLSB_32(&mask);
|
||||
if (x & (1U << bit)) {
|
||||
assert(num != 0); // more than 32 bits!
|
||||
result |= num;
|
||||
}
|
||||
num <<= 1;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
return pext32_impl(x, mask);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
u64a pext64(u64a x, u64a mask) {
|
||||
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
|
||||
// Intel BMI2 can do this operation in one instruction.
|
||||
return _pext_u64(x, mask);
|
||||
#else
|
||||
|
||||
u32 result = 0, num = 1;
|
||||
while (mask != 0) {
|
||||
u32 bit = findAndClearLSB_64(&mask);
|
||||
if (x & (1ULL << bit)) {
|
||||
assert(num != 0); // more than 32 bits!
|
||||
result |= num;
|
||||
}
|
||||
num <<= 1;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
return pext64_impl(x, mask);
|
||||
}
|
||||
|
||||
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
|
||||
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
|
||||
* so we force its generation.
|
||||
*/
|
||||
static really_inline
|
||||
u64a pdep64(u64a x, u64a mask) {
|
||||
return _pdep_u64(x, mask);
|
||||
u64a andn(const u32 a, const u8 *b) {
|
||||
return andn_impl_c(a, b);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // BITUTILS_H
|
||||
|
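The generic compress32()/pext32() fallback shown above implements bit extraction: the bits of x selected by the mask are packed into the low bits of the result, which is exactly what the BMI2 PEXT path computes. A small reference sketch of that semantics (illustration only, not code from this patch):

/* Sketch: reference semantics of compress32()/pext32(). */
#include <stdint.h>
#include <stdio.h>

static uint32_t compress32_ref(uint32_t x, uint32_t m) {
    uint32_t out = 0;
    uint32_t bit = 1;
    while (m) {
        uint32_t lowest = m & (~m + 1u);   /* lowest remaining mask bit */
        if (x & lowest) {
            out |= bit;                    /* pack it into the next result bit */
        }
        bit <<= 1;
        m &= m - 1;                        /* drop that mask bit */
    }
    return out;
}

int main(void) {
    uint32_t x = 0x24;   /* bits 2 and 5 set  */
    uint32_t m = 0x2c;   /* mask bits 2, 3, 5 */
    printf("0x%x\n", compress32_ref(x, m));   /* prints 0x5 */
    return 0;
}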
@ -45,6 +45,10 @@
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_C_ARM_NEON_H)
|
||||
# define USE_ARM_NEON_H
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
# if defined(HAVE_CXX_INTRIN_H)
|
||||
# define USE_INTRIN_H
|
||||
@ -59,6 +63,8 @@
|
||||
#include <x86intrin.h>
|
||||
#elif defined(USE_INTRIN_H)
|
||||
#include <intrin.h>
|
||||
#elif defined(USE_ARM_NEON_H)
|
||||
#include <arm_neon.h>
|
||||
#else
|
||||
#error no intrinsics file
|
||||
#endif
|
||||
|
@ -34,22 +34,22 @@
|
||||
#include "util/intrinsics.h"
|
||||
#include "ue2common.h"
|
||||
|
||||
#if defined(HAVE_SSE2)
|
||||
typedef __m128i m128;
|
||||
#else
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
#include "util/arch/x86/simd_types.h"
|
||||
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
|
||||
#include "util/arch/arm/simd_types.h"
|
||||
#endif
|
||||
|
||||
#if !defined(m128) && !defined(HAVE_SIMD_128_BITS)
|
||||
typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
typedef __m256i m256;
|
||||
#else
|
||||
#if !defined(m256) && !defined(HAVE_SIMD_256_BITS)
|
||||
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
|
||||
#endif
|
||||
|
||||
typedef struct {m128 lo; m128 mid; m128 hi;} m384;
|
||||
#if defined(HAVE_AVX512)
|
||||
typedef __m512i m512;
|
||||
#else
|
||||
#if !defined(m512) && !defined(HAVE_SIMD_512_BITS)
|
||||
typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
|
||||
#endif
|
||||
|
||||
|
File diff suppressed because it is too large
@ -108,20 +108,21 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) {
|
||||
static really_inline
|
||||
void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) {
|
||||
// First, decompose our vectors into 64-bit chunks.
|
||||
u64a x[2];
|
||||
memcpy(x, &xvec, sizeof(xvec));
|
||||
u64a m[2];
|
||||
memcpy(m, &mvec, sizeof(mvec));
|
||||
u64a ALIGN_ATTR(16) x[2];
|
||||
u64a ALIGN_ATTR(16) m[2];
|
||||
store128(m, mvec);
|
||||
store128(x, xvec);
|
||||
|
||||
// Count the number of bits of compressed state we're writing out per
|
||||
// chunk.
|
||||
u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
|
||||
u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) };
|
||||
|
||||
// Compress each 64-bit chunk individually.
|
||||
u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) };
|
||||
xvec = compress128(xvec, mvec);
|
||||
store128(x, xvec);
|
||||
|
||||
// Write packed data out.
|
||||
pack_bits_64(ptr, v, bits, 2);
|
||||
pack_bits_64(ptr, x, bits, 2);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -150,7 +151,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
|
||||
u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
|
||||
expand32(v[2], m[2]), expand32(v[3], m[3]) };
|
||||
|
||||
return _mm_set_epi32(x[3], x[2], x[1], x[0]);
|
||||
return set4x32(x[3], x[2], x[1], x[0]);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -158,16 +159,17 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
|
||||
static really_inline
|
||||
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
|
||||
// First, decompose our vectors into 64-bit chunks.
|
||||
u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) };
|
||||
u64a ALIGN_ATTR(16) m[2];
|
||||
store128(m, mvec);
|
||||
|
||||
u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
|
||||
u64a v[2];
|
||||
u64a ALIGN_ATTR(16) v[2];
|
||||
|
||||
unpack_bits_64(v, (const u8 *)ptr, bits, 2);
|
||||
|
||||
u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };
|
||||
|
||||
return _mm_set_epi64x(x[1], x[0]);
|
||||
return set2x64(x[1], x[0]);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -215,10 +217,10 @@ void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) {
static really_really_inline
void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) {
// First, decompose our vectors into 64-bit chunks.
u64a x[4];
memcpy(x, &xvec, sizeof(xvec));
u64a m[4];
memcpy(m, &mvec, sizeof(mvec));
u64a ALIGN_ATTR(32) x[4];
u64a ALIGN_ATTR(32) m[4];
store256(x, xvec);
store256(m, mvec);

// Count the number of bits of compressed state we're writing out per
// chunk.
@ -264,11 +266,11 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
expand32(v[6], m[6]), expand32(v[7], m[7]) };

#if !defined(HAVE_AVX2)
m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
.hi = set4x32(x[7], x[6], x[5], x[4]) };
#else
m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
m256 xvec = set8x32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
#endif
return xvec;
}
@ -291,10 +293,10 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
expand64(v[2], m[2]), expand64(v[3], m[3]) };

#if !defined(HAVE_AVX2)
m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
.hi = _mm_set_epi64x(x[3], x[2]) };
m256 xvec = { .lo = set2x64(x[1], x[0]),
.hi = set2x64(x[3], x[2]) };
#else
m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
m256 xvec = set4x64(x[3], x[2], x[1], x[0]);
#endif
return xvec;
}
@ -402,9 +404,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
expand32(v[8], m[8]), expand32(v[9], m[9]),
expand32(v[10], m[10]), expand32(v[11], m[11]) };

m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
.mid = _mm_set_epi32(x[7], x[6], x[5], x[4]),
.hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) };
m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
.mid = set4x32(x[7], x[6], x[5], x[4]),
.hi = set4x32(x[11], x[10], x[9], x[8]) };
return xvec;
}
#endif
@ -427,9 +429,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
expand64(v[2], m[2]), expand64(v[3], m[3]),
expand64(v[4], m[4]), expand64(v[5], m[5]) };

m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
.mid = _mm_set_epi64x(x[3], x[2]),
.hi = _mm_set_epi64x(x[5], x[4]) };
m384 xvec = { .lo = set2x64(x[1], x[0]),
.mid = set2x64(x[3], x[2]),
.hi = set2x64(x[5], x[4]) };
return xvec;
}
#endif
@ -548,20 +550,20 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {

m512 xvec;
#if defined(HAVE_AVX512)
xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8],
x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
xvec = set32x16(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8],
x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8]);
xvec.lo = set8x32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
xvec.hi = set8x32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8]);
#else
xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]);
xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]);
xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]);
xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]);
#endif
return xvec;
}
@ -588,16 +590,16 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
expand64(v[6], m[6]), expand64(v[7], m[7]) };

#if defined(HAVE_AVX512)
m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
m512 xvec = set64x8(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
.hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]),
.hi = set4x64(x[7], x[6], x[5], x[4])};
#else
m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
_mm_set_epi64x(x[3], x[2]) },
.hi = { _mm_set_epi64x(x[5], x[4]),
_mm_set_epi64x(x[7], x[6]) } };
m512 xvec = { .lo = { set2x64(x[1], x[0]),
set2x64(x[3], x[2]) },
.hi = { set2x64(x[5], x[4]),
set2x64(x[7], x[6]) } };
#endif
return xvec;
}
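Across these store/load helpers the pattern of the change is the same: instead of extracting 64-bit lanes with movq/_mm_srli_si128 or memcpy, the vector is written into a suitably aligned scalar array with store128/store256, and popcount64 on each mask chunk tells pack_bits_64/unpack_bits_64 how many compressed bits that chunk contributes. A minimal sketch of that decomposition step for one 128-bit mask, using a standard aligned store and a GCC builtin as stand-ins for the library's store128/popcount64 (assumptions for this example):

    /* Sketch, SSE2 only: split a 128-bit mask into two aligned 64-bit
     * chunks and count the bits each chunk contributes to the
     * compressed stream. */
    #include <stdint.h>
    #include <emmintrin.h>

    static inline void decompose_mask128(__m128i mvec,
                                         uint64_t m[2], uint32_t bits[2]) {
        uint64_t lanes[2] __attribute__((aligned(16)));
        _mm_store_si128((__m128i *)lanes, mvec);  /* aligned 128-bit store */
        m[0] = lanes[0];
        m[1] = lanes[1];
        bits[0] = (uint32_t)__builtin_popcountll(m[0]);
        bits[1] = (uint32_t)__builtin_popcountll(m[1]);
    }
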
@ -29,7 +29,10 @@

#include "hs_compile.h" // for various hs_platform_info flags
#include "target_info.h"
#include "util/cpuid_flags.h"
#include "util/arch/common/cpuid_flags.h"
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#endif

namespace ue2 {

@ -658,34 +658,41 @@ TEST(SimdUtilsTest, movq) {

char cmp[sizeof(m128)];
memset(cmp, 0x80, sizeof(m128));
simd = set16x8(0x80);
simd = set1_16x8(0x80);
r = movq(simd);
ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r)));

#if defined(HAVE_SIMD_128_BITS)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
int64x2_t a = { 0x123456789abcdefLL, ~0LL };
simd = vreinterpretq_s64_s8(a);
#endif
#endif
r = movq(simd);
ASSERT_EQ(r, 0x123456789abcdef);
}


TEST(SimdUtilsTest, set16x8) {
TEST(SimdUtilsTest, set1_16x8) {
char cmp[sizeof(m128)];

for (unsigned i = 0; i < 256; i++) {
m128 simd = set16x8(i);
m128 simd = set1_16x8(i);
memset(cmp, i, sizeof(simd));
ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
}
}

TEST(SimdUtilsTest, set4x32) {
TEST(SimdUtilsTest, set1_4x32) {
u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 };
m128 simd = set4x32(cmp[0]);
m128 simd = set1_4x32(cmp[0]);
ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd)));
}

#if defined(HAVE_AVX2)
#if defined(HAVE_SIMD_256_BITS)
TEST(SimdUtilsTest, set32x8) {
char cmp[sizeof(m256)];

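The movq test now builds the same 0x0123456789abcdef pattern through both an SSE path and a NEON path. For orientation, a movq-style helper that returns the low 64 bits of a 128-bit vector could be written as below; the helper name and the NEON body are assumptions for illustration, not the library's definitions.

    /* Sketch only: extract the low 64 bits of a 128-bit vector. */
    #include <stdint.h>
    #if defined(__x86_64__)
    #include <emmintrin.h>
    static inline uint64_t movq_low64(__m128i v) {
        return (uint64_t)_mm_cvtsi128_si64(v);             /* SSE2, low lane */
    }
    #elif defined(__ARM_NEON)
    #include <arm_neon.h>
    static inline uint64_t movq_low64(int8x16_t v) {
        return vgetq_lane_u64(vreinterpretq_u64_s8(v), 0); /* lane 0 = low half */
    }
    #endif
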
@ -98,8 +98,8 @@ TEST(state_compress, m128_1) {
char buf[sizeof(m128)] = { 0 };

for (u32 i = 0; i < 16; i++) {
char mask_raw[16] = { 0 };
char val_raw[16] = { 0 };
char ALIGN_ATTR(16) mask_raw[16] = { 0 };
char ALIGN_ATTR(16) val_raw[16] = { 0 };

memset(val_raw, (i << 4) + 3, 16);

@ -109,17 +109,32 @@ TEST(state_compress, m128_1) {
mask_raw[15 - i] = 0xff;
val_raw[15 - i] = i;

m128 val;
m128 mask;

memcpy(&val, val_raw, sizeof(val));
memcpy(&mask, mask_raw, sizeof(mask));
m128 val = load128(val_raw);
m128 mask = load128(mask_raw);

storecompressed128(&buf, &val, &mask, 0);

m128 val_out;
loadcompressed128(&val_out, &buf, &mask, 0);

int8_t ALIGN_ATTR(16) data[16];
store128(data, val);
printf("val: ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");
store128(data, mask);
printf("mask: ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");
store128(data, and128(val, mask));
printf("and128(val, mask): ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");
store128(data, val_out);
printf("val_out: ");
for (int j=0; j < 16; j++) printf("%02x ", data[j]);
printf("\n");

EXPECT_TRUE(!diff128(and128(val, mask), val_out));

mask_raw[i] = 0x0f;
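Both the library changes and the test changes lean on ALIGN_ATTR(16)/ALIGN_ATTR(32) so that store128/store256 and load128 can use aligned accesses on every target. A plausible portable definition of such a macro, given here as an assumption rather than the project's actual one, is:

    /* Sketch of a portable alignment attribute in the spirit of the
     * ALIGN_ATTR(n) annotations used in this commit. */
    #if defined(_MSC_VER)
    #define ALIGN_ATTR(x) __declspec(align(x))
    #else
    #define ALIGN_ATTR(x) __attribute__((aligned(x)))
    #endif

    /* Example: a 16-byte-aligned scalar view of a 128-bit value. */
    static ALIGN_ATTR(16) unsigned long long lanes[2];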