Merge pull request #86 from VectorCamp/develop

New release 5.4.6
Commit e6f856407e by Konstantinos Margaritis, 2022-01-21 12:25:40 +02:00, committed via GitHub (GPG Key ID: 4AEE18F83AFDEB23)
62 changed files with 3987 additions and 1942 deletions


@@ -3,7 +3,7 @@ project (vectorscan C CXX)
 set (HS_MAJOR_VERSION 5)
 set (HS_MINOR_VERSION 4)
-set (HS_PATCH_VERSION 3)
+set (HS_PATCH_VERSION 6)
 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@@ -128,11 +128,9 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r
 CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON)

-option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime"
-    OFF)
-option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime"
-    OFF)
+option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" OFF)
+option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" OFF)

 if (BUILD_AVX512VBMI)
     set(BUILD_AVX512 ON)
@@ -140,47 +138,95 @@ endif ()
 # TODO: per platform config files?
 # remove CMake's idea of optimisation
 foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES})
     string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}")
     string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
 endforeach ()

-if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE_AARCH64)
-    message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
-    # If gcc doesn't recognise the host cpu, then mtune=native becomes
-    # generic, which isn't very good in some cases. march=native looks at
-    # cpuid info and then chooses the best microarch it can (and replaces
-    # the flag), so use that for tune.
-    # arg1 might exist if using ccache
-    string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
-    set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
-        OUTPUT_VARIABLE _GCC_OUTPUT)
-    string(FIND "${_GCC_OUTPUT}" "march" POS)
-    string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
-    string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1"
-        GNUCC_ARCH "${_GCC_OUTPUT}")
-    if (ARCH_IA32 OR ARCH_X86_64)
-        # test the parsed flag
-        set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
-        execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
-            OUTPUT_QUIET ERROR_QUIET
-            INPUT_FILE /dev/null
-            RESULT_VARIABLE GNUCC_TUNE_TEST)
-        if (NOT GNUCC_TUNE_TEST EQUAL 0)
-            message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
-        endif()
-        set(TUNE_FLAG ${GNUCC_ARCH})
-    else()
-        set(TUNE_FLAG native)
-    endif()
-elseif (NOT TUNE_FLAG)
-    set(TUNE_FLAG native)
-endif()
+if (CMAKE_C_COMPILER_ID MATCHES "Intel")
+    set(SKYLAKE_FLAG "-xCORE-AVX512")
+else ()
+    set(SKYLAKE_FLAG "-march=skylake-avx512")
+    set(ICELAKE_FLAG "-march=icelake-server")
+endif ()
+
+if(ARCH_PPC64EL)
+    set(ARCH_FLAG mcpu)
+else()
+    set(ARCH_FLAG march)
+endif()
+
+# Detect best GNUCC_ARCH to tune for
+if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
+    message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
+    # If gcc doesn't recognise the host cpu, then mtune=native becomes
+    # generic, which isn't very good in some cases. march=native looks at
+    # cpuid info and then chooses the best microarch it can (and replaces
+    # the flag), so use that for tune.
+    # arg1 might exist if using ccache
+    string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
+    set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -mtune=native)
+    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
+        OUTPUT_VARIABLE _GCC_OUTPUT)
+    string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}" POS)
+    string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
+    string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
+    # test the parsed flag
+    set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
+    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
+        OUTPUT_QUIET ERROR_QUIET
+        INPUT_FILE /dev/null
+        RESULT_VARIABLE GNUCC_TUNE_TEST)
+    if (NOT GNUCC_TUNE_TEST EQUAL 0)
+        message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid, falling back to -mtune=native")
+        set(TUNE_FLAG native)
+    else()
+        set(TUNE_FLAG ${GNUCC_ARCH})
+        message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
+    endif()
+elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE)
+    if (ARCH_IA32 OR ARCH_X86_64)
+        set(GNUCC_ARCH native)
+        set(TUNE_FLAG generic)
+    elseif(ARCH_AARCH64)
+        set(GNUCC_ARCH armv8)
+        set(TUNE_FLAG generic)
+    elseif(ARCH_ARM32)
+        set(GNUCC_ARCH armv7a)
+        set(TUNE_FLAG generic)
+    else()
+        set(GNUCC_ARCH native)
+        set(TUNE_FLAG generic)
+    endif()
+    message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
+elseif (CROSS_COMPILE)
+    set(GNUCC_ARCH generic)
+    set(TUNE_FLAG generic)
+endif()
+
+if (ARCH_IA32 OR ARCH_X86_64)
+    if (NOT FAT_RUNTIME)
+        if (BUILD_AVX512)
+            set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
+            set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
+        elseif (BUILD_AVX2)
+            set(ARCH_C_FLAGS "-mavx2")
+            set(ARCH_CXX_FLAGS "-mavx2")
+        else()
+            set(ARCH_C_FLAGS "-msse4.2")
+            set(ARCH_CXX_FLAGS "-msse4.2")
+        endif()
+    else()
+        set(ARCH_C_FLAGS "-msse4.2")
+        set(ARCH_CXX_FLAGS "-msse4.2")
+    endif()
+endif()

 if (ARCH_AARCH64)
     if (BUILD_SVE2_BITPERM)
         set(GNUCC_ARCH "${GNUCC_ARCH}+sve2-bitperm")
     elseif (BUILD_SVE2)
@@ -188,80 +234,95 @@ endif ()
     elseif (BUILD_SVE)
         set(GNUCC_ARCH "${GNUCC_ARCH}+sve")
     endif ()
+endif(ARCH_AARCH64)
+
+message(STATUS "ARCH_C_FLAGS : ${ARCH_C_FLAGS}")
+message(STATUS "ARCH_CXX_FLAGS : ${ARCH_CXX_FLAGS}")
+
+if (NOT FAT_RUNTIME)
+    set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}")
+    set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}")
+endif()
+
+#if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64)
+#    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+#    endif()
+#    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+#    endif()
+#endif()
+
+#if(ARCH_PPC64EL)
+#    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
+#    endif()
+#    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
+#    endif()
+#endif()

 # compiler version checks TODO: test more compilers
 if (CMAKE_COMPILER_IS_GNUCXX)
-    set(GNUCXX_MINVER "4.8.1")
+    set(GNUCXX_MINVER "9")
     message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
-        message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support")
+        message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support")
     endif()
 endif()

 if(RELEASE_BUILD)
     if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL)
         set(OPT_C_FLAG "-O3")
         set(OPT_CXX_FLAG "-O3")
     else ()
         set(OPT_C_FLAG "-Os")
         set(OPT_CXX_FLAG "-Os")
     endif ()
 else()
     set(OPT_C_FLAG "-O0")
     set(OPT_CXX_FLAG "-O0")
 endif(RELEASE_BUILD)

 # set compiler flags - more are tested and added later
 set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
-set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing -fno-new-ttp-matching")
+set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing")
+if (NOT CMAKE_COMPILER_IS_CLANG)
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
+endif()

 if (NOT RELEASE_BUILD)
     # -Werror is most useful during development, don't potentially break
     # release builds
     set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
     set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
 endif()

 if (DISABLE_ASSERTS)
     set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
     set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
 endif()

-if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-    set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-endif()
-if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-    set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-endif()
-
 if(CMAKE_COMPILER_IS_GNUCC)
     # spurious warnings?
     set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
 endif()

 if(CMAKE_COMPILER_IS_GNUCXX)
     set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
         set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
     endif ()
     # don't complain about abi
     set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
     set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
 endif()

 if (NOT(ARCH_IA32 AND RELEASE_BUILD))
     set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
     set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
 endif()

-if (CMAKE_C_COMPILER_ID MATCHES "Intel")
-    set(SKYLAKE_FLAG "-xCORE-AVX512")
-else ()
-    set(SKYLAKE_FLAG "-march=skylake-avx512")
-    set(ICELAKE_FLAG "-march=icelake-server")
-endif ()

 CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)

 if (ARCH_IA32 OR ARCH_X86_64)
@@ -277,8 +338,8 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64)
         message(FATAL_ERROR "arm_sve.h is required to build for SVE.")
     endif()
   endif()
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions")
+elseif (ARCH_PPC64EL)
+  CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H)
 endif()

 CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
@@ -304,8 +365,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     # This is a Linux-only feature for now - requires platform support
     # elsewhere
     message(STATUS "generator is ${CMAKE_GENERATOR}")
-    if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND
-        CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9")
+    if (CMAKE_C_COMPILER_IS_CLANG AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9")
         message (STATUS "Clang v3.9 or higher required for fat runtime, cannot build fat runtime")
         set (FAT_RUNTIME_REQUISITES FALSE)
     elseif (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
@@ -329,7 +389,10 @@ include (${CMAKE_MODULE_PATH}/arch.cmake)
 # testing a builtin takes a little more work
 CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
 CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
-CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
+# Clang does not use __builtin_constant_p() the same way as gcc
+if (NOT CMAKE_COMPILER_IS_CLANG)
+    CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
+endif()

 set(C_FLAGS_TO_CHECK
 # Variable length arrays are way bad, most especially at run time
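The builtin exists in both compilers; what differs is when it folds to a true constant, which is what the guarded check above accounts for. A sketch of the usage pattern such a feature test typically protects (hypothetical helper, not taken from this commit):

    // Illustrative only: pick a cheaper path when the argument is provably a
    // compile-time constant. GCC may evaluate __builtin_constant_p() late,
    // after inlining and constant propagation; Clang tends to fold it earlier
    // and more conservatively, so code keyed on HAVE__BUILTIN_CONSTANT_P is
    // only enabled for non-Clang builds here.
    static inline int is_power_of_two(unsigned v) {
        if (__builtin_constant_p(v)) {
            return v && !(v & (v - 1));   // folds to a literal 0/1 for constants
        }
        return __builtin_popcount(v) == 1;
    }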
@@ -428,19 +491,22 @@ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
     set(FREEBSD true)
 endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")

-if (NOT FAT_RUNTIME)
-    if (CROSS_COMPILE_AARCH64)
+if (FAT_RUNTIME)
+    if (NOT (ARCH_IA32 OR ARCH_X86_64))
+        message(FATAL_ERROR "Fat runtime is not supported on non-Intel architectures")
+    else()
+        message(STATUS "Building runtime for multiple microarchitectures")
+    endif()
+else()
+    if (CROSS_COMPILE)
         message(STATUS "Building for target CPU: ${ARCH_C_FLAGS}")
     else()
         message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}")
     endif()
-else()
-    message(STATUS "Building runtime for multiple microarchitectures")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
 endif()
-
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")

 add_subdirectory(util)
 add_subdirectory(doc/dev-reference)
@@ -522,7 +588,7 @@ set (hs_exec_common_SRCS
         ${hs_exec_common_SRCS}
         src/util/arch/x86/cpuid_flags.c
         )
-elseif (ARCH_ARM32 OR ARCH_AARCH64)
+elseif (ARCH_ARM32 OR ARCH_AARCH64 OR ARCH_PPC64EL)
     set (hs_exec_common_SRCS
         ${hs_exec_common_SRCS}
         src/util/arch/arm/cpuid_flags.c
@@ -618,9 +684,8 @@ set (hs_exec_SRCS
     src/nfa/tamarama_internal.h
     src/nfa/truffle.cpp
     src/nfa/truffle.h
-    src/nfa/vermicelli.h
+    src/nfa/vermicelli.hpp
     src/nfa/vermicelli_run.h
-    src/nfa/vermicelli_sse.h
     src/som/som.h
     src/som/som_operation.h
     src/som/som_runtime.h
@@ -681,9 +746,19 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64)
     set (hs_exec_SRCS
         ${hs_exec_SRCS}
         src/util/supervector/arch/arm/impl.cpp)
+elseif (ARCH_PPC64EL)
+    set (hs_exec_SRCS
+        ${hs_exec_SRCS}
+        src/util/supervector/arch/ppc64el/impl.cpp)
 endif ()
 endif()

+if (NOT BUILD_SVE2)
+    set (hs_exec_SRCS
+        ${hs_exec_SRCS}
+        src/nfa/vermicelli_simd.cpp)
+endif()
+
 set (hs_exec_avx2_SRCS
     src/fdr/teddy_avx2.c
     src/util/arch/x86/masked_move.c
@@ -1148,10 +1223,6 @@ if (NOT FAT_RUNTIME)
     set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
     add_library(hs_compile OBJECT ${hs_compile_SRCS})

-    if (ARCH_IA32)
-        set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3")
-    endif (ARCH_IA32)
-
     add_library(hs STATIC
         src/hs_version.c
         src/hs_valid_platform.c
@@ -1182,14 +1253,14 @@ else (FAT_RUNTIME)
         add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
         set_target_properties(hs_exec_core2 PROPERTIES
-            COMPILE_FLAGS "-march=core2"
+            COMPILE_FLAGS "-march=core2 -msse4.2"
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
         add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_corei7>)
         set_target_properties(hs_exec_corei7 PROPERTIES
-            COMPILE_FLAGS "-march=corei7 -mssse3"
+            COMPILE_FLAGS "-march=corei7 -msse4.2"
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
@@ -1231,10 +1302,6 @@ else (FAT_RUNTIME)
                     ${RUNTIME_LIBS})
     set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
     add_library(hs_compile OBJECT ${hs_compile_SRCS})

-    if (ARCH_IA32 OR ARCH_X86_64)
-        set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-mssse3")
-        set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3")
-    endif ()
-
     # we want the static lib for testing
     add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c
@@ -1251,14 +1318,14 @@ else (FAT_RUNTIME)
         add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
         set_target_properties(hs_exec_shared_core2 PROPERTIES
-            COMPILE_FLAGS "-march=core2"
+            COMPILE_FLAGS "-march=core2 -msse4.2"
             POSITION_INDEPENDENT_CODE TRUE
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
         add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_corei7>)
         set_target_properties(hs_exec_shared_corei7 PROPERTIES
-            COMPILE_FLAGS "-march=corei7 -mssse3"
+            COMPILE_FLAGS "-march=corei7 -msse4.2"
             POSITION_INDEPENDENT_CODE TRUE
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )


@@ -191,6 +191,34 @@ int main(){
         );
     }
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Vermicelli", sizes[i]);
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
return vermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
}
);
}
for (size_t i = 0; i < std::size(sizes); i++) {
MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
[&](MicroBenchmark &b) {
b.chars.set('a');
ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
memset(b.buf.data(), 'b', b.size);
},
[&](MicroBenchmark &b) {
return rvermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
}
);
}
    for (size_t i = 0; i < std::size(sizes); i++) {
        //we imitate the noodle unit tests
        std::string str;


@@ -30,6 +30,7 @@
 #include "nfa/shufticompile.h"
 #include "nfa/truffle.h"
 #include "nfa/trufflecompile.h"
+#include "nfa/vermicelli.hpp"
 #include "hwlm/noodle_build.h"
 #include "hwlm/noodle_engine.h"
 #include "hwlm/noodle_internal.h"


@@ -9,6 +9,9 @@ elseif (HAVE_C_INTRIN_H)
 elseif (HAVE_C_ARM_NEON_H)
     set (INTRIN_INC_H "arm_neon.h")
     set (FAT_RUNTIME OFF)
+elseif (HAVE_C_PPC64EL_ALTIVEC_H)
+    set (INTRIN_INC_H "altivec.h")
+    set (FAT_RUNTIME OFF)
 else()
     message (FATAL_ERROR "No intrinsics header found")
 endif ()
@@ -85,7 +88,7 @@ if (FAT_RUNTIME)
         set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
     endif (BUILD_AVX512VBMI)
 elseif (BUILD_AVX2)
-    set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx")
+    set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx2")
 elseif ()
     set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-i7 -mssse3")
 endif ()
@@ -95,12 +98,12 @@ else (NOT FAT_RUNTIME)
 endif ()

 if (ARCH_IA32 OR ARCH_X86_64)
-    # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
+    # ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
     int main() {
         __m128i a = _mm_set1_epi8(1);
         (void)_mm_shuffle_epi8(a, a);
-    }" HAVE_SSSE3)
+    }" HAVE_SSE42)

     # now look for AVX2
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
@@ -136,13 +139,26 @@ int main(){
         (void)_mm512_permutexvar_epi8(idx, a);
     }" HAVE_AVX512VBMI)
-elseif (!ARCH_ARM32 AND !ARCH_AARCH64)
+elseif (ARCH_ARM32 OR ARCH_AARCH64)
+    CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+    int main() {
+        int32x4_t a = vdupq_n_s32(1);
+        (void)a;
+    }" HAVE_NEON)
+elseif (ARCH_PPC64EL)
+    CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+    int main() {
+        vector int a = vec_splat_s32(1);
+        (void)a;
+    }" HAVE_VSX)
+else ()
     message (FATAL_ERROR "Unsupported architecture")
 endif ()

 if (FAT_RUNTIME)
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "SSSE3 support required to build fat runtime")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "SSE4.2 support required to build fat runtime")
     endif ()
     if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX2 AND NOT HAVE_AVX2)
         message(FATAL_ERROR "AVX2 support required to build fat runtime")
@@ -163,12 +179,16 @@ else (NOT FAT_RUNTIME)
     if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI)
         message(STATUS "Building without AVX512VBMI support")
     endif ()
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
     endif ()
     if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
         message(FATAL_ERROR "NEON support required for ARM support")
     endif ()
+    if (ARCH_PPC64EL AND NOT HAVE_VSX)
+        message(FATAL_ERROR "VSX support required for Power support")
+    endif ()
 endif ()

 unset (PREV_FLAGS)


@@ -21,6 +21,9 @@
 /* "Define if building for AARCH64" */
 #cmakedefine ARCH_AARCH64

+/* "Define if building for PPC64EL" */
+#cmakedefine ARCH_PPC64EL
+
 /* "Define if cross compiling for AARCH64" */
 #cmakedefine CROSS_COMPILE_AARCH64

@@ -75,6 +78,9 @@
 /* C compiler has arm_sve.h */
 #cmakedefine HAVE_C_ARM_SVE_H

+/* C compiler has ppc64el altivec.h */
+#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H
+
 /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
    0 if you don't. */
 #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP


@@ -1,3 +1,8 @@
+# determine compiler
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    set(CMAKE_COMPILER_IS_CLANG TRUE)
+endif()
+
 # determine the target arch

 if (CROSS_COMPILE_AARCH64)
@@ -7,15 +12,13 @@ if (CROSS_COMPILE_AARCH64)
 else()
     # really only interested in the preprocessor here
     CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)
     CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
     CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
     CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
+    CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)

-    if (ARCH_X86_64 OR ARCH_AARCH64)
+    if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL)
         set(ARCH_64_BIT TRUE)
     else()
         set(ARCH_32_BIT TRUE)
     endif()
 endif()


@@ -112,6 +112,7 @@
  *
  */

+#include <random>
 #include <algorithm>
 #include <cstring>
 #include <chrono>
@@ -151,6 +152,8 @@ using std::set;
 using std::min;
 using std::max;
 using std::copy;
+using std::random_device;
+using std::mt19937;

 enum Criterion {
     CRITERION_THROUGHPUT,
@@ -731,7 +734,9 @@ int main(int argc, char **argv) {
             count++;
             cout << "." << std::flush;
             vector<unsigned> sv(s.begin(), s.end());
-            random_shuffle(sv.begin(), sv.end());
+            random_device rng;
+            mt19937 urng(rng());
+            shuffle(sv.begin(), sv.end(), urng);
             unsigned groups = factor_max + 1;
             for (unsigned current_group = 0; current_group < groups;
                  current_group++) {
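std::random_shuffle was deprecated in C++14 and removed in C++17, which this release now targets (see the -std=c++17 flag earlier in the diff), hence the switch to an explicitly seeded std::shuffle. The same pattern in isolation:

    #include <algorithm>
    #include <random>
    #include <vector>

    // Mirror of the replacement sequence above: seed a Mersenne Twister from
    // std::random_device and let std::shuffle permute the ids.
    void shuffle_ids(std::vector<unsigned> &sv) {
        std::random_device rng;
        std::mt19937 urng(rng());
        std::shuffle(sv.begin(), sv.end(), urng);
    }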


@@ -893,10 +893,10 @@ do { \
 #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
 do { \
     if (unlikely(diff128(var, ones128()))) { \
-        u64a __attribute__((aligned(16))) vector[2]; \
-        store128(vector, var); \
-        u64a lo = vector[0]; \
-        u64a hi = vector[1]; \
+        u64a __attribute__((aligned(16))) vec[2]; \
+        store128(vec, var); \
+        u64a lo = vec[0]; \
+        u64a hi = vec[1]; \
         CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \
         CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \
     } \


@@ -44,5 +44,7 @@ hs_error_t HS_CDECL hs_valid_platform(void) {
     }
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
     return HS_SUCCESS;
+#elif defined(ARCH_PPC64EL)
+    return HS_SUCCESS;
 #endif
 }
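hs_valid_platform() is part of the public runtime API, so the new branch means a VSX-capable ppc64el host now passes the caller-side capability check. A minimal caller, assuming the usual installed header layout (<hs/hs.h>):

    #include <cstdio>
    #include <hs/hs.h>

    int main() {
        // HS_SUCCESS means the runtime supports the current CPU; after this
        // change that includes ppc64el machines with VSX.
        if (hs_valid_platform() == HS_SUCCESS) {
            std::puts("vectorscan runtime is usable on this CPU");
            return 0;
        }
        std::puts("unsupported CPU");
        return 1;
    }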


@@ -39,7 +39,7 @@
 #include "nfa/accel.h"
 #include "nfa/shufti.h"
 #include "nfa/truffle.h"
-#include "nfa/vermicelli.h"
+#include "nfa/vermicelli.hpp"

 #include <string.h>

 #define MIN_ACCEL_LEN_BLOCK 16


@@ -30,26 +30,7 @@
 /* SIMD engine agnostic noodle scan parts */

 #include "util/supervector/supervector.hpp"
+#include "util/supervector/casemask.hpp"

-static u8 CASEMASK[] = { 0xff, 0xdf };
-
-static really_inline
-u8 caseClear8(u8 x, bool noCase)
-{
-    return static_cast<u8>(x & CASEMASK[(u8)noCase]);
-}
-
-template<uint16_t S>
-static really_inline SuperVector<S> getMask(u8 c, bool noCase) {
-    u8 k = caseClear8(c, noCase);
-    return SuperVector<S>(k);
-}
-
-template<uint16_t S>
-static really_inline SuperVector<S> getCaseMask(void) {
-    return SuperVector<S>(CASEMASK[1]);
-}
-
 static really_really_inline
 hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
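The helpers deleted above were not dropped; they moved behind the new util/supervector/casemask.hpp include. A sketch of what that header presumably provides, reconstructed from the removed block (the header itself is not shown in this diff, and it relies on the project's u8/really_inline/SuperVector definitions):

    // Reconstructed from the removed lines; assumes ue2common.h and
    // util/supervector/supervector.hpp are already included.
    static u8 CASEMASK[] = { 0xff, 0xdf };

    static really_inline
    u8 caseClear8(u8 x, bool noCase) {
        // 0xdf clears bit 5, folding 'a'..'z' onto 'A'..'Z'; 0xff leaves the
        // byte unchanged for case-sensitive scans.
        return static_cast<u8>(x & CASEMASK[(u8)noCase]);
    }

    template<uint16_t S>
    static really_inline SuperVector<S> getMask(u8 c, bool noCase) {
        u8 k = caseClear8(c, noCase);
        return SuperVector<S>(k);
    }

    template<uint16_t S>
    static really_inline SuperVector<S> getCaseMask(void) {
        return SuperVector<S>(CASEMASK[1]);
    }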


@@ -170,7 +170,7 @@ hwlm_error_t scanDoubleOnce(const struct noodTable *n, const u8 *buf,
     svbool_t pg = svwhilelt_b8_s64(0, e - d);
     svbool_t pg_rot = svwhilelt_b8_s64(0, e - d + 1);
     svbool_t matched, matched_rot;
-    svbool_t any = doubleMatched(chars, d, pg, pg_rot, &matched, &matched_rot);
+    svbool_t any = doubleMatched(svreinterpret_u16(chars), d, pg, pg_rot, &matched, &matched_rot);
     return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any);
 }
@@ -187,7 +187,7 @@ hwlm_error_t scanDoubleLoop(const struct noodTable *n, const u8 *buf,
     for (size_t i = 0; i < loops; i++, d += svcntb()) {
         DEBUG_PRINTF("d %p \n", d);
         svbool_t matched, matched_rot;
-        svbool_t any = doubleMatched(chars, d, svptrue_b8(), svptrue_b8(),
+        svbool_t any = doubleMatched(svreinterpret_u16(chars), d, svptrue_b8(), svptrue_b8(),
                                      &matched, &matched_rot);
         hwlm_error_t rv = doubleCheckMatched(n, buf, len, cbi, d,
                                              matched, matched_rot, any);
@@ -220,7 +220,7 @@ hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
     }
     ++d;

-    svuint16_t chars = getCharMaskDouble(n->key0, n->key1, noCase);
+    svuint8_t chars = svreinterpret_u8(getCharMaskDouble(n->key0, n->key1, noCase));

     if (scan_len <= svcntb()) {
         return scanDoubleOnce(n, buf, len, cbi, chars, d, e);
@@ -234,4 +234,4 @@ hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
         RETURN_IF_TERMINATED(rv);
     }
     return scanDoubleLoop(n, buf, len, cbi, chars, d1, e);
 }
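The added svreinterpret_u16()/svreinterpret_u8() calls are plain ACLE reinterpret casts: they change only the lane-width view of the register, not its contents, so getCharMaskDouble() can keep producing a 16-bit char-pair mask while the scan helpers take byte vectors. Standalone illustration (requires an SVE-enabled target, e.g. -march=armv8-a+sve):

    #include <arm_sve.h>

    // Same bits, different element view; typically no data movement results.
    svuint16_t view_as_u16(svuint8_t v) { return svreinterpret_u16(v); }
    svuint8_t  view_as_u8(svuint16_t v) { return svreinterpret_u8(v); }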


@@ -30,7 +30,7 @@
 #include "accel.h"
 #include "shufti.h"
 #include "truffle.h"
-#include "vermicelli.h"
+#include "vermicelli.hpp"
 #include "ue2common.h"

 const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {


@@ -1,7 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
  * Copyright (c) 2020-2021, VectorCamp PC
- * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,7 +29,6 @@
 /** \file
  * \brief Shufti: character class acceleration.
- *
  */

 template <uint16_t S>
@@ -73,4 +71,4 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
     t.print8("t");

     return !t.eq(SuperVector<S>::Ones());
 }

src/nfa/arm/vermicelli.hpp (new file, 125 lines)

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Vermicelli: single-byte and double-byte acceleration.
*/
template <uint16_t S>
static really_inline
const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = !chars.eq(casemask & data);
return first_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, u16 const len) {
data.print8("data");
chars.print8("chars");
casemask.print8("casemask");
SuperVector<S> mask = !chars.eq(casemask & data);
mask.print8("mask");
return last_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = mask1 & (mask2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = (mask1 << 1)& mask2;
DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | (SuperVector<S>::Ones() >> (S-1));
}
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) {
SuperVector<S> v1 = chars1.eq(data & mask1);
SuperVector<S> v2 = chars2.eq(data & mask2);
SuperVector<S> mask = v1 & (v2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & m1) == c2) && ((buf[-1] & m2) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask, len);
}
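A hypothetical call site for the single-byte block primitive above, showing how the chars and casemask operands are prepared; the SuperVector byte-splat constructor matches its use elsewhere in this commit, while loadu() is assumed from the wider SuperVector API (this is a sketch, not code from the commit):

    // Case-insensitive scan of one vector-sized block for 'a'/'A'.
    template <uint16_t S>
    const u8 *find_a_nocase(const u8 *buf, u16 len) {
        const SuperVector<S> casemask(static_cast<u8>(0xdf)); // fold ASCII case
        const SuperVector<S> chars(static_cast<u8>('A'));     // 'a' & 0xdf
        SuperVector<S> data = SuperVector<S>::loadu(buf);     // assumed loader
        // The no-match convention is whatever first_non_zero_match<S>()
        // returns for an all-zero mask.
        return vermicelliBlock<S>(data, chars, casemask, buf, len);
    }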


@@ -40,7 +40,7 @@
 #include "repeat.h"
 #include "shufti.h"
 #include "truffle.h"
-#include "vermicelli.h"
+#include "vermicelli.hpp"
 #include "util/bitutils.h"
 #include "util/multibit.h"
 #include "util/partial_store.h"


@@ -40,7 +40,7 @@
 #include "repeat_internal.h"
 #include "shufti.h"
 #include "truffle.h"
-#include "vermicelli.h"
+#include "vermicelli.hpp"
 #include "util/partial_store.h"
 #include "util/unaligned.h"
@@ -533,4 +533,4 @@ char lbrFwdScanTruf(const struct NFA *nfa, const u8 *buf,

 #ifdef HAVE_SVE2
 #include "lbr_sve.h"
 #endif


@@ -40,7 +40,7 @@
 #include "shufti.h"
 #include "truffle.h"
 #include "ue2common.h"
-#include "vermicelli.h"
+#include "vermicelli.hpp"
 #include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"


@@ -36,7 +36,7 @@
 #include "shufti.h"
 #include "truffle.h"
 #include "ue2common.h"
-#include "vermicelli.h"
+#include "vermicelli.hpp"
 #include "vermicelli_run.h"
 #include "util/multibit.h"
 #include "util/partial_store.h"


@@ -35,7 +35,7 @@
 #include "accel.h"
 #include "nfa_internal.h"
-#include "vermicelli.h"
+#include "vermicelli.hpp"
 #include "util/unaligned.h"

 static really_inline


@@ -0,0 +1,76 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Shufti: character class acceleration.
*
*/
template <uint16_t S>
static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars) {
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
SuperVector<S> c_lo = chars & low4bits;
SuperVector<S> c_hi = chars.template vshr_8_imm<4>();
c_lo = mask_lo.template pshufb<false>(c_lo);
c_hi = mask_hi.template pshufb<false>(c_hi);
return (c_lo & c_hi).eq(SuperVector<S>::Zeroes());
}
template <uint16_t S>
static really_inline
SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars) {
const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
SuperVector<S> chars_lo = chars & low4bits;
chars_lo.print8("chars_lo");
SuperVector<S> chars_hi = chars.template vshr_64_imm<4>() & low4bits;
chars_hi.print8("chars_hi");
SuperVector<S> c1_lo = mask1_lo.template pshufb<true>(chars_lo);
c1_lo.print8("c1_lo");
SuperVector<S> c1_hi = mask1_hi.template pshufb<true>(chars_hi);
c1_hi.print8("c1_hi");
SuperVector<S> t1 = c1_lo | c1_hi;
t1.print8("t1");
SuperVector<S> c2_lo = mask2_lo.template pshufb<true>(chars_lo);
c2_lo.print8("c2_lo");
SuperVector<S> c2_hi = mask2_hi.template pshufb<true>(chars_hi);
c2_hi.print8("c2_hi");
SuperVector<S> t2 = c2_lo | c2_hi;
t2.print8("t2");
t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)");
SuperVector<S> t = t1 | (t2.template vshr_128_imm<1>());
t.print8("t");
return t.eq(SuperVector<S>::Ones());
}
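The shuffle-based code above is easier to follow next to the scalar predicate it vectorises: shufti declares byte c a class member when the low-nibble and high-nibble table entries share a set bit. Reference model only, assuming the standard shufti mask layout (not part of the commit):

    #include <cstdint>

    // Scalar equivalent of one lane of blockSingleMask().
    static inline bool shufti_hit(const uint8_t mask_lo[16],
                                  const uint8_t mask_hi[16], uint8_t c) {
        return (mask_lo[c & 0xf] & mask_hi[c >> 4]) != 0;
    }

    // blockSingleMask() returns the inverted picture: a lane is all-ones
    // exactly where shufti_hit() is false, so callers locate the first class
    // member with first_zero_match_inverted().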


@@ -0,0 +1,62 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Truffle: character class acceleration.
*
*/
template <uint16_t S>
static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars) {
chars.print8("chars");
shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear");
shuf_mask_lo_highset.print8("shuf_mask_lo_highset");
SuperVector<S> highconst = SuperVector<S>::dup_u8(0x80);
highconst.print8("highconst");
SuperVector<S> shuf_mask_hi = SuperVector<S>::dup_u64(0x8040201008040201);
shuf_mask_hi.print8("shuf_mask_hi");
SuperVector<S> shuf1 = shuf_mask_lo_highclear.pshufb(chars);
shuf1.print8("shuf1");
SuperVector<S> t1 = chars ^ highconst;
t1.print8("t1");
SuperVector<S> shuf2 = shuf_mask_lo_highset.pshufb(t1);
shuf2.print8("shuf2");
SuperVector<S> t2 = highconst.opandnot(chars.template vshr_64_imm<4>());
t2.print8("t2");
SuperVector<S> shuf3 = shuf_mask_hi.pshufb(t2);
shuf3.print8("shuf3");
SuperVector<S> res = (shuf1 | shuf2) & shuf3;
res.print8("(shuf1 | shuf2) & shuf3");
return res.eq(SuperVector<S>::Zeroes());
}
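As with shufti, a scalar model decodes the shuffles: truffle selects one of two nibble tables on the top bit of the byte and tests a one-hot bit derived from the high nibble. Reference model only, not from the commit:

    #include <cstdint>

    // Scalar equivalent of one lane of the truffle blockSingleMask() above:
    // shuf_mask_lo_highclear serves bytes 0x00-0x7f, shuf_mask_lo_highset
    // serves 0x80-0xff, and the one-hot bit is 1 << (high nibble & 7).
    static inline bool truffle_hit(const uint8_t lo_highclear[16],
                                   const uint8_t lo_highset[16], uint8_t c) {
        const uint8_t entry = (c & 0x80) ? lo_highset[c & 0xf]
                                         : lo_highclear[c & 0xf];
        const uint8_t bit = 1u << ((c >> 4) & 0x7);
        return (entry & bit) != 0;
    }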


@@ -0,0 +1,126 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Vermicelli: single-byte and double-byte acceleration.
*/
template <uint16_t S>
static really_inline
const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return first_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, u16 const len) {
data.print8("data");
chars.print8("chars");
casemask.print8("casemask");
SuperVector<S> mask = chars.eq(casemask & data);
mask.print8("mask");
return last_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = mask1 & (mask2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = (mask1 << 1)& mask2;
DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | (SuperVector<S>::Ones() >> (S-1));
}
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) {
SuperVector<S> v1 = chars1.eq(data & mask1);
SuperVector<S> v2 = chars2.eq(data & mask2);
SuperVector<S> mask = v1 & (v2 >> 1);
DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & m1) == c2) && ((buf[-1] & m2) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask, len);
}


@@ -56,6 +56,8 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
 #include "x86/shufti.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "arm/shufti.hpp"
+#elif defined(ARCH_PPC64EL)
+#include "ppc64el/shufti.hpp"
 #endif

 template <uint16_t S>
@@ -63,7 +65,7 @@ static really_inline
 const u8 *fwdBlock(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars, const u8 *buf) {
     SuperVector<S> v = blockSingleMask(mask_lo, mask_hi, chars);
-    return firstMatch<S>(buf, v);
+    return first_zero_match_inverted<S>(buf, v);
 }

 template <uint16_t S>
@@ -71,7 +73,7 @@ static really_inline
 const u8 *revBlock(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars, const u8 *buf) {
     SuperVector<S> v = blockSingleMask(mask_lo, mask_hi, chars);
-    return lastMatch<S>(buf, v);
+    return last_zero_match_inverted<S>(buf, v);
 }

 template <uint16_t S>
@@ -80,7 +82,7 @@ const u8 *fwdBlockDouble(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, Super
     SuperVector<S> mask = blockDoubleMask(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars);
-    return firstMatch<S>(buf, mask);
+    return first_zero_match_inverted<S>(buf, mask);
 }

 template <uint16_t S>


@@ -49,14 +49,15 @@ const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, Supe
 #include "x86/truffle.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "arm/truffle.hpp"
+#elif defined(ARCH_PPC64EL)
+#include "ppc64el/truffle.hpp"
 #endif

 template <uint16_t S>
 static really_inline
 const u8 *fwdBlock(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars, const u8 *buf) {
     SuperVector<S> res = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
-    return firstMatch<S>(buf, res);
+    return first_zero_match_inverted<S>(buf, res);
 }

 template <uint16_t S>
@@ -120,7 +121,7 @@ static really_inline
 const u8 *revBlock(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> v,
                    const u8 *buf) {
     SuperVector<S> res = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, v);
-    return lastMatch<S>(buf, res);
+    return last_zero_match_inverted<S>(buf, res);
 }

 template <uint16_t S>

View File

@ -1,121 +0,0 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Vermicelli: single-byte and double-byte acceleration.
*/
#ifndef VERMICELLI_H
#define VERMICELLI_H
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/unaligned.h"
#if !defined(HAVE_AVX512)
#include "vermicelli_common.h"
#endif
#ifdef HAVE_SVE2
#include "vermicelli_sve.h"
#else
#include "vermicelli_sse.h"
#endif
static really_inline
const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) "
"over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf));
assert(buf < buf_end);
VERM_TYPE chars1 = VERM_SET_FN(c1);
VERM_TYPE chars2 = VERM_SET_FN(c2);
VERM_TYPE mask1 = VERM_SET_FN(m1);
VERM_TYPE mask2 = VERM_SET_FN(m2);
#ifdef HAVE_AVX512
if (buf_end - buf <= VERM_BOUNDARY) {
const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf,
buf_end);
if (ptr) {
return ptr;
}
/* check for partial match at end */
if ((buf_end[-1] & m1) == (u8)c1) {
DEBUG_PRINTF("partial!!!\n");
return buf_end - 1;
}
return buf_end;
}
#endif
assert((buf_end - buf) >= VERM_BOUNDARY);
uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY;
if (min) {
// Input isn't aligned, so we need to run one iteration with an
// unaligned load, then skip buf forward to the next aligned address.
// There's some small overlap here, but we don't mind scanning it twice
// if we can do it quickly, do we?
const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf);
if (p) {
return p;
}
buf += VERM_BOUNDARY - min;
assert(buf < buf_end);
}
// Aligned loops from here on in
const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1,
c2, m1, m2, buf, buf_end);
if (ptr) {
return ptr;
}
// Tidy up the mess at the end
ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2,
buf_end - VERM_BOUNDARY);
if (ptr) {
return ptr;
}
/* check for partial match at end */
if ((buf_end[-1] & m1) == (u8)c1) {
DEBUG_PRINTF("partial!!!\n");
return buf_end - 1;
}
return buf_end;
}
#endif /* VERMICELLI_H */
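Note: the removed vermicelli.h above implemented vermicelliDoubleMaskedExec directly on the 128-bit primitives; the same entry point reappears later in this diff behind the templated SIMD code. For reference, a scalar sketch of the semantics being preserved (illustrative only, hypothetical name, assuming c1 and c2 are already reduced by their masks, as the partial-match test implies):

#include <cstdint>

// Find the first i with (buf[i] & m1) == c1 and (buf[i+1] & m2) == c2. If only
// the c1 half fits at the very last byte, return buf_end - 1 so the caller can
// resume across a chunk boundary; otherwise return buf_end when nothing matches.
static const uint8_t *dverm_masked_ref(uint8_t c1, uint8_t c2, uint8_t m1, uint8_t m2,
                                       const uint8_t *buf, const uint8_t *buf_end) {
    for (const uint8_t *p = buf; p + 1 < buf_end; ++p) {
        if ((p[0] & m1) == c1 && (p[1] & m2) == c2) {
            return p;
        }
    }
    if ((buf_end[-1] & m1) == c1) {
        return buf_end - 1;   // partial match at the end of the buffer
    }
    return buf_end;
}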

View File

@ -1,5 +1,6 @@
/* /*
* Copyright (c) 2015-2020, Intel Corporation * Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2021, Arm Limited * Copyright (c) 2021, Arm Limited
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
@ -28,52 +29,74 @@
*/ */
/** \file /** \file
* \brief Vermicelli: Implementation shared between architectures. * \brief Vermicelli: single-byte and double-byte acceleration.
*
* (users should include vermicelli.h instead of this)
*/ */
#define VERM_BOUNDARY 16 #ifndef VERMICELLI_HPP
#define VERM_TYPE m128 #define VERMICELLI_HPP
#define VERM_SET_FN set1_16x8
// returns NULL if not found #include "util/bitutils.h"
static really_inline
const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
m128 mask1, m128 mask2, const u8 *buf) {
m128 data = loadu128(buf); // unaligned
m128 v1 = eq128(chars1, and128(data, mask1));
m128 v2 = eq128(chars2, and128(data, mask2));
u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
/* no fixup of the boundary required - the aligned run will pick it up */ #ifdef HAVE_SVE2
if (unlikely(z)) { #include "vermicelli_sve.h"
u32 pos = ctz32(z); #else
return buf + pos;
} #ifdef __cplusplus
return NULL; extern "C" {
#endif
const u8 *vermicelliExec(char c, char noCase, const u8 *buf, const u8 *buf_end);
#ifdef __cplusplus
} }
#endif
static really_inline #ifdef __cplusplus
const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, extern "C" {
m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1, #endif
u8 m2, const u8 *buf, const u8 *buf_end) { const u8 *nvermicelliExec(char c, char noCase, const u8 *buf, const u8 *buf_end);
assert((size_t)buf % 16 == 0); #ifdef __cplusplus
}
#endif
for (; buf + 16 < buf_end; buf += 16) { #ifdef __cplusplus
m128 data = load128(buf); extern "C" {
m128 v1 = eq128(chars1, and128(data, mask1)); #endif
m128 v2 = eq128(chars2, and128(data, mask2)); const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end);
u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); #ifdef __cplusplus
}
#endif
if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { #ifdef __cplusplus
z |= (1 << 15); extern "C" {
} #endif
if (unlikely(z)) { const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end);
u32 pos = ctz32(z); #ifdef __cplusplus
return buf + pos; }
} #endif
}
return NULL; #ifdef __cplusplus
} extern "C" {
#endif
const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end);
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end);
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, const u8 *buf, const u8 *buf_end);
#ifdef __cplusplus
}
#endif
#endif
#endif /* VERMICELLI_HPP */
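Note: each declaration above is wrapped in its own #ifdef __cplusplus / extern "C" pair. The guards could equally be collapsed into a single block; a condensed sketch of the same interface (assuming u8 comes from ue2common.h as elsewhere in the tree):

#ifdef __cplusplus
extern "C" {
#endif
const u8 *vermicelliExec(char c, char noCase, const u8 *buf, const u8 *buf_end);
const u8 *nvermicelliExec(char c, char noCase, const u8 *buf, const u8 *buf_end);
const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end);
const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end);
const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end);
const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end);
const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, const u8 *buf, const u8 *buf_end);
#ifdef __cplusplus
}
#endif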

View File

@ -26,7 +26,10 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "vermicelli.h" #include "vermicelli.hpp"
#define VERM_BOUNDARY 16
#define VERM_TYPE m128
static really_inline static really_inline
const u8 *find_xverm_run(char c, char nocase, u32 repeat, UNUSED const u8 *buf, const u8 *find_xverm_run(char c, char nocase, u32 repeat, UNUSED const u8 *buf,

src/nfa/vermicelli_simd.cpp (new file, 551 lines)
View File

@ -0,0 +1,551 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Vermicelli: single-byte and double-byte acceleration.
*/
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "vermicelli.hpp"
#include "util/supervector/casemask.hpp"
#include "util/match.hpp"
template <uint16_t S>
static really_inline
const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len);
template <uint16_t S>
static really_inline
const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len);
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len);
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, u16 const len);
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len);
template <uint16_t S>
static really_inline
const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len);
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len);
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "x86/vermicelli.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "arm/vermicelli.hpp"
#elif defined(ARCH_PPC64EL)
#include "ppc64el/vermicelli.hpp"
#endif
template <uint16_t S>
static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u8 const *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
const u8 *d = buf;
const u8 *rv;
__builtin_prefetch(d + 64);
__builtin_prefetch(d + 2*64);
__builtin_prefetch(d + 3*64);
__builtin_prefetch(d + 4*64);
DEBUG_PRINTF("start %p end %p \n", d, buf_end);
assert(d < buf_end);
if (d + S <= buf_end) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv;
d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv;
d += S;
}
}
DEBUG_PRINTF("d %p e %p \n", d, buf_end);
// finish off tail
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliBlock(data, chars, casemask, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
return buf_end;
}
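Note: vermicelliExecReal above (and the other *ExecReal variants below) follow the same three-phase shape: one unaligned block to reach an S-aligned address, an aligned main loop, and a masked tail for the remainder. A standalone sketch of that structure over a generic per-block predicate (illustrative only, not the SuperVector code):

#include <cstddef>
#include <cstdint>

// block(ptr, len) returns a pointer to the first match in [ptr, ptr + len) or
// nullptr; S stands in for the vector width.
template <std::size_t S, typename Block>
static const unsigned char *scan_three_phase(const unsigned char *buf,
                                             const unsigned char *buf_end,
                                             Block block) {
    const unsigned char *d = buf;
    if (d + S <= buf_end) {
        // 1. one (possibly overlapping) unaligned block, then jump to alignment
        std::uintptr_t up = reinterpret_cast<std::uintptr_t>(d);
        const unsigned char *aligned = d + ((S - (up % S)) % S);
        if (d != aligned) {
            if (const unsigned char *rv = block(d, S)) return rv;
            d = aligned;
        }
        // 2. aligned main loop over full S-byte blocks
        while (d + S <= buf_end) {
            if (const unsigned char *rv = block(d, S)) return rv;
            d += S;
        }
    }
    // 3. partial tail (the code above uses a masked load, loadu_maskz, here)
    if (d != buf_end) {
        if (const unsigned char *rv = block(d, buf_end - d)) {
            if (rv < buf_end) return rv;
        }
    }
    return buf_end;   // not found
}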
template <uint16_t S>
static const u8 *nvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, const u8 *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
const u8 *d = buf;
const u8 *rv;
__builtin_prefetch(d + 64);
__builtin_prefetch(d + 2*64);
__builtin_prefetch(d + 3*64);
__builtin_prefetch(d + 4*64);
DEBUG_PRINTF("start %p end %p \n", d, buf_end);
assert(d < buf_end);
if (d + S <= buf_end) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv;
d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv;
d += S;
}
}
DEBUG_PRINTF("d %p e %p \n", d, buf_end);
// finish off tail
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliBlockNeg(data, chars, casemask, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
return buf_end;
}
// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if
// character not found.
template <uint16_t S>
const u8 *rvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, const u8 *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("rverm %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
const u8 *d = buf_end;
const u8 *rv;
__builtin_prefetch(d - 64);
__builtin_prefetch(d - 2*64);
__builtin_prefetch(d - 3*64);
__builtin_prefetch(d - 4*64);
DEBUG_PRINTF("start %p end %p \n", buf, d);
assert(d > buf);
if (d - S >= buf) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S);
rv = rvermicelliBlock(data, chars, casemask, d - S, S);
DEBUG_PRINTF("rv %p \n", rv);
if (rv) return rv;
d = d1;
}
while (d - S >= buf) {
DEBUG_PRINTF("aligned %p \n", d);
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d - 64);
d -= S;
SuperVector<S> data = SuperVector<S>::load(d);
rv = rvermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv;
}
}
DEBUG_PRINTF("tail d %p e %p \n", buf, d);
// finish off head
if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf);
rv = rvermicelliBlock(data, chars, casemask, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
return buf - 1;
}
// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if
// character not found.
template <uint16_t S>
const u8 *rnvermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, const u8 *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("rverm %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
const u8 *d = buf_end;
const u8 *rv;
__builtin_prefetch(d - 64);
__builtin_prefetch(d - 2*64);
__builtin_prefetch(d - 3*64);
__builtin_prefetch(d - 4*64);
DEBUG_PRINTF("start %p end %p \n", buf, d);
assert(d > buf);
if (d - S >= buf) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S);
rv = rvermicelliBlockNeg(data, chars, casemask, d - S, S);
DEBUG_PRINTF("rv %p \n", rv);
if (rv) return rv;
d = d1;
}
while (d - S >= buf) {
DEBUG_PRINTF("aligned %p \n", d);
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d - 64);
d -= S;
SuperVector<S> data = SuperVector<S>::load(d);
rv = rvermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv;
}
}
DEBUG_PRINTF("tail d %p e %p \n", buf, d);
// finish off head
if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf);
rv = rvermicelliBlockNeg(data, chars, casemask, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
return buf - 1;
}
template <uint16_t S>
static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector<S> const casemask,
const u8 *buf, const u8 *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
const u8 *d = buf;
const u8 *rv;
// SuperVector<S> lastmask1{0};
const SuperVector<VECTORSIZE> chars1 = SuperVector<VECTORSIZE>::dup_u8(c1);
const SuperVector<VECTORSIZE> chars2 = SuperVector<VECTORSIZE>::dup_u8(c2);
const u8 casechar = casemask.u.u8[0];
__builtin_prefetch(d + 64);
__builtin_prefetch(d + 2*64);
__builtin_prefetch(d + 3*64);
__builtin_prefetch(d + 4*64);
DEBUG_PRINTF("start %p end %p \n", d, buf_end);
assert(d < buf_end);
if (d + S <= buf_end) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv;
d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv;
d += S;
}
}
DEBUG_PRINTF("tail d %p e %p \n", d, buf_end);
// finish off tail
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
DEBUG_PRINTF("real tail d %p e %p \n", d, buf_end);
/* check for partial match at end */
u8 mask = casemask.u.u8[0];
if ((buf_end[-1] & mask) == (u8)c1) {
DEBUG_PRINTF("partial!!!\n");
return buf_end - 1;
}
return buf_end;
}
/* returns highest offset of c2 (NOTE: not c1) */
template <uint16_t S>
const u8 *rvermicelliDoubleExecReal(char c1, char c2, SuperVector<S> const casemask, const u8 *buf, const u8 *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("rverm %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
char s[255];
snprintf(s, buf_end - buf + 1, "%s", buf);
DEBUG_PRINTF("b %s\n", s);
const u8 *d = buf_end;
const u8 *rv;
const SuperVector<VECTORSIZE> chars1 = SuperVector<VECTORSIZE>::dup_u8(c1);
const SuperVector<VECTORSIZE> chars2 = SuperVector<VECTORSIZE>::dup_u8(c2);
const u8 casechar = casemask.u.u8[0];
__builtin_prefetch(d - 64);
__builtin_prefetch(d - 2*64);
__builtin_prefetch(d - 3*64);
__builtin_prefetch(d - 4*64);
DEBUG_PRINTF("start %p end %p \n", buf, d);
assert(d > buf);
if (d - S >= buf) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S);
rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d - S, S);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
d = d1;
}
while (d - S >= buf) {
DEBUG_PRINTF("aligned %p \n", d);
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(d - 64);
d -= S;
SuperVector<S> data = SuperVector<S>::load(d);
rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv;
}
}
DEBUG_PRINTF("tail d %p e %p \n", buf, d);
// finish off head
if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf);
rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
return buf - 1;
}
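Note: per the comment above, the reverse double scan reports the highest offset of c2, not c1, and returns buf - 1 when the pair is absent. A hedged usage sketch (assumes building inside the source tree and linking the library; header path assumed):

#include <cassert>
#include "nfa/vermicelli.hpp"   // declares the extern "C" entry points; path assumed

static void rverm_double_example(void) {
    // 128-byte buffer of 'x' with a single "ab" pair ending at offset 100.
    char text[129];
    for (int i = 0; i < 128; i++) text[i] = 'x';
    text[99] = 'a';
    text[100] = 'b';
    text[128] = '\0';
    const u8 *buf = reinterpret_cast<const u8 *>(text);
    const u8 *rv = rvermicelliDoubleExec('a', 'b', 0 /* case-sensitive */, buf, buf + 128);
    assert(rv == buf + 100);    // position of c2 ('b'), per the note above
}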
template <uint16_t S>
static const u8 *vermicelliDoubleMaskedExecReal(u8 const c1, u8 const c2, u8 const m1, u8 const m2,
const u8 *buf, const u8 *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
const u8 *d = buf;
const u8 *rv;
// SuperVector<S> lastmask1{0};
const SuperVector<VECTORSIZE> chars1 = SuperVector<VECTORSIZE>::dup_u8(c1);
const SuperVector<VECTORSIZE> chars2 = SuperVector<VECTORSIZE>::dup_u8(c2);
const SuperVector<VECTORSIZE> mask1 = SuperVector<VECTORSIZE>::dup_u8(m1);
const SuperVector<VECTORSIZE> mask2 = SuperVector<VECTORSIZE>::dup_u8(m2);
__builtin_prefetch(d + 64);
__builtin_prefetch(d + 2*64);
__builtin_prefetch(d + 3*64);
__builtin_prefetch(d + 4*64);
DEBUG_PRINTF("start %p end %p \n", d, buf_end);
assert(d < buf_end);
if (d + S <= buf_end) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S);
if (rv) return rv;
d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S);
if (rv) return rv;
d += S;
}
}
DEBUG_PRINTF("tail d %p e %p \n", d, buf_end);
// finish off tail
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
DEBUG_PRINTF("real tail d %p e %p \n", d, buf_end);
/* check for partial match at end */
if ((buf_end[-1] & m1) == (u8)c1) {
DEBUG_PRINTF("partial!!!\n");
return buf_end - 1;
}
return buf_end;
}
extern "C" const u8 *vermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n",
nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
assert(buf < buf_end);
const SuperVector<VECTORSIZE> chars = SuperVector<VECTORSIZE>::dup_u8(c);
const SuperVector<VECTORSIZE> casemask{nocase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
return vermicelliExecReal<VECTORSIZE>(chars, casemask, buf, buf_end);
}
/* like vermicelliExec except returns the address of the first character which
* is not c */
extern "C" const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n",
nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
assert(buf < buf_end);
const SuperVector<VECTORSIZE> chars = SuperVector<VECTORSIZE>::dup_u8(c);
const SuperVector<VECTORSIZE> casemask{nocase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
return nvermicelliExecReal<VECTORSIZE>(chars, casemask, buf, buf_end);
}
extern "C" const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n",
nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
assert(buf < buf_end);
const SuperVector<VECTORSIZE> chars = SuperVector<VECTORSIZE>::dup_u8(c);
const SuperVector<VECTORSIZE> casemask{nocase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
return rvermicelliExecReal<VECTORSIZE>(chars, casemask, buf, buf_end);
}
extern "C" const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n",
nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
assert(buf < buf_end);
const SuperVector<VECTORSIZE> chars = SuperVector<VECTORSIZE>::dup_u8(c);
const SuperVector<VECTORSIZE> casemask{nocase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
return rnvermicelliExecReal<VECTORSIZE>(chars, casemask, buf, buf_end);
}
extern "C" const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
assert(buf < buf_end);
const SuperVector<VECTORSIZE> casemask{nocase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
return vermicelliDoubleExecReal<VECTORSIZE>(c1, c2, casemask, buf, buf_end);
}
extern "C" const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
assert(buf < buf_end);
const SuperVector<VECTORSIZE> casemask{nocase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
return rvermicelliDoubleExecReal<VECTORSIZE>(c1, c2, casemask, buf, buf_end);
}
extern "C" const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) "
"over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf));
assert(buf < buf_end);
return vermicelliDoubleMaskedExecReal<VECTORSIZE>(c1, c2, m1, m2, buf, buf_end);
}
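Note: the extern "C" wrappers above keep the original entry-point semantics: vermicelliExec returns a pointer to the first occurrence of c (or buf_end when there is none) and nvermicelliExec returns the first byte that is not c. A hedged usage sketch (assumes building inside the source tree and linking the library; header path assumed):

#include <cassert>
#include "nfa/vermicelli.hpp"   // declares the extern "C" entry points; path assumed

static void verm_example(void) {
    char text[129];
    for (int i = 0; i < 128; i++) text[i] = 'a';
    text[70] = 'X';
    text[128] = '\0';
    const u8 *buf = reinterpret_cast<const u8 *>(text);
    const u8 *end = buf + 128;
    assert(vermicelliExec('X', 0, buf, end) == buf + 70);   // first 'X'
    assert(nvermicelliExec('a', 0, buf, end) == buf + 70);  // first byte that is not 'a'
    assert(vermicelliExec('Z', 0, buf, end) == end);        // not found -> buf_end
}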

File diff suppressed because it is too large

View File

@ -270,25 +270,24 @@ static really_inline
const u8 *dvermSearch(svuint8_t chars, const u8 *buf, const u8 *buf_end) { const u8 *dvermSearch(svuint8_t chars, const u8 *buf, const u8 *buf_end) {
size_t len = buf_end - buf; size_t len = buf_end - buf;
if (len <= svcntb()) { if (len <= svcntb()) {
return dvermSearchOnce(chars, buf, buf_end); return dvermSearchOnce(svreinterpret_u16(chars), buf, buf_end);
} }
// peel off first part to align to the vector size // peel off first part to align to the vector size
const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2)); const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2));
assert(aligned_buf < buf_end); assert(aligned_buf < buf_end);
if (buf != aligned_buf) { if (buf != aligned_buf) {
const u8 *ptr = dvermSearchLoopBody(chars, buf); const u8 *ptr = dvermSearchLoopBody(svreinterpret_u16(chars), buf);
if (ptr) return ptr; if (ptr) return ptr;
} }
buf = aligned_buf; buf = aligned_buf;
size_t loops = (buf_end - buf) / svcntb(); size_t loops = (buf_end - buf) / svcntb();
DEBUG_PRINTF("loops %zu \n", loops); DEBUG_PRINTF("loops %zu \n", loops);
for (size_t i = 0; i < loops; i++, buf += svcntb()) { for (size_t i = 0; i < loops; i++, buf += svcntb()) {
const u8 *ptr = dvermSearchLoopBody(chars, buf); const u8 *ptr = dvermSearchLoopBody(svreinterpret_u16(chars), buf);
if (ptr) return ptr; if (ptr) return ptr;
} }
DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end);
return buf == buf_end ? NULL : dvermSearchLoopBody(chars, return buf == buf_end ? NULL : dvermSearchLoopBody(svreinterpret_u16(chars), buf_end - svcntb());
buf_end - svcntb());
} }
static really_inline static really_inline
@ -372,7 +371,7 @@ const u8 *vermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf,
assert(buf < buf_end); assert(buf < buf_end);
if (buf_end - buf > 1) { if (buf_end - buf > 1) {
++buf; ++buf;
svuint16_t chars = getCharMaskDouble(c1, c2, nocase); svuint8_t chars = svreinterpret_u8(getCharMaskDouble(c1, c2, nocase));
const u8 *ptr = dvermSearch(chars, buf, buf_end); const u8 *ptr = dvermSearch(chars, buf, buf_end);
if (ptr) { if (ptr) {
return ptr; return ptr;
@ -459,7 +458,7 @@ const u8 *vermicelliDouble16Exec(const m128 mask, const u64a firsts,
DEBUG_PRINTF("double verm16 scan over %td bytes\n", buf_end - buf); DEBUG_PRINTF("double verm16 scan over %td bytes\n", buf_end - buf);
if (buf_end - buf > 1) { if (buf_end - buf > 1) {
++buf; ++buf;
svuint16_t chars = svreinterpret_u16(getDupSVEMaskFrom128(mask)); svuint8_t chars = svreinterpret_u8(getDupSVEMaskFrom128(mask));
const u8 *ptr = dvermSearch(chars, buf, buf_end); const u8 *ptr = dvermSearch(chars, buf, buf_end);
if (ptr) { if (ptr) {
return ptr; return ptr;
@ -480,7 +479,7 @@ const u8 *vermicelliDoubleMasked16Exec(const m128 mask, char c1, char m1,
DEBUG_PRINTF("double verm16 masked scan over %td bytes\n", buf_end - buf); DEBUG_PRINTF("double verm16 masked scan over %td bytes\n", buf_end - buf);
if (buf_end - buf > 1) { if (buf_end - buf > 1) {
++buf; ++buf;
svuint16_t chars = svreinterpret_u16(getDupSVEMaskFrom128(mask)); svuint8_t chars = getDupSVEMaskFrom128(mask);
const u8 *ptr = dvermSearch(chars, buf, buf_end); const u8 *ptr = dvermSearch(chars, buf, buf_end);
if (ptr) { if (ptr) {
return ptr; return ptr;
@ -494,3 +493,96 @@ const u8 *vermicelliDoubleMasked16Exec(const m128 mask, char c1, char m1,
return buf_end; return buf_end;
} }
// returns NULL if not found
static really_inline
const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
m128 mask1, m128 mask2, const u8 *buf) {
m128 data = loadu128(buf); // unaligned
m128 v1 = eq128(chars1, and128(data, mask1));
m128 v2 = eq128(chars2, and128(data, mask2));
u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
u32 pos = ctz32(z);
return buf + pos;
}
return NULL;
}
static really_inline
const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2,
m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1,
u8 m2, const u8 *buf, const u8 *buf_end) {
assert((size_t)buf % 16 == 0);
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
m128 v1 = eq128(chars1, and128(data, mask1));
m128 v2 = eq128(chars2, and128(data, mask2));
u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
z |= (1 << 15);
}
if (unlikely(z)) {
u32 pos = ctz32(z);
return buf + pos;
}
}
return NULL;
}
static really_inline
const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) "
"over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf));
assert(buf < buf_end);
m128 chars1 = set1_16x8(c1);
m128 chars2 = set1_16x8(c2);
m128 mask1 = set1_16x8(m1);
m128 mask2 = set1_16x8(m2);
assert((buf_end - buf) >= 16);
uintptr_t min = (uintptr_t)buf % 16;
if (min) {
// Input isn't aligned, so we need to run one iteration with an
// unaligned load, then skip buf forward to the next aligned address.
// There's some small overlap here, but we don't mind scanning it twice
// if we can do it quickly, do we?
const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf);
if (p) {
return p;
}
buf += 16 - min;
assert(buf < buf_end);
}
// Aligned loops from here on in
const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1,
c2, m1, m2, buf, buf_end);
if (ptr) {
return ptr;
}
// Tidy up the mess at the end
ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2,
buf_end - 16);
if (ptr) {
return ptr;
}
/* check for partial match at end */
if ((buf_end[-1] & m1) == (u8)c1) {
DEBUG_PRINTF("partial!!!\n");
return buf_end - 1;
}
return buf_end;
}
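Note: in dvermSearchAlignedMasked above, the second-byte match vector is shifted down by one byte before being ANDed with the first-byte vector, so bit 15 of a 16-byte block cannot see byte 16; the explicit (buf[15] & m1) == c1 && (buf[16] & m2) == c2 test patches that boundary bit. A sketch of the same combination on plain movemasks (illustrative only, hypothetical helper):

#include <cstdint>

// z1 bit i: byte i matched c1 under m1; z2 bit i: byte i matched c2 under m2.
// A pair starts at byte i when bit i of z1 and bit i+1 of z2 are set, i.e.
// z1 & (z2 >> 1). Bit 15 depends on the next block's first byte, so it is
// patched from a direct comparison of that byte.
static uint32_t combine_double_masks(uint32_t z1, uint32_t z2, bool next_byte_matches_c2) {
    uint32_t z = (z1 & (z2 >> 1)) & 0xffffu;
    if ((z1 & (1u << 15)) && next_byte_matches_c2) {
        z |= 1u << 15;    // the pair straddles the 16-byte block boundary
    }
    return z;
}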

View File

@ -31,12 +31,6 @@
* \brief Shufti: character class acceleration. * \brief Shufti: character class acceleration.
*/ */
#ifndef SHUFTI_SIMD_X86_HPP
#define SHUFTI_SIMD_X86_HPP
#include "util/supervector/supervector.hpp"
#include "util/match.hpp"
template <uint16_t S> template <uint16_t S>
static really_inline static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars) { const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars) {
@ -44,12 +38,10 @@ const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask
SuperVector<S> c_lo = chars & low4bits; SuperVector<S> c_lo = chars & low4bits;
SuperVector<S> c_hi = chars.template vshr_64_imm<4>() & low4bits; SuperVector<S> c_hi = chars.template vshr_64_imm<4>() & low4bits;
c_lo = mask_lo.template pshufb(c_lo); c_lo = mask_lo.pshufb(c_lo);
c_hi = mask_hi.template pshufb(c_hi); c_hi = mask_hi.pshufb(c_hi);
SuperVector c = c_lo & c_hi; return (c_lo & c_hi).eq(SuperVector<S>::Zeroes());
return c.eq(SuperVector<S>::Zeroes());
} }
template <uint16_t S> template <uint16_t S>
@ -80,5 +72,3 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
return c.eq(SuperVector<S>::Ones()); return c.eq(SuperVector<S>::Ones());
} }
#endif // SHUFTI_SIMD_X86_HPP

src/nfa/x86/vermicelli.hpp (new file, 125 lines)
View File

@ -0,0 +1,125 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Vermicelli: single-byte and double-byte acceleration.
*/
template <uint16_t S>
static really_inline
const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return first_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
SuperVector<S> mask = chars.eq(casemask & data);
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, u16 const len) {
data.print8("data");
chars.print8("chars");
casemask.print8("casemask");
SuperVector<S> mask = chars.eq(casemask & data);
mask.print8("mask");
return last_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = mask1 & (mask2 >> 1);
    DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
SuperVector<S> mask2 = chars2.eq(v);
SuperVector<S> mask = (mask1 << 1)& mask2;
DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) {
mask = mask | (SuperVector<S>::Ones() >> (S-1));
}
return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) {
SuperVector<S> v1 = chars1.eq(data & mask1);
SuperVector<S> v2 = chars2.eq(data & mask2);
SuperVector<S> mask = v1 & (v2 >> 1);
    DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]);
bool partial_match = (((buf[0] & m1) == c2) && ((buf[-1] & m2) == c1));
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
return first_non_zero_match<S>(buf, mask, len);
}
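Note: every block helper above compares chars against casemask & data; the casemask is all-ones for case-sensitive scans and a fold mask for nocase scans, so one equality test covers both cases for ASCII letters. A scalar sketch of the trick (illustrative only; 0xDF is the usual ASCII fold mask here, and the searched character is assumed to be supplied already folded):

#include <cassert>
#include <cstdint>

static bool byte_matches(uint8_t data, uint8_t folded_char, uint8_t casemask) {
    return folded_char == (uint8_t)(data & casemask);
}

static void casemask_example(void) {
    assert(byte_matches('a', 'A', 0xDF));    // nocase: clearing bit 5 folds 'a' to 'A'
    assert(byte_matches('A', 'A', 0xDF));
    assert(!byte_matches('a', 'A', 0xFF));   // case-sensitive: all-ones mask, no fold
}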

View File

@ -39,6 +39,8 @@
#include "util/arch/x86/x86.h" #include "util/arch/x86/x86.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/arm.h" #include "util/arch/arm/arm.h"
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/ppc64el.h"
#endif #endif
#endif // UTIL_ARCH_X86_H_ #endif // UTIL_ARCH_X86_H_

View File

@ -29,9 +29,46 @@
template <> template <>
really_really_inline really_really_inline
const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> mask) { const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
uint32x4_t res_t = vreinterpretq_u32_u8(mask.u.v128[0]); uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(res_t, res_t)), 0); uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("z %08x\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
u32 pos = ctz32(z & 0xffff);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
DEBUG_PRINTF("buf + pos %p\n", buf + pos);
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
u32 pos = clz32(z & 0xffff);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
} else {
return NULL; // no match
}
}
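Note: a worked example of the clz arithmetic in last_non_zero_match<16> above: the movemask is restricted to 16 bits, so clz32 of a non-zero value lands in [16, 32) and 31 - pos selects the highest matching byte (illustrative only):

#include <cassert>

static void last_match_arith_example(void) {
    unsigned z = 0x0804u;              // bytes 2 and 11 of the block matched
    unsigned pos = __builtin_clz(z);   // 20 on a 32-bit operand
    assert(pos >= 16 && pos < 32);
    assert(31 - pos == 11);            // highest matching byte
}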
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) { if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask(); typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("z %08x\n", z); DEBUG_PRINTF("z %08x\n", z);
@ -48,9 +85,9 @@ const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> mask) {
template <> template <>
really_really_inline really_really_inline
const u8 *lastMatch<16>(const u8 *buf, SuperVector<16> mask) { const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) {
uint32x4_t res_t = vreinterpretq_u32_u8(mask.u.v128[0]); uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(res_t, res_t)), 0); uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) { if (vmax != 0) {
typename SuperVector<16>::movemask_type z = mask.movemask(); typename SuperVector<16>::movemask_type z = mask.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("buf %p z %08x \n", buf, z);

View File

@ -100,7 +100,7 @@ static really_inline int isnonzero128(m128 a) {
*/ */
static really_inline u32 diffrich128(m128 a, m128 b) { static really_inline u32 diffrich128(m128 a, m128 b) {
static const uint32x4_t movemask = { 1, 2, 4, 8 }; static const uint32x4_t movemask = { 1, 2, 4, 8 };
return vaddvq_u32(vandq_u32(vmvnq_s32(vceqq_s32((int32x4_t)a, (int32x4_t)b)), movemask)); return vaddvq_u32(vandq_u32(vmvnq_u32(vceqq_u32((uint32x4_t)a, (uint32x4_t)b)), movemask));
} }
/** /**
@ -109,53 +109,281 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
*/ */
static really_inline u32 diffrich64_128(m128 a, m128 b) { static really_inline u32 diffrich64_128(m128 a, m128 b) {
static const uint64x2_t movemask = { 1, 4 }; static const uint64x2_t movemask = { 1, 4 };
return vaddvq_u64(vandq_u64(vmvnq_s32(vceqq_s64((int64x2_t)a, (int64x2_t)b)), movemask)); return (u32) vaddvq_u64(vandq_u64((uint64x2_t)vmvnq_u32((uint32x4_t)vceqq_u64((uint64x2_t)a, (uint64x2_t)b)), movemask));
} }
static really_really_inline static really_really_inline
m128 add_2x64(m128 a, m128 b) { m128 add_2x64(m128 a, m128 b) {
return (m128) vaddq_u64((int64x2_t)a, (int64x2_t)b); return (m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b);
} }
static really_really_inline static really_really_inline
m128 sub_2x64(m128 a, m128 b) { m128 sub_2x64(m128 a, m128 b) {
return (m128) vsubq_u64((int64x2_t)a, (int64x2_t)b); return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b);
} }
static really_really_inline static really_inline
m128 lshift_m128(m128 a, unsigned b) { m128 lshift_m128(m128 a, unsigned b) {
return (m128) vshlq_n_s32((int64x2_t)a, b); #if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return (m128) vshlq_n_u32((uint32x4_t)a, b);
}
#endif
#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break;
switch (b) {
case 0: return a; break;
CASE_LSHIFT_m128(a, 1);
CASE_LSHIFT_m128(a, 2);
CASE_LSHIFT_m128(a, 3);
CASE_LSHIFT_m128(a, 4);
CASE_LSHIFT_m128(a, 5);
CASE_LSHIFT_m128(a, 6);
CASE_LSHIFT_m128(a, 7);
CASE_LSHIFT_m128(a, 8);
CASE_LSHIFT_m128(a, 9);
CASE_LSHIFT_m128(a, 10);
CASE_LSHIFT_m128(a, 11);
CASE_LSHIFT_m128(a, 12);
CASE_LSHIFT_m128(a, 13);
CASE_LSHIFT_m128(a, 14);
CASE_LSHIFT_m128(a, 15);
CASE_LSHIFT_m128(a, 16);
CASE_LSHIFT_m128(a, 17);
CASE_LSHIFT_m128(a, 18);
CASE_LSHIFT_m128(a, 19);
CASE_LSHIFT_m128(a, 20);
CASE_LSHIFT_m128(a, 21);
CASE_LSHIFT_m128(a, 22);
CASE_LSHIFT_m128(a, 23);
CASE_LSHIFT_m128(a, 24);
CASE_LSHIFT_m128(a, 25);
CASE_LSHIFT_m128(a, 26);
CASE_LSHIFT_m128(a, 27);
CASE_LSHIFT_m128(a, 28);
CASE_LSHIFT_m128(a, 29);
CASE_LSHIFT_m128(a, 30);
CASE_LSHIFT_m128(a, 31);
default: return zeroes128(); break;
}
#undef CASE_LSHIFT_m128
} }
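Note: the NEON shift intrinsics (vshlq_n_u32 and friends) only accept compile-time immediates, which is why lshift_m128 above uses the intrinsic directly when __builtin_constant_p(b) holds and otherwise dispatches through a switch that hands each case a literal. A trimmed sketch of the switch half of that pattern (illustrative only; compiles for NEON targets, cases 4..31 elided):

#if defined(__ARM_NEON)
#include <arm_neon.h>

static inline uint32x4_t shl_u32x4_var(uint32x4_t a, unsigned b) {
    switch (b) {
    case 0: return a;
    case 1: return vshlq_n_u32(a, 1);
    case 2: return vshlq_n_u32(a, 2);
    case 3: return vshlq_n_u32(a, 3);
    /* ... cases 4 through 31 follow the same pattern ... */
    default: return vdupq_n_u32(0);   // out-of-range shifts yield zero, as above
    }
}
#endif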
static really_really_inline static really_really_inline
m128 rshift_m128(m128 a, unsigned b) { m128 rshift_m128(m128 a, unsigned b) {
return (m128) vshrq_n_s32((int64x2_t)a, b); #if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return (m128) vshrq_n_u32((uint32x4_t)a, b);
}
#endif
#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break;
switch (b) {
case 0: return a; break;
CASE_RSHIFT_m128(a, 1);
CASE_RSHIFT_m128(a, 2);
CASE_RSHIFT_m128(a, 3);
CASE_RSHIFT_m128(a, 4);
CASE_RSHIFT_m128(a, 5);
CASE_RSHIFT_m128(a, 6);
CASE_RSHIFT_m128(a, 7);
CASE_RSHIFT_m128(a, 8);
CASE_RSHIFT_m128(a, 9);
CASE_RSHIFT_m128(a, 10);
CASE_RSHIFT_m128(a, 11);
CASE_RSHIFT_m128(a, 12);
CASE_RSHIFT_m128(a, 13);
CASE_RSHIFT_m128(a, 14);
CASE_RSHIFT_m128(a, 15);
CASE_RSHIFT_m128(a, 16);
CASE_RSHIFT_m128(a, 17);
CASE_RSHIFT_m128(a, 18);
CASE_RSHIFT_m128(a, 19);
CASE_RSHIFT_m128(a, 20);
CASE_RSHIFT_m128(a, 21);
CASE_RSHIFT_m128(a, 22);
CASE_RSHIFT_m128(a, 23);
CASE_RSHIFT_m128(a, 24);
CASE_RSHIFT_m128(a, 25);
CASE_RSHIFT_m128(a, 26);
CASE_RSHIFT_m128(a, 27);
CASE_RSHIFT_m128(a, 28);
CASE_RSHIFT_m128(a, 29);
CASE_RSHIFT_m128(a, 30);
CASE_RSHIFT_m128(a, 31);
default: return zeroes128(); break;
}
#undef CASE_RSHIFT_m128
} }
static really_really_inline static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) { m128 lshift64_m128(m128 a, unsigned b) {
return (m128) vshlq_n_s64((int64x2_t)a, b); #if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return (m128) vshlq_n_u64((uint64x2_t)a, b);
}
#endif
#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break;
switch (b) {
case 0: return a; break;
CASE_LSHIFT64_m128(a, 1);
CASE_LSHIFT64_m128(a, 2);
CASE_LSHIFT64_m128(a, 3);
CASE_LSHIFT64_m128(a, 4);
CASE_LSHIFT64_m128(a, 5);
CASE_LSHIFT64_m128(a, 6);
CASE_LSHIFT64_m128(a, 7);
CASE_LSHIFT64_m128(a, 8);
CASE_LSHIFT64_m128(a, 9);
CASE_LSHIFT64_m128(a, 10);
CASE_LSHIFT64_m128(a, 11);
CASE_LSHIFT64_m128(a, 12);
CASE_LSHIFT64_m128(a, 13);
CASE_LSHIFT64_m128(a, 14);
CASE_LSHIFT64_m128(a, 15);
CASE_LSHIFT64_m128(a, 16);
CASE_LSHIFT64_m128(a, 17);
CASE_LSHIFT64_m128(a, 18);
CASE_LSHIFT64_m128(a, 19);
CASE_LSHIFT64_m128(a, 20);
CASE_LSHIFT64_m128(a, 21);
CASE_LSHIFT64_m128(a, 22);
CASE_LSHIFT64_m128(a, 23);
CASE_LSHIFT64_m128(a, 24);
CASE_LSHIFT64_m128(a, 25);
CASE_LSHIFT64_m128(a, 26);
CASE_LSHIFT64_m128(a, 27);
CASE_LSHIFT64_m128(a, 28);
CASE_LSHIFT64_m128(a, 29);
CASE_LSHIFT64_m128(a, 30);
CASE_LSHIFT64_m128(a, 31);
CASE_LSHIFT64_m128(a, 32);
CASE_LSHIFT64_m128(a, 33);
CASE_LSHIFT64_m128(a, 34);
CASE_LSHIFT64_m128(a, 35);
CASE_LSHIFT64_m128(a, 36);
CASE_LSHIFT64_m128(a, 37);
CASE_LSHIFT64_m128(a, 38);
CASE_LSHIFT64_m128(a, 39);
CASE_LSHIFT64_m128(a, 40);
CASE_LSHIFT64_m128(a, 41);
CASE_LSHIFT64_m128(a, 42);
CASE_LSHIFT64_m128(a, 43);
CASE_LSHIFT64_m128(a, 44);
CASE_LSHIFT64_m128(a, 45);
CASE_LSHIFT64_m128(a, 46);
CASE_LSHIFT64_m128(a, 47);
CASE_LSHIFT64_m128(a, 48);
CASE_LSHIFT64_m128(a, 49);
CASE_LSHIFT64_m128(a, 50);
CASE_LSHIFT64_m128(a, 51);
CASE_LSHIFT64_m128(a, 52);
CASE_LSHIFT64_m128(a, 53);
CASE_LSHIFT64_m128(a, 54);
CASE_LSHIFT64_m128(a, 55);
CASE_LSHIFT64_m128(a, 56);
CASE_LSHIFT64_m128(a, 57);
CASE_LSHIFT64_m128(a, 58);
CASE_LSHIFT64_m128(a, 59);
CASE_LSHIFT64_m128(a, 60);
CASE_LSHIFT64_m128(a, 61);
CASE_LSHIFT64_m128(a, 62);
CASE_LSHIFT64_m128(a, 63);
default: return zeroes128(); break;
}
#undef CASE_LSHIFT64_m128
} }
static really_really_inline static really_really_inline
m128 rshift64_m128(m128 a, unsigned b) { m128 rshift64_m128(m128 a, unsigned b) {
return (m128) vshrq_n_s64((int64x2_t)a, b); #if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(b)) {
return (m128) vshrq_n_u64((uint64x2_t)a, b);
}
#endif
#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break;
switch (b) {
case 0: return a; break;
CASE_RSHIFT64_m128(a, 1);
CASE_RSHIFT64_m128(a, 2);
CASE_RSHIFT64_m128(a, 3);
CASE_RSHIFT64_m128(a, 4);
CASE_RSHIFT64_m128(a, 5);
CASE_RSHIFT64_m128(a, 6);
CASE_RSHIFT64_m128(a, 7);
CASE_RSHIFT64_m128(a, 8);
CASE_RSHIFT64_m128(a, 9);
CASE_RSHIFT64_m128(a, 10);
CASE_RSHIFT64_m128(a, 11);
CASE_RSHIFT64_m128(a, 12);
CASE_RSHIFT64_m128(a, 13);
CASE_RSHIFT64_m128(a, 14);
CASE_RSHIFT64_m128(a, 15);
CASE_RSHIFT64_m128(a, 16);
CASE_RSHIFT64_m128(a, 17);
CASE_RSHIFT64_m128(a, 18);
CASE_RSHIFT64_m128(a, 19);
CASE_RSHIFT64_m128(a, 20);
CASE_RSHIFT64_m128(a, 21);
CASE_RSHIFT64_m128(a, 22);
CASE_RSHIFT64_m128(a, 23);
CASE_RSHIFT64_m128(a, 24);
CASE_RSHIFT64_m128(a, 25);
CASE_RSHIFT64_m128(a, 26);
CASE_RSHIFT64_m128(a, 27);
CASE_RSHIFT64_m128(a, 28);
CASE_RSHIFT64_m128(a, 29);
CASE_RSHIFT64_m128(a, 30);
CASE_RSHIFT64_m128(a, 31);
CASE_RSHIFT64_m128(a, 32);
CASE_RSHIFT64_m128(a, 33);
CASE_RSHIFT64_m128(a, 34);
CASE_RSHIFT64_m128(a, 35);
CASE_RSHIFT64_m128(a, 36);
CASE_RSHIFT64_m128(a, 37);
CASE_RSHIFT64_m128(a, 38);
CASE_RSHIFT64_m128(a, 39);
CASE_RSHIFT64_m128(a, 40);
CASE_RSHIFT64_m128(a, 41);
CASE_RSHIFT64_m128(a, 42);
CASE_RSHIFT64_m128(a, 43);
CASE_RSHIFT64_m128(a, 44);
CASE_RSHIFT64_m128(a, 45);
CASE_RSHIFT64_m128(a, 46);
CASE_RSHIFT64_m128(a, 47);
CASE_RSHIFT64_m128(a, 48);
CASE_RSHIFT64_m128(a, 49);
CASE_RSHIFT64_m128(a, 50);
CASE_RSHIFT64_m128(a, 51);
CASE_RSHIFT64_m128(a, 52);
CASE_RSHIFT64_m128(a, 53);
CASE_RSHIFT64_m128(a, 54);
CASE_RSHIFT64_m128(a, 55);
CASE_RSHIFT64_m128(a, 56);
CASE_RSHIFT64_m128(a, 57);
CASE_RSHIFT64_m128(a, 58);
CASE_RSHIFT64_m128(a, 59);
CASE_RSHIFT64_m128(a, 60);
CASE_RSHIFT64_m128(a, 61);
CASE_RSHIFT64_m128(a, 62);
CASE_RSHIFT64_m128(a, 63);
default: return zeroes128(); break;
}
#undef CASE_RSHIFT64_m128
} }
static really_inline m128 eq128(m128 a, m128 b) { static really_inline m128 eq128(m128 a, m128 b) {
return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); return (m128) vceqq_u8((uint8x16_t)a, (uint8x16_t)b);
} }
static really_inline m128 eq64_m128(m128 a, m128 b) { static really_inline m128 eq64_m128(m128 a, m128 b) {
return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); return (m128) vceqq_u64((uint64x2_t)a, (uint64x2_t)b);
} }
static really_inline u32 movemask128(m128 a) { static really_inline u32 movemask128(m128 a) {
static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
// Compute the mask from the input // Compute the mask from the input
uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
uint64x2_t mask1 = (m128)vextq_s8(mask, zeroes128(), 7); uint8x16_t mask1 = vextq_u8(mask, (uint8x16_t)zeroes128(), 7);
mask = vorrq_u8(mask, mask1); mask = vorrq_u8(mask, mask1);
// Get the resulting bytes // Get the resulting bytes
@ -187,13 +415,15 @@ static really_inline u64a movq(const m128 in) {
/* another form of movq */ /* another form of movq */
static really_inline static really_inline
m128 load_m128_from_u64a(const u64a *p) { m128 load_m128_from_u64a(const u64a *p) {
return (m128) vsetq_lane_u64(*p, zeroes128(), 0); return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0);
} }
static really_inline u32 extract32from128(const m128 in, unsigned imm) { static really_inline u32 extract32from128(const m128 in, unsigned imm) {
#if defined(HS_OPTIMIZE) #if defined(HAVE__BUILTIN_CONSTANT_P)
return vgetq_lane_u32((uint32x4_t) in, imm); if (__builtin_constant_p(imm)) {
#else return vgetq_lane_u32((uint32x4_t) in, imm);
}
#endif
switch (imm) { switch (imm) {
case 0: case 0:
return vgetq_lane_u32((uint32x4_t) in, 0); return vgetq_lane_u32((uint32x4_t) in, 0);
@ -211,33 +441,33 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) {
return 0; return 0;
break; break;
} }
#endif
} }
static really_inline u64a extract64from128(const m128 in, unsigned imm) { static really_inline u64a extract64from128(const m128 in, unsigned imm) {
#if defined(HS_OPTIMIZE) #if defined(HAVE__BUILTIN_CONSTANT_P)
return vgetq_lane_u64((uint64x2_t) in, imm); if (__builtin_constant_p(imm)) {
#else return vgetq_lane_u64((uint64x2_t) in, imm);
}
#endif
switch (imm) { switch (imm) {
case 0: case 0:
return vgetq_lane_u64((uint32x4_t) in, 0); return vgetq_lane_u64((uint64x2_t) in, 0);
break; break;
case 1: case 1:
return vgetq_lane_u64((uint32x4_t) in, 1); return vgetq_lane_u64((uint64x2_t) in, 1);
break; break;
default: default:
return 0; return 0;
break; break;
} }
#endif
} }
static really_inline m128 low64from128(const m128 in) { static really_inline m128 low64from128(const m128 in) {
return vcombine_u64(vget_low_u64(in), vdup_n_u64(0)); return (m128) vcombine_u64(vget_low_u64((uint64x2_t)in), vdup_n_u64(0));
} }
static really_inline m128 high64from128(const m128 in) { static really_inline m128 high64from128(const m128 in) {
return vcombine_u64(vget_high_u64(in), vdup_n_u64(0)); return (m128) vcombine_u64(vget_high_u64((uint64x2_t)in), vdup_n_u64(0));
} }
static really_inline m128 add128(m128 a, m128 b) { static really_inline m128 add128(m128 a, m128 b) {
@ -257,7 +487,7 @@ static really_inline m128 or128(m128 a, m128 b) {
} }
static really_inline m128 andnot128(m128 a, m128 b) { static really_inline m128 andnot128(m128 a, m128 b) {
return (m128) (m128) vandq_s8( vmvnq_s8(a), b); return (m128) vandq_s8( vmvnq_s8((int8x16_t) a), (int8x16_t) b);
} }
// aligned load // aligned load
@ -328,11 +558,12 @@ m128 palignr_imm(m128 r, m128 l, int offset) {
static really_really_inline static really_really_inline
m128 palignr(m128 r, m128 l, int offset) { m128 palignr(m128 r, m128 l, int offset) {
#if defined(HS_OPTIMIZE) #if defined(HAVE__BUILTIN_CONSTANT_P)
return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); if (__builtin_constant_p(offset)) {
#else return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
return palignr_imm(r, l, offset); }
#endif #endif
return palignr_imm(r, l, offset);
} }
#undef CASE_ALIGN_VECTORS #undef CASE_ALIGN_VECTORS
@ -401,12 +632,12 @@ m128 pshufb_m128(m128 a, m128 b) {
static really_inline static really_inline
m128 max_u8_m128(m128 a, m128 b) { m128 max_u8_m128(m128 a, m128 b) {
return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b); return (m128) vmaxq_u8((uint8x16_t)a, (uint8x16_t)b);
} }
static really_inline static really_inline
m128 min_u8_m128(m128 a, m128 b) { m128 min_u8_m128(m128 a, m128 b) {
return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b); return (m128) vminq_u8((uint8x16_t)a, (uint8x16_t)b);
} }
static really_inline static really_inline

View File

@ -46,46 +46,46 @@
#endif // HAVE_SIMD_128_BITS #endif // HAVE_SIMD_128_BITS
#ifdef DEBUG #ifdef DEBUG
static inline void print_m128_16x8(const char *label, m128 vector) { static inline void print_m128_16x8(const char *label, m128 vec) {
uint8_t ALIGN_ATTR(16) data[16]; uint8_t ALIGN_ATTR(16) data[16];
store128(data, vector); store128(data, vec);
DEBUG_PRINTF("%s: ", label); DEBUG_PRINTF("%12s: ", label);
for(int i=0; i < 16; i++) for(int i=15; i >=0; i--)
printf("%02x ", data[i]); printf("%02x ", data[i]);
printf("\n"); printf("\n");
} }
static inline void print_m128_8x16(const char *label, m128 vector) { static inline void print_m128_8x16(const char *label, m128 vec) {
uint16_t ALIGN_ATTR(16) data[8]; uint16_t ALIGN_ATTR(16) data[8];
store128(data, vector); store128(data, vec);
DEBUG_PRINTF("%s: ", label); DEBUG_PRINTF("%12s: ", label);
for(int i=0; i < 8; i++) for(int i=7; i >= 0; i--)
printf("%04x ", data[i]); printf("%04x ", data[i]);
printf("\n"); printf("\n");
} }
static inline void print_m128_4x32(const char *label, m128 vector) { static inline void print_m128_4x32(const char *label, m128 vec) {
uint32_t ALIGN_ATTR(16) data[4]; uint32_t ALIGN_ATTR(16) data[4];
store128(data, vector); store128(data, vec);
DEBUG_PRINTF("%s: ", label); DEBUG_PRINTF("%12s: ", label);
for(int i=0; i < 4; i++) for(int i=3; i >= 0; i--)
printf("%08x ", data[i]); printf("%08x ", data[i]);
printf("\n"); printf("\n");
} }
static inline void print_m128_2x64(const char *label, m128 vector) { static inline void print_m128_2x64(const char *label, m128 vec) {
uint64_t ALIGN_ATTR(16) data[2]; uint64_t ALIGN_ATTR(16) data[2];
store128(data, vector); store128(data, vec);
DEBUG_PRINTF("%s: ", label); DEBUG_PRINTF("%12s: ", label);
for(int i=0; i < 2; i++) for(int i=1; i >= 0; i--)
printf("%016lx ", data[i]); printf("%016lx ", data[i]);
printf("\n"); printf("\n");
} }
#else #else
#define print_m128_16x8(label, vector) ; #define print_m128_16x8(label, vec) ;
#define print_m128_8x16(label, vector) ; #define print_m128_8x16(label, vec) ;
#define print_m128_4x32(label, vector) ; #define print_m128_4x32(label, vec) ;
#define print_m128_2x64(label, vector) ; #define print_m128_2x64(label, vec) ;
#endif #endif
/**** /****

View File

@ -0,0 +1,216 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Bit-twiddling primitives (ctz, compress etc)
*/
#ifndef BITUTILS_ARCH_PPC64EL_H
#define BITUTILS_ARCH_PPC64EL_H
#include "ue2common.h"
#include "util/popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"
#include "util/arch/common/bitutils.h"
static really_inline
u32 clz32_impl(u32 x) {
return clz32_impl_c(x);
}
static really_inline
u32 clz64_impl(u64a x) {
return clz64_impl_c(x);
}
static really_inline
u32 ctz32_impl(u32 x) {
return ctz32_impl_c(x);
}
static really_inline
u32 ctz64_impl(u64a x) {
return ctz64_impl_c(x);
}
static really_inline
u32 lg2_impl(u32 x) {
return lg2_impl_c(x);
}
static really_inline
u64a lg2_64_impl(u64a x) {
return lg2_64_impl_c(x);
}
static really_inline
u32 findAndClearLSB_32_impl(u32 *v) {
return findAndClearLSB_32_impl_c(v);
}
static really_inline
u32 findAndClearLSB_64_impl(u64a *v) {
return findAndClearLSB_64_impl_c(v);
}
static really_inline
u32 findAndClearMSB_32_impl(u32 *v) {
u32 val = *v;
u32 offset = 31 - clz32_impl(val);
*v = val & ~(1 << offset);
assert(offset < 32);
return offset;
}
static really_inline
u32 findAndClearMSB_64_impl(u64a *v) {
return findAndClearMSB_64_impl_c(v);
}
static really_inline
u32 compress32_impl(u32 x, u32 m) {
return compress32_impl_c(x, m);
}
static really_inline
u64a compress64_impl(u64a x, u64a m) {
return compress64_impl_c(x, m);
}
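/* Generic bit-compress (PEXT-style), applied independently to each 64-bit lane:
 * walk the mask bit by bit and pack the selected bits of x into consecutive
 * low-order bits of the result. */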
static really_inline
m128 compress128_impl(m128 x, m128 m) {
m128 one = set1_2x64(1);
m128 bitset = one;
m128 vres = zeroes128();
while (isnonzero128(m)) {
m128 mm = sub_2x64(zeroes128(), m);
m128 tv = and128(x, m);
tv = and128(tv, mm);
m128 mask = not128(eq64_m128(tv, zeroes128()));
mask = and128(bitset, mask);
vres = or128(vres, mask);
m = and128(m, sub_2x64(m, one));
bitset = lshift64_m128(bitset, 1);
}
return vres;
}
static really_inline
u32 expand32_impl(u32 x, u32 m) {
return expand32_impl_c(x, m);
}
static really_inline
u64a expand64_impl(u64a x, u64a m) {
return expand64_impl_c(x, m);
}
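/* Generic bit-expand (PDEP-style), the inverse of compress128_impl: deposit the
 * consecutive low-order bits of x into the set-bit positions of m, per 64-bit lane. */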
static really_inline
m128 expand128_impl(m128 x, m128 m) {
m128 one = set1_2x64(1);
m128 bb = one;
m128 res = zeroes128();
while (isnonzero128(m)) {
m128 xm = and128(x, bb);
m128 mm = sub_2x64(zeroes128(), m);
m128 mask = not128(eq64_m128(xm, zeroes128()));
mask = and128(mask, and128(m,mm));
res = or128(res, mask);
m = and128(m, sub_2x64(m, one));
bb = lshift64_m128(bb, 1);
}
return res;
}
/* returns the first set bit after begin (if not ~0U). If no bit is set after
* begin returns ~0U
*/
static really_inline
u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
if (begin != ~0U) {
/* switch off all bits at or below begin. Note: it is not legal to shift
 * by the size of the datatype or larger. */
assert(begin <= 63);
bitfield &= ~((2ULL << begin) - 1);
}
if (!bitfield) {
return ~0U;
}
return ctz64_impl(bitfield);
}
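/* Illustrative usage (not part of the original source): iterate over every set
 * bit of a 64-bit bitfield, lowest to highest:
 *
 *     for (u32 i = bf64_iterate_impl(bits, ~0U); i != ~0U;
 *          i = bf64_iterate_impl(bits, i)) {
 *         handle_bit(i);   // handle_bit() is a hypothetical callback
 *     }
 */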
static really_inline
char bf64_set_impl(u64a *bitfield, u32 i) {
return bf64_set_impl_c(bitfield, i);
}
static really_inline
void bf64_unset_impl(u64a *bitfield, u32 i) {
return bf64_unset_impl_c(bitfield, i);
}
static really_inline
u32 rank_in_mask32_impl(u32 mask, u32 bit) {
return rank_in_mask32_impl_c(mask, bit);
}
static really_inline
u32 rank_in_mask64_impl(u64a mask, u32 bit) {
return rank_in_mask64_impl_c(mask, bit);
}
static really_inline
u32 pext32_impl(u32 x, u32 mask) {
return pext32_impl_c(x, mask);
}
static really_inline
u64a pext64_impl(u64a x, u64a mask) {
return pext64_impl_c(x, mask);
}
static really_inline
u64a pdep64(u64a x, u64a mask) {
return pdep64_impl_c(x, mask);
}
/* There is no dedicated ANDN helper on this platform, so delegate to the
 * generic C implementation.
 */
static really_inline
u64a andn_impl(const u32 a, const u8 *b) {
return andn_impl_c(a, b);
}
#endif // BITUTILS_ARCH_PPC64EL_H

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
template <>
really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
u32 pos = ctz32(z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
u32 pos = clz32(z);
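/* z is a 16-bit mask widened to 32 bits, so clz32() returns 16..31;
   31 - pos below is the byte index of the highest set bit. */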
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z != 0xffff)) {
u32 pos = ctz32(~z & 0xffff);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z != 0xffff)) {
u32 pos = clz32(~z & 0xffff);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
} else {
return NULL; // no match
}
}

View File

@ -0,0 +1,43 @@
/*
* Copyright (c) 2017-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief Per-platform architecture definitions
*/
#ifndef UTIL_ARCH_PPC64EL_H_
#define UTIL_ARCH_PPC64EL_H_
#if defined(__VSX__) && defined(ARCH_PPC64EL)
#define HAVE_VSX
#define HAVE_SIMD_128_BITS
#define VECTORSIZE 16
#endif
#endif // UTIL_ARCH_PPC64EL_H_

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef ARCH_PPC64EL_SIMD_TYPES_H
#define ARCH_PPC64EL_SIMD_TYPES_H
#if !defined(m128) && defined(HAVE_VSX)
typedef __vector int m128;
#endif
#endif /* ARCH_PPC64EL_SIMD_TYPES_H */

View File

@ -0,0 +1,494 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/** \file
* \brief SIMD types and primitive operations.
*/
#ifndef ARCH_PPC64EL_SIMD_UTILS_H
#define ARCH_PPC64EL_SIMD_UTILS_H
#include <stdio.h>
#include "ue2common.h"
#include "util/simd_types.h"
#include "util/unaligned.h"
#include "util/intrinsics.h"
#include <string.h> // for memcpy
typedef __vector unsigned long long int uint64x2_t;
typedef __vector signed long long int int64x2_t;
typedef __vector unsigned int uint32x4_t;
typedef __vector signed int int32x4_t;
typedef __vector unsigned short int uint16x8_t;
typedef __vector signed short int int16x8_t;
typedef __vector unsigned char uint8x16_t;
typedef __vector signed char int8x16_t;
typedef unsigned long long int ulong64_t;
typedef signed long long int long64_t;
/*
typedef __vector uint64_t uint64x2_t;
typedef __vector int64_t int64x2_t;
typedef __vector uint32_t uint32x4_t;
typedef __vector int32_t int32x4_t;
typedef __vector uint16_t uint16x8_t;
typedef __vector int16_t int16x8_t;
typedef __vector uint8_t uint8x16_t;
typedef __vector int8_t int8x16_t;*/
#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
/** \brief LUT for the mask1bit functions. */
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
ZEROES_32, ZEROES_32,
ZEROES_31, 0x01, ZEROES_32,
ZEROES_31, 0x02, ZEROES_32,
ZEROES_31, 0x04, ZEROES_32,
ZEROES_31, 0x08, ZEROES_32,
ZEROES_31, 0x10, ZEROES_32,
ZEROES_31, 0x20, ZEROES_32,
ZEROES_31, 0x40, ZEROES_32,
ZEROES_31, 0x80, ZEROES_32,
ZEROES_32, ZEROES_32,
};
static really_inline m128 ones128(void) {
return (m128) vec_splat_u8(-1);
}
static really_inline m128 zeroes128(void) {
return (m128) vec_splat_s32(0);
}
/** \brief Bitwise not for m128*/
static really_inline m128 not128(m128 a) {
//return (m128)vec_xor(a, a);
return (m128) vec_xor(a,ones128());
}
/** \brief Return 1 if a and b are different otherwise 0 */
static really_inline int diff128(m128 a, m128 b) {
return vec_any_ne(a, b);
}
static really_inline int isnonzero128(m128 a) {
return !!diff128(a, zeroes128());
}
/**
* "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline u32 diffrich128(m128 a, m128 b) {
static const m128 movemask = { 1, 2, 4, 8 };
m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b);
mask = vec_and(not128(mask), movemask);
m128 sum = vec_sums(mask, zeroes128());
//sum = vec_sld(zeroes128(), sum, 4);
//s32 ALIGN_ATTR(16) x;
//vec_ste(sum, 0, &x);
//return x; // it could be ~(movemask_128(mask)) & 0x;
return sum[3];
}
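/* Illustrative example (not from the original source): with
 *     a = {1, 2, 3, 4} and b = {1, 2, 0, 4}
 * only the third 32-bit word differs, so diffrich128(a, b) returns 0x4 (bit 2 set). */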
/**
* "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
* returns a 4-bit mask indicating which 64-bit words contain differences.
*/
static really_inline u32 diffrich64_128(m128 a, m128 b) {
static const uint64x2_t movemask = { 1, 4 };
uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask);
m128 sum = vec_sums((m128)mask, zeroes128());
//sum = vec_sld(zeroes128(), sum, 4);
//s32 ALIGN_ATTR(16) x;
//vec_ste(sum, 0, &x);
//return x;
return sum[3];
}
static really_really_inline
m128 add_2x64(m128 a, m128 b) {
return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b);
}
static really_really_inline
m128 sub_2x64(m128 a, m128 b) {
return (m128) vec_sub((uint64x2_t)a, (uint64x2_t)b);
}
static really_really_inline
m128 lshift_m128(m128 a, unsigned b) {
switch(b){
case 1: return vec_sld(a, zeroes128(), 1); break;
case 2: return vec_sld(a, zeroes128(), 2); break;
case 3: return vec_sld(a, zeroes128(), 3); break;
case 4: return vec_sld(a, zeroes128(), 4); break;
case 5: return vec_sld(a, zeroes128(), 5); break;
case 6: return vec_sld(a, zeroes128(), 6); break;
case 7: return vec_sld(a, zeroes128(), 7); break;
case 8: return vec_sld(a, zeroes128(), 8); break;
case 9: return vec_sld(a, zeroes128(), 9); break;
case 10: return vec_sld(a, zeroes128(), 10); break;
case 11: return vec_sld(a, zeroes128(), 11); break;
case 12: return vec_sld(a, zeroes128(), 12); break;
case 13: return vec_sld(a, zeroes128(), 13); break;
case 14: return vec_sld(a, zeroes128(), 14); break;
case 15: return vec_sld(a, zeroes128(), 15); break;
}
return a;
}
static really_really_inline
m128 rshift_m128(m128 a, unsigned b) {
switch(b){
case 1: return vec_sld(zeroes128(), a, 15); break;
case 2: return vec_sld(zeroes128(), a, 14); break;
case 3: return vec_sld(zeroes128(), a, 13); break;
case 4: return vec_sld(zeroes128(), a, 12); break;
case 5: return vec_sld(zeroes128(), a, 11); break;
case 6: return vec_sld(zeroes128(), a, 10); break;
case 7: return vec_sld(zeroes128(), a, 9); break;
case 8: return vec_sld(zeroes128(), a, 8); break;
case 9: return vec_sld(zeroes128(), a, 7); break;
case 10: return vec_sld(zeroes128(), a, 6); break;
case 11: return vec_sld(zeroes128(), a, 5); break;
case 12: return vec_sld(zeroes128(), a, 4); break;
case 13: return vec_sld(zeroes128(), a, 3); break;
case 14: return vec_sld(zeroes128(), a, 2); break;
case 15: return vec_sld(zeroes128(), a, 1); break;
}
return a;
}
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
uint64x2_t shift_indices = vec_splats((ulong64_t)b);
return (m128) vec_sl((int64x2_t)a, shift_indices);
}
static really_really_inline
m128 rshift64_m128(m128 a, unsigned b) {
uint64x2_t shift_indices = vec_splats((ulong64_t)b);
return (m128) vec_sr((int64x2_t)a, shift_indices);
}
static really_inline m128 eq128(m128 a, m128 b) {
return (m128) vec_cmpeq((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline m128 eq64_m128(m128 a, m128 b) {
return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
}
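/* Build a 16-bit mask from the most significant bit of each byte lane
 * (the VSX equivalent of SSE2 _mm_movemask_epi8). */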
static really_inline u32 movemask128(m128 a) {
uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7));
uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14));
uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9);
uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
return s5[0];
}
static really_inline m128 set1_16x8(u8 c) {
return (m128) vec_splats(c);
}
static really_inline m128 set1_4x32(u32 c) {
return (m128) vec_splats(c);
}
static really_inline m128 set1_2x64(u64a c) {
return (m128) vec_splats(c);
}
static really_inline u32 movd(const m128 in) {
return (u32) vec_extract((uint32x4_t)in, 0);
}
static really_inline u64a movq(const m128 in) {
u64a ALIGN_ATTR(16) a[2];
vec_xst((uint64x2_t) in, 0, a);
return a[0];
}
/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
m128 vec =(m128) vec_splats(*p);
return rshift_m128(vec,8);
}
static really_inline u32 extract32from128(const m128 in, unsigned imm) {
u32 ALIGN_ATTR(16) a[4];
vec_xst((uint32x4_t) in, 0, a);
switch (imm) {
case 0:
return a[0];break;
case 1:
return a[1];break;
case 2:
return a[2];break;
case 3:
return a[3];break;
default:
return 0;break;
}
}
static really_inline u64a extract64from128(const m128 in, unsigned imm) {
u64a ALIGN_ATTR(16) a[2];
vec_xst((uint64x2_t) in, 0, a);
switch (imm) {
case 0:
return a[0];break;
case 1:
return a[1];break;
default:
return 0;
break;
}
}
static really_inline m128 low64from128(const m128 in) {
return rshift_m128(in,8);
}
static really_inline m128 high64from128(const m128 in) {
return lshift_m128(in,8);
}
static really_inline m128 add128(m128 a, m128 b) {
return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b);
}
static really_inline m128 and128(m128 a, m128 b) {
return (m128) vec_and((int8x16_t)a, (int8x16_t)b);
}
static really_inline m128 xor128(m128 a, m128 b) {
return (m128) vec_xor((int8x16_t)a, (int8x16_t)b);
}
static really_inline m128 or128(m128 a, m128 b) {
return (m128) vec_or((int8x16_t)a, (int8x16_t)b);
}
static really_inline m128 andnot128(m128 a, m128 b) {
return (m128) and128(not128(a),b);
}
// aligned load
static really_inline m128 load128(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m128)));
return (m128) vec_xl(0, (const int32_t*)ptr);
}
// aligned store
static really_inline void store128(void *ptr, m128 a) {
assert(ISALIGNED_N(ptr, alignof(m128)));
vec_st(a, 0, (int32_t*)ptr);
}
// unaligned load
static really_inline m128 loadu128(const void *ptr) {
return (m128) vec_xl(0, (const int32_t*)ptr);
}
// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
vec_xst(a, 0, (int32_t*)ptr);
}
// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
assert(n <= sizeof(a));
memcpy(ptr, &a, n);
}
// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
m128 a = zeroes128();
assert(n <= sizeof(a));
memcpy(&a, ptr, n);
return a;
}
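/* Emulate SSSE3 palignr: treat r:l as a 32-byte value and return the 16 bytes
 * starting at byte `offset` of l. */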
#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break;
static really_really_inline
m128 palignr_imm(m128 r, m128 l, int offset) {
switch (offset) {
case 0: return l; break;
CASE_ALIGN_VECTORS(l, r, 1);
CASE_ALIGN_VECTORS(l, r, 2);
CASE_ALIGN_VECTORS(l, r, 3);
CASE_ALIGN_VECTORS(l, r, 4);
CASE_ALIGN_VECTORS(l, r, 5);
CASE_ALIGN_VECTORS(l, r, 6);
CASE_ALIGN_VECTORS(l, r, 7);
CASE_ALIGN_VECTORS(l, r, 8);
CASE_ALIGN_VECTORS(l, r, 9);
CASE_ALIGN_VECTORS(l, r, 10);
CASE_ALIGN_VECTORS(l, r, 11);
CASE_ALIGN_VECTORS(l, r, 12);
CASE_ALIGN_VECTORS(l, r, 13);
CASE_ALIGN_VECTORS(l, r, 14);
CASE_ALIGN_VECTORS(l, r, 15);
case 16: return r; break;
default: return zeroes128(); break;
}
}
static really_really_inline
m128 palignr(m128 r, m128 l, int offset) {
#if defined(HS_OPTIMIZE)
// need a faster way to do this.
return palignr_imm(r, l, offset);
#else
return palignr_imm(r, l, offset);
#endif
}
#undef CASE_ALIGN_VECTORS
static really_really_inline
m128 rshiftbyte_m128(m128 a, unsigned b) {
return rshift_m128(a,b);
}
static really_really_inline
m128 lshiftbyte_m128(m128 a, unsigned b) {
return lshift_m128(a,b);
}
static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
assert(amount >= -16 && amount <= 16);
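/* Positive amounts shift the vector left by `amount` bytes, negative amounts
   shift it right; vacated bytes are filled with zeroes. */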
if (amount < 0){
return palignr_imm(zeroes128(), in, -amount);
} else{
return palignr_imm(in, zeroes128(), 16 - amount);
}
}
static really_inline
m128 mask1bit128(unsigned int n) {
assert(n < sizeof(m128) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
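/* simd_onebit_masks is laid out so that this unaligned 16-byte load yields a
   vector with only bit n set. */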
return loadu128(&simd_onebit_masks[mask_idx]);
}
// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
*ptr = or128(mask1bit128(n), *ptr);
}
// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
*ptr = andnot128(mask1bit128(n), *ptr);
}
// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
const m128 mask = mask1bit128(n);
return isnonzero128(and128(mask, val));
}
static really_inline
m128 pshufb_m128(m128 a, m128 b) {
/* On Intel, if bit 0x80 of an index byte is set the result byte is zero;
otherwise the low nibble (&0xf) selects the source lane.
On NEON or PPC, an index >= 16 yields zero, otherwise it selects that lane.
Below is the Intel behaviour converted to PPC. */
uint8x16_t mask =(uint8x16_t)vec_cmpge((uint8x16_t)b, (uint8x16_t)vec_splats((uint8_t)0x80));
uint8x16_t res = vec_perm ((uint8x16_t)a, (uint8x16_t)a, (uint8x16_t)b);
return (m128) vec_sel((uint8x16_t)res, (uint8x16_t)zeroes128(), (uint8x16_t)mask);
}
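/* Illustrative example (an assumption, not from the original source):
 *     m128 idx = set1_16x8(0x80);      // every index byte has bit 0x80 set
 *     m128 r   = pshufb_m128(v, idx);  // r is all zeroes, matching SSSE3 semantics
 */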
static really_inline
m128 max_u8_m128(m128 a, m128 b) {
return (m128) vec_max((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 min_u8_m128(m128 a, m128 b) {
return (m128) vec_min((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
return (m128) vec_adds((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
return (m128) vec_sub((uint8x16_t)a, (uint8x16_t)b);
}
static really_inline
m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) {
uint32x4_t v = { x0, x1, x2, x3 };
return (m128) v;
}
static really_inline
m128 set2x64(u64a hi, u64a lo) {
uint64x2_t v = { lo, hi };
return (m128) v;
}
#endif // ARCH_PPC64EL_SIMD_UTILS_H

View File

@ -29,7 +29,106 @@
template <> template <>
really_really_inline really_really_inline
const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> v) { const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
u32 pos = ctz32(z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z)) {
u32 pos = ctz32(z);
assert(pos < 32);
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
z &= mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = ctz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64);
return buf + pos;
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
u32 pos = clz32(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z)) {
u32 pos = clz32(z);
assert(pos < 32);
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + (31 - pos);
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
z &= mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = clz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64);
return buf + (63 - pos);
} else {
return NULL; // no match
}
}
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask(); SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z); DEBUG_PRINTF("z %08x\n", z);
@ -46,7 +145,7 @@ const u8 *firstMatch<16>(const u8 *buf, SuperVector<16> v) {
template <> template <>
really_really_inline really_really_inline
const u8 *firstMatch<32>(const u8 *buf, SuperVector<32> v) { const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask(); SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z); DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z != 0xffffffff)) { if (unlikely(z != 0xffffffff)) {
@ -60,11 +159,15 @@ const u8 *firstMatch<32>(const u8 *buf, SuperVector<32> v) {
} }
template <> template <>
really_really_inline really_really_inline
const u8 *firstMatch<64>(const u8 *buf, SuperVector<64>v) { const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask(); SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z); DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z != ~0ULL)) { u64a mask = (~0ULL) >> (64 - len);
u32 pos = ctz64(~z); DEBUG_PRINTF("mask %016llx\n", mask);
z = ~z & mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = ctz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos); DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64); assert(pos < 64);
return buf + pos; return buf + pos;
@ -75,7 +178,7 @@ const u8 *firstMatch<64>(const u8 *buf, SuperVector<64>v) {
template <> template <>
really_really_inline really_really_inline
const u8 *lastMatch<16>(const u8 *buf, SuperVector<16> v) { const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
SuperVector<16>::movemask_type z = v.movemask(); SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z); DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z); DEBUG_PRINTF("z %08x\n", z);
@ -92,10 +195,10 @@ const u8 *lastMatch<16>(const u8 *buf, SuperVector<16> v) {
template<> template<>
really_really_inline really_really_inline
const u8 *lastMatch<32>(const u8 *buf, SuperVector<32> v) { const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask(); SuperVector<32>::movemask_type z = v.movemask();
if (unlikely(z != 0xffffffff)) { if (unlikely(z != 0xffffffff)) {
u32 pos = clz32(~z); u32 pos = clz32(~z & 0xffffffff);
DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
assert(pos < 32); assert(pos < 32);
return buf + (31 - pos); return buf + (31 - pos);
@ -106,11 +209,17 @@ const u8 *lastMatch<32>(const u8 *buf, SuperVector<32> v) {
template <> template <>
really_really_inline really_really_inline
const u8 *lastMatch<64>(const u8 *buf, SuperVector<64> v) { const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
v.print8("v");
SuperVector<64>::movemask_type z = v.movemask(); SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z); DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z != ~0ULL)) { u64a mask = (~0ULL) >> (64 - len);
u32 pos = clz64(~z); DEBUG_PRINTF("mask %016llx\n", mask);
z = ~z & mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = clz64(z);
DEBUG_PRINTF("~z 0x%016llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos); DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64); assert(pos < 64);
return buf + (63 - pos); return buf + (63 - pos);

View File

@ -30,7 +30,7 @@
#ifndef SIMD_TYPES_X86_H #ifndef SIMD_TYPES_X86_H
#define SIMD_TYPES_X86_H #define SIMD_TYPES_X86_H
#if !defined(m128) && defined(HAVE_SSE2) #if !defined(m128) && defined(HAVE_SSE42)
typedef __m128i m128; typedef __m128i m128;
#endif #endif

View File

@ -49,6 +49,8 @@
#include "util/arch/x86/bitutils.h" #include "util/arch/x86/bitutils.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/bitutils.h" #include "util/arch/arm/bitutils.h"
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/bitutils.h"
#endif #endif
static really_inline static really_inline

View File

@ -49,6 +49,10 @@
# define USE_ARM_NEON_H # define USE_ARM_NEON_H
#endif #endif
#if defined(HAVE_C_PPC64EL_ALTIVEC_H)
# define USE_PPC64EL_ALTIVEC_H
#endif
#ifdef __cplusplus #ifdef __cplusplus
# if defined(HAVE_CXX_INTRIN_H) # if defined(HAVE_CXX_INTRIN_H)
# define USE_INTRIN_H # define USE_INTRIN_H
@ -68,6 +72,8 @@
# if defined(HAVE_SVE) # if defined(HAVE_SVE)
# include <arm_sve.h> # include <arm_sve.h>
# endif # endif
#elif defined(USE_PPC64EL_ALTIVEC_H)
#include <altivec.h>
#else #else
#error no intrinsics file #error no intrinsics file
#endif #endif

View File

@ -38,15 +38,23 @@
#include "util/supervector/supervector.hpp" #include "util/supervector/supervector.hpp"
template <u16 S> template <u16 S>
const u8 *firstMatch(const u8 *buf, SuperVector<S> v); const u8 *first_non_zero_match(const u8 *buf, SuperVector<S> v, u16 const len = S);
template <u16 S> template <u16 S>
const u8 *lastMatch(const u8 *buf, SuperVector<S> v); const u8 *last_non_zero_match(const u8 *buf, SuperVector<S> v, u16 const len = S);
template <u16 S>
const u8 *first_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 const len = S);
template <u16 S>
const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S);
#if defined(ARCH_IA32) || defined(ARCH_X86_64) #if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/match.hpp" #include "util/arch/x86/match.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/match.hpp" #include "util/arch/arm/match.hpp"
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/match.hpp"
#endif #endif
#endif // MATCH_HPP #endif // MATCH_HPP

View File

@ -38,6 +38,8 @@
#include "util/arch/x86/simd_types.h" #include "util/arch/x86/simd_types.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/simd_types.h" #include "util/arch/arm/simd_types.h"
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/simd_types.h"
#endif #endif
#if !defined(m128) && !defined(HAVE_SIMD_128_BITS) #if !defined(m128) && !defined(HAVE_SIMD_128_BITS)
@ -49,6 +51,7 @@ typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
#endif #endif
typedef struct {m128 lo; m128 mid; m128 hi;} m384; typedef struct {m128 lo; m128 mid; m128 hi;} m384;
#if !defined(m512) && !defined(HAVE_SIMD_512_BITS) #if !defined(m512) && !defined(HAVE_SIMD_512_BITS)
typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
#endif #endif

View File

@ -65,6 +65,8 @@ extern const char vbs_mask_data[];
#include "util/arch/x86/simd_utils.h" #include "util/arch/x86/simd_utils.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/simd_utils.h" #include "util/arch/arm/simd_utils.h"
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/simd_utils.h"
#endif #endif
#include "util/arch/common/simd_utils.h" #include "util/arch/common/simd_utils.h"

View File

@ -45,72 +45,114 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int8x16_t>(int8x16_t const other) really_inline SuperVector<16>::SuperVector(int8x16_t other)
{ {
u.v128[0] = static_cast<m128>(other); u.s8x16[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint8x16_t>(uint8x16_t const other) really_inline SuperVector<16>::SuperVector(uint8x16_t other)
{ {
u.v128[0] = static_cast<m128>(other); u.u8x16[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other) really_inline SuperVector<16>::SuperVector(int16x8_t other)
{ {
u.v128[0] = vdupq_n_s8(other); u.s16x8[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other) really_inline SuperVector<16>::SuperVector(uint16x8_t other)
{ {
u.v128[0] = vdupq_n_u8(other); u.u16x8[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other) really_inline SuperVector<16>::SuperVector(int32x4_t other)
{ {
u.v128[0] = vdupq_n_s16(other); u.s32x4[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other) really_inline SuperVector<16>::SuperVector(uint32x4_t other)
{ {
u.v128[0] = vdupq_n_u16(other); u.u32x4[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other) really_inline SuperVector<16>::SuperVector(int64x2_t other)
{ {
u.v128[0] = vdupq_n_s32(other); u.s64x2[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other) really_inline SuperVector<16>::SuperVector(uint64x2_t other)
{ {
u.v128[0] = vdupq_n_u32(other); u.u64x2[0] = other;
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other) really_inline SuperVector<16>::SuperVector(int8_t const other)
{ {
u.v128[0] = vdupq_n_s64(other); u.s8x16[0] = vdupq_n_s8(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other) really_inline SuperVector<16>::SuperVector(uint8_t const other)
{ {
u.v128[0] = vdupq_n_u64(other); u.u8x16[0] = vdupq_n_u8(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int16_t const other)
{
u.s16x8[0] = vdupq_n_s16(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint16_t const other)
{
u.u16x8[0] = vdupq_n_u16(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int32_t const other)
{
u.s32x4[0] = vdupq_n_s32(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint32_t const other)
{
u.u32x4[0] = vdupq_n_u32(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int64_t const other)
{
u.s64x2[0] = vdupq_n_s64(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint64_t const other)
{
u.u64x2[0] = vdupq_n_u64(other);
} }
// Constants // Constants
@ -137,37 +179,37 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
{ {
return {vandq_s8(u.v128[0], b.u.v128[0])}; return {vandq_u8(u.u8x16[0], b.u.u8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
{ {
return {vorrq_s8(u.v128[0], b.u.v128[0])}; return {vorrq_u8(u.u8x16[0], b.u.u8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
{ {
return {veorq_s8(u.v128[0], b.u.v128[0])}; return {veorq_u8(u.u8x16[0], b.u.u8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator!() const really_inline SuperVector<16> SuperVector<16>::operator!() const
{ {
return {vmvnq_s8(u.v128[0])}; return {vmvnq_u8(u.u8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
{ {
return {vandq_s8(vmvnq_s8(u.v128[0]), b.u.v128[0])}; return {vandq_u8(vmvnq_u8(u.u8x16[0]), b.u.u8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
{ {
return {vceqq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; return {vceqq_u8(u.u8x16[0], b.u.u8x16[0])};
} }
template <> template <>
@ -179,25 +221,25 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
{ {
return {vcgtq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; return {vcgtq_s8(u.s8x16[0], b.u.s8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
{ {
return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; return {vcgeq_u8(u.u8x16[0], b.u.u8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
{ {
return {vcltq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; return {vcltq_s8(u.s8x16[0], b.u.s8x16[0])};
} }
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
{ {
return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; return {vcgeq_s8(u.s8x16[0], b.u.s8x16[0])};
} }
template <> template <>
@ -209,12 +251,12 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
template <> template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const
{ {
SuperVector powers{0x8040201008040201UL}; SuperVector powers = SuperVector::dup_u64(0x8040201008040201UL);
// Compute the mask from the input // Compute the mask from the input
uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint16x8_t)u.v128[0], powers.u.v128[0])))); uint8x16_t mask = (uint8x16_t) vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(u.u8x16[0], powers.u.u8x16[0]))));
uint64x2_t mask1 = (m128)vextq_s8(mask, vdupq_n_u8(0), 7); uint64x2_t mask1 = (uint64x2_t) vextq_u8(mask, vdupq_n_u8(0), 7);
mask = vorrq_u8(mask, mask1); mask = vorrq_u8(mask, (uint8x16_t) mask1);
// Get the resulting bytes // Get the resulting bytes
uint16_t output; uint16_t output;
@ -232,35 +274,35 @@ template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
{ {
return {(m128)vshlq_n_s8(u.v128[0], N)}; return {vshlq_n_u8(u.u8x16[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
{ {
return {(m128)vshlq_n_s16(u.v128[0], N)}; return {vshlq_n_u16(u.u16x8[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
{ {
return {(m128)vshlq_n_s32(u.v128[0], N)}; return {vshlq_n_u32(u.u32x4[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
{ {
return {(m128)vshlq_n_s64(u.v128[0], N)}; return {vshlq_n_u64(u.u64x2[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
{ {
return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
} }
template <> template <>
@ -274,35 +316,35 @@ template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
{ {
return {(m128)vshrq_n_s8(u.v128[0], N)}; return {vshrq_n_u8(u.u8x16[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
{ {
return {(m128)vshrq_n_s16(u.v128[0], N)}; return {vshrq_n_u16(u.u16x8[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
{ {
return {(m128)vshrq_n_s32(u.v128[0], N)}; return {vshrq_n_u32(u.u32x4[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
{ {
return {(m128)vshrq_n_s64(u.v128[0], N)}; return {vshrq_n_u64(u.u64x2[0], N)};
} }
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
{ {
return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
} }
template <> template <>
@ -334,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 16) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s8(u.v128[0], n)}; }); Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
return result; return result;
} }
@ -344,7 +386,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 16) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s16(u.v128[0], n)}; }); Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
return result; return result;
} }
@ -352,9 +394,9 @@ template <>
really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
{ {
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 32) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s32(u.v128[0], n)}; }); Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
return result; return result;
} }
@ -362,9 +404,9 @@ template <>
really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
{ {
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 64) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s64(u.v128[0], n)}; }); Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
return result; return result;
} }
@ -374,7 +416,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 16) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - n)}; }); Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
return result; return result;
} }
@ -388,9 +430,9 @@ template <>
really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
{ {
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 8) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s8(u.v128[0], n)}; }); Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
return result; return result;
} }
@ -400,7 +442,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 16) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s16(u.v128[0], n)}; }); Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
return result; return result;
} }
@ -408,9 +450,9 @@ template <>
really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
{ {
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 32) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s32(u.v128[0], n)}; }); Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
return result; return result;
} }
@ -418,9 +460,9 @@ template <>
really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
{ {
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 64) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s64(u.v128[0], n)}; }); Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
return result; return result;
} }
@ -430,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return Zeroes(); if (N == 16) return Zeroes();
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), n)}; }); Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
return result; return result;
} }
@ -440,34 +482,27 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
return vshr_128(N); return vshr_128(N);
} }
#ifdef HS_OPTIMIZE
template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{
return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)};
}
#else
template <> template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{ {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(N)) {
return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)};
}
#endif
     return vshr_128(N);
 }
-#endif
-#ifdef HS_OPTIMIZE
-template <>
-really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
-{
-    return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)};
-}
-#else
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(N)) {
+        return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)};
+    }
+#endif
     return vshl_128(N);
 }
-#endif
 template<>
 really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
@ -505,49 +540,46 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint
     return mask & v;
 }
-#ifdef HS_OPTIMIZE
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {
-    if (offset == 16) {
-        return *this;
-    } else {
-        return {vextq_s8((int16x8_t)other.u.v128[0], (int16x8_t)u.v128[0], offset)};
-    }
-}
-#else
-template<>
-really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
-{
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(offset)) {
+        if (offset == 16) {
+            return *this;
+        } else {
+            return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)};
+        }
+    }
+#endif
     switch(offset) {
     case 0: return other; break;
-    case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;
-    case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
-    case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
-    case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
-    case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
-    case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
-    case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
-    case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
-    case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
-    case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
-    case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
-    case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
-    case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
-    case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
-    case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
+    case 1: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 1)}; break;
+    case 2: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 2)}; break;
+    case 3: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 3)}; break;
+    case 4: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 4)}; break;
+    case 5: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 5)}; break;
+    case 6: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 6)}; break;
+    case 7: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 7)}; break;
+    case 8: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 8)}; break;
+    case 9: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 9)}; break;
+    case 10: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 10)}; break;
+    case 11: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 11)}; break;
+    case 12: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 12)}; break;
+    case 13: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 13)}; break;
+    case 14: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 14)}; break;
+    case 15: return {vextq_u8(other.u.u8x16[0], u.u8x16[0], 15)}; break;
     case 16: return *this; break;
     default: break;
     }
     return *this;
 }
-#endif
 template<>
 template<>
 really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
 {
-    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0])};
+    return {vqtbl1q_u8(u.u8x16[0], b.u.u8x16[0])};
 }
 template<>
@ -565,7 +597,7 @@ template<>
 really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
 {
     SuperVector mask = Ones_vshr(16 -len);
-    return mask & pshufb<true>(b);
+    return mask & pshufb(b);
 }
 #endif // SIMD_IMPL_HPP
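The HAVE__BUILTIN_CONSTANT_P changes in this file all follow the same dispatch idea: one function asks the compiler whether its argument is a compile-time constant and, if so, takes the immediate-operand fast path, otherwise it falls through to the generic runtime path. A rough stand-alone sketch of that pattern (the shift_fast/shift_generic helpers are illustrative names, not code from this commit):
#include <cstdint>
// Stand-ins for an intrinsic that wants a compile-time immediate and the
// table-driven fallback that accepts any runtime value (n < 32 assumed).
static inline uint32_t shift_fast(uint32_t v, unsigned n)    { return v << n; }
static inline uint32_t shift_generic(uint32_t v, unsigned n) { return v << n; }
static inline uint32_t shift_left(uint32_t v, unsigned n)
{
#if defined(__GNUC__) || defined(__clang__)
    // __builtin_constant_p(n) is true only when the compiler can prove n is a
    // constant at this call site, so callers passing a literal get the
    // immediate path and everyone else falls through to the generic one.
    if (__builtin_constant_p(n)) {
        return shift_fast(v, n);
    }
#endif
    return shift_generic(v, n);
}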

View File

@ -0,0 +1,593 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIMD_IMPL_HPP
#define SIMD_IMPL_HPP
#include <cstdint>
#include <cstdio>
#include "ue2common.h"
#include "util/arch.h"
#include "util/unaligned.h"
#include "util/supervector/supervector.hpp"
#include <iostream>
// 128-bit Powerpc64le implementation
template<>
really_inline SuperVector<16>::SuperVector(SuperVector const &other)
{
u.v128[0] = other.u.v128[0];
}
template<>
really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
{
u.v128[0] = v;
};
template<>
template<>
really_inline SuperVector<16>::SuperVector(int8_t const other)
{
u.v128[0] = (m128) vec_splats(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint8_t const other)
{
u.v128[0] = (m128) vec_splats(static_cast<uint8_t>(other));
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int16_t const other)
{
u.v128[0] = (m128) vec_splats(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint16_t const other)
{
u.v128[0] = (m128) vec_splats(static_cast<uint16_t>(other));
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int32_t const other)
{
u.v128[0] = (m128) vec_splats(other);
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint32_t const other)
{
u.v128[0] = (m128) vec_splats(static_cast<uint32_t>(other));
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(int64_t const other)
{
u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
}
template<>
template<>
really_inline SuperVector<16>::SuperVector(uint64_t const other)
{
u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
}
// Constants
template<>
really_inline SuperVector<16> SuperVector<16>::Ones(void)
{
return {(m128) vec_splat_s8(-1)};
}
template<>
really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
{
return {(m128) vec_splat_s8(0)};
}
// Methods
template <>
really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
{
u.v128[0] = other.u.v128[0];
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
{
return {vec_and(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
{
return {vec_or(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
{
return {(m128) vec_xor(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator!() const
{
return {(m128) vec_xor(u.v128[0], u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
{
m128 not_res = vec_xor(u.v128[0], (m128)vec_splat_s8(-1));
return {(m128) vec_and(not_res, (m128)b.u.v128[0]) };
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
{
return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
{
return !(*this == b);
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
{
return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
{
return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
{
return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
{
return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])};
}
template <>
really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
{
return (*this == b);
}
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
{
uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14));
uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9);
uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
return s5[0];
}
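The movemask() above folds the most-significant bit of each of the 16 bytes down into a single 16-bit value through successive shift/and/or stages. For reference, a scalar model of the value it is meant to produce (a sketch for illustration, not code from this commit):
#include <cstdint>
// Scalar equivalent of a 128-bit byte movemask: bit i of the result is the
// most-significant bit of byte i. The vector code above computes the same
// value without leaving the VSX registers.
static inline uint16_t movemask_scalar(const uint8_t (&bytes)[16])
{
    uint16_t mask = 0;
    for (int i = 0; i < 16; ++i) {
        mask |= static_cast<uint16_t>((bytes[i] >> 7) & 1u) << i;
    }
    return mask;
}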
template <>
really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
{
return eq(b).movemask();
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
{
return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
{
return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
{
return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
{
return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
{
return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)};
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
{
return vshl_128_imm<N>();
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
{
return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
{
return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
{
return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
{
return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
{
return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) };
}
template <>
template<uint8_t N>
really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
{
return vshr_128_imm<N>();
}
#if !defined(HS_OPTIMIZE)
template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const;
template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const;
template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
#endif
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
{
return vshl_128(N);
}
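The vec_sl/vec_sld intrinsics used above require their shift amount as a compile-time constant, so the vshl_*/vshr_* routines unroll over every possible count with Unroller and select the matching instantiation at run time. A simplified stand-alone sketch of that trick, with a hand-rolled for_each_constant standing in for the library's Unroller (names and structure are illustrative assumptions, not the library's own definition):
#include <cstdint>
#include <type_traits>
// Call f with an integral_constant for every value in [Begin, End), so the
// body can use the index as a constexpr immediate.
template <int Begin, int End, typename F>
void for_each_constant(F f)
{
    if constexpr (Begin < End) {
        f(std::integral_constant<int, Begin>{});
        for_each_constant<Begin + 1, End>(f);
    }
}
// Stands in for an intrinsic that only accepts an immediate shift count.
template <int N>
uint32_t shift_by(uint32_t v) { return v << N; }
// Turn a runtime shift count into a compile-time immediate.
uint32_t shift_runtime(uint32_t v, int count)
{
    uint32_t result = v;   // count == 0 leaves the value unchanged
    for_each_constant<1, 32>([&](auto i) {
        constexpr int n = decltype(i)::value;
        if (count == n) result = shift_by<n>(v);
    });
    return result;
}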
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const
{
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; });
return result;
}
template <>
really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
{
return vshr_128(N);
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{
switch(N) {
case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 15)}; break;
case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 14)}; break;
case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 13)}; break;
case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 12)}; break;
case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 11)}; break;
case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 10)}; break;
case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 9)}; break;
case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 8)}; break;
case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 7)}; break;
case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break;
case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break;
case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break;
case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break;
case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break;
case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break;
case 16: return Zeroes(); break;
default: break;
}
return *this;
}
template <>
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
{
switch(N) {
case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break;
case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break;
case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break;
case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break;
case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break;
case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break;
case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break;
case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break;
case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break;
case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break;
case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; break;
case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 12)}; break;
case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break;
case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break;
case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 15)}; break;
case 16: return Zeroes(); break;
default: break;
}
return *this;
}
template<>
really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
{
return Ones().vshr_128(N);
}
template<>
really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
{
return Ones().vshl_128(N);
}
template <>
really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
{
return (m128) vec_xl(0, (const long64_t*)ptr);
}
template <>
really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
{
assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
return (m128) vec_xl(0, (const long64_t*)ptr);
}
template <>
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
{
SuperVector<16> mask = Ones_vshr(16 -len);
mask.print8("mask");
SuperVector<16> v = loadu(ptr);
v.print8("v");
return mask & v;
}
template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
{
switch(offset) {
case 0: return other; break;
case 1: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 15)}; break;
case 2: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 14)}; break;
case 3: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 13)}; break;
case 4: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 12)}; break;
case 5: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 11)}; break;
case 6: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 10)}; break;
case 7: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 9)}; break;
case 8: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 8)}; break;
case 9: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 7)}; break;
case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break;
case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break;
case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break;
case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break;
case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break;
case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break;
default: break;
}
return *this;
}
template<>
template<>
really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
{
/* On Intel, if bit 0x80 of a control byte is set, the result byte is zero;
   otherwise the control byte's low nibble (&0xf) selects the source lane.
   On NEON or PPC, an index >= 16 yields zero, otherwise it selects that lane.
   Below is the Intel behaviour converted to PPC. */
uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80));
uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]);
return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask);
}
template<>
template<>
really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
{
/* On Intel, if bit 0x80 of a control byte is set, the result byte is zero;
   otherwise the control byte's low nibble (&0xf) selects the source lane.
   On NEON or PPC, an index >= 16 yields zero, otherwise it selects that lane.
   btranslated holds the control bytes converted from the Intel convention to the PPC one. */
SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f);
return pshufb<false>(btranslated);
}
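For reference, the Intel-style shuffle behaviour that the two pshufb overloads above emulate can be written as a scalar loop (a hedged sketch, not part of the source):
#include <cstdint>
// Scalar model of the Intel PSHUFB convention: a control byte with bit 0x80
// set produces zero, otherwise its low nibble picks the source lane.
static inline void pshufb_scalar(const uint8_t (&src)[16],
                                 const uint8_t (&control)[16],
                                 uint8_t (&out)[16])
{
    for (int i = 0; i < 16; ++i) {
        out[i] = (control[i] & 0x80) ? 0 : src[control[i] & 0x0f];
    }
}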
template<>
really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
{
SuperVector<16> mask = Ones_vshr(16 -len);
return mask & pshufb(b);
}
#endif

View File

@ -0,0 +1,44 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
typedef __vector unsigned long long int uint64x2_t;
typedef __vector signed long long int int64x2_t;
typedef __vector unsigned int uint32x4_t;
typedef __vector signed int int32x4_t;
typedef __vector unsigned short int uint16x8_t;
typedef __vector signed short int int16x8_t;
typedef __vector unsigned char uint8x16_t;
typedef __vector signed char int8x16_t;
typedef unsigned long long int ulong64_t;
typedef signed long long int long64_t;
#if !defined(m128) && defined(HAVE_VSX)
typedef __vector int m128;
#endif
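These typedefs give the PPC64LE path the same lane-typed views that the ARM path gets from arm_neon.h. A compile-time sanity check along the following lines (not part of the original header, assuming a VSX-enabled compiler) documents the layout the SuperVector union relies on:
// Each 128-bit VSX vector type must cover exactly 16 bytes, regardless of
// lane width. Hedged illustration only; place after the typedefs above.
static_assert(sizeof(uint64x2_t) == 16, "2 x 64-bit lanes");
static_assert(sizeof(uint32x4_t) == 16, "4 x 32-bit lanes");
static_assert(sizeof(uint16x8_t) == 16, "8 x 16-bit lanes");
static_assert(sizeof(uint8x16_t) == 16, "16 x 8-bit lanes");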

View File

@ -55,56 +55,56 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other) really_inline SuperVector<16>::SuperVector(int8_t const other)
{ {
u.v128[0] = _mm_set1_epi8(other); u.v128[0] = _mm_set1_epi8(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other) really_inline SuperVector<16>::SuperVector(uint8_t const other)
{ {
u.v128[0] = _mm_set1_epi8(static_cast<int8_t>(other)); u.v128[0] = _mm_set1_epi8(static_cast<int8_t>(other));
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other) really_inline SuperVector<16>::SuperVector(int16_t const other)
{ {
u.v128[0] = _mm_set1_epi16(other); u.v128[0] = _mm_set1_epi16(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other) really_inline SuperVector<16>::SuperVector(uint16_t const other)
{ {
u.v128[0] = _mm_set1_epi16(static_cast<int16_t>(other)); u.v128[0] = _mm_set1_epi16(static_cast<int16_t>(other));
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other) really_inline SuperVector<16>::SuperVector(int32_t const other)
{ {
u.v128[0] = _mm_set1_epi32(other); u.v128[0] = _mm_set1_epi32(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other) really_inline SuperVector<16>::SuperVector(uint32_t const other)
{ {
u.v128[0] = _mm_set1_epi32(static_cast<int32_t>(other)); u.v128[0] = _mm_set1_epi32(static_cast<int32_t>(other));
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other) really_inline SuperVector<16>::SuperVector(int64_t const other)
{ {
u.v128[0] = _mm_set1_epi64x(other); u.v128[0] = _mm_set1_epi64x(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other) really_inline SuperVector<16>::SuperVector(uint64_t const other)
{ {
u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(other)); u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(other));
} }
@ -520,16 +520,18 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint
return mask & v; return mask & v;
} }
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
{
return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], offset)};
}
#else
template<> template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
{ {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(offset)) {
if (offset == 16) {
return *this;
} else {
return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], offset)};
}
}
#endif
switch(offset) { switch(offset) {
case 0: return other; break; case 0: return other; break;
case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break; case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break;
@ -551,7 +553,6 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in
} }
return *this; return *this;
} }
#endif
template<> template<>
template<> template<>
@ -607,56 +608,56 @@ really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<int8_t>(int8_t const other) really_inline SuperVector<32>::SuperVector(int8_t const other)
{ {
u.v256[0] = _mm256_set1_epi8(other); u.v256[0] = _mm256_set1_epi8(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<uint8_t>(uint8_t const other) really_inline SuperVector<32>::SuperVector(uint8_t const other)
{ {
u.v256[0] = _mm256_set1_epi8(static_cast<int8_t>(other)); u.v256[0] = _mm256_set1_epi8(static_cast<int8_t>(other));
} }
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<int16_t>(int16_t const other) really_inline SuperVector<32>::SuperVector(int16_t const other)
{ {
u.v256[0] = _mm256_set1_epi16(other); u.v256[0] = _mm256_set1_epi16(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<uint16_t>(uint16_t const other) really_inline SuperVector<32>::SuperVector(uint16_t const other)
{ {
u.v256[0] = _mm256_set1_epi16(static_cast<int16_t>(other)); u.v256[0] = _mm256_set1_epi16(static_cast<int16_t>(other));
} }
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<int32_t>(int32_t const other) really_inline SuperVector<32>::SuperVector(int32_t const other)
{ {
u.v256[0] = _mm256_set1_epi32(other); u.v256[0] = _mm256_set1_epi32(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<uint32_t>(uint32_t const other) really_inline SuperVector<32>::SuperVector(uint32_t const other)
{ {
u.v256[0] = _mm256_set1_epi32(static_cast<int32_t>(other)); u.v256[0] = _mm256_set1_epi32(static_cast<int32_t>(other));
} }
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<int64_t>(int64_t const other) really_inline SuperVector<32>::SuperVector(int64_t const other)
{ {
u.v256[0] = _mm256_set1_epi64x(other); u.v256[0] = _mm256_set1_epi64x(other);
} }
template<> template<>
template<> template<>
really_inline SuperVector<32>::SuperVector<uint64_t>(uint64_t const other) really_inline SuperVector<32>::SuperVector(uint64_t const other)
{ {
u.v256[0] = _mm256_set1_epi64x(static_cast<int64_t>(other)); u.v256[0] = _mm256_set1_epi64x(static_cast<int64_t>(other));
} }
@ -803,7 +804,7 @@ really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const
template <> template <>
template<uint8_t N> template<uint8_t N>
really_inline SuperVector<16> SuperVector<32>::vshl_256_imm() const really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const
{ {
if (N == 0) return *this; if (N == 0) return *this;
if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
@ -949,11 +950,11 @@ really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const
SuperVector result; SuperVector result;
Unroller<1, 16>::iterator([&,v=this](auto const i) { Unroller<1, 16>::iterator([&,v=this](auto const i) {
constexpr uint8_t n = i.value; constexpr uint8_t n = i.value;
if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};; if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};;
}); });
Unroller<17, 32>::iterator([&,v=this](auto const i) { Unroller<17, 32>::iterator([&,v=this](auto const i) {
constexpr uint8_t n = i.value; constexpr uint8_t n = i.value;
if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)}; if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)};
}); });
return result; return result;
} }
@ -1037,47 +1038,41 @@ really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const
return vshr_256(N); return vshr_256(N);
} }
#ifdef HS_OPTIMIZE
template <> template <>
really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
{ {
// As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx #if defined(HAVE__BUILTIN_CONSTANT_P)
if (N < 16) { if (__builtin_constant_p(N)) {
return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)}; // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
} else if (N == 16) { if (N < 16) {
return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)};
} else { } else if (N == 16) {
return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)}; return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
} else {
return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
}
} }
} #endif
#else
template <>
really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
{
return vshr_256(N); return vshr_256(N);
} }
#endif
#ifdef HS_OPTIMIZE
template <> template <>
really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
{ {
// As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx #if defined(HAVE__BUILTIN_CONSTANT_P)
if (N < 16) { if (__builtin_constant_p(N)) {
return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
} else if (N == 16) { if (N < 16) {
return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
} else { } else if (N == 16) {
return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)}; return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
} else {
return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
}
} }
} #endif
#else
template <>
really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
{
return vshl_256(N); return vshl_256(N);
} }
#endif
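Both AVX2 shift operators above keep the trick from the referenced StackOverflow answer: a full 256-bit byte shift is stitched together from a cross-lane permute plus a 128-bit alignr or shift, split into the N < 16, N == 16 and N > 16 cases. The end result being built is simply a whole-register byte shift; a scalar model of the right-shift case (illustrative sketch only, not code from this commit):
#include <cstdint>
// Scalar model of shifting a 32-byte value right by n bytes: byte i of the
// result is byte i+n of the input, or zero once we run off the end. The AVX2
// code above assembles the same result from two 128-bit lanes.
static inline void shr_256_bytes(const uint8_t (&in)[32], unsigned n, uint8_t (&out)[32])
{
    for (unsigned i = 0; i < 32; ++i) {
        out[i] = (i + n < 32) ? in[i + n] : 0;
    }
}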
template<> template<>
really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N) really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N)
@ -1132,16 +1127,18 @@ really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint
#endif #endif
} }
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
{
return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], offset)};
}
#else
template<> template<>
really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset) really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
{ {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(offset)) {
if (offset == 16) {
return *this;
} else {
return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], offset)};
}
}
#endif
// As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458 // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458
switch (offset){ switch (offset){
case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break; case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break;
@ -1180,7 +1177,6 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, in
} }
return *this; return *this;
} }
#endif
template<> template<>
template<> template<>
@ -1244,56 +1240,56 @@ really_inline SuperVector<64>::SuperVector(m128 const v)
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<int8_t>(int8_t const o) really_inline SuperVector<64>::SuperVector(int8_t const o)
{ {
u.v512[0] = _mm512_set1_epi8(o); u.v512[0] = _mm512_set1_epi8(o);
} }
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<uint8_t>(uint8_t const o) really_inline SuperVector<64>::SuperVector(uint8_t const o)
{ {
u.v512[0] = _mm512_set1_epi8(static_cast<int8_t>(o)); u.v512[0] = _mm512_set1_epi8(static_cast<int8_t>(o));
} }
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<int16_t>(int16_t const o) really_inline SuperVector<64>::SuperVector(int16_t const o)
{ {
u.v512[0] = _mm512_set1_epi16(o); u.v512[0] = _mm512_set1_epi16(o);
} }
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<uint16_t>(uint16_t const o) really_inline SuperVector<64>::SuperVector(uint16_t const o)
{ {
u.v512[0] = _mm512_set1_epi16(static_cast<int16_t>(o)); u.v512[0] = _mm512_set1_epi16(static_cast<int16_t>(o));
} }
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<int32_t>(int32_t const o) really_inline SuperVector<64>::SuperVector(int32_t const o)
{ {
u.v512[0] = _mm512_set1_epi32(o); u.v512[0] = _mm512_set1_epi32(o);
} }
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<uint32_t>(uint32_t const o) really_inline SuperVector<64>::SuperVector(uint32_t const o)
{ {
u.v512[0] = _mm512_set1_epi32(static_cast<int32_t>(o)); u.v512[0] = _mm512_set1_epi32(static_cast<int32_t>(o));
} }
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<int64_t>(int64_t const o) really_inline SuperVector<64>::SuperVector(int64_t const o)
{ {
u.v512[0] = _mm512_set1_epi64(o); u.v512[0] = _mm512_set1_epi64(o);
} }
template<> template<>
template<> template<>
really_inline SuperVector<64>::SuperVector<uint64_t>(uint64_t const o) really_inline SuperVector<64>::SuperVector(uint64_t const o)
{ {
u.v512[0] = _mm512_set1_epi64(static_cast<int64_t>(o)); u.v512[0] = _mm512_set1_epi64(static_cast<int64_t>(o));
} }
@ -1772,16 +1768,18 @@ really_inline SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b, u
return {_mm512_maskz_shuffle_epi8(mask, u.v512[0], b.u.v512[0])}; return {_mm512_maskz_shuffle_epi8(mask, u.v512[0], b.u.v512[0])};
} }
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset)
{
return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], offset)};
}
#else
template<> template<>
really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset) really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset)
{ {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(offset)) {
if (offset == 16) {
return *this;
} else {
return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], offset)};
}
}
#endif
if(offset == 0) { if(offset == 0) {
return *this; return *this;
} else if (offset < 32){ } else if (offset < 32){
@ -1802,7 +1800,6 @@ really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t
return *this; return *this;
} }
} }
#endif
#endif // HAVE_AVX512 #endif // HAVE_AVX512

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CASEMASK_HPP
#define CASEMASK_HPP
#include "util/supervector/supervector.hpp"
static u8 CASEMASK[] = { 0xff, 0xdf };
static really_inline
u8 caseClear8(u8 x, bool noCase)
{
return static_cast<u8>(x & CASEMASK[(u8)noCase]);
}
template<uint16_t S>
static really_inline SuperVector<S> getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
return SuperVector<S>(k);
}
template<uint16_t S>
static really_inline SuperVector<S> getCaseMask(void) {
return SuperVector<S>(CASEMASK[1]);
}
#endif // CASEMASK_HPP
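getCaseMask() splats 0xdf so that AND-ing a character vector clears bit 0x20, the bit that separates ASCII lower case from upper case, while the noCase == false entry (0xff) leaves bytes untouched. A small scalar illustration of the same idea (a hypothetical example, not taken from the source):
#include <cstdint>
#include <cstdio>
// Masking with 0xdf folds ASCII 'a'..'z' onto 'A'..'Z'; masking with 0xff is
// the identity. This mirrors caseClear8() above in scalar form.
int main()
{
    const uint8_t CASEMASK[] = { 0xff, 0xdf };
    for (int noCase = 0; noCase <= 1; ++noCase) {
        uint8_t c = static_cast<uint8_t>('g') & CASEMASK[noCase];
        std::printf("noCase=%d -> %c\n", noCase, c);   // prints 'g' then 'G'
    }
    return 0;
}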

View File

@ -38,6 +38,8 @@
#include "util/supervector/arch/x86/types.hpp" #include "util/supervector/arch/x86/types.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/supervector/arch/arm/types.hpp" #include "util/supervector/arch/arm/types.hpp"
#elif defined(ARCH_PPC64EL)
#include "util/supervector/arch/ppc64el/types.hpp"
#endif #endif
#if defined(HAVE_SIMD_512_BITS) #if defined(HAVE_SIMD_512_BITS)
@ -162,6 +164,18 @@ public:
typename BaseVector<16>::type ALIGN_ATTR(BaseVector<16>::size) v128[SIZE / BaseVector<16>::size]; typename BaseVector<16>::type ALIGN_ATTR(BaseVector<16>::size) v128[SIZE / BaseVector<16>::size];
typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size]; typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size];
typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size]; typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size];
#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)
uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size];
uint16x8_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size];
int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size];
int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
#endif
uint64_t u64[SIZE / sizeof(uint64_t)]; uint64_t u64[SIZE / sizeof(uint64_t)];
int64_t s64[SIZE / sizeof(int64_t)]; int64_t s64[SIZE / sizeof(int64_t)];
uint32_t u32[SIZE / sizeof(uint32_t)]; uint32_t u32[SIZE / sizeof(uint32_t)];
@ -175,12 +189,12 @@ public:
} u; } u;
constexpr SuperVector() {}; constexpr SuperVector() {};
constexpr SuperVector(SuperVector const &other) SuperVector(SuperVector const &other)
:u(other.u) {}; :u(other.u) {};
SuperVector(typename base_type::type const v); SuperVector(typename base_type::type const v);
template<typename T> template<typename T>
SuperVector(T const other); SuperVector(T other);
SuperVector(SuperVector<SIZE/2> const lo, SuperVector<SIZE/2> const hi); SuperVector(SuperVector<SIZE/2> const lo, SuperVector<SIZE/2> const hi);
SuperVector(previous_type const lo, previous_type const hi); SuperVector(previous_type const lo, previous_type const hi);
@ -353,6 +367,8 @@ struct Unroller<End, End>
#include "util/supervector/arch/x86/impl.cpp" #include "util/supervector/arch/x86/impl.cpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/supervector/arch/arm/impl.cpp" #include "util/supervector/arch/arm/impl.cpp"
#elif defined(ARCH_PPC64EL)
#include "util/supervector/arch/ppc64el/impl.cpp"
#endif #endif
#endif #endif

View File

@ -63,7 +63,7 @@ target_link_libraries(unit-hyperscan hs expressionutil)
endif() endif()
if (NOT (RELEASE_BUILD OR FAT_RUNTIME)) if (NOT FAT_RUNTIME )
set(unit_internal_SOURCES set(unit_internal_SOURCES
${gtest_SOURCES} ${gtest_SOURCES}
internal/bitfield.cpp internal/bitfield.cpp
@ -72,8 +72,6 @@ set(unit_internal_SOURCES
internal/compare.cpp internal/compare.cpp
internal/database.cpp internal/database.cpp
internal/depth.cpp internal/depth.cpp
internal/fdr.cpp
internal/fdr_flood.cpp
internal/fdr_loadval.cpp internal/fdr_loadval.cpp
internal/flat_set.cpp internal/flat_set.cpp
internal/flat_map.cpp internal/flat_map.cpp
@ -81,7 +79,6 @@ set(unit_internal_SOURCES
internal/graph_undirected.cpp internal/graph_undirected.cpp
internal/insertion_ordered.cpp internal/insertion_ordered.cpp
internal/lbr.cpp internal/lbr.cpp
internal/limex_nfa.cpp
internal/multi_bit.cpp internal/multi_bit.cpp
internal/multi_bit_compress.cpp internal/multi_bit_compress.cpp
internal/nfagraph_common.h internal/nfagraph_common.h
@ -121,13 +118,22 @@ if (BUILD_AVX2)
set(unit_internal_SOURCES set(unit_internal_SOURCES
${unit_internal_SOURCES} ${unit_internal_SOURCES}
internal/masked_move.cpp internal/masked_move.cpp
) )
endif(BUILD_AVX2) endif(BUILD_AVX2)
if (NOT RELEASE_BUILD)
set(unit_internal_SOURCES
${unit_internal_SOURCES}
internal/fdr.cpp
internal/fdr_flood.cpp
internal/limex_nfa.cpp
)
endif(NOT RELEASE_BUILD)
add_executable(unit-internal ${unit_internal_SOURCES}) add_executable(unit-internal ${unit_internal_SOURCES})
set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}") set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}")
target_link_libraries(unit-internal hs corpusomatic) target_link_libraries(unit-internal hs corpusomatic)
endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) endif(NOT FAT_RUNTIME)
if (BUILD_CHIMERA) if (BUILD_CHIMERA)
# enable Chimera unit tests # enable Chimera unit tests
@ -178,9 +184,10 @@ else()
else () else ()
add_custom_target( add_custom_target(
unit unit
COMMAND bin/unit-internal
COMMAND bin/unit-hyperscan COMMAND bin/unit-hyperscan
WORKING_DIRECTORY ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
DEPENDS unit-hyperscan DEPENDS unit-internal unit-hyperscan
) )
endif() endif()
endif() endif()

View File

@ -30,7 +30,7 @@
#include "config.h" #include "config.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "nfa/vermicelli.h" #include "nfa/vermicelli.hpp"
#define BOUND (~(VERM_BOUNDARY - 1)) #define BOUND (~(VERM_BOUNDARY - 1))
@ -563,4 +563,4 @@ TEST(RNVermicelli16, Exec5) {
} }
} }
#endif // HAVE_SVE2 #endif // HAVE_SVE2

View File

@ -183,11 +183,11 @@ void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) {
TEST(Shuffle, PackedExtract128_1) { TEST(Shuffle, PackedExtract128_1) {
// Try all possible one-bit masks // Try all possible one-bit masks
for (unsigned int i = 0; i < 128; i++) { for (unsigned int i = 0; i < 1; i++) {
// shuffle a single 1 bit to the front // shuffle a single 1 bit to the front
m128 permute, compare; m128 permute, compare;
build_pshufb_masks_onebit(i, &permute, &compare); build_pshufb_masks_onebit(i, &permute, &compare);
EXPECT_EQ(1U, packedExtract128(setbit<m128>(i), permute, compare)); EXPECT_EQ(1U, packedExtract128(setbit<m128>(i), permute, compare));
EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare));
// we should get zero out of these cases // we should get zero out of these cases
EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare));
@ -199,6 +199,7 @@ TEST(Shuffle, PackedExtract128_1) {
} }
} }
TEST(Shuffle, PackedExtract_templatized_128_1) { TEST(Shuffle, PackedExtract_templatized_128_1) {
// Try all possible one-bit masks // Try all possible one-bit masks
for (unsigned int i = 0; i < 128; i++) { for (unsigned int i = 0; i < 128; i++) {
@ -219,6 +220,7 @@ TEST(Shuffle, PackedExtract_templatized_128_1) {
} }
#if defined(HAVE_AVX2) #if defined(HAVE_AVX2)
TEST(Shuffle, PackedExtract256_1) { TEST(Shuffle, PackedExtract256_1) {
// Try all possible one-bit masks // Try all possible one-bit masks

View File

@ -667,7 +667,10 @@ TEST(SimdUtilsTest, movq) {
simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
int64x2_t a = { 0x123456789abcdefLL, ~0LL }; int64x2_t a = { 0x123456789abcdefLL, ~0LL };
simd = vreinterpretq_s64_s8(a); simd = vreinterpretq_s32_s64(a);
#elif defined(ARCH_PPC64EL)
int64x2_t a = {0x123456789abcdefLL, ~0LL };
simd = (m128) a;
#endif #endif
#endif #endif
r = movq(simd); r = movq(simd);
@ -816,4 +819,126 @@ TEST(SimdUtilsTest, sub_u8_m128) {
EXPECT_TRUE(!diff128(result, loadu128(expec))); EXPECT_TRUE(!diff128(result, loadu128(expec)));
} }
TEST(SimdUtilsTest, load_m128_from_u64a) {
srand (time(NULL));
u64a tmp = rand();
m128 res = load_m128_from_u64a(&tmp);
m128 cmp = set2x64(0LL, tmp);
//print_m128_16x8("res",res);
//print_m128_16x8("cmp",cmp);
EXPECT_TRUE(!diff128(res, cmp));
}
TEST(SimdUtilsTest, movemask_128) {
srand (time(NULL));
u8 vec[16] = {0};
u8 vec2[16] = {0};
u16 r = rand() % 100 + 1;
for(int i=0; i<16; i++) {
if (r & (1 << i)) {
vec[i] = 0xff;
}
}
m128 v = loadu128(vec);
u16 mask = movemask128(v);
for(int i=0; i<16; i++) {
if (mask & (1 << i)) {
vec2[i] = 0xff;
}
}
for (int i=0; i<16; i++) {
ASSERT_EQ(vec[i],vec2[i]);
}
}
TEST(SimdUtilsTest, pshufb_m128) {
srand (time(NULL));
u8 vec[16];
for (int i=0; i<16; i++) {
vec[i] = rand() % 1000 + 1;
}
u8 vec2[16];
for (int i=0; i<16; i++) {
vec2[i]=i + (rand() % 100 + 0);
}
// On Intel, if bit 0x80 of a control byte is set, the result byte is zero;
// otherwise the control byte's low nibble (&0xf) selects the source lane.
// On NEON or PPC, an index >= 16 yields zero, otherwise it selects that lane.
// The NEON/PPC code paths have to handle that case, so we exercise it below.
// Ensure that vec3 contains at least one 0x80 element.
u8 vec3[16] = {0};
vec3[15] = 0x80;
for (int i=0; i<15; i++) {
int l = rand() % 1000 + 0;
if (l % 16 ==0){
vec3[i]= 0x80;
} else{
vec3[i]= vec2[i];
}
}
/*
printf("vec3: ");
for(int i=15; i>=0; i--) { printf("%02x, ", vec3[i]); }
printf("\n");
*/
//Test Special Case
m128 v1 = loadu128(vec);
m128 v2 = loadu128(vec3);
m128 vres = pshufb_m128(v1, v2);
u8 res[16];
storeu128(res, vres);
for (int i=0; i<16; i++) {
if(vec3[i] & 0x80){
ASSERT_EQ(res[i], 0);
}else{
ASSERT_EQ(vec[vec3[i] % 16 ], res[i]);
}
}
//Test Other Cases
v1 = loadu128(vec);
v2 = loadu128(vec2);
vres = pshufb_m128(v1, v2);
storeu128(res, vres);
for (int i=0; i<16; i++) {
if(vec2[i] & 0x80){
ASSERT_EQ(res[i], 0);
}else{
ASSERT_EQ(vec[vec2[i] % 16 ], res[i]);
}
}
}
/* Define the ALIGNR128 test macro: palignr(v2, v1, l) should yield bytes l..l+15 of the concatenation v1:v2. */
#define TEST_ALIGNR128(v1, v2, buf, l) { \
m128 v_aligned = palignr(v2,v1, l); \
storeu128(res, v_aligned); \
for (size_t i=0; i<16; i++) { \
ASSERT_EQ(res[i], vec[i + l]); \
} \
}
TEST(SimdUtilsTest, Alignr128){
u8 vec[32];
u8 res[16];
for (int i=0; i<32; i++) {
vec[i]=i;
}
m128 v1 = loadu128(vec);
m128 v2 = loadu128(vec+16);
for (int j = 0; j<16; j++){
TEST_ALIGNR128(v1, v2, vec, j);
}
}
} // namespace } // namespace

View File

@ -155,10 +155,14 @@ TEST(SuperVectorUtilsTest,OPXOR128c){
TEST(SuperVectorUtilsTest,OPANDNOT128c){ TEST(SuperVectorUtilsTest,OPANDNOT128c){
auto SP1 = SuperVector<16>::Zeroes(); auto SP1 = SuperVector<16>::Zeroes();
auto SP2 = SuperVector<16>::Ones(); auto SP2 = SuperVector<16>::Ones();
SP1 = SP1.opandnot(SP2);
for (int i=0; i<16; i++) {
ASSERT_EQ(SP1.u.u8[i],0xff);
}
SP2 = SP2.opandnot(SP1); SP2 = SP2.opandnot(SP1);
for (int i=0; i<16; i++) { for (int i=0; i<16; i++) {
ASSERT_EQ(SP2.u.s8[i],0); ASSERT_EQ(SP2.u.u8[i],0);
} }
} }
TEST(SuperVectorUtilsTest,Movemask128c){ TEST(SuperVectorUtilsTest,Movemask128c){
@ -280,13 +284,17 @@ TEST(SuperVectorUtilsTest,pshufb128c) {
} }
u8 vec2[16]; u8 vec2[16];
for (int i=0; i<16; i++) { for (int i=0; i<16; i++) {
vec2[i]=i; vec2[i]=i + (rand() % 15 + 0);
} }
auto SP1 = SuperVector<16>::loadu(vec); auto SP1 = SuperVector<16>::loadu(vec);
auto SP2 = SuperVector<16>::loadu(vec2); auto SP2 = SuperVector<16>::loadu(vec2);
auto SResult = SP1.template pshufb<true>(SP2); auto SResult = SP1.template pshufb<true>(SP2);
for (int i=0; i<16; i++) { for (int i=0; i<16; i++) {
ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]); if(vec2[i] & 0x80){
ASSERT_EQ(SResult.u.u8[i], 0);
}else{
ASSERT_EQ(vec[vec2[i] % 16 ],SResult.u.u8[i]);
}
} }
} }

View File

@ -30,7 +30,7 @@
#include "config.h" #include "config.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "nfa/vermicelli.h" #include "nfa/vermicelli.hpp"
TEST(Vermicelli, ExecNoMatch1) { TEST(Vermicelli, ExecNoMatch1) {
char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
@ -1150,4 +1150,4 @@ TEST(DoubleVermicelliMasked16, Exec5) {
} }
} }
#endif // HAVE_SVE2 #endif // HAVE_SVE2

View File

@ -33,9 +33,6 @@ SET(corpusomatic_SRCS
ng_find_matches.cpp ng_find_matches.cpp
) )
add_library(corpusomatic STATIC ${corpusomatic_SRCS}) add_library(corpusomatic STATIC ${corpusomatic_SRCS})
if (ARCH_IA32 OR ARCH_X86_64)
set_target_properties(corpusomatic PROPERTIES COMPILE_FLAGS "-mssse3")
endif ()
set(databaseutil_SRCS set(databaseutil_SRCS
database_util.cpp database_util.cpp